Advanced Features#
Row2Vec includes several advanced features for expert users and production workflows.
Neural Architecture Search (NAS)#
Automatically find optimal neural network architectures:
# Load the output-suppression helper first (quiets verbose TensorFlow/Keras logging)
exec(open('suppress_minimal.py').read())
from row2vec import (
    ArchitectureSearchConfig,
    search_architecture,
    generate_synthetic_data,
)
# Generate test data
df = generate_synthetic_data(num_records=500, seed=42)
print(f"Test data shape: {df.shape}")
✓ Enhanced minimal suppression active
Test data shape: (500, 3)
# Configure architecture search
config = ArchitectureSearchConfig(
    method='random',                   # Random search (faster than grid)
    max_layers=3,                      # Search up to 3 hidden layers
    width_options=[32, 64, 128, 256],  # Neuron width options per layer
    max_trials=5,                      # Number of architectures to try (reduced for demo)
    initial_epochs=10                  # Reduced epochs for a faster demo
)
# Run architecture search
print("Searching for optimal architecture...")
# Need base config for architecture search
from row2vec import EmbeddingConfig, NeuralConfig
base_config = EmbeddingConfig(
    mode="unsupervised",
    embedding_dim=5,
    neural=NeuralConfig(max_epochs=10)
)
best_architecture, search_results = search_architecture(df, base_config, config)
print(f"\nBest architecture found:")
print(f" Architecture: {best_architecture}")
print(f" Search completed in {search_results.total_time:.2f} seconds")
Model: "functional"
(layers: input 10 → encoder 128, 256, 128 → embedding 5 → decoder 128, 256, 128 → output 10; dropout after each hidden layer)
Total params: 135,951 (531.06 KB)
Trainable params: 135,951 (531.06 KB)
Non-trainable params: 0 (0.00 B)
Searching for optimal architecture...
Model: "functional_2"
(layers: input 10 → dense 32 → embedding 5 → dense 32 → output 10; dropout after each hidden layer)
Total params: 1,039 (4.06 KB)
Trainable params: 1,039 (4.06 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_4"
(layers: input 10 → encoder 256, 256, 128, 32 → embedding 5 → decoder 32, 128, 256, 256 → output 10; dropout after each hidden layer)
Total params: 211,599 (826.56 KB)
Trainable params: 211,599 (826.56 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_6"
(layers: input 10 → encoder 128, 32, 32, 128 → embedding 5 → decoder 128, 32, 32, 128 → output 10; dropout after each hidden layer)
Total params: 22,927 (89.56 KB)
Trainable params: 22,927 (89.56 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_8"
(layers: input 10 → encoder 64, 64 → embedding 5 → decoder 64, 64 → output 10; dropout after each hidden layer)
Total params: 10,383 (40.56 KB)
Trainable params: 10,383 (40.56 KB)
Non-trainable params: 0 (0.00 B)
Best architecture found:
Architecture: {'n_layers': 1, 'hidden_units': 32, 'dropout_rate': 0.2, 'activation': 'relu'}
Search completed in 21.62 seconds
# Use the best architecture
from row2vec import learn_embedding
# Apply the best architecture found by the search
best_embeddings = learn_embedding(
    df,
    mode="unsupervised",
    embedding_dim=5,
    hidden_units=best_architecture.get('hidden_units', [128]),
    dropout_rate=best_architecture.get('dropout_rate', 0.2),
    max_epochs=20,
    verbose=False,
    seed=42
)
print(f"Optimized embeddings shape: {best_embeddings.shape}")
print("First 3 embeddings using best architecture:")
print(best_embeddings.head(3).round(4))
Model: "functional_10"
(layers: input 10 → dense 32 → embedding 5 → dense 32 → output 10; dropout after each hidden layer)
Total params: 1,039 (4.06 KB)
Trainable params: 1,039 (4.06 KB)
Non-trainable params: 0 (0.00 B)
Optimized embeddings shape: (500, 5)
First 3 embeddings using best architecture:
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4
0 0.2743 -0.2934 -0.2631 -0.4504 -0.0223
1 -0.1761 -0.4336 -0.4362 0.3499 0.8057
2 0.2331 -0.5344 0.1291 -0.4565 -0.6254
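Since the search itself is the expensive step, it is worth persisting the winning configuration so later runs can reuse it without re-searching. The returned architecture is a plain dict, so standard JSON works (the file name here is just an example):
import json
# Persist the winning architecture so later runs can skip the search
with open("best_architecture.json", "w") as f:
    json.dump(best_architecture, f, indent=2)
# Later, in another session:
with open("best_architecture.json") as f:
    arch = json.load(f)
print(arch)  # e.g. {'n_layers': 1, 'hidden_units': 32, 'dropout_rate': 0.2, 'activation': 'relu'}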
Advanced Missing Value Imputation#
Intelligent missing value handling with multiple strategies:
from row2vec import (
    ImputationConfig,
    AdaptiveImputer,
    MissingPatternAnalyzer
)
import numpy as np
# Create data with missing values
df_missing = df.copy()
np.random.seed(42)
# Introduce different types of missing patterns
# (np.random.choice samples with replacement, so the effective count can be slightly lower)
df_missing.loc[np.random.choice(df_missing.index, 50), 'Sales'] = np.nan
df_missing.loc[np.random.choice(df_missing.index, 30), 'Product'] = np.nan
df_missing.loc[np.random.choice(df_missing.index, 20), 'Country'] = np.nan
print("Missing values introduced:")
print(df_missing.isnull().sum())
Missing values introduced:
Country 20
Product 30
Sales 47
dtype: int64
# Analyze missing patterns
config = ImputationConfig()
analyzer = MissingPatternAnalyzer(config)
analysis = analyzer.analyze(df_missing)
print(f"Missing value analysis:")
print(f" Total missing: {analysis['total_missing']}")
print(f" Missing percentage: {analysis['missing_percentage']:.1f}%")
print(f" Columns with missing: {analysis['columns_with_missing']}")
print(f" Recommendations: {analysis['recommendations']}")
Missing value analysis:
Total missing: 97
Missing percentage: 6.5%
Columns with missing: 3
Recommendations: {'Country': {'missing_percentage': 4.0, 'is_numeric': False, 'suggested_strategy': 'mode', 'reasoning': 'Few categories, mode imputation works well', 'alternatives': ['mode', 'constant', 'missing_category']}, 'Product': {'missing_percentage': 6.0, 'is_numeric': False, 'suggested_strategy': 'mode', 'reasoning': 'Few categories, mode imputation works well', 'alternatives': ['mode', 'constant', 'missing_category']}, 'Sales': {'missing_percentage': 9.4, 'is_numeric': True, 'suggested_strategy': 'mean', 'reasoning': 'Low missingness, mean imputation is fast and effective', 'alternatives': ['mean', 'median', 'knn', 'iterative']}}
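The recommendations are plain data, so they can feed reports or automated config generation directly. A small sketch that tabulates the suggestions, using only the keys visible in the output above:
# Summarize the analyzer's per-column suggestions, one line each
for col, rec in analysis['recommendations'].items():
    print(f"{col:10} {rec['missing_percentage']:5.1f}% missing -> "
          f"{rec['suggested_strategy']} ({rec['reasoning']})")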
# Apply different imputation strategies
strategies = {
    'adaptive': ImputationConfig(),
    'knn': ImputationConfig(
        numeric_strategy='knn',
        categorical_strategy='mode',
        knn_neighbors=5
    ),
    'iterative': ImputationConfig(
        numeric_strategy='iterative',
        categorical_strategy='mode',
        categorical_fill_value='Missing'
    )
}

for name, strategy_config in strategies.items():
    imputer = AdaptiveImputer(strategy_config)
    df_imputed = imputer.fit_transform(df_missing)
    remaining_missing = df_imputed.isnull().sum().sum()
    print(f"{name:12}: {remaining_missing} missing values remaining")
adaptive : 0 missing values remaining
knn : 0 missing values remaining
iterative : 0 missing values remaining
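An imputed frame drops straight into the embedding APIs shown earlier; a minimal end-to-end sketch chaining the adaptive imputer into learn_embedding:
from row2vec import learn_embedding
# Impute with the adaptive strategy, then embed the cleaned data
imputer = AdaptiveImputer(ImputationConfig())
df_clean = imputer.fit_transform(df_missing)
clean_embeddings = learn_embedding(
    df_clean,
    mode="unsupervised",
    embedding_dim=5,
    max_epochs=10,
    verbose=False,
    seed=42
)
print(clean_embeddings.shape)  # (500, 5)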
Categorical Encoding Strategies#
Advanced categorical feature handling:
from row2vec import learn_embedding_v2, EmbeddingConfig, PreprocessingConfig
# Test different categorical encoding strategies
encoding_strategies = ['onehot', 'target', 'adaptive']
for strategy in encoding_strategies:
    # Build a complete config carrying the categorical encoding strategy
    config = EmbeddingConfig(
        embedding_dim=3,
        mode="unsupervised",
        preprocessing=PreprocessingConfig(
            categorical_encoding_strategy=strategy
        )
    )
    embeddings = learn_embedding_v2(df, config=config)
    print(f"{strategy:12}: shape {embeddings.shape}, mean={embeddings.mean().mean():.3f}")
Model: "functional_12"
(layers: input 10 → dense 128 → embedding 3 → dense 128 → output 10; dropout after each hidden layer)
Total params: 3,597 (14.05 KB)
Trainable params: 3,597 (14.05 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_14"
(layers: input 10 → dense 128 → embedding 3 → dense 128 → output 10; dropout after each hidden layer)
Total params: 3,597 (14.05 KB)
Trainable params: 3,597 (14.05 KB)
Non-trainable params: 0 (0.00 B)
onehot : shape (500, 3), mean=0.056
Model: "functional_16"
(layers: input 10 → dense 128 → embedding 3 → dense 128 → output 10; dropout after each hidden layer)
Total params: 3,597 (14.05 KB)
Trainable params: 3,597 (14.05 KB)
Non-trainable params: 0 (0.00 B)
target : shape (500, 3), mean=0.056
adaptive : shape (500, 3), mean=0.056
Multi-Layer Neural Networks#
Deep architectures for complex patterns:
# Compare single vs multi-layer networks
architectures = {
    'Single Layer': [128],
    'Two Layer': [256, 128],
    'Three Layer': [512, 256, 128]
}

for name, hidden_units in architectures.items():
    embeddings = learn_embedding(
        df,
        mode="unsupervised",
        embedding_dim=5,
        hidden_units=hidden_units,
        dropout_rate=0.2,
        max_epochs=15,
        verbose=False,
        seed=42
    )
    # Basic distribution statistics for the resulting embeddings
    mean_emb = embeddings.mean().mean()
    std_emb = embeddings.std().mean()
    print(f"{name:15}: mean={mean_emb:7.3f}, std={std_emb:6.3f}")
Model: "functional_18"
(layers: input 10 → encoder 128 → embedding 5 → decoder 128 → output 10; dropout after each hidden layer)
Total params: 4,111 (16.06 KB)
Trainable params: 4,111 (16.06 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_20"
(layers: input 10 → encoder 256, 128 → embedding 5 → decoder 128, 256 → output 10; dropout after each hidden layer)
Total params: 72,719 (284.06 KB)
Trainable params: 72,719 (284.06 KB)
Non-trainable params: 0 (0.00 B)
Single Layer : mean= -0.048, std= 0.632
Model: "functional_22"
(layers: input 10 → encoder 512, 256, 128 → embedding 5 → decoder 128, 256, 512 → output 10; dropout after each hidden layer)
Total params: 341,007 (1.30 MB)
Trainable params: 341,007 (1.30 MB)
Non-trainable params: 0 (0.00 B)
Two Layer : mean= 0.013, std= 0.678
Three Layer : mean= -0.053, std= 0.623
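Mean and standard deviation only describe the value distribution; to judge which depth actually helps, compare the embeddings on a downstream criterion. A sketch using scikit-learn's silhouette score over k-means clusters (the helper embedding_silhouette is ours, not part of Row2Vec):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def embedding_silhouette(emb, n_clusters=3, seed=42):
    """Cluster the embeddings and score how well-separated the clusters are."""
    labels = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit_predict(emb)
    return silhouette_score(emb, labels)

for name, hidden_units in architectures.items():
    emb = learn_embedding(
        df, mode="unsupervised", embedding_dim=5,
        hidden_units=hidden_units, max_epochs=15, verbose=False, seed=42
    )
    print(f"{name:15}: silhouette={embedding_silhouette(emb):.3f}")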
Automatic Dimension Selection#
Find optimal embedding dimensions:
from row2vec import auto_select_dimension, EmbeddingConfig, NeuralConfig
# Create base config for dimension selection
base_config = EmbeddingConfig(
    mode="unsupervised",
    seed=42,
    verbose=False,
    neural=NeuralConfig(max_epochs=10)
)

# Evaluate candidate dimensions from 2 to 10
optimal_dim, results = auto_select_dimension(
    df,
    config=base_config,
    min_dimension=2,
    max_dimension=10,
    n_trials=3,  # Reduced for demo
    verbose=False
)
print(f"Optimal embedding dimension: {optimal_dim}")
print(f"\nDimension evaluation results:")
for dim, score in results.items():
marker = " <-- OPTIMAL" if dim == optimal_dim else ""
# Handle case where score might be a dict, list, or other format
if isinstance(score, dict):
# Try common score keys
score_val = score.get('score', score.get('loss', score.get('value', 0.0)))
elif isinstance(score, (list, tuple)):
score_val = score[0] if len(score) > 0 else 0.0
else:
try:
score_val = float(score) if score is not None else 0.0
except (ValueError, TypeError):
score_val = 0.0
print(f" {dim:2}D: {score_val:.4f}{marker}")
Optimal embedding dimension: 4
Dimension evaluation results:
(per-dimension scores as reported by your Row2Vec version)
Contrastive Learning Mode#
Contrastive learning is an advanced embedding technique; whether it is exposed depends on your Row2Vec version:
try:
    # Contrastive-specific helpers are not exported in every Row2Vec version,
    # so this cell falls back to standard neural embeddings
    from row2vec import learn_embedding

    contrastive_embeddings = learn_embedding(
        df,
        mode="unsupervised",
        embedding_dim=5,
        max_epochs=10,
        verbose=False,
        seed=42
    )
    print(f"Neural embeddings shape: {contrastive_embeddings.shape}")
    print("First 3 embeddings (using standard neural approach):")
    print(contrastive_embeddings.head(3).round(4))
    print("\nNote: Advanced contrastive learning features may require specific Row2Vec versions")
except ImportError as e:
    print(f"Advanced contrastive learning mode not available: {e}")
Model: "functional_24"
(layers: input 10 → dense 128 → embedding 5 → dense 128 → output 10; dropout after each hidden layer)
Total params: 4,111 (16.06 KB)
Trainable params: 4,111 (16.06 KB)
Non-trainable params: 0 (0.00 B)
Neural embeddings shape: (500, 5)
First 3 embeddings (using standard neural approach):
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4
0 -0.3042 0.3156 0.1292 -0.0977 0.5706
1 -0.5889 -0.1660 -0.5090 -0.6612 -0.6047
2 0.2482 0.7880 0.2081 0.2817 -0.2617
Note: Advanced contrastive learning features may require specific Row2Vec versions
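If your build does expose the contrastive mode, the entry point is the same learn_embedding function: its signature accepts contrastive parameters such as similar_pairs, dissimilar_pairs, auto_pairs, negative_samples, contrastive_loss, and margin. A hedged sketch (the argument values are illustrative, and accepted contrastive_loss names may differ by version):
try:
    contrastive_embeddings = learn_embedding(
        df,
        mode="contrastive",
        embedding_dim=5,
        auto_pairs=True,     # let Row2Vec generate training pairs (illustrative)
        negative_samples=5,  # illustrative value
        margin=1.0,          # illustrative value
        max_epochs=10,
        verbose=False,
        seed=42
    )
    print(f"Contrastive embeddings shape: {contrastive_embeddings.shape}")
except Exception as e:
    # Broad catch on purpose: unsupported versions may raise different errors
    print(f"Contrastive mode not available in this version: {e}")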
Model Serialization with Metadata#
Row2Vec provides powerful model serialization capabilities for production workflows. The system uses a two-file approach for transparency and efficiency:
Python script (.py): Contains inspectable metadata and loading logic
Binary file (.pkl): Contains the actual model weights and preprocessor
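Concretely, saving a model under a base path such as my_model produces that sibling pair, and load_model takes the script path (paths here are illustrative; the loader expects the .pkl alongside the .py):
from row2vec import load_model
# my_model.py  -> readable metadata and loading logic
# my_model.pkl -> pickled weights and fitted preprocessor
model = load_model("my_model.py")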
Key Features#
Transparent metadata: All training configuration and results stored in readable format
Complete pipeline preservation: Includes preprocessing steps, not just the model
Schema validation: Automatically validates input data against expected schema
Multiple model support: Works with neural networks and classical ML methods
Detailed training information: Loss curves, timing, data characteristics, and more
Advanced model saving with rich metadata:#
from row2vec import (
    learn_embedding_with_model,
    save_model,
    Row2VecModel,
    Row2VecModelMetadata
)
import tempfile
import os

# Train with full model information
embeddings, model, preprocessor, metadata = learn_embedding_with_model(
    df,
    embedding_dim=8,
    mode="unsupervised",
    max_epochs=20,
    batch_size=64,
    dropout_rate=0.3,
    hidden_units=256,
    verbose=False,
    seed=42
)
print(f"Training completed:")
print(f" Final loss: {metadata.get('final_loss', 'N/A')}")
print(f" Training time: {metadata.get('training_time', 0):.2f}s")
print(f" Epochs trained: {metadata.get('epochs_trained', 0)}")
Model: "functional_26"
(layers: input 10 → dense 256 → embedding 8 → dense 256 → output 10; dropout after each hidden layer)
Total params: 9,746 (38.07 KB)
Trainable params: 9,746 (38.07 KB)
Non-trainable params: 0 (0.00 B)
16/16 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
Training completed:
Final loss: None
Training time: 4.88s
Epochs trained: 20
# Create rich model object with metadata
row2vec_model = Row2VecModel(
    model=model,
    preprocessor=preprocessor,
    metadata=Row2VecModelMetadata.from_dict(metadata)
)

# Save model with comprehensive information
with tempfile.TemporaryDirectory() as tmpdir:
    model_path = os.path.join(tmpdir, "advanced_model")
    script_path, binary_path = save_model(row2vec_model, model_path)

    print("Model saved with metadata:")
    print(f"  Script: {os.path.basename(script_path)}")
    print(f"  Binary: {os.path.basename(binary_path)}")

    # Load and inspect metadata
    from row2vec import load_model
    loaded_model = load_model(script_path)

    print("\nLoaded model metadata:")
    print(f"  Data shape: {loaded_model.metadata.data_shape}")
    print(f"  Original columns: {len(loaded_model.metadata.original_columns) if loaded_model.metadata.original_columns else 0}")
    print(f"  Embedding dimension: {loaded_model.metadata.embedding_dim}")
    print(f"  Training epochs: {loaded_model.metadata.epochs_trained}")
    print(f"  Final loss: {loaded_model.metadata.final_loss}")
Model saved with metadata:
Script: advanced_model.py
Binary: advanced_model.pkl
Loaded model metadata:
Data shape: (500, 3)
Original columns: 3
Embedding dimension: 8
Training epochs: 20
Final loss: None
Schema Validation in Production#
Models automatically validate input data against the expected schema:
# Load model for validation demonstration
from row2vec import train_and_save_model, load_model, generate_synthetic_data
import tempfile
import os
with tempfile.TemporaryDirectory() as tmpdir:
    # Create a small model for demonstration
    sample_data = generate_synthetic_data(50, seed=42)
    embeddings, script_path, binary_path = train_and_save_model(
        sample_data,
        base_path=os.path.join(tmpdir, "validation_model"),
        embedding_dim=3,
        max_epochs=5,
        batch_size=32,  # must not exceed the 50-row dataset (the default of 64 raises ValueError)
        verbose=False
    )

    # Load the model
    model = load_model(script_path)

    # This will pass validation (correct schema)
    correct_data = generate_synthetic_data(10, seed=123)
    embeddings = model.predict(correct_data)
    print(f"✓ Validation passed: {embeddings.shape}")

    # This would fail validation (missing column)
    # incorrect_data = correct_data.drop(columns=["Sales"])
    # embeddings = model.predict(incorrect_data)  # Would raise ValueError

    # Skip validation if needed (not recommended for production)
    partial_data = correct_data[['Country', 'Product']]  # Missing columns
    try:
        embeddings_unvalidated = model.predict(partial_data, validate_schema=False)
        print(f"⚠️ Unvalidated prediction: {embeddings_unvalidated.shape}")
        print("   (Schema validation was skipped)")
    except Exception as e:
        print(f"Even unvalidated prediction failed: {e}")
Best Practices for Model Serialization#
Use descriptive base paths: Include version, date, or dataset info in model names
Enable training history: Keep detailed metadata for debugging and analysis
Validate schemas in production: Always use validate_schema=True for safety
Store models with data descriptions: Keep training data documentation alongside models
Version your models: Use systematic naming for model iterations
Test model loading: Always verify saved models can be loaded and used (see the round-trip sketch below)
# Example of production-ready model naming and metadata
import datetime
# Create descriptive model path with timestamp and version
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_name = f"customer_segmentation_v2.1_{timestamp}"
print(f"Production model naming example:")
print(f" Model name: {model_name}")
print(f" Files would be: {model_name}.py and {model_name}.pkl")
print(f" Include metadata: dataset version, feature engineering steps, validation scores")
Production model naming example:
Model name: customer_segmentation_v2.1_20251013_083201
Files would be: customer_segmentation_v2.1_20251013_083201.py and customer_segmentation_v2.1_20251013_083201.pkl
Include metadata: dataset version, feature engineering steps, validation scores
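The load-test item from the list above is easy to automate. A round-trip sketch reusing train_and_save_model and load_model from the validation example (the helper name smoke_test_model is ours):
import os
import tempfile
from row2vec import train_and_save_model, load_model, generate_synthetic_data

def smoke_test_model(df, base_path):
    """Train, save, reload, and predict once to verify the artifact is usable."""
    _, script_path, _ = train_and_save_model(
        df,
        base_path=base_path,
        embedding_dim=3,
        max_epochs=5,
        batch_size=32,
        verbose=False
    )
    model = load_model(script_path)
    preds = model.predict(df.head(5))
    assert preds.shape[0] == 5, "reloaded model should embed 5 rows"
    return preds.shape

with tempfile.TemporaryDirectory() as tmpdir:
    df_check = generate_synthetic_data(100, seed=0)
    print(smoke_test_model(df_check, os.path.join(tmpdir, "smoke_model")))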
Custom Configuration Objects#
Use configuration objects for complex setups:
from row2vec import (
    EmbeddingConfig,
    NeuralConfig,
    ScalingConfig,
    LoggingConfig
)

# Create comprehensive configuration
embedding_config = EmbeddingConfig(
    embedding_dim=6,
    mode="unsupervised",
    seed=42
)

neural_config = NeuralConfig(
    max_epochs=25,
    batch_size=128,
    dropout_rate=0.25,
    hidden_units=[512, 256],  # Multi-layer
    early_stopping=True,
    activation="relu"
)

scaling_config = ScalingConfig(
    method="standard",
    range=None  # Not applicable for standard scaling
)

logging_config = LoggingConfig(
    level="INFO",
    file=None,
    enabled=True
)
print("Configuration objects created:")
print(f" Embedding: {embedding_config.embedding_dim}D {embedding_config.mode}")
print(f" Neural: {neural_config.max_epochs} epochs, {neural_config.hidden_units} units")
print(f" Scaling: {scaling_config.method}")
print(f" Logging: {logging_config.level}")
Configuration objects created:
Embedding: 6D unsupervised
Neural: 25 epochs, [512, 256] units
Scaling: standard
Logging: INFO
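These objects are meant to compose: in the architecture-search example above, NeuralConfig nests inside EmbeddingConfig via the neural= argument. Whether scaling and logging nest the same way can vary by version, so treat the field names below as assumptions to verify against your build:
from row2vec import learn_embedding_v2

# Assumed nesting: scaling= and logging= mirror the neural= field shown earlier
full_config = EmbeddingConfig(
    embedding_dim=6,
    mode="unsupervised",
    seed=42,
    neural=neural_config,
    scaling=scaling_config,
    logging=logging_config
)
embeddings = learn_embedding_v2(df, config=full_config)
print(embeddings.shape)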
Performance Monitoring#
Built-in performance tracking:
from row2vec import get_logger
import time
# Enable performance logging
logger = get_logger()
# Logger is already configured with INFO level
# Monitor embedding generation
start_time = time.time()
embeddings_monitored = learn_embedding(
    df,
    mode="unsupervised",
    embedding_dim=10,
    max_epochs=20,
    batch_size=128,
    verbose=True,  # Enable verbose output
    seed=42
)
total_time = time.time() - start_time
print(f"\nPerformance summary:")
print(f" Total time: {total_time:.2f} seconds")
print(f" Records processed: {len(df)}")
print(f" Records per second: {len(df)/total_time:.1f}")
print(f" Final embeddings: {embeddings_monitored.shape}")
Model: "functional_30"
(layers: input 10 → dense 128 → embedding 10 → dense 128 → output 10; dropout after each hidden layer)
Total params: 5,396 (21.08 KB)
Trainable params: 5,396 (21.08 KB)
Non-trainable params: 0 (0.00 B)
Epoch 1/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 1s 43ms/step - loss: 0.2889 - val_loss: 0.2685
Epoch 2/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - loss: 0.2613 - val_loss: 0.2428
Epoch 3/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.2378 - val_loss: 0.2188
Epoch 4/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.2152 - val_loss: 0.1966
Epoch 5/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - loss: 0.1930 - val_loss: 0.1759
Epoch 6/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1798 - val_loss: 0.1569
Epoch 7/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1611 - val_loss: 0.1405
Epoch 8/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1464 - val_loss: 0.1272
Epoch 9/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1357 - val_loss: 0.1168
Epoch 10/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1257 - val_loss: 0.1085
Epoch 11/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1209 - val_loss: 0.1014
Epoch 12/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1144 - val_loss: 0.0945
Epoch 13/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1051 - val_loss: 0.0873
Epoch 14/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1026 - val_loss: 0.0796
Epoch 15/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0944 - val_loss: 0.0719
Epoch 16/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0849 - val_loss: 0.0646
Epoch 17/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0799 - val_loss: 0.0576
Epoch 18/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0744 - val_loss: 0.0508
Epoch 19/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0681 - val_loss: 0.0448
Epoch 20/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0630 - val_loss: 0.0398
Performance summary:
Total time: 2.25 seconds
Records processed: 500
Records per second: 222.6
Final embeddings: (500, 10)
Production Considerations#
Key settings for production use:
# Production-optimized configuration
production_embeddings = learn_embedding(
    df,
    mode="unsupervised",
    embedding_dim=8,          # Appropriate for this dataset's feature count
    max_epochs=50,            # Reduced for demo (use ~100 in production)
    batch_size=64,            # Appropriate for the dataset size
    dropout_rate=0.2,         # Conservative regularization
    hidden_units=[512, 256],  # Deep architecture
    early_stopping=True,      # Prevent overfitting
    scale_method="standard",  # Standardized outputs
    seed=42,                  # Reproducible results
    verbose=False
)

print("Production embeddings generated:")
print(f"  Shape: {production_embeddings.shape}")
print(f"  Value range: [{production_embeddings.min().min():.3f}, {production_embeddings.max().max():.3f}]")
print(f"  Mean: {production_embeddings.mean().mean():.3f}")
print(f"  Std: {production_embeddings.std().mean():.3f}")
Model: "functional_32"
(layers: input 10 → encoder 512, 256 → embedding 8 → decoder 256, 512 → output 10; dropout after each hidden layer)
Total params: 278,034 (1.06 MB)
Trainable params: 278,034 (1.06 MB)
Non-trainable params: 0 (0.00 B)
Production embeddings generated:
Shape: (500, 8)
Value range: [-2.494, 2.758]
Mean: 0.000
Std: 1.001
Next Steps#
You now know Row2Vec’s advanced capabilities! For more:
📖 CLI Guide - Batch processing and automation
🔍 API Reference - Complete parameter documentation
🏠 Back to Examples for practical applications