Advanced Features#
Row2Vec includes several advanced features for expert users and production workflows.
Neural Architecture Search (NAS)#
Automatically find optimal neural network architectures:
# Load the output-suppression helper first (quiets verbose TensorFlow/Keras logging)
exec(open('suppress_minimal.py').read())
from row2vec import (
    ArchitectureSearchConfig,
    search_architecture,
    generate_synthetic_data,
)
# Generate test data
df = generate_synthetic_data(num_records=500, seed=42)
print(f"Test data shape: {df.shape}")
✓ Enhanced minimal suppression active
Test data shape: (500, 3)
# Configure architecture search
config = ArchitectureSearchConfig(
    method='random',                   # Random search (faster than grid)
    max_layers=3,                      # Search up to 3 hidden layers
    width_options=[32, 64, 128, 256],  # Neuron width options per layer
    max_trials=5,                      # Number of architectures to try (reduced for demo)
    initial_epochs=10                  # Reduced epochs for a faster demo
)
# Run architecture search
print("Searching for optimal architecture...")
# Need base config for architecture search
from row2vec import EmbeddingConfig, NeuralConfig
base_config = EmbeddingConfig(
    mode="unsupervised",
    embedding_dim=5,
    neural=NeuralConfig(max_epochs=10)
)
best_architecture, search_results = search_architecture(df, base_config, config)
print(f"\nBest architecture found:")
print(f" Architecture: {best_architecture}")
print(f" Search completed in {search_results.total_time:.2f} seconds")
Model: "functional"
(layers: input 10 → encoder 128, 256, 128 → embedding 5 → decoder 128, 256, 128 → output 10; dropout after each hidden layer)
Total params: 135,951 (531.06 KB)
Trainable params: 135,951 (531.06 KB)
Non-trainable params: 0 (0.00 B)
Searching for optimal architecture...
Model: "functional_2"
(layers: input 10 → dense 32 → embedding 5 → dense 32 → output 10; dropout after each hidden layer)
Total params: 1,039 (4.06 KB)
Trainable params: 1,039 (4.06 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_4"
(layers: input 10 → encoder 256, 256, 128, 32 → embedding 5 → decoder 32, 128, 256, 256 → output 10; dropout after each hidden layer)
Total params: 211,599 (826.56 KB)
Trainable params: 211,599 (826.56 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_6"
(layers: input 10 → encoder 128, 32, 32, 128 → embedding 5 → decoder 128, 32, 32, 128 → output 10; dropout after each hidden layer)
Total params: 22,927 (89.56 KB)
Trainable params: 22,927 (89.56 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_8"
(layers: input 10 → encoder 64, 64 → embedding 5 → decoder 64, 64 → output 10; dropout after each hidden layer)
Total params: 10,383 (40.56 KB)
Trainable params: 10,383 (40.56 KB)
Non-trainable params: 0 (0.00 B)
Best architecture found:
Architecture: {'n_layers': 1, 'hidden_units': 32, 'dropout_rate': 0.2, 'activation': 'relu'}
Search completed in 21.62 seconds
# Use the best architecture
from row2vec import learn_embedding
# Apply the best architecture found by the search
best_embeddings = learn_embedding(
    df,
    mode="unsupervised",
    embedding_dim=5,
    hidden_units=best_architecture.get('hidden_units', [128]),
    dropout_rate=best_architecture.get('dropout_rate', 0.2),
    max_epochs=20,
    verbose=False,
    seed=42
)
print(f"Optimized embeddings shape: {best_embeddings.shape}")
print("First 3 embeddings using best architecture:")
print(best_embeddings.head(3).round(4))
Model: "functional_10"
(layers: input 10 → dense 32 → embedding 5 → dense 32 → output 10; dropout after each hidden layer)
Total params: 1,039 (4.06 KB)
Trainable params: 1,039 (4.06 KB)
Non-trainable params: 0 (0.00 B)
Optimized embeddings shape: (500, 5)
First 3 embeddings using best architecture:
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4
0 0.2743 -0.2934 -0.2631 -0.4504 -0.0223
1 -0.1761 -0.4336 -0.4362 0.3499 0.8057
2 0.2331 -0.5344 0.1291 -0.4565 -0.6254
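Since the search itself is the expensive step, it is worth persisting the winning configuration so later runs can reuse it without re-searching. The returned architecture is a plain dict, so standard JSON works (the file name here is just an example):
import json
# Persist the winning architecture so later runs can skip the search
with open("best_architecture.json", "w") as f:
    json.dump(best_architecture, f, indent=2)
# Later, in another session:
with open("best_architecture.json") as f:
    arch = json.load(f)
print(arch)  # e.g. {'n_layers': 1, 'hidden_units': 32, 'dropout_rate': 0.2, 'activation': 'relu'}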
Advanced Missing Value Imputation#
Intelligent missing value handling with multiple strategies:
from row2vec import (
    ImputationConfig,
    AdaptiveImputer,
    MissingPatternAnalyzer
)
import numpy as np
# Create data with missing values
df_missing = df.copy()
np.random.seed(42)
# Introduce different types of missing patterns
# (np.random.choice samples with replacement, so the effective count can be slightly lower)
df_missing.loc[np.random.choice(df_missing.index, 50), 'Sales'] = np.nan
df_missing.loc[np.random.choice(df_missing.index, 30), 'Product'] = np.nan
df_missing.loc[np.random.choice(df_missing.index, 20), 'Country'] = np.nan
print("Missing values introduced:")
print(df_missing.isnull().sum())
Missing values introduced:
Country 20
Product 30
Sales 47
dtype: int64
# Analyze missing patterns
config = ImputationConfig()
analyzer = MissingPatternAnalyzer(config)
analysis = analyzer.analyze(df_missing)
print(f"Missing value analysis:")
print(f" Total missing: {analysis['total_missing']}")
print(f" Missing percentage: {analysis['missing_percentage']:.1f}%")
print(f" Columns with missing: {analysis['columns_with_missing']}")
print(f" Recommendations: {analysis['recommendations']}")
Missing value analysis:
Total missing: 97
Missing percentage: 6.5%
Columns with missing: 3
Recommendations: {'Country': {'missing_percentage': 4.0, 'is_numeric': False, 'suggested_strategy': 'mode', 'reasoning': 'Few categories, mode imputation works well', 'alternatives': ['mode', 'constant', 'missing_category']}, 'Product': {'missing_percentage': 6.0, 'is_numeric': False, 'suggested_strategy': 'mode', 'reasoning': 'Few categories, mode imputation works well', 'alternatives': ['mode', 'constant', 'missing_category']}, 'Sales': {'missing_percentage': 9.4, 'is_numeric': True, 'suggested_strategy': 'mean', 'reasoning': 'Low missingness, mean imputation is fast and effective', 'alternatives': ['mean', 'median', 'knn', 'iterative']}}
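The recommendations are plain data, so they can feed reports or automated config generation directly. A small sketch that tabulates the suggestions, using only the keys visible in the output above:
# Summarize the analyzer's per-column suggestions, one line each
for col, rec in analysis['recommendations'].items():
    print(f"{col:10} {rec['missing_percentage']:5.1f}% missing -> "
          f"{rec['suggested_strategy']} ({rec['reasoning']})")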
# Apply different imputation strategies
strategies = {
    'adaptive': ImputationConfig(),
    'knn': ImputationConfig(
        numeric_strategy='knn',
        categorical_strategy='mode',
        knn_neighbors=5
    ),
    'iterative': ImputationConfig(
        numeric_strategy='iterative',
        categorical_strategy='mode',
        categorical_fill_value='Missing'
    )
}

for name, strategy_config in strategies.items():
    imputer = AdaptiveImputer(strategy_config)
    df_imputed = imputer.fit_transform(df_missing)
    remaining_missing = df_imputed.isnull().sum().sum()
    print(f"{name:12}: {remaining_missing} missing values remaining")
adaptive : 0 missing values remaining
knn : 0 missing values remaining
iterative : 0 missing values remaining
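An imputed frame drops straight into the embedding APIs shown earlier; a minimal end-to-end sketch chaining the adaptive imputer into learn_embedding:
from row2vec import learn_embedding
# Impute with the adaptive strategy, then embed the cleaned data
imputer = AdaptiveImputer(ImputationConfig())
df_clean = imputer.fit_transform(df_missing)
clean_embeddings = learn_embedding(
    df_clean,
    mode="unsupervised",
    embedding_dim=5,
    max_epochs=10,
    verbose=False,
    seed=42
)
print(clean_embeddings.shape)  # (500, 5)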
Categorical Encoding Strategies#
Advanced categorical feature handling:
from row2vec import learn_embedding_v2, EmbeddingConfig, PreprocessingConfig
# Test different categorical encoding strategies
encoding_strategies = ['onehot', 'target', 'adaptive']
for strategy in encoding_strategies:
    # Build a complete config carrying the categorical encoding strategy
    config = EmbeddingConfig(
        embedding_dim=3,
        mode="unsupervised",
        preprocessing=PreprocessingConfig(
            categorical_encoding_strategy=strategy
        )
    )
    embeddings = learn_embedding_v2(df, config=config)
    print(f"{strategy:12}: shape {embeddings.shape}, mean={embeddings.mean().mean():.3f}")
Model: "functional_12"
(layers: input 10 → dense 128 → embedding 3 → dense 128 → output 10; dropout after each hidden layer)
Total params: 3,597 (14.05 KB)
Trainable params: 3,597 (14.05 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_14"
(layers: input 10 → dense 128 → embedding 3 → dense 128 → output 10; dropout after each hidden layer)
Total params: 3,597 (14.05 KB)
Trainable params: 3,597 (14.05 KB)
Non-trainable params: 0 (0.00 B)
onehot : shape (500, 3), mean=0.056
Model: "functional_16"
(layers: input 10 → dense 128 → embedding 3 → dense 128 → output 10; dropout after each hidden layer)
Total params: 3,597 (14.05 KB)
Trainable params: 3,597 (14.05 KB)
Non-trainable params: 0 (0.00 B)
target : shape (500, 3), mean=0.056
adaptive : shape (500, 3), mean=0.056
Multi-Layer Neural Networks#
Deep architectures for complex patterns:
# Compare single vs multi-layer networks
architectures = {
    'Single Layer': [128],
    'Two Layer': [256, 128],
    'Three Layer': [512, 256, 128]
}

for name, hidden_units in architectures.items():
    embeddings = learn_embedding(
        df,
        mode="unsupervised",
        embedding_dim=5,
        hidden_units=hidden_units,
        dropout_rate=0.2,
        max_epochs=15,
        verbose=False,
        seed=42
    )
    # Basic distribution statistics for the resulting embeddings
    mean_emb = embeddings.mean().mean()
    std_emb = embeddings.std().mean()
    print(f"{name:15}: mean={mean_emb:7.3f}, std={std_emb:6.3f}")
Model: "functional_18"
(layers: input 10 → encoder 128 → embedding 5 → decoder 128 → output 10; dropout after each hidden layer)
Total params: 4,111 (16.06 KB)
Trainable params: 4,111 (16.06 KB)
Non-trainable params: 0 (0.00 B)
Model: "functional_20"
(layers: input 10 → encoder 256, 128 → embedding 5 → decoder 128, 256 → output 10; dropout after each hidden layer)
Total params: 72,719 (284.06 KB)
Trainable params: 72,719 (284.06 KB)
Non-trainable params: 0 (0.00 B)
Single Layer : mean= -0.048, std= 0.632
Model: "functional_22"
(layers: input 10 → encoder 512, 256, 128 → embedding 5 → decoder 128, 256, 512 → output 10; dropout after each hidden layer)
Total params: 341,007 (1.30 MB)
Trainable params: 341,007 (1.30 MB)
Non-trainable params: 0 (0.00 B)
Two Layer : mean= 0.013, std= 0.678
Three Layer : mean= -0.053, std= 0.623
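Mean and standard deviation only describe the value distribution; to judge which depth actually helps, compare the embeddings on a downstream criterion. A sketch using scikit-learn's silhouette score over k-means clusters (the helper embedding_silhouette is ours, not part of Row2Vec):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def embedding_silhouette(emb, n_clusters=3, seed=42):
    """Cluster the embeddings and score how well-separated the clusters are."""
    labels = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit_predict(emb)
    return silhouette_score(emb, labels)

for name, hidden_units in architectures.items():
    emb = learn_embedding(
        df, mode="unsupervised", embedding_dim=5,
        hidden_units=hidden_units, max_epochs=15, verbose=False, seed=42
    )
    print(f"{name:15}: silhouette={embedding_silhouette(emb):.3f}")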
Automatic Dimension Selection#
Find optimal embedding dimensions:
from row2vec import auto_select_dimension, EmbeddingConfig, NeuralConfig
# Create base config for dimension selection
base_config = EmbeddingConfig(
    mode="unsupervised",
    seed=42,
    verbose=False,
    neural=NeuralConfig(max_epochs=10)
)

# Evaluate candidate dimensions from 2 to 10
optimal_dim, results = auto_select_dimension(
    df,
    config=base_config,
    min_dimension=2,
    max_dimension=10,
    n_trials=3,  # Reduced for demo
    verbose=False
)
print(f"Optimal embedding dimension: {optimal_dim}")
print(f"\nDimension evaluation results:")
for dim, score in results.items():
marker = " <-- OPTIMAL" if dim == optimal_dim else ""
# Handle case where score might be a dict, list, or other format
if isinstance(score, dict):
# Try common score keys
score_val = score.get('score', score.get('loss', score.get('value', 0.0)))
elif isinstance(score, (list, tuple)):
score_val = score[0] if len(score) > 0 else 0.0
else:
try:
score_val = float(score) if score is not None else 0.0
except (ValueError, TypeError):
score_val = 0.0
print(f" {dim:2}D: {score_val:.4f}{marker}")
Optimal embedding dimension: 4
Dimension evaluation results:
(per-dimension scores as reported by your Row2Vec version)
Contrastive Learning Mode#
Contrastive learning is an advanced embedding technique; whether it is exposed depends on your Row2Vec version:
try:
    # Contrastive-specific helpers are not exported in every Row2Vec version,
    # so this cell falls back to standard neural embeddings
    from row2vec import learn_embedding

    contrastive_embeddings = learn_embedding(
        df,
        mode="unsupervised",
        embedding_dim=5,
        max_epochs=10,
        verbose=False,
        seed=42
    )
    print(f"Neural embeddings shape: {contrastive_embeddings.shape}")
    print("First 3 embeddings (using standard neural approach):")
    print(contrastive_embeddings.head(3).round(4))
    print("\nNote: Advanced contrastive learning features may require specific Row2Vec versions")
except ImportError as e:
    print(f"Advanced contrastive learning mode not available: {e}")
Model: "functional_24"
(layers: input 10 → dense 128 → embedding 5 → dense 128 → output 10; dropout after each hidden layer)
Total params: 4,111 (16.06 KB)
Trainable params: 4,111 (16.06 KB)
Non-trainable params: 0 (0.00 B)
Neural embeddings shape: (500, 5)
First 3 embeddings (using standard neural approach):
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4
0 -0.3042 0.3156 0.1292 -0.0977 0.5706
1 -0.5889 -0.1660 -0.5090 -0.6612 -0.6047
2 0.2482 0.7880 0.2081 0.2817 -0.2617
Note: Advanced contrastive learning features may require specific Row2Vec versions
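If your build does expose the contrastive mode, the entry point is the same learn_embedding function: its signature accepts contrastive parameters such as similar_pairs, dissimilar_pairs, auto_pairs, negative_samples, contrastive_loss, and margin. A hedged sketch (the argument values are illustrative, and accepted contrastive_loss names may differ by version):
try:
    contrastive_embeddings = learn_embedding(
        df,
        mode="contrastive",
        embedding_dim=5,
        auto_pairs=True,     # let Row2Vec generate training pairs (illustrative)
        negative_samples=5,  # illustrative value
        margin=1.0,          # illustrative value
        max_epochs=10,
        verbose=False,
        seed=42
    )
    print(f"Contrastive embeddings shape: {contrastive_embeddings.shape}")
except Exception as e:
    # Broad catch on purpose: unsupported versions may raise different errors
    print(f"Contrastive mode not available in this version: {e}")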
Model Serialization with Metadata#
Row2Vec provides powerful model serialization capabilities for production workflows. The system uses a two-file approach for transparency and efficiency:
Python script (.py): Contains inspectable metadata and loading logic
Binary file (.pkl): Contains the actual model weights and preprocessor
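Concretely, saving a model under a base path such as my_model produces that sibling pair, and load_model takes the script path (paths here are illustrative; the loader expects the .pkl alongside the .py):
from row2vec import load_model
# my_model.py  -> readable metadata and loading logic
# my_model.pkl -> pickled weights and fitted preprocessor
model = load_model("my_model.py")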
Key Features#
Transparent metadata: All training configuration and results stored in readable format
Complete pipeline preservation: Includes preprocessing steps, not just the model
Schema validation: Automatically validates input data against expected schema
Multiple model support: Works with neural networks and classical ML methods
Detailed training information: Loss curves, timing, data characteristics, and more
Advanced model saving with rich metadata:#
from row2vec import (
    learn_embedding_with_model,
    save_model,
    Row2VecModel,
    Row2VecModelMetadata
)
import tempfile
import os

# Train with full model information
embeddings, model, preprocessor, metadata = learn_embedding_with_model(
    df,
    embedding_dim=8,
    mode="unsupervised",
    max_epochs=20,
    batch_size=64,
    dropout_rate=0.3,
    hidden_units=256,
    verbose=False,
    seed=42
)
print(f"Training completed:")
print(f" Final loss: {metadata.get('final_loss', 'N/A')}")
print(f" Training time: {metadata.get('training_time', 0):.2f}s")
print(f" Epochs trained: {metadata.get('epochs_trained', 0)}")
Model: "functional_26"
(layers: input 10 → dense 256 → embedding 8 → dense 256 → output 10; dropout after each hidden layer)
Total params: 9,746 (38.07 KB)
Trainable params: 9,746 (38.07 KB)
Non-trainable params: 0 (0.00 B)
16/16 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
Training completed:
Final loss: None
Training time: 4.88s
Epochs trained: 20
# Create rich model object with metadata
row2vec_model = Row2VecModel(
    model=model,
    preprocessor=preprocessor,
    metadata=Row2VecModelMetadata.from_dict(metadata)
)

# Save model with comprehensive information
with tempfile.TemporaryDirectory() as tmpdir:
    model_path = os.path.join(tmpdir, "advanced_model")
    script_path, binary_path = save_model(row2vec_model, model_path)

    print("Model saved with metadata:")
    print(f"  Script: {os.path.basename(script_path)}")
    print(f"  Binary: {os.path.basename(binary_path)}")

    # Load and inspect metadata
    from row2vec import load_model
    loaded_model = load_model(script_path)

    print("\nLoaded model metadata:")
    print(f"  Data shape: {loaded_model.metadata.data_shape}")
    print(f"  Original columns: {len(loaded_model.metadata.original_columns) if loaded_model.metadata.original_columns else 0}")
    print(f"  Embedding dimension: {loaded_model.metadata.embedding_dim}")
    print(f"  Training epochs: {loaded_model.metadata.epochs_trained}")
    print(f"  Final loss: {loaded_model.metadata.final_loss}")
Model saved with metadata:
Script: advanced_model.py
Binary: advanced_model.pkl
Loaded model metadata:
Data shape: (500, 3)
Original columns: 3
Embedding dimension: 8
Training epochs: 20
Final loss: None
Schema Validation in Production#
Models automatically validate input data against the expected schema:
# Load model for validation demonstration
from row2vec import train_and_save_model, load_model, generate_synthetic_data
import tempfile
import os
with tempfile.TemporaryDirectory() as tmpdir:
    # Create a small model for demonstration
    sample_data = generate_synthetic_data(50, seed=42)
    embeddings, script_path, binary_path = train_and_save_model(
        sample_data,
        base_path=os.path.join(tmpdir, "validation_model"),
        embedding_dim=3,
        max_epochs=5,
        batch_size=32,  # must not exceed the 50-row dataset (the default of 64 raises ValueError)
        verbose=False
    )

    # Load the model
    model = load_model(script_path)

    # This will pass validation (correct schema)
    correct_data = generate_synthetic_data(10, seed=123)
    embeddings = model.predict(correct_data)
    print(f"✓ Validation passed: {embeddings.shape}")

    # This would fail validation (missing column)
    # incorrect_data = correct_data.drop(columns=["Sales"])
    # embeddings = model.predict(incorrect_data)  # Would raise ValueError

    # Skip validation if needed (not recommended for production)
    partial_data = correct_data[['Country', 'Product']]  # Missing columns
    try:
        embeddings_unvalidated = model.predict(partial_data, validate_schema=False)
        print(f"⚠️ Unvalidated prediction: {embeddings_unvalidated.shape}")
        print("   (Schema validation was skipped)")
    except Exception as e:
        print(f"Even unvalidated prediction failed: {e}")
Best Practices for Model Serialization#
Use descriptive base paths: Include version, date, or dataset info in model names
Enable training history: Keep detailed metadata for debugging and analysis
Validate schemas in production: Always use validate_schema=True for safety
Store models with data descriptions: Keep training data documentation alongside models
Version your models: Use systematic naming for model iterations
Test model loading: Always verify saved models can be loaded and used (see the round-trip sketch below)
# Example of production-ready model naming and metadata
import datetime
# Create descriptive model path with timestamp and version
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_name = f"customer_segmentation_v2.1_{timestamp}"
print(f"Production model naming example:")
print(f" Model name: {model_name}")
print(f" Files would be: {model_name}.py and {model_name}.pkl")
print(f" Include metadata: dataset version, feature engineering steps, validation scores")
Production model naming example:
Model name: customer_segmentation_v2.1_20251013_083201
Files would be: customer_segmentation_v2.1_20251013_083201.py and customer_segmentation_v2.1_20251013_083201.pkl
Include metadata: dataset version, feature engineering steps, validation scores
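The load-test item from the list above is easy to automate. A round-trip sketch reusing train_and_save_model and load_model from the validation example (the helper name smoke_test_model is ours):
import os
import tempfile
from row2vec import train_and_save_model, load_model, generate_synthetic_data

def smoke_test_model(df, base_path):
    """Train, save, reload, and predict once to verify the artifact is usable."""
    _, script_path, _ = train_and_save_model(
        df,
        base_path=base_path,
        embedding_dim=3,
        max_epochs=5,
        batch_size=32,
        verbose=False
    )
    model = load_model(script_path)
    preds = model.predict(df.head(5))
    assert preds.shape[0] == 5, "reloaded model should embed 5 rows"
    return preds.shape

with tempfile.TemporaryDirectory() as tmpdir:
    df_check = generate_synthetic_data(100, seed=0)
    print(smoke_test_model(df_check, os.path.join(tmpdir, "smoke_model")))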
Custom Configuration Objects#
Use configuration objects for complex setups:
from row2vec import (
    EmbeddingConfig,
    NeuralConfig,
    ScalingConfig,
    LoggingConfig
)

# Create comprehensive configuration
embedding_config = EmbeddingConfig(
    embedding_dim=6,
    mode="unsupervised",
    seed=42
)

neural_config = NeuralConfig(
    max_epochs=25,
    batch_size=128,
    dropout_rate=0.25,
    hidden_units=[512, 256],  # Multi-layer
    early_stopping=True,
    activation="relu"
)

scaling_config = ScalingConfig(
    method="standard",
    range=None  # Not applicable for standard scaling
)

logging_config = LoggingConfig(
    level="INFO",
    file=None,
    enabled=True
)
print("Configuration objects created:")
print(f" Embedding: {embedding_config.embedding_dim}D {embedding_config.mode}")
print(f" Neural: {neural_config.max_epochs} epochs, {neural_config.hidden_units} units")
print(f" Scaling: {scaling_config.method}")
print(f" Logging: {logging_config.level}")
Configuration objects created:
Embedding: 6D unsupervised
Neural: 25 epochs, [512, 256] units
Scaling: standard
Logging: INFO
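These objects are meant to compose: in the architecture-search example above, NeuralConfig nests inside EmbeddingConfig via the neural= argument. Whether scaling and logging nest the same way can vary by version, so treat the field names below as assumptions to verify against your build:
from row2vec import learn_embedding_v2

# Assumed nesting: scaling= and logging= mirror the neural= field shown earlier
full_config = EmbeddingConfig(
    embedding_dim=6,
    mode="unsupervised",
    seed=42,
    neural=neural_config,
    scaling=scaling_config,
    logging=logging_config
)
embeddings = learn_embedding_v2(df, config=full_config)
print(embeddings.shape)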
Performance Monitoring#
Built-in performance tracking:
from row2vec import get_logger
import time
# Enable performance logging
logger = get_logger()
# Logger is already configured with INFO level
# Monitor embedding generation
start_time = time.time()
embeddings_monitored = learn_embedding(
    df,
    mode="unsupervised",
    embedding_dim=10,
    max_epochs=20,
    batch_size=128,
    verbose=True,  # Enable verbose output
    seed=42
)
total_time = time.time() - start_time
print(f"\nPerformance summary:")
print(f" Total time: {total_time:.2f} seconds")
print(f" Records processed: {len(df)}")
print(f" Records per second: {len(df)/total_time:.1f}")
print(f" Final embeddings: {embeddings_monitored.shape}")
Model: "functional_30"
(layers: input 10 → dense 128 → embedding 10 → dense 128 → output 10; dropout after each hidden layer)
Total params: 5,396 (21.08 KB)
Trainable params: 5,396 (21.08 KB)
Non-trainable params: 0 (0.00 B)
Epoch 1/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 1s 43ms/step - loss: 0.2889 - val_loss: 0.2685
Epoch 2/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - loss: 0.2613 - val_loss: 0.2428
Epoch 3/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.2378 - val_loss: 0.2188
Epoch 4/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.2152 - val_loss: 0.1966
Epoch 5/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - loss: 0.1930 - val_loss: 0.1759
Epoch 6/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1798 - val_loss: 0.1569
Epoch 7/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1611 - val_loss: 0.1405
Epoch 8/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1464 - val_loss: 0.1272
Epoch 9/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1357 - val_loss: 0.1168
Epoch 10/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1257 - val_loss: 0.1085
Epoch 11/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1209 - val_loss: 0.1014
Epoch 12/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1144 - val_loss: 0.0945
Epoch 13/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1051 - val_loss: 0.0873
Epoch 14/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.1026 - val_loss: 0.0796
Epoch 15/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0944 - val_loss: 0.0719
Epoch 16/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0849 - val_loss: 0.0646
Epoch 17/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0799 - val_loss: 0.0576
Epoch 18/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0744 - val_loss: 0.0508
Epoch 19/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0681 - val_loss: 0.0448
Epoch 20/20
4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - loss: 0.0630 - val_loss: 0.0398
Performance summary:
Total time: 2.25 seconds
Records processed: 500
Records per second: 222.6
Final embeddings: (500, 10)
Production Considerations#
Key settings for production use:
# Production-optimized configuration
production_embeddings = learn_embedding(
    df,
    mode="unsupervised",
    embedding_dim=8,          # Appropriate for this dataset's feature count
    max_epochs=50,            # Reduced for demo (use ~100 in production)
    batch_size=64,            # Appropriate for the dataset size
    dropout_rate=0.2,         # Conservative regularization
    hidden_units=[512, 256],  # Deep architecture
    early_stopping=True,      # Prevent overfitting
    scale_method="standard",  # Standardized outputs
    seed=42,                  # Reproducible results
    verbose=False
)

print("Production embeddings generated:")
print(f"  Shape: {production_embeddings.shape}")
print(f"  Value range: [{production_embeddings.min().min():.3f}, {production_embeddings.max().max():.3f}]")
print(f"  Mean: {production_embeddings.mean().mean():.3f}")
print(f"  Std: {production_embeddings.std().mean():.3f}")
Model: "functional_32"
(layers: input 10 → encoder 512, 256 → embedding 8 → decoder 256, 512 → output 10; dropout after each hidden layer)
Total params: 278,034 (1.06 MB)
Trainable params: 278,034 (1.06 MB)
Non-trainable params: 0 (0.00 B)
Production embeddings generated:
Shape: (500, 8)
Value range: [-2.494, 2.758]
Mean: 0.000
Std: 1.001
Next Steps#
You now know Row2Vec’s advanced capabilities! For more:
📖 CLI Guide - Batch processing and automation
🔍 API Reference - Complete parameter documentation
🏠 Back to Examples for practical applications