Adult Dataset Example#

The Adult dataset (48,842 census records) demonstrates how Row2Vec handles larger datasets with high-cardinality categorical features.

Load and Explore Data#

# Run the local output-suppression helper first
exec(open('suppress_minimal.py').read())

import pandas as pd
import numpy as np
from row2vec import learn_embedding
import os

# Load Adult dataset
data_path = os.path.join('..', 'data', 'adult.csv')
df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
✓ Enhanced minimal suppression active
Dataset shape: (48842, 15)

Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
# Preview the data
print("First 5 records:")
print(df.head())
First 5 records:
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0              13  United-States  <=50K  
2             0             0              40  United-States  <=50K  
3             0             0              40  United-States  <=50K  
4             0             0              40           Cuba  <=50K  
# Check for high-cardinality categoricals
print("Categorical column cardinalities:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    unique_count = df[col].nunique()
    print(f"  {col}: {unique_count} unique values")
    if unique_count <= 10:  # Show values for low-cardinality columns
        print(f"    Values: {df[col].unique()[:10].tolist()}")
Categorical column cardinalities:
  workclass: 8 unique values
    Values: ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', nan, 'Self-emp-inc', 'Without-pay', 'Never-worked']
  education: 16 unique values
  marital-status: 7 unique values
    Values: ['Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed']
  occupation: 14 unique values
  relationship: 6 unique values
    Values: ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative']
  race: 5 unique values
    Values: ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
  sex: 2 unique values
    Values: ['Male', 'Female']
  native-country: 41 unique values
  income: 2 unique values
    Values: ['<=50K', '>50K']

Data Preprocessing#

# Drop columns that aren't useful as embedding inputs: fnlwgt is a survey
# weight, education-num duplicates education, and income is held out as the
# downstream prediction target
cols_to_drop = ['fnlwgt', 'education-num', 'income']
df_features = df.drop(columns=cols_to_drop)

print(f"Features for embedding: {df_features.columns.tolist()}")
print(f"Shape: {df_features.shape}")

# Check missing values
missing = df_features.isnull().sum()
if missing.any():
    print(f"\nMissing values:")
    print(missing[missing > 0])
else:
    print("\nNo missing values detected")
Features for embedding: ['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
Shape: (48842, 12)

Missing values:
workclass         2799
occupation        2809
native-country     857
dtype: int64
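
The examples below pass the data through unchanged and rely on Row2Vec's internal preprocessing to cope with these gaps. If you prefer to handle them explicitly, one simple option (a generic pandas sketch, not a Row2Vec requirement) is to tag missing categories with a sentinel value:

# Optional: make missingness an explicit category before embedding
df_explicit = df_features.copy()
for col in ['workclass', 'occupation', 'native-country']:
    df_explicit[col] = df_explicit[col].fillna('Unknown')

print(df_explicit.isnull().sum().sum())  # 0 -- only these three columns had gaps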

Unsupervised Embeddings#

Generate embeddings for the full feature set, sampling rows to keep the demo fast:

# Sample for faster demo (remove sampling for full dataset)
df_sample = df_features.sample(n=5000, random_state=42)
print(f"Working with sample of {len(df_sample)} records")

# Generate 10D embeddings
embeddings = learn_embedding(
    df_sample,
    mode="unsupervised",
    embedding_dim=10,
    max_epochs=30,
    batch_size=128,
    dropout_rate=0.3,
    hidden_units=256,
    verbose=False,
    seed=42
)

print(f"\nEmbeddings shape: {embeddings.shape}")
print("\nEmbedding statistics:")
print(embeddings.describe().round(3))
Working with sample of 5000 records
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 101)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 256)            │        26,112 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 256)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 10)             │         2,570 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 256)            │         2,816 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_1 (Dropout)             │ (None, 256)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 101)            │        25,957 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 57,455 (224.43 KB)
 Trainable params: 57,455 (224.43 KB)
 Non-trainable params: 0 (0.00 B)
Embeddings shape: (5000, 10)

Embedding statistics:
       embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
count     5000.000     5000.000     5000.000     5000.000     5000.000   
mean         0.038       -0.075       -0.226       -0.087        0.187   
std          0.595        0.610        0.803        0.716        0.645   
min         -1.361       -3.390       -2.058       -1.871       -1.846   
25%         -0.374       -0.437       -0.829       -0.612       -0.213   
50%         -0.015       -0.017       -0.280       -0.049        0.154   
75%          0.403        0.344        0.329        0.428        0.512   
max          2.831        1.868        6.003        1.970        6.058   

       embedding_5  embedding_6  embedding_7  embedding_8  embedding_9  
count     5000.000     5000.000     5000.000     5000.000     5000.000  
mean        -0.028        0.067       -0.085        0.046        0.101  
std          0.751        0.672        0.668        0.586        0.645  
min         -2.666       -3.178       -3.595       -3.724       -1.519  
25%         -0.515       -0.300       -0.448       -0.256       -0.361  
50%         -0.070        0.031       -0.045        0.087        0.028  
75%          0.414        0.452        0.295        0.409        0.478  
max          5.181        3.295        3.443        2.084        4.513  
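
The result is a plain DataFrame with one row per input record, so the embeddings can be aligned with the original sample for inspection (a sketch that assumes learn_embedding preserves input row order; verify this for your version):

# Attach embeddings to the sampled records, aligning by position
combined = pd.concat(
    [df_sample.reset_index(drop=True), embeddings.reset_index(drop=True)],
    axis=1,
)
print(combined[['age', 'occupation', 'embedding_0', 'embedding_1']].head())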

High-Cardinality Categorical Embeddings#

The Adult dataset's occupation column, with 14 distinct categories, is a natural candidate for target-based embeddings:

# Get occupation counts
occupation_counts = df['occupation'].value_counts()
print(f"Occupation column: {len(occupation_counts)} unique values")
print("\nTop 10 occupations:")
print(occupation_counts.head(10))
Occupation column: 14 unique values

Top 10 occupations:
occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Name: count, dtype: int64
# Learn embeddings for occupations
# df_sample already contains the occupation column, so we can use it directly
occupation_embeddings = learn_embedding(
    df_sample,
    mode="target",
    reference_column="occupation",
    embedding_dim=3,
    max_epochs=40,
    batch_size=128,
    verbose=False,
    seed=42
)

print(f"Occupation embeddings shape: {occupation_embeddings.shape}")
print("\nOccupation embeddings (3D):")
print(occupation_embeddings.round(3))
Model: "functional_2"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_1 (InputLayer)      │ (None, 87)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_3 (Dense)                 │ (None, 128)            │        11,264 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_2 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 3)              │           387 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_4 (Dense)                 │ (None, 14)             │            56 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 11,707 (45.73 KB)
 Trainable params: 11,707 (45.73 KB)
 Non-trainable params: 0 (0.00 B)
Occupation embeddings shape: (14, 3)

Occupation embeddings (3D):
          embedding_0  embedding_1  embedding_2
category                                       
0               2.445        0.873        2.140
1              -1.734       -0.412        1.660
2               2.864       -1.616        0.076
3               3.826        0.851        0.230
4               3.530       -1.690       -1.161
5               2.071       -1.488        1.006
6               2.460       -1.688        1.049
7               2.370       -0.589        1.986
8               1.258        0.616        2.206
9               3.094        2.700        0.074
10              0.376        0.111        0.949
11              3.734        0.032        0.748
12              2.563        0.601        1.171
13              2.398       -1.639        0.325
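
The index holds encoded category codes rather than occupation names. If the internal encoder assigns codes in sorted label order (true of pandas' Categorical and scikit-learn's LabelEncoder, but an assumption about Row2Vec's internals worth verifying), the names can be restored as below; the same trick applies to the workclass embeddings later on:

# Map integer category codes back to occupation labels (assumes codes
# follow sorted label order -- verify against your encoder before relying on it)
labels = sorted(df_sample['occupation'].dropna().unique())
assert len(labels) == len(occupation_embeddings)

named_embeddings = occupation_embeddings.copy()
named_embeddings.index = labels
print(named_embeddings.round(3))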

Analyze Occupation Relationships#

import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Calculate similarity matrix
similarity_matrix = cosine_similarity(occupation_embeddings.values)
occupations = occupation_embeddings.index.tolist()

print("Most similar occupation pairs:")
print("-" * 40)

# Find most similar pairs
for i in range(len(occupations)):
    for j in range(i+1, len(occupations)):
        similarity = similarity_matrix[i][j]
        if similarity > 0.8:  # High similarity threshold
            print(f"{occupations[i]} <-> {occupations[j]}: {similarity:.3f}")
Most similar occupation pairs:
----------------------------------------
0 <-> 3: 0.802
0 <-> 7: 0.900
0 <-> 8: 0.948
0 <-> 10: 0.882
0 <-> 11: 0.840
0 <-> 12: 0.959
2 <-> 4: 0.950
2 <-> 5: 0.933
2 <-> 6: 0.947
2 <-> 11: 0.854
2 <-> 13: 0.992
3 <-> 9: 0.878
3 <-> 11: 0.969
3 <-> 12: 0.936
4 <-> 13: 0.910
5 <-> 6: 0.999
5 <-> 7: 0.902
5 <-> 11: 0.808
5 <-> 13: 0.965
6 <-> 7: 0.895
6 <-> 11: 0.823
6 <-> 13: 0.974
7 <-> 8: 0.851
7 <-> 10: 0.838
7 <-> 11: 0.861
7 <-> 12: 0.887
8 <-> 10: 0.982
8 <-> 12: 0.820
9 <-> 12: 0.815
11 <-> 12: 0.954
11 <-> 13: 0.821
# Visualize occupation embeddings in 2D (project from 3D to 2D)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
occupation_2d = pca.fit_transform(occupation_embeddings.values)

plt.figure(figsize=(12, 8))
plt.scatter(occupation_2d[:, 0], occupation_2d[:, 1], s=100, alpha=0.7)

# Label each point
for i, occupation in enumerate(occupations):
    plt.annotate(
        occupation,
        (occupation_2d[i, 0], occupation_2d[i, 1]),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=8,
        alpha=0.8
    )

plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Occupation Embeddings Visualization (2D Projection)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Notice how similar occupations cluster together!")
[Figure: Occupation Embeddings Visualization (2D Projection)]
Notice how similar occupations cluster together!

Work Class Embeddings#

# Learn embeddings for work class
# df_sample already contains the workclass column, so we can use it directly
workclass_embeddings = learn_embedding(
    df_sample,
    mode="target",
    reference_column="workclass",
    embedding_dim=2,
    max_epochs=30,
    verbose=False,
    seed=42
)

print(f"Work class embeddings shape: {workclass_embeddings.shape}")
print("\nWork class embeddings:")
print(workclass_embeddings.round(3))
Model: "functional_4"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_2 (InputLayer)      │ (None, 94)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_5 (Dense)                 │ (None, 128)            │        12,160 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_3 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 2)              │           258 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_6 (Dense)                 │ (None, 7)              │            21 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 12,439 (48.59 KB)
 Trainable params: 12,439 (48.59 KB)
 Non-trainable params: 0 (0.00 B)
Work class embeddings shape: (7, 2)

Work class embeddings:
          embedding_0  embedding_1
category                          
0              -3.429        1.326
1              -3.981        0.188
2              -4.749        2.017
3              -2.858        1.971
4              -2.570        2.595
5              -3.861        0.506
6               0.569        2.710
# Visualize work class embeddings
plt.figure(figsize=(10, 6))
plt.scatter(workclass_embeddings.iloc[:, 0], workclass_embeddings.iloc[:, 1], s=150, alpha=0.7)

for i, workclass in enumerate(workclass_embeddings.index):
    plt.annotate(
        workclass,
        (workclass_embeddings.iloc[i, 0], workclass_embeddings.iloc[i, 1]),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=10
    )

plt.xlabel('Embedding Dimension 0')
plt.ylabel('Embedding Dimension 1')
plt.title('Work Class Embeddings')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
[Figure: Work Class Embeddings]

Compare Methods on Adult Data#

import time

# Compare different methods
methods = {
    "Neural": {"mode": "unsupervised", "max_epochs": 20},
    "PCA": {"mode": "pca"},
    "t-SNE": {"mode": "tsne", "perplexity": 50}
}

# Use smaller sample for t-SNE (it's slow)
small_sample = df_features.sample(n=1000, random_state=42)

results = {}
for name, params in methods.items():
    print(f"Running {name}...")
    start = time.time()

    emb = learn_embedding(
        small_sample,
        embedding_dim=2,
        verbose=False,
        seed=42,
        **params
    )

    elapsed = time.time() - start
    results[name] = {
        "time": elapsed,
        "shape": emb.shape,
        "mean": emb.mean().mean(),
        "std": emb.std().mean()
    }

print("\nMethod comparison results:")
print("-" * 60)
for method, stats in results.items():
    print(f"{method:8} | Time: {stats['time']:6.2f}s | Mean: {stats['mean']:7.3f} | Std: {stats['std']:6.3f}")
Running Neural...
Model: "functional_6"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_3 (InputLayer)      │ (None, 87)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_7 (Dense)                 │ (None, 128)            │        11,264 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_4 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 2)              │           258 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_8 (Dense)                 │ (None, 128)            │           384 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_5 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_9 (Dense)                 │ (None, 87)             │        11,223 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 23,129 (90.35 KB)
 Trainable params: 23,129 (90.35 KB)
 Non-trainable params: 0 (0.00 B)
Running PCA...
Running t-SNE...
Method comparison results:
------------------------------------------------------------
Neural   | Time:   2.61s | Mean:  -0.018 | Std:  0.853
PCA      | Time:   0.05s | Mean:   0.000 | Std:  1.104
t-SNE    | Time:   3.05s | Mean:  -0.509 | Std: 15.800
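
The summary statistics only capture scale; to compare the structure each method actually recovers, the three 2-D embeddings can be plotted side by side (a sketch that re-runs each method with the same settings as above):

# Visual comparison of the three methods on the same sample
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, params) in zip(axes, methods.items()):
    emb = learn_embedding(
        small_sample, embedding_dim=2, verbose=False, seed=42, **params
    )
    ax.scatter(emb.iloc[:, 0], emb.iloc[:, 1], s=5, alpha=0.5)
    ax.set_title(name)
plt.tight_layout()
plt.show()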

Feature Engineering with Embeddings#

Use embeddings as features for income prediction:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Prepare target variable
income_binary = (df.loc[df_sample.index, 'income'] == '>50K').astype(int)

print(f"Income distribution in sample:")
print(f"<=50K: {(income_binary == 0).sum()} ({(income_binary == 0).mean():.1%})")
print(f">50K:  {(income_binary == 1).sum()} ({(income_binary == 1).mean():.1%})")

# Use embeddings as features
X = embeddings
y = income_binary

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nIncome prediction using embeddings:")
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['<=50K', '>50K']))
Income distribution in sample:
<=50K: 3819 (76.4%)
>50K:  1181 (23.6%)
Income prediction using embeddings:
Accuracy: 0.812

Classification Report:
              precision    recall  f1-score   support

       <=50K       0.85      0.91      0.88       764
        >50K       0.63      0.49      0.55       236

    accuracy                           0.81      1000
   macro avg       0.74      0.70      0.72      1000
weighted avg       0.80      0.81      0.80      1000
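
For context, the same classifier can be trained on one-hot encoded raw features; the 10-D embeddings compress roughly 100 input columns down to 10, so some accuracy gap relative to the raw features is expected (a sketch reusing the split settings from above):

# Baseline: identical classifier on one-hot encoded raw features
X_raw = pd.get_dummies(df_sample)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

clf_raw = RandomForestClassifier(n_estimators=100, random_state=42)
clf_raw.fit(Xr_train, yr_train)
print(f"Raw-feature accuracy: {accuracy_score(yr_test, clf_raw.predict(Xr_test)):.3f}")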

Scaling for Different Ranges#

# Generate scaled embeddings for different use cases
scaled_embeddings = learn_embedding(
    df_sample,
    mode="unsupervised",
    embedding_dim=5,
    max_epochs=20,
    scale_method="minmax",
    scale_range=(-1.0, 1.0),
    verbose=False,
    seed=42
)

print("Scaled embeddings statistics:")
print(f"Min: {scaled_embeddings.min().min():.3f}")
print(f"Max: {scaled_embeddings.max().max():.3f}")
print(f"Mean: {scaled_embeddings.mean().mean():.3f}")
print(f"Std: {scaled_embeddings.std().mean():.3f}")

print("\nFirst 5 scaled embeddings:")
print(scaled_embeddings.head().round(3))
Model: "functional_8"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_4 (InputLayer)      │ (None, 101)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_10 (Dense)                │ (None, 128)            │        13,056 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_6 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 5)              │           645 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_11 (Dense)                │ (None, 128)            │           768 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_7 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_12 (Dense)                │ (None, 101)            │        13,029 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 27,498 (107.41 KB)
 Trainable params: 27,498 (107.41 KB)
 Non-trainable params: 0 (0.00 B)
Scaled embeddings statistics:
Min: -1.000
Max: 1.000
Mean: 0.021
Std: 0.202

First 5 scaled embeddings:
   embedding_0  embedding_1  embedding_2  embedding_3  embedding_4
0       -0.480        0.033        0.909       -0.204       -0.392
1       -0.530        0.041        0.764        0.148       -0.143
2       -0.386       -0.030        0.662       -0.307       -0.158
3       -0.478       -0.107        0.718        0.108       -0.104
4       -0.096       -0.092        0.719       -0.374       -0.177
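
A bounded range like [-1, 1] is convenient for distance-based consumers. As a quick usage example (not part of the original workflow), the scaled embeddings can feed directly into k-means clustering:

from sklearn.cluster import KMeans

# Cluster records in the scaled embedding space; k=4 is arbitrary here
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
clusters = kmeans.fit_predict(scaled_embeddings.values)
print(pd.Series(clusters).value_counts().sort_index())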

Production Model#

Create a model ready for deployment:

from row2vec import train_and_save_model
import tempfile
import os

with tempfile.TemporaryDirectory() as tmpdir:
    model_path = os.path.join(tmpdir, "adult_model")

    # Train production model on larger dataset
    production_sample = df_features.sample(n=10000, random_state=42)

    embeddings_prod, script_path, binary_path = train_and_save_model(
        production_sample,
        base_path=model_path,
        embedding_dim=15,
        mode="unsupervised",
        max_epochs=50,
        batch_size=256,
        dropout_rate=0.25,
        hidden_units=512,
        scale_method="standard",
        verbose=False,
        seed=42
    )

    print(f"Production model saved: {os.path.basename(script_path)}")
    print(f"Final embeddings shape: {embeddings_prod.shape}")

    # Load and test
    from row2vec import load_model
    model = load_model(script_path)

    # Test on new data
    test_data = df_features.sample(n=100, random_state=999)
    test_embeddings = model.predict(test_data)

    print(f"\nModel successfully applied to new data:")
    print(f"Test embeddings shape: {test_embeddings.shape}")
    print(f"Model training time: {model.metadata.training_time:.2f} seconds")
Model: "functional_10"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                     Output Shape                  Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_5 (InputLayer)      │ (None, 103)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_13 (Dense)                │ (None, 512)            │        53,248 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_8 (Dropout)             │ (None, 512)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 15)             │         7,695 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_14 (Dense)                │ (None, 512)            │         8,192 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_9 (Dropout)             │ (None, 512)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_15 (Dense)                │ (None, 103)            │        52,839 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 121,974 (476.46 KB)
 Trainable params: 121,974 (476.46 KB)
 Non-trainable params: 0 (0.00 B)
313/313 ━━━━━━━━━━━━━━━━━━━━ 0s 698us/step
Production model saved: adult_model.py
Final embeddings shape: (10000, 15)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 30
     28 # Load and test
     29 from row2vec import load_model
---> 30 model = load_model(script_path)
     32 # Test on new data
     33 test_data = df_features.sample(n=100, random_state=999)

File ~/work/row2vec/row2vec/row2vec/serialization.py:398, in load_model(script_path)
    392 # Execute the script in a controlled namespace
    393 # Add the script directory to help find the binary file
    394 namespace = {
    395     "__script_dir__": str(script_path.parent),
    396     "__script_path__": str(script_path),
    397 }
--> 398 exec(script_path.read_text(encoding="utf-8"), namespace)
    400 # Get the load function from the script
    401 if "load_model" not in namespace:

File <string>:43

NameError: name 'nan' is not defined
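
The NameError indicates that the generated loader script contains a bare `nan` token with no matching import, most likely the repr of a missing value captured at save time. If that diagnosis is correct, training the production model on NaN-free rows may sidestep the problem; the following is an untested sketch, not a confirmed fix:

import os
import tempfile

# Hypothetical workaround: train on rows without missing values so the
# saved script never embeds a bare `nan` literal (untested sketch)
with tempfile.TemporaryDirectory() as tmpdir:
    clean_sample = df_features.dropna().sample(n=10000, random_state=42)

    _, script_path, _ = train_and_save_model(
        clean_sample,
        base_path=os.path.join(tmpdir, "adult_model_clean"),
        embedding_dim=15,
        mode="unsupervised",
        max_epochs=50,
        verbose=False,
        seed=42,
    )

    model = load_model(script_path)
    test_embeddings = model.predict(df_features.dropna().sample(n=100, random_state=999))
    print(f"Test embeddings shape: {test_embeddings.shape}")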

Key Insights#

  1. High-Cardinality Handling: Row2Vec copes with columns like native-country (41 categories) without manual encoding

  2. Occupation Relationships: Similar jobs land close together in embedding space

  3. Scalability: The full 48K-record dataset is handled efficiently, with sampling used here to keep the demo fast

  4. Feature Quality: A 10-dimensional embedding reaches 81% accuracy on income prediction

  5. Production Ready: Models are straightforward to save and deploy, though note the load_model failure above

Next Steps#