Adult Dataset Example#
The Adult dataset demonstrates Row2Vec’s capabilities with high-cardinality categorical features and larger datasets.
Load and Explore Data#
# Import complete suppression first
exec(open('suppress_minimal.py').read())
import pandas as pd
import numpy as np
from row2vec import learn_embedding
import os
# Load Adult dataset
data_path = os.path.join('..', 'data', 'adult.csv')
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
✓ Enhanced minimal suppression active
Dataset shape: (48842, 15)
Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
# Preview the data
print("First 5 records:")
print(df.head())
First 5 records:
age workclass fnlwgt education education-num \
0 39 State-gov 77516 Bachelors 13
1 50 Self-emp-not-inc 83311 Bachelors 13
2 38 Private 215646 HS-grad 9
3 53 Private 234721 11th 7
4 28 Private 338409 Bachelors 13
marital-status occupation relationship race sex \
0 Never-married Adm-clerical Not-in-family White Male
1 Married-civ-spouse Exec-managerial Husband White Male
2 Divorced Handlers-cleaners Not-in-family White Male
3 Married-civ-spouse Handlers-cleaners Husband Black Male
4 Married-civ-spouse Prof-specialty Wife Black Female
capital-gain capital-loss hours-per-week native-country income
0 2174 0 40 United-States <=50K
1 0 0 13 United-States <=50K
2 0 0 40 United-States <=50K
3 0 0 40 United-States <=50K
4 0 0 40 Cuba <=50K
# Check for high-cardinality categoricals
print("Categorical column cardinalities:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
unique_count = df[col].nunique()
print(f" {col}: {unique_count} unique values")
if unique_count <= 10: # Show values for low-cardinality columns
print(f" Values: {df[col].unique()[:10].tolist()}")
Categorical column cardinalities:
workclass: 8 unique values
Values: ['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', nan, 'Self-emp-inc', 'Without-pay', 'Never-worked']
education: 16 unique values
marital-status: 7 unique values
Values: ['Never-married', 'Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Separated', 'Married-AF-spouse', 'Widowed']
occupation: 14 unique values
relationship: 6 unique values
Values: ['Not-in-family', 'Husband', 'Wife', 'Own-child', 'Unmarried', 'Other-relative']
race: 5 unique values
Values: ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
sex: 2 unique values
Values: ['Male', 'Female']
native-country: 41 unique values
income: 2 unique values
Values: ['<=50K', '>50K']
Data Preprocessing#
# Remove columns not useful for general embeddings
cols_to_drop = ['fnlwgt', 'education-num', 'income'] # fnlwgt is just a weight, education-num duplicates education
df_features = df.drop(columns=cols_to_drop)
print(f"Features for embedding: {df_features.columns.tolist()}")
print(f"Shape: {df_features.shape}")
# Check missing values
missing = df_features.isnull().sum()
if missing.any():
print(f"\nMissing values:")
print(missing[missing > 0])
else:
print("\nNo missing values detected")
Features for embedding: ['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
Shape: (48842, 12)
Missing values:
workclass 2799
occupation 2809
native-country 857
dtype: int64
Unsupervised Embeddings#
Generate embeddings for the entire dataset:
# Sample for faster demo (remove sampling for full dataset)
df_sample = df_features.sample(n=5000, random_state=42)
print(f"Working with sample of {len(df_sample)} records")
# Generate 10D embeddings
embeddings = learn_embedding(
df_sample,
mode="unsupervised",
embedding_dim=10,
max_epochs=30,
batch_size=128,
dropout_rate=0.3,
hidden_units=256,
verbose=False,
seed=42
)
print(f"\nEmbeddings shape: {embeddings.shape}")
print("\nEmbedding statistics:")
print(embeddings.describe().round(3))
Working with sample of 5000 records
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer (InputLayer) │ (None, 101) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense (Dense) │ (None, 256) │ 26,112 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout (Dropout) │ (None, 256) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ embedding (Dense) │ (None, 10) │ 2,570 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_1 (Dense) │ (None, 256) │ 2,816 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_1 (Dropout) │ (None, 256) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_2 (Dense) │ (None, 101) │ 25,957 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 57,455 (224.43 KB)
Trainable params: 57,455 (224.43 KB)
Non-trainable params: 0 (0.00 B)
Embeddings shape: (5000, 10)
Embedding statistics:
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4 \
count 5000.000 5000.000 5000.000 5000.000 5000.000
mean 0.038 -0.075 -0.226 -0.087 0.187
std 0.595 0.610 0.803 0.716 0.645
min -1.361 -3.390 -2.058 -1.871 -1.846
25% -0.374 -0.437 -0.829 -0.612 -0.213
50% -0.015 -0.017 -0.280 -0.049 0.154
75% 0.403 0.344 0.329 0.428 0.512
max 2.831 1.868 6.003 1.970 6.058
embedding_5 embedding_6 embedding_7 embedding_8 embedding_9
count 5000.000 5000.000 5000.000 5000.000 5000.000
mean -0.028 0.067 -0.085 0.046 0.101
std 0.751 0.672 0.668 0.586 0.645
min -2.666 -3.178 -3.595 -3.724 -1.519
25% -0.515 -0.300 -0.448 -0.256 -0.361
50% -0.070 0.031 -0.045 0.087 0.028
75% 0.414 0.452 0.295 0.409 0.478
max 5.181 3.295 3.443 2.084 4.513
High-Cardinality Categorical Embeddings#
The Adult dataset’s ‘occupation’ column has many categories - perfect for target-based embeddings:
# Get occupation counts
occupation_counts = df['occupation'].value_counts()
print(f"Occupation column: {len(occupation_counts)} unique values")
print("\nTop 10 occupations:")
print(occupation_counts.head(10))
Occupation column: 14 unique values
Top 10 occupations:
occupation
Prof-specialty 6172
Craft-repair 6112
Exec-managerial 6086
Adm-clerical 5611
Sales 5504
Other-service 4923
Machine-op-inspct 3022
Transport-moving 2355
Handlers-cleaners 2072
Farming-fishing 1490
Name: count, dtype: int64
# Learn embeddings for occupations
# df_sample already contains occupation column, so we can use it directly
occupation_embeddings = learn_embedding(
df_sample,
mode="target",
reference_column="occupation",
embedding_dim=3,
max_epochs=40,
batch_size=128,
verbose=False,
seed=42
)
print(f"Occupation embeddings shape: {occupation_embeddings.shape}")
print("\nOccupation embeddings (3D):")
print(occupation_embeddings.round(3))
Model: "functional_2"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer_1 (InputLayer) │ (None, 87) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_3 (Dense) │ (None, 128) │ 11,264 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_2 (Dropout) │ (None, 128) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ embedding (Dense) │ (None, 3) │ 387 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_4 (Dense) │ (None, 14) │ 56 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 11,707 (45.73 KB)
Trainable params: 11,707 (45.73 KB)
Non-trainable params: 0 (0.00 B)
Occupation embeddings shape: (14, 3)
Occupation embeddings (3D):
embedding_0 embedding_1 embedding_2
category
0 2.445 0.873 2.140
1 -1.734 -0.412 1.660
2 2.864 -1.616 0.076
3 3.826 0.851 0.230
4 3.530 -1.690 -1.161
5 2.071 -1.488 1.006
6 2.460 -1.688 1.049
7 2.370 -0.589 1.986
8 1.258 0.616 2.206
9 3.094 2.700 0.074
10 0.376 0.111 0.949
11 3.734 0.032 0.748
12 2.563 0.601 1.171
13 2.398 -1.639 0.325
Analyze Occupation Relationships#
# Verify occupation_embeddings exists
if 'occupation_embeddings' not in locals():
print("ERROR: occupation_embeddings not found, recreating...")
occupation_embeddings = learn_embedding(
df_sample,
mode="target",
reference_column="occupation",
embedding_dim=3,
max_epochs=40,
batch_size=128,
verbose=False,
seed=42
)
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
# Calculate similarity matrix
similarity_matrix = cosine_similarity(occupation_embeddings.values)
occupations = occupation_embeddings.index.tolist()
print("Most similar occupation pairs:")
print("-" * 40)
# Find most similar pairs
for i in range(len(occupations)):
for j in range(i+1, len(occupations)):
similarity = similarity_matrix[i][j]
if similarity > 0.8: # High similarity threshold
print(f"{occupations[i]} <-> {occupations[j]}: {similarity:.3f}")
Most similar occupation pairs:
----------------------------------------
0 <-> 3: 0.802
0 <-> 7: 0.900
0 <-> 8: 0.948
0 <-> 10: 0.882
0 <-> 11: 0.840
0 <-> 12: 0.959
2 <-> 4: 0.950
2 <-> 5: 0.933
2 <-> 6: 0.947
2 <-> 11: 0.854
2 <-> 13: 0.992
3 <-> 9: 0.878
3 <-> 11: 0.969
3 <-> 12: 0.936
4 <-> 13: 0.910
5 <-> 6: 0.999
5 <-> 7: 0.902
5 <-> 11: 0.808
5 <-> 13: 0.965
6 <-> 7: 0.895
6 <-> 11: 0.823
6 <-> 13: 0.974
7 <-> 8: 0.851
7 <-> 10: 0.838
7 <-> 11: 0.861
7 <-> 12: 0.887
8 <-> 10: 0.982
8 <-> 12: 0.820
9 <-> 12: 0.815
11 <-> 12: 0.954
11 <-> 13: 0.821
# Verify occupation_embeddings exists
if 'occupation_embeddings' not in locals():
print("ERROR: occupation_embeddings not found, recreating...")
occupation_embeddings = learn_embedding(
df_sample,
mode="target",
reference_column="occupation",
embedding_dim=3,
max_epochs=40,
batch_size=128,
verbose=False,
seed=42
)
occupations = occupation_embeddings.index.tolist()
# Visualize occupation embeddings in 2D (project from 3D to 2D)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
occupation_2d = pca.fit_transform(occupation_embeddings.values)
plt.figure(figsize=(12, 8))
plt.scatter(occupation_2d[:, 0], occupation_2d[:, 1], s=100, alpha=0.7)
# Label each point
for i, occupation in enumerate(occupations):
plt.annotate(
occupation,
(occupation_2d[i, 0], occupation_2d[i, 1]),
xytext=(5, 5),
textcoords='offset points',
fontsize=8,
alpha=0.8
)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Occupation Embeddings Visualization (2D Projection)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("Notice how similar occupations cluster together!")
Notice how similar occupations cluster together!
Work Class Embeddings#
# Learn embeddings for work class
# df_sample already contains workclass column, so we can use it directly
workclass_embeddings = learn_embedding(
df_sample,
mode="target",
reference_column="workclass",
embedding_dim=2,
max_epochs=30,
verbose=False,
seed=42
)
print(f"Work class embeddings shape: {workclass_embeddings.shape}")
print("\nWork class embeddings:")
print(workclass_embeddings.round(3))
Model: "functional_4"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer_2 (InputLayer) │ (None, 94) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_5 (Dense) │ (None, 128) │ 12,160 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_3 (Dropout) │ (None, 128) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ embedding (Dense) │ (None, 2) │ 258 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_6 (Dense) │ (None, 7) │ 21 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 12,439 (48.59 KB)
Trainable params: 12,439 (48.59 KB)
Non-trainable params: 0 (0.00 B)
Work class embeddings shape: (7, 2)
Work class embeddings:
embedding_0 embedding_1
category
0 -3.429 1.326
1 -3.981 0.188
2 -4.749 2.017
3 -2.858 1.971
4 -2.570 2.595
5 -3.861 0.506
6 0.569 2.710
# Verify workclass_embeddings exists
if 'workclass_embeddings' not in locals():
print("ERROR: workclass_embeddings not found, recreating...")
workclass_embeddings = learn_embedding(
df_sample,
mode="target",
reference_column="workclass",
embedding_dim=2,
max_epochs=30,
verbose=False,
seed=42
)
# Visualize work class embeddings
plt.figure(figsize=(10, 6))
plt.scatter(workclass_embeddings.iloc[:, 0], workclass_embeddings.iloc[:, 1], s=150, alpha=0.7)
for i, workclass in enumerate(workclass_embeddings.index):
plt.annotate(
workclass,
(workclass_embeddings.iloc[i, 0], workclass_embeddings.iloc[i, 1]),
xytext=(5, 5),
textcoords='offset points',
fontsize=10
)
plt.xlabel('Embedding Dimension 0')
plt.ylabel('Embedding Dimension 1')
plt.title('Work Class Embeddings')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
Compare Methods on Adult Data#
import time
# Compare different methods
methods = {
"Neural": {"mode": "unsupervised", "max_epochs": 20},
"PCA": {"mode": "pca"},
"t-SNE": {"mode": "tsne", "perplexity": 50}
}
# Use smaller sample for t-SNE (it's slow)
small_sample = df_features.sample(n=1000, random_state=42)
results = {}
for name, params in methods.items():
print(f"Running {name}...")
start = time.time()
emb = learn_embedding(
small_sample,
embedding_dim=2,
verbose=False,
seed=42,
**params
)
elapsed = time.time() - start
results[name] = {
"time": elapsed,
"shape": emb.shape,
"mean": emb.mean().mean(),
"std": emb.std().mean()
}
print("\nMethod comparison results:")
print("-" * 60)
for method, stats in results.items():
print(f"{method:8} | Time: {stats['time']:6.2f}s | Mean: {stats['mean']:7.3f} | Std: {stats['std']:6.3f}")
Running Neural...
Model: "functional_6"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer_3 (InputLayer) │ (None, 87) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_7 (Dense) │ (None, 128) │ 11,264 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_4 (Dropout) │ (None, 128) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ embedding (Dense) │ (None, 2) │ 258 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_8 (Dense) │ (None, 128) │ 384 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_5 (Dropout) │ (None, 128) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_9 (Dense) │ (None, 87) │ 11,223 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 23,129 (90.35 KB)
Trainable params: 23,129 (90.35 KB)
Non-trainable params: 0 (0.00 B)
Running PCA...
Running t-SNE...
Method comparison results:
------------------------------------------------------------
Neural | Time: 2.61s | Mean: -0.018 | Std: 0.853
PCA | Time: 0.05s | Mean: 0.000 | Std: 1.104
t-SNE | Time: 3.05s | Mean: -0.509 | Std: 15.800
Feature Engineering with Embeddings#
Use embeddings as features for income prediction:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Prepare target variable
income_binary = (df.loc[df_sample.index, 'income'] == '>50K').astype(int)
print(f"Income distribution in sample:")
print(f"<=50K: {(income_binary == 0).sum()} ({(income_binary == 0).mean():.1%})")
print(f">50K: {(income_binary == 1).sum()} ({(income_binary == 1).mean():.1%})")
# Use embeddings as features
X = embeddings
y = income_binary
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nIncome prediction using embeddings:")
print(f"Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['<=50K', '>50K']))
Income distribution in sample:
<=50K: 3819 (76.4%)
>50K: 1181 (23.6%)
Income prediction using embeddings:
Accuracy: 0.812
Classification Report:
precision recall f1-score support
<=50K 0.85 0.91 0.88 764
>50K 0.63 0.49 0.55 236
accuracy 0.81 1000
macro avg 0.74 0.70 0.72 1000
weighted avg 0.80 0.81 0.80 1000
Scaling for Different Ranges#
# Generate scaled embeddings for different use cases
scaled_embeddings = learn_embedding(
df_sample,
mode="unsupervised",
embedding_dim=5,
max_epochs=20,
scale_method="minmax",
scale_range=(-1.0, 1.0),
verbose=False,
seed=42
)
print("Scaled embeddings statistics:")
print(f"Min: {scaled_embeddings.min().min():.3f}")
print(f"Max: {scaled_embeddings.max().max():.3f}")
print(f"Mean: {scaled_embeddings.mean().mean():.3f}")
print(f"Std: {scaled_embeddings.std().mean():.3f}")
print("\nFirst 5 scaled embeddings:")
print(scaled_embeddings.head().round(3))
Model: "functional_8"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer_4 (InputLayer) │ (None, 101) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_10 (Dense) │ (None, 128) │ 13,056 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_6 (Dropout) │ (None, 128) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ embedding (Dense) │ (None, 5) │ 645 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_11 (Dense) │ (None, 128) │ 768 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_7 (Dropout) │ (None, 128) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_12 (Dense) │ (None, 101) │ 13,029 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 27,498 (107.41 KB)
Trainable params: 27,498 (107.41 KB)
Non-trainable params: 0 (0.00 B)
Scaled embeddings statistics:
Min: -1.000
Max: 1.000
Mean: 0.021
Std: 0.202
First 5 scaled embeddings:
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4
0 -0.480 0.033 0.909 -0.204 -0.392
1 -0.530 0.041 0.764 0.148 -0.143
2 -0.386 -0.030 0.662 -0.307 -0.158
3 -0.478 -0.107 0.718 0.108 -0.104
4 -0.096 -0.092 0.719 -0.374 -0.177
Production Model#
Create a model ready for deployment:
from row2vec import train_and_save_model
import tempfile
import os
with tempfile.TemporaryDirectory() as tmpdir:
model_path = os.path.join(tmpdir, "adult_model")
# Train production model on larger dataset
production_sample = df_features.sample(n=10000, random_state=42)
embeddings_prod, script_path, binary_path = train_and_save_model(
production_sample,
base_path=model_path,
embedding_dim=15,
mode="unsupervised",
max_epochs=50,
batch_size=256,
dropout_rate=0.25,
hidden_units=512,
scale_method="standard",
verbose=False,
seed=42
)
print(f"Production model saved: {os.path.basename(script_path)}")
print(f"Final embeddings shape: {embeddings_prod.shape}")
# Load and test
from row2vec import load_model
model = load_model(script_path)
# Test on new data
test_data = df_features.sample(n=100, random_state=999)
test_embeddings = model.predict(test_data)
print(f"\nModel successfully applied to new data:")
print(f"Test embeddings shape: {test_embeddings.shape}")
print(f"Model training time: {model.metadata.training_time:.2f} seconds")
Model: "functional_10"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer_5 (InputLayer) │ (None, 103) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_13 (Dense) │ (None, 512) │ 53,248 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_8 (Dropout) │ (None, 512) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ embedding (Dense) │ (None, 15) │ 7,695 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_14 (Dense) │ (None, 512) │ 8,192 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dropout_9 (Dropout) │ (None, 512) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_15 (Dense) │ (None, 103) │ 52,839 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 121,974 (476.46 KB)
Trainable params: 121,974 (476.46 KB)
Non-trainable params: 0 (0.00 B)
1/313 ━━━━━━━━━━━━━━━━━━━━ 8s 26ms/step
83/313 ━━━━━━━━━━━━━━━━━━━━ 0s 615us/step
164/313 ━━━━━━━━━━━━━━━━━━━━ 0s 618us/step
245/313 ━━━━━━━━━━━━━━━━━━━━ 0s 618us/step
313/313 ━━━━━━━━━━━━━━━━━━━━ 0s 664us/step
313/313 ━━━━━━━━━━━━━━━━━━━━ 0s 698us/step
Production model saved: adult_model.py
Final embeddings shape: (10000, 15)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 30
28 # Load and test
29 from row2vec import load_model
---> 30 model = load_model(script_path)
32 # Test on new data
33 test_data = df_features.sample(n=100, random_state=999)
File ~/work/row2vec/row2vec/row2vec/serialization.py:398, in load_model(script_path)
392 # Execute the script in a controlled namespace
393 # Add the script directory to help find the binary file
394 namespace = {
395 "__script_dir__": str(script_path.parent),
396 "__script_path__": str(script_path),
397 }
--> 398 exec(script_path.read_text(encoding="utf-8"), namespace)
400 # Get the load function from the script
401 if "load_model" not in namespace:
File <string>:43
NameError: name 'nan' is not defined
Key Insights#
High-Cardinality Handling: Row2Vec excels with many categorical values
Occupation Relationships: Similar jobs cluster in embedding space
Scalability: Handles 45K+ records efficiently with sampling
Feature Quality: Embeddings achieve good accuracy on income prediction
Production Ready: Easy to save, load, and deploy models
Next Steps#
Explore Housing Example for regression features
Learn about Advanced Features like neural architecture search
Check out the CLI Guide for batch processing large datasets