Titanic Dataset Example#
A complete walkthrough using the famous Titanic dataset to demonstrate Row2Vec’s capabilities.
Load the Data#
# Run the output-suppression helper first (quiets framework logging)
exec(open('suppress_minimal.py').read())
import pandas as pd
import numpy as np
from row2vec import learn_embedding
import os
# Load Titanic dataset
data_path = os.path.join('..', 'data', 'titanic.csv')
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nMissing values per column:")
print(df.isnull().sum())
✓ Enhanced minimal suppression active
Dataset shape: (887, 8)
Columns: ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
Missing values per column:
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
Siblings/Spouses Aboard 0
Parents/Children Aboard 0
Fare 0
dtype: int64
Data Preview#
print("First 5 passengers:")
print(df.head())
First 5 passengers:
Survived Pclass Name \
0 0 3 Mr. Owen Harris Braund
1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum...
2 1 3 Miss. Laina Heikkinen
3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle
4 0 3 Mr. William Henry Allen
Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
0 male 22.0 1 0 7.2500
1 female 38.0 1 0 71.2833
2 female 26.0 0 0 7.9250
3 female 35.0 1 0 53.1000
4 male 35.0 0 0 8.0500
# Basic statistics
print("Survival rate:", df['Survived'].mean())
print("Average age:", df['Age'].mean())
print("Average fare:", df['Fare'].mean())
print("\nPassenger classes:")
print(df['Pclass'].value_counts().sort_index())
Survival rate: 0.3855693348365276
Average age: 29.471443066516347
Average fare: 32.30542018038331
Passenger classes:
Pclass
1 216
2 184
3 487
Name: count, dtype: int64
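Before building any embeddings, a quick pandas breakdown (plain pandas, nothing Row2Vec-specific) shows the class and sex structure we would hope the embeddings pick up:
# Survival rate by class and by sex -- structure the embeddings
# should be able to reflect
print(df.groupby('Pclass')['Survived'].mean().round(3))
print(df.groupby('Sex')['Survived'].mean().round(3))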
Data Preparation#
# For unsupervised learning, drop the target and the free-text Name column
df_features = df.drop(columns=['Survived', 'Name'])
print(f"Feature columns: {df_features.columns.tolist()}")
print(f"Shape for embedding: {df_features.shape}")
# Row2Vec imputes missing values automatically; check what we're dealing with
missing_counts = df_features.isnull().sum()
if missing_counts.sum() == 0:
    print("\nNo missing values -- this version of the dataset is complete.")
else:
    print("\nColumns with missing values:")
    for col, count in missing_counts[missing_counts > 0].items():
        print(f"  {col}: {count} missing ({count / len(df_features) * 100:.1f}%)")
Feature columns: ['Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
Shape for embedding: (887, 6)
No missing values -- this version of the dataset is complete.
Unsupervised Row Embeddings#
Create a 2D embedding for each passenger:
# Generate 2D embeddings for visualization
embeddings_2d = learn_embedding(
    df_features,
    mode="unsupervised",
    embedding_dim=2,
    max_epochs=50,
    batch_size=32,
    dropout_rate=0.2,
    hidden_units=128,
    verbose=False,
    seed=42
)
print(f"Embeddings shape: {embeddings_2d.shape}")
print("\nFirst 5 passenger embeddings:")
print(embeddings_2d.head())
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 7)              │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 128)            │         1,024 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 2)              │           258 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 128)            │           384 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_1 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 7)              │           903 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 2,569 (10.04 KB)
Trainable params: 2,569 (10.04 KB)
Non-trainable params: 0 (0.00 B)
Embeddings shape: (887, 2)
First 5 passenger embeddings:
embedding_0 embedding_1
0 1.216501 0.154850
1 -2.013922 1.380972
2 0.497961 0.077350
3 -1.714123 1.226604
4 0.821716 1.138183
Visualize the Embeddings#
import matplotlib.pyplot as plt
# Add survival information for coloring
embeddings_with_survival = embeddings_2d.copy()
embeddings_with_survival['Survived'] = df['Survived'].values
# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Plot 1: Colored by survival
scatter1 = axes[0].scatter(
    embeddings_with_survival['embedding_0'],
    embeddings_with_survival['embedding_1'],
    c=embeddings_with_survival['Survived'],
    cmap='coolwarm',
    alpha=0.6,
    s=20
)
axes[0].set_xlabel('Embedding Dimension 0')
axes[0].set_ylabel('Embedding Dimension 1')
axes[0].set_title('Passenger Embeddings Colored by Survival')
plt.colorbar(scatter1, ax=axes[0], label='Survived')
# Plot 2: Colored by passenger class
embeddings_with_class = embeddings_2d.copy()
embeddings_with_class['Pclass'] = df['Pclass'].values
scatter2 = axes[1].scatter(
    embeddings_with_class['embedding_0'],
    embeddings_with_class['embedding_1'],
    c=embeddings_with_class['Pclass'],
    cmap='viridis',
    alpha=0.6,
    s=20
)
axes[1].set_xlabel('Embedding Dimension 0')
axes[1].set_ylabel('Embedding Dimension 1')
axes[1].set_title('Passenger Embeddings Colored by Class')
plt.colorbar(scatter2, ax=axes[1], label='Passenger Class')
plt.tight_layout()
plt.show()
print("Notice how passengers with similar survival outcomes and classes cluster together!")
Notice how passengers with similar survival outcomes and classes cluster together!
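To back the visual impression with a number, here is a quick sketch (assuming scikit-learn is installed) scoring how well the survival labels separate in the learned 2D space; silhouette values near 0 mean heavy overlap, values near 1 mean clean separation:
from sklearn.metrics import silhouette_score
# Quantify label separation in the learned 2D space
sep = silhouette_score(embeddings_2d, df['Survived'])
print(f"Silhouette score by survival label: {sep:.3f}")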
Higher-Dimensional Embeddings#
For machine learning, we typically want more dimensions:
# Generate 5D embeddings for ML features (reduced from 10D due to feature count)
embeddings_5d = learn_embedding(
    df_features,
    mode="unsupervised",
    embedding_dim=5,
    max_epochs=50,
    batch_size=64,
    dropout_rate=0.25,
    hidden_units=256,
    verbose=False,
    seed=42
)
print(f"5D Embeddings shape: {embeddings_5d.shape}")
print("\nStatistics per dimension:")
print(embeddings_5d.describe().round(3))
Model: "functional_2"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_1 (InputLayer)      │ (None, 7)              │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_3 (Dense)                 │ (None, 256)            │         2,048 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_2 (Dropout)             │ (None, 256)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 5)              │         1,285 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_4 (Dense)                 │ (None, 256)            │         1,536 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_3 (Dropout)             │ (None, 256)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_5 (Dense)                 │ (None, 7)              │         1,799 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 6,668 (26.05 KB)
Trainable params: 6,668 (26.05 KB)
Non-trainable params: 0 (0.00 B)
5D Embeddings shape: (887, 5)
Statistics per dimension:
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4
count 887.000 887.000 887.000 887.000 887.000
mean -0.161 0.033 0.184 0.072 -0.233
std 0.964 0.947 0.984 1.081 0.911
min -1.733 -3.551 -6.879 -5.364 -6.559
25% -0.852 -0.445 -0.127 -0.376 -0.501
50% -0.353 0.012 0.484 0.102 0.138
75% 0.265 0.587 0.790 0.808 0.221
max 6.228 2.895 1.540 2.562 1.307
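The autoencoder never saw the Survived column, so any survival signal in these dimensions was absorbed indirectly through the feature columns. A quick check with plain pandas correlations (Survived is binary, so these are point-biserial correlations):
# Correlate each learned dimension with the held-out target
corr_check = embeddings_5d.copy()
corr_check['Survived'] = df['Survived'].values
print(corr_check.corr()['Survived'].drop('Survived').round(3))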
Target-Based Embeddings#
Learn embeddings for categorical columns:
Sex Embeddings#
# Learn embeddings for Sex categories
sex_embeddings = learn_embedding(
    df,
    mode="target",
    reference_column="Sex",
    embedding_dim=2,
    max_epochs=30,
    verbose=False,
    seed=42
)
print("Sex category embeddings:")
print(sex_embeddings)
# Calculate distance between the two sex categories
# Use iloc to access by position since index might be numerical
if len(sex_embeddings) >= 2:
    emb_1 = sex_embeddings.iloc[0].values
    emb_2 = sex_embeddings.iloc[1].values
    distance = np.linalg.norm(emb_1 - emb_2)
    print(f"\nEuclidean distance between sex categories: {distance:.3f}")
else:
    print("\nInsufficient categories for distance calculation")
Model: "functional_4"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_2 (InputLayer)      │ (None, 7)              │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_6 (Dense)                 │ (None, 128)            │         1,024 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_4 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 2)              │           258 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_7 (Dense)                 │ (None, 2)              │             6 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 1,288 (5.03 KB)
Trainable params: 1,288 (5.03 KB)
Non-trainable params: 0 (0.00 B)
Sex category embeddings:
embedding_0 embedding_1
category
0 9.819839 11.902977
1 10.392104 13.733018
Euclidean distance between sex categories: 1.917
Passenger Class Embeddings#
# Learn embeddings for passenger classes
pclass_embeddings = learn_embedding(
    df,
    mode="target",
    reference_column="Pclass",
    embedding_dim=3,
    max_epochs=30,
    verbose=False,
    seed=42
)
print("Passenger class embeddings:")
print(pclass_embeddings)
# Analyze relationships between classes
import itertools
print("\nPairwise distances between classes:")
# Use iloc to access by position since indices might be different
if len(pclass_embeddings) >= 2:
    for i, j in itertools.combinations(range(len(pclass_embeddings)), 2):
        dist = np.linalg.norm(
            pclass_embeddings.iloc[i].values -
            pclass_embeddings.iloc[j].values
        )
        print(f"  Category {i} <-> Category {j}: {dist:.3f}")
else:
    print("  Insufficient categories for distance calculation")
Model: "functional_6"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_3 (InputLayer)      │ (None, 8)              │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_8 (Dense)                 │ (None, 128)            │         1,152 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_5 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 3)              │           387 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_9 (Dense)                 │ (None, 3)              │            12 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 1,551 (6.06 KB)
Trainable params: 1,551 (6.06 KB)
Non-trainable params: 0 (0.00 B)
Passenger class embeddings:
embedding_0 embedding_1 embedding_2
category
0 -11.703871 -24.683310 6.032131
1 -13.441813 -27.182577 5.752655
2 -11.548326 -22.959591 4.434523
Pairwise distances between classes:
Category 0 <-> Category 1: 3.057
Category 0 <-> Category 2: 2.355
Category 1 <-> Category 2: 4.812
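The returned index is positional, so it helps to attach the real class labels. A small sketch, assuming the rows follow the sorted unique values of the reference column (an assumption worth verifying for your Row2Vec version):
# Hypothetical label mapping -- assumes embedding rows are ordered
# by the sorted unique values of the reference column
labels = sorted(df['Pclass'].unique())
pclass_labeled = pclass_embeddings.copy()
pclass_labeled.index = labels
print(pclass_labeled)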
Compare with Classical Methods#
Let’s see how neural embeddings compare to classical methods:
# PCA
pca_embeddings = learn_embedding(
    df_features,
    mode="pca",
    embedding_dim=2,
    verbose=False
)
# t-SNE
tsne_embeddings = learn_embedding(
    df_features,
    mode="tsne",
    embedding_dim=2,
    perplexity=30,
    verbose=False,
    seed=42
)
print("Method comparison (2D embeddings):")
print("-" * 50)
print(f"Neural: mean={embeddings_2d.mean().mean():.3f}, std={embeddings_2d.std().mean():.3f}")
print(f"PCA: mean={pca_embeddings.mean().mean():.3f}, std={pca_embeddings.std().mean():.3f}")
print(f"t-SNE: mean={tsne_embeddings.mean().mean():.3f}, std={tsne_embeddings.std().mean():.3f}")
Method comparison (2D embeddings):
--------------------------------------------------
Neural: mean=0.164, std=1.435
PCA: mean=0.000, std=1.313
t-SNE: mean=-0.053, std=18.237
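Means and standard deviations say little about which layout is most useful. A rough comparison (scikit-learn assumed) asks how well a simple k-nearest-neighbors classifier can recover survival from each 2D embedding:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
# Cross-validated kNN accuracy as a proxy for label-relevant structure
for name, emb in [("Neural", embeddings_2d), ("PCA", pca_embeddings), ("t-SNE", tsne_embeddings)]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=15), emb, df['Survived'], cv=5)
    print(f"{name:8s} kNN accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")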
Visualize Method Comparison#
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
methods = [
    ("Neural (Autoencoder)", embeddings_2d),
    ("PCA", pca_embeddings),
    ("t-SNE", tsne_embeddings)
]
for ax, (name, emb) in zip(axes, methods):
    scatter = ax.scatter(
        emb.iloc[:, 0],
        emb.iloc[:, 1],
        c=df['Survived'].values,
        cmap='coolwarm',
        alpha=0.6,
        s=20
    )
    ax.set_xlabel('Dimension 0')
    ax.set_ylabel('Dimension 1')
    ax.set_title(f'{name} Embeddings')
plt.colorbar(scatter, ax=axes, label='Survived', fraction=0.02)
plt.tight_layout()
plt.show()
print("Each method reveals different aspects of the data structure!")
Each method reveals different aspects of the data structure!
Use Embeddings as Features#
Demonstrate using embeddings for downstream ML:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Prepare data
X = embeddings_5d
y = df['Survived']
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Train classifier on embeddings
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Evaluate
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Survival prediction using 5D embeddings:")
print(f"Accuracy: {accuracy:.3f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Died', 'Survived']))
Survival prediction using 5D embeddings:
Accuracy: 0.742
Classification Report:
precision recall f1-score support
Died 0.80 0.77 0.79 109
Survived 0.66 0.70 0.68 69
accuracy 0.74 178
macro avg 0.73 0.73 0.73 178
weighted avg 0.74 0.74 0.74 178
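For context, it helps to compare against the same classifier trained on the raw features. A sketch using a simple one-hot encoding of Sex, reusing the imports and the y defined above (results will vary with preprocessing choices):
# Baseline: identical classifier on one-hot-encoded raw features
X_raw = pd.get_dummies(df_features, columns=['Sex'])
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)
baseline = RandomForestClassifier(n_estimators=100, random_state=42)
baseline.fit(Xr_train, yr_train)
print(f"Baseline accuracy on raw features: {accuracy_score(yr_test, baseline.predict(Xr_test)):.3f}")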
Save Model for Production#
from row2vec import train_and_save_model
import tempfile
import os
# Create a production-ready model
with tempfile.TemporaryDirectory() as tmpdir:
    model_path = os.path.join(tmpdir, "titanic_model")
    embeddings_final, script_path, binary_path = train_and_save_model(
        df_features,
        base_path=model_path,
        embedding_dim=5,
        mode="unsupervised",
        max_epochs=50,
        batch_size=64,
        dropout_rate=0.25,
        hidden_units=256,
        verbose=False,
        seed=42
    )
    print(f"Model saved to: {os.path.basename(script_path)}")

    # Show the model can be loaded and used (still inside the with block,
    # since the temporary directory disappears when it exits)
    from row2vec import load_model
    model = load_model(script_path)
    print("\nModel metadata:")
    print(f"  Mode: {model.metadata.mode}")
    print(f"  Embedding dimensions: {model.metadata.embedding_dim}")
    print(f"  Training epochs: {model.metadata.epochs_trained}")
    # final_loss may be None depending on how training was logged
    if model.metadata.final_loss is not None:
        print(f"  Final loss: {model.metadata.final_loss:.4f}")
    else:
        print("  Final loss: Not recorded")
Model: "functional_8"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_4 (InputLayer)      │ (None, 7)              │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_10 (Dense)                │ (None, 256)            │         2,048 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_6 (Dropout)             │ (None, 256)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding (Dense)               │ (None, 5)              │         1,285 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_11 (Dense)                │ (None, 256)            │         1,536 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_7 (Dropout)             │ (None, 256)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_12 (Dense)                │ (None, 7)              │         1,799 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 6,668 (26.05 KB)
Trainable params: 6,668 (26.05 KB)
Non-trainable params: 0 (0.00 B)
28/28 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step
Model saved to: titanic_model.py
Model metadata:
Mode: unsupervised
Embedding dimensions: 5
Training epochs: 33
Final loss: Not recorded
Key Takeaways#
Automatic Preprocessing: Row2Vec encoded the mixed numeric and categorical columns without manual work (it would also impute missing values, though this Titanic variant has none)
Multiple Modes: Neural, PCA, and t-SNE each reveal different patterns
Visualization: 2D embeddings are great for understanding data structure
ML Features: Higher-dimensional embeddings work well as ML features
Target Embeddings: Learn meaningful representations for categorical values
Production Ready: Models can be saved and deployed easily
Next Steps#
Try the Adult Dataset Example for high-cardinality categoricals
Explore Advanced Features like neural architecture search
Learn about the CLI for batch processing