from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
import os
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
if not IN_COLAB:
    os.chdir("..")
%load_ext autoreload
%autoreload 2
Utility Functions
def make_mixed_classification(n_samples, n_features, n_categories):
    """Create a synthetic classification dataset with a mix of numerical and categorical features."""
    X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5)
    # Sample distinct column indices to convert into categorical features
    cat_cols = random.sample(range(X.shape[-1]), k=n_categories)
    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
    # Bin the chosen columns into quartiles and use the bin codes as category labels
    for col in cat_cols:
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)
    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(X.shape[-1]):
        if i in cat_cols:
            col_names.append(f"cat_col_{i}")
            cat_col_names.append(f"cat_col_{i}")
        if i in num_cols:
            col_names.append(f"num_col_{i}")
            num_col_names.append(f"num_col_{i}")
    X = pd.DataFrame(X, columns=col_names)
    y = pd.Series(y, name="target")
    data = X.join(y)
    return data, cat_col_names, num_col_names
def print_metrics(y_true, y_pred, tag):
    # Convert pandas objects to flat numpy arrays before scoring
    if isinstance(y_true, (pd.DataFrame, pd.Series)):
        y_true = y_true.values
    if isinstance(y_pred, (pd.DataFrame, pd.Series)):
        y_pred = y_pred.values
    if y_true.ndim > 1:
        y_true = y_true.ravel()
    if y_pred.ndim > 1:
        y_pred = y_pred.ravel()
    val_acc = accuracy_score(y_true, y_pred)
    val_f1 = f1_score(y_true, y_pred)
    print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")
Generate Synthetic Data
First, let's create a synthetic dataset that mixes numerical and categorical features.
data, cat_col_names, num_col_names = make_mixed_classification(n_samples=10000, n_features=20, n_categories=4)
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)
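Before training, a quick look at the split shapes and the class balance helps confirm the generator behaved as expected; this sketch is purely for inspection and not required by PyTorch Tabular.
# Inspect the generated splits (illustrative only)
print(f"Train: {train.shape} | Val: {val.shape} | Test: {test.shape}")
print("Categorical columns:", cat_col_names)
print("Target balance in train:")
print(train["target"].value_counts(normalize=True))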
Importing the Library
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
Category Embedding Model
data_config = DataConfig(
    target=['target'],  # target should always be a list; multi-targets are only supported for regression, and multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=-1,  # index of the GPU to use; -1 means all available GPUs, None means CPU
)
optimizer_config = OptimizerConfig()
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # number of nodes in each layer
    activation="LeakyReLU",  # activation between each pair of layers
    learning_rate=1e-3,
)
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_cat_embedding",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)
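The experiment config above logs to Weights & Biases, which assumes an authenticated wandb session on the machine; a minimal sketch of the login step (skip it, or drop experiment_config from the TabularModel below, if you don't want wandb logging).
# One-time authentication with Weights & Biases (assumes the wandb package is installed)
import wandb
wandb.login()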
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
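Besides evaluate, the fitted model can produce predictions on new data, which pairs nicely with the print_metrics helper defined earlier. This is a minimal sketch: predict returns a copy of the input dataframe with prediction columns appended, but the exact column names differ across PyTorch Tabular versions, so inspect them before scoring.
# Score held-out predictions with our helper; the "prediction" column name is an assumption,
# adjust it to whatever pred_df.columns shows for your version of the library
pred_df = tabular_model.predict(test)
print(pred_df.columns)
print_metrics(test["target"], pred_df["prediction"], tag="Holdout")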
Node Model
data_config = DataConfig(
    target=['target'],  # target should always be a list; multi-targets are only supported for regression, and multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=False,  # set to True to run the LRFinder and derive a learning rate automatically; here we use the one from the model config
    batch_size=64,
    accumulate_grad_batches=16,  # accumulate gradients over 16 batches for an effective batch size of 1024
    max_epochs=100,
    gpus=-1,  # index of the GPU to use; -1 means all available GPUs, None means CPU
)
optimizer_config = OptimizerConfig()
model_config = NodeConfig(
    task="classification",
    learning_rate=1e-3,
)
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_node",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
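A trained TabularModel can also be persisted and reloaded for later inference. This is a sketch under two assumptions: the examples/node_model path is writable, and the loading entry point matches your installed version (older releases expose TabularModel.load_from_checkpoint, newer ones TabularModel.load_model).
# Persist the trained model and its configs to a directory, then reload and re-evaluate
# (the path is illustrative; the loader name depends on the installed version)
tabular_model.save_model("examples/node_model")
loaded_model = TabularModel.load_from_checkpoint("examples/node_model")  # or TabularModel.load_model(...)
result = loaded_model.evaluate(test)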
Node Model with Category Embeddings
data_config = DataConfig(
    target=['target'],  # target should always be a list; multi-targets are only supported for regression, and multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=False,  # set to True to run the LRFinder and derive a learning rate automatically; here we use the one from the model config
    batch_size=64,
    # accumulate_grad_batches=16,
    max_epochs=100,
    min_epochs=10,
    gpus=-1,  # index of the GPU to use; -1 means all available GPUs, None means CPU
)
optimizer_config = OptimizerConfig()
model_config = NodeConfig(
    task="classification",
    learning_rate=1e-3,
    embed_categorical=True,  # pass categorical features through learned embedding layers before the NODE blocks
)
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_node_cat_embed",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
TabNet Model
data_config = DataConfig(
    target=['target'],  # target should always be a list; multi-targets are only supported for regression, and multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # runs the LRFinder to automatically derive a learning rate
    batch_size=2048,
    # accumulate_grad_batches=16,
    max_epochs=50,
    min_epochs=10,
    early_stopping=None,  # disable early stopping
    gpus=-1,  # index of the GPU to use; -1 means all available GPUs, None means CPU
)
optimizer_config = OptimizerConfig()
model_config = TabNetModelConfig(
    task="classification",
    learning_rate=1e-5,
    n_d=16,  # width of the decision prediction layer
    n_a=16,  # width of the attention embedding for each mask
    n_steps=4,  # number of sequential decision steps
)
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_tabnet",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
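Each evaluate call returns the underlying Lightning test output, typically a list with a single dict of test metrics; the key names (for example test_accuracy) are version-dependent. Since result is overwritten after every model in this notebook, store each output under its own name if you want to compare models side by side; the sketch below just inspects the last one.
# Inspect the metrics from the last evaluate() call (TabNet in this notebook);
# key names such as "test_accuracy" are assumed and may differ between versions
metrics = result[0]
print({k: round(v, 4) for k, v in metrics.items() if isinstance(v, float)})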