Skip to content
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
import os
os.chdir("..")
%load_ext autoreload
%autoreload 2
d:\Playground\tabular\pytorch-tabular

Utility Functions

def make_mixed_classification(n_samples, n_features, n_categories):
    """Create a synthetic classification dataset with numeric and categorical columns.

    The features come from ``make_classification`` (fixed ``random_state=42``,
    5 informative features). ``n_categories`` distinct columns are then picked
    at random and discretised into quartile codes (0-3) so they can be treated
    as categorical features.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns.
        n_categories: Number of feature columns to convert to categorical.
            Values larger than the number of features are clamped to it;
            negative values are treated as 0.

    Returns:
        A tuple ``(data, cat_col_names, num_col_names)`` where ``data`` is a
        DataFrame holding every feature plus a ``"target"`` column, and the two
        lists hold the names of the categorical and numerical feature columns
        in column order.

    Note:
        The column selection uses the global ``random`` state, so call
        ``random.seed(...)`` beforehand if the same columns are needed on
        every run.
    """
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=42,
        n_informative=5,
    )
    n_cols = X.shape[-1]

    # Sample WITHOUT replacement: random.choices may return the same index
    # more than once, which silently yields fewer categorical columns than
    # requested.
    n_pick = max(0, min(n_categories, n_cols))
    cat_cols = set(random.sample(range(n_cols), k=n_pick))

    for col in cat_cols:
        # Replace the continuous values with their quartile bin code.
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)

    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(n_cols):
        if i in cat_cols:
            name = f"cat_col_{i}"
            cat_col_names.append(name)
        else:
            name = f"num_col_{i}"
            num_col_names.append(name)
        col_names.append(name)

    X = pd.DataFrame(X, columns=col_names)
    y = pd.Series(y, name="target")
    data = X.join(y)
    return data, cat_col_names, num_col_names

def print_metrics(y_true, y_pred, tag):
    """Print accuracy and F1 score for a set of predictions.

    Args:
        y_true: Ground-truth labels. Any array-like is accepted (DataFrame,
            Series, ndarray, list, tuple); it is flattened to one dimension.
        y_pred: Predicted labels, same accepted types as ``y_true``.
        tag: Label prefixed to each metric in the printed line
            (for example ``"Holdout"``).

    Returns:
        None. The metrics are written to stdout.
    """
    # np.asarray handles pandas objects and plain sequences alike; the
    # previous isinstance/.ndim checks raised AttributeError on lists.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    val_acc = accuracy_score(y_true, y_pred)
    # f1_score is called with its default averaging, as before.
    val_f1 = f1_score(y_true, y_pred)
    print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")

Generate Synthetic Data

First of all, let's create a synthetic dataset that is a mix of numerical and categorical features.

# Build the synthetic frame: 20 features, 4 of which are requested as categorical.
data, cat_col_names, num_col_names = make_mixed_classification(
    n_samples=10000,
    n_features=20,
    n_categories=4,
)

# First split off the test set, then carve a validation set out of the
# remaining training rows. Both splits use the same fixed seed.
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)

Importing the Library

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

Category Embedding Model

# --- Data: label column and how each feature column is treated ---
data_config = DataConfig(
    # Target should always be a list. Multi-targets are only supported for
    # regression; multi-task classification is not implemented.
    target=["target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# --- Model: category-embedding network ---
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",   # number of nodes in each layer
    activation="LeakyReLU",  # activation between the layers
    learning_rate=1e-3,
)

# --- Optimizer: library defaults ---
optimizer_config = OptimizerConfig()

# --- Trainer ---
trainer_config = TrainerConfig(
    auto_lr_find=True,  # run the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=1,  # index of the GPU to use; 0 means CPU
)

# --- Experiment tracking ---
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_cat_embedding",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)

# Train on the training split, validate on `val`, then score the held-out test split.
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8748, device='cuda:0'),
 'train_accuracy': tensor(0.6827, device='cuda:0'),
 'train_loss': tensor(0.5106, device='cuda:0'),
 'valid_accuracy': tensor(0.8489, device='cuda:0'),
 'valid_loss': tensor(0.4546, device='cuda:0')}
--------------------------------------------------------------------------------


Node Model

# --- Data: label column and how each feature column is treated ---
data_config = DataConfig(
    # Target should always be a list. Multi-targets are only supported for
    # regression; multi-task classification is not implemented.
    target=["target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# --- Model: NODE with its default architecture settings ---
model_config = NodeConfig(
    task="classification",
    learning_rate=1e-3,
)

# --- Optimizer: library defaults ---
optimizer_config = OptimizerConfig()

# --- Trainer ---
trainer_config = TrainerConfig(
    auto_lr_find=False,  # the LRFinder is switched off for this run
    batch_size=64,
    accumulate_grad_batches=16,
    max_epochs=100,
    gpus=1,  # index of the GPU to use; 0 means CPU
)

# --- Experiment tracking ---
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_node",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)

# Train on the training split, validate on `val`, then score the held-out test split.
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.7960, device='cuda:0'),
 'train_accuracy': tensor(0.7202, device='cuda:0'),
 'train_loss': tensor(0.4127, device='cuda:0'),
 'valid_accuracy': tensor(0.7858, device='cuda:0'),
 'valid_loss': tensor(0.5181, device='cuda:0')}
--------------------------------------------------------------------------------


Node Model with Category Embeddings

# --- Data: label column and how each feature column is treated ---
data_config = DataConfig(
    # Target should always be a list. Multi-targets are only supported for
    # regression; multi-task classification is not implemented.
    target=["target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# --- Model: NODE with categorical embedding enabled ---
model_config = NodeConfig(
    task="classification",
    learning_rate=1e-3,
    embed_categorical=True,
)

# --- Optimizer: library defaults ---
optimizer_config = OptimizerConfig()

# --- Trainer ---
trainer_config = TrainerConfig(
    auto_lr_find=False,  # the LRFinder is switched off for this run
    batch_size=64,
    # accumulate_grad_batches=16,  # left disabled for this run
    max_epochs=100,
    min_epochs=10,
    gpus=1,  # index of the GPU to use; 0 means CPU
)

# --- Experiment tracking ---
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_node_cat_embed",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)

# Train on the training split, validate on `val`, then score the held-out test split.
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.8088, device='cuda:0'),
 'train_accuracy': tensor(0.6652, device='cuda:0'),
 'valid_accuracy': tensor(0.7991, device='cuda:0'),
 'valid_loss': tensor(0.5047, device='cuda:0')}
--------------------------------------------------------------------------------


TabNet Model

# --- Data: label column and how each feature column is treated ---
data_config = DataConfig(
    # Target should always be a list. Multi-targets are only supported for
    # regression; multi-task classification is not implemented.
    target=["target"],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

# --- Model: TabNet ---
model_config = TabNetModelConfig(
    task="classification",
    learning_rate=1e-5,
    n_d=16,
    n_a=16,
    n_steps=4,
)

# --- Optimizer: library defaults ---
optimizer_config = OptimizerConfig()

# --- Trainer ---
trainer_config = TrainerConfig(
    auto_lr_find=True,  # run the LRFinder to automatically derive a learning rate
    batch_size=2048,
    # accumulate_grad_batches=16,  # left disabled for this run
    max_epochs=50,
    min_epochs=10,
    early_stopping=None,
    gpus=1,  # index of the GPU to use; 0 means CPU
)

# --- Experiment tracking ---
experiment_config = ExperimentConfig(
    project_name="PyTorch Tabular Example",
    run_name="synthetic_classification_tabnet",
    exp_watch="gradients",
    log_target="wandb",
    log_logits=True,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    experiment_config=experiment_config,
)

# Train on the training split, validate on `val`, then score the held-out test split.
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.4896, device='cuda:0'),
 'train_accuracy': tensor(0.5057, device='cuda:0'),
 'valid_accuracy': tensor(0.5324, device='cuda:0'),
 'valid_loss': tensor(0.9689, device='cuda:0')}
--------------------------------------------------------------------------------