Skip to content
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
import os
# Detect whether the notebook is running on Google Colab by probing for the
# `google.colab` package. Only a failed import means "not on Colab"; any other
# exception (e.g. KeyboardInterrupt) is allowed to propagate instead of being
# silently swallowed.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
if not IN_COLAB:
    # Outside Colab, run from the parent directory.
    # NOTE(review): presumably so paths resolve from the repository root -- confirm.
    os.chdir("..")
%load_ext autoreload
%autoreload 2

Utility Functions

def make_imbalanced_mixed_classification(n_samples, n_features, n_categories):
    """Build a synthetic classification frame mixing numeric and categorical features.

    The features and labels come from ``make_classification`` (called with
    ``random_state=42``, ``n_informative=5``, ``weights=[0.7]`` and
    ``flip_y=0.3``). ``n_categories`` distinct feature columns are then picked
    at random and replaced by their quartile bin codes (``pd.qcut`` with
    ``q=4``), turning them into categorical features.

    The columns are picked with the global ``random`` state, which this
    function does not seed, so the choice of categorical columns can differ
    between runs.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns to generate.
        n_categories: Number of feature columns to convert to categorical.
            Values outside ``[0, n_features]`` are clamped to that range.

    Returns:
        A tuple ``(data, cat_col_names, num_col_names)`` where ``data`` is a
        DataFrame holding every feature column plus a ``"target"`` column,
        and the two lists hold the names of the categorical
        (``"cat_col_<i>"``) and numeric (``"num_col_<i>"``) columns.
    """
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=42,
        n_informative=5,
        weights=[0.7],
        flip_y=0.3,
    )
    n_cols = X.shape[-1]
    # Sample WITHOUT replacement: random.choices could return the same index
    # more than once, which yields fewer categorical columns than requested
    # and re-bins a column that has already been converted to codes.
    n_cat = max(0, min(n_categories, n_cols))
    cat_cols = set(random.sample(range(n_cols), k=n_cat))
    for col in sorted(cat_cols):
        # The codes are written back into X, so they keep X's dtype.
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)
    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(n_cols):
        if i in cat_cols:
            name = f"cat_col_{i}"
            cat_col_names.append(name)
        else:
            name = f"num_col_{i}"
            num_col_names.append(name)
        col_names.append(name)
    X = pd.DataFrame(X, columns=col_names)
    y = pd.Series(y, name="target")
    data = X.join(y)
    return data, cat_col_names, num_col_names

def print_metrics(y_true, y_pred, tag):
    """Print accuracy and F1 for a set of predictions.

    Both inputs may be any array-like (list, tuple, NumPy array, pandas
    Series or DataFrame); each is converted to a NumPy array and flattened to
    one dimension before scoring.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, with the same number of elements as ``y_true``.
        tag: Prefix used to label the printed metrics (e.g. ``"Val"``).

    Returns:
        None. The metrics are written to standard output.
    """
    # np.asarray handles pandas objects as well as plain sequences, so no
    # per-type branching is needed; ravel() collapses (n, 1) inputs to (n,).
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    val_acc = accuracy_score(y_true, y_pred)
    val_f1 = f1_score(y_true, y_pred)
    print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")

Generate Synthetic Data

First, let's create a synthetic dataset with a mix of numerical and categorical features.

# Generate 10,000 rows with 20 features, 4 of which are requested as categorical.
data, cat_col_names, num_col_names = make_imbalanced_mixed_classification(n_samples=10000, n_features=20, n_categories=4)
# First split off the held-out test set, then split the remainder again into
# train and validation. Both splits use train_test_split's default proportions
# and a fixed random_state so the partitions are reproducible.
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)

Importing the Library

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

Define the Configs

# Data configuration: names the target column and tells the library which
# feature columns are continuous and which are categorical.
data_config = DataConfig(
    target=['target'], # target should always be a list. Multi-targets are only supported for regression; multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
# Trainer configuration: batch size, epoch budget and hardware selection.
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=-1,  # Which GPU(s) to use: -1 means all available GPUs, None means CPU
)
# Optimizer configuration: library defaults are used unchanged.
optimizer_config = OptimizerConfig()

# Model configuration for the category-embedding model.
# NOTE(review): auto_lr_find=True is set on the trainer above, so the
# learning_rate given here may be replaced by the LR finder's result -- confirm.
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    learning_rate = 1e-3,
    metrics=["f1","accuracy"], 
    # presumably one keyword-argument dict per entry in `metrics`, in the same order -- verify
    metrics_params=[{"num_classes":2},{}]
)

Training the Model

# Baseline run: build a model from the four configs, fit it on the training
# split (using `val` as the validation set) and evaluate it on the test split.
# No imbalance handling is applied here.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.6652, device='cuda:0'),
 'test_f1': tensor(0.6652, device='cuda:0'),
 'train_accuracy': tensor(0.6135, device='cuda:0'),
 'train_f1': tensor(0.6135, device='cuda:0'),
 'train_loss': tensor(0.6997, device='cuda:0'),
 'valid_accuracy': tensor(0.6891, device='cuda:0'),
 'valid_f1': tensor(0.6891, device='cuda:0'),
 'valid_loss': tensor(0.6467, device='cuda:0')}
--------------------------------------------------------------------------------


Custom Sampler

PyTorch Tabular also allows custom batching strategies through custom samplers, which come in handy when working with imbalanced data.

Although you can use any sampler, PyTorch Tabular has a few handy utility functions which take in the target array and implement a WeightedRandomSampler using inverse frequency sampling to combat imbalance. This is analogous to preprocessing techniques like under- or over-sampling in traditional ML systems.

from pytorch_tabular.utils import get_balanced_sampler, get_class_weighted_cross_entropy
# Re-create the model from the same configs so this run starts from a fresh
# TabularModel rather than reusing the one fitted above.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
# Build the sampler from the flattened training targets; it is used only for
# drawing training batches (validation and test data are passed unchanged).
sampler = get_balanced_sampler(train['target'].values.ravel())

tabular_model.fit(train=train, validation=val, train_sampler=sampler)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.6460, device='cuda:0'),
 'test_f1': tensor(0.6460, device='cuda:0'),
 'train_accuracy': tensor(0.5246, device='cuda:0'),
 'train_f1': tensor(0.5246, device='cuda:0'),
 'train_loss': tensor(0.7055, device='cuda:0'),
 'valid_accuracy': tensor(0.6400, device='cuda:0'),
 'valid_f1': tensor(0.6400, device='cuda:0'),
 'valid_loss': tensor(0.6756, device='cuda:0')}
--------------------------------------------------------------------------------


Custom Weighted Loss

If samplers are like over/under-sampling, a custom weighted loss is similar to class_weights. Depending on the problem, one of these might help you with imbalance. You can easily calculate the class weights and provide them to CrossEntropyLoss using the `weight` parameter. To make this easier, PyTorch Tabular has a handy utility method which calculates smoothed class weights and initializes a weighted loss. Once you have that loss, it's just a matter of passing it to the `fit` method using the `loss` parameter.

# Re-create the model from the same configs so this run starts from a fresh
# TabularModel rather than reusing one fitted earlier.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
# Build a class-weighted loss from the flattened training targets.
# NOTE(review): `mu` is presumably the smoothing factor for the class weights -- confirm.
weighted_loss = get_class_weighted_cross_entropy(train["target"].values.ravel(), mu=0.1)

# The custom loss is handed to fit(); the data passed in is unchanged.
tabular_model.fit(train=train, validation=val, loss=weighted_loss)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.6684, device='cuda:0'),
 'test_f1': tensor(0.6684, device='cuda:0'),
 'train_accuracy': tensor(0.6253, device='cuda:0'),
 'train_f1': tensor(0.6253, device='cuda:0'),
 'train_loss': tensor(0.6659, device='cuda:0'),
 'valid_accuracy': tensor(0.6944, device='cuda:0'),
 'valid_f1': tensor(0.6944, device='cuda:0'),
 'valid_loss': tensor(0.6401, device='cuda:0')}
--------------------------------------------------------------------------------