from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
import os
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
if not IN_COLAB:
    os.chdir("..")
%load_ext autoreload
%autoreload 2

Utility Functions

def make_mixed_classification(n_samples, n_features, n_categories):
    X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5)
    # Sample without replacement so we get exactly n_categories distinct categorical columns
    cat_cols = random.sample(range(X.shape[-1]), k=n_categories)
    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
    # Discretize the chosen columns into quartile bins to simulate categorical features
    for col in cat_cols:
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)
    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(X.shape[-1]):
        if i in cat_cols:
            col_names.append(f"cat_col_{i}")
            cat_col_names.append(f"cat_col_{i}")
        elif i in num_cols:
            col_names.append(f"num_col_{i}")
            num_col_names.append(f"num_col_{i}")
    X = pd.DataFrame(X, columns=col_names)
    y = pd.Series(y, name="target")
    data = X.join(y)
    return data, cat_col_names, num_col_names

def print_metrics(y_true, y_pred, tag):
    # Coerce pandas inputs to flat numpy arrays before computing metrics
    if isinstance(y_true, (pd.DataFrame, pd.Series)):
        y_true = y_true.values
    if isinstance(y_pred, (pd.DataFrame, pd.Series)):
        y_pred = y_pred.values
    if y_true.ndim > 1:
        y_true = y_true.ravel()
    if y_pred.ndim > 1:
        y_pred = y_pred.ravel()
    val_acc = accuracy_score(y_true, y_pred)
    val_f1 = f1_score(y_true, y_pred)
    print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")

Generate Synthetic Data

First, let's create a synthetic dataset that is a mix of numerical and categorical features.

data, cat_col_names, num_col_names = make_mixed_classification(n_samples=10000, n_features=20, n_categories=4)
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)

Importing the Library

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig

Define the Configs

This is the most crucial step in the process. There are four configs that you need to provide (most of them have intelligent default values), plus an optional fifth for experiment tracking, and together they drive the rest of the process.

  • DataConfig - Define the target column names, categorical and numerical column names, any transformation you need to do, etc.
  • ModelConfig - There is a specific config for each of the models. This determines which model we are going to train and also lets you define the hyperparameters of the model
  • TrainerConfig - This lets you configure the training process by setting things like batch_size, epochs, early stopping, etc. The vast majority of parameters are directly borrowed from PyTorch Lightning and are passed to the underlying Trainer object during training
  • OptimizerConfig - This lets you define and use different optimizers and learning rate schedulers. Standard PyTorch optimizers and learning rate schedulers are supported. For custom optimizers, you can use the parameter in the fit method to overwrite this (a sketch is shown after the fit call below). The custom optimizer should be PyTorch compatible
  • ExperimentConfig - This is an optional parameter. If set, this defines the experiment tracking. Right now, only two experiment tracking frameworks are supported: Tensorboard and Weights & Biases. The W&B experiment tracker has more features, like tracking the gradients and logits across epochs.
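
As an illustration of the optional ExperimentConfig, here is a minimal sketch. The parameter names project_name and log_target are assumptions based on pytorch_tabular's config at the time of writing; verify them against your installed version.

# Hypothetical sketch: enable TensorBoard experiment tracking
# (project_name and log_target are assumed parameter names; check your version)
experiment_config = ExperimentConfig(
    project_name="basic_example",  # project/folder the runs are logged under
    log_target="tensorboard",      # or "wandb" for Weights & Biases
)
# If used, pass it to TabularModel along with the other configs:
# tabular_model = TabularModel(..., experiment_config=experiment_config)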
data_config = DataConfig(
    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=-1, # index of the GPU to use; -1 means all available GPUs, None means CPU
)
optimizer_config = OptimizerConfig()

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layer
    learning_rate = 1e-3
)
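
Since every model ships with its own config class, trying a different architecture only means swapping the ModelConfig. A hedged sketch, assuming TabNetModelConfig is available in your installed version of pytorch_tabular:

# Hypothetical alternative: configure a TabNet model instead, reusing all
# the other configs unchanged (verify TabNetModelConfig in your version)
from pytorch_tabular.models import TabNetModelConfig

tabnet_config = TabNetModelConfig(task="classification")
# tabular_model = TabularModel(data_config=data_config, model_config=tabnet_config,
#                              optimizer_config=optimizer_config, trainer_config=trainer_config)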

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

Training the Model

Now that we have defined the configs and the TabularModel, we just need to call the fit method and pass in the training dataframe. We can also pass a validation dataframe; if omitted, TabularModel will set aside 20% (also configurable) of the data at random as validation.

By default, EarlyStopping is enabled and monitors the validation loss with a patience of 3 epochs. The trainer also saves the best model (based on validation loss) and loads it at the end of training. TrainerConfig has the parameters to tweak this default behaviour, as sketched below.
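
For example, here is a hedged sketch of tweaking those defaults. The parameter names early_stopping, early_stopping_patience, and load_best are assumptions based on TrainerConfig's documented fields; confirm them against your installed version.

# Hypothetical sketch: monitor validation loss with a longer patience and
# keep best-checkpoint loading (assumed TrainerConfig parameter names)
patient_trainer_config = TrainerConfig(
    early_stopping="valid_loss",  # metric monitored for early stopping
    early_stopping_patience=5,    # epochs without improvement before stopping
    load_best=True,               # reload the best checkpoint after training
    batch_size=1024,
    max_epochs=100,
)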

tabular_model.fit(train=train, validation=val)
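
As mentioned in the OptimizerConfig section, a custom optimizer can be supplied at fit time. A minimal sketch, assuming fit accepts optimizer and optimizer_params arguments (check the fit signature in your installed version):

# Hypothetical sketch: override the configured optimizer with a custom,
# PyTorch-compatible one (optimizer/optimizer_params are assumed arguments)
from torch.optim import AdamW

tabular_model.fit(
    train=train,
    validation=val,
    optimizer=AdamW,  # pass the optimizer class, not an instance
    optimizer_params={"weight_decay": 1e-4},
)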

Evaluating the Model

Loss and Metrics on New Data

To evaluate the model on new data with the same metrics/loss that were used during training, we can use the evaluate method.

result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.6924, device='cuda:0'),
 'train_accuracy': tensor(0.6051, device='cuda:0'),
 'train_loss': tensor(0.6258, device='cuda:0'),
 'valid_accuracy': tensor(0.7440, device='cuda:0'),
 'valid_loss': tensor(0.5769, device='cuda:0')}
--------------------------------------------------------------------------------


New Predictions as DataFrame

To get the predictions as a dataframe, we can use the predict method. This will append the predictions to the same dataframe that was passed in. For classification problems, we get both the class probabilities and the final prediction, taking 0.5 as the threshold.

pred_df = tabular_model.predict(test)
pred_df.head()
num_col_0 num_col_1 num_col_2 num_col_3 num_col_4 num_col_5 cat_col_6 num_col_7 num_col_8 num_col_9 ... num_col_14 num_col_15 num_col_16 cat_col_17 num_col_18 num_col_19 target 0_probability 1_probability prediction
6252 -2.790932 -3.304646 -2.010758 3.205420 -0.356361 -0.744417 2.0 -1.492040 -1.061102 1.364186 ... -0.660336 -0.705788 0.229519 2.0 -0.464394 2.879481 0 0.549150 0.450850 0
4684 -0.139585 -1.360640 -1.207160 2.690514 1.072764 -3.499028 3.0 0.953991 0.439317 1.243788 ... -2.726836 0.944248 0.821184 2.0 -1.199147 0.126323 1 0.868879 0.131121 0
1731 0.001421 -0.046718 -0.279572 0.363639 0.852329 0.089246 2.0 0.194984 -1.005871 2.668561 ... -0.508633 0.508788 -0.097083 1.0 -0.282642 -0.190155 0 0.331293 0.668707 1
4742 0.086662 1.549718 0.798527 0.916448 -1.085978 0.512223 0.0 1.538725 0.475361 1.518521 ... 0.326685 1.343219 -1.147619 3.0 0.857619 0.532915 1 0.809316 0.190684 0
4521 0.982186 0.909692 -0.117476 -0.168583 -0.088413 -0.206658 0.0 -0.137569 1.253686 -1.678887 ... -0.282845 0.458761 1.381926 1.0 -0.475947 -0.400418 1 0.695432 0.304568 0

5 rows × 24 columns

print_metrics(test['target'], pred_df["prediction"], tag="Holdout")
Holdout Acc: 0.6 | Holdout F1: 0.5575221238938053

Saving and Loading the Model

tabular_model.save_model("examples/basic")
loaded_model = TabularModel.load_from_checkpoint("examples/basic")
result = loaded_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.6924, device='cuda:0')}
--------------------------------------------------------------------------------
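
The loaded model exposes the same inference API as the original, so we can generate predictions from it directly:

# Predict with the restored model and score it with the utility defined above
new_pred_df = loaded_model.predict(test)
print_metrics(test["target"], new_pred_df["prediction"], tag="Loaded")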