from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import random
import numpy as np
import pandas as pd
import os
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  IN_COLAB = False
if not IN_COLAB:
    os.chdir("..")
%load_ext autoreload
%autoreload 2

Utility Functions

def make_mixed_regression(n_samples, n_features, n_categories):
    X,y = make_regression(n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5, n_targets=1)
    cat_cols = random.sample(range(X.shape[-1]), k=n_categories)  # sample without replacement so each categorical column is distinct
    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
    for col in cat_cols:
        X[:,col] = pd.qcut(X[:,col], q=4).codes.astype(int)
    col_names = [] 
    num_col_names=[]
    cat_col_names=[]
    for i in range(X.shape[-1]):
        if i in cat_cols:
            col_names.append(f"cat_col_{i}")
            cat_col_names.append(f"cat_col_{i}")
        if i in num_cols:
            col_names.append(f"num_col_{i}")
            num_col_names.append(f"num_col_{i}")
    X = pd.DataFrame(X, columns=col_names)
    y = pd.DataFrame(y, columns=["target"])
    data = X.join(y)
    return data, cat_col_names, num_col_names

def print_metrics(y_true, y_pred, tag):
    if isinstance(y_true, pd.DataFrame) or isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame) or isinstance(y_pred, pd.Series):
        y_pred = y_pred.values
    if y_true.ndim>1:
        y_true=y_true.ravel()
    if y_pred.ndim>1:
        y_pred=y_pred.ravel()
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {mse} | {tag} MAE: {mae}")

Generate Synthetic Data

First of all, let's create a synthetic dataset with a mix of numerical and categorical features.

data, cat_col_names, num_col_names = make_mixed_regression(n_samples=10000, n_features=20, n_categories=4)
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)

Importing the Library

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig, ModelConfig
from pytorch_tabular.models import BaseModel

Defining a Custom Model

import torch
import torch.nn as nn
import torch.nn.functional as F
from omegaconf import DictConfig
from typing import Dict
from dataclasses import dataclass, field

PyTorch Tabular is very easy to extend and infinitely customizable. All the models implemented in PyTorch Tabular inherit from an abstract class BaseModel, which is in fact a PyTorch Lightning model.

It handles all the major functions like decoding the config params and setting up the loss and metrics. It also calculates the loss and metrics and feeds them back to the PyTorch Lightning Trainer, which takes care of the back-propagation.

There are two methods that need to be defined in any class that inherits from BaseModel:

  1. _build_network --> This is where you initialize the components required for your model to work
  2. forward --> This is the forward pass of the model.

While this is the bare minimum, you can redefine or use any of the PyTorch Lightning standard methods to tweak your model and training to your liking.
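
To make this concrete, here is a bare-minimum sketch of such a subclass (the class name MinimalRegressionModel is purely illustrative, and it reuses the imports from the cell above): one embedding layer per categorical feature and a single linear layer over the concatenated embeddings and continuous features. The full model defined later in this tutorial follows the same pattern with more layers and regularization.

class MinimalRegressionModel(BaseModel):
    def _build_network(self):
        # one embedding layer per categorical feature; the dims are precalculated and stored in the config
        self.embedding_layers = nn.ModuleList(
            [nn.Embedding(x, y) for x, y in self.hparams.embedding_dims]
        )
        embedded_cat_dim = sum([y for _, y in self.hparams.embedding_dims])
        self.linear = nn.Linear(embedded_cat_dim + self.hparams.continuous_dim, 1)

    def forward(self, x: Dict):
        # the input batch is a dictionary with keys "continuous" and "categorical"
        embeds = [
            embedding_layer(x["categorical"][:, i])
            for i, embedding_layer in enumerate(self.embedding_layers)
        ]
        inp = torch.cat(embeds + [x["continuous"]], dim=1)
        return self.linear(inp)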

In addition to the model, you will also need to define a config. Configs are Python dataclasses that should inherit from ModelConfig, and they get all of ModelConfig's parameters by default. Any additional parameters should be defined in the dataclass.

Key things to note:

  1. All the parameters from the different configs (like TrainerConfig, OptimizerConfig, etc.) are available in config before calling super() and in self.hparams after.
  2. The input batch at the forward method is a dictionary with keys continuous and categorical.
  3. In the _build_network method, save every component that you want to access in forward to self.
  4. The forward method should just have the forward pass and return its output. In case of classification, do not apply a Sigmoid or Softmax at the end of the forward pass.

@dataclass
class MyAwesomeModelConfig(ModelConfig):
    use_batch_norm: bool = True

class MyAwesomeRegressionModel(BaseModel):
    def __init__(
        self,
        config: DictConfig,
        **kwargs
    ):
        # Save any attribute that you need in _build_network before calling super()
        # The embedding_dims will be available in the config object and after the super() call, it will be available in self.hparams
        self.embedding_cat_dim = sum([y for x, y in config.embedding_dims])
        super().__init__(config, **kwargs)

    def _build_network(self):
        self.embedding_layers = nn.ModuleList(
            [nn.Embedding(x, y) for x, y in self.hparams.embedding_dims]
        )
        #Continuous and Categorical Dimensions are precalculated and stored in the config
        inp_dim = self.embedding_cat_dim + self.hparams.continuous_dim
        self.linear_layer_1 = nn.Linear(inp_dim, 200)
        self.linear_layer_2 = nn.Linear(inp_dim+200, 70)
        self.linear_layer_3 = nn.Linear(inp_dim+70, 1)
        self.input_batch_norm = nn.BatchNorm1d(self.hparams.continuous_dim)
        if self.hparams.use_batch_norm:
            self.batch_norm_2 = nn.BatchNorm1d(inp_dim+200)
            self.batch_norm_3 = nn.BatchNorm1d(inp_dim+70)
        self.embedding_drop = nn.Dropout(0.6)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x: Dict):
        continuous_data, categorical_data = x["continuous"], x["categorical"]
        x = [
                embedding_layer(categorical_data[:, i])
                for i, embedding_layer in enumerate(self.embedding_layers)
            ]
        x = torch.cat(x, 1)
        x = self.embedding_drop(x)

        continuous_data = self.input_batch_norm(continuous_data)
        inp = torch.cat([x, continuous_data], 1)
        x = F.relu(self.linear_layer_1(inp))
        x = self.dropout(x)
        x = torch.cat([x,inp], 1)
        if self.hparams.use_batch_norm:
            x = self.batch_norm_2(x)
        x = F.relu(self.linear_layer_2(x))
        x = self.dropout(x)
        x = torch.cat([x,inp], 1)
        if self.hparams.use_batch_norm:
            x = self.batch_norm_3(x)
        x = self.linear_layer_3(x)
        # target_range is a parameter defined in the ModelConfig and will be available in the config
        if (
            (self.hparams.task == "regression")
            and (self.hparams.target_range is not None)
        ):
            for i in range(self.hparams.output_dim):
                y_min, y_max = self.hparams.target_range[i]
                x[:, i] = y_min + nn.Sigmoid()(x[:, i]) * (y_max - y_min)
        return x

Define the Configs

There is one deviation from the normal flow when we create a TabularModel object with the configs. Earlier, the model was inferred from the config and initialized automatically. But here, we have to use the model_callable parameter of TabularModel and pass in the model class (not an initialized object).

data_config = DataConfig(
    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    gpus=-1,  # index of the GPU to use. -1 means all available GPUs, None means CPU
)
optimizer_config = OptimizerConfig()

model_config = MyAwesomeModelConfig(
    task="regression",
    use_batch_norm=False,
    learning_rate=1e-3,
)
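
Optionally, since the forward pass above respects target_range, you could constrain the predictions to the observed target range. A hedged sketch (the variable name model_config_with_range is illustrative; it assumes target_range takes a list of (min, max) tuples, one per target, which is how the forward method above unpacks it):

model_config_with_range = MyAwesomeModelConfig(
    task="regression",
    use_batch_norm=False,
    learning_rate=1e-3,
    # assumed format: one (min, max) tuple per target, matching the unpacking in forward
    target_range=[(float(train["target"].min()), float(train["target"].max()))],
)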

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    model_callable = MyAwesomeRegressionModel
)

Training the Model

The rest of the process is business as usual.

tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_mean_squared_error': tensor(14005.2725, device='cuda:0'),
 'train_loss': tensor(3013.1719, device='cuda:0'),
 'train_mean_squared_error': tensor(15613.8184, device='cuda:0'),
 'valid_loss': tensor(1060.1337, device='cuda:0'),
 'valid_mean_squared_error': tensor(15130.5811, device='cuda:0')}
--------------------------------------------------------------------------------


pred_df = tabular_model.predict(test)
pred_df.head()
num_col_0 cat_col_1 num_col_2 num_col_3 num_col_4 num_col_5 num_col_6 num_col_7 num_col_8 num_col_9 ... cat_col_12 num_col_13 cat_col_14 num_col_15 num_col_16 cat_col_17 num_col_18 num_col_19 target target_prediction
6252 0.321476 0.0 -0.836426 -0.200794 -1.372801 0.148776 1.607678 -0.710938 0.099704 2.494107 ... 2.0 2.410212 0.0 -0.416442 -0.843505 2.0 0.150040 -0.636704 -119.618988 -174.084061
4684 0.291679 1.0 -0.213108 1.888767 1.209858 -0.684209 0.065715 -1.661187 -2.164594 -1.212303 ... 1.0 1.778092 0.0 -1.007395 0.304803 2.0 -0.638452 0.672491 -207.596232 -171.813644
1731 -1.547951 1.0 1.517188 -0.638986 2.356890 0.826815 -0.570187 -0.415643 0.787585 0.027579 ... 2.0 -0.324598 2.0 1.993319 0.028488 1.0 1.121574 -0.146075 272.098656 198.954025
4742 0.911628 3.0 0.089328 -0.304067 0.984190 -1.114405 0.594178 -0.785370 -0.994555 -0.379163 ... 2.0 -0.217751 0.0 -1.001061 -0.725295 0.0 -0.511682 -0.721897 21.896867 106.040825
4521 0.087945 2.0 -0.320962 -0.231244 0.423397 -0.512270 -0.314670 -0.440412 -0.386701 0.966912 ... 3.0 1.654840 3.0 1.296487 1.079245 3.0 0.327339 -0.365532 46.346326 51.485107

5 rows × 22 columns

print_metrics(test['target'], pred_df["target_prediction"], tag="Holdout")
Holdout MSE: 2515.075280931874 | Holdout MAE: 38.42137329388225