from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import random
import numpy as np
import pandas as pd
import os
os.chdir("..")
%load_ext autoreload
%autoreload 2

Utility Functions

def make_mixed_regression(n_samples, n_features, n_categories):
    # Create a multi-target regression problem, then discretize a random
    # subset of columns into quartile bins to act as categorical features
    X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5, n_targets=2)
    # random.sample (not random.choices) guarantees n_categories *distinct* columns
    cat_cols = random.sample(range(X.shape[-1]), k=n_categories)
    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
    for col in cat_cols:
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)
    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(X.shape[-1]):
        if i in cat_cols:
            col_names.append(f"cat_col_{i}")
            cat_col_names.append(f"cat_col_{i}")
        else:
            col_names.append(f"num_col_{i}")
            num_col_names.append(f"num_col_{i}")
    X = pd.DataFrame(X, columns=col_names)
    y = pd.DataFrame(y, columns=["target_1", "target_2"])
    data = X.join(y)
    return data, cat_col_names, num_col_names

def print_metrics(y_true, y_pred, tag):
    # Accept DataFrames/Series or arrays, flatten them, and report MSE and MAE
    if isinstance(y_true, (pd.DataFrame, pd.Series)):
        y_true = y_true.values
    if isinstance(y_pred, (pd.DataFrame, pd.Series)):
        y_pred = y_pred.values
    if y_true.ndim > 1:
        y_true = y_true.ravel()
    if y_pred.ndim > 1:
        y_pred = y_pred.ravel()
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {mse} | {tag} MAE: {mae}")

Generate Synthetic Data

First of all, let's create a synthetic dataset that mixes numerical and categorical features and has multiple regression targets.

data, cat_col_names, num_col_names = make_mixed_regression(n_samples=10000, n_features=20, n_categories=4)
target_cols = ['target_1','target_2']
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)
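
Since train_test_split holds out 25% of the rows by default, this gives roughly a 56/19/25 split across train, validation, and test:

print(train.shape, val.shape, test.shape)
# (5625, 22) (1875, 22) (2500, 22)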

Importing the Library

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
batch_size = 1024  # Will set the same value in the Trainer YAML file
steps_per_epoch = int(train.shape[0] / batch_size)
epochs = 20

Basic

Define the Configs

In the Basic tutorial, we saw how to declare these params programmatically. We can also use YAML files to manage the configuration; in that case, we just pass the path to the file as the corresponding argument to TabularModel. Let's use a YAML file for the TrainerConfig.

For the learning rate scheduler, let's use OneCycleLR, popularized by fast.ai.
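
To build intuition, here is a standalone sketch of the OneCycleLR schedule using plain PyTorch, independent of PyTorch Tabular: the learning rate ramps up to max_lr early in training and then anneals back down over the run.

import torch

model = torch.nn.Linear(10, 1)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr=0.00478, epochs=epochs, steps_per_epoch=steps_per_epoch)
lrs = []
for _ in range(epochs * steps_per_epoch):
    opt.step()    # a real training step would compute a loss and backprop first
    sched.step()  # advance the schedule once per batch
    lrs.append(sched.get_last_lr()[0])
print(f"peak LR: {max(lrs):.5f} | final LR: {lrs[-1]:.2e}")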

data_config = DataConfig(
    target=target_cols, #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)

optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.00478, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

model_config = CategoryEmbeddingModelConfig(
    task="regression",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    learning_rate = 1e-3
)
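
For intuition, the layers string roughly corresponds to a stack like the one below. This is a hand-written sketch, not the library's actual module: it ignores the categorical embedding layers, batch norm, and dropout, and assumes a 20-dimensional input purely for illustration.

import torch.nn as nn

backbone = nn.Sequential(
    nn.Linear(20, 1024), nn.LeakyReLU(),   # layer 1: 1024 nodes
    nn.Linear(1024, 512), nn.LeakyReLU(),  # layer 2: 512 nodes
    nn.Linear(512, 512), nn.LeakyReLU(),   # layer 3: 512 nodes
)
head = nn.Linear(512, 2)  # one output per regression target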

Trainer Config YAML file

batch_size: 1024
fast_dev_run: false
max_epochs: 20
min_epochs: 1
gpus: 1
accumulate_grad_batches: 1
auto_lr_find: false
check_val_every_n_epoch: 1
gradient_clip_val: 0.0
overfit_batches: 0.0
profiler: null
early_stopping: null #null because we want to turn off early stopping. With OneCycleLR, it doesn't always work well
early_stopping_min_delta: 0.001
early_stopping_mode: min
early_stopping_patience: 3
checkpoints: valid_loss
checkpoints_path: saved_models
checkpoints_mode: min
checkpoints_save_top_k: 1
load_best: true
track_grad_norm: -1
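
If you want to sanity-check the YAML before training, you can load it yourself (a quick check, assuming PyYAML is available):

import yaml

with open("examples/yaml_config/trainer_config.yml") as f:
    trainer_cfg = yaml.safe_load(f)
print(trainer_cfg["batch_size"], trainer_cfg["max_epochs"])  # 1024 20
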
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config="examples/yaml_config/trainer_config.yml",
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss_0': tensor(5011.1665, device='cuda:0'),
 'test_loss_1': tensor(7716.1343, device='cuda:0'),
 'test_mean_squared_error': tensor(27298.9863, device='cuda:0'),
 'test_mean_squared_error_0': tensor(9825.0400, device='cuda:0'),
 'test_mean_squared_error_1': tensor(17473.9434, device='cuda:0'),
 'train_loss': tensor(25364.0469, device='cuda:0'),
 'train_loss_0': tensor(8346.8223, device='cuda:0'),
 'train_loss_1': tensor(14448.5986, device='cuda:0'),
 'train_mean_squared_error': tensor(31460.0508, device='cuda:0'),
 'train_mean_squared_error_0': tensor(11417.3340, device='cuda:0'),
 'train_mean_squared_error_1': tensor(20042.7129, device='cuda:0'),
 'valid_loss': tensor(13237.4072, device='cuda:0'),
 'valid_loss_0': tensor(5411.6162, device='cuda:0'),
 'valid_loss_1': tensor(7825.7910, device='cuda:0'),
 'valid_mean_squared_error': tensor(26181.8242, device='cuda:0'),
 'valid_mean_squared_error_0': tensor(9960.1143, device='cuda:0'),
 'valid_mean_squared_error_1': tensor(16221.7100, device='cuda:0')}
--------------------------------------------------------------------------------


We can see the loss and metrics for each target, along with the aggregate loss/metric. We can pin early stopping or checkpoint saving to any one of these metrics.
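
For example, here is a sketch that pins early stopping to the first target's validation loss and checkpointing to the second target's MSE, using the per-target metric names from the results dict above (illustrative only; in this tutorial we keep early stopping turned off):

trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=20,
    early_stopping="valid_loss_0",             # stop on target 1's validation loss
    early_stopping_mode="min",
    checkpoints="valid_mean_squared_error_1",  # checkpoint on target 2's MSE
    checkpoints_mode="min",
)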

pred_df = tabular_model.predict(test)
pred_df.head()
num_col_0 cat_col_1 num_col_2 num_col_3 num_col_4 cat_col_5 num_col_6 num_col_7 num_col_8 cat_col_9 ... num_col_14 num_col_15 num_col_16 num_col_17 num_col_18 num_col_19 target_1 target_2 target_1_prediction target_2_prediction
6252 0.087964 3.0 0.441456 -0.798374 -0.182298 2.0 0.457820 -1.089108 -0.608747 2.0 ... 0.896081 -0.209956 -0.025792 -0.295642 -1.723547 1.124269 1.506144 52.811114 10.602814 35.312042
4684 1.032769 2.0 1.171905 0.644801 -2.029677 1.0 0.592990 0.129360 0.295198 0.0 ... 0.147287 0.425943 0.001516 -0.258499 -1.083438 0.588479 119.189386 93.043074 25.709728 40.447422
1731 -0.652624 0.0 -2.423879 -1.889541 -0.452306 0.0 -0.676392 0.196521 1.440117 3.0 ... 1.800940 0.840644 0.709004 -0.681052 0.128104 -0.040158 174.493212 239.079479 70.475441 137.578629
4742 -0.451170 2.0 0.574557 -1.094271 -0.875318 3.0 -0.732752 0.853738 0.713685 1.0 ... -0.107895 0.275938 -0.720602 -0.758199 0.161861 0.435219 -16.423023 -114.608320 -29.784119 -126.163574
4521 0.010387 2.0 -2.604310 -0.208409 -0.979168 3.0 0.448619 -2.838460 0.532355 0.0 ... -0.256978 -0.846792 0.109045 -0.299561 0.051376 -0.390358 11.986265 -16.142641 8.191200 11.106244

5 rows × 24 columns

print("Target 1")
print_metrics(test['target_1'], pred_df["target_1_prediction"], tag="Holdout")
print("Target 2")
print_metrics(test['target_2'], pred_df["target_2_prediction"], tag="Holdout")
Target 1
Holdout MSE: 8054.416835172999 | Holdout MAE: 69.09794806576706
Target 2
Holdout MSE: 13669.876481275782 | Holdout MAE: 90.77229285927402

Advanced

Let's do the following:

1. Apply a data transform to the continuous columns
2. Set target ranges for the multiple targets
3. Use the NODE model
4. Use a custom optimizer

# Since we are using a lower learning rate, increase the number of epochs
batch_size = 512
steps_per_epoch = int(train.shape[0]/batch_size)
epochs = 50
data_config = DataConfig(
    target=target_cols, #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform="quantile_normal"
)
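
The quantile_normal transform maps each continuous column onto an approximately Gaussian distribution. Conceptually it behaves like sklearn's QuantileTransformer with a normal output distribution; the snippet below illustrates the idea and is not PyTorch Tabular's internal code:

from sklearn.preprocessing import QuantileTransformer

qt = QuantileTransformer(output_distribution="normal", random_state=42)
transformed = qt.fit_transform(train[[num_col_names[0]]])
print(transformed.mean().round(3), transformed.std().round(3))  # close to 0 and 1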


trainer_config = TrainerConfig(
    auto_lr_find=False, # If True, runs the LR Finder to automatically derive an initial learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping=None,
    accumulate_grad_batches=2,
    gpus=1, # Number of GPUs to use; 0 means train on the CPU
)


optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":2e-3, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

model_config = NodeConfig(
    task="regression",
    num_layers=2, # Number of Dense Layers
    num_trees=1024, #Number of Trees in each layer
    depth=5, #Depth of each Tree
    embed_categorical=False, #If True, will use a learned embedding, else it will use LeaveOneOutEncoding for categorical columns
    learning_rate = 1e-3,
    target_range=[(train[col].min(),train[col].max()) for col in target_cols]
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
from torch_optimizer import QHAdam
from sklearn.preprocessing import PowerTransformer
tabular_model.fit(train=train, 
                  validation=val, 
                #   target_transform=PowerTransformer(method="yeo-johnson"), 
                  optimizer=QHAdam, 
                  optimizer_params={"nus": (0.7, 1.0), "betas": (0.95, 0.998)})
result = tabular_model.evaluate(test)

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss_0': tensor(6342.2148, device='cuda:0'),
 'test_loss_1': tensor(6692.8682, device='cuda:0'),
 'test_mean_squared_error': tensor(26778.7949, device='cuda:0'),
 'test_mean_squared_error_0': tensor(10557.6211, device='cuda:0'),
 'test_mean_squared_error_1': tensor(16221.1748, device='cuda:0'),
 'valid_loss': tensor(13153.5664, device='cuda:0'),
 'valid_loss_0': tensor(6820.9976, device='cuda:0'),
 'valid_loss_1': tensor(6332.5688, device='cuda:0'),
 'valid_mean_squared_error': tensor(25619.0547, device='cuda:0'),
 'valid_mean_squared_error_0': tensor(10790.4199, device='cuda:0'),
 'valid_mean_squared_error_1': tensor(14828.6348, device='cuda:0')}
--------------------------------------------------------------------------------
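
A note on the target_range we set above: bounding the output keeps predictions inside the range seen in training. A common way to implement this (and, as an assumption, roughly what happens under the hood) is to pass the raw network output through a sigmoid and rescale it into the bounds:

import torch

def bound_output(raw, low, high):
    # Squash raw outputs into (low, high) so predictions cannot leave the range
    return low + (high - low) * torch.sigmoid(raw)

print(bound_output(torch.tensor([-10.0, 0.0, 10.0]), 0.0, 100.0))
# approximately tensor([4.5398e-03, 5.0000e+01, 9.9995e+01])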

pred_df = tabular_model.predict(test)
pred_df.head()
num_col_0 num_col_1 cat_col_2 num_col_3 cat_col_4 num_col_5 num_col_6 num_col_7 num_col_8 num_col_9 cat_col_10 num_col_11 num_col_12 num_col_13 num_col_14 num_col_15 num_col_16 num_col_17 num_col_18 cat_col_19 target_1 target_2 target_1_prediction target_2_prediction
6252 0.087964 2.258349 2.0 -0.798374 1.0 0.381874 0.457820 -1.089108 -0.608747 0.659034 3.0 -0.412531 1.310448 -2.574799 0.896081 -0.209956 -0.025792 -0.295642 -1.723547 3.0 1.506144 52.811114 21.943665 57.454468
4684 1.032769 0.416355 3.0 0.644801 0.0 -0.660151 0.592990 0.129360 0.295198 -0.692314 1.0 1.455192 -0.332637 2.235635 0.147287 0.425943 0.001516 -0.258499 -1.083438 2.0 119.189386 93.043074 37.636200 76.193054
1731 -0.652624 -1.583903 0.0 -1.889541 1.0 -1.430775 -0.676392 0.196521 1.440117 0.760415 1.0 1.329153 0.642723 -0.446183 1.800940 0.840644 0.709004 -0.681052 0.128104 1.0 174.493212 239.079479 49.566559 107.815674
4742 -0.451170 0.239125 2.0 -1.094271 0.0 0.956782 -0.732752 0.853738 0.713685 -0.373434 1.0 0.307559 -1.287622 -0.098005 -0.107895 0.275938 -0.720602 -0.758199 0.161861 2.0 -16.423023 -114.608320 -38.183868 -67.347626
4521 0.010387 0.323700 0.0 -0.208409 0.0 1.944288 0.448619 -2.838460 0.532355 -2.779626 1.0 1.566557 -0.628510 -1.486698 -0.256978 -0.846792 0.109045 -0.299561 0.051376 1.0 11.986265 -16.142641 -32.305481 -51.654846
print("Target 1")
print_metrics(test['target_1'], pred_df["target_1_prediction"], tag="Holdout")
print("Target 2")
print_metrics(test['target_2'], pred_df["target_2_prediction"], tag="Holdout")
Target 1
Holdout MSE: 6342.214983511141 | Holdout MAE: 61.97182702152271
Target 2
Holdout MSE: 6692.867945325126 | Holdout MAE: 60.507998557001855

Comparison

Metric        Basic     Advanced
Target 1 MSE  8054.42   6342.21
Target 1 MAE  69.10     61.97
Target 2 MSE  13669.88  6692.87
Target 2 MAE  90.77     60.51