from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import random
import numpy as np
import pandas as pd
import os
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
if not IN_COLAB:
    os.chdir("..")
%load_ext autoreload
%autoreload 2
Utility Functions
def make_mixed_regression(n_samples, n_features, n_categories):
    X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5, n_targets=2)
    # Pick n_categories distinct columns to turn into categorical features
    cat_cols = random.sample(range(X.shape[-1]), k=n_categories)
    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
    # Bin each chosen column into 4 quantile-based categories
    for col in cat_cols:
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)
    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(X.shape[-1]):
        if i in cat_cols:
            col_names.append(f"cat_col_{i}")
            cat_col_names.append(f"cat_col_{i}")
        if i in num_cols:
            col_names.append(f"num_col_{i}")
            num_col_names.append(f"num_col_{i}")
    X = pd.DataFrame(X, columns=col_names)
    y = pd.DataFrame(y, columns=["target_1", "target_2"])
    data = X.join(y)
    return data, cat_col_names, num_col_names
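As a quick, illustrative sanity check, the function can be called on a small sample to confirm the mixed schema (the quantile binning gives every categorical column exactly four codes):
# Generate a tiny frame and inspect the categorical columns (illustrative)
demo, demo_cats, demo_nums = make_mixed_regression(n_samples=100, n_features=10, n_categories=2)
demo[demo_cats].nunique()  # 4 quantile-bin codes per categorical column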
def print_metrics(y_true, y_pred, tag):
    # Accept pandas objects or numpy arrays and flatten everything to 1-D
    if isinstance(y_true, (pd.DataFrame, pd.Series)):
        y_true = y_true.values
    if isinstance(y_pred, (pd.DataFrame, pd.Series)):
        y_pred = y_pred.values
    if y_true.ndim > 1:
        y_true = y_true.ravel()
    if y_pred.ndim > 1:
        y_pred = y_pred.ravel()
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {mse} | {tag} MAE: {mae}")
Generate Synthetic Data
First of all, let's create a synthetic dataset that mixes numerical and categorical features and has multiple targets for regression.
data, cat_col_names, num_col_names = make_mixed_regression(n_samples=10000, n_features=20, n_categories=4)
target_cols = ['target_1','target_2']
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)
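With train_test_split's default test_size of 0.25 applied twice, this yields 5,625 training, 1,875 validation, and 2,500 test rows:
print(train.shape, val.shape, test.shape)  # (5625, 22) (1875, 22) (2500, 22)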
Importing the Library
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
batch_size = 1024  # Will set the same value in the Trainer YAML file
steps_per_epoch = int(train.shape[0] / batch_size)  # 5625 train rows -> 5 steps per epoch
epochs = 20
Basic
Define the Configs
In the Basic tutorial, we saw how to declare these parameters programmatically. We can also use YAML files to manage the configuration; in that case, we just pass the path to the file as the corresponding argument to TabularModel. Let's use a YAML file for the TrainerConfig.
For the learning rate scheduler, let's use OneCycleLR, popularized by fast.ai.
data_config = DataConfig(
target=target_cols, #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
continuous_cols=num_col_names,
categorical_cols=cat_col_names,
)
optimizer_config = OptimizerConfig(
    lr_scheduler="OneCycleLR",
    lr_scheduler_params={"max_lr": 0.00478, "epochs": epochs, "steps_per_epoch": steps_per_epoch},
)
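To build intuition for what this scheduler does, here is a minimal sketch of the OneCycleLR shape in plain PyTorch (independent of pytorch_tabular; the toy model exists only to have parameters to schedule):
# Trace the learning rate over one full cycle
import torch
toy = torch.nn.Linear(1, 1)
opt = torch.optim.SGD(toy.parameters(), lr=0.1)
sched = torch.optim.lr_scheduler.OneCycleLR(
    opt, max_lr=0.00478, epochs=epochs, steps_per_epoch=steps_per_epoch
)
lrs = []
for _ in range(epochs * steps_per_epoch):
    opt.step()
    sched.step()
    lrs.append(sched.get_last_lr()[0])
# lrs rises to max_lr for roughly the first 30% of steps, then anneals back down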
model_config = CategoryEmbeddingModelConfig(
    task="regression",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between adjacent layers
    learning_rate=1e-3,
)
Trainer Config YAML file
batch_size: 1024
fast_dev_run: false
max_epochs: 20
min_epochs: 1
gpus: -1
accumulate_grad_batches: 1
auto_lr_find: false
check_val_every_n_epoch: 1
gradient_clip_val: 0.0
overfit_batches: 0.0
profiler: null
early_stopping: null # null turns off early stopping; it doesn't always work well with OneCycleLR
early_stopping_min_delta: 0.001
early_stopping_mode: min
early_stopping_patience: 3
checkpoints: valid_loss
checkpoints_path: saved_models
checkpoints_mode: min
checkpoints_save_top_k: 1
load_best: true
track_grad_norm: -1
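Before handing the path to TabularModel, we can sanity-check the file with a plain YAML load (a small sketch, assuming the content above is saved as examples/yaml_config/trainer_config.yml):
# Verify the YAML parses and holds the values we expect
import yaml
with open("examples/yaml_config/trainer_config.yml") as f:
    cfg = yaml.safe_load(f)
assert cfg["batch_size"] == 1024 and cfg["max_epochs"] == 20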
tabular_model = TabularModel(
data_config=data_config,
model_config=model_config,
optimizer_config=optimizer_config,
trainer_config="examples/yaml_config/trainer_config.yml",
)
tabular_model.fit(train=train, validation=val)
result = tabular_model.evaluate(test)
We can see the metrics and loss for each target, as well as a combined loss/metric. We can pin early stopping or checkpoint saving to any one of these metrics, as sketched below.
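For instance, a TrainerConfig along these lines would pin both to a single monitored metric (a hedged sketch; "valid_loss" is the aggregate validation loss used in the YAML above, and the exact per-target metric names can be read off the keys of result):
# Sketch: pin early stopping and checkpointing to one monitored metric
monitored_trainer_config = TrainerConfig(
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping="valid_loss",  # metric to watch for early stopping
    early_stopping_mode="min",
    checkpoints="valid_loss",  # metric that decides which checkpoint to keep
    checkpoints_mode="min",
)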
pred_df = tabular_model.predict(test)
pred_df.head()
print("Target 1")
print_metrics(test['target_1'], pred_df["target_1_prediction"], tag="Holdout")
print("Target 2")
print_metrics(test['target_2'], pred_df["target_2_prediction"], tag="Holdout")
Advanced
Let's do the following:
1. A data transform for the continuous columns
2. Set target ranges for the multiple targets
3. Use the NODE model
4. A custom optimizer
# Since we are using a lower learning rate, increase the number of epochs
batch_size = 512
steps_per_epoch = int(train.shape[0] / batch_size)
epochs = 50
data_config = DataConfig(
target=target_cols, #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
continuous_cols=num_col_names,
categorical_cols=cat_col_names,
continuous_feature_transform="quantile_normal"
)
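The quantile_normal transform maps each continuous feature toward a standard normal distribution before training. Conceptually it behaves like scikit-learn's QuantileTransformer (an illustrative stand-in; pytorch_tabular applies the transform internally when the option is set):
# Illustration of a quantile-normal transform with scikit-learn
from sklearn.preprocessing import QuantileTransformer
qt = QuantileTransformer(output_distribution="normal", random_state=42)
train_gaussianized = qt.fit_transform(train[num_col_names])
# transformed columns are approximately standard normal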
trainer_config = TrainerConfig(
    auto_lr_find=False,  # If True, runs the LR Finder to automatically derive a learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping=None,
    accumulate_grad_batches=2,
    gpus=-1,  # Index of the GPU to use. -1 means all available GPUs, None means CPU
)
optimizer_config = OptimizerConfig(
    lr_scheduler="OneCycleLR",
    lr_scheduler_params={"max_lr": 2e-3, "epochs": epochs, "steps_per_epoch": steps_per_epoch},
)
model_config = NodeConfig(
    task="regression",
    num_layers=2,  # Number of Dense Layers (of Oblivious Decision Trees)
    num_trees=1024,  # Number of Trees in each layer
    depth=5,  # Depth of each Tree
    embed_categorical=False,  # If True, uses a learned embedding; else LeaveOneOutEncoding for categorical columns
    learning_rate=1e-3,
    # Constrain each prediction head to the observed range of its target
    target_range=[(float(train[col].min()), float(train[col].max())) for col in target_cols],
)
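target_range constrains each regression head to the given (min, max) interval. Conceptually this amounts to squashing the raw network output through a sigmoid and rescaling, along these lines (an assumption-labelled sketch, not pytorch_tabular's exact internals):
import torch
def bound_output(raw: torch.Tensor, low: float, high: float) -> torch.Tensor:
    # sigmoid maps raw to (0, 1); rescaling maps that into (low, high)
    return low + (high - low) * torch.sigmoid(raw)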
tabular_model = TabularModel(
data_config=data_config,
model_config=model_config,
optimizer_config=optimizer_config,
trainer_config=trainer_config,
)
from torch_optimizer import QHAdam
from sklearn.preprocessing import PowerTransformer
tabular_model.fit(
    train=train,
    validation=val,
    # target_transform=PowerTransformer(method="yeo-johnson"),
    optimizer=QHAdam,
    optimizer_params={"nus": (0.7, 1.0), "betas": (0.95, 0.998)},
)
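Any torch.optim-style optimizer class can be passed this way, with optimizer_params forwarded to its constructor. Standalone, the same QHAdam setup looks like this (an illustrative sketch; the toy module exists only to supply parameters):
import torch
from torch_optimizer import QHAdam
toy = torch.nn.Linear(4, 1)
opt = QHAdam(toy.parameters(), lr=1e-3, nus=(0.7, 1.0), betas=(0.95, 0.998))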
result = tabular_model.evaluate(test)
pred_df = tabular_model.predict(test)
pred_df.head()
print("Target 1")
print_metrics(test['target_1'], pred_df["target_1_prediction"], tag="Holdout")
print("Target 2")
print_metrics(test['target_2'], pred_df["target_2_prediction"], tag="Holdout")
Comparison
Target | Basic | Advanced |
---|---|---|
Target 1 MSE | 8054.42 | 6342.21 |
Target 1 MAE | 69.09 | 61.97 |
Target 2 MSE | 13669.88 | 6692.86 |
Target 2 MAE | 90.77 | 60.51 |