from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
from pytorch_tabular.utils import make_mixed_dataset, print_metrics
import os
%load_ext autoreload
%autoreload 2
Importing the Library
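The config classes and utilities used below come from pytorch_tabular and need to be imported, and a demo dataset needs to be created and split. A minimal setup sketch follows; the make_mixed_dataset arguments (sample count, feature count, categories) are illustrative assumptions, not values prescribed by the library:

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.utils import get_balanced_sampler, get_class_weighted_cross_entropy

# Create a synthetic mixed (continuous + categorical) classification dataset
# and split it into train/validation/test frames. Sizes are illustrative.
data, cat_col_names, num_col_names = make_mixed_dataset(
    task="classification", n_samples=10000, n_features=20, n_categories=4
)
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)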
Define the Configs
data_config = DataConfig(
    target=["target"],  # target should always be a list; multi-targets are only supported for regression, and multi-task classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Runs the LR Finder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    early_stopping="valid_loss",  # Monitor valid_loss for early stopping
    early_stopping_mode="min",  # Set the mode to min because lower valid_loss is better
    early_stopping_patience=5,  # No. of epochs of degradation to wait before terminating training
    checkpoints="valid_loss",  # Save the best checkpoint, monitored on valid_loss
    load_best=True,  # After training, load the best checkpoint
    # accelerator="cpu"
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(
    layers="",  # No additional layers in the head; just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming",
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between the layers
    head="LinearHead",
    head_config=head_config,  # Config for the LinearHead defined above
    learning_rate=1e-3,
    metrics=["f1_score", "accuracy"],
    metrics_params=[{"num_classes": 2}, {}],  # f1_score needs num_classes
    metrics_prob_input=[True, False],  # f1_score needs probability scores, while accuracy doesn't
)
Training the Model
Custom Sampler
PyTorch Tabular also allows custom batching strategies through custom samplers, which come in handy when working with imbalanced data.
Although you can use any sampler, PyTorch Tabular ships a few handy utility functions that take in the target array and implement a WeightedRandomSampler using inverse-frequency sampling to combat imbalance. This is analogous to preprocessing techniques like under- or over-sampling in traditional ML systems; see the sketch below.
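A minimal sketch of this workflow, using the get_balanced_sampler utility from pytorch_tabular.utils and passing the resulting sampler to fit via the train_sampler argument (names as in recent PyTorch Tabular releases; verify against your installed version). It assumes a TabularModel constructed as in the next code block:

# Build a WeightedRandomSampler from the training targets via inverse-frequency weights
sampler = get_balanced_sampler(train["target"].values.ravel())
# Train with the custom sampler instead of (not in addition to) a weighted loss
tabular_model.fit(train=train, validation=val, train_sampler=sampler)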
Custom Weighted Loss
If samplers were like over/under-sampling, a custom weighted loss is similar to class_weights. Depending on the problem, one of these might help you with imbalance. You can easily calculate the class weights and provide them to CrossEntropyLoss using the weight parameter. To make this easier, PyTorch Tabular has a handy utility method which calculates smoothed class weights and initializes a weighted loss. Once you have that loss, it's just a matter of passing it to the fit method using the loss parameter.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)
# mu controls the smoothing applied to the inverse-frequency class weights
weighted_loss = get_class_weighted_cross_entropy(train["target"].values.ravel(), mu=0.1)
tabular_model.fit(train=train, validation=val, loss=weighted_loss)
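To check how the weighted loss fared on the held-out set, you can score the test split with the sklearn metrics imported at the top. A short sketch, assuming predict returns a prediction column for classification (as it does in recent PyTorch Tabular versions):

# Score the held-out test set
pred_df = tabular_model.predict(test)
print("Holdout Accuracy:", accuracy_score(test["target"], pred_df["prediction"]))
print("Holdout F1:", f1_score(test["target"], pred_df["prediction"]))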