from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
import numpy as np
import pandas as pd
import os
try:
import google.colab
IN_COLAB = True
except:
IN_COLAB = False
if not IN_COLAB:
os.chdir("..")
%load_ext autoreload
%autoreload 2
Utility Functions
def make_mixed_classification(n_samples, n_features, n_categories):
X,y = make_classification(n_samples=n_samples, n_features=n_features, random_state=42, n_informative=5)
cat_cols = random.choices(list(range(X.shape[-1])),k=n_categories)
num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
for col in cat_cols:
X[:,col] = pd.qcut(X[:,col], q=4).codes.astype(int)
col_names = []
num_col_names=[]
cat_col_names=[]
for i in range(X.shape[-1]):
if i in cat_cols:
col_names.append(f"cat_col_{i}")
cat_col_names.append(f"cat_col_{i}")
if i in num_cols:
col_names.append(f"num_col_{i}")
num_col_names.append(f"num_col_{i}")
X = pd.DataFrame(X, columns=col_names)
y = pd.Series(y, name="target")
data = X.join(y)
return data, cat_col_names, num_col_names
def print_metrics(y_true, y_pred, tag):
if isinstance(y_true, pd.DataFrame) or isinstance(y_true, pd.Series):
y_true = y_true.values
if isinstance(y_pred, pd.DataFrame) or isinstance(y_pred, pd.Series):
y_pred = y_pred.values
if y_true.ndim>1:
y_true=y_true.ravel()
if y_pred.ndim>1:
y_pred=y_pred.ravel()
val_acc = accuracy_score(y_true, y_pred)
val_f1 = f1_score(y_true, y_pred)
print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")
Generate Synthetic Data
First of all, let's create a synthetic data which is a mix of numerical and categorical features
data, cat_col_names, num_col_names = make_mixed_classification(n_samples=10000, n_features=20, n_categories=4)
train, test = train_test_split(data, random_state=42)
train, val = train_test_split(train, random_state=42)
Importing the Library
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
Define the Configs
This is the most crucial step in the process. There are four configs that you need to provide(most of them have intelligent default values), which will drive the rest of the process.
- DataConfig - Define the target column names, categorical and numerical column names, any transformation you need to do, etc.
- ModelConfig - There is a specific config for each of the models. This determines which model we are going to train and also lets you define the hyperparameters of the model
- TrainerConfig - This let's you configure the training process by setting things like batch_size, epochs, early stopping, etc. The vast majority of parameters are directly borrowed from PyTorch Lightning and is passed to the underlying Trainer object during training
- OptimizerConfig - This let's you define and use different Optimizers and LearningRate Schedulers. Standard PyTorch Optimizers and Learning RateSchedulers are supported. For custom optimizers, you can use the parameter in the fit method to overwrite this. The custom optimizer should be PyTorch compatible
- ExperimentConfig - This is an optional parameter. If set, this defines the Experiment Tracking. Right now, only two experiment tracking frameworks are supported: Tensorboard and Weights&Biases. W&B experiment tracker has more features like tracking the gradients and logits across epochs.
data_config = DataConfig(
target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
continuous_cols=num_col_names,
categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
batch_size=1024,
max_epochs=100,
gpus=-1, #index of the GPU to use. -1 means all available GPUs, None, means CPU
)
optimizer_config = OptimizerConfig()
model_config = CategoryEmbeddingModelConfig(
task="classification",
layers="1024-512-512", # Number of nodes in each layer
activation="LeakyReLU", # Activation between each layers
learning_rate = 1e-3
)
tabular_model = TabularModel(
data_config=data_config,
model_config=model_config,
optimizer_config=optimizer_config,
trainer_config=trainer_config,
)
Training the Model
Now that we have defined the configs and the TabularModel. We just need to call the fit
method and pass the train and test dataframes. We can also pass in validation dataframe. But if omitted, TabularModel will separate 20%(also configurable) at random from the data as validation.
By default, EarlyStopping is enabled and is monitoring Validation Loss with a patience of 3 epochs. The trainer also saves the best model(based on validation loss) and loads that model at the end of training. TrainerConfig
has the parameters to tweak this default behaviour.
tabular_model.fit(train=train, validation=val)
Evaluating the Model
Loss and Metrics on New Data
To evaluate the model on new data on the same metrics/loss that was used during training, we can use the evaluate
method
result = tabular_model.evaluate(test)
New Predictions as DataFrame
To get the prediction as a dataframe, we can use the predict
method. This will add predictions to the same dataframe that was passed in. For classification problems, we get both the probabilities and the final prediction taking 0.5 as the threshold
pred_df = tabular_model.predict(test)
pred_df.head()
print_metrics(test['target'], pred_df["prediction"], tag="Holdout")
Saving and Loading the Model
tabular_model.save_model("examples/basic")
loaded_model = TabularModel.load_from_checkpoint("examples/basic")
result = loaded_model.evaluate(test)