Importing the Library
import warnings

from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
Bagged Prediction
This is when we train separate models on different folds of the data and then ensemble their predictions to get the final prediction. It is a very powerful technique for improving model accuracy and is used very frequently in Kaggle competitions.
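To make the idea concrete, here is a minimal, library-agnostic sketch of bagged prediction. The names make_model, X, y, and X_test are hypothetical placeholders, not part of PyTorch Tabular; the sketch assumes a scikit-learn-style classifier factory.

import numpy as np
from sklearn.model_selection import StratifiedKFold

# Hypothetical inputs: feature matrix X, labels y, and a held-out test set X_test
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
fold_probs = []
for train_idx, _ in skf.split(X, y):
    model = make_model()  # hypothetical factory returning a fresh, untrained model
    model.fit(X[train_idx], y[train_idx])  # train only on this fold's portion of the data
    fold_probs.append(model.predict_proba(X_test))  # predict on the same test set each time

# "mean" aggregation: stack per-fold probabilities as (num_samples, num_cv, num_targets)
# and average over the fold axis to get the final prediction
bagged_probs = np.stack(fold_probs, axis=1).mean(axis=1)

The bagging_predict method introduced later in this section automates exactly this train-on-folds-then-aggregate pattern.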
data_config = DataConfig(
    target=[
        target_col
    ],  # target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=100,
    early_stopping="valid_loss",  # Monitor valid_loss for early stopping
    early_stopping_mode="min",  # Set the mode as min because for valid_loss, lower is better
    early_stopping_patience=5,  # No. of epochs of degradation training will wait before terminating
    checkpoints="valid_loss",  # Save best checkpoint monitoring valid_loss
    load_best=True,  # After training, load the best checkpoint
    progress_bar="none",  # Turning off progress bar
    trainer_kwargs=dict(enable_model_summary=False),  # Turning off model summary
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(
    layers="",  # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming",
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between each layer
    learning_rate=1e-3,
    head="LinearHead",  # Linear head
    head_config=head_config,  # Linear head config
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)
We can use the high-level method bagging_predict in TabularModel.
The arguments are as follows:
- cv can either be an integer or a KFold object. If it is an integer, it will be treated as the number of folds in a KFold; for classification problems, a StratifiedKFold will be used. If it is a KFold object, it will be used as is.
- train is the training dataset.
- test is the test dataset.
- aggregate is how we ensemble the predictions. It can be mean, median, min, max, or hard_voting. hard_voting is only valid for classification. We can also pass in a custom function that takes in a list of 3D arrays (num_samples, num_cv, num_targets) and returns a 2D array of final probabilities (num_samples, num_targets); see the sketch after this list.
- weights is used for aggregating the predictions from each fold. If None, equal weights will be used. This is only used when aggregate is "mean".
- return_raw_predictions: if True, the raw predictions from each fold will also be returned. Defaults to False.
For the entire list of arguments, please refer to the docstring.
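To illustrate the custom-function option for aggregate, here is a hedged sketch. The function name and calling convention are assumptions: it treats the per-fold predictions as a single stacked (num_samples, num_cv, num_targets) array, matching the shape quoted above; check the docstring to confirm exactly how your version passes the predictions.

import numpy as np

def trimmed_mean(preds):
    # preds: per-fold probabilities, assumed stacked as (num_samples, num_cv, num_targets)
    preds = np.asarray(preds)
    # Sort along the fold axis and drop the largest value per sample/target,
    # then average the remaining folds - a simple outlier-resistant aggregation
    trimmed = np.sort(preds, axis=1)[:, :-1, :]
    return trimmed.mean(axis=1)  # 2D array of final probabilities (num_samples, num_targets)

Such a function would then be passed as aggregate=trimmed_mean in place of a string.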
# Bagged prediction with 3-fold cross validation, using sklearn utilities
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

# A KFold object like this can also be passed directly as `cv` instead of an integer
kf = KFold(n_splits=5, shuffle=True, random_state=42)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # Silence per-fold training warnings
    bagged_pred_df = tabular_model.bagging_predict(
        cv=3, train=train, test=test, aggregate="mean"
    )
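Since accuracy_score and f1_score are imported above, a natural follow-up is scoring the bagged predictions. This is a hedged sketch: the "prediction" column name is an assumption (inspect bagged_pred_df.columns for the exact output columns in your version), and test is assumed to still contain the true labels in target_col.

y_true = test[target_col]  # ground-truth labels from the test dataframe
y_pred = bagged_pred_df["prediction"]  # assumed column name; check bagged_pred_df.columns
print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred, average="weighted"))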