Neural Category Embeddings vs One-Hot Encoding — Forest Cover Type example
import os

# Detect whether we are running inside Google Colab: the google.colab module
# only exists in that environment.
try:
  import google.colab  # noqa: F401
  IN_COLAB = True
except ImportError:  # was a bare `except:` — narrow so real errors surface
  IN_COLAB = False

# Locally the notebook lives one level below the project root; move up so
# relative paths such as "data" resolve correctly.
if not IN_COLAB:
    os.chdir("..")
from sklearn.datasets import fetch_covtype
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import category_encoders as ce
%load_ext autoreload
%autoreload 2

Utility Functions

def load_classification_data():
    """Fetch the Forest Cover Type dataset and return an 80/20 train/test split.

    Returns:
        (train, test, target_names): two DataFrames plus the list of target
        column names (always ``["target"]`` here).
    """
    bunch = fetch_covtype(data_home="data")
    # Stack features and labels so everything lives in a single frame.
    matrix = np.hstack([bunch.data, bunch.target.reshape(-1, 1)])
    columns = [f"feature_{i}" for i in range(matrix.shape[-1])]
    columns[-1] = "target"
    frame = pd.DataFrame(matrix, columns=columns)
    # Derive a 4-bucket categorical from feature_0 (quartile binning) so the
    # embedding experiments have at least one categorical column to work with.
    frame["feature_0_cat"] = pd.qcut(frame["feature_0"], q=4)
    frame["feature_0_cat"] = "feature_0_" + frame.feature_0_cat.cat.codes.astype(str)
    # Hold out a reproducible 20% sample as the test set.
    holdout_idx = frame.sample(int(0.2 * len(frame)), random_state=42).index
    in_holdout = frame.index.isin(holdout_idx)
    test = frame[in_holdout]
    train = frame[~in_holdout]
    return (train, test, ["target"])

def print_metrics(y_true, y_pred, tag):
    """Print accuracy and macro-F1 for a pair of label collections.

    Accepts numpy arrays, pandas Series, or single-column DataFrames; any
    pandas container is converted to a flat 1-D numpy array before scoring.
    """
    def _as_flat_array(values):
        # Normalize pandas containers to numpy and collapse to 1-D.
        if isinstance(values, (pd.DataFrame, pd.Series)):
            values = values.values
        return values.ravel() if values.ndim > 1 else values

    y_true = _as_flat_array(y_true)
    y_pred = _as_flat_array(y_pred)
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    print(f"{tag} Acc: {acc} | {tag} F1: {macro_f1}")

Load Forest Cover Data

# Load the data and carve a validation set out of the training split
# (train_test_split default is a 75/25 split).
train, test, target_col = load_classification_data()
train, val = train_test_split(train, random_state=42)
# Column groupings: one derived categorical, everything else numeric.
cat_col_names = ["feature_0_cat"]
num_col_names = [col for col in train.columns if col not in cat_col_names+target_col]
# One-hot encode the categorical column for the tree-model baseline; the
# encoder is fit on train only and applied to val/test.
encoder = ce.OneHotEncoder(cols=cat_col_names)
train_transform = encoder.fit_transform(train)
val_transform = encoder.transform(val)
test_transform = encoder.transform(test)
D:\miniconda3\envs\df_encoder\lib\site-packages\category_encoders\utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  elif pd.api.types.is_categorical(cols):

Baseline

Let's use the default LightGBM model as a baseline.

# Baseline: default LightGBM on the one-hot encoded features.
clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
clf.fit(train_transform.drop(columns=target_col), train_transform[target_col].values.ravel())
val_pred = clf.predict(val_transform.drop(columns=target_col))
print_metrics(val_transform[target_col], val_pred, "Validation")
# Use target_col consistently (was the hard-coded string 'target').
test_pred = clf.predict(test_transform.drop(columns=target_col))
print_metrics(test_transform[target_col], test_pred, "Holdout")
Validation Acc: 0.8528953641472251 | Validation F1: 0.825508819288814
Holdout Acc: 0.8517409338909829 | Holdout F1: 0.8175438711213123

CategoryEmbedding Model

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, NodeConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer

# Data handling: which columns are targets / continuous / categorical and how
# continuous features are transformed before entering the network.
data_config = DataConfig(
    target=target_col, #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=1000,
    gpus=-1,  #index of the GPU to use. -1 means all available GPUs, None, means CPU
)
optimizer_config = OptimizerConfig()  # library defaults
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="4096-4096-512",  # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    learning_rate = 1e-3,
    metrics=["accuracy", "f1"],
    # NOTE(review): f1 is computed here with average="micro" while
    # print_metrics reports macro-F1 — the two numbers are not directly
    # comparable; confirm which averaging was intended.
    metrics_params=[{},{"average":"micro"}]
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
# Train the model; `test` is passed so test metrics are logged during
# training, not used for fitting. Then evaluate on the holdout set.
tabular_model.fit(train=train, test=test)
result = tabular_model.evaluate(test)
print(result)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': tensor(0.7331, device='cuda:0'),
 'train_accuracy': tensor(0.6792, device='cuda:0'),
 'train_loss': tensor(0.7323, device='cuda:0'),
 'valid_accuracy': tensor(0.7256, device='cuda:0'),
 'valid_loss': tensor(0.6508, device='cuda:0')}
--------------------------------------------------------------------------------

[{'train_loss': 0.7322986721992493, 'valid_loss': 0.6507797837257385, 'valid_accuracy': 0.7255920171737671, 'train_accuracy': 0.6791602969169617, 'test_accuracy': 0.7330596446990967}]

To get the prediction as a dataframe, we can use the predict method. This will add predictions to the same dataframe that was passed in. For classification problems, we get both the probabilities and the final prediction taking 0.5 as the threshold

# predict() returns the input frame augmented with per-class probability
# columns and a final `prediction` column.
pred_df = tabular_model.predict(test)
pred_df.head()
feature_0 feature_1 feature_2 feature_3 feature_4 feature_5 feature_6 feature_7 feature_8 feature_9 ... target feature_0_cat 1.0_probability 2.0_probability 3.0_probability 4.0_probability 5.0_probability 6.0_probability 7.0_probability prediction
0 2596.0 51.0 3.0 258.0 0.0 510.0 221.0 232.0 148.0 6279.0 ... 5.0 4 0.091117 0.903000 0.000026 3.998069e-07 0.005816 0.000012 0.000030 2.0
2 2804.0 139.0 9.0 268.0 65.0 3180.0 234.0 238.0 135.0 6121.0 ... 2.0 4 0.106677 0.869829 0.000271 2.589042e-06 0.022369 0.000835 0.000017 2.0
6 2606.0 45.0 7.0 270.0 5.0 633.0 222.0 225.0 138.0 6256.0 ... 5.0 4 0.109588 0.880163 0.000282 4.664133e-06 0.009279 0.000315 0.000369 2.0
7 2605.0 49.0 4.0 234.0 7.0 573.0 222.0 230.0 144.0 6228.0 ... 5.0 4 0.332467 0.664670 0.000001 3.689538e-08 0.002125 0.000001 0.000736 2.0
12 2742.0 134.0 22.0 150.0 69.0 3215.0 248.0 224.0 92.0 6091.0 ... 2.0 4 0.043308 0.917662 0.000034 1.030319e-06 0.038968 0.000015 0.000012 2.0

5 rows × 64 columns

# Holdout metrics for the raw neural model (notably below the LightGBM baseline above).
print_metrics(test['target'], pred_df["prediction"], tag="Holdout")
Holdout Acc: 0.6147828780915991 | Holdout F1: 0.3269562480388109

Extract the Learned Embedding

For the models that support it (CategoryEmbeddingModel and CategoryEmbeddingNODE), we can extract the learned embeddings into a scikit-learn style Transformer. You can use this in your scikit-learn pipelines and workflows as a drop-in replacement.

# Replace the categorical column with the embeddings learned by the neural
# model, then retrain LightGBM on the embedded representation.
transformer = CategoricalEmbeddingTransformer(tabular_model)
train_transform = transformer.fit_transform(train)
clf = lgb.LGBMClassifier(random_state=42)
# Use target_col consistently (was the hard-coded 'target' string) and ravel
# the labels to a 1-D array, matching the baseline fit earlier in the file.
clf.fit(train_transform.drop(columns=target_col), train_transform[target_col].values.ravel())
LGBMClassifier(random_state=42)
# Score the embedding-based LightGBM on the validation and holdout splits.
val_transform = transformer.transform(val)
val_pred = clf.predict(val_transform.drop(columns=target_col))
print_metrics(val_transform[target_col], val_pred, "Validation")
test_transform = transformer.transform(test)
test_pred = clf.predict(test_transform.drop(columns=target_col))
print_metrics(test_transform[target_col], test_pred, "Holdout")
Validation Acc: 0.8561396865829626 | Validation F1: 0.8260076319996745
Holdout Acc: 0.8555876835166348 | Holdout F1: 0.8233005227790506

Split One-Hot Encoding Neural Embedding
Validation Accuracy 85.28% 85.61%
Validation F1 82.55% 82.60%
Holdout Accuracy 85.17% 85.55%
Holdout F1 81.75% 82.33%