Importing the Library¶
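The import cell is not reproduced on this page. Below is a plausible set of imports covering everything used in the rest of the tutorial; the module paths follow pytorch_tabular's documented layout, and rich's `print` is an assumption to render the `[bold ...]` markup used in the low-level loop later.

```python
import warnings

from rich import print  # assumption: renders the [bold ...] markup used below
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold

from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
```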
Cross Validation¶
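The configs below assume a training dataframe `train` with a `target` column, plus `num_col_names` and `cat_col_names` lists, all prepared earlier in the original notebook. Here is a minimal stand-in sketch using scikit-learn's `make_classification`; the column names and sizes are illustrative assumptions, not from the original notebook.

```python
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Hypothetical stand-in dataset; the original notebook prepares its own data
X, y = make_classification(n_samples=10_000, n_features=8, n_informative=5, random_state=42)
num_col_names = [f"num_{i}" for i in range(8)]
train = pd.DataFrame(X, columns=num_col_names)
train["cat_1"] = np.random.RandomState(42).choice(["a", "b", "c"], size=len(train))
cat_col_names = ["cat_1"]
train["target"] = y
```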
data_config = DataConfig(
    target=[
        "target"
    ],  # target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=100,
    early_stopping="valid_loss",  # Monitor valid_loss for early stopping
    early_stopping_mode="min",  # Set the mode as min because for val_loss, lower is better
    early_stopping_patience=5,  # No. of epochs of degrading performance to wait before terminating
    checkpoints="valid_loss",  # Save the best checkpoint monitoring val_loss
    load_best=True,  # After training, load the best checkpoint
    progress_bar="none",  # Turning off progress bar
    trainer_kwargs=dict(enable_model_summary=False),  # Turning off model summary
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(
    layers="",  # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming",
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)
model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="1024-512-512",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation between each layer
    learning_rate=1e-3,
    head="LinearHead",  # Linear head
    head_config=head_config,  # Linear head config
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)
Using High-Level API¶
We can use the high-level method `cross_validate` in `TabularModel`.
The arguments are as follows:
- `cv` can either be an integer or a `KFold` object. If it is an integer, it is treated as the number of folds in a `KFold`; for classification problems, a `StratifiedKFold` is used instead. If it is a `KFold` object, it is used as-is.
- `metric` is the metric to be used for evaluation. It can either be a string (the name of the metric) or a callable. If it is a callable, it should take two arguments: the predictions and the targets. The predictions are the dataframe output from `model.predict`, and the targets can be a series or an array.
- `train` is the training dataset.
- `return_oof` is a boolean. If set to `True`, it will also return the out-of-fold predictions for the training dataset. This is useful for stacking models.
- `reset_datamodule` is a boolean. If set to `True`, it will reset the datamodule after each fold, which is the right way of doing cross validation. If set to `False`, it will not reset the datamodule; this is faster, but introduces a small amount of data leakage. This is useful when working with huge datasets where you want to save time.
# cross validation loop using sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)


def _accuracy(y_true, y_pred):
    return accuracy_score(y_true, y_pred["prediction"].values)


with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    cv_scores, oof_predictions = tabular_model.cross_validate(
        cv=kf, train=train, metric=_accuracy, return_oof=True, reset_datamodule=False
    )
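`cv_scores` holds one score per fold, and `oof_predictions` holds the out-of-fold predictions for stacking. A quick way to summarize the fold scores is shown below; this is a minimal follow-up, assuming numpy is available as `np`.

```python
import numpy as np

# Aggregate the per-fold scores returned by cross_validate
print(f"KFold Mean: {np.mean(cv_scores)} | KFold SD: {np.std(cv_scores)}")
```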
Using Low-Level API¶
Sometimes we might want to do something more than a plain, vanilla cross validation. For example, we might want to do cross validation with multiple metrics, or with a custom metric that relies on something other than the targets and predictions. In such cases, we can use the low-level API.
def _accuracy(y_true, y_pred):
    return accuracy_score(y_true, y_pred["prediction"].values)


def _roc_auc_score(y_true, y_pred):
    return roc_auc_score(y_true, y_pred["class_1_probability"].values)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Initialize the tabular model once
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    verbose=False,
)
acc_metrics = []
roc_metrics = []
datamodule = None
model = None
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
        train_fold = train.iloc[train_idx]
        val_fold = train.iloc[val_idx]
        if datamodule is None:
            # Initialize datamodule and model in the first fold
            # uses train data from this fold to fit all transformers
            datamodule = tabular_model.prepare_dataloader(
                train=train_fold, validation=val_fold, seed=42
            )
            model = tabular_model.prepare_model(datamodule)
        else:
            # Creates a copy of the datamodule with same transformers but different train and validation data
            datamodule = datamodule.copy(train=train_fold, validation=val_fold)
        # Train the model
        tabular_model.train(model, datamodule)
        pred_df = tabular_model.predict(val_fold)
        acc_metrics.append(_accuracy(val_fold["target"], pred_df))
        roc_metrics.append(_roc_auc_score(val_fold["target"], pred_df))
        print(
            f"[bold red]Fold:[/bold red] {fold} | [bold green]Accuracy:[/bold green]"
            f" {acc_metrics[-1]} | [bold green]AUC:[/bold green] {roc_metrics[-1]}"
        )
        # Reset the trained weights before next fold
        tabular_model.model.reset_weights()
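As with the high-level API, the per-fold lists can be aggregated once the loop finishes. This is a minimal sketch, again assuming numpy as `np`.

```python
import numpy as np

# Average the fold-level metrics collected in the loop above
print(f"Mean Accuracy: {np.mean(acc_metrics):.4f} | Mean AUC: {np.mean(roc_metrics):.4f}")
```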