API
pytorch_tabular.tabular_model.TabularModel
__init__(self, config=None, data_config=None, model_config=None, optimizer_config=None, trainer_config=None, experiment_config=None, model_callable=None)
special
The core model which orchestrates everything from initializing the datamodule, the model, trainer, etc.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
config | Optional[omegaconf.dictconfig.DictConfig] | Single OmegaConf DictConfig object or the path to the yaml file holding all the config parameters. Defaults to None. | None |
data_config | Union[pytorch_tabular.config.config.DataConfig, str] | DataConfig object or path to the yaml file. Defaults to None. | None |
model_config | Union[pytorch_tabular.config.config.ModelConfig, str] | A subclass of ModelConfig or path to the yaml file. Determines which model to run from the type of config. Defaults to None. | None |
optimizer_config | Union[pytorch_tabular.config.config.OptimizerConfig, str] | OptimizerConfig object or path to the yaml file. Defaults to None. | None |
trainer_config | Union[pytorch_tabular.config.config.TrainerConfig, str] | TrainerConfig object or path to the yaml file. Defaults to None. | None |
experiment_config | Union[pytorch_tabular.config.config.ExperimentConfig, str] | ExperimentConfig object or path to the yaml file. If provided, configures experiment tracking. Defaults to None. | None |
model_callable | Optional[Callable] | If provided, will override the model callable that will be loaded from the config. Typically used when providing custom models. | None |
Source code in pytorch_tabular/tabular_model.py
def __init__(
self,
config: Optional[DictConfig] = None,
data_config: Optional[Union[DataConfig, str]] = None,
model_config: Optional[Union[ModelConfig, str]] = None,
optimizer_config: Optional[Union[OptimizerConfig, str]] = None,
trainer_config: Optional[Union[TrainerConfig, str]] = None,
experiment_config: Optional[Union[ExperimentConfig, str]] = None,
model_callable: Optional[Callable] = None,
) -> None:
"""The core model which orchestrates everything from initializing the datamodule, the model, trainer, etc.
Args:
config (Optional[Union[DictConfig, str]], optional): Single OmegaConf DictConfig object or
the path to the yaml file holding all the config parameters. Defaults to None.
data_config (Optional[Union[DataConfig, str]], optional): DataConfig object or path to the yaml file. Defaults to None.
model_config (Optional[Union[ModelConfig, str]], optional): A subclass of ModelConfig or path to the yaml file.
Determines which model to run from the type of config. Defaults to None.
optimizer_config (Optional[Union[OptimizerConfig, str]], optional): OptimizerConfig object or path to the yaml file.
Defaults to None.
trainer_config (Optional[Union[TrainerConfig, str]], optional): TrainerConfig object or path to the yaml file.
Defaults to None.
experiment_config (Optional[Union[ExperimentConfig, str]], optional): ExperimentConfig object or path to the yaml file.
If Provided configures the experiment tracking. Defaults to None.
model_callable (Optional[Callable], optional): If provided, will override the model callable that will be loaded from the config.
Typically used when providing Custom Models
"""
super().__init__()
self.exp_manager = ExperimentRunManager()
if config is None:
assert (
(data_config is not None)
or (model_config is not None)
or (optimizer_config is not None)
or (trainer_config is not None)
), "If `config` is None, `data_config`, `model_config`, `trainer_config`, and `optimizer_config` cannot be None"
data_config = self._read_parse_config(data_config, DataConfig)
model_config = self._read_parse_config(model_config, ModelConfig)
# # Re-routing to Categorical embedding Model if embed_categorical is true for NODE
# if (
# hasattr(model_config, "_model_name")
# and (model_config._model_name == "NODEModel")
# and (model_config.embed_categorical)
# and ("CategoryEmbedding" not in model_config._model_name)
# ):
# model_config._model_name = (
# "CategoryEmbedding" + model_config._model_name
# )
trainer_config = self._read_parse_config(trainer_config, TrainerConfig)
optimizer_config = self._read_parse_config(
optimizer_config, OptimizerConfig
)
if experiment_config is None:
logger.info("Experiment Tracking is turned off")
self.track_experiment = False
self.config = OmegaConf.merge(
OmegaConf.to_container(data_config),
OmegaConf.to_container(model_config),
OmegaConf.to_container(trainer_config),
OmegaConf.to_container(optimizer_config),
)
else:
experiment_config = self._read_parse_config(
experiment_config, ExperimentConfig
)
self.track_experiment = True
self.config = OmegaConf.merge(
OmegaConf.to_container(data_config),
OmegaConf.to_container(model_config),
OmegaConf.to_container(trainer_config),
OmegaConf.to_container(experiment_config),
OmegaConf.to_container(optimizer_config),
)
else:
self.config = config
if not hasattr(config, "log_target") and (config.log_target is not None):
experiment_config = OmegaConf.structured(experiment_config)
self.track_experiment = True
else:
logger.info("Experiment Tracking is turned off")
self.track_experiment = False
self.name, self.uid = self._get_run_name_uid()
if self.track_experiment:
self._setup_experiment_tracking()
else:
self.logger = None
self.exp_manager = ExperimentRunManager()
if model_callable is None:
self.model_callable = getattr(
getattr(models, self.config._module_src), self.config._model_name
)
self.custom_model = False
else:
self.model_callable = model_callable
self.custom_model = True
self._run_validation()
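A minimal construction sketch, assuming the config classes are imported as in the library's usage examples; the column names and config values below are illustrative, not defaults or recommendations.

from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import CategoryEmbeddingModelConfig

# Hypothetical column names for illustration
data_config = DataConfig(
    target=["target"],
    continuous_cols=["num_feature_1", "num_feature_2"],
    categorical_cols=["cat_feature_1"],
)
trainer_config = TrainerConfig(batch_size=1024, max_epochs=50)
optimizer_config = OptimizerConfig()
model_config = CategoryEmbeddingModelConfig(task="classification")

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)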
data_aware_initialization(self)
Performs data-aware initialization for NODE
Source code in pytorch_tabular/tabular_model.py
def data_aware_initialization(self):
"""Performs data-aware initialization for NODE"""
logger.info("Data Aware Initialization....")
# Need a big batch to initialize properly
alt_loader = self.datamodule.train_dataloader(batch_size=2000)
batch = next(iter(alt_loader))
for k, v in batch.items():
if isinstance(v, list) and (len(v) == 0):
# Skipping empty list
continue
# batch[k] = v.to("cpu" if self.config.gpu == 0 else "cuda")
batch[k] = v.to(self.model.device)
# single forward pass to initialize the ODST
with torch.no_grad():
self.model(batch)
evaluate(self, test)
Evaluates the dataframe using the loss and metrics already set in config
Parameters:

Name | Type | Description | Default |
---|---|---|---|
test | Optional[pandas.core.frame.DataFrame] | The dataframe to be evaluated. If not provided, will try to use the test data provided during fit. If that was also not provided, returns an empty dictionary. | required |

Returns:

Type | Description |
---|---|
Union[dict, list] | The final test result dictionary. |
Source code in pytorch_tabular/tabular_model.py
def evaluate(self, test: Optional[pd.DataFrame]) -> Union[dict, list]:
"""Evaluates the dataframe using the loss and metrics already set in config
Args:
test (Optional[pd.DataFrame]): The dataframe to be evaluated. If not provided, will try to use the
test provided during fit. If that was also not provided will return an empty dictionary
Returns:
Union[dict, list]: The final test result dictionary.
"""
if test is not None:
test_loader = self.datamodule.prepare_inference_dataloader(test)
elif self.test is not None:
test_loader = self.datamodule.test_dataloader()
else:
return {}
result = self.trainer.test(
test_dataloaders=test_loader,
ckpt_path="best" if self.config.checkpoints else None,
)
return result
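A short usage sketch; tabular_model is a fitted TabularModel and test is a hypothetical held-out dataframe that contains the target column, so the configured loss and metrics can be computed.

result = tabular_model.evaluate(test)
print(result)  # typically a list with one dict of test_loss and the configured metrics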
find_learning_rate(self, train, validation=None, test=None, loss=None, metrics=None, optimizer=None, optimizer_params={}, min_lr=1e-08, max_lr=1, num_training=100, mode='exponential', early_stop_threshold=4.0, plot=True)
Enables the user to do a range test of good initial learning rates, to reduce the amount of guesswork in picking a good starting learning rate.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
train | DataFrame | Training Dataframe | required |
validation | Optional[pd.DataFrame] | If provided, will use this dataframe as the validation while training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None. | None |
test | Optional[pandas.core.frame.DataFrame] | If provided, will use as the hold-out data, which you'll be able to check performance on after the model is trained. Defaults to None. | None |
loss | Optional[torch.nn.modules.module.Module] | Custom loss function which is not in the standard pytorch library | None |
metrics | Optional[List[Callable]] | Custom metric functions (Callable) which have the signature metric_fn(y_hat, y) | None |
optimizer | Optional[torch.optim.optimizer.Optimizer] | Custom optimizer which is a drop-in replacement for a standard PyTorch optimizer. This should be the Class and not the initialized object | None |
optimizer_params | Dict | The parameters to initialize the custom optimizer. | {} |
min_lr | float | minimum learning rate to investigate | 1e-08 |
max_lr | float | maximum learning rate to investigate | 1 |
num_training | int | number of learning rates to test | 100 |
mode | str | search strategy, either 'linear' or 'exponential'. If set to 'linear' the learning rate will be searched by linearly increasing after each batch. If set to 'exponential', will increase learning rate exponentially. | 'exponential' |
early_stop_threshold | Optional[float] | threshold for stopping the search. If the loss at any point is larger than early_stop_threshold*best_loss, the search is stopped. To disable, set to None. | 4.0 |
plot | bool | If true, will plot using matplotlib | True |
Source code in pytorch_tabular/tabular_model.py
def find_learning_rate(
self,
train: pd.DataFrame,
validation: Optional[pd.DataFrame] = None,
test: Optional[pd.DataFrame] = None,
loss: Optional[torch.nn.Module] = None,
metrics: Optional[List[Callable]] = None,
optimizer: Optional[torch.optim.Optimizer] = None,
optimizer_params: Dict = {},
min_lr: float = 1e-8,
max_lr: float = 1,
num_training: int = 100,
mode: str = "exponential",
early_stop_threshold: float = 4.0,
plot=True,
) -> None:
"""Enables the user to do a range test of good initial learning rates, to reduce the amount of guesswork in picking a good starting learning rate.
Args:
train (pd.DataFrame): Training Dataframe
valid (Optional[pd.DataFrame], optional): If provided, will use this dataframe as the validation while training.
Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.
test (Optional[pd.DataFrame], optional): If provided, will use as the hold-out data,
which you'll be able to check performance after the model is trained. Defaults to None.
loss (Optional[torch.nn.Module], optional): Custom Loss functions which are not in standard pytorch library
metrics (Optional[List[Callable]], optional): Custom metric functions(Callable) which has the signature metric_fn(y_hat, y)
optimizer (Optional[torch.optim.Optimizer], optional): Custom optimizers which are a drop in replacements for standard PyToch optimizers.
This should be the Class and not the initialized object
optimizer_params (Optional[Dict], optional): The parmeters to initialize the custom optimizer.
min_lr (Optional[float], optional): minimum learning rate to investigate
max_lr (Optional[float], optional): maximum learning rate to investigate
num_training (Optional[int], optional): number of learning rates to test
mode (Optional[str], optional): search strategy, either 'linear' or 'exponential'. If set to
'linear' the learning rate will be searched by linearly increasing
after each batch. If set to 'exponential', will increase learning
rate exponentially.
early_stop_threshold(Optional[float], optional): threshold for stopping the search. If the
loss at any point is larger than early_stop_threshold*best_loss
then the search is stopped. To disable, set to None.
plot(bool, optional): If true, will plot using matplotlib
"""
train_loader, val_loader = self._pre_fit(
train,
validation,
test,
loss,
metrics,
optimizer,
optimizer_params,
target_transform=None,
max_epochs=None,
min_epochs=None,
reset=True,
)
lr_finder = self.trainer.tuner.lr_find(
self.model,
train_loader,
val_loader,
min_lr,
max_lr,
num_training,
mode,
early_stop_threshold,
)
if plot:
fig = lr_finder.plot(suggest=True)
fig.show()
new_lr = lr_finder.suggestion()
# cancelling the model and trainer that was loaded
self.model = None
self.trainer = None
self.datamodule = None
return new_lr, pd.DataFrame(lr_finder.results)
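A sketch of running the range test before fitting; the dataframe names are hypothetical, and writing the suggestion back into config.learning_rate is an assumption about how to adopt it, not a documented requirement.

suggested_lr, results_df = tabular_model.find_learning_rate(
    train=train_df,
    validation=val_df,
    min_lr=1e-6,
    max_lr=1,
    num_training=100,
    plot=False,
)
# model/trainer/datamodule are reset after the search, so a subsequent fit() starts fresh
tabular_model.config.learning_rate = suggested_lr  # assumption: adopt the suggested rate
tabular_model.fit(train=train_df, validation=val_df)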
fit(self, train, validation=None, test=None, loss=None, metrics=None, optimizer=None, optimizer_params={}, train_sampler=None, target_transform=None, max_epochs=None, min_epochs=None, reset=False)
The fit method which takes in the data and triggers the training
Parameters:

Name | Type | Description | Default |
---|---|---|---|
train | DataFrame | Training Dataframe | required |
validation | Optional[pd.DataFrame] | If provided, will use this dataframe as the validation while training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None. | None |
test | Optional[pandas.core.frame.DataFrame] | If provided, will use as the hold-out data, which you'll be able to check performance on after the model is trained. Defaults to None. | None |
loss | Optional[torch.nn.modules.module.Module] | Custom loss function which is not in the standard pytorch library | None |
metrics | Optional[List[Callable]] | Custom metric functions (Callable) which have the signature metric_fn(y_hat, y) and work on torch tensor inputs | None |
optimizer | Optional[torch.optim.optimizer.Optimizer] | Custom optimizer which is a drop-in replacement for a standard PyTorch optimizer. This should be the Class and not the initialized object | None |
optimizer_params | Dict | The parameters to initialize the custom optimizer. | {} |
train_sampler | Optional[torch.utils.data.sampler.Sampler] | Custom PyTorch batch sampler which will be passed to the DataLoaders. Useful for dealing with imbalanced data and other custom batching strategies | None |
target_transform | Union[sklearn.base.TransformerMixin, Tuple] | If provided, applies the transform to the target before modelling and inverses the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func) | None |
max_epochs | Optional[int] | Overwrite maximum number of epochs to be run | None |
min_epochs | Optional[int] | Overwrite minimum number of epochs to be run | None |
reset | bool | Flag to reset the model and train again from scratch | False |
Source code in pytorch_tabular/tabular_model.py
def fit(
self,
train: pd.DataFrame,
validation: Optional[pd.DataFrame] = None,
test: Optional[pd.DataFrame] = None,
loss: Optional[torch.nn.Module] = None,
metrics: Optional[List[Callable]] = None,
optimizer: Optional[torch.optim.Optimizer] = None,
optimizer_params: Dict = {},
train_sampler: Optional[torch.utils.data.Sampler] = None,
target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
max_epochs: Optional[int] = None,
min_epochs: Optional[int] = None,
reset: bool = False,
) -> None:
"""The fit method which takes in the data and triggers the training
Args:
train (pd.DataFrame): Training Dataframe
valid (Optional[pd.DataFrame], optional): If provided, will use this dataframe as the validation while training.
Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.
test (Optional[pd.DataFrame], optional): If provided, will use as the hold-out data,
which you'll be able to check performance after the model is trained. Defaults to None.
loss (Optional[torch.nn.Module], optional): Custom Loss functions which are not in standard pytorch library
metrics (Optional[List[Callable]], optional): Custom metric functions(Callable) which has the
signature metric_fn(y_hat, y) and works on torch tensor inputs
optimizer (Optional[torch.optim.Optimizer], optional): Custom optimizers which are a drop in replacements for standard PyToch optimizers.
This should be the Class and not the initialized object
optimizer_params (Optional[Dict], optional): The parmeters to initialize the custom optimizer.
train_sampler (Optional[torch.utils.data.Sampler], optional): Custom PyTorch batch samplers which will be passed to the DataLoaders. Useful for dealing with imbalanced data and other custom batching strategies
target_transform (Optional[Union[TransformerMixin, Tuple(Callable)]], optional): If provided, applies the transform to the target before modelling
and inverse the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or
a tuple of callables (transform_func, inverse_transform_func)
max_epochs (Optional[int]): Overwrite maximum number of epochs to be run
min_epochs (Optional[int]): Overwrite minimum number of epochs to be run
reset: (bool): Flag to reset the model and train again from scratch
"""
train_loader, val_loader = self._pre_fit(
train,
validation,
test,
loss,
metrics,
optimizer,
optimizer_params,
train_sampler,
target_transform,
max_epochs,
min_epochs,
reset,
)
self.model.train()
if self.config.auto_lr_find and (not self.config.fast_dev_run):
self.trainer.tune(self.model, train_loader, val_loader)
# Parameters in NODE needs to be initialized again
if self.config._model_name in ["CategoryEmbeddingNODEModel", "NODEModel"]:
self.data_aware_initialization()
self.model.train()
self.trainer.fit(self.model, train_loader, val_loader)
logger.info("Training the model completed...")
if self.config.load_best:
self.load_best_model()
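A sketch of fit with a custom loss, a custom metric of signature metric_fn(y_hat, y), and a (transform, inverse_transform) tuple for the target; the dataframe names are hypothetical and the log1p/expm1 pair is just one possible choice.

import numpy as np
import torch

def mean_abs_error(y_hat, y):
    # custom metric operating on torch tensors
    return torch.mean(torch.abs(y_hat - y))

tabular_model.fit(
    train=train_df,
    validation=val_df,
    loss=torch.nn.L1Loss(),
    metrics=[mean_abs_error],
    target_transform=(np.log1p, np.expm1),
    max_epochs=30,
)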
load_best_model(self)
Loads the best model after training is done
Source code in pytorch_tabular/tabular_model.py
def load_best_model(self):
"""Loads the best model after training is done"""
if self.trainer.checkpoint_callback is not None:
logger.info("Loading the best model...")
ckpt_path = self.trainer.checkpoint_callback.best_model_path
logger.debug(f"Model Checkpoint: {ckpt_path}")
ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
self.model.load_state_dict(ckpt["state_dict"])
else:
logger.info(
"No best model available to load. Did you run it more than 1 epoch?..."
)
load_from_checkpoint(dir)
classmethod
Loads a saved model from the directory
Parameters:

Name | Type | Description | Default |
---|---|---|---|
dir | str | The directory where the model was saved, along with the checkpoints | required |

Returns:

Type | Description |
---|---|
TabularModel | The saved TabularModel |
Source code in pytorch_tabular/tabular_model.py
@classmethod
def load_from_checkpoint(cls, dir: str):
"""Loads a saved model from the directory
Args:
dir (str): The directory where the model wa saved, along with the checkpoints
Returns:
TabularModel: The saved TabularModel
"""
config = OmegaConf.load(os.path.join(dir, "config.yml"))
datamodule = joblib.load(os.path.join(dir, "datamodule.sav"))
if (
hasattr(config, "log_target")
and (config.log_target is not None)
and os.path.exists(os.path.join(dir, "exp_logger.sav"))
):
logger = joblib.load(os.path.join(dir, "exp_logger.sav"))
else:
logger = None
if os.path.exists(os.path.join(dir, "callbacks.sav")):
callbacks = joblib.load(os.path.join(dir, "callbacks.sav"))
else:
callbacks = []
if os.path.exists(os.path.join(dir, "custom_model_callable.sav")):
model_callable = joblib.load(os.path.join(dir, "custom_model_callable.sav"))
custom_model = True
else:
model_callable = getattr(
getattr(models, config._module_src), config._model_name
)
custom_model = False
custom_params = joblib.load(os.path.join(dir, "custom_params.sav"))
model_args = {}
if custom_params.get("custom_loss") is not None:
model_args['loss'] = "MSELoss"
if custom_params.get("custom_metrics") is not None:
model_args['metrics'] = ["mean_squared_error"]
model_args['metric_params'] = [{}]
if custom_params.get("custom_optimizer") is not None:
model_args['optimizer'] = "Adam"
if custom_params.get("custom_optimizer_params") is not None:
model_args['optimizer_params'] = {}
# Initializing with default metrics, losses, and optimizers. Will revert once initialized
model = model_callable.load_from_checkpoint(
checkpoint_path=os.path.join(dir, "model.ckpt"),
**model_args
)
# else:
# # Initializing with default values
# model = model_callable.load_from_checkpoint(
# checkpoint_path=os.path.join(dir, "model.ckpt"),
# )
# Updating config with custom parameters for experiment tracking
if custom_params.get("custom_loss") is not None:
model.custom_loss = custom_params["custom_loss"]
if custom_params.get("custom_metrics") is not None:
model.custom_metrics = custom_params["custom_metrics"]
if custom_params.get("custom_optimizer") is not None:
model.custom_optimizer = custom_params["custom_optimizer"]
if custom_params.get("custom_optimizer_params") is not None:
model.custom_optimizer_params = custom_params["custom_optimizer_params"]
model._setup_loss()
model._setup_metrics()
tabular_model = cls(config=config, model_callable=model_callable)
tabular_model.model = model
tabular_model.custom_model = custom_model
tabular_model.datamodule = datamodule
tabular_model.callbacks = callbacks
tabular_model._prepare_trainer()
tabular_model.trainer.model = model
tabular_model.logger = logger
return tabular_model
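A restore sketch; the directory name is hypothetical and should point to a directory created by save_model, and test is a hypothetical dataframe with the training features.

from pytorch_tabular import TabularModel

loaded_model = TabularModel.load_from_checkpoint("saved_models/my_model")
pred_df = loaded_model.predict(test)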
predict(self, test, quantiles=[0.25, 0.5, 0.75], n_samples=100, ret_logits=False)
Uses the trained model to predict on new data and return as a dataframe
Parameters:

Name | Type | Description | Default |
---|---|---|---|
test | DataFrame | The new dataframe with the features defined during training | required |
quantiles | Optional[List] | For probabilistic models like Mixture Density Networks, this specifies the different quantiles to be extracted apart from the central_tendency and added to the dataframe. For other models it is ignored. Defaults to [0.25, 0.5, 0.75] | [0.25, 0.5, 0.75] |
n_samples | Optional[int] | Number of samples to draw from the posterior to estimate the quantiles. Ignored for non-probabilistic models. Defaults to 100 | 100 |
ret_logits | bool | Flag to return raw model outputs/logits except the backbone features along with the dataframe. Defaults to False | False |

Returns:

Type | Description |
---|---|
DataFrame | Returns a dataframe with predictions and features. If classification, it returns probabilities and final prediction |
Source code in pytorch_tabular/tabular_model.py
def predict(
self,
test: pd.DataFrame,
quantiles: Optional[List] = [0.25, 0.5, 0.75],
n_samples: Optional[int] = 100,
ret_logits=False,
) -> pd.DataFrame:
"""Uses the trained model to predict on new data and return as a dataframe
Args:
test (pd.DataFrame): The new dataframe with the features defined during training
quantiles (Optional[List]): For probabilistic models like Mixture Density Networks, this specifies
the different quantiles to be extracted apart from the `central_tendency` and added to the dataframe.
For other models it is ignored. Defaults to [0.25, 0.5, 0.75]
n_samples (Optional[int]): Number of samples to draw from the posterior to estimate the quantiles.
Ignored for non-probabilistic models. Defaults to 100
ret_logits (bool): Flag to return raw model outputs/logits except the backbone features along
with the dataframe. Defaults to False
Returns:
pd.DataFrame: Returns a dataframe with predictions and features.
If classification, it returns probabilities and final prediction
"""
assert all(
[q <= 1 and q >= 0 for q in quantiles]
), "Quantiles should be a decimal between 0 and 1"
self.model.eval()
inference_dataloader = self.datamodule.prepare_inference_dataloader(test)
point_predictions = []
quantile_predictions = []
logits_predictions = defaultdict(list)
is_probabilistic = (
hasattr(self.model.hparams, "_probabilistic")
and self.model.hparams._probabilistic
)
for batch in tqdm(inference_dataloader, desc="Generating Predictions..."):
for k, v in batch.items():
if isinstance(v, list) and (len(v) == 0):
# Skipping empty list
continue
batch[k] = v.to(self.model.device)
if is_probabilistic:
samples, ret_value = self.model.sample(
batch, n_samples, ret_model_output=True
)
y_hat = torch.mean(samples, dim=-1)
quantile_preds = []
for q in quantiles:
quantile_preds.append(
torch.quantile(samples, q=q, dim=-1).unsqueeze(1)
)
else:
y_hat, ret_value = self.model.predict(batch, ret_model_output=True)
if ret_logits:
for k, v in ret_value.items():
# if k == "backbone_features":
# continue
logits_predictions[k].append(v.detach().cpu())
point_predictions.append(y_hat.detach().cpu())
if is_probabilistic:
quantile_predictions.append(
torch.cat(quantile_preds, dim=-1).detach().cpu()
)
point_predictions = torch.cat(point_predictions, dim=0)
if point_predictions.ndim == 1:
point_predictions = point_predictions.unsqueeze(-1)
if is_probabilistic:
quantile_predictions = torch.cat(quantile_predictions, dim=0).unsqueeze(-1)
if quantile_predictions.ndim == 2:
quantile_predictions = quantile_predictions.unsqueeze(-1)
pred_df = test.copy()
if self.config.task == "regression":
point_predictions = point_predictions.numpy()
# Probabilistic Models are only implemented for Regression
if is_probabilistic:
quantile_predictions = quantile_predictions.numpy()
for i, target_col in enumerate(self.config.target):
if self.datamodule.do_target_transform:
if self.config.target[i] in pred_df.columns:
pred_df[
self.config.target[i]
] = self.datamodule.target_transforms[i].inverse_transform(
pred_df[self.config.target[i]].values.reshape(-1, 1)
)
pred_df[
f"{target_col}_prediction"
] = self.datamodule.target_transforms[i].inverse_transform(
point_predictions[:, i].reshape(-1, 1)
)
if is_probabilistic:
for j, q in enumerate(quantiles):
pred_df[
f"{target_col}_q{int(q*100)}"
] = self.datamodule.target_transforms[i].inverse_transform(
quantile_predictions[:, j, i].reshape(-1, 1)
)
else:
pred_df[f"{target_col}_prediction"] = point_predictions[:, i]
if is_probabilistic:
for j, q in enumerate(quantiles):
pred_df[
f"{target_col}_q{int(q*100)}"
] = quantile_predictions[:, j, i].reshape(-1, 1)
elif self.config.task == "classification":
point_predictions = nn.Softmax(dim=-1)(point_predictions).numpy()
for i, class_ in enumerate(self.datamodule.label_encoder.classes_):
pred_df[f"{class_}_probability"] = point_predictions[:, i]
pred_df[f"prediction"] = self.datamodule.label_encoder.inverse_transform(
np.argmax(point_predictions, axis=1)
)
if ret_logits:
for k, v in logits_predictions.items():
v = torch.cat(v, dim=0).numpy()
if v.ndim == 1:
v = v.reshape(-1, 1)
for i in range(v.shape[-1]):
if v.shape[-1] > 1:
pred_df[f"{k}_{i}"] = v[:, i]
else:
pred_df[f"{k}"] = v[:, i]
return pred_df
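A usage sketch; the exact prediction columns depend on the task and target names, as described above.

pred_df = tabular_model.predict(test)
# regression: adds "<target>_prediction" (plus "<target>_q25" etc. for probabilistic models)
# classification: adds "<class>_probability" columns and a "prediction" column
print(pred_df.head())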
save_model(self, dir)
Saves the model and checkpoints in the specified directory
Parameters:

Name | Type | Description | Default |
---|---|---|---|
dir | str | The path to the directory to save the model | required |
Source code in pytorch_tabular/tabular_model.py
def save_model(self, dir: str):
"""Saves the model and checkpoints in the specified directory
Args:
dir (str): The path to the directory to save the model
"""
if os.path.exists(dir) and (os.listdir(dir)):
logger.warning("Directory is not empty. Overwriting the contents.")
for f in os.listdir(dir):
os.remove(os.path.join(dir, f))
os.makedirs(dir, exist_ok=True)
with open(os.path.join(dir, "config.yml"), "w") as fp:
OmegaConf.save(self.config, fp, resolve=True)
joblib.dump(self.datamodule, os.path.join(dir, "datamodule.sav"))
if hasattr(self.config, "log_target") and self.config.log_target is not None:
joblib.dump(self.logger, os.path.join(dir, "exp_logger.sav"))
if hasattr(self, "callbacks"):
joblib.dump(self.callbacks, os.path.join(dir, "callbacks.sav"))
self.trainer.save_checkpoint(os.path.join(dir, "model.ckpt"))
custom_params = {}
custom_params["custom_loss"] = self.model.custom_loss
custom_params["custom_metrics"] = self.model.custom_metrics
custom_params["custom_optimizer"] = self.model.custom_optimizer
custom_params["custom_optimizer_params"] = self.model.custom_optimizer_params
joblib.dump(custom_params, os.path.join(dir, "custom_params.sav"))
if self.custom_model:
joblib.dump(
self.model_callable, os.path.join(dir, "custom_model_callable.sav")
)
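A persistence sketch; the directory name is hypothetical, and the artifacts listed in the comment mirror the files written by the method above.

import os

tabular_model.save_model("saved_models/my_model")
print(os.listdir("saved_models/my_model"))
# expected files include config.yml, datamodule.sav, model.ckpt and custom_params.sav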
pytorch_tabular.tabular_datamodule.TabularDatamodule
__init__(self, train, config, validation=None, test=None, target_transform=None, train_sampler=None)
special
The Pytorch Lightning Datamodule for Tabular Data
Parameters:

Name | Type | Description | Default |
---|---|---|---|
train | DataFrame | The Training Dataframe | required |
config | omegaconf.dictconfig.DictConfig | Merged configuration object from ModelConfig, DataConfig, TrainerConfig, OptimizerConfig & ExperimentConfig | required |
validation | DataFrame | Validation Dataframe. If left empty, we use the validation split from DataConfig to split a random sample as validation. Defaults to None. | None |
test | DataFrame | Holdout DataFrame to check final performance on. Defaults to None. | None |
target_transform | Union[sklearn.base.TransformerMixin, Tuple] | If provided, applies the transform to the target before modelling and inverses the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func) | None |
Source code in pytorch_tabular/tabular_datamodule.py
def __init__(
self,
train: pd.DataFrame,
config: DictConfig,
validation: pd.DataFrame = None,
test: pd.DataFrame = None,
target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
train_sampler: Optional[torch.utils.data.Sampler] = None,
):
"""The Pytorch Lightning Datamodule for Tabular Data
Args:
train (pd.DataFrame): The Training Dataframe
config (DictConfig): Merged configuration object from ModelConfig, DataConfig,
TrainerConfig, OptimizerConfig & ExperimentConfig
validation (pd.DataFrame, optional): Validation Dataframe.
If left empty, we use the validation split from DataConfig to split a random sample as validation.
Defaults to None.
test (pd.DataFrame, optional): Holdout DataFrame to check final performance on.
Defaults to None.
target_transform (Optional[Union[TransformerMixin, Tuple(Callable)]], optional): If provided, applies the transform to the target before modelling
and inverse the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or
a tuple of callables (transform_func, inverse_transform_func)
"""
super().__init__()
self.train = train.copy()
self.validation = validation
if target_transform is not None:
if isinstance(target_transform, Iterable):
target_transform = FunctionTransformer(
func=target_transform[0], inverse_func=target_transform[1]
)
self.do_target_transform = True
else:
self.do_target_transform = False
self.target_transform_template = target_transform
self.test = test if test is None else test.copy()
self.target = config.target
self.batch_size = config.batch_size
self.train_sampler = train_sampler
self.config = config
self._fitted = False
add_datepart(df, field_name, frequency, prefix=None, drop=True)
classmethod
Helper function that adds columns relevant to a date in the column field_name of df.
Source code in pytorch_tabular/tabular_datamodule.py
@classmethod
def add_datepart(
cls,
df: pd.DataFrame,
field_name: str,
frequency: str,
prefix: str = None,
drop: bool = True,
):
"Helper function that adds columns relevant to a date in the column `field_name` of `df`."
field = df[field_name]
prefix = (
re.sub("[Dd]ate$", "", field_name) if prefix is None else prefix
) + "_"
attr = cls.time_features_from_frequency_str(frequency)
added_features = []
for n in attr:
if n == "Week":
continue
df[prefix + n] = getattr(field.dt, n.lower())
added_features.append(prefix + n)
# Pandas removed `dt.week` in v1.1.10
if "Week" in attr:
week = (
field.dt.isocalendar().week
if hasattr(field.dt, "isocalendar")
else field.dt.week
)
df.insert(3, prefix + "Week", week)
added_features.append(prefix + "Week")
# Not adding Elapsed by default. Need to route it through config
# mask = ~field.isna()
# df[prefix + "Elapsed"] = np.where(
# mask, field.values.astype(np.int64) // 10 ** 9, None
# )
# added_features.append(prefix + "Elapsed")
if drop:
df.drop(field_name, axis=1, inplace=True)
# Removing features woth zero variations
for col in added_features:
if len(df[col].unique()) == 1:
df.drop(columns=col, inplace=True)
added_features.remove(col)
return df, added_features
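A small sketch of make_date and add_datepart on a toy dataframe; the column name, prefix, and frequency are illustrative, and constant-valued date features are dropped, so the exact added columns depend on the data.

import pandas as pd
from pytorch_tabular.tabular_datamodule import TabularDatamodule

df = pd.DataFrame(
    {"saledate": pd.date_range("2020-01-01", periods=400, freq="D"), "y": range(400)}
)
df = TabularDatamodule.make_date(df, "saledate")
df, added_features = TabularDatamodule.add_datepart(
    df, "saledate", frequency="D", prefix="sale", drop=True
)
print(added_features)  # e.g. columns such as 'sale_Month', 'sale_Dayofweek', 'sale_Dayofyear'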
do_leave_one_out_encoder(self)
Checks the special condition for NODE where we use a LeaveOneOutEncoder to encode categorical columns
Returns:

Type | Description |
---|---|
bool | Whether a LeaveOneOutEncoder should be used for the categorical columns |
Source code in pytorch_tabular/tabular_datamodule.py
def do_leave_one_out_encoder(self) -> bool:
"""Checks the special condition for NODE where we use a LeaveOneOutEncoder to encode categorical columns
Returns:
bool
"""
return (self.config._model_name == "NODEModel") and (
not self.config.embed_categorical
)
make_date(df, date_field)
classmethod
Make sure df[date_field] is of the right date type.
Source code in pytorch_tabular/tabular_datamodule.py
@classmethod
def make_date(cls, df: pd.DataFrame, date_field: str):
"Make sure `df[date_field]` is of the right date type."
field_dtype = df[date_field].dtype
if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
field_dtype = np.datetime64
if not np.issubdtype(field_dtype, np.datetime64):
df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
return df
prepare_inference_dataloader(self, df)
Function that prepares and loads the new data.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
df | DataFrame | Dataframe with the features and target | required |

Returns:

Type | Description |
---|---|
torch.utils.data.dataloader.DataLoader | The dataloader for the passed in dataframe |
Source code in pytorch_tabular/tabular_datamodule.py
def prepare_inference_dataloader(self, df: pd.DataFrame) -> DataLoader:
"""Function that prepares and loads the new data.
Args:
df (pd.DataFrame): Dataframe with the features and target
Returns:
DataLoader: The dataloader for the passed in dataframe
"""
df = df.copy()
if len(set(self.target) - set(df.columns)) > 0:
if self.config.task == "classification":
df.loc[:, self.target] = np.array(
[self.label_encoder.classes_[0]] * len(df)
)
else:
df.loc[:, self.target] = np.zeros((len(df), len(self.target)))
df, _ = self.preprocess_data(df, stage="inference")
dataset = TabularDataset(
task=self.config.task,
data=df,
categorical_cols=self.config.categorical_cols,
continuous_cols=self.config.continuous_cols,
embed_categorical=(not self.do_leave_one_out_encoder()),
target=self.target
if all([col in df.columns for col in self.target])
else None,
)
return DataLoader(
dataset,
self.batch_size,
shuffle=False,
num_workers=self.config.num_workers,
)
preprocess_data(self, data, stage='inference')
The preprocessing, like Categorical Encoding, Normalization, etc., which any dataframe should undergo before feeding into the dataloader
Parameters:

Name | Type | Description | Default |
---|---|---|---|
data | DataFrame | A dataframe with the features and target | required |
stage | str | Internal parameter. Used to distinguish between fit and inference. Defaults to "inference". | 'inference' |

Returns:

Type | Description |
---|---|
Tuple[pandas.core.frame.DataFrame, list] | Returns the processed dataframe and the added features (list) as a tuple |
Source code in pytorch_tabular/tabular_datamodule.py
def preprocess_data(
self, data: pd.DataFrame, stage: str = "inference"
) -> Tuple[pd.DataFrame, list]:
"""The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe should undergo before feeding into the dataloder
Args:
data (pd.DataFrame): A dataframe with the features and target
stage (str, optional): Internal parameter. Used to distinguisj between fit and inference. Defaults to "inference".
Returns:
tuple[pd.DataFrame, list]: Returns the processed dataframe and the added features(list) as a tuple
"""
logger.info(f"Preprocessing data: Stage: {stage}...")
added_features = None
if self.config.encode_date_columns:
for field_name, freq in self.config.date_columns:
data = self.make_date(data, field_name)
data, added_features = self.add_datepart(
data, field_name, frequency=freq, prefix=None, drop=True
)
# The only features that are added are the date features extracted
# from the date which are categorical in nature
if (added_features is not None) and (stage == "fit"):
logger.debug(
f"Added {added_features} features after encoding the date_columns"
)
self.config.categorical_cols += added_features
self.config.categorical_dim = (
len(self.config.categorical_cols)
if self.config.categorical_cols is not None
else 0
)
# Encoding Categorical Columns
if len(self.config.categorical_cols) > 0:
if stage == "fit":
if self.do_leave_one_out_encoder():
logger.debug("Encoding Categorical Columns using LeavOneOutEncoder")
self.categorical_encoder = ce.LeaveOneOutEncoder(
cols=self.config.categorical_cols, random_state=42
)
# Multi-Target Regression uses the first target to encode the categorical columns
if len(self.config.target) > 1:
logger.warning(
f"Multi-Target Regression: using the first target({self.config.target[0]}) to encode the categorical columns"
)
data = self.categorical_encoder.fit_transform(
data, data[self.config.target[0]]
)
else:
logger.debug("Encoding Categorical Columns using OrdinalEncoder")
self.categorical_encoder = OrdinalEncoder(
cols=self.config.categorical_cols
)
data = self.categorical_encoder.fit_transform(data)
else:
data = self.categorical_encoder.transform(data)
# Transforming Continuous Columns
if (self.config.continuous_feature_transform is not None) and (
len(self.config.continuous_cols) > 0
):
if stage == "fit":
transform = self.CONTINUOUS_TRANSFORMS[
self.config.continuous_feature_transform
]
self.continuous_transform = transform["callable"](**transform["params"])
# TODO implement quantile noise
data.loc[
:, self.config.continuous_cols
] = self.continuous_transform.fit_transform(
data.loc[:, self.config.continuous_cols]
)
else:
data.loc[
:, self.config.continuous_cols
] = self.continuous_transform.transform(
data.loc[:, self.config.continuous_cols]
)
# Normalizing Continuous Columns
if (self.config.normalize_continuous_features) and (
len(self.config.continuous_cols) > 0
):
if stage == "fit":
self.scaler = StandardScaler()
data.loc[:, self.config.continuous_cols] = self.scaler.fit_transform(
data.loc[:, self.config.continuous_cols]
)
else:
data.loc[:, self.config.continuous_cols] = self.scaler.transform(
data.loc[:, self.config.continuous_cols]
)
# Converting target labels to a 0 indexed label
if self.config.task == "classification":
if stage == "fit":
self.label_encoder = LabelEncoder()
data[self.config.target[0]] = self.label_encoder.fit_transform(
data[self.config.target[0]]
)
else:
if self.config.target[0] in data.columns:
data[self.config.target[0]] = self.label_encoder.transform(
data[self.config.target[0]]
)
# Target Transforms
if all([col in data.columns for col in self.config.target]):
if self.do_target_transform:
target_transforms = []
for col in self.config.target:
_target_transform = copy.deepcopy(self.target_transform_template)
data[col] = _target_transform.fit_transform(
data[col].values.reshape(-1, 1)
)
target_transforms.append(_target_transform)
self.target_transforms = target_transforms
return data, added_features
setup(self, stage=None)
Data Operations you want to perform on all GPUs, like train-test split, transformations, etc. This is called before accessing the dataloaders
Parameters:

Name | Type | Description | Default |
---|---|---|---|
stage | Optional[str] | Internal parameter to distinguish between fit and inference. Defaults to None. | None |
Source code in pytorch_tabular/tabular_datamodule.py
def setup(self, stage: Optional[str] = None) -> None:
"""Data Operations you want to perform on all GPUs, like train-test split, transformations, etc.
This is called before accessing the dataloaders
Args:
stage (Optional[str], optional): Internal parameter to distinguish between fit and inference. Defaults to None.
"""
if stage == "fit" or stage is None:
if self.validation is None:
logger.debug(
f"No validation data provided. Using {self.config.validation_split*100}% of train data as validation"
)
val_idx = self.train.sample(
int(self.config.validation_split * len(self.train)), random_state=42
).index
self.validation = self.train[self.train.index.isin(val_idx)]
self.train = self.train[~self.train.index.isin(val_idx)]
else:
self.validation = self.validation.copy()
# Preprocessing Train, Validation
self.train, _ = self.preprocess_data(self.train, stage="fit")
self.validation, _ = self.preprocess_data(
self.validation, stage="inference"
)
if self.test is not None:
self.test, _ = self.preprocess_data(self.test, stage="inference")
# Calculating the categorical dims and embedding dims etc and updating the config
self.update_config()
self._fitted = True
test_dataloader(self)
Function that loads the test set.
Source code in pytorch_tabular/tabular_datamodule.py
def test_dataloader(self) -> DataLoader:
""" Function that loads the validation set. """
if self.test is not None:
dataset = TabularDataset(
task=self.config.task,
data=self.test,
categorical_cols=self.config.categorical_cols,
continuous_cols=self.config.continuous_cols,
embed_categorical=(not self.do_leave_one_out_encoder()),
target=self.target,
)
return DataLoader(
dataset,
self.batch_size,
shuffle=False,
num_workers=self.config.num_workers,
)
time_features_from_frequency_str(freq_str)
classmethod
Returns a list of time features that will be appropriate for the given frequency string.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
freq_str | str | Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc. | required |
Source code in pytorch_tabular/tabular_datamodule.py
@classmethod
def time_features_from_frequency_str(cls, freq_str: str) -> List[str]:
"""
Returns a list of time features that will be appropriate for the given frequency string.
Parameters
----------
freq_str
Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.
"""
features_by_offsets = {
offsets.YearBegin: [],
offsets.YearEnd: [],
offsets.MonthBegin: [
"Month",
"Quarter",
"Is_quarter_end",
"Is_quarter_start",
"Is_year_end",
"Is_year_start",
],
offsets.MonthEnd: [
"Month",
"Quarter",
"Is_quarter_end",
"Is_quarter_start",
"Is_year_end",
"Is_year_start",
],
offsets.Week: [
"Month",
"Quarter",
"Is_quarter_end",
"Is_quarter_start",
"Is_year_end",
"Is_year_start",
"Is_month_start",
"Week",
],
offsets.Day: [
"Month",
"Quarter",
"Is_quarter_end",
"Is_quarter_start",
"Is_year_end",
"Is_year_start",
"Is_month_start",
"Week" "Day",
"Dayofweek",
"Dayofyear",
],
offsets.BusinessDay: [
"Month",
"Quarter",
"Is_quarter_end",
"Is_quarter_start",
"Is_year_end",
"Is_year_start",
"Is_month_start",
"Week" "Day",
"Dayofweek",
"Dayofyear",
],
offsets.Hour: [
"Month",
"Quarter",
"Is_quarter_end",
"Is_quarter_start",
"Is_year_end",
"Is_year_start",
"Is_month_start",
"Week" "Day",
"Dayofweek",
"Dayofyear",
"Hour",
],
offsets.Minute: [
"Month",
"Quarter",
"Is_quarter_end",
"Is_quarter_start",
"Is_year_end",
"Is_year_start",
"Is_month_start",
"Week" "Day",
"Dayofweek",
"Dayofyear",
"Hour",
"Minute",
],
}
offset = to_offset(freq_str)
for offset_type, feature in features_by_offsets.items():
if isinstance(offset, offset_type):
return feature
supported_freq_msg = f"""
Unsupported frequency {freq_str}
The following frequencies are supported:
Y, YS - yearly
alias: A
M, MS - monthly
W - weekly
D - daily
B - business days
H - hourly
T - minutely
alias: min
"""
raise RuntimeError(supported_freq_msg)
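A quick lookup sketch; the expected output for a monthly frequency follows directly from the mapping above.

from pytorch_tabular.tabular_datamodule import TabularDatamodule

print(TabularDatamodule.time_features_from_frequency_str("M"))
# ['Month', 'Quarter', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']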
train_dataloader(self, batch_size=None)
Function that loads the train set.
Source code in pytorch_tabular/tabular_datamodule.py
def train_dataloader(self, batch_size: Optional[int] = None) -> DataLoader:
""" Function that loads the train set. """
dataset = TabularDataset(
task=self.config.task,
data=self.train,
categorical_cols=self.config.categorical_cols,
continuous_cols=self.config.continuous_cols,
embed_categorical=(not self.do_leave_one_out_encoder()),
target=self.target,
)
return DataLoader(
dataset,
batch_size if batch_size is not None else self.batch_size,
shuffle=True if self.train_sampler is None else False,
num_workers=self.config.num_workers,
sampler=self.train_sampler,
)
update_config(self)
Calculates and updates a few key pieces of information in the config object
Exceptions:

Type | Description |
---|---|
NotImplementedError | [description] |
Source code in pytorch_tabular/tabular_datamodule.py
def update_config(self) -> None:
"""Calculates and updates a few key information to the config object
Raises:
NotImplementedError: [description]
"""
if self.config.task == "regression":
self.config.output_dim = len(self.config.target)
elif self.config.task == "classification":
self.config.output_dim = len(self.train[self.config.target[0]].unique())
if not self.do_leave_one_out_encoder():
self.config.categorical_cardinality = [
int(self.train[col].fillna("NA").nunique()) + 1
for col in self.config.categorical_cols
]
if self.config.embedding_dims is None:
self.config.embedding_dims = [
(x, min(50, (x + 1) // 2))
for x in self.config.categorical_cardinality
]
val_dataloader(self)
Function that loads the validation set.
Source code in pytorch_tabular/tabular_datamodule.py
def val_dataloader(self) -> DataLoader:
""" Function that loads the validation set. """
dataset = TabularDataset(
task=self.config.task,
data=self.validation,
categorical_cols=self.config.categorical_cols,
continuous_cols=self.config.continuous_cols,
embed_categorical=(not self.do_leave_one_out_encoder()),
target=self.target,
)
return DataLoader(
dataset, self.batch_size, shuffle=False, num_workers=self.config.num_workers
)
pytorch_tabular.models.category_embedding.category_embedding_model.CategoryEmbeddingModel
forward(self, x)
Same as torch.nn.Module.forward(), however in Lightning you want this to define the operations you want to use for prediction (i.e.: on a server or as a feature extractor). Normally you'd call self() from your training_step method. This makes it easy to write a complex system for training with the outputs you'd want in a prediction setting. You may also find the pytorch_lightning.core.decorators.auto_move_data decorator useful when using the module outside Lightning in a production setting.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
*args | | Whatever you decide to pass into the forward method. | required |
**kwargs | | Keyword arguments are also possible. | required |

Returns:

Type | Description |
---|---|
| Predicted output |
Examples:
.. code-block:: python
# example if we were using this model as a feature extractor
def forward(self, x):
feature_maps = self.convnet(x)
return feature_maps
def training_step(self, batch, batch_idx):
x, y = batch
feature_maps = self(x)
logits = self.classifier(feature_maps)
# ...
return loss
# splitting it this way allows model to be used a feature extractor
model = MyModelAbove()
inputs = server.get_request()
results = model(inputs)
server.write_results(results)
# -------------
# This is in stark contrast to torch.nn.Module where normally you would have this:
def forward(self, batch):
x, y = batch
feature_maps = self.convnet(x)
logits = self.classifier(feature_maps)
return logits
Source code in pytorch_tabular/models/category_embedding/category_embedding_model.py
def forward(self, x: Dict):
x = self.unpack_input(x)
x = self.backbone(x)
y_hat = self.output_layer(x)
if (self.hparams.task == "regression") and (
self.hparams.target_range is not None
):
for i in range(self.hparams.output_dim):
y_min, y_max = self.hparams.target_range[i]
y_hat[:, i] = y_min + nn.Sigmoid()(y_hat[:, i]) * (y_max - y_min)
return {"logits": y_hat, "backbone_features": x}
pytorch_tabular.models.category_embedding.config.CategoryEmbeddingModelConfig
dataclass
CategoryEmbeddingModel configuration
Parameters:

Name | Type | Description | Default |
---|---|---|---|
task | str | Specify whether the problem is regression or classification. Choices are: ['regression', 'classification'] | required |
learning_rate | float | The learning rate of the model | required |
loss | Optional[str] | The loss function to be applied. By default it is MSELoss for regression and CrossEntropyLoss for classification. Unless you are sure what you are doing, leave it at MSELoss or L1Loss for regression and CrossEntropyLoss for classification | required |
metrics | Optional[List[str]] | The list of metrics you need to track during training. The metrics should be one of the metrics implemented in PyTorch Lightning. By default, it is Accuracy for classification and MeanSquaredLogError for regression | required |
metrics_params | Optional[List] | The parameters to be passed to the metrics initialized | required |
target_range | Optional[List] | The range in which we should limit the output variable. Currently ignored for multi-target regression. Typically used for regression problems. If left empty, will not apply any restrictions | required |
layers | str | Hyphen-separated number of layers and units in the classification head. e.g. 32-64-32. | required |
batch_norm_continuous_input | bool | If True, we will normalize the continuous layer by passing it through a BatchNorm layer | required |
activation | str | The activation type in the classification head. The default activations in PyTorch like ReLU, TanH, LeakyReLU, etc. https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity | required |
embedding_dims | Optional[List[int]] | The dimensions of the embedding for each categorical column as a list of tuples (cardinality, embedding_dim). If left empty, will infer using the cardinality of the categorical column using the rule min(50, (x + 1) // 2) | required |
embedding_dropout | float | Probability of an embedding element to be zeroed. | required |
dropout | float | Probability of a classification element to be zeroed. | required |
use_batch_norm | bool | Flag to include a BatchNorm layer after each Linear Layer+DropOut | required |
initialization | str | Initialization scheme for the linear layers. | required |

Exceptions:

Type | Description |
---|---|
NotImplementedError | Raises an error if task is not in ['regression','classification'] |
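A configuration sketch using the documented fields, assuming CategoryEmbeddingModelConfig is importable from pytorch_tabular.models as in the library's examples; the values are illustrative.

from pytorch_tabular.models import CategoryEmbeddingModelConfig

model_config = CategoryEmbeddingModelConfig(
    task="regression",
    layers="64-32",          # two linear layers with 64 and 32 units
    activation="ReLU",
    dropout=0.1,
    use_batch_norm=True,
    learning_rate=1e-3,
)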
pytorch_tabular.models.node.config.NodeConfig
dataclass
Model configuration
Parameters:

Name | Type | Description | Default |
---|---|---|---|
task | str | Specify whether the problem is regression or classification. Choices are: ['regression', 'classification'] | required |
learning_rate | float | The learning rate of the model | required |
loss | Optional[str] | The loss function to be applied. By default it is MSELoss for regression and CrossEntropyLoss for classification. Unless you are sure what you are doing, leave it at MSELoss or L1Loss for regression and CrossEntropyLoss for classification | required |
metrics | Optional[List[str]] | The list of metrics you need to track during training. The metrics should be one of the metrics implemented in PyTorch Lightning. By default, it is Accuracy for classification and MeanSquaredLogError for regression | required |
metrics_params | Optional[List] | The parameters to be passed to the metrics initialized | required |
target_range | Optional[List] | The range in which we should limit the output variable. Currently ignored for multi-target regression. Typically used for regression problems. If left empty, will not apply any restrictions | required |
num_layers | int | Number of Oblivious Decision Tree layers in the dense architecture | required |
num_trees | int | Number of Oblivious Decision Trees in each layer | required |
additional_tree_output_dim | int | The additional output dimensions which are only used to pass through different layers of the architecture. Only the first output_dim outputs will be used for prediction | required |
depth | int | The depth of the individual Oblivious Decision Trees | required |
choice_function | str | Generates a sparse probability distribution to be used as feature weights (aka soft feature selection). Choices are: ['entmax15', 'sparsemax'] | required |
bin_function | str | Generates a sparse probability distribution to be used as tree leaf weights. Choices are: ['entmoid15', 'sparsemoid'] | required |
max_features | Optional[int] | If not None, sets a max limit on the number of features to be carried forward from layer to layer in the dense architecture | required |
input_dropout | float | Dropout to be applied to the inputs between layers of the dense architecture | required |
initialize_response | str | Initializing the response variable in the Oblivious Decision Trees. By default, it is a standard normal distribution. Choices are: ['normal', 'uniform'] | required |
initialize_selection_logits | str | Initializing the feature selector. By default it is a uniform distribution across the features. Choices are: ['uniform', 'normal'] | required |
threshold_init_beta | float | Used in the data-aware initialization of thresholds, where the threshold is initialized randomly (with a beta distribution) to feature values in the first batch. It initializes the threshold to a q-th quantile of data points, where q ~ Beta(threshold_init_beta, threshold_init_beta). If this param is set to 1, initial thresholds will have the same distribution as data points. If greater than 1 (e.g. 10), thresholds will be closer to the median data value. If less than 1 (e.g. 0.1), thresholds will approach min/max data values. | required |
threshold_init_cutoff | float | Used in the data-aware initialization of scales (used in the scaling ODTs). It is initialized in such a way that all the samples in the first batch belong to the linear region of the entmoid/sparsemoid (bin-selectors) and thereby have non-zero gradients. Threshold log-temperatures initializer, in (0, inf). By default (1.0), log-temperatures are initialized so that all bin selectors end up in the linear region of the sparse-sigmoid. The temperatures are then scaled by this parameter. Setting this value > 1.0 will result in some margin between data points and the sparse-sigmoid cutoff value. Setting this value < 1.0 will cause (1 - value) of the data points to end up in the flat sparse-sigmoid region. For instance, threshold_init_cutoff = 0.9 will set 10% of points equal to 0.0 or 1.0. All points will be between (0.5 - 0.5 / threshold_init_cutoff) and (0.5 + 0.5 / threshold_init_cutoff) | required |
embed_categorical | bool | Flag to embed categorical columns using an Embedding Layer. If turned off, the categorical columns are encoded using LeaveOneOutEncoder | required |
embedding_dims | Optional[List[int]] | The dimensions of the embedding for each categorical column as a list of tuples (cardinality, embedding_dim). If left empty, will infer using the cardinality of the categorical column using the rule min(50, (x + 1) // 2) | required |
embedding_dropout | float | Probability of an embedding element to be zeroed. | required |

Exceptions:

Type | Description |
---|---|
NotImplementedError | Raises an error if task is not in ['regression','classification'] |
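A configuration sketch with the documented fields, assuming NodeConfig is exposed under pytorch_tabular.models like the other model configs; the values are illustrative.

from pytorch_tabular.models import NodeConfig

model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=256,
    depth=6,
    choice_function="entmax15",
    bin_function="entmoid15",
    embed_categorical=True,
    learning_rate=1e-3,
)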
pytorch_tabular.models.node.node_model.NODEModel
forward(self, x)
Same as torch.nn.Module.forward(), however in Lightning you want this to define the operations you want to use for prediction (i.e.: on a server or as a feature extractor). Normally you'd call self() from your training_step method. This makes it easy to write a complex system for training with the outputs you'd want in a prediction setting. You may also find the pytorch_lightning.core.decorators.auto_move_data decorator useful when using the module outside Lightning in a production setting.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
*args | | Whatever you decide to pass into the forward method. | required |
**kwargs | | Keyword arguments are also possible. | required |

Returns:

Type | Description |
---|---|
| Predicted output |
Examples:
.. code-block:: python
# example if we were using this model as a feature extractor
def forward(self, x):
feature_maps = self.convnet(x)
return feature_maps
def training_step(self, batch, batch_idx):
x, y = batch
feature_maps = self(x)
logits = self.classifier(feature_maps)
# ...
return loss
# splitting it this way allows model to be used a feature extractor
model = MyModelAbove()
inputs = server.get_request()
results = model(inputs)
server.write_results(results)
# -------------
# This is in stark contrast to torch.nn.Module where normally you would have this:
def forward(self, batch):
x, y = batch
feature_maps = self.convnet(x)
logits = self.classifier(feature_maps)
return logits
Source code in pytorch_tabular/models/node/node_model.py
def forward(self, x: Dict):
    x = self.unpack_input(x)
    if self.hparams.embed_categorical:
        if self.hparams.embedding_dropout != 0 and self.embedding_cat_dim != 0:
            x = self.embedding_dropout(x)
    x = self.backbone(x)
    y_hat = self.output_response(x)
    if (self.hparams.task == "regression") and (
        self.hparams.target_range is not None
    ):
        for i in range(self.hparams.output_dim):
            y_min, y_max = self.hparams.target_range[i]
            y_hat[:, i] = y_min + nn.Sigmoid()(y_hat[:, i]) * (y_max - y_min)
    return {"logits": y_hat, "backbone_features": x}
pytorch_tabular.models.tabnet.tabnet_model.TabNetModel
forward(self, x)
Same as :meth:`torch.nn.Module.forward()`, however in Lightning you want this to define the operations you want to use for prediction (i.e. on a server or as a feature extractor). Normally you'd call `self()` from your :meth:`training_step` method. This makes it easy to write a complex system for training with the outputs you'd want in a prediction setting. You may also find the :func:`~pytorch_lightning.core.decorators.auto_move_data` decorator useful when using the module outside Lightning in a production setting.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
*args | | Whatever you decide to pass into the forward method. | required |
**kwargs | | Keyword arguments are also possible. | required |
Returns:
Type | Description |
---|---|
 | Predicted output |
Examples:
.. code-block:: python
    # example if we were using this model as a feature extractor
    def forward(self, x):
        feature_maps = self.convnet(x)
        return feature_maps

    def training_step(self, batch, batch_idx):
        x, y = batch
        feature_maps = self(x)
        logits = self.classifier(feature_maps)
        # ...
        return loss

    # splitting it this way allows the model to be used as a feature extractor
    model = MyModelAbove()
    inputs = server.get_request()
    results = model(inputs)
    server.write_results(results)

    # -------------
    # This is in stark contrast to torch.nn.Module where normally you would have this:
    def forward(self, batch):
        x, y = batch
        feature_maps = self.convnet(x)
        logits = self.classifier(feature_maps)
        return logits
Source code in pytorch_tabular/models/tabnet/tabnet_model.py
def forward(self, x: Dict):
    # unpacking into a tuple
    x = self.unpack_input(x)
    # Returns output
    x = self.backbone(x)
    if (self.hparams.task == "regression") and (
        self.hparams.target_range is not None
    ):
        for i in range(self.hparams.output_dim):
            y_min, y_max = self.hparams.target_range[i]
            x[:, i] = y_min + nn.Sigmoid()(x[:, i]) * (y_max - y_min)
    return {"logits": x}  # No easy way to access the raw features in TabNet
pytorch_tabular.models.tabnet.config.TabNetModelConfig
dataclass
Model configuration
Parameters:
Name | Type | Description | Default |
---|---|---|---|
task | str | Specify whether the problem is regression or classification. Choices are: regression, classification | required |
learning_rate | float | The learning rate of the model | required |
loss | Optional[str] | The loss function to be applied. | required |
metrics | Optional[List[str]] | The list of metrics you need to track during training. | required |
metrics_params | Optional[List] | The parameters to be passed to the metrics when they are initialized | required |
target_range | Optional[List] | The range in which we should limit the output variable. Currently ignored for multi-target regression | required |
n_d | int | Dimension of the prediction layer (usually between 4 and 64) | required |
n_a | int | Dimension of the attention layer (usually between 4 and 64) | required |
n_steps | int | Number of successive steps in the network (usually between 3 and 10) | required |
gamma | float | Float above 1, scaling factor for attention updates (usually between 1.0 and 2.0) | required |
embedding_dims | Optional[List[int]] | The dimensions of the embedding for each categorical column as a list of tuples (cardinality, embedding_dim) | required |
n_independent | int | Number of independent GLU layers in each GLU block (default 2) | required |
n_shared | int | Number of shared GLU layers in each GLU block (default 2) | required |
virtual_batch_size | int | Batch size for Ghost Batch Normalization | required |
mask_type | str | Either 'sparsemax' or 'entmax': the masking function to use. Choices are: sparsemax, entmax | required |
Exceptions:
Type | Description |
---|---|
NotImplementedError | Raises an error if task is not in ['regression','classification'] |
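A minimal usage sketch for this config is shown below. The `TabNetModelConfig` import path is assumed, and the hyperparameter values are illustrative rather than recommendations:

```python
from pytorch_tabular.models import TabNetModelConfig  # assumed import path

tabnet_config = TabNetModelConfig(
    task="regression",
    n_d=16,                  # prediction-layer width
    n_a=16,                  # attention-layer width
    n_steps=5,               # number of successive steps in the network
    gamma=1.5,               # scaling factor for attention updates
    virtual_batch_size=128,  # Ghost Batch Normalization batch size
    mask_type="entmax",      # or "sparsemax"
)
```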
pytorch_tabular.config.config.TrainerConfig
dataclass
Trainer configuration
Parameters:
Name | Type | Description | Default |
---|---|---|---|
batch_size | int | Number of samples in each batch of training | required |
fast_dev_run | bool | Quick debug run: runs a single batch through training and validation to catch bugs | required |
max_epochs | int | Maximum number of epochs to be run | required |
min_epochs | int | Minimum number of epochs to be run | required |
gpus | int | The index of the GPU to be used. If | required |
accumulate_grad_batches | int | Accumulates grads every k batches or as set up in the dict. Trainer also calls optimizer.step() for the last indivisible step number. | required |
auto_lr_find | bool | Runs a learning rate finder algorithm (see this paper) when calling trainer.tune(), to find the optimal initial learning rate. | required |
check_val_every_n_epoch | int | Check val every n train epochs. | required |
gradient_clip_val | float | Gradient clipping value | required |
overfit_batches | float | Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. If the training dataloaders have shuffle=True, Lightning will automatically disable it. Useful for quickly debugging or trying to overfit on purpose. | required |
profiler | Optional[str] | To profile individual steps during training and assist in identifying bottlenecks. Choices are: None, 'simple', 'advanced' | required |
early_stopping | str | The loss/metric that needs to be monitored for early stopping. If None, there will be no early stopping | required |
early_stopping_min_delta | float | The minimum delta in the loss/metric which qualifies as an improvement for early stopping | required |
early_stopping_mode | str | The direction in which the loss/metric should be optimized. Choices are | required |
early_stopping_patience | int | The number of epochs to wait without further improvement in the loss/metric | required |
checkpoints | str | The loss/metric that needs to be monitored for checkpoints. If None, there will be no checkpoints | required |
checkpoints_path | str | The path where the saved models will be | required |
checkpoints_name | Optional[str] | The name under which the models will be saved. If left blank, first it will look for | required |
checkpoints_mode | str | The direction in which the loss/metric should be optimized | required |
checkpoints_save_top_k | int | The number of best models to save | required |
load_best | bool | Flag to load the best model saved during training | required |
track_grad_norm | int | Track and log gradient norms in the logger. -1 (default) means no tracking; 1 for the L1 norm, 2 for the L2 norm, etc. | required |
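For example, a TrainerConfig that trains for up to 100 epochs with early stopping and checkpointing might look like the following sketch. The values are illustrative, and "valid_loss" is assumed here to be the name of the monitored validation metric:

```python
from pytorch_tabular.config import TrainerConfig

trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=100,
    early_stopping="valid_loss",   # metric monitored for early stopping
    early_stopping_patience=3,     # epochs to wait without improvement
    checkpoints="valid_loss",      # metric monitored for checkpointing
    checkpoints_save_top_k=1,      # keep only the single best model
    load_best=True,                # reload the best checkpoint after training
)
```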
pytorch_tabular.config.config.DataConfig
dataclass
Data configuration.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
target | List[str] | A list of strings with the names of the target column(s) | required |
continuous_cols | List[str] | Column names of the numeric fields. Defaults to [] | required |
categorical_cols | List | Column names of the categorical fields to treat differently. Defaults to [] | required |
date_columns | List | (Column name, Freq) tuples of the date fields. E.g. a field named introduction_date with a monthly frequency should have an entry ('intro_date', 'M') | required |
encode_date_columns | bool | Whether or not to encode the variables derived from the dates | required |
validation_split | Optional[float] | Percentage of training rows to keep aside as validation. Used only if validation data is not given separately | required |
continuous_feature_transform | Optional[str] | Whether or not to transform the features before modelling. By default it is turned off. Choices are: None, "yeo-johnson", "box-cox", "quantile_normal", "quantile_uniform" | required |
normalize_continuous_features | bool | Flag to normalize the input features (continuous) | required |
quantile_noise | int | NOT IMPLEMENTED. If specified, fits QuantileTransformer on data with added Gaussian noise with std = :quantile_noise: * data.std; this will cause discrete values to be more separable. Please note that this transformation does NOT apply Gaussian noise to the resulting data; the noise is only applied for QuantileTransformer | required |
num_workers | Optional[int] | The number of workers used for data loading. For Windows, always set to 0 | required |
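Below is a short sketch of a DataConfig for a dataset with one target, two numeric columns, and one categorical column. The column names are made up purely for illustration:

```python
from pytorch_tabular.config import DataConfig

data_config = DataConfig(
    target=["price"],                  # hypothetical target column
    continuous_cols=["area", "age"],   # hypothetical numeric columns
    categorical_cols=["city"],         # hypothetical categorical column
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True,
    num_workers=0,                     # keep 0 on Windows
)
```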
pytorch_tabular.config.config.ModelConfig
dataclass
Base Model configuration
Parameters:
Name | Type | Description | Default |
---|---|---|---|
task | str | Specify whether the problem is regression or classification. Choices are: regression, classification | required |
learning_rate | float | The learning rate of the model | required |
loss | Optional[str] | The loss function to be applied. By default it is MSELoss for regression and CrossEntropyLoss for classification. Unless you are sure of what you are doing, leave it at MSELoss or L1Loss for regression and CrossEntropyLoss for classification | required |
metrics | Optional[List[str]] | The list of metrics you need to track during training. The metrics should be among those implemented in PyTorch Lightning. By default, it is accuracy for classification and mean_squared_error for regression | required |
metrics_params | Optional[List] | The parameters to be passed to the metrics function | required |
target_range | Optional[List] | The range in which we should limit the output variable. Currently ignored for multi-target regression. Typically used for regression problems. If left empty, will not apply any restrictions | required |
Exceptions:
Type | Description |
---|---|
NotImplementedError | Raises an error if task is not regression or classification |
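ModelConfig itself is a base class; in practice you pass one of its subclasses, and the fields above are common to all of them. A sketch using `CategoryEmbeddingModelConfig` as an assumed concrete subclass, with illustrative values:

```python
from pytorch_tabular.models import CategoryEmbeddingModelConfig  # assumed concrete subclass

model_config = CategoryEmbeddingModelConfig(
    task="regression",
    learning_rate=1e-3,
    target_range=[(0.0, 100.0)],      # clamp the single regression target to (0, 100)
    metrics=["mean_squared_error"],   # default regression metric per the table above
)
```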
pytorch_tabular.config.config.ExperimentConfig
dataclass
Experiment configuration. Experiment Tracking with WandB and Tensorboard
Parameters:
Name | Type | Description | Default |
---|---|---|---|
project_name | str | The name of the project under which all runs will be logged. For Tensorboard this defines the folder under which the logs will be saved, and for W&B it defines the project name. | required |
run_name | Optional[str] | The name of the run; a specific identifier to recognize the run. If left blank, will be assigned an auto-generated name | required |
exp_watch | Optional[str] | The level of logging required. Can be | required |
log_target | str | Determines where logging happens - Tensorboard or W&B. Choices are: wandb, tensorboard | required |
log_logits | bool | Turn this on to log the logits as a histogram in W&B | required |
exp_log_freq | int | Step count between logging of gradients and parameters. | required |
_exp_version_manager | str | The location of the yaml file which manages versions of experiments | required |
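A sketch of an ExperimentConfig that logs to W&B; the project and run names are made up for illustration:

```python
from pytorch_tabular.config import ExperimentConfig

experiment_config = ExperimentConfig(
    project_name="tabular_experiments",  # hypothetical W&B project / Tensorboard folder
    run_name="baseline_run",             # optional; auto-generated if left blank
    log_target="wandb",                  # or "tensorboard"
    log_logits=True,                     # log logits as a histogram in W&B
    exp_log_freq=100,                    # step count between gradient/parameter logging
)
```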
pytorch_tabular.config.config.OptimizerConfig
dataclass
Optimizer and Learning Rate Scheduler configuration.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
optimizer | str | Any of the standard optimizers from torch.optim. Defaults to | required |
optimizer_params | dict | The parameters for the optimizer. If left blank, will use default parameters. | required |
lr_scheduler | Optional[str] | The name of the LearningRateScheduler to use, if any, from torch.optim.lr_scheduler. If None, will not use any scheduler. Defaults to | required |
lr_scheduler_params | Optional[dict] | The parameters for the LearningRateScheduler. If left blank, will use default parameters. | required |
lr_scheduler_monitor_metric | Optional[str] | Used with | required |
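A sketch of an OptimizerConfig pairing a standard torch.optim optimizer with a ReduceLROnPlateau scheduler; the parameter values are illustrative, and "valid_loss" is assumed to be the monitored metric name:

```python
from pytorch_tabular.config import OptimizerConfig

optimizer_config = OptimizerConfig(
    optimizer="AdamW",                         # any optimizer class name from torch.optim
    optimizer_params={"weight_decay": 1e-5},
    lr_scheduler="ReduceLROnPlateau",          # any scheduler from torch.optim.lr_scheduler
    lr_scheduler_params={"patience": 3},
    lr_scheduler_monitor_metric="valid_loss",  # metric the scheduler watches
)
```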
pytorch_tabular.config.config.ExperimentRunManager
__init__(self, exp_version_manager='.tmp/exp_version_manager.yml')
special
This manages the versions of the experiments based on the name. It is a simple dictionary (yaml) based lookup. Its primary purpose is to avoid overwriting saved models when running training without changing the experiment name.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
exp_version_manager | str | The path of the yml file which acts as version control. | '.tmp/exp_version_manager.yml' |
Source code in pytorch_tabular/config/config.py
def __init__(
    self,
    exp_version_manager: str = ".tmp/exp_version_manager.yml",
) -> None:
    """The manages the versions of the experiments based on the name. It is a simple dictionary(yaml) based lookup.
    Primary purpose is to avoid overwriting of saved models while runing the training without changing the experiment name.
    Args:
        exp_version_manager (str, optional): The path of the yml file which acts as version control.
            Defaults to ".tmp/exp_version_manager.yml".
    """
    super().__init__()
    self._exp_version_manager = exp_version_manager
    if os.path.exists(exp_version_manager):
        self.exp_version_manager = OmegaConf.load(exp_version_manager)
    else:
        self.exp_version_manager = OmegaConf.create({})
        os.makedirs(".tmp", exist_ok=True)
        with open(self._exp_version_manager, "w") as file:
            OmegaConf.save(config=self.exp_version_manager, f=file)