API

pytorch_tabular.tabular_model.TabularModel

__init__(self, config=None, data_config=None, model_config=None, optimizer_config=None, trainer_config=None, experiment_config=None, model_callable=None) special

The core model which orchestrates everything from initializing the datamodule, the model, trainer, etc.

Parameters:

Name Type Description Default
config Optional[omegaconf.dictconfig.DictConfig]

Single OmegaConf DictConfig object or the path to the yaml file holding all the config parameters. Defaults to None.

None
data_config Union[pytorch_tabular.config.config.DataConfig, str]

DataConfig object or path to the yaml file. Defaults to None.

None
model_config Union[pytorch_tabular.config.config.ModelConfig, str]

A subclass of ModelConfig or path to the yaml file. Determines which model to run from the type of config. Defaults to None.

None
optimizer_config Union[pytorch_tabular.config.config.OptimizerConfig, str]

OptimizerConfig object or path to the yaml file. Defaults to None.

None
trainer_config Union[pytorch_tabular.config.config.TrainerConfig, str]

TrainerConfig object or path to the yaml file. Defaults to None.

None
experiment_config Union[pytorch_tabular.config.config.ExperimentConfig, str]

ExperimentConfig object or path to the yaml file. If provided, configures experiment tracking. Defaults to None.

None
model_callable Optional[Callable]

If provided, will override the model callable that will be loaded from the config. Typically used when providing custom models.

None
Source code in pytorch_tabular/tabular_model.py
def __init__(
    self,
    config: Optional[DictConfig] = None,
    data_config: Optional[Union[DataConfig, str]] = None,
    model_config: Optional[Union[ModelConfig, str]] = None,
    optimizer_config: Optional[Union[OptimizerConfig, str]] = None,
    trainer_config: Optional[Union[TrainerConfig, str]] = None,
    experiment_config: Optional[Union[ExperimentConfig, str]] = None,
    model_callable: Optional[Callable] = None,
) -> None:
    """The core model which orchestrates everything from initializing the datamodule, the model, trainer, etc.

    Args:
        config (Optional[Union[DictConfig, str]], optional): Single OmegaConf DictConfig object or
            the path to the yaml file holding all the config parameters. Defaults to None.

        data_config (Optional[Union[DataConfig, str]], optional): DataConfig object or path to the yaml file. Defaults to None.

        model_config (Optional[Union[ModelConfig, str]], optional): A subclass of ModelConfig or path to the yaml file.
            Determines which model to run from the type of config. Defaults to None.

        optimizer_config (Optional[Union[OptimizerConfig, str]], optional): OptimizerConfig object or path to the yaml file.
            Defaults to None.

        trainer_config (Optional[Union[TrainerConfig, str]], optional): TrainerConfig object or path to the yaml file.
            Defaults to None.

        experiment_config (Optional[Union[ExperimentConfig, str]], optional): ExperimentConfig object or path to the yaml file.
            If provided, configures experiment tracking. Defaults to None.

        model_callable (Optional[Callable], optional): If provided, will override the model callable that will be loaded from the config.
            Typically used when providing Custom Models
    """
    super().__init__()
    self.exp_manager = ExperimentRunManager()
    if config is None:
        assert (
            (data_config is not None)
            or (model_config is not None)
            or (optimizer_config is not None)
            or (trainer_config is not None)
        ), "If `config` is None, `data_config`, `model_config`, `trainer_config`, and `optimizer_config` cannot be None"
        data_config = self._read_parse_config(data_config, DataConfig)
        model_config = self._read_parse_config(model_config, ModelConfig)
        trainer_config = self._read_parse_config(trainer_config, TrainerConfig)
        optimizer_config = self._read_parse_config(
            optimizer_config, OptimizerConfig
        )
        if experiment_config is None:
            logger.info("Experiment Tracking is turned off")
            self.track_experiment = False
            self.config = OmegaConf.merge(
                OmegaConf.to_container(data_config),
                OmegaConf.to_container(model_config),
                OmegaConf.to_container(trainer_config),
                OmegaConf.to_container(optimizer_config),
            )
        else:
            experiment_config = self._read_parse_config(
                experiment_config, ExperimentConfig
            )
            self.track_experiment = True
            self.config = OmegaConf.merge(
                OmegaConf.to_container(data_config),
                OmegaConf.to_container(model_config),
                OmegaConf.to_container(trainer_config),
                OmegaConf.to_container(experiment_config),
                OmegaConf.to_container(optimizer_config),
            )
    else:
        self.config = config
        if hasattr(config, "log_target") and (config.log_target is not None):
            # experiment_config = OmegaConf.structured(experiment_config)
            self.track_experiment = True
        else:
            logger.info("Experiment Tracking is turned off")
            self.track_experiment = False

    self.name, self.uid = self._get_run_name_uid()
    if self.track_experiment:
        self._setup_experiment_tracking()
    else:
        self.logger = None

    self.exp_manager = ExperimentRunManager()
    if model_callable is None:
        self.model_callable = getattr(
            getattr(models, self.config._module_src), self.config._model_name
        )
        self.custom_model = False
    else:
        self.model_callable = model_callable
        self.custom_model = True
    self._run_validation()
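
A minimal construction sketch using the individual config objects documented above (the import paths follow the usual pytorch_tabular layout; the column names "num_col", "cat_col" and "target" are purely illustrative):

# Assembling a TabularModel from config objects (illustrative column names).
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import CategoryEmbeddingModelConfig

data_config = DataConfig(
    target=["target"],
    continuous_cols=["num_col"],
    categorical_cols=["cat_col"],
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=CategoryEmbeddingModelConfig(task="regression"),
    optimizer_config=OptimizerConfig(),
    trainer_config=TrainerConfig(max_epochs=10),
)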

evaluate(self, test)

Evaluates the dataframe using the loss and metrics already set in config

Parameters:

Name Type Description Default
test Optional[pandas.core.frame.DataFrame]

The dataframe to be evaluated. If not provided, will try to use the test data provided during fit. If that was also not provided, returns an empty dictionary.

required

Returns:

Type Description
Union[dict, list]

Union[dict, list]: The final test result dictionary.

Source code in pytorch_tabular/tabular_model.py
def evaluate(self, test: Optional[pd.DataFrame]) -> Union[dict, list]:
    """Evaluates the dataframe using the loss and metrics already set in config

    Args:
        test (Optional[pd.DataFrame]): The dataframe to be evaluated. If not provided, will try to use the
            test provided during fit. If that was also not provided will return an empty dictionary

    Returns:
        Union[dict, list]: The final test result dictionary.
    """
    if test is not None:
        test_loader = self.datamodule.prepare_inference_dataloader(test)
    elif self.test is not None:
        test_loader = self.datamodule.test_dataloader()
    else:
        return {}
    result = self.trainer.test(
        test_dataloaders=test_loader,
        ckpt_path="best" if self.config.checkpoints else None,
    )
    return result
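
A usage sketch (test_df is a hypothetical dataframe with the same columns as the training data; tabular_model is assumed to be already fitted):

# Score a hold-out dataframe with the loss and metrics configured at training time.
result = tabular_model.evaluate(test_df)
print(result)
# Passing None falls back to the test data given to fit(), or returns an empty dict.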

find_learning_rate(self, train, validation=None, test=None, loss=None, metrics=None, optimizer=None, optimizer_params={}, min_lr=1e-08, max_lr=1, num_training=100, mode='exponential', early_stop_threshold=4.0, plot=True)

Enables the user to do a range test of good initial learning rates, to reduce the amount of guesswork in picking a good starting learning rate.

Parameters:

Name Type Description Default
train DataFrame

Training Dataframe

required
validation Optional[pandas.core.frame.DataFrame]

If provided, will use this dataframe as the validation while training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.

None
test Optional[pandas.core.frame.DataFrame]

If provided, will be used as hold-out data, on which you can check performance after the model is trained. Defaults to None.

None
loss Optional[torch.nn.modules.module.Module]

Custom Loss functions which are not in standard pytorch library

None
metrics Optional[List[Callable]]

Custom metric functions(Callable) which has the signature metric_fn(y_hat, y)

None
optimizer Optional[torch.optim.optimizer.Optimizer]

Custom optimizers which are drop-in replacements for standard PyTorch optimizers. This should be the Class and not the initialized object.

None
optimizer_params Dict

The parameters to initialize the custom optimizer.

{}
min_lr float

minimum learning rate to investigate

1e-08
max_lr float

maximum learning rate to investigate

1
num_training int

number of learning rates to test

100
mode str

search strategy, either 'linear' or 'exponential'. If set to 'linear' the learning rate will be searched by linearly increasing after each batch. If set to 'exponential', will increase learning rate exponentially.

'exponential'
early_stop_threshold Optional[float]

Threshold for stopping the search. If the loss at any point is larger than early_stop_threshold*best_loss, the search is stopped. To disable, set to None.

4.0
plot bool

If True, will plot the results using matplotlib.

True
Source code in pytorch_tabular/tabular_model.py
def find_learning_rate(
    self,
    train: pd.DataFrame,
    validation: Optional[pd.DataFrame] = None,
    test: Optional[pd.DataFrame] = None,
    loss: Optional[torch.nn.Module] = None,
    metrics: Optional[List[Callable]] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    optimizer_params: Dict = {},
    min_lr: float = 1e-8,
    max_lr: float = 1,
    num_training: int = 100,
    mode: str = "exponential",
    early_stop_threshold: float = 4.0,
    plot=True,
) -> None:
    """Enables the user to do a range test of good initial learning rates, to reduce the amount of guesswork in picking a good starting learning rate.

    Args:
        train (pd.DataFrame): Training Dataframe

        validation (Optional[pd.DataFrame], optional): If provided, will use this dataframe as the validation while training.
            Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.

        test (Optional[pd.DataFrame], optional): If provided, will use as the hold-out data,
            which you'll be able to check performance after the model is trained. Defaults to None.

        loss (Optional[torch.nn.Module], optional): Custom Loss functions which are not in standard pytorch library

        metrics (Optional[List[Callable]], optional): Custom metric functions(Callable) which has the signature metric_fn(y_hat, y)

        optimizer (Optional[torch.optim.Optimizer], optional): Custom optimizers which are drop-in replacements for standard PyTorch optimizers.
            This should be the Class and not the initialized object

        optimizer_params (Optional[Dict], optional): The parameters to initialize the custom optimizer.

        min_lr (Optional[float], optional): minimum learning rate to investigate

        max_lr (Optional[float], optional): maximum learning rate to investigate

        num_training (Optional[int], optional): number of learning rates to test

        mode (Optional[str], optional): search strategy, either 'linear' or 'exponential'. If set to
            'linear' the learning rate will be searched by linearly increasing
            after each batch. If set to 'exponential', will increase learning
            rate exponentially.

        early_stop_threshold(Optional[float], optional): threshold for stopping the search. If the
            loss at any point is larger than early_stop_threshold*best_loss
            then the search is stopped. To disable, set to None.

        plot(bool, optional): If true, will plot using matplotlib
    """

    train_loader, val_loader = self._pre_fit(
        train,
        validation,
        test,
        loss,
        metrics,
        optimizer,
        optimizer_params,
        target_transform=None,
        max_epochs=None,
        min_epochs=None,
        reset=True,
    )
    lr_finder = self.trainer.tuner.lr_find(
        self.model,
        train_loader,
        val_loader,
        min_lr,
        max_lr,
        num_training,
        mode,
        early_stop_threshold,
    )
    if plot:
        fig = lr_finder.plot(suggest=True)
        fig.show()
    new_lr = lr_finder.suggestion()
    # cancelling the model and trainer that was loaded
    self.model = None
    self.trainer = None
    self.datamodule = None
    return new_lr, pd.DataFrame(lr_finder.results)
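
A sketch of the range test (train_df and valid_df are hypothetical dataframes). The model, trainer and datamodule are reset afterwards, so fit() still has to be called separately:

# Sweep learning rates between min_lr and max_lr and get a suggestion plus the raw results.
suggested_lr, lr_results = tabular_model.find_learning_rate(
    train=train_df,
    validation=valid_df,
    min_lr=1e-6,
    max_lr=1,
    num_training=100,
    mode="exponential",
    plot=False,
)
print(f"Suggested learning rate: {suggested_lr}")
# lr_results is a DataFrame of the learning rates and losses explored during the sweep.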

fit(self, train, validation=None, test=None, loss=None, metrics=None, optimizer=None, optimizer_params={}, train_sampler=None, target_transform=None, max_epochs=None, min_epochs=None, reset=False, seed=None)

The fit method which takes in the data and triggers the training

Parameters:

Name Type Description Default
train DataFrame

Training Dataframe

required
validation Optional[pandas.core.frame.DataFrame]

If provided, will use this dataframe as the validation while training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.

None
test Optional[pandas.core.frame.DataFrame]

If provided, will be used as hold-out data, on which you can check performance after the model is trained. Defaults to None.

None
loss Optional[torch.nn.modules.module.Module]

Custom Loss functions which are not in standard pytorch library

None
metrics Optional[List[Callable]]

Custom metric functions(Callable) which has the signature metric_fn(y_hat, y) and works on torch tensor inputs

None
optimizer Optional[torch.optim.optimizer.Optimizer]

Custom optimizers which are drop-in replacements for standard PyTorch optimizers. This should be the Class and not the initialized object.

None
optimizer_params Dict

The parameters to initialize the custom optimizer.

{}
train_sampler Optional[torch.utils.data.sampler.Sampler]

Custom PyTorch batch samplers which will be passed to the DataLoaders. Useful for dealing with imbalanced data and other custom batching strategies

None
target_transform Union[sklearn.base.TransformerMixin, Tuple]

If provided, applies the transform to the target before modelling and inverse the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func)

None
max_epochs Optional[int]

Overwrite maximum number of epochs to be run

None
min_epochs Optional[int]

Overwrite minimum number of epochs to be run

None
reset bool

Flag to reset the model and train again from scratch

False
seed Optional[int]

Used to override the default seed set as part of the ModelConfig

None
Source code in pytorch_tabular/tabular_model.py
def fit(
    self,
    train: pd.DataFrame,
    validation: Optional[pd.DataFrame] = None,
    test: Optional[pd.DataFrame] = None,
    loss: Optional[torch.nn.Module] = None,
    metrics: Optional[List[Callable]] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    optimizer_params: Dict = {},
    train_sampler: Optional[torch.utils.data.Sampler] = None,
    target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
    max_epochs: Optional[int] = None,
    min_epochs: Optional[int] = None,
    reset: bool = False,
    seed: Optional[int] = None,
) -> None:
    """The fit method which takes in the data and triggers the training

    Args:
        train (pd.DataFrame): Training Dataframe

        validation (Optional[pd.DataFrame], optional): If provided, will use this dataframe as the validation while training.
            Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.

        test (Optional[pd.DataFrame], optional): If provided, will use as the hold-out data,
            which you'll be able to check performance after the model is trained. Defaults to None.

        loss (Optional[torch.nn.Module], optional): Custom Loss functions which are not in standard pytorch library

        metrics (Optional[List[Callable]], optional): Custom metric functions(Callable) which has the
            signature metric_fn(y_hat, y) and works on torch tensor inputs

        optimizer (Optional[torch.optim.Optimizer], optional): Custom optimizers which are drop-in replacements for standard PyTorch optimizers.
            This should be the Class and not the initialized object

        optimizer_params (Optional[Dict], optional): The parameters to initialize the custom optimizer.

        train_sampler (Optional[torch.utils.data.Sampler], optional): Custom PyTorch batch samplers which will be passed to the DataLoaders. Useful for dealing with imbalanced data and other custom batching strategies

        target_transform (Optional[Union[TransformerMixin, Tuple(Callable)]], optional): If provided, applies the transform to the target before modelling
            and inverse the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or
            a tuple of callables (transform_func, inverse_transform_func)

        max_epochs (Optional[int]): Overwrite maximum number of epochs to be run

        min_epochs (Optional[int]): Overwrite minimum number of epochs to be run

        reset: (bool): Flag to reset the model and train again from scratch

        seed: (int): Used to override the default seed set as part of the ModelConfig
    """
    seed_everything(seed if seed is not None else self.config.seed)
    train_loader, val_loader = self._pre_fit(
        train,
        validation,
        test,
        loss,
        metrics,
        optimizer,
        optimizer_params,
        train_sampler,
        target_transform,
        max_epochs,
        min_epochs,
        reset,
    )
    self.model.train()
    if self.config.auto_lr_find and (not self.config.fast_dev_run):
        self.trainer.tune(self.model, train_loader, val_loader)
        # Parameters in models needs to be initialized again after LR find
        self.model.data_aware_initialization(self.datamodule)
    self.model.train()
    self.trainer.fit(self.model, train_loader, val_loader)
    logger.info("Training the model completed...")
    if self.config.load_best:
        self.load_best_model()
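
A typical call, continuing the construction sketch above (train_df and valid_df are hypothetical dataframes; the target transform tuple is optional):

import numpy as np

# Train with an optional target transform: log1p before modelling, expm1 at prediction time.
tabular_model.fit(
    train=train_df,
    validation=valid_df,
    target_transform=(np.log1p, np.expm1),
    max_epochs=20,
)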

load_best_model(self)

Loads the best model after training is done

Source code in pytorch_tabular/tabular_model.py
def load_best_model(self):
    """Loads the best model after training is done"""
    if self.trainer.checkpoint_callback is not None:
        logger.info("Loading the best model...")
        ckpt_path = self.trainer.checkpoint_callback.best_model_path
        if ckpt_path != "":
            logger.debug(f"Model Checkpoint: {ckpt_path}")
            ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
            self.model.load_state_dict(ckpt["state_dict"])
        else:
            logger.info(
                "No best model available to load. Did you run it more than 1 epoch?..."
            )
    else:
        logger.info(
            "No best model available to load. Did you run it more than 1 epoch?..."
        )

load_from_checkpoint(dir, map_location=None, strict=True) classmethod

Loads a saved model from the directory

Parameters:

Name Type Description Default
dir str

The directory where the model was saved, along with the checkpoints

required
map_location Union[Dict[str, str], str, device, int, Callable, None]

If your checkpoint saved a GPU model and you now load on CPUs or a different number of GPUs, use this to map to the new setup. The behaviour is the same as in torch.load(). Defaults to None.

None
strict bool

Whether to strictly enforce that the keys in checkpoint_path match the keys returned by this module's state dict. Defaults to True.

True

Returns:

Type Description
TabularModel

The saved TabularModel

Source code in pytorch_tabular/tabular_model.py
@classmethod
def load_from_checkpoint(cls, dir: str, map_location = None, strict=True):
    """Loads a saved model from the directory

    Args:
        dir (str): The directory where the model was saved, along with the checkpoints
        map_location (Union[Dict[str, str], str, device, int, Callable, None]) – If your checkpoint 
            saved a GPU model and you now load on CPUs or a different number of GPUs, use this to map 
            to the new setup. The behaviour is the same as in torch.load()
        strict (bool) – Whether to strictly enforce that the keys in checkpoint_path match the keys 
            returned by this module’s state dict. Default: True.

    Returns:
        TabularModel: The saved TabularModel
    """
    config = OmegaConf.load(os.path.join(dir, "config.yml"))
    datamodule = joblib.load(os.path.join(dir, "datamodule.sav"))
    if (
        hasattr(config, "log_target")
        and (config.log_target is not None)
        and os.path.exists(os.path.join(dir, "exp_logger.sav"))
    ):
        logger = joblib.load(os.path.join(dir, "exp_logger.sav"))
    else:
        logger = None
    if os.path.exists(os.path.join(dir, "callbacks.sav")):
        callbacks = joblib.load(os.path.join(dir, "callbacks.sav"))
    else:
        callbacks = []
    if os.path.exists(os.path.join(dir, "custom_model_callable.sav")):
        model_callable = joblib.load(os.path.join(dir, "custom_model_callable.sav"))
        custom_model = True
    else:
        model_callable = getattr(
            getattr(models, config._module_src), config._model_name
        )
        custom_model = False
    custom_params = joblib.load(os.path.join(dir, "custom_params.sav"))
    model_args = {}
    if custom_params.get("custom_loss") is not None:
        model_args["loss"] = "MSELoss"
    if custom_params.get("custom_metrics") is not None:
        model_args["metrics"] = ["mean_squared_error"]
        model_args["metric_params"] = [{}]
    if custom_params.get("custom_optimizer") is not None:
        model_args["optimizer"] = "Adam"
    if custom_params.get("custom_optimizer_params") is not None:
        model_args["optimizer_params"] = {}

    # Initializing with default metrics, losses, and optimizers. Will revert once initialized
    model = model_callable.load_from_checkpoint(
        checkpoint_path=os.path.join(dir, "model.ckpt"),map_location=map_location, strict=strict, **model_args
    )
    # else:
    #     # Initializing with default values
    #     model = model_callable.load_from_checkpoint(
    #         checkpoint_path=os.path.join(dir, "model.ckpt"),
    #     )
    # Updating config with custom parameters for experiment tracking
    if custom_params.get("custom_loss") is not None:
        model.custom_loss = custom_params["custom_loss"]
    if custom_params.get("custom_metrics") is not None:
        model.custom_metrics = custom_params["custom_metrics"]
    if custom_params.get("custom_optimizer") is not None:
        model.custom_optimizer = custom_params["custom_optimizer"]
    if custom_params.get("custom_optimizer_params") is not None:
        model.custom_optimizer_params = custom_params["custom_optimizer_params"]
    model._setup_loss()
    model._setup_metrics()
    tabular_model = cls(config=config, model_callable=model_callable)
    tabular_model.model = model
    tabular_model.custom_model = custom_model
    tabular_model.datamodule = datamodule
    tabular_model.callbacks = callbacks
    tabular_model._prepare_trainer()
    tabular_model.trainer.model = model
    tabular_model.logger = logger
    return tabular_model
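
A restore sketch (the directory name is illustrative and should be the one passed to save_model earlier):

from pytorch_tabular import TabularModel

# Re-create the TabularModel, datamodule and callbacks from a saved directory.
loaded_model = TabularModel.load_from_checkpoint("saved_models/my_run", map_location="cpu")
predictions = loaded_model.predict(test_df)  # ready for inference straight away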

predict(self, test, quantiles=[0.25, 0.5, 0.75], n_samples=100, ret_logits=False)

Uses the trained model to predict on new data and return as a dataframe

Parameters:

Name Type Description Default
test DataFrame

The new dataframe with the features defined during training

required
quantiles Optional[List]

For probabilistic models like Mixture Density Networks, this specifies the different quantiles to be extracted apart from the central_tendency and added to the dataframe. For other models it is ignored. Defaults to [0.25, 0.5, 0.75]

[0.25, 0.5, 0.75]
n_samples Optional[int]

Number of samples to draw from the posterior to estimate the quantiles. Ignored for non-probabilistic models. Defaults to 100

100
ret_logits bool

Flag to return raw model outputs/logits except the backbone features along with the dataframe. Defaults to False

False

Returns:

Type Description
DataFrame

pd.DataFrame: Returns a dataframe with predictions and features. If classification, it returns probabilities and final prediction

Source code in pytorch_tabular/tabular_model.py
def predict(
    self,
    test: pd.DataFrame,
    quantiles: Optional[List] = [0.25, 0.5, 0.75],
    n_samples: Optional[int] = 100,
    ret_logits=False,
) -> pd.DataFrame:
    """Uses the trained model to predict on new data and return as a dataframe

    Args:
        test (pd.DataFrame): The new dataframe with the features defined during training
        quantiles (Optional[List]): For probabilistic models like Mixture Density Networks, this specifies
            the different quantiles to be extracted apart from the `central_tendency` and added to the dataframe.
            For other models it is ignored. Defaults to [0.25, 0.5, 0.75]
        n_samples (Optional[int]): Number of samples to draw from the posterior to estimate the quantiles.
            Ignored for non-probabilistic models. Defaults to 100
        ret_logits (bool): Flag to return raw model outputs/logits except the backbone features along
            with the dataframe. Defaults to False

    Returns:
        pd.DataFrame: Returns a dataframe with predictions and features.
            If classification, it returns probabilities and final prediction
    """
    assert all(
        [q <= 1 and q >= 0 for q in quantiles]
    ), "Quantiles should be a decimal between 0 and 1"
    self.model.eval()
    inference_dataloader = self.datamodule.prepare_inference_dataloader(test)
    point_predictions = []
    quantile_predictions = []
    logits_predictions = defaultdict(list)
    is_probabilistic = (
        hasattr(self.model.hparams, "_probabilistic")
        and self.model.hparams._probabilistic
    )
    for batch in tqdm(inference_dataloader, desc="Generating Predictions..."):
        for k, v in batch.items():
            if isinstance(v, list) and (len(v) == 0):
                # Skipping empty list
                continue
            batch[k] = v.to(self.model.device)
        if is_probabilistic:
            samples, ret_value = self.model.sample(
                batch, n_samples, ret_model_output=True
            )
            y_hat = torch.mean(samples, dim=-1)
            quantile_preds = []
            for q in quantiles:
                quantile_preds.append(
                    torch.quantile(samples, q=q, dim=-1).unsqueeze(1)
                )
        else:
            y_hat, ret_value = self.model.predict(batch, ret_model_output=True)
        if ret_logits:
            for k, v in ret_value.items():
                # if k == "backbone_features":
                #     continue
                logits_predictions[k].append(v.detach().cpu())
        point_predictions.append(y_hat.detach().cpu())
        if is_probabilistic:
            quantile_predictions.append(
                torch.cat(quantile_preds, dim=-1).detach().cpu()
            )
    point_predictions = torch.cat(point_predictions, dim=0)
    if point_predictions.ndim == 1:
        point_predictions = point_predictions.unsqueeze(-1)
    if is_probabilistic:
        quantile_predictions = torch.cat(quantile_predictions, dim=0).unsqueeze(-1)
        if quantile_predictions.ndim == 2:
            quantile_predictions = quantile_predictions.unsqueeze(-1)
    pred_df = test.copy()
    if self.config.task == "regression":
        point_predictions = point_predictions.numpy()
        # Probabilistic Models are only implemented for Regression
        if is_probabilistic:
            quantile_predictions = quantile_predictions.numpy()
        for i, target_col in enumerate(self.config.target):
            if self.datamodule.do_target_transform:
                if self.config.target[i] in pred_df.columns:
                    pred_df[
                        self.config.target[i]
                    ] = self.datamodule.target_transforms[i].inverse_transform(
                        pred_df[self.config.target[i]].values.reshape(-1, 1)
                    )
                pred_df[
                    f"{target_col}_prediction"
                ] = self.datamodule.target_transforms[i].inverse_transform(
                    point_predictions[:, i].reshape(-1, 1)
                )
                if is_probabilistic:
                    for j, q in enumerate(quantiles):
                        pred_df[
                            f"{target_col}_q{int(q*100)}"
                        ] = self.datamodule.target_transforms[i].inverse_transform(
                            quantile_predictions[:, j, i].reshape(-1, 1)
                        )
            else:
                pred_df[f"{target_col}_prediction"] = point_predictions[:, i]
                if is_probabilistic:
                    for j, q in enumerate(quantiles):
                        pred_df[
                            f"{target_col}_q{int(q*100)}"
                        ] = quantile_predictions[:, j, i].reshape(-1, 1)

    elif self.config.task == "classification":
        point_predictions = nn.Softmax(dim=-1)(point_predictions).numpy()
        for i, class_ in enumerate(self.datamodule.label_encoder.classes_):
            pred_df[f"{class_}_probability"] = point_predictions[:, i]
        pred_df[f"prediction"] = self.datamodule.label_encoder.inverse_transform(
            np.argmax(point_predictions, axis=1)
        )
    if ret_logits:
        for k, v in logits_predictions.items():
            v = torch.cat(v, dim=0).numpy()
            if v.ndim == 1:
                v = v.reshape(-1, 1)
            for i in range(v.shape[-1]):
                if v.shape[-1] > 1:
                    pred_df[f"{k}_{i}"] = v[:, i]
                else:
                    pred_df[f"{k}"] = v[:, i]
    return pred_df
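
For example (test_df is a hypothetical dataframe containing the training features; the quantile columns are only added for probabilistic models such as Mixture Density Networks):

pred_df = tabular_model.predict(test_df, quantiles=[0.1, 0.5, 0.9])
# Regression: adds a "<target>_prediction" column per target (plus "<target>_q10"/"_q50"/"_q90"
# for probabilistic models). Classification: adds one "<class>_probability" column per class
# and a final "prediction" column.
print(pred_df.head())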

save_model(self, dir)

Saves the model and checkpoints in the specified directory

Parameters:

Name Type Description Default
dir str

The path to the directory to save the model

required
Source code in pytorch_tabular/tabular_model.py
def save_model(self, dir: str):
    """Saves the model and checkpoints in the specified directory

    Args:
        dir (str): The path to the directory to save the model
    """
    if os.path.exists(dir) and (os.listdir(dir)):
        logger.warning("Directory is not empty. Overwriting the contents.")
        for f in os.listdir(dir):
            os.remove(os.path.join(dir, f))
    os.makedirs(dir, exist_ok=True)
    with open(os.path.join(dir, "config.yml"), "w") as fp:
        OmegaConf.save(self.config, fp, resolve=True)
    joblib.dump(self.datamodule, os.path.join(dir, "datamodule.sav"))
    if hasattr(self.config, "log_target") and self.config.log_target is not None:
        joblib.dump(self.logger, os.path.join(dir, "exp_logger.sav"))
    if hasattr(self, "callbacks"):
        joblib.dump(self.callbacks, os.path.join(dir, "callbacks.sav"))
    self.trainer.save_checkpoint(os.path.join(dir, "model.ckpt"))
    custom_params = {}
    custom_params["custom_loss"] = self.model.custom_loss
    custom_params["custom_metrics"] = self.model.custom_metrics
    custom_params["custom_optimizer"] = self.model.custom_optimizer
    custom_params["custom_optimizer_params"] = self.model.custom_optimizer_params
    joblib.dump(custom_params, os.path.join(dir, "custom_params.sav"))
    if self.custom_model:
        joblib.dump(
            self.model_callable, os.path.join(dir, "custom_model_callable.sav")
        )
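
A save sketch (the directory path is illustrative):

# Persist everything needed to restore the model later with load_from_checkpoint().
tabular_model.save_model("saved_models/my_run")
# The directory now contains config.yml, datamodule.sav, model.ckpt, custom_params.sav
# and, when applicable, exp_logger.sav, callbacks.sav and custom_model_callable.sav.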

pytorch_tabular.tabular_datamodule.TabularDatamodule

__init__(self, train, config, validation=None, test=None, target_transform=None, train_sampler=None) special

The Pytorch Lightning Datamodule for Tabular Data

Parameters:

Name Type Description Default
train DataFrame

The Training Dataframe

required
config omegaconf.dictconfig.DictConfig

Merged configuration object from ModelConfig, DataConfig, TrainerConfig, OptimizerConfig & ExperimentConfig.

required
validation DataFrame

Validation Dataframe. If left empty, a random sample of the train data (per the validation split in DataConfig) is used as validation. Defaults to None.

None
test DataFrame

Holdout DataFrame to check final performance on.

None
target_transform Union[sklearn.base.TransformerMixin, Tuple]

If provided, applies the transform to the target before modelling and inverses the transform during prediction. The parameter can either be a sklearn Transformer with an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func).

None
train_sampler Optional[torch.utils.data.sampler.Sampler]

Custom PyTorch batch sampler which will be passed to the train DataLoader. Useful for imbalanced data and other custom batching strategies.

None
Source code in pytorch_tabular/tabular_datamodule.py
def __init__(
    self,
    train: pd.DataFrame,
    config: DictConfig,
    validation: pd.DataFrame = None,
    test: pd.DataFrame = None,
    target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
    train_sampler: Optional[torch.utils.data.Sampler] = None,
):
    """The Pytorch Lightning Datamodule for Tabular Data

    Args:
        train (pd.DataFrame): The Training Dataframe
        config (DictConfig): Merged configuration object from ModelConfig, DataConfig,
        TrainerConfig, OptimizerConfig & ExperimentConfig
        validation (pd.DataFrame, optional): Validation Dataframe.
        If left empty, we use the validation split from DataConfig to split a random sample as validation.
        Defaults to None.
        test (pd.DataFrame, optional): Holdout DataFrame to check final performance on.
        Defaults to None.
        target_transform (Optional[Union[TransformerMixin, Tuple(Callable)]], optional): If provided, applies the transform to the target before modelling
        and inverse the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or
        a tuple of callables (transform_func, inverse_transform_func)
    """
    super().__init__()
    self.train = train.copy()
    self.validation = validation
    if target_transform is not None:
        if isinstance(target_transform, Iterable):
            target_transform = FunctionTransformer(
                func=target_transform[0], inverse_func=target_transform[1]
            )
        self.do_target_transform = True
    else:
        self.do_target_transform = False
    self.target_transform_template = target_transform
    self.test = test if test is None else test.copy()
    self.target = config.target
    self.batch_size = config.batch_size
    self.train_sampler = train_sampler
    self.config = config
    self._fitted = False
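
The datamodule is normally created internally by TabularModel, but the target_transform argument accepts either a sklearn transformer or a (transform, inverse_transform) tuple. A hedged sketch of direct construction (train_df, valid_df and config are assumed to exist; config is the merged DictConfig that TabularModel builds):

import numpy as np
from pytorch_tabular.tabular_datamodule import TabularDatamodule

# Direct construction is rarely needed; shown only to illustrate the target_transform tuple.
datamodule = TabularDatamodule(
    train=train_df,
    config=config,
    validation=valid_df,
    target_transform=(np.log1p, np.expm1),  # (transform_func, inverse_transform_func)
)
datamodule.setup(stage="fit")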

add_datepart(df, field_name, frequency, prefix=None, drop=True) classmethod

Helper function that adds columns relevant to a date in the column field_name of df.

Source code in pytorch_tabular/tabular_datamodule.py
@classmethod
def add_datepart(
    cls,
    df: pd.DataFrame,
    field_name: str,
    frequency: str,
    prefix: str = None,
    drop: bool = True,
):
    "Helper function that adds columns relevant to a date in the column `field_name` of `df`."
    field = df[field_name]
    prefix = (
        re.sub("[Dd]ate$", "", field_name) if prefix is None else prefix
    ) + "_"
    attr = cls.time_features_from_frequency_str(frequency)
    added_features = []
    for n in attr:
        if n == "Week":
            continue
        df[prefix + n] = getattr(field.dt, n.lower())
        added_features.append(prefix + n)
    # Pandas removed `dt.week` in v1.1.10
    if "Week" in attr:
        week = (
            field.dt.isocalendar().week
            if hasattr(field.dt, "isocalendar")
            else field.dt.week
        )
        df.insert(3, prefix + "Week", week)
        added_features.append(prefix + "Week")
    #TODO Not adding Elapsed by default. Need to route it through config
    # mask = ~field.isna()
    # df[prefix + "Elapsed"] = np.where(
    #     mask, field.values.astype(np.int64) // 10 ** 9, None
    # )
    # added_features.append(prefix + "Elapsed")
    if drop:
        df.drop(field_name, axis=1, inplace=True)

    # Removing features with zero variance
    # for col in added_features:
    #     if len(df[col].unique()) == 1:
    #         df.drop(columns=col, inplace=True)
    #         added_features.remove(col)
    return df, added_features
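
For example, with a small hypothetical dataframe holding a daily date column:

import pandas as pd
from pytorch_tabular.tabular_datamodule import TabularDatamodule

df = pd.DataFrame({"saledate": pd.to_datetime(["2021-01-01", "2021-06-15"])})
df, added = TabularDatamodule.add_datepart(df, "saledate", frequency="D", drop=True)
print(added)  # e.g. ['sale_Month', 'sale_Quarter', ..., 'sale_Week']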

do_leave_one_out_encoder(self)

Checks the special condition for NODE where we use a LeaveOneOutEncoder to encode categorical columns

Returns:

Type Description
bool

bool

Source code in pytorch_tabular/tabular_datamodule.py
def do_leave_one_out_encoder(self) -> bool:
    """Checks the special condition for NODE where we use a LeaveOneOutEncoder to encode categorical columns

    Returns:
        bool
    """
    return (self.config._model_name == "NODEModel") and (
        not self.config.embed_categorical
    )

make_date(df, date_field) classmethod

Make sure df[date_field] is of the right date type.

Source code in pytorch_tabular/tabular_datamodule.py
@classmethod
def make_date(cls, df: pd.DataFrame, date_field: str):
    "Make sure `df[date_field]` is of the right date type."
    field_dtype = df[date_field].dtype
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
    return df

prepare_inference_dataloader(self, df)

Function that prepares and loads the new data.

Parameters:

Name Type Description Default
df DataFrame

Dataframe with the features and target

required

Returns:

Type Description
torch.utils.data.dataloader.DataLoader

DataLoader: The dataloader for the passed in dataframe

Source code in pytorch_tabular/tabular_datamodule.py
def prepare_inference_dataloader(self, df: pd.DataFrame) -> DataLoader:
    """Function that prepares and loads the new data.

    Args:
        df (pd.DataFrame): Dataframe with the features and target

    Returns:
        DataLoader: The dataloader for the passed in dataframe
    """
    df = df.copy()
    if len(set(self.target) - set(df.columns)) > 0:
        if self.config.task == "classification":
            df.loc[:, self.target] = np.array(
                [self.label_encoder.classes_[0]] * len(df)
            )
        else:
            df.loc[:, self.target] = np.zeros((len(df), len(self.target)))
    df, _ = self.preprocess_data(df, stage="inference")

    dataset = TabularDataset(
        task=self.config.task,
        data=df,
        categorical_cols=self.config.categorical_cols,
        continuous_cols=self.config.continuous_cols,
        embed_categorical=(not self.do_leave_one_out_encoder()),
        target=self.target
        if all([col in df.columns for col in self.target])
        else None,
    )
    return DataLoader(
        dataset,
        self.batch_size,
        shuffle=False,
        num_workers=self.config.num_workers,
    )

preprocess_data(self, data, stage='inference')

The preprocessing, like Categorical Encoding, Normalization, etc., which any dataframe should undergo before being fed into the dataloader

Parameters:

Name Type Description Default
data DataFrame

A dataframe with the features and target

required
stage str

Internal parameter. Used to distinguish between fit and inference. Defaults to "inference".

'inference'

Returns:

Type Description
Tuple[pandas.core.frame.DataFrame, list]

tuple[pd.DataFrame, list]: Returns the processed dataframe and the added features(list) as a tuple

Source code in pytorch_tabular/tabular_datamodule.py
def preprocess_data(
    self, data: pd.DataFrame, stage: str = "inference"
) -> Tuple[pd.DataFrame, list]:
    """The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe should undergo before feeding into the dataloder

    Args:
        data (pd.DataFrame): A dataframe with the features and target
        stage (str, optional): Internal parameter. Used to distinguish between fit and inference. Defaults to "inference".

    Returns:
        tuple[pd.DataFrame, list]: Returns the processed dataframe and the added features(list) as a tuple
    """
    logger.info(f"Preprocessing data: Stage: {stage}...")
    added_features = None
    if self.config.encode_date_columns:
        for field_name, freq in self.config.date_columns:
            data = self.make_date(data, field_name)
            data, added_features = self.add_datepart(
                data, field_name, frequency=freq, prefix=None, drop=True
            )
    # The only features that are added are the date features extracted
    # from the date which are categorical in nature
    if (added_features is not None) and (stage == "fit"):
        logger.debug(
            f"Added {added_features} features after encoding the date_columns"
        )
        self.config.categorical_cols += added_features
        self.config.categorical_dim = (
            len(self.config.categorical_cols)
            if self.config.categorical_cols is not None
            else 0
        )
    # Encoding Categorical Columns
    if len(self.config.categorical_cols) > 0:
        if stage == "fit":
            if self.do_leave_one_out_encoder():
                logger.debug("Encoding Categorical Columns using LeavOneOutEncoder")
                self.categorical_encoder = ce.LeaveOneOutEncoder(
                    cols=self.config.categorical_cols, random_state=42
                )
                # Multi-Target Regression uses the first target to encode the categorical columns
                if len(self.config.target) > 1:
                    logger.warning(
                        f"Multi-Target Regression: using the first target({self.config.target[0]}) to encode the categorical columns"
                    )
                data = self.categorical_encoder.fit_transform(
                    data, data[self.config.target[0]]
                )
            else:
                logger.debug("Encoding Categorical Columns using OrdinalEncoder")
                self.categorical_encoder = OrdinalEncoder(
                    cols=self.config.categorical_cols
                )
                data = self.categorical_encoder.fit_transform(data)
        else:
            data = self.categorical_encoder.transform(data)

    # Transforming Continuous Columns
    if (self.config.continuous_feature_transform is not None) and (
        len(self.config.continuous_cols) > 0
    ):
        if stage == "fit":
            transform = self.CONTINUOUS_TRANSFORMS[
                self.config.continuous_feature_transform
            ]
            self.continuous_transform = transform["callable"](**transform["params"])
            # TODO implement quantile noise
            data.loc[
                :, self.config.continuous_cols
            ] = self.continuous_transform.fit_transform(
                data.loc[:, self.config.continuous_cols]
            )
        else:
            data.loc[
                :, self.config.continuous_cols
            ] = self.continuous_transform.transform(
                data.loc[:, self.config.continuous_cols]
            )

    # Normalizing Continuous Columns
    if (self.config.normalize_continuous_features) and (
        len(self.config.continuous_cols) > 0
    ):
        if stage == "fit":
            self.scaler = StandardScaler()
            data.loc[:, self.config.continuous_cols] = self.scaler.fit_transform(
                data.loc[:, self.config.continuous_cols]
            )
        else:
            data.loc[:, self.config.continuous_cols] = self.scaler.transform(
                data.loc[:, self.config.continuous_cols]
            )

    # Converting target labels to a 0 indexed label
    if self.config.task == "classification":
        if stage == "fit":
            self.label_encoder = LabelEncoder()
            data[self.config.target[0]] = self.label_encoder.fit_transform(
                data[self.config.target[0]]
            )
        else:
            if self.config.target[0] in data.columns:
                data[self.config.target[0]] = self.label_encoder.transform(
                    data[self.config.target[0]]
                )
    # Target Transforms
    if all([col in data.columns for col in self.config.target]):
        if self.do_target_transform:
            if stage == "fit":
                target_transforms = []
                for col in self.config.target:
                    _target_transform = copy.deepcopy(self.target_transform_template)
                    data[col] = _target_transform.fit_transform(
                        data[col].values.reshape(-1, 1)
                    )
                    target_transforms.append(_target_transform)
                self.target_transforms = target_transforms
            else:
                for col, _target_transform in zip(self.config.target, self.target_transforms):
                    data[col] = _target_transform.transform(
                        data[col].values.reshape(-1, 1)
                    )
    return data, added_features

setup(self, stage=None)

Data Operations you want to perform on all GPUs, like train-test split, transformations, etc. This is called before accessing the dataloaders

Parameters:

Name Type Description Default
stage Optional[str]

Internal parameter to distinguish between fit and inference. Defaults to None.

None
Source code in pytorch_tabular/tabular_datamodule.py
def setup(self, stage: Optional[str] = None) -> None:
    """Data Operations you want to perform on all GPUs, like train-test split, transformations, etc.
    This is called before accessing the dataloaders

    Args:
        stage (Optional[str], optional): Internal parameter to distinguish between fit and inference. Defaults to None.
    """
    if stage == "fit" or stage is None:
        if self.validation is None:
            logger.debug(
                f"No validation data provided. Using {self.config.validation_split*100}% of train data as validation"
            )
            val_idx = self.train.sample(
                int(self.config.validation_split * len(self.train)), random_state=42
            ).index
            self.validation = self.train[self.train.index.isin(val_idx)]
            self.train = self.train[~self.train.index.isin(val_idx)]
        else:
            self.validation = self.validation.copy()
        # Preprocessing Train, Validation
        self.train, _ = self.preprocess_data(self.train, stage="fit")
        self.validation, _ = self.preprocess_data(
            self.validation, stage="inference"
        )
        if self.test is not None:
            self.test, _ = self.preprocess_data(self.test, stage="inference")
        # Calculating the categorical dims and embedding dims etc and updating the config
        self.update_config()
        self._fitted = True

test_dataloader(self)

Function that loads the test set.

Source code in pytorch_tabular/tabular_datamodule.py
def test_dataloader(self) -> DataLoader:
    """ Function that loads the validation set. """
    if self.test is not None:
        dataset = TabularDataset(
            task=self.config.task,
            data=self.test,
            categorical_cols=self.config.categorical_cols,
            continuous_cols=self.config.continuous_cols,
            embed_categorical=(not self.do_leave_one_out_encoder()),
            target=self.target,
        )
        return DataLoader(
            dataset,
            self.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
        )

time_features_from_frequency_str(freq_str) classmethod

Returns a list of time features that will be appropriate for the given frequency string.

Parameters

freq_str Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

Source code in pytorch_tabular/tabular_datamodule.py
@classmethod
def time_features_from_frequency_str(cls, freq_str: str) -> List[str]:
    """
    Returns a list of time features that will be appropriate for the given frequency string.

    Parameters
    ----------

    freq_str
        Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

    """

    features_by_offsets = {
        offsets.YearBegin: [],
        offsets.YearEnd: [],
        offsets.MonthBegin: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
        ],
        offsets.MonthEnd: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
        ],
        offsets.Week: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week",
        ],
        offsets.Day: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
        ],
        offsets.BusinessDay: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
        ],
        offsets.Hour: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
            "Hour",
        ],
        offsets.Minute: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
            "Hour",
            "Minute",
        ],
    }

    offset = to_offset(freq_str)

    for offset_type, feature in features_by_offsets.items():
        if isinstance(offset, offset_type):
            return feature

    supported_freq_msg = f"""
    Unsupported frequency {freq_str}

    The following frequencies are supported:

        Y, YS   - yearly
            alias: A
        M, MS   - monthly
        W   - weekly
        D   - daily
        B   - business days
        H   - hourly
        T   - minutely
            alias: min
    """
    raise RuntimeError(supported_freq_msg)
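
A quick illustration of the mapping (the comments summarize the lists defined above):

from pytorch_tabular.tabular_datamodule import TabularDatamodule

print(TabularDatamodule.time_features_from_frequency_str("D"))   # month/quarter/day-level features
print(TabularDatamodule.time_features_from_frequency_str("1H"))  # the same plus "Hour"
# An unsupported string such as "5S" raises a RuntimeError listing the supported frequencies.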

train_dataloader(self, batch_size=None)

Function that loads the train set.

Source code in pytorch_tabular/tabular_datamodule.py
def train_dataloader(self, batch_size: Optional[int] = None) -> DataLoader:
    """ Function that loads the train set. """
    dataset = TabularDataset(
        task=self.config.task,
        data=self.train,
        categorical_cols=self.config.categorical_cols,
        continuous_cols=self.config.continuous_cols,
        embed_categorical=(not self.do_leave_one_out_encoder()),
        target=self.target,
    )
    return DataLoader(
        dataset,
        batch_size if batch_size is not None else self.batch_size,
        shuffle=True if self.train_sampler is None else False,
        num_workers=self.config.num_workers,
        sampler=self.train_sampler,
    )

update_config(self)

Calculates and updates a few key information to the config object

Exceptions:

Type Description
NotImplementedError

[description]

Source code in pytorch_tabular/tabular_datamodule.py
def update_config(self) -> None:
    """Calculates and updates a few key information to the config object

    Raises:
        NotImplementedError: [description]
    """
    if self.config.task == "regression":
        self.config.output_dim = len(self.config.target)
    elif self.config.task == "classification":
        self.config.output_dim = len(self.train[self.config.target[0]].unique())
    if not self.do_leave_one_out_encoder():
        self.config.categorical_cardinality = [
            int(self.train[col].fillna("NA").nunique()) + 1
            for col in self.config.categorical_cols
        ]
        if hasattr(self.config, "embedding_dims") and self.config.embedding_dims is None:
            self.config.embedding_dims = [
                (x, min(50, (x + 1) // 2))
                for x in self.config.categorical_cardinality
            ]
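
The embedding dimension rule used above is min(50, (cardinality + 1) // 2); a worked illustration with hypothetical cardinalities:

# Hypothetical categorical cardinalities (already including the +1 for unseen/NA values).
cardinalities = [3, 12, 250]
embedding_dims = [(x, min(50, (x + 1) // 2)) for x in cardinalities]
print(embedding_dims)  # [(3, 2), (12, 6), (250, 50)]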

val_dataloader(self)

Function that loads the validation set.

Source code in pytorch_tabular/tabular_datamodule.py
def val_dataloader(self) -> DataLoader:
    """ Function that loads the validation set. """
    dataset = TabularDataset(
        task=self.config.task,
        data=self.validation,
        categorical_cols=self.config.categorical_cols,
        continuous_cols=self.config.continuous_cols,
        embed_categorical=(not self.do_leave_one_out_encoder()),
        target=self.target,
    )
    return DataLoader(
        dataset, self.batch_size, shuffle=False, num_workers=self.config.num_workers
    )

pytorch_tabular.models.category_embedding.category_embedding_model.CategoryEmbeddingModel

forward(self, x)

Performs the forward pass of the model.

Parameters:

Name Type Description Default
x Dict

The input batch as a dictionary of tensors (the categorical and continuous features prepared by the datamodule).

required

Returns:

Type Description
Dict

A dictionary with the logits and the backbone features.

Source code in pytorch_tabular/models/category_embedding/category_embedding_model.py
def forward(self, x: Dict):
    x = self.unpack_input(x)
    x = self.backbone(x)
    y_hat = self.output_layer(x)
    if (self.hparams.task == "regression") and (
        self.hparams.target_range is not None
    ):
        for i in range(self.hparams.output_dim):
            y_min, y_max = self.hparams.target_range[i]
            y_hat[:, i] = y_min + nn.Sigmoid()(y_hat[:, i]) * (y_max - y_min)
    return {"logits": y_hat, "backbone_features": x}

pytorch_tabular.models.category_embedding.config.CategoryEmbeddingModelConfig dataclass

CategoryEmbeddingModel configuration

Parameters:

Name Type Description Default
task str

Specify whether the problem is regression or classification. Choices are: regression, classification

required
learning_rate float

The learning rate of the model

required
loss Optional[str]

The loss function to be applied. By Default it is MSELoss for regression and CrossEntropyLoss for classification. Unless you are sure what you are doing, leave it at MSELoss or L1Loss for regression and CrossEntropyLoss for classification

required
metrics Optional[List[str]]

the list of metrics you need to track during training. The metrics should be one of the metrics implemented in PyTorch Lightning. By default, it is Accuracy if classification and MeanSquaredLogError for regression

required
metrics_params Optional[List]

The parameters to be passed to the Metrics initialized

required
target_range Optional[List]

The range in which we should limit the output variable. Currently ignored for multi-target regression. Typically used for regression problems. If left empty, will not apply any restrictions.

required
layers str

Hyphen-separated number of layers and units in the classification head. eg. 32-64-32.

required
batch_norm_continuous_input bool

If True, the continuous inputs will be normalized by passing them through a BatchNorm layer

required
activation str

The activation type in the classification head. The default activation in PyTorch like ReLU, TanH, LeakyReLU, etc. https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity

required
embedding_dims Optional[List[int]]

The dimensions of the embedding for each categorical column as a list of tuples (cardinality, embedding_dim). If left empty, will infer using the cardinality of the categorical column using the rule min(50, (x + 1) // 2)

required
embedding_dropout float

probability of an embedding element to be zeroed.

required
dropout float

probability of a classification-layer element to be zeroed.

required
use_batch_norm bool

Flag to include a BatchNorm layer after each Linear Layer+DropOut

required
initialization str

Initialization scheme for the linear layers. Choices are: kaiming, xavier, random

required

Exceptions:

Type Description
NotImplementedError

Raises an error if task is not in ['regression','classification']
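
A hedged configuration sketch using the parameters documented above (the values are illustrative, not recommendations):

from pytorch_tabular.models import CategoryEmbeddingModelConfig

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="64-32",            # two-layer head: 64 units, then 32
    activation="LeakyReLU",
    learning_rate=1e-3,
    dropout=0.1,
    use_batch_norm=True,
    initialization="kaiming",
)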

pytorch_tabular.models.node.config.NodeConfig dataclass

Model configuration

Parameters:

Name Type Description Default
task str

Specify whether the problem is regression or classification. Choices are: regression, classification

required
learning_rate float

The learning rate of the model

required
loss Optional[str]

The loss function to be applied. By Default it is MSELoss for regression and CrossEntropyLoss for classification. Unless you are sure what you are doing, leave it at MSELoss or L1Loss for regression and CrossEntropyLoss for classification

required
metrics Optional[List[str]]

The list of metrics you need to track during training. The metrics should be among those implemented in PyTorch Lightning. By default, it is Accuracy for classification and MeanSquaredLogError for regression

required
metrics_params Optional[List]

The parameters to be passed to the metrics when they are initialized

required
target_range Optional[List]

The range in which we should limit the output variable. Typically used for regression problems; currently ignored for multi-target regression. If left empty, no restriction is applied

required
num_layers int

Number of Oblivious Decision Tree Layers in the Dense Architecture

required
num_trees int

Number of Oblivious Decision Trees in each layer

required
additional_tree_output_dim int

The additional output dimensions which are only used to pass information through the different layers of the architecture. Only the first output_dim outputs will be used for prediction

required
depth int

The depth of the individual Oblivious Decision Trees

required
choice_function str

Generates a sparse probability distribution to be used as feature weights (a.k.a. soft feature selection). Choices are: ['entmax15', 'sparsemax']

required
bin_function str

Generates a sparse probability distribution to be used as tree leaf weights. Choices are: ['entmoid15', 'sparsemoid']

required
max_features Optional[int]

If not None, sets a max limit on the number of features to be carried forward from layer to layer in the Dense Architecture

required
input_dropout float

Dropout to be applied to the inputs between layers of the Dense Architecture

required
initialize_response str

Initializing the response variable in the Oblivious Decision Trees. By default, it is a standard normal distribution. Choices are: ['normal', 'uniform']

required
initialize_selection_logits str

Initializing the feature selector. By default, it is a uniform distribution across the features. Choices are: ['uniform', 'normal']

required
threshold_init_beta float

Used in the data-aware initialization of thresholds, where each threshold is initialized randomly (with a beta distribution) to a feature value in the first batch. Specifically, a threshold is set to the q-th quantile of the data points, where q ~ Beta(threshold_init_beta, threshold_init_beta). If this parameter is set to 1, the initial thresholds have the same distribution as the data points; if greater than 1 (e.g. 10), thresholds will be closer to the median data value; if less than 1 (e.g. 0.1), thresholds will approach the min/max data values.

required
threshold_init_cutoff float

Used in the data-aware initialization of the scales (the threshold log-temperatures used in the scaling ODTs), in (0, inf). By default (1.0), the log-temperatures are initialized so that all samples in the first batch fall in the linear region of the entmoid/sparsemoid bin selectors and therefore have non-zero gradients. The temperatures are then scaled by this parameter. Setting this value > 1.0 leaves some margin between the data points and the sparse-sigmoid cutoff; setting it < 1.0 causes a (1 - value) fraction of the data points to end up in the flat sparse-sigmoid region. For instance, threshold_init_cutoff = 0.9 sets 10% of the points to exactly 0.0 or 1.0. All points will lie between (0.5 - 0.5 / threshold_init_cutoff) and (0.5 + 0.5 / threshold_init_cutoff).

required
embed_categorical bool

Flag to embed categorical columns using an Embedding Layer. If turned off, the categorical columns are encoded using LeaveOneOutEncoder

required
embedding_dims Optional[List[int]]

The dimensions of the embedding for each categorical column, as a list of tuples (cardinality, embedding_dim). If left empty, the embedding dimension is inferred from the cardinality x of the column using the rule min(50, (x + 1) // 2)

required
embedding_dropout float

The probability of an embedding element being zeroed.

required

Exceptions:

Type Description
NotImplementedError

Raises an error if task is not in ['regression','classification']
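
A minimal construction sketch using the parameters documented above (the sizes are illustrative choices, not library defaults):

from pytorch_tabular.models.node.config import NodeConfig

model_config = NodeConfig(
    task="regression",
    num_layers=2,              # ODT layers in the dense architecture (illustrative)
    num_trees=512,             # trees per layer (illustrative)
    depth=6,                   # depth of each ODT (illustrative)
    embed_categorical=True,    # use embeddings instead of LeaveOneOutEncoder
    learning_rate=1e-3,
)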

pytorch_tabular.models.node.node_model.NODEModel

data_aware_initialization(self, datamodule)

Performs data-aware initialization for NODE

Source code in pytorch_tabular/models/node/node_model.py
def data_aware_initialization(self, datamodule):
    """Performs data-aware initialization for NODE"""
    logger.info("Data Aware Initialization....")
    # Need a big batch to initialize properly
    alt_loader = datamodule.train_dataloader(batch_size=2000)
    batch = next(iter(alt_loader))
    for k, v in batch.items():
        if isinstance(v, list) and (len(v) == 0):
            # Skipping empty list
            continue
        # batch[k] = v.to("cpu" if self.config.gpu == 0 else "cuda")
        batch[k] = v.to(self.device)

    # single forward pass to initialize the ODST
    with torch.no_grad():
        self(batch)
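
In normal use, TabularModel triggers this initialization before training starts. If you are driving the model manually, a minimal sketch would be the following (assuming model is an instantiated NODEModel and datamodule a prepared TabularDatamodule):

# one large-batch forward pass initializes the data-aware parameters of the ODSTs
model.data_aware_initialization(datamodule)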

forward(self, x)

Same as torch.nn.Module.forward().

Parameters:

Name Type Description Default
*args

Whatever you decide to pass into the forward method.

required
**kwargs

Keyword arguments are also possible.

required

Returns:

Type Description

Your model's output

Source code in pytorch_tabular/models/node/node_model.py
def forward(self, x: Dict):
    x = self.unpack_input(x)
    if self.hparams.embed_categorical:
        if self.hparams.embedding_dropout != 0 and self.embedding_cat_dim != 0:
            x = self.embedding_dropout(x)
    x = self.backbone(x)
    y_hat = self.output_response(x)
    if (self.hparams.task == "regression") and (
        self.hparams.target_range is not None
    ):
        for i in range(self.hparams.output_dim):
            y_min, y_max = self.hparams.target_range[i]
            y_hat[:, i] = y_min + nn.Sigmoid()(y_hat[:, i]) * (y_max - y_min)
    return {"logits": y_hat, "backbone_features": x}

pytorch_tabular.models.tabnet.tabnet_model.TabNetModel

forward(self, x)

Same as torch.nn.Module.forward().

Parameters:

Name Type Description Default
*args

Whatever you decide to pass into the forward method.

required
**kwargs

Keyword arguments are also possible.

required

Returns:

Type Description

Your model's output

Source code in pytorch_tabular/models/tabnet/tabnet_model.py
def forward(self, x: Dict):
    # unpacking into a tuple
    x = self.unpack_input(x)
    # Returns output
    x = self.backbone(x)
    if (self.hparams.task == "regression") and (
        self.hparams.target_range is not None
    ):
        for i in range(self.hparams.output_dim):
            y_min, y_max = self.hparams.target_range[i]
            x[:, i] = y_min + nn.Sigmoid()(x[:, i]) * (y_max - y_min)
    return {"logits": x}  # No Easy way to access the raw features in TabNet

pytorch_tabular.models.tabnet.config.TabNetModelConfig dataclass

TabNet model configuration

Parameters:

Name Type Description Default
task str

Specify whether the problem is regression or classification. Choices are: regression, classification

required
learning_rate float

The learning rate of the model

required
loss Optional[str]

The loss function to be applied.

required
metrics Optional[List[str]]

The list of metrics you need to track during training.

required
metrics_params Optional[List]

The parameters to be passed to the metrics when they are initialized

required
target_range Optional[List]

The range in which we should limit the output variable. Currently ignored for multi-target regression

required
n_d int

Dimension of the prediction layer (usually between 4 and 64)

required
n_a int

Dimension of the attention layer (usually between 4 and 64)

required
n_steps int

Number of successive steps in the network (usually between 3 and 10)

required
gamma float

Float above 1, scaling factor for attention updates (usually between 1.0 and 2.0)

required
embedding_dims Optional[List[int]]

The dimensions of the embedding for each categorical column as a list of tuples (cardinality, embedding_dim)

required
n_independent int

Number of independent GLU layers in each GLU block (default 2)

required
n_shared int

Number of shared GLU layers in each GLU block (default 2)

required
virtual_batch_size int

Batch size for Ghost Batch Normalization

required
mask_type str

Either 'sparsemax' or 'entmax': this is the masking function to use. Choices are: sparsemax, entmax

required

Exceptions:

Type Description
NotImplementedError

Raises an error if task is not in ['regression','classification']
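
A minimal construction sketch using the parameters documented above (the widths and step counts are illustrative choices, not library defaults):

from pytorch_tabular.models.tabnet.config import TabNetModelConfig

model_config = TabNetModelConfig(
    task="classification",
    n_d=16,        # prediction-layer width (usually 4-64; illustrative)
    n_a=16,        # attention-layer width (usually 4-64; illustrative)
    n_steps=4,     # successive steps (usually 3-10; illustrative)
    gamma=1.5,     # attention update scaling (usually 1.0-2.0; illustrative)
    learning_rate=1e-3,
)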

pytorch_tabular.config.config.TrainerConfig dataclass

Trainer configuration

Parameters:

Name Type Description Default
batch_size int

Number of samples in each batch of training

required
fast_dev_run bool

Runs a quick debug run (a single batch through training and validation) to catch bugs early

required
max_epochs int

Maximum number of epochs to be run

required
min_epochs int

Force training for at least this many epochs. 1 by default

required
max_time Optional[int]

Stop training after this amount of time has passed. Disabled by default (None)

required
gpus int

Number of gpus to train on (int) or which GPUs to train on (list or str). -1 uses all available GPUs. By default uses CPU (None)

required
accumulate_grad_batches int

Accumulates grads every k batches or as set up in the dict. Trainer also calls optimizer.step() for the last indivisible step number.

required
auto_lr_find bool

Runs a learning rate finder algorithm when calling trainer.tune(), to find an optimal initial learning rate.

required
auto_select_gpus bool

If enabled and gpus is an integer, pick available gpus automatically. This is especially useful when GPUs are configured to be in 'exclusive mode', such that only one process at a time can access them.

required
check_val_every_n_epoch int

Check val every n train epochs.

required
deterministic bool

If true enables cudnn.deterministic. Might make your system slower, but ensures reproducibility.

required
gradient_clip_val float

Gradient clipping value

required
overfit_batches float

Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. If the training dataloaders have shuffle=True, Lightning will automatically disable it. Useful for quickly debugging or trying to overfit on purpose.

required
profiler Optional[str]

To profile individual steps during training and assist in identifying bottlenecks. Choices are: None, 'simple', 'advanced'

required
early_stopping str

The loss/metric that needs to be monitored for early stopping. If None, there will be no early stopping

required
early_stopping_min_delta float

The minimum delta in the loss/metric which qualifies as an improvement in early stopping

required
early_stopping_mode str

The direction in which the loss/metric should be optimized. Choices are max and min

required
early_stopping_patience int

The number of epochs to wait for further improvement in the loss/metric before stopping early

required
checkpoints str

The loss/metric that needs to be monitored for checkpoints. If None, there will be no checkpoints

required
checkpoints_path str

The path where the saved models will be stored

required
checkpoints_name Optional[str]

The name under which the models will be saved. If left blank, first it will look for run_name in experiment_config and if that is also None then it will use a generic name like task_version.

required
checkpoints_mode str

The direction in which the loss/metric should be optimized

required
checkpoints_save_top_k int

The number of best models to save

required
load_best bool

Flag to load the best model saved during training

required
track_grad_norm int

Track and Log Gradient Norms in the logger. -1 by default means no tracking. 1 for the L1 norm, 2 for L2 norm, etc.

required
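
A minimal construction sketch using the parameters documented above (the monitored metric name and paths are assumptions; adjust them to your setup):

from pytorch_tabular.config.config import TrainerConfig

trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=100,
    gpus=1,                           # 0/None for CPU-only training
    early_stopping="valid_loss",      # assumed metric name
    checkpoints="valid_loss",         # assumed metric name
    checkpoints_path="saved_models",  # hypothetical directory
)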

pytorch_tabular.config.config.DataConfig dataclass

Data configuration.

Parameters:

Name Type Description Default
target List[str]

A list of strings with the names of the target column(s)

required
continuous_cols List[str]

Column names of the numeric fields. Defaults to []

required
categorical_cols List

Column names of the categorical fields to treat differently. Defaults to []

required
date_columns List

(Column name, Freq) tuples of the date fields. For example, a field named introduction_date with a monthly frequency should have the entry ('introduction_date', 'M')

required
encode_date_columns bool

Whether or not to encode the derived variables from date

required
validation_split Optional[float]

Percentage of Training rows to keep aside as validation. Used only if Validation Data is not given separately

required
continuous_feature_transform Optional[str]

Whether or not to transform the features before modelling. By default it is turned off. Choices are: None, "yeo-johnson", "box-cox", "quantile_normal", "quantile_uniform"

required
normalize_continuous_features bool

Flag to normalize the continuous input features

required
quantile_noise int

NOT IMPLEMENTED. If specified fits QuantileTransformer on data with added gaussian noise with std = :quantile_noise: * data.std ; this will cause discrete values to be more separable. Please note that this transformation does NOT apply gaussian noise to the resulting data, the noise is only applied for QuantileTransformer

required
num_workers Optional[int]

The number of workers used for data loading. For Windows always set to 0

required
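
A minimal construction sketch using the parameters documented above (the column names are hypothetical):

from pytorch_tabular.config.config import DataConfig

data_config = DataConfig(
    target=["label"],
    continuous_cols=["age", "income"],
    categorical_cols=["city"],
    normalize_continuous_features=True,
)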

pytorch_tabular.config.config.ModelConfig dataclass

Base Model configuration

Parameters:

Name Type Description Default
task str

Specify whether the problem is regression or classification. Choices are: regression, classification

required
embedding_dims Optional[List[int]]

The dimensions of the embedding for each categorical column, as a list of tuples (cardinality, embedding_dim). If left empty, the embedding dimension is inferred from the cardinality x of the column using the rule min(50, (x + 1) // 2). Will only be used if the model uses categorical embedding.

required
learning_rate float

The learning rate of the model

required
loss Optional[str]

The loss function to be applied. By default, it is MSELoss for regression and CrossEntropyLoss for classification. Unless you are sure of what you are doing, leave it at MSELoss or L1Loss for regression and CrossEntropyLoss for classification

required
metrics Optional[List[str]]

The list of metrics you need to track during training. The metrics should be among those implemented in PyTorch Lightning. By default, it is accuracy for classification and mean_squared_error for regression

required
metrics_params Optional[List]

The parameters to be passed to the metrics function

required
target_range Optional[List]

The range in which we should limit the output variable. Typically used for regression problems; currently ignored for multi-target regression. If left empty, no restriction is applied

required

Exceptions:

Type Description
NotImplementedError

Raises an error if task is not regression or classification

pytorch_tabular.config.config.ExperimentConfig dataclass

Experiment configuration. Experiment Tracking with WandB and Tensorboard

Parameters:

Name Type Description Default
project_name str

The name of the project under which all runs will be logged. For Tensorboard this defines the folder under which the logs will be saved and for W&B it defines the project name.

required
run_name Optional[str]

The name of the run; a specific identifier to recognize the run. If left blank, an auto-generated name will be assigned

required
exp_watch Optional[str]

The level of logging required. Can be gradients, parameters, all or None. Defaults to None

required
log_target str

Determines where logging happens - Tensorboard or W&B. Choices are: wandb, tensorboard

required
log_logits bool

Turn this on to log the logits as a histogram in W&B

required
exp_log_freq int

Step count between logging of gradients and parameters.

required
_exp_version_manager str

The location of the yaml file which manages versions of experiments

required
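
A minimal construction sketch using the parameters documented above (the project and run names are hypothetical):

from pytorch_tabular.config.config import ExperimentConfig

experiment_config = ExperimentConfig(
    project_name="tabular_experiments",
    run_name="baseline_run",
    log_target="tensorboard",   # or "wandb"
)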

pytorch_tabular.config.config.OptimizerConfig dataclass

Optimizer and Learning Rate Scheduler configuration.

Parameters:

Name Type Description Default
optimizer str

Any of the standard optimizers from torch.optim. Defaults to Adam

required
optimizer_params dict

The parameters for the optimizer. If left blank, will use default parameters.

required
lr_scheduler Optional[str]

The name of the LearningRateScheduler to use, if any, from torch.optim.lr_scheduler. If None, will not use any scheduler. Defaults to None

required
lr_scheduler_params Optional[dict]

The parameters for the LearningRateScheduler. If left blank, will use default parameters.

required
lr_scheduler_monitor_metric Optional[str]

Used with ReduceLROnPlateau, where the plateau is decided based on this metric

required
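
A minimal construction sketch using the parameters documented above (the optimizer and scheduler parameters are illustrative choices, not library defaults):

from pytorch_tabular.config.config import OptimizerConfig

optimizer_config = OptimizerConfig(
    optimizer="Adam",
    optimizer_params={"weight_decay": 1e-5},    # illustrative
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params={"patience": 3},        # illustrative
    lr_scheduler_monitor_metric="valid_loss",   # assumed metric name
)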

pytorch_tabular.config.config.ExperimentRunManager

__init__(self, exp_version_manager='.pt_tmp/exp_version_manager.yml') special

Manages the versions of the experiments based on the name. It is a simple dictionary (yaml) based lookup. Its primary purpose is to avoid overwriting saved models when training is re-run without changing the experiment name.

Parameters:

Name Type Description Default
exp_version_manager str

The path of the yml file which acts as version control.

'.pt_tmp/exp_version_manager.yml'
Source code in pytorch_tabular/config/config.py
def __init__(
    self,
    exp_version_manager: str = ".pt_tmp/exp_version_manager.yml",
) -> None:
    """The manages the versions of the experiments based on the name. It is a simple dictionary(yaml) based lookup.
    Primary purpose is to avoid overwriting of saved models while runing the training without changing the experiment name.

    Args:
        exp_version_manager (str, optional): The path of the yml file which acts as version control.
        Defaults to ".pt_tmp/exp_version_manager.yml".
    """
    super().__init__()
    self._exp_version_manager = exp_version_manager
    if os.path.exists(exp_version_manager):
        self.exp_version_manager = OmegaConf.load(exp_version_manager)
    else:
        self.exp_version_manager = OmegaConf.create({})
        os.makedirs(os.path.split(exp_version_manager)[0], exist_ok=True)
        with open(self._exp_version_manager, "w") as file:
            OmegaConf.save(config=self.exp_version_manager, f=file)
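
The version file it maintains is a flat name-to-version lookup used to avoid overwriting saved models when the same experiment name is reused. A short sketch of inspecting that file (the run name in the comment is hypothetical):

from omegaconf import OmegaConf

versions = OmegaConf.load(".pt_tmp/exp_version_manager.yml")
print(OmegaConf.to_container(versions))   # e.g. {'my_experiment': 3} after repeated runs with that name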