Core Classes

pytorch_tabular.TabularModel(config=None, data_config=None, model_config=None, optimizer_config=None, trainer_config=None, experiment_config=None, model_callable=None, model_state_dict_path=None)

The core model which orchestrates everything from initializing the datamodule, the model, trainer, etc.

PARAMETER DESCRIPTION
config

Single OmegaConf DictConfig object or the path to the yaml file holding all the config parameters. Defaults to None.

TYPE: Optional[Union[DictConfig, str]] DEFAULT: None

data_config

DataConfig object or path to the yaml file. Defaults to None.

TYPE: Optional[Union[DataConfig, str]] DEFAULT: None

model_config

A subclass of ModelConfig or path to the yaml file. Determines which model to run from the type of config. Defaults to None.

TYPE: Optional[Union[ModelConfig, str]] DEFAULT: None

optimizer_config

OptimizerConfig object or path to the yaml file. Defaults to None.

TYPE: Optional[Union[OptimizerConfig, str]] DEFAULT: None

trainer_config

TrainerConfig object or path to the yaml file. Defaults to None.

TYPE: Optional[Union[TrainerConfig, str]] DEFAULT: None

experiment_config

ExperimentConfig object or path to the yaml file. If provided, configures experiment tracking. Defaults to None.

TYPE: Optional[Union[ExperimentConfig, str]] DEFAULT: None

model_callable

If provided, will override the model callable that will be loaded from the config. Typically used when providing Custom Models.

TYPE: Optional[Callable] DEFAULT: None

model_state_dict_path

If provided, will load the state dict after initializing the model from config.

TYPE: Optional[Union[str, Path]] DEFAULT: None
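
For example, a minimal construction sketch (column names and config values below are illustrative, not taken from this page):

from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import CategoryEmbeddingModelConfig

data_config = DataConfig(
    target=["target"],                    # illustrative column names
    continuous_cols=["num_1", "num_2"],
    categorical_cols=["cat_1"],
)
trainer_config = TrainerConfig(max_epochs=10, batch_size=1024)
optimizer_config = OptimizerConfig()
model_config = CategoryEmbeddingModelConfig(task="classification")

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)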

Source code in src/pytorch_tabular/tabular_model.py
def __init__(
    self,
    config: Optional[DictConfig] = None,
    data_config: Optional[Union[DataConfig, str]] = None,
    model_config: Optional[Union[ModelConfig, str]] = None,
    optimizer_config: Optional[Union[OptimizerConfig, str]] = None,
    trainer_config: Optional[Union[TrainerConfig, str]] = None,
    experiment_config: Optional[Union[ExperimentConfig, str]] = None,
    model_callable: Optional[Callable] = None,
    model_state_dict_path: Optional[Union[str, Path]] = None,
) -> None:
    """The core model which orchestrates everything from initializing the datamodule, the model, trainer, etc.

    Args:
        config (Optional[Union[DictConfig, str]], optional): Single OmegaConf DictConfig object or
            the path to the yaml file holding all the config parameters. Defaults to None.

        data_config (Optional[Union[DataConfig, str]], optional):
            DataConfig object or path to the yaml file. Defaults to None.

        model_config (Optional[Union[ModelConfig, str]], optional):
            A subclass of ModelConfig or path to the yaml file.
            Determines which model to run from the type of config. Defaults to None.

        optimizer_config (Optional[Union[OptimizerConfig, str]], optional):
            OptimizerConfig object or path to the yaml file. Defaults to None.

        trainer_config (Optional[Union[TrainerConfig, str]], optional):
            TrainerConfig object or path to the yaml file. Defaults to None.

        experiment_config (Optional[Union[ExperimentConfig, str]], optional):
            ExperimentConfig object or path to the yaml file.
            If Provided configures the experiment tracking. Defaults to None.

        model_callable (Optional[Callable], optional):
            If provided, will override the model callable that will be loaded from the config.
            Typically used when providing Custom Models

        model_state_dict_path (Optional[Union[str, Path]], optional):
            If provided, will load the state dict after initializing the model from config.
    """
    super().__init__()
    self.exp_manager = ExperimentRunManager()
    if config is None:
        assert any(c is not None for c in (data_config, model_config, optimizer_config, trainer_config)), (
            "If `config` is None, `data_config`, `model_config`, `trainer_config`,"
            " and `optimizer_config` cannot be None"
        )
        data_config = self._read_parse_config(data_config, DataConfig)
        model_config = self._read_parse_config(model_config, ModelConfig)
        trainer_config = self._read_parse_config(trainer_config, TrainerConfig)
        optimizer_config = self._read_parse_config(optimizer_config, OptimizerConfig)
        if model_config.task != "ssl":
            assert (
                data_config.target is not None
            ), f"`target` in data_config should not be None for {model_config.task} task"
        if experiment_config is None:
            logger.info("Experiment Tracking is turned off")
            self.track_experiment = False
            self.config = OmegaConf.merge(
                OmegaConf.to_container(data_config),
                OmegaConf.to_container(model_config),
                OmegaConf.to_container(trainer_config),
                OmegaConf.to_container(optimizer_config),
            )
        else:
            experiment_config = self._read_parse_config(experiment_config, ExperimentConfig)
            self.track_experiment = True
            self.config = OmegaConf.merge(
                OmegaConf.to_container(data_config),
                OmegaConf.to_container(model_config),
                OmegaConf.to_container(trainer_config),
                OmegaConf.to_container(experiment_config),
                OmegaConf.to_container(optimizer_config),
            )
    else:
        self.config = config
        if hasattr(config, "log_target") and (config.log_target is not None):
            # experiment_config = OmegaConf.structured(experiment_config)
            self.track_experiment = True
        else:
            logger.info("Experiment Tracking is turned off")
            self.track_experiment = False

    self.name, self.uid = self._get_run_name_uid()
    if self.track_experiment:
        self._setup_experiment_tracking()
    else:
        self.logger = None

    self.exp_manager = ExperimentRunManager()
    if model_callable is None:
        self.model_callable = getattr_nested(self.config._module_src, self.config._model_name)
        self.custom_model = False
    else:
        self.model_callable = model_callable
        self.custom_model = True
    self.model_state_dict_path = model_state_dict_path
    self._is_config_updated_with_data = False
    self._run_validation()

create_finetune_model(task, head, head_config, target=None, optimizer_config=None, trainer_config=None, experiment_config=None, loss=None, metrics=None, metrics_prob_input=None, metrics_params=None, optimizer=None, optimizer_params={}, learning_rate=None, target_range=None)

Creates a new TabularModel using the pretrained weights and the new task and head.

PARAMETER DESCRIPTION
task

The task to be performed. One of "regression", "classification"

TYPE: str

head

The head to be used for the model. Should be one of the heads defined in pytorch_tabular.models.common.heads. Defaults to LinearHead. Choices are: [None,LinearHead,MixtureDensityHead].

TYPE: str

head_config

The config as a dict which defines the head. If left empty, will be initialized as default linear head.

TYPE: Dict

target

The target column name if not provided in the initial pretraining stage. Defaults to None.

TYPE: Optional[str] DEFAULT: None

optimizer_config

If provided, will redefine the optimizer for fine-tuning stage. Defaults to None.

TYPE: Optional[OptimizerConfig] DEFAULT: None

trainer_config

If provided, will redefine the trainer for fine-tuning stage. Defaults to None.

TYPE: Optional[TrainerConfig] DEFAULT: None

experiment_config

If provided, will redefine the experiment for fine-tuning stage. Defaults to None.

TYPE: Optional[ExperimentConfig] DEFAULT: None

loss

If provided, will be used as the loss function for fine-tuning. By default, it is MSELoss for regression and CrossEntropyLoss for classification.

TYPE: Optional[torch.nn.Module] DEFAULT: None

metrics

List of metrics (either callables or str) to be used for the fine-tuning stage. If str, it should be one of the functional metrics implemented in torchmetrics.functional. Defaults to None.

TYPE: Optional[List[Callable]] DEFAULT: None

metrics_prob_input

A mandatory parameter for classification metrics. This defines whether the input to the metric function is the probability or the class. Length should be the same as the number of metrics. Defaults to None.

TYPE: Optional[List[bool]] DEFAULT: None

metrics_params

The parameters for the metrics, in the same order as metrics. For example, f1_score for multi-class needs the parameter average to fully define the metric. Defaults to None.

TYPE: Optional[Dict] DEFAULT: None

optimizer

Custom optimizers which are drop-in replacements for standard PyTorch optimizers. If provided, the OptimizerConfig is ignored in favor of this. Defaults to None.

TYPE: Optional[torch.optim.Optimizer] DEFAULT: None

optimizer_params

The parameters for the optimizer. Defaults to {}.

TYPE: Dict DEFAULT: {}

learning_rate

The learning rate to be used. Defaults to None, in which case the learning rate already set in the config is retained.

TYPE: Optional[float] DEFAULT: None

target_range

The target range for the regression task. Is ignored for classification. Defaults to None.

TYPE: Optional[Tuple[float, float]] DEFAULT: None

RETURNS DESCRIPTION
TabularModel

The new TabularModel for fine-tuning

TYPE: TabularModel
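
A hedged usage sketch, assuming `ssl_model` is a TabularModel that has already been pre-trained; the head_config keys follow LinearHeadConfig and the values are illustrative:

finetune_model = ssl_model.create_finetune_model(
    task="classification",
    head="LinearHead",
    head_config={"layers": "64-32", "activation": "ReLU"},
    target=["target"],   # required here if target was not set during pretraining
    learning_rate=1e-3,
)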

Source code in src/pytorch_tabular/tabular_model.py
def create_finetune_model(
    self,
    task: str,
    head: str,
    head_config: Dict,
    target: Optional[str] = None,
    optimizer_config: Optional[OptimizerConfig] = None,
    trainer_config: Optional[TrainerConfig] = None,
    experiment_config: Optional[ExperimentConfig] = None,
    loss: Optional[torch.nn.Module] = None,
    metrics: Optional[List[Union[Callable, str]]] = None,
    metrics_prob_input: Optional[List[bool]] = None,
    metrics_params: Optional[Dict] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    optimizer_params: Dict = {},
    learning_rate: Optional[float] = None,
    target_range: Optional[Tuple[float, float]] = None,
):
    """Creates a new TabularModel model using the pretrained weights and the new task and head
    Args:
        task (str): The task to be performed. One of "regression", "classification"

        head (str): The head to be used for the model. Should be one of the heads defined
            in `pytorch_tabular.models.common.heads`. Defaults to  LinearHead. Choices are:
            [`None`,`LinearHead`,`MixtureDensityHead`].

        head_config (Dict): The config as a dict which defines the head. If left empty,
            will be initialized as default linear head.

        target (Optional[str], optional): The target column name if not provided in the initial pretraining stage.
            Defaults to None.

        optimizer_config (Optional[OptimizerConfig], optional):
            If provided, will redefine the optimizer for fine-tuning stage. Defaults to None.

        trainer_config (Optional[TrainerConfig], optional):
            If provided, will redefine the trainer for fine-tuning stage. Defaults to None.

        experiment_config (Optional[ExperimentConfig], optional):
            If provided, will redefine the experiment for fine-tuning stage. Defaults to None.

        loss (Optional[torch.nn.Module], optional):
            If provided, will be used as the loss function for the fine-tuning.
            By Default it is MSELoss for regression and CrossEntropyLoss for classification.

        metrics (Optional[List[Callable]], optional): List of metrics (either callables or str) to be used for the
            fine-tuning stage. If str, it should be one of the functional metrics implemented in
            ``torchmetrics.functional``. Defaults to None.

        metrics_prob_input (Optional[List[bool]], optional): Is a mandatory parameter for classification metrics
            This defines whether the input to the metric function is the probability or the class.
            Length should be same as the number of metrics. Defaults to None.

        metrics_params (Optional[Dict], optional): The parameters for the metrics in the same order as metrics.
            For eg. f1_score for multi-class needs a parameter `average` to fully define the metric.
            Defaults to None.

        optimizer (Optional[torch.optim.Optimizer], optional):
            Custom optimizers which are a drop in replacements for standard PyTorch optimizers. If provided,
            the OptimizerConfig is ignored in favor of this. Defaults to None.

        optimizer_params (Dict, optional): The parameters for the optimizer. Defaults to {}.

        learning_rate (Optional[float], optional): The learning rate to be used. Defaults to 1e-3.

        target_range (Optional[Tuple[float, float]], optional): The target range for the regression task.
            Is ignored for classification. Defaults to None.
    Returns:
        TabularModel (TabularModel): The new TabularModel model for fine-tuning
    """
    config = self.config
    if target is None:
        assert (
            hasattr(config, "target") and config.target is not None
        ), "`target` cannot be None if it was not set in the initial `DataConfig`"
    else:
        assert isinstance(target, list), "`target` should be a list of strings"
        config.target = target
    config.task = task
    # Add code to update configs with newly provided ones
    if optimizer_config is not None:
        for key, value in optimizer_config.__dict__.items():
            config[key] = value
        if len(optimizer_params) > 0:
            config.optimizer_params = optimizer_params
        else:
            config.optimizer_params = {}
    if trainer_config is not None:
        for key, value in trainer_config.__dict__.items():
            config[key] = value
    if experiment_config is not None:
        for key, value in experiment_config.__dict__.items():
            config[key] = value
    else:
        if self.track_experiment:
            # Renaming the experiment run so that a different log is created for finetuning
            logger.info(f"Renaming the experiment run for finetuning as {config['run_name'] + '_finetuned'}")
            config["run_name"] = config["run_name"] + "_finetuned"

    datamodule = self.datamodule
    # Setting the attributes from new config
    datamodule.target = config.target
    datamodule.batch_size = config.batch_size
    datamodule.seed = config.seed
    model_callable = _GenericModel
    inferred_config = self.datamodule.update_config(config)
    inferred_config = OmegaConf.structured(inferred_config)
    # Adding dummy attributes for compatibility. Not used because custom metrics are provided
    if not hasattr(config, "metrics"):
        config.metrics = "dummy"
    if not hasattr(config, "metrics_params"):
        config.metrics_params = {}
    if not hasattr(config, "metrics_prob_input"):
        config.metrics_prob_input = metrics_prob_input or [False]
    if metrics is not None:
        assert len(metrics) == len(metrics_params), "Number of metrics and metrics_params should be same"
        assert len(metrics) == len(metrics_prob_input), "Number of metrics and metrics_prob_input should be same"
        metrics = [getattr(torchmetrics.functional, m) if isinstance(m, str) else m for m in metrics]
    if task == "regression":
        loss = loss or torch.nn.MSELoss()
        if metrics is None:
            metrics = [torchmetrics.functional.mean_squared_error]
            metrics_params = [{}]
    elif task == "classification":
        loss = loss or torch.nn.CrossEntropyLoss()
        if metrics is None:
            metrics = [torchmetrics.functional.accuracy]
            metrics_params = [{"task": "multiclass", "num_classes": inferred_config.output_dim, "top_k": 1}]
            metrics_prob_input = [False]
        else:
            for i, mp in enumerate(metrics_params):
                # For classification task, output_dim == number of classses
                metrics_params[i]["task"] = mp.get("task", "multiclass")
                metrics_params[i]["num_classes"] = mp.get("num_classes", inferred_config.output_dim)
                metrics_params[i]["top_k"] = mp.get("top_k", 1)
    # Forming partial callables using metrics and metric params
    metrics = [partial(m, **mp) for m, mp in zip(metrics, metrics_params)]
    self.model.mode = "finetune"
    if learning_rate is not None:
        config.learning_rate = learning_rate
    config.target_range = target_range
    model_args = {
        "backbone": self.model,
        "head": head,
        "head_config": head_config,
        "config": config,
        "inferred_config": inferred_config,
        "custom_loss": loss,
        "custom_metrics": metrics,
        "custom_metrics_prob_inputs": metrics_prob_input,
        "custom_optimizer": optimizer,
        "custom_optimizer_params": optimizer_params,
    }
    # Initializing with default metrics, losses, and optimizers. Will revert once initialized
    model = model_callable(
        **model_args,
    )
    tabular_model = TabularModel(config=config)
    tabular_model.model = model
    tabular_model.datamodule = datamodule
    # Setting a flag to identify this as a fine-tune model
    tabular_model._is_finetune_model = True
    return tabular_model

evaluate(test=None, test_loader=None, ckpt_path=None, verbose=True)

Evaluates the dataframe using the loss and metrics already set in config.

PARAMETER DESCRIPTION
test

The dataframe to be evaluated. If not provided, will try to use the test data provided during fit. If that was also not provided, will return an empty dictionary.

TYPE: Optional[pd.DataFrame] DEFAULT: None

test_loader

The dataloader to be used for evaluation. If provided, will use the dataloader instead of the test dataframe or the test data provided during fit. DEPRECATION: providing test data during fit is deprecated and will be removed in a future release. Defaults to None.

TYPE: Optional[torch.utils.data.DataLoader] DEFAULT: None

ckpt_path

The path to the checkpoint to be loaded. If not provided, will try to use the best checkpoint during training.

TYPE: Optional[Union[str, Path]] DEFAULT: None

verbose

If true, will print the results. Defaults to True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
Union[dict, list]

The final test result dictionary.
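
For example, assuming the model has been fit and `test_df` is a held-out dataframe with the same columns as the training data:

result = tabular_model.evaluate(test=test_df, verbose=True)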

Source code in src/pytorch_tabular/tabular_model.py
def evaluate(
    self,
    test: Optional[pd.DataFrame] = None,
    test_loader: Optional[torch.utils.data.DataLoader] = None,
    ckpt_path: Optional[Union[str, Path]] = None,
    verbose: bool = True,
) -> Union[dict, list]:
    """Evaluates the dataframe using the loss and metrics already set in config.

    Args:
        test (Optional[pd.DataFrame]): The dataframe to be evaluated. If not provided, will try to use the
            test provided during fit. If that was also not provided will return an empty dictionary

        test_loader (Optional[torch.utils.data.DataLoader], optional): The dataloader to be used for evaluation.
            If provided, will use the dataloader instead of the test dataframe or the test data provided during fit.
            DEPRECATION: providing test data during fit is deprecated and will be removed in a future release.
            Defaults to None.

        ckpt_path (Optional[Union[str, Path]], optional): The path to the checkpoint to be loaded. If not provided,
            will try to use the best checkpoint during training.

        verbose (bool, optional): If true, will print the results. Defaults to True.
    Returns:
        The final test result dictionary.
    """
    if test_loader is None and test is None:
        warnings.warn(
            "Providing test in fit is deprecated."
            " Not providing `test` or `test_loader` in `evaluate` will cause an error in a future release."
        )
    if test_loader is None:
        if test is not None:
            test_loader = self.datamodule.prepare_inference_dataloader(test)
        elif self.datamodule.test is not None:
            warnings.warn(
                "Providing test in fit is deprecated."
                " Not providing `test` or `test_loader` in `evaluate` will cause an error in a future release."
            )
            test_loader = self.datamodule.test_dataloader()
        else:
            return {}
    result = self.trainer.test(
        model=self.model,
        dataloaders=test_loader,
        ckpt_path=ckpt_path,
        verbose=verbose,
    )
    return result

feature_importance()

Returns the feature importance of the model as a pandas DataFrame.
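
A short sketch, assuming the underlying model implements feature importance (not all models do):

importance_df = tabular_model.feature_importance()
print(importance_df.head())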

Source code in src/pytorch_tabular/tabular_model.py
def feature_importance(self) -> pd.DataFrame:
    """Returns the feature importance of the model as a pandas DataFrame."""
    return self.model.feature_importance()

find_learning_rate(model, datamodule, min_lr=1e-08, max_lr=1, num_training=100, mode='exponential', early_stop_threshold=4.0, plot=True, callbacks=None)

Enables the user to do a range test of good initial learning rates, to reduce the amount of guesswork in picking a good starting learning rate.

PARAMETER DESCRIPTION
model

The PyTorch Lightning model to be trained.

TYPE: pl.LightningModule

datamodule

The datamodule

TYPE: TabularDatamodule

min_lr

minimum learning rate to investigate

TYPE: Optional[float] DEFAULT: 1e-08

max_lr

maximum learning rate to investigate

TYPE: Optional[float] DEFAULT: 1

num_training

number of learning rates to test

TYPE: Optional[int] DEFAULT: 100

mode

search strategy, either 'linear' or 'exponential'. If set to 'linear' the learning rate will be searched by linearly increasing after each batch. If set to 'exponential', will increase learning rate exponentially.

TYPE: Optional[str] DEFAULT: 'exponential'

early_stop_threshold

threshold for stopping the search. If the loss at any point is larger than early_stop_threshold*best_loss then the search is stopped. To disable, set to None.

TYPE: Optional[float] DEFAULT: 4.0

plot

If true, will plot using matplotlib

TYPE: bool DEFAULT: True

callbacks

If provided, will be added to the callbacks for Trainer.

TYPE: Optional[List] DEFAULT: None

RETURNS DESCRIPTION
Tuple[float, pd.DataFrame]

The suggested learning rate and the learning rate finder results
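
A sketch that wires this together with prepare_dataloader and prepare_model (documented further below); `train_df` is illustrative:

datamodule = tabular_model.prepare_dataloader(train=train_df, seed=42)
model = tabular_model.prepare_model(datamodule)
suggested_lr, results_df = tabular_model.find_learning_rate(
    model, datamodule, min_lr=1e-6, max_lr=1, plot=False
)
print(f"Suggested learning rate: {suggested_lr}")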

Source code in src/pytorch_tabular/tabular_model.py
def find_learning_rate(
    self,
    model: pl.LightningModule,
    datamodule: TabularDatamodule,
    min_lr: float = 1e-8,
    max_lr: float = 1,
    num_training: int = 100,
    mode: str = "exponential",
    early_stop_threshold: Optional[float] = 4.0,
    plot: bool = True,
    callbacks: Optional[List] = None,
) -> Tuple[float, pd.DataFrame]:
    """Enables the user to do a range test of good initial learning rates, to reduce the amount of guesswork in
    picking a good starting learning rate.

    Args:
        model (pl.LightningModule): The PyTorch Lightning model to be trained.

        datamodule (TabularDatamodule): The datamodule

        min_lr (Optional[float], optional): minimum learning rate to investigate

        max_lr (Optional[float], optional): maximum learning rate to investigate

        num_training (Optional[int], optional): number of learning rates to test

        mode (Optional[str], optional): search strategy, either 'linear' or 'exponential'. If set to
            'linear' the learning rate will be searched by linearly increasing
            after each batch. If set to 'exponential', will increase learning
            rate exponentially.

        early_stop_threshold (Optional[float], optional): threshold for stopping the search. If the
            loss at any point is larger than early_stop_threshold*best_loss
            then the search is stopped. To disable, set to None.

        plot (bool, optional): If true, will plot using matplotlib

        callbacks (Optional[List], optional): If provided, will be added to the callbacks for Trainer.

    Returns:
        The suggested learning rate and the learning rate finder results
    """
    self._prepare_for_training(model, datamodule, callbacks, max_epochs=None, min_epochs=None)
    train_loader, _ = datamodule.train_dataloader(), datamodule.val_dataloader()
    lr_finder = self.trainer.tuner.lr_find(
        model=self.model,
        train_dataloaders=train_loader,
        val_dataloaders=None,
        min_lr=min_lr,
        max_lr=max_lr,
        num_training=num_training,
        mode=mode,
        early_stop_threshold=early_stop_threshold,
    )
    if plot:
        fig = lr_finder.plot(suggest=True)
        fig.show()
    new_lr = lr_finder.suggestion()
    # cancelling the model and trainer that was loaded
    self.model = None
    self.trainer = None
    self.datamodule = None
    self.callbacks = None
    return new_lr, pd.DataFrame(lr_finder.results)

finetune(train, validation=None, train_sampler=None, target_transform=None, max_epochs=None, min_epochs=None, seed=42, callbacks=None, datamodule=None, freeze_backbone=False)

Finetunes the model on the provided data

PARAMETER DESCRIPTION
train

The training data with labels

TYPE: pd.DataFrame

validation

The validation data with labels. Defaults to None.

TYPE: Optional[pd.DataFrame] DEFAULT: None

train_sampler

If provided, will be used as a batch sampler for training. Defaults to None.

TYPE: Optional[torch.utils.data.Sampler] DEFAULT: None

target_transform

If provided, will be used to transform the target before training and inverse transform the predictions.

TYPE: Optional[Union[TransformerMixin, Tuple]] DEFAULT: None

max_epochs

The maximum number of epochs to train for. Defaults to None.

TYPE: Optional[int] DEFAULT: None

min_epochs

The minimum number of epochs to train for. Defaults to None.

TYPE: Optional[int] DEFAULT: None

seed

The seed to be used for training. Defaults to 42.

TYPE: Optional[int] DEFAULT: 42

callbacks

If provided, will be added to the callbacks for Trainer. Defaults to None.

TYPE: Optional[List[pl.Callback]] DEFAULT: None

datamodule

If provided, will be used as the datamodule for training. Defaults to None.

TYPE: Optional[TabularDatamodule] DEFAULT: None

freeze_backbone

If True, will freeze the backbone by turning off gradients. Defaults to False, which means the pretrained weights are also further tuned during fine-tuning.

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
pl.Trainer

pl.Trainer: The trainer object
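
For example, continuing from a model created with create_finetune_model (the labeled dataframes are illustrative):

trainer = finetune_model.finetune(
    train=labeled_train_df,
    validation=labeled_val_df,
    max_epochs=20,
    freeze_backbone=True,   # train only the new head; keep the pretrained backbone fixed
)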

Source code in src/pytorch_tabular/tabular_model.py
def finetune(
    self,
    train,
    validation: Optional[pd.DataFrame] = None,
    train_sampler: Optional[torch.utils.data.Sampler] = None,
    target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
    max_epochs: Optional[int] = None,
    min_epochs: Optional[int] = None,
    seed: Optional[int] = 42,
    callbacks: Optional[List[pl.Callback]] = None,
    datamodule: Optional[TabularDatamodule] = None,
    freeze_backbone: bool = False,
) -> pl.Trainer:
    """Finetunes the model on the provided data
    Args:
        train (pd.DataFrame): The training data with labels

        validation (Optional[pd.DataFrame], optional): The validation data with labels. Defaults to None.

        train_sampler (Optional[torch.utils.data.Sampler], optional): If provided, will be used as a batch sampler
            for training. Defaults to None.

        target_transform (Optional[Union[TransformerMixin, Tuple]], optional): If provided, will be used
            to transform the target before training and inverse transform the predictions.

        max_epochs (Optional[int], optional): The maximum number of epochs to train for. Defaults to None.

        min_epochs (Optional[int], optional): The minimum number of epochs to train for. Defaults to None.

        seed (Optional[int], optional): The seed to be used for training. Defaults to 42.

        callbacks (Optional[List[pl.Callback]], optional): If provided, will be added to the callbacks for Trainer.
            Defaults to None.

        datamodule (Optional[TabularDatamodule], optional): If provided, will be used as the datamodule
            for training. Defaults to None.

        freeze_backbone (bool, optional): If True, will freeze the backbone by turning off gradients.
            Defaults to False, which means the pretrained weights are also further tuned during fine-tuning.

    Returns:
        pl.Trainer: The trainer object
    """
    assert (
        self._is_finetune_model
    ), "finetune() can only be called on a finetune model created using `TabularModel.create_finetune_model()`"
    seed = seed if seed is not None else self.config.seed
    seed_everything(seed)
    if datamodule is None:
        target_transform = self._check_and_set_target_transform(target_transform)
        self.datamodule._set_target_transform(target_transform)
        if self.config.task == "classification":
            self.datamodule.label_encoder = LabelEncoder()
            self.datamodule.label_encoder.fit(train[self.config.target[0]])
        elif self.config.task == "regression":
            target_transforms = []
            if target_transform is not None:
                for col in self.config.target:
                    _target_transform = copy.deepcopy(self.datamodule.target_transform_template)
                    _target_transform.fit(train[col].values.reshape(-1, 1))
                    target_transforms.append(_target_transform)
            self.datamodule.target_transforms = target_transforms
        self.datamodule.train = self.datamodule._prepare_inference_data(train)
        if validation is not None:
            self.datamodule.validation = self.datamodule._prepare_inference_data(validation)
        else:
            self.datamodule.validation = None
        self.datamodule.train_sampler = train_sampler
        datamodule = self.datamodule
    else:
        if train is not None:
            warnings.warn(
                "train data is provided but datamodule is provided."
                " Ignoring the train data and using the datamodule"
            )
    if freeze_backbone:
        for param in self.model.backbone.parameters():
            param.requires_grad = False
    return self.train(
        self.model,
        datamodule,
        callbacks=callbacks,
        max_epochs=max_epochs,
        min_epochs=min_epochs,
    )

fit(train, validation=None, test=None, loss=None, metrics=None, metrics_prob_inputs=None, optimizer=None, optimizer_params={}, train_sampler=None, target_transform=None, max_epochs=None, min_epochs=None, seed=42, callbacks=None, datamodule=None)

The fit method which takes in the data and triggers the training.

PARAMETER DESCRIPTION
train

Training Dataframe

TYPE: pd.DataFrame

validation

If provided, will use this dataframe as the validation while training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.

TYPE: Optional[pd.DataFrame] DEFAULT: None

test

If provided, will be used as the hold-out data, on which you can check performance after the model is trained. Defaults to None. DEPRECATED. Will be removed in the next version.

TYPE: Optional[pd.DataFrame] DEFAULT: None

loss

Custom loss functions which are not in the standard PyTorch library

TYPE: Optional[torch.nn.Module] DEFAULT: None

metrics

Custom metric functions (Callable) which have the signature metric_fn(y_hat, y) and work on torch tensor inputs. y_hat is expected to be of shape (batch_size, num_classes) for classification and (batch_size, 1) for regression, and y is expected to be of shape (batch_size, 1)

TYPE: Optional[List[Callable]] DEFAULT: None

metrics_prob_inputs

This is a mandatory parameter for classification metrics. If the metric function requires probabilities as inputs, set this to True. The length of the list should be equal to the number of metrics. Defaults to None.

TYPE: Optional[List[bool]] DEFAULT: None

optimizer

Custom optimizers which are drop-in replacements for standard PyTorch optimizers. This should be the class and not an initialized object

TYPE: Optional[torch.optim.Optimizer] DEFAULT: None

optimizer_params

The parameters to initialize the custom optimizer.

TYPE: Optional[Dict] DEFAULT: {}

train_sampler

Custom PyTorch batch samplers which will be passed to the DataLoaders. Useful for dealing with imbalanced data and other custom batching strategies

TYPE: Optional[torch.utils.data.Sampler] DEFAULT: None

target_transform

If provided, applies the transform to the target before modelling and inverse the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func)

TYPE: Optional[Union[TransformerMixin, Tuple(Callable)]] DEFAULT: None

max_epochs

Overwrite maximum number of epochs to be run. Defaults to None.

TYPE: Optional[int] DEFAULT: None

min_epochs

Overwrite minimum number of epochs to be run. Defaults to None.

TYPE: Optional[int] DEFAULT: None

seed

Random seed for reproducibility. Defaults to 42.

TYPE: Optional[int] DEFAULT: 42

callbacks

List of callbacks to be used during training. Defaults to None.

TYPE: Optional[List[pl.Callback]] DEFAULT: None

datamodule

The datamodule. If provided, will ignore the rest of the parameters like train, test, etc. and use the datamodule. Defaults to None.

TYPE: Optional[TabularDatamodule] DEFAULT: None

RETURNS DESCRIPTION
pl.Trainer

pl.Trainer: The PyTorch Lightning Trainer instance
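
A minimal sketch (the dataframes are illustrative):

trainer = tabular_model.fit(train=train_df, validation=val_df, seed=42)
result = tabular_model.evaluate(test=test_df)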

Source code in src/pytorch_tabular/tabular_model.py
def fit(
    self,
    train: Optional[pd.DataFrame],
    validation: Optional[pd.DataFrame] = None,
    test: Optional[pd.DataFrame] = None,  # TODO: Deprecate test in next version
    loss: Optional[torch.nn.Module] = None,
    metrics: Optional[List[Callable]] = None,
    metrics_prob_inputs: Optional[List[bool]] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    optimizer_params: Dict = {},
    train_sampler: Optional[torch.utils.data.Sampler] = None,
    target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
    max_epochs: Optional[int] = None,
    min_epochs: Optional[int] = None,
    seed: Optional[int] = 42,
    callbacks: Optional[List[pl.Callback]] = None,
    datamodule: Optional[TabularDatamodule] = None,
) -> pl.Trainer:
    """The fit method which takes in the data and triggers the training.

    Args:
        train (pd.DataFrame): Training Dataframe

        validation (Optional[pd.DataFrame], optional):
            If provided, will use this dataframe as the validation while training.
            Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation.
            Defaults to None.

        test (Optional[pd.DataFrame], optional): If provided, will use as the hold-out data,
            which you'll be able to check performance after the model is trained. Defaults to None.
            DEPRECATED. Will be removed in the next version.

        loss (Optional[torch.nn.Module], optional): Custom Loss functions which are not in standard pytorch library

        metrics (Optional[List[Callable]], optional): Custom metric functions(Callable) which has the
            signature metric_fn(y_hat, y) and works on torch tensor inputs. y_hat is expected to be of shape
            (batch_size, num_classes) for classification and (batch_size, 1) for regression and y is expected to be
            of shape (batch_size, 1)

        metrics_prob_inputs (Optional[List[bool]], optional): This is a mandatory parameter for
            classification metrics. If the metric function requires probabilities as inputs, set this to True.
            The length of the list should be equal to the number of metrics. Defaults to None.

        optimizer (Optional[torch.optim.Optimizer], optional):
            Custom optimizers which are drop-in replacements for
            standard PyTorch optimizers. This should be the Class and not the initialized object

        optimizer_params (Optional[Dict], optional): The parameters to initialize the custom optimizer.

        train_sampler (Optional[torch.utils.data.Sampler], optional):
            Custom PyTorch batch samplers which will be passed
            to the DataLoaders. Useful for dealing with imbalanced data and other custom batching strategies

        target_transform (Optional[Union[TransformerMixin, Tuple(Callable)]], optional):
            If provided, applies the transform to the target before modelling and inverse the transform during
            prediction. The parameter can either be a sklearn Transformer
            which has an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func)

        max_epochs (Optional[int]): Overwrite maximum number of epochs to be run. Defaults to None.

        min_epochs (Optional[int]): Overwrite minimum number of epochs to be run. Defaults to None.

        seed: (int): Random seed for reproducibility. Defaults to 42.

        callbacks (Optional[List[pl.Callback]], optional):
            List of callbacks to be used during training. Defaults to None.

        datamodule (Optional[TabularDatamodule], optional): The datamodule.
            If provided, will ignore the rest of the parameters like train, test etc and use the datamodule.
            Defaults to None.

    Returns:
        pl.Trainer: The PyTorch Lightning Trainer instance
    """
    assert (
        self.config.task != "ssl"
    ), "`fit` is not valid for SSL task. Please use `pretrain` for semi-supervised learning"
    if metrics is not None:
        assert len(metrics) == len(
            metrics_prob_inputs
        ), "The length of `metrics` and `metrics_prob_inputs` should be equal"
    seed = seed if seed is not None else self.config.seed
    seed_everything(seed)
    if datamodule is None:
        datamodule = self.prepare_dataloader(train, validation, test, train_sampler, target_transform, seed)
    else:
        if train is not None:
            warnings.warn(
                "train data is provided but datamodule is provided."
                " Ignoring the train data and using the datamodule"
            )
        if test is not None:
            warnings.warn(
                "Providing test data in `fit` is deprecated and will be removed in next major release."
                " Plese use `evaluate` for evaluating on test data"
            )
    model = self.prepare_model(
        datamodule,
        loss,
        metrics,
        metrics_prob_inputs,
        optimizer,
        optimizer_params,
    )

    return self.train(model, datamodule, callbacks, max_epochs, min_epochs)

load_best_model()

Loads the best model after training is done.

Source code in src/pytorch_tabular/tabular_model.py
def load_best_model(self) -> None:
    """Loads the best model after training is done."""
    if self.trainer.checkpoint_callback is not None:
        logger.info("Loading the best model")
        ckpt_path = self.trainer.checkpoint_callback.best_model_path
        if ckpt_path != "":
            logger.debug(f"Model Checkpoint: {ckpt_path}")
            ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
            self.model.load_state_dict(ckpt["state_dict"])
        else:
            logger.warning("No best model available to load. Did you run it more than 1 epoch?...")
    else:
        logger.warning("No best model available to load. Checkpoint Callback needs to be enabled for this to work")

load_from_checkpoint(dir, map_location=None, strict=True) classmethod

(Deprecated: Use load_model instead) Loads a saved model from the directory.

PARAMETER DESCRIPTION
dir

The directory where the model was saved, along with the checkpoints

TYPE: str

map_location

If your checkpoint saved a GPU model and you now load on CPUs or a different number of GPUs, use this to map to the new setup. The behaviour is the same as in torch.load()

TYPE: Union[Dict[str, str], str, device, int, Callable, None] DEFAULT: None

strict

Whether to strictly enforce that the keys in checkpoint_path match the keys returned by this module's state dict. Default: True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
TabularModel

The saved TabularModel

TYPE: TabularModel

Source code in src/pytorch_tabular/tabular_model.py
@classmethod
def load_from_checkpoint(cls, dir: str, map_location=None, strict=True):
    """(Deprecated: Use `load_model` instead) Loads a saved model from the directory.

    Args:
        dir (str): The directory where the model was saved, along with the checkpoints
        map_location (Union[Dict[str, str], str, device, int, Callable, None]) : If your checkpoint
            saved a GPU model and you now load on CPUs or a different number of GPUs, use this to map
            to the new setup. The behaviour is the same as in torch.load()
        strict (bool) : Whether to strictly enforce that the keys in checkpoint_path match the keys
            returned by this module's state dict. Default: True.

    Returns:
        TabularModel (TabularModel): The saved TabularModel
    """

    warnings.warn(
        "`load_from_checkpoint` is deprecated. Use `load_model` instead.",
        DeprecationWarning,
    )
    return cls.load_model(dir, map_location, strict)

load_model(dir, map_location=None, strict=True) classmethod

Loads a saved model from the directory.

PARAMETER DESCRIPTION
dir

The directory where the model was saved, along with the checkpoints

TYPE: str

map_location

If your checkpoint saved a GPU model and you now load on CPUs or a different number of GPUs, use this to map to the new setup. The behaviour is the same as in torch.load()

TYPE: Union[Dict[str, str], str, device, int, Callable, None] DEFAULT: None

strict

Whether to strictly enforce that the keys in checkpoint_path match the keys returned by this module's state dict. Default: True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
TabularModel

The saved TabularModel

TYPE: TabularModel
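
For example, paired with save_model (the directory name is illustrative):

tabular_model.save_model("saved_models/basic_model")
loaded_model = TabularModel.load_model("saved_models/basic_model", map_location="cpu")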

Source code in src/pytorch_tabular/tabular_model.py
@classmethod
def load_model(cls, dir: str, map_location=None, strict=True):
    """Loads a saved model from the directory.

    Args:
        dir (str): The directory where the model was saved, along with the checkpoints
        map_location (Union[Dict[str, str], str, device, int, Callable, None]) : If your checkpoint
            saved a GPU model and you now load on CPUs or a different number of GPUs, use this to map
            to the new setup. The behaviour is the same as in torch.load()
        strict (bool) : Whether to strictly enforce that the keys in checkpoint_path match the keys
            returned by this module's state dict. Default: True.

    Returns:
        TabularModel (TabularModel): The saved TabularModel
    """
    config = OmegaConf.load(os.path.join(dir, "config.yml"))
    datamodule = joblib.load(os.path.join(dir, "datamodule.sav"))
    if (
        hasattr(config, "log_target")
        and (config.log_target is not None)
        and os.path.exists(os.path.join(dir, "exp_logger.sav"))
    ):
        logger = joblib.load(os.path.join(dir, "exp_logger.sav"))
    else:
        logger = None
    if os.path.exists(os.path.join(dir, "callbacks.sav")):
        callbacks = joblib.load(os.path.join(dir, "callbacks.sav"))
        # Excluding Gradient Accumulation Scheduler Callback as we are creating
        # a new one in trainer
        callbacks = [c for c in callbacks if not isinstance(c, GradientAccumulationScheduler)]
    else:
        callbacks = []
    if os.path.exists(os.path.join(dir, "custom_model_callable.sav")):
        model_callable = joblib.load(os.path.join(dir, "custom_model_callable.sav"))
        custom_model = True
    else:
        model_callable = getattr_nested(config._module_src, config._model_name)
        # model_callable = getattr(
        #     getattr(models, config._module_src), config._model_name
        # )
        custom_model = False
    inferred_config = datamodule.update_config(config)
    inferred_config = OmegaConf.structured(inferred_config)
    model_args = {
        "config": config,
        "inferred_config": inferred_config,
    }
    custom_params = joblib.load(os.path.join(dir, "custom_params.sav"))
    if custom_params.get("custom_loss") is not None:
        model_args["loss"] = "MSELoss"  # For compatibility. Not Used
    if custom_params.get("custom_metrics") is not None:
        model_args["metrics"] = ["mean_squared_error"]  # For compatibility. Not Used
        model_args["metrics_params"] = [{}]  # For compatibility. Not Used
        model_args["metrics_prob_inputs"] = [False]  # For compatibility. Not Used
    if custom_params.get("custom_optimizer") is not None:
        model_args["optimizer"] = "Adam"  # For compatibility. Not Used
    if custom_params.get("custom_optimizer_params") is not None:
        model_args["optimizer_params"] = {}  # For compatibility. Not Used

    # Initializing with default metrics, losses, and optimizers. Will revert once initialized
    model = model_callable.load_from_checkpoint(
        checkpoint_path=os.path.join(dir, "model.ckpt"),
        map_location=map_location,
        strict=strict,
        **model_args,
    )
    # Updating config with custom parameters for experiment tracking
    if custom_params.get("custom_loss") is not None:
        model.custom_loss = custom_params["custom_loss"]
    if custom_params.get("custom_metrics") is not None:
        model.custom_metrics = custom_params["custom_metrics"]
    if custom_params.get("custom_optimizer") is not None:
        model.custom_optimizer = custom_params["custom_optimizer"]
    if custom_params.get("custom_optimizer_params") is not None:
        model.custom_optimizer_params = custom_params["custom_optimizer_params"]
    model._setup_loss()
    model._setup_metrics()
    tabular_model = cls(config=config, model_callable=model_callable)
    tabular_model.model = model
    tabular_model.custom_model = custom_model
    tabular_model.datamodule = datamodule
    tabular_model.callbacks = callbacks
    tabular_model.trainer = tabular_model._prepare_trainer(callbacks=callbacks)
    tabular_model.trainer.model = model
    tabular_model.logger = logger
    return tabular_model

load_weights(path)

Loads the model weights from the specified path.

PARAMETER DESCRIPTION
path

The path to the file to load the model from

TYPE: str

Source code in src/pytorch_tabular/tabular_model.py
def load_weights(self, path: Union[str, Path]) -> None:
    """Loads the model weights in the specified directory.

    Args:
        path (str): The path to the file to load the model from
    """
    self._load_weights(self.model, path)

predict(test, quantiles=[0.25, 0.5, 0.75], n_samples=100, ret_logits=False, include_input_features=True, device=None)

Uses the trained model to predict on new data and return as a dataframe.

PARAMETER DESCRIPTION
test

The new dataframe with the features defined during training

TYPE: pd.DataFrame

quantiles

For probabilistic models like Mixture Density Networks, this specifies the different quantiles to be extracted apart from the central_tendency and added to the dataframe. For other models it is ignored. Defaults to [0.25, 0.5, 0.75]

TYPE: Optional[List] DEFAULT: [0.25, 0.5, 0.75]

n_samples

Number of samples to draw from the posterior to estimate the quantiles. Ignored for non-probabilistic models. Defaults to 100

TYPE: Optional[int] DEFAULT: 100

ret_logits

Flag to return raw model outputs/logits except the backbone features along with the dataframe. Defaults to False

TYPE: bool DEFAULT: False

include_input_features

Flag to include the input features in the returned dataframe. Defaults to True

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
pd.DataFrame

pd.DataFrame: Returns a dataframe with predictions and features (if include_input_features=True). If classification, it returns probabilities and final prediction
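
For example, assuming `test_df` contains the same feature columns used during training:

pred_df = tabular_model.predict(test_df, include_input_features=False)
# For probabilistic models (e.g. Mixture Density Networks), quantiles can also be requested:
pred_df = tabular_model.predict(test_df, quantiles=[0.1, 0.5, 0.9], n_samples=100)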

Source code in src/pytorch_tabular/tabular_model.py
def predict(
    self,
    test: pd.DataFrame,
    quantiles: Optional[List] = [0.25, 0.5, 0.75],
    n_samples: Optional[int] = 100,
    ret_logits=False,
    include_input_features: bool = True,
    device: Optional[torch.device] = None,
) -> pd.DataFrame:
    """Uses the trained model to predict on new data and return as a dataframe.

    Args:
        test (pd.DataFrame): The new dataframe with the features defined during training
        quantiles (Optional[List]): For probabilistic models like Mixture Density Networks, this specifies
            the different quantiles to be extracted apart from the `central_tendency` and added to the dataframe.
            For other models it is ignored. Defaults to [0.25, 0.5, 0.75]
        n_samples (Optional[int]): Number of samples to draw from the posterior to estimate the quantiles.
            Ignored for non-probabilistic models. Defaults to 100
        ret_logits (bool): Flag to return raw model outputs/logits except the backbone features along
            with the dataframe. Defaults to False
        include_input_features (bool): Flag to include the input features in the returned dataframe.
            Defaults to True

    Returns:
        pd.DataFrame: Returns a dataframe with predictions and features (if `include_input_features=True`).
            If classification, it returns probabilities and final prediction
    """
    warnings.warn(
        "Default for `include_input_features` will change from True to False in the next release."
        " Please set it explicitly.",
        DeprecationWarning,
    )
    assert all(q <= 1 and q >= 0 for q in quantiles), "Quantiles should be a decimal between 0 and 1"
    if device is not None:
        if isinstance(device, str):
            device = torch.device(device)
        if self.model.device != device:
            model = self.model.to(device)
        else:
            model = self.model
    else:
        model = self.model
    model.eval()
    inference_dataloader = self.datamodule.prepare_inference_dataloader(test)
    point_predictions = []
    quantile_predictions = []
    logits_predictions = defaultdict(list)
    is_probabilistic = hasattr(model.hparams, "_probabilistic") and model.hparams._probabilistic
    for batch in track(inference_dataloader, description="Generating Predictions..."):
        for k, v in batch.items():
            if isinstance(v, list) and (len(v) == 0):
                # Skipping empty list
                continue
            batch[k] = v.to(model.device)
        if is_probabilistic:
            samples, ret_value = model.sample(batch, n_samples, ret_model_output=True)
            y_hat = torch.mean(samples, dim=-1)
            quantile_preds = []
            for q in quantiles:
                quantile_preds.append(torch.quantile(samples, q=q, dim=-1).unsqueeze(1))
        else:
            y_hat, ret_value = model.predict(batch, ret_model_output=True)
        if ret_logits:
            for k, v in ret_value.items():
                # if k == "backbone_features":
                #     continue
                logits_predictions[k].append(v.detach().cpu())
        point_predictions.append(y_hat.detach().cpu())
        if is_probabilistic:
            quantile_predictions.append(torch.cat(quantile_preds, dim=-1).detach().cpu())
    point_predictions = torch.cat(point_predictions, dim=0)
    if point_predictions.ndim == 1:
        point_predictions = point_predictions.unsqueeze(-1)
    if is_probabilistic:
        quantile_predictions = torch.cat(quantile_predictions, dim=0).unsqueeze(-1)
        if quantile_predictions.ndim == 2:
            quantile_predictions = quantile_predictions.unsqueeze(-1)
    if include_input_features:
        pred_df = test.copy()
    else:
        pred_df = pd.DataFrame(index=test.index)
    if self.config.task == "regression":
        point_predictions = point_predictions.numpy()
        # Probabilistic Models are only implemented for Regression
        if is_probabilistic:
            quantile_predictions = quantile_predictions.numpy()
        for i, target_col in enumerate(self.config.target):
            if self.datamodule.do_target_transform:
                if self.config.target[i] in pred_df.columns:
                    pred_df[self.config.target[i]] = self.datamodule.target_transforms[i].inverse_transform(
                        pred_df[self.config.target[i]].values.reshape(-1, 1)
                    )
                pred_df[f"{target_col}_prediction"] = self.datamodule.target_transforms[i].inverse_transform(
                    point_predictions[:, i].reshape(-1, 1)
                )
                if is_probabilistic:
                    for j, q in enumerate(quantiles):
                        pred_df[f"{target_col}_q{int(q*100)}"] = self.datamodule.target_transforms[
                            i
                        ].inverse_transform(quantile_predictions[:, j, i].reshape(-1, 1))
            else:
                pred_df[f"{target_col}_prediction"] = point_predictions[:, i]
                if is_probabilistic:
                    for j, q in enumerate(quantiles):
                        pred_df[f"{target_col}_q{int(q*100)}"] = quantile_predictions[:, j, i].reshape(-1, 1)

    elif self.config.task == "classification":
        point_predictions = nn.Softmax(dim=-1)(point_predictions).numpy()
        for i, class_ in enumerate(self.datamodule.label_encoder.classes_):
            pred_df[f"{class_}_probability"] = point_predictions[:, i]
        pred_df["prediction"] = self.datamodule.label_encoder.inverse_transform(
            np.argmax(point_predictions, axis=1)
        )
    if ret_logits:
        for k, v in logits_predictions.items():
            v = torch.cat(v, dim=0).numpy()
            if v.ndim == 1:
                v = v.reshape(-1, 1)
            for i in range(v.shape[-1]):
                if v.shape[-1] > 1:
                    pred_df[f"{k}_{i}"] = v[:, i]
                else:
                    pred_df[f"{k}"] = v[:, i]
    return pred_df

prepare_dataloader(train, validation=None, test=None, train_sampler=None, target_transform=None, seed=42)

Prepares the dataloaders for training and validation.

PARAMETER DESCRIPTION
train

Training Dataframe

TYPE: pd.DataFrame

validation

If provided, will use this dataframe as the validation while training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.

TYPE: Optional[pd.DataFrame] DEFAULT: None

test

If provided, will be used as the hold-out data, on which you can check performance after the model is trained. Defaults to None.

TYPE: Optional[pd.DataFrame] DEFAULT: None

train_sampler

Custom PyTorch batch samplers which will be passed to the DataLoaders. Useful for dealing with imbalanced data and other custom batching strategies

TYPE: Optional[torch.utils.data.Sampler] DEFAULT: None

target_transform

If provided, applies the transform to the target before modelling and inverse the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func)

TYPE: Optional[Union[TransformerMixin, Tuple(Callable)]] DEFAULT: None

seed

Random seed for reproducibility. Defaults to 42.

TYPE: Optional[int] DEFAULT: 42

RETURNS DESCRIPTION
TabularDatamodule

The prepared datamodule

TYPE: TabularDatamodule
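
This is one of the lower-level building blocks used by fit; a sketch of calling it directly (the dataframes are illustrative):

datamodule = tabular_model.prepare_dataloader(train=train_df, validation=val_df, seed=42)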

Source code in src/pytorch_tabular/tabular_model.py
def prepare_dataloader(
    self,
    train: pd.DataFrame,
    validation: Optional[pd.DataFrame] = None,
    test: Optional[pd.DataFrame] = None,
    train_sampler: Optional[torch.utils.data.Sampler] = None,
    target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
    seed: Optional[int] = 42,
) -> TabularDatamodule:
    """Prepares the dataloaders for training and validation.

    Args:
        train (pd.DataFrame): Training Dataframe

        validation (Optional[pd.DataFrame], optional):
            If provided, will use this dataframe as the validation while training.
            Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation.
            Defaults to None.

        test (Optional[pd.DataFrame], optional): If provided, will use as the hold-out data,
            which you'll be able to check performance after the model is trained. Defaults to None.

        train_sampler (Optional[torch.utils.data.Sampler], optional):
            Custom PyTorch batch samplers which will be passed to the DataLoaders.
            Useful for dealing with imbalanced data and other custom batching strategies

        target_transform (Optional[Union[TransformerMixin, Tuple(Callable)]], optional):
            If provided, applies the transform to the target before modelling and inverses the transform during
            prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or
            a tuple of callables (transform_func, inverse_transform_func)

        seed (Optional[int], optional): Random seed for reproducibility. Defaults to 42.

    Returns:
        TabularDatamodule: The prepared datamodule
    """
    if test is not None:
        warnings.warn(
            "Providing test data in `fit` is deprecated and will be removed in next major release."
            " Plese use `evaluate` for evaluating on test data"
        )
    logger.info("Preparing the DataLoaders")
    target_transform = self._check_and_set_target_transform(target_transform)

    datamodule = TabularDatamodule(
        train=train,
        validation=validation,
        config=self.config,
        test=test,
        target_transform=target_transform,
        train_sampler=train_sampler,
        seed=seed,
    )
    datamodule.prepare_data()
    datamodule.setup("fit")
    return datamodule
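
A minimal sketch of this step of the low-level API. The names tabular_model (an already configured TabularModel), train_df and valid_df are illustrative assumptions:

# Illustrative names: tabular_model, train_df, valid_df
datamodule = tabular_model.prepare_dataloader(
    train=train_df,
    validation=valid_df,  # optional; 20% of train is used as validation if omitted
    seed=42,
)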

prepare_model(datamodule, loss=None, metrics=None, metrics_prob_inputs=None, optimizer=None, optimizer_params={})

Prepares the model for training.

PARAMETER DESCRIPTION
datamodule

The datamodule

TYPE: TabularDatamodule

loss

Custom loss function which is not in the standard PyTorch library

TYPE: Optional[torch.nn.Module] DEFAULT: None

metrics

Custom metric functions (Callable) which have the signature metric_fn(y_hat, y) and work on torch tensor inputs

TYPE: Optional[List[Callable]] DEFAULT: None

metrics_prob_inputs

This is a mandatory parameter for classification metrics. If the metric function requires probabilities as inputs, set the corresponding entry to True. The length of the list should be equal to the number of metrics. Defaults to None.

TYPE: Optional[List[bool]] DEFAULT: None

optimizer

Custom optimizer which is a drop-in replacement for standard PyTorch optimizers. This should be the class and not an initialized object

TYPE: Optional[torch.optim.Optimizer] DEFAULT: None

optimizer_params

The parameters to initialize the custom optimizer.

TYPE: Optional[Dict] DEFAULT: {}

RETURNS DESCRIPTION
BaseModel

The prepared model

TYPE: BaseModel

Source code in src/pytorch_tabular/tabular_model.py
def prepare_model(
    self,
    datamodule: TabularDatamodule,
    loss: Optional[torch.nn.Module] = None,
    metrics: Optional[List[Callable]] = None,
    metrics_prob_inputs: Optional[List[bool]] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    optimizer_params: Dict = {},
) -> BaseModel:
    """Prepares the model for training.

    Args:
        datamodule (TabularDatamodule): The datamodule

        loss (Optional[torch.nn.Module], optional): Custom loss function which is not in the standard PyTorch library

        metrics (Optional[List[Callable]], optional): Custom metric functions (Callable) which have the
            signature metric_fn(y_hat, y) and work on torch tensor inputs

        metrics_prob_inputs (Optional[List[bool]], optional): This is a mandatory parameter for
            classification metrics. If the metric function requires probabilities as inputs, set the corresponding entry to True.
            The length of the list should be equal to the number of metrics. Defaults to None.

        optimizer (Optional[torch.optim.Optimizer], optional):
            Custom optimizer which is a drop-in replacement for standard PyTorch optimizers.
            This should be the class and not an initialized object

        optimizer_params (Optional[Dict], optional): The parameters to initialize the custom optimizer.

    Returns:
        BaseModel: The prepared model
    """
    logger.info(f"Preparing the Model: {self.config._model_name}")
    # Fetching the config as some data specific configs have been added in the datamodule
    self.inferred_config = self._read_parse_config(datamodule.update_config(self.config), InferredConfig)
    model = self.model_callable(
        self.config,
        custom_loss=loss,  # Unused in SSL tasks
        custom_metrics=metrics,  # Unused in SSL tasks
        custom_metrics_prob_inputs=metrics_prob_inputs,  # Unused in SSL tasks
        custom_optimizer=optimizer,
        custom_optimizer_params=optimizer_params,
        inferred_config=self.inferred_config,
    )
    # Data Aware Initialization(for the models that need it)
    model.data_aware_initialization(datamodule)
    if self.model_state_dict_path is not None:
        self._load_weights(model, self.model_state_dict_path)
    if self.track_experiment and self.config.log_target == "wandb":
        self.logger.watch(model, log=self.config.exp_watch, log_freq=self.config.exp_log_freq)
    return model
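
Continuing the sketch above, the datamodule can then be turned into a model. The custom loss and optimizer shown here are illustrative choices, not defaults:

import torch

# `datamodule` as returned by prepare_dataloader (illustrative)
model = tabular_model.prepare_model(
    datamodule,
    loss=torch.nn.L1Loss(),        # optional custom loss
    optimizer=torch.optim.AdamW,   # pass the class, not an initialized object
    optimizer_params={"weight_decay": 1e-5},
)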

pretrain(train, validation=None, optimizer=None, optimizer_params={}, max_epochs=None, min_epochs=None, seed=42, callbacks=None, datamodule=None)

The pretrain method, which takes in the data and triggers the training.

PARAMETER DESCRIPTION
train

Training Dataframe

TYPE: pd.DataFrame

validation

If provided, will use this dataframe as the validation while training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation. Defaults to None.

TYPE: Optional[pd.DataFrame] DEFAULT: None

optimizer

Custom optimizer which is a drop-in replacement for standard PyTorch optimizers. This should be the class and not an initialized object

TYPE: Optional[torch.optim.Optimizer] DEFAULT: None

optimizer_params

The parameters to initialize the custom optimizer.

TYPE: Optional[Dict] DEFAULT: {}

max_epochs

Overwrite maximum number of epochs to be run. Defaults to None.

TYPE: Optional[int] DEFAULT: None

min_epochs

Overwrite minimum number of epochs to be run. Defaults to None.

TYPE: Optional[int] DEFAULT: None

seed

Random seed for reproducibility. Defaults to 42.

TYPE: Optional[int] DEFAULT: 42

callbacks

List of callbacks to be used during training. Defaults to None.

TYPE: Optional[List[pl.Callback]] DEFAULT: None

datamodule

The datamodule. If provided, will ignore the rest of the parameters like train, validation, etc. and use the datamodule. Defaults to None.

TYPE: Optional[TabularDatamodule] DEFAULT: None

RETURNS DESCRIPTION
pl.Trainer

pl.Trainer: The PyTorch Lightning Trainer instance

Source code in src/pytorch_tabular/tabular_model.py
def pretrain(
    self,
    train: Optional[pd.DataFrame],
    validation: Optional[pd.DataFrame] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    optimizer_params: Dict = {},
    # train_sampler: Optional[torch.utils.data.Sampler] = None,
    max_epochs: Optional[int] = None,
    min_epochs: Optional[int] = None,
    seed: Optional[int] = 42,
    callbacks: Optional[List[pl.Callback]] = None,
    datamodule: Optional[TabularDatamodule] = None,
) -> pl.Trainer:
    """The pretrained method which takes in the data and triggers the training.

    Args:
        train (pd.DataFrame): Training Dataframe

        validation (Optional[pd.DataFrame], optional): If provided, will use this dataframe as the validation while
            training. Used in Early Stopping and Logging. If left empty, will use 20% of Train data as validation.
            Defaults to None.

        optimizer (Optional[torch.optim.Optimizer], optional): Custom optimizer which is a drop-in replacement
            for standard PyTorch optimizers. This should be the class and not an initialized object

        optimizer_params (Optional[Dict], optional): The parameters to initialize the custom optimizer.

        max_epochs (Optional[int]): Overwrite maximum number of epochs to be run. Defaults to None.

        min_epochs (Optional[int]): Overwrite minimum number of epochs to be run. Defaults to None.

        seed (Optional[int], optional): Random seed for reproducibility. Defaults to 42.

        callbacks (Optional[List[pl.Callback]], optional): List of callbacks to be used during training.
            Defaults to None.

        datamodule (Optional[TabularDatamodule], optional): The datamodule. If provided, will ignore the rest of the
            parameters like train, validation, etc. and use the datamodule. Defaults to None.

    Returns:
        pl.Trainer: The PyTorch Lightning Trainer instance
    """
    assert (
        self.config.task == "ssl"
    ), f"`pretrain` is not valid for {self.config.task} task. Please use `fit` instead."
    seed = seed if seed is not None else self.config.seed
    seed_everything(seed)
    if datamodule is None:
        datamodule = self.prepare_dataloader(
            train,
            validation,
            test=None,
            train_sampler=None,
            target_transform=None,
            seed=seed,
        )
    else:
        if train is not None:
            warnings.warn(
                "train data is provided but datamodule is provided."
                " Ignoring the train data and using the datamodule"
            )
    model = self.prepare_model(
        datamodule,
        optimizer=optimizer,
        optimizer_params=optimizer_params,
    )

    return self.train(model, datamodule, callbacks, max_epochs, min_epochs)
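
A sketch of calling pretrain directly, assuming ssl_model is a TabularModel configured for the "ssl" task and unlabelled_df is an illustrative DataFrame:

trainer = ssl_model.pretrain(
    train=unlabelled_df,
    validation=None,  # 20% of the train data is held out as validation when None
    max_epochs=10,
    seed=42,
)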

save_config(dir)

Saves the config in the specified directory.

Source code in src/pytorch_tabular/tabular_model.py
def save_config(self, dir: str) -> None:
    """Saves the config in the specified directory."""
    with open(os.path.join(dir, "config.yml"), "w") as fp:
        OmegaConf.save(self.config, fp, resolve=True)

save_datamodule(dir)

Saves the datamodule in the specified directory.

PARAMETER DESCRIPTION
dir

The path to the directory to save the datamodule

TYPE: str

Source code in src/pytorch_tabular/tabular_model.py
def save_datamodule(self, dir: str) -> None:
    """Saves the datamodule in the specified directory.

    Args:
        dir (str): The path to the directory to save the datamodule
    """
    joblib.dump(self.datamodule, os.path.join(dir, "datamodule.sav"))

save_model(dir)

Saves the model and checkpoints in the specified directory.

PARAMETER DESCRIPTION
dir

The path to the directory to save the model

TYPE: str

Source code in src/pytorch_tabular/tabular_model.py
def save_model(self, dir: str) -> None:
    """Saves the model and checkpoints in the specified directory.

    Args:
        dir (str): The path to the directory to save the model
    """
    if os.path.exists(dir) and (os.listdir(dir)):
        logger.warning("Directory is not empty. Overwriting the contents.")
        for f in os.listdir(dir):
            os.remove(os.path.join(dir, f))
    os.makedirs(dir, exist_ok=True)
    self.save_config(dir)
    self.save_datamodule(dir)
    if hasattr(self.config, "log_target") and self.config.log_target is not None:
        joblib.dump(self.logger, os.path.join(dir, "exp_logger.sav"))
    if hasattr(self, "callbacks"):
        joblib.dump(self.callbacks, os.path.join(dir, "callbacks.sav"))
    self.trainer.save_checkpoint(os.path.join(dir, "model.ckpt"))
    custom_params = {}
    custom_params["custom_loss"] = self.model.custom_loss
    custom_params["custom_metrics"] = self.model.custom_metrics
    custom_params["custom_metrics_prob_inputs"] = self.model.custom_metrics_prob_inputs
    custom_params["custom_optimizer"] = self.model.custom_optimizer
    custom_params["custom_optimizer_params"] = self.model.custom_optimizer_params
    joblib.dump(custom_params, os.path.join(dir, "custom_params.sav"))
    if self.custom_model:
        joblib.dump(self.model_callable, os.path.join(dir, "custom_model_callable.sav"))
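
A sketch of saving a trained model; the directory name is illustrative:

tabular_model.save_model("saved_models/my_experiment")
# Writes config.yml, datamodule.sav, model.ckpt, custom_params.sav, etc. into the directory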

save_model_for_inference(path, kind='pytorch', onnx_export_params={'opset_version': 12})

Saves the model for inference.

PARAMETER DESCRIPTION
path

path to save the model

TYPE: Union[str, Path]

kind

"pytorch" or "onnx" (Experimental)

TYPE: str DEFAULT: 'pytorch'

onnx_export_params

parameters for onnx export to be passed to torch.onnx.export

TYPE: Dict DEFAULT: {'opset_version': 12}

RETURNS DESCRIPTION
bool

True if the model was saved successfully

TYPE: bool

Source code in src/pytorch_tabular/tabular_model.py
def save_model_for_inference(
    self,
    path: Union[str, Path],
    kind: str = "pytorch",
    onnx_export_params: Dict = {"opset_version": 12},
) -> bool:
    """Saves the model for inference.

    Args:
        path (Union[str, Path]): path to save the model
        kind (str): "pytorch" or "onnx" (Experimental)
        onnx_export_params (Dict): parameters for onnx export to be
            passed to torch.onnx.export

    Returns:
        bool: True if the model was saved successfully
    """
    if kind == "pytorch":
        torch.save(self.model, str(path))
        return True
    elif kind == "onnx":
        # Export the model
        onnx_export_params["input_names"] = ["categorical", "continuous"]
        onnx_export_params["output_names"] = onnx_export_params.get("output_names", ["output"])
        onnx_export_params["dynamic_axes"] = {
            onnx_export_params["input_names"][0]: {0: "batch_size"},
            onnx_export_params["output_names"][0]: {0: "batch_size"},
        }
        cat = torch.zeros(
            self.config.batch_size,
            len(self.config.categorical_cols),
            dtype=torch.int,
        )
        cont = torch.randn(
            self.config.batch_size,
            len(self.config.continuous_cols),
            requires_grad=True,
        )
        x = {"continuous": cont, "categorical": cat}
        torch.onnx.export(self.model, x, str(path), **onnx_export_params)
        return True
    else:
        raise ValueError("`kind` must be either pytorch or onnx")
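
Sketches of both export paths; the file names and the opset value are illustrative:

# Plain PyTorch serialization of the trained model
tabular_model.save_model_for_inference("model.pt", kind="pytorch")

# Experimental ONNX export; onnx_export_params are forwarded to torch.onnx.export
tabular_model.save_model_for_inference(
    "model.onnx",
    kind="onnx",
    onnx_export_params={"opset_version": 13},
)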

save_weights(path)

Saves the model weights at the specified path.

PARAMETER DESCRIPTION
path

The path to the file to save the model

TYPE: str

Source code in src/pytorch_tabular/tabular_model.py
def save_weights(self, path: Union[str, Path]) -> None:
    """Saves the model weights in the specified directory.

    Args:
        path (str): The path to the file to save the model
    """
    torch.save(self.model.state_dict(), path)

summary(max_depth=-1)

Prints a summary of the model.

PARAMETER DESCRIPTION
max_depth

The maximum depth to traverse the modules for display in the summary. Defaults to -1, which displays all the modules.

TYPE: int DEFAULT: -1

Source code in src/pytorch_tabular/tabular_model.py
def summary(self, max_depth: int = -1) -> None:
    """Prints a summary of the model.

    Args:
        max_depth (int): The maximum depth to traverse the modules for display in the summary.
            Defaults to -1, which displays all the modules.
    """
    print(summarize(self.model, max_depth=max_depth))

train(model, datamodule, callbacks=None, max_epochs=None, min_epochs=None)

Trains the model.

PARAMETER DESCRIPTION
model

The PyTorch Lightning model to be trained.

TYPE: pl.LightningModule

datamodule

The datamodule

TYPE: TabularDatamodule

callbacks

List of callbacks to be used during training. Defaults to None.

TYPE: Optional[List[pl.Callback]] DEFAULT: None

max_epochs

Overwrite maximum number of epochs to be run. Defaults to None.

TYPE: Optional[int] DEFAULT: None

min_epochs

Overwrite minimum number of epochs to be run. Defaults to None.

TYPE: Optional[int] DEFAULT: None

RETURNS DESCRIPTION
pl.Trainer

pl.Trainer: The PyTorch Lightning Trainer instance

Source code in src/pytorch_tabular/tabular_model.py
def train(
    self,
    model: pl.LightningModule,
    datamodule: TabularDatamodule,
    callbacks: Optional[List[pl.Callback]] = None,
    max_epochs: int = None,
    min_epochs: int = None,
) -> pl.Trainer:
    """Trains the model.

    Args:
        model (pl.LightningModule): The PyTorch Lightning model to be trained.

        datamodule (TabularDatamodule): The datamodule

        callbacks (Optional[List[pl.Callback]], optional):
            List of callbacks to be used during training. Defaults to None.

        max_epochs (Optional[int]): Overwrite maximum number of epochs to be run. Defaults to None.

        min_epochs (Optional[int]): Overwrite minimum number of epochs to be run. Defaults to None.

    Returns:
        pl.Trainer: The PyTorch Lightning Trainer instance
    """
    self._prepare_for_training(model, datamodule, callbacks, max_epochs, min_epochs)
    train_loader, val_loader = (
        self.datamodule.train_dataloader(),
        self.datamodule.val_dataloader(),
    )
    self.model.train()
    if self.config.auto_lr_find and (not self.config.fast_dev_run):
        logger.info("Auto LR Find Started")
        result = self.trainer.tune(self.model, train_loader, val_loader)
        logger.info(
            f"Suggested LR: {result['lr_find'].suggestion()}."
            f" For plot and detailed analysis, use `find_learning_rate` method."
        )
        # Parameters in models needs to be initialized again after LR find
        self.model.data_aware_initialization(self.datamodule)
    self.model.train()
    logger.info("Training Started")
    self.trainer.fit(self.model, train_loader, val_loader)
    logger.info("Training the model completed")
    if self.config.load_best:
        self.load_best_model()
    return self.trainer
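
Putting the low-level API together, a minimal end-to-end sketch (all names are illustrative):

datamodule = tabular_model.prepare_dataloader(train=train_df, validation=valid_df, seed=42)
model = tabular_model.prepare_model(datamodule)
trainer = tabular_model.train(model, datamodule)  # returns the pl.Trainer used for fitting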

pytorch_tabular.TabularDatamodule(train, config, validation=None, test=None, target_transform=None, train_sampler=None, seed=42)

Bases: pl.LightningDataModule

The Pytorch Lightning Datamodule for Tabular Data.

PARAMETER DESCRIPTION
train

The Training Dataframe

TYPE: pd.DataFrame

config

Merged configuration object from ModelConfig, DataConfig, TrainerConfig, OptimizerConfig & ExperimentConfig

TYPE: DictConfig

validation

Validation Dataframe. If left empty, we use the validation split from DataConfig to split a random sample as validation. Defaults to None.

TYPE: pd.DataFrame DEFAULT: None

test

Holdout DataFrame to check final performance on. Defaults to None.

TYPE: pd.DataFrame DEFAULT: None

target_transform

If provided, applies the transform to the target before modelling and inverses the transform during prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or a tuple of callables (transform_func, inverse_transform_func)

TYPE: Optional[Union[TransformerMixin, Tuple(Callable)]] DEFAULT: None

Source code in src/pytorch_tabular/tabular_datamodule.py
def __init__(
    self,
    train: pd.DataFrame,
    config: DictConfig,
    validation: pd.DataFrame = None,
    test: pd.DataFrame = None,
    target_transform: Optional[Union[TransformerMixin, Tuple]] = None,
    train_sampler: Optional[torch.utils.data.Sampler] = None,
    seed: Optional[int] = 42,
):
    """The Pytorch Lightning Datamodule for Tabular Data.

    Args:
        train (pd.DataFrame): The Training Dataframe

        config (DictConfig): Merged configuration object from ModelConfig, DataConfig,
            TrainerConfig, OptimizerConfig & ExperimentConfig

        validation (pd.DataFrame, optional): Validation Dataframe.
            If left empty, we use the validation split from DataConfig to split a random sample as validation.
            Defaults to None.

        test (pd.DataFrame, optional): Holdout DataFrame to check final performance on.
            Defaults to None.

        target_transform (Optional[Union[TransformerMixin, Tuple(Callable)]], optional):
            If provided, applies the transform to the target before modelling and inverses the transform during
            prediction. The parameter can either be a sklearn Transformer which has an inverse_transform method, or
            a tuple of callables (transform_func, inverse_transform_func)
    """
    super().__init__()
    self.train = train.copy()
    self.validation = validation
    self._set_target_transform(target_transform)
    self.test = test if test is None else test.copy()
    self.target = config.target
    self.batch_size = config.batch_size
    self.train_sampler = train_sampler
    self.config = config
    self.seed = seed
    self._fitted = False
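
A minimal sketch of constructing the datamodule directly (this mirrors what TabularModel.prepare_dataloader does internally); config is the merged DictConfig held by a TabularModel and the DataFrames are illustrative:

from pytorch_tabular import TabularDatamodule

dm = TabularDatamodule(train=train_df, config=config, validation=valid_df, seed=42)
dm.prepare_data()
dm.setup("fit")
train_loader = dm.train_dataloader()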

add_datepart(df, field_name, frequency, prefix=None, drop=True) classmethod

Helper function that adds columns relevant to a date in the column field_name of df.

PARAMETER DESCRIPTION
df

Dataframe

TYPE: pd.DataFrame

field_name

Date field name

TYPE: str

frequency

Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

TYPE: str

prefix

Prefix to add to the new columns. Defaults to None.

TYPE: str DEFAULT: None

drop

Drop the original column. Defaults to True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
Tuple[pd.DataFrame, List[str]]

Dataframe with added columns and list of added columns

Source code in src/pytorch_tabular/tabular_datamodule.py
@classmethod
def add_datepart(
    cls,
    df: pd.DataFrame,
    field_name: str,
    frequency: str,
    prefix: str = None,
    drop: bool = True,
) -> Tuple[pd.DataFrame, List[str]]:
    """Helper function that adds columns relevant to a date in the column `field_name` of `df`.

    Args:
        df (pd.DataFrame): Dataframe

        field_name (str): Date field name

        frequency (str): Frequency string of the form `[multiple][granularity]` such as "12H", "5min", "1D" etc.

        prefix (str, optional): Prefix to add to the new columns. Defaults to None.

        drop (bool, optional): Drop the original column. Defaults to True.

    Returns:
        Dataframe with added columns and list of added columns
    """
    field = df[field_name]
    prefix = (re.sub("[Dd]ate$", "", field_name) if prefix is None else prefix) + "_"
    attr = cls.time_features_from_frequency_str(frequency)
    added_features = []
    for n in attr:
        if n == "Week":
            continue
        df[prefix + n] = getattr(field.dt, n.lower())
        added_features.append(prefix + n)
    # Pandas removed `dt.week` in v1.1.10
    if "Week" in attr:
        week = field.dt.isocalendar().week if hasattr(field.dt, "isocalendar") else field.dt.week
        df.insert(3, prefix + "Week", week)
        added_features.append(prefix + "Week")
    # TODO Not adding Elapsed by default. Need to route it through config
    # mask = ~field.isna()
    # df[prefix + "Elapsed"] = np.where(
    #     mask, field.values.astype(np.int64) // 10 ** 9, None
    # )
    # added_features.append(prefix + "Elapsed")
    if drop:
        df.drop(field_name, axis=1, inplace=True)

    # Removing features with zero variation
    # for col in added_features:
    #     if len(df[col].unique()) == 1:
    #         df.drop(columns=col, inplace=True)
    #         added_features.remove(col)
    return df, added_features
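
An illustrative example on a toy DataFrame:

import pandas as pd

from pytorch_tabular import TabularDatamodule

df = pd.DataFrame({"date": pd.date_range("2021-01-01", periods=5, freq="D"), "y": range(5)})
df, added = TabularDatamodule.add_datepart(df, "date", frequency="D", prefix="date", drop=True)
print(added)  # e.g. ['date_Month', 'date_Quarter', 'date_Is_quarter_end', ...]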

do_leave_one_out_encoder()

Checks the special condition for NODE where we use a LeaveOneOutEncoder to encode categorical columns. DEPRECATED: Automatically encoding categorical columns using LeaveOneOutEncoder is deprecated.

RETURNS DESCRIPTION
bool

True if LeaveOneOutEncoder is used

TYPE: bool

Source code in src/pytorch_tabular/tabular_datamodule.py
def do_leave_one_out_encoder(self) -> bool:
    """Checks the special condition for NODE where we use a LeaveOneOutEncoder to encode categorical columns
    DEPRECATED: Automatically encoding categorical columns using LeaveOneOutEncoder is deprecated.

    Returns:
        bool: True if LeaveOneOutEncoder is used
    """
    if hasattr(self.config, "_model_name"):
        return (self.config._model_name == "NODEModel") and (not self.config.embed_categorical)
    else:
        return False

load_datamodule(path) classmethod

Loads a datamodule from a path.

PARAMETER DESCRIPTION
path

Path to the datamodule

TYPE: Union[str, Path]

RETURNS DESCRIPTION
TabularDatamodule

The datamodule loaded from the path

TYPE: TabularDatamodule

Source code in src/pytorch_tabular/tabular_datamodule.py
@classmethod
def load_datamodule(cls, path: Union[str, Path]):
    """Loads a datamodule from a path.

    Args:
        path (Union[str, Path]): Path to the datamodule

    Returns:
        TabularDatamodule (TabularDatamodule): The datamodule loaded from the path
    """
    if isinstance(path, str):
        path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"{path} does not exist.")
    datamodule = joblib.load(path)
    return datamodule
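
A sketch of reloading a datamodule that was saved earlier (for example, by TabularModel.save_model, which writes datamodule.sav); the path is illustrative:

from pytorch_tabular import TabularDatamodule

datamodule = TabularDatamodule.load_datamodule("saved_models/my_experiment/datamodule.sav")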

make_date(df, date_field) classmethod

Make sure df[date_field] is of the right date type.

PARAMETER DESCRIPTION
df

Dataframe

TYPE: pd.DataFrame

date_field

Date field name

TYPE: str

RETURNS DESCRIPTION
pd.DataFrame

Dataframe with date field converted to datetime

Source code in src/pytorch_tabular/tabular_datamodule.py
@classmethod
def make_date(cls, df: pd.DataFrame, date_field: str) -> pd.DataFrame:
    """Make sure `df[date_field]` is of the right date type.

    Args:
        df (pd.DataFrame): Dataframe

        date_field (str): Date field name

    Returns:
        Dataframe with date field converted to datetime
    """
    field_dtype = df[date_field].dtype
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
    return df

prepare_inference_dataloader(df, batch_size=None)

Function that prepares and loads the new data.

PARAMETER DESCRIPTION
df

Dataframe with the features and target

TYPE: pd.DataFrame

batch_size

Batch size. Defaults to self.batch_size.

TYPE: Optional[int] DEFAULT: None

RETURNS DESCRIPTION
DataLoader

The dataloader for the passed in dataframe

TYPE: DataLoader

Source code in src/pytorch_tabular/tabular_datamodule.py
def prepare_inference_dataloader(self, df: pd.DataFrame, batch_size: Optional[int] = None) -> DataLoader:
    """Function that prepares and loads the new data.

    Args:
        df (pd.DataFrame): Dataframe with the features and target
        batch_size (Optional[int], optional): Batch size. Defaults to `self.batch_size`.

    Returns:
        DataLoader: The dataloader for the passed in dataframe
    """
    df = df.copy()
    df = self._prepare_inference_data(df)
    dataset = TabularDataset(
        task=self.config.task,
        data=df,
        categorical_cols=self.config.categorical_cols,
        continuous_cols=self.config.continuous_cols,
        embed_categorical=(not self.do_leave_one_out_encoder()),
        target=self.target if all(col in df.columns for col in self.target) else None,
    )
    return DataLoader(
        dataset,
        batch_size if batch_size is not None else self.batch_size,
        shuffle=False,
        num_workers=self.config.num_workers,
    )
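
A sketch of preparing new data for scoring through a fitted datamodule; new_df is an illustrative DataFrame with the same feature columns as the training data:

loader = datamodule.prepare_inference_dataloader(new_df, batch_size=256)
for batch in loader:
    ...  # each batch is a dict of tensors holding the categorical and continuous features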

preprocess_data(data, stage='inference')

The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe should undergo before feeding into the dataloader.

PARAMETER DESCRIPTION
data

A dataframe with the features and target

TYPE: pd.DataFrame

stage

Internal parameter. Used to distinguish between fit and inference. Defaults to "inference".

TYPE: str DEFAULT: 'inference'

RETURNS DESCRIPTION
Tuple[pd.DataFrame, list]

Returns the processed dataframe and the added features (list) as a tuple

Source code in src/pytorch_tabular/tabular_datamodule.py
def preprocess_data(self, data: pd.DataFrame, stage: str = "inference") -> Tuple[pd.DataFrame, list]:
    """The preprocessing, like Categorical Encoding, Normalization, etc. which any dataframe should undergo before
    feeding into the dataloader.

    Args:
        data (pd.DataFrame): A dataframe with the features and target
        stage (str, optional): Internal parameter. Used to distinguish between fit and inference.
            Defaults to "inference".

    Returns:
        Returns the processed dataframe and the added features (list) as a tuple
    """
    added_features = None
    if self.config.encode_date_columns:
        data, added_features = self._encode_date_columns(data)
    # The only features that are added are the date features extracted
    # from the date which are categorical in nature
    if (added_features is not None) and (stage == "fit"):
        logger.debug(f"Added {added_features} features after encoding the date_columns")
        self.config.categorical_cols += added_features
        # Update the categorical dimension in config
        self.config.categorical_dim = (
            len(self.config.categorical_cols) if self.config.categorical_cols is not None else 0
        )
    # Encoding Categorical Columns
    if len(self.config.categorical_cols) > 0:
        data = self._encode_categorical_columns(data, stage)

    # Transforming Continuous Columns
    if (self.config.continuous_feature_transform is not None) and (len(self.config.continuous_cols) > 0):
        data = self._transform_continuous_columns(data, stage)
    # Normalizing Continuous Columns
    if (self.config.normalize_continuous_features) and (len(self.config.continuous_cols) > 0):
        data = self._normalize_continuous_columns(data, stage)
    # Converting target labels to a 0 indexed label
    data = self._label_encode_target(data, stage)
    # Target Transforms
    data = self._target_transform(data, stage)
    return data, added_features

save_dataloader(path)

Saves the dataloader to a path.

PARAMETER DESCRIPTION
path

Path to save the dataloader

TYPE: Union[str, Path]

Source code in src/pytorch_tabular/tabular_datamodule.py
def save_dataloader(self, path: Union[str, Path]) -> None:
    """Saves the dataloader to a path.

    Args:
        path (Union[str, Path]): Path to save the dataloader
    """
    if isinstance(path, str):
        path = Path(path)
    joblib.dump(self, path)

setup(stage=None)

Data Operations you want to perform on all GPUs, like train-test split, transformations, etc. This is called before accessing the dataloaders.

PARAMETER DESCRIPTION
stage

Internal parameter to distinguish between fit and inference. Defaults to None.

TYPE: Optional[str] DEFAULT: None

Source code in src/pytorch_tabular/tabular_datamodule.py
def setup(self, stage: Optional[str] = None) -> None:
    """Data Operations you want to perform on all GPUs, like train-test split, transformations, etc. This is called
    before accessing the dataloaders.

    Args:
        stage (Optional[str], optional):
            Internal parameter to distinguish between fit and inference. Defaults to None.
    """
    if stage == "fit" or stage is None:
        logger.info(f"Setting up the datamodule for {self.config.task} task")
        if self.validation is None:
            logger.debug(
                f"No validation data provided."
                f" Using {self.config.validation_split*100}% of train data as validation"
            )
            val_idx = self.train.sample(
                int(self.config.validation_split * len(self.train)),
                random_state=self.seed,
            ).index
            self.validation = self.train[self.train.index.isin(val_idx)]
            self.train = self.train[~self.train.index.isin(val_idx)]
        else:
            self.validation = self.validation.copy()
        # Preprocessing Train, Validation
        self.train, _ = self.preprocess_data(self.train, stage="fit")
        self.validation, _ = self.preprocess_data(self.validation, stage="inference")
        if self.test is not None:
            self.test, _ = self.preprocess_data(self.test, stage="inference")
        self._fitted = True

test_dataloader(batch_size=None)

Function that loads the test set.

PARAMETER DESCRIPTION
batch_size

Batch size. Defaults to self.batch_size.

TYPE: Optional[int] DEFAULT: None

RETURNS DESCRIPTION
DataLoader

Test dataloader

TYPE: DataLoader

Source code in src/pytorch_tabular/tabular_datamodule.py
def test_dataloader(self, batch_size: Optional[int] = None) -> DataLoader:
    """Function that loads the validation set.

    Args:
        batch_size (Optional[int], optional): Batch size. Defaults to `self.batch_size`.

    Returns:
        DataLoader: Test dataloader
    """
    if self.test is not None:
        dataset = TabularDataset(
            task=self.config.task,
            data=self.test,
            categorical_cols=self.config.categorical_cols,
            continuous_cols=self.config.continuous_cols,
            embed_categorical=(not self.do_leave_one_out_encoder()),
            target=self.target,
        )
        return DataLoader(
            dataset,
            batch_size if batch_size is not None else self.batch_size,
            shuffle=False,
            num_workers=self.config.num_workers,
            pin_memory=self.config.pin_memory,
        )

time_features_from_frequency_str(freq_str) classmethod

Returns a list of time features that will be appropriate for the given frequency string.

PARAMETER DESCRIPTION
freq_str

Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc.

TYPE: str

RETURNS DESCRIPTION
List[str]

List of added features

Source code in src/pytorch_tabular/tabular_datamodule.py
@classmethod
def time_features_from_frequency_str(cls, freq_str: str) -> List[str]:
    """Returns a list of time features that will be appropriate for the given frequency string.

    Args:
        freq_str (str): Frequency string of the form `[multiple][granularity]` such as "12H", "5min", "1D" etc.

    Returns:
        List of added features
    """

    features_by_offsets = {
        offsets.YearBegin: [],
        offsets.YearEnd: [],
        offsets.MonthBegin: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
        ],
        offsets.MonthEnd: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
        ],
        offsets.Week: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week",
        ],
        offsets.Day: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
        ],
        offsets.BusinessDay: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
        ],
        offsets.Hour: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
            "Hour",
        ],
        offsets.Minute: [
            "Month",
            "Quarter",
            "Is_quarter_end",
            "Is_quarter_start",
            "Is_year_end",
            "Is_year_start",
            "Is_month_start",
            "Week" "Day",
            "Dayofweek",
            "Dayofyear",
            "Hour",
            "Minute",
        ],
    }

    offset = to_offset(freq_str)

    for offset_type, feature in features_by_offsets.items():
        if isinstance(offset, offset_type):
            return feature

    supported_freq_msg = f"""
    Unsupported frequency {freq_str}

    The following frequencies are supported:

        Y, YS   - yearly
            alias: A
        M, MS   - monthly
        W   - weekly
        D   - daily
        B   - business days
        H   - hourly
        T   - minutely
            alias: min
    """
    raise RuntimeError(supported_freq_msg)
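
An illustrative call:

from pytorch_tabular import TabularDatamodule

TabularDatamodule.time_features_from_frequency_str("D")
# ['Month', 'Quarter', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end',
#  'Is_year_start', 'Is_month_start', 'Week', 'Day', 'Dayofweek', 'Dayofyear']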

train_dataloader(batch_size=None)

Function that loads the train set.

PARAMETER DESCRIPTION
batch_size

Batch size. Defaults to self.batch_size.

TYPE: Optional[int] DEFAULT: None

RETURNS DESCRIPTION
DataLoader

Train dataloader

TYPE: DataLoader

Source code in src/pytorch_tabular/tabular_datamodule.py
def train_dataloader(self, batch_size: Optional[int] = None) -> DataLoader:
    """Function that loads the train set.

    Args:
        batch_size (Optional[int], optional): Batch size. Defaults to `self.batch_size`.

    Returns:
        DataLoader: Train dataloader
    """
    dataset = TabularDataset(
        task=self.config.task,
        data=self.train,
        categorical_cols=self.config.categorical_cols,
        continuous_cols=self.config.continuous_cols,
        embed_categorical=(not self.do_leave_one_out_encoder()),
        target=self.target,
    )
    return DataLoader(
        dataset,
        batch_size if batch_size is not None else self.batch_size,
        shuffle=True if self.train_sampler is None else False,
        num_workers=self.config.num_workers,
        sampler=self.train_sampler,
        pin_memory=self.config.pin_memory,
    )

update_config(config)

Calculates and adds a few key pieces of information to the config object.

PARAMETER DESCRIPTION
config

The config object

TYPE: DictConfig

RETURNS DESCRIPTION
InferredConfig

The updated config object

TYPE: InferredConfig

Source code in src/pytorch_tabular/tabular_datamodule.py
def update_config(self, config) -> InferredConfig:
    """Calculates and updates a few key information to the config object.

    Args:
        config (DictConfig): The config object

    Returns:
        InferredConfig: The updated config object
    """
    categorical_dim = len(config.categorical_cols)
    continuous_dim = len(config.continuous_cols)
    if config.task == "regression":
        output_dim = len(config.target)
    elif config.task == "classification":
        output_dim = len(self.train[config.target[0]].unique())
    else:
        output_dim = None
    categorical_cardinality = None
    embedding_dims = None
    if not self.do_leave_one_out_encoder():
        categorical_cardinality = [
            int(self.train[col].fillna("NA").nunique()) + 1 for col in config.categorical_cols
        ]
        embedding_dims = [(x, min(50, (x + 1) // 2)) for x in categorical_cardinality]
        if hasattr(config, "embedding_dims"):
            if config.embedding_dims is not None:
                embedding_dims = config.embedding_dims
    return InferredConfig(
        categorical_dim=categorical_dim,
        continuous_dim=continuous_dim,
        output_dim=output_dim,
        categorical_cardinality=categorical_cardinality,
        embedding_dims=embedding_dims,
    )

val_dataloader(batch_size=None)

Function that loads the validation set.

PARAMETER DESCRIPTION
batch_size

Batch size. Defaults to self.batch_size.

TYPE: Optional[int] DEFAULT: None

RETURNS DESCRIPTION
DataLoader

Validation dataloader

TYPE: DataLoader

Source code in src/pytorch_tabular/tabular_datamodule.py
def val_dataloader(self, batch_size: Optional[int] = None) -> DataLoader:
    """Function that loads the validation set.

    Args:
        batch_size (Optional[int], optional): Batch size. Defaults to `self.batch_size`.

    Returns:
        DataLoader: Validation dataloader
    """
    dataset = TabularDataset(
        task=self.config.task,
        data=self.validation,
        categorical_cols=self.config.categorical_cols,
        continuous_cols=self.config.continuous_cols,
        embed_categorical=(not self.do_leave_one_out_encoder()),
        target=self.target,
    )
    return DataLoader(
        dataset,
        batch_size if batch_size is not None else self.batch_size,
        shuffle=False,
        num_workers=self.config.num_workers,
        pin_memory=self.config.pin_memory,
    )