Utilities

Special Feature Classes

pytorch_tabular.CategoricalEmbeddingTransformer(tabular_model)

Bases: BaseEstimator, TransformerMixin

Initializes the Transformer and extracts the neural embeddings.

PARAMETER DESCRIPTION
tabular_model

The trained TabularModel object

TYPE: TabularModel

Source code in src/pytorch_tabular/categorical_encoders.py
def __init__(self, tabular_model):
    """Initializes the Transformer and extracts the neural embeddings.

    Args:
        tabular_model (TabularModel): The trained TabularModel object
    """
    self._categorical_encoder = tabular_model.datamodule.categorical_encoder
    self.cols = tabular_model.model.hparams.categorical_cols
    # dict {str: np.ndarray} column name --> mapping from category (index of df) to value (column of df)
    self._mapping = {}

    self._extract_embedding(tabular_model.model)
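
A minimal usage sketch (assumes tabular_model is an already trained TabularModel and train is a dataframe containing the model's categorical columns; both names are illustrative):

from pytorch_tabular import CategoricalEmbeddingTransformer

transformer = CategoricalEmbeddingTransformer(tabular_model)
# Each categorical column `col` is replaced by columns
# `col_embed_dim_0` ... `col_embed_dim_{d-1}`, one per embedding dimension
train_encoded = transformer.fit_transform(train)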

fit(X, y=None)

Present only for compatibility with the scikit-learn API.

Does nothing and returns self.

Source code in src/pytorch_tabular/categorical_encoders.py
def fit(self, X, y=None):
    """Just for compatibility.

    Does not do anything
    """
    return self

fit_transform(X, y=None)

Encode given columns of X based on the learned embedding.

PARAMETER DESCRIPTION
X

DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.

TYPE: pd.DataFrame

y

Only for compatibility. Not used. Defaults to None.

TYPE: [type] DEFAULT: None

RETURNS DESCRIPTION
pd.DataFrame

pd.DataFrame: The encoded dataframe

Source code in src/pytorch_tabular/categorical_encoders.py
def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Encode given columns of X based on the learned embedding.

    Args:
        X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Returns:
        pd.DataFrame: The encoded dataframe
    """
    self.fit(X, y)
    return self.transform(X)

transform(X, y=None)

Transforms the specified categorical columns into the neural embeddings learned by the model.

PARAMETER DESCRIPTION
X

DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.

TYPE: pd.DataFrame

y

Only for compatibility. Not used. Defaults to None.

TYPE: [type] DEFAULT: None

RAISES DESCRIPTION
ValueError

If no embeddings were extracted from the model, i.e. the learned category-to-embedding mapping is empty.

RETURNS DESCRIPTION
pd.DataFrame

pd.DataFrame: The encoded dataframe

Source code in src/pytorch_tabular/categorical_encoders.py
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Transforms the categorical columns specified to the trained neural embedding from the model.

    Args:
        X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Raises:
        ValueError: If no embeddings were extracted from the model (the learned mapping is empty).

    Returns:
        pd.DataFrame: The encoded dataframe
    """
    if not self._mapping:
        raise ValueError(
            "Passed model should either have an attribute `embeddng_layers`"
            " or a method `extract_embedding` defined for `transform`."
        )
    assert all(c in X.columns for c in self.cols)

    X_encoded = X.copy(deep=True)
    for col, mapping in track(
        self._mapping.items(),
        description="Encoding the data...",
        total=len(self._mapping.values()),
    ):
        for dim in range(mapping[self.NAN_CATEGORY].shape[0]):
            X_encoded.loc[:, f"{col}_embed_dim_{dim}"] = (
                X_encoded[col].fillna(self.NAN_CATEGORY).map({k: v[dim] for k, v in mapping.items()})
            )
            # Filling unseen categories also with NAN Embedding
            X_encoded[f"{col}_embed_dim_{dim}"].fillna(mapping[self.NAN_CATEGORY][dim], inplace=True)
    X_encoded.drop(columns=self.cols, inplace=True)
    return X_encoded
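
Because the transformer follows the scikit-learn estimator API, it can also sit inside a Pipeline. A hedged sketch (the downstream RandomForestClassifier and the "target" column are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("embed", CategoricalEmbeddingTransformer(tabular_model)),
    ("clf", RandomForestClassifier()),
])
pipe.fit(train.drop(columns=["target"]), train["target"])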

pytorch_tabular.DeepFeatureExtractor(tabular_model, extract_keys=['backbone_features'], drop_original=True)

Bases: BaseEstimator, TransformerMixin

Initializes the Transformer and extracts the neural features.

PARAMETER DESCRIPTION
tabular_model

The trained TabularModel object

TYPE: TabularModel

Source code in src/pytorch_tabular/feature_extractor.py
def __init__(self, tabular_model, extract_keys=["backbone_features"], drop_original=True):
    """Initializes the Transformer and extracts the neural features.

    Args:
        tabular_model (TabularModel): The trained TabularModel object
    """
    assert not (
        isinstance(tabular_model.model, NODEModel)
        or isinstance(tabular_model.model, TabNetModel)
        or isinstance(tabular_model.model, MDNModel)
    ), "FeatureExtractor doesn't work for Mixture Density Networks, NODE Model, & Tabnet Model"
    self.tabular_model = tabular_model
    self.extract_keys = extract_keys
    self.drop_original = drop_original

fit(X, y=None)

Present only for compatibility with the scikit-learn API.

Does nothing and returns self.

Source code in src/pytorch_tabular/feature_extractor.py
def fit(self, X, y=None):
    """Just for compatibility.

    Does not do anything
    """
    return self

fit_transform(X, y=None)

Encode given columns of X based on the learned features.

PARAMETER DESCRIPTION
X

DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.

TYPE: pd.DataFrame

y

Only for compatibility. Not used. Defaults to None.

TYPE: [type] DEFAULT: None

RETURNS DESCRIPTION
pd.DataFrame

pd.DataFrame: The encoded dataframe

Source code in src/pytorch_tabular/feature_extractor.py
def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Encode given columns of X based on the learned features.

    Args:
        X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Returns:
        pd.DataFrame: The encoded dataframe
    """
    self.fit(X, y)
    return self.transform(X)

transform(X, y=None)

Transforms the input dataframe by appending the deep features learned by the model.

PARAMETER DESCRIPTION
X

DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.

TYPE: pd.DataFrame

y

Only for compatibility. Not used. Defaults to None.

TYPE: [type] DEFAULT: None

RETURNS DESCRIPTION
pd.DataFrame

pd.DataFrame: The encoded dataframe

Source code in src/pytorch_tabular/feature_extractor.py
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Transforms the categorical columns specified to the trained neural features from the model.

    Args:
        X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Returns:
        pd.DataFrame: The encoded dataframe
    """

    X_encoded = X.copy(deep=True)
    orig_features = X_encoded.columns
    self.tabular_model.model.eval()
    inference_dataloader = self.tabular_model.datamodule.prepare_inference_dataloader(X_encoded)
    logits_predictions = defaultdict(list)
    for batch in track(inference_dataloader, description="Generating Features..."):
        for k, v in batch.items():
            if isinstance(v, list) and (len(v) == 0):
                # Skipping empty list
                continue
            batch[k] = v.to(self.tabular_model.model.device)
        if self.tabular_model.config.task == "ssl":
            ret_value = {"backbone_features": self.tabular_model.model.predict(batch, ret_model_output=True)}
        else:
            _, ret_value = self.tabular_model.model.predict(batch, ret_model_output=True)
        for k in self.extract_keys:
            if k in ret_value.keys():
                logits_predictions[k].append(ret_value[k].detach().cpu())

    for k, v in logits_predictions.items():
        v = torch.cat(v, dim=0).numpy()
        if v.ndim == 1:
            v = v.reshape(-1, 1)
        for i in range(v.shape[-1]):
            if v.shape[-1] > 1:
                X_encoded[f"{k}_{i}"] = v[:, i]
            else:
                X_encoded[f"{k}"] = v[:, i]

    if self.drop_original:
        X_encoded.drop(columns=orig_features, inplace=True)
    return X_encoded
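
A usage sketch (assumes a trained tabular_model that is not a NODE, TabNet, or MDN model, and an inference dataframe named test):

from pytorch_tabular import DeepFeatureExtractor

extractor = DeepFeatureExtractor(tabular_model, drop_original=False)
# Appends one backbone_features_<i> column per extracted feature dimension
test_enriched = extractor.fit_transform(test)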

Data Utilities

pytorch_tabular.utils.get_balanced_sampler(y_train)

Source code in src/pytorch_tabular/utils/data_utils.py
def get_balanced_sampler(y_train):
    assert y_train.ndim == 1, "Utility function only works for binary classification"
    y_train = LabelEncoder().fit_transform(y_train)
    class_sample_counts = np.bincount(y_train)
    # compute a weight for each sample in the dataset
    # train_samples_weight holds the probability of each example being sampled
    class_weights = 1.0 / torch.Tensor(class_sample_counts)
    train_samples_weight = [class_weights[class_id] for class_id in y_train]
    # now let's initialize the sampler
    train_sampler = torch.utils.data.sampler.WeightedRandomSampler(train_samples_weight, len(y_train))
    return train_sampler
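
A usage sketch (assumes a dataframe train with a "target" column; passing the sampler through TabularModel.fit's train_sampler argument is assumed to be supported by your version of the library):

from pytorch_tabular.utils import get_balanced_sampler

sampler = get_balanced_sampler(train["target"].values)
tabular_model.fit(train=train, train_sampler=sampler)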

pytorch_tabular.utils.get_class_weighted_cross_entropy(y_train, mu=1.0)

Source code in src/pytorch_tabular/utils/data_utils.py
def get_class_weighted_cross_entropy(y_train, mu=1.0):
    assert y_train.ndim == 1, "Utility function only works for binary classification"
    y_train = LabelEncoder().fit_transform(y_train)
    weights = _make_smooth_weights_for_balanced_classes(y_train, mu=mu)
    criterion = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(weights))
    return criterion
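
A usage sketch (handing the weighted loss to TabularModel.fit via its loss argument; names are illustrative):

from pytorch_tabular.utils import get_class_weighted_cross_entropy

weighted_loss = get_class_weighted_cross_entropy(train["target"].values, mu=0.1)
tabular_model.fit(train=train, loss=weighted_loss)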

pytorch_tabular.utils.get_gaussian_centers(y, n_components)

Source code in src/pytorch_tabular/utils/data_utils.py
def get_gaussian_centers(y, n_components):
    if isinstance(y, pd.Series) or isinstance(y, pd.DataFrame):
        y = y.values
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    cluster = KMeans(n_clusters=n_components, random_state=42).fit(y)
    return cluster.cluster_centers_.ravel().tolist()
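
A self-contained sketch on synthetic data, recovering the two modes of a bimodal target:

import numpy as np
from pytorch_tabular.utils import get_gaussian_centers

y = np.concatenate([np.random.normal(0, 1, 500), np.random.normal(10, 1, 500)])
centers = get_gaussian_centers(y, n_components=2)  # approximately [0.0, 10.0] (order may vary)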

NN Utilities

pytorch_tabular.utils._initialize_layers(activation, initialization, layers)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _initialize_layers(activation, initialization, layers):
    if type(layers) == nn.Sequential:
        for layer in layers:
            if hasattr(layer, "weight"):
                _initialize_layers(activation, initialization, layer)
    else:
        if activation == "ReLU":
            nonlinearity = "relu"
        elif activation == "LeakyReLU":
            nonlinearity = "leaky_relu"
        else:
            if initialization == "kaiming":
                logger.warning("Kaiming initialization is only recommended for ReLU and LeakyReLU.")
                nonlinearity = "leaky_relu"
            else:
                nonlinearity = "relu"

        if initialization == "kaiming":
            nn.init.kaiming_normal_(layers.weight, nonlinearity=nonlinearity)
        elif initialization == "xavier":
            nn.init.xavier_normal_(
                layers.weight,
                gain=nn.init.calculate_gain(nonlinearity) if activation in ["ReLU", "LeakyReLU"] else 1,
            )
        elif initialization == "random":
            nn.init.normal_(layers.weight)
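
A minimal sketch initializing a single linear layer (this is a private helper; the import path follows this page):

import torch.nn as nn
from pytorch_tabular.utils import _initialize_layers

linear = nn.Linear(16, 8)
# Applies Kaiming-normal initialization with the ReLU nonlinearity to linear.weight
_initialize_layers("ReLU", "kaiming", linear)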

pytorch_tabular.utils._initialize_kaiming(x, initialization, d_sqrt_inv)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _initialize_kaiming(x, initialization, d_sqrt_inv):
    if initialization == "kaiming_uniform":
        nn.init.uniform_(x, a=-d_sqrt_inv, b=d_sqrt_inv)
    elif initialization == "kaiming_normal":
        nn.init.normal_(x, std=d_sqrt_inv)
    elif initialization is None:
        pass
    else:
        raise NotImplementedError("initialization should be either of `kaiming_normal`, `kaiming_uniform`, `None`")
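
A sketch initializing an embedding table with the usual 1/sqrt(d) scale; the helper mutates x in place:

import math
import torch.nn as nn
from pytorch_tabular.utils import _initialize_kaiming

emb = nn.Embedding(100, 8)
_initialize_kaiming(emb.weight, "kaiming_uniform", d_sqrt_inv=1 / math.sqrt(8))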

pytorch_tabular.utils._linear_dropout_bn(activation, initialization, use_batch_norm, in_units, out_units, dropout)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _linear_dropout_bn(activation, initialization, use_batch_norm, in_units, out_units, dropout):
    if isinstance(activation, str):
        _activation = getattr(nn, activation)
    else:
        _activation = activation
    layers = []
    if use_batch_norm:
        layers.append(nn.BatchNorm1d(num_features=in_units))
    linear = nn.Linear(in_units, out_units)
    _initialize_layers(activation, initialization, linear)
    layers.extend([linear, _activation()])
    if dropout != 0:
        layers.append(nn.Dropout(dropout))
    return layers
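
A sketch assembling one MLP block from the returned list of layers:

import torch.nn as nn
from pytorch_tabular.utils import _linear_dropout_bn

layers = _linear_dropout_bn("ReLU", "kaiming", True, 32, 16, 0.1)
block = nn.Sequential(*layers)  # BatchNorm1d(32) -> Linear(32, 16) -> ReLU() -> Dropout(0.1)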

pytorch_tabular.utils._make_ix_like(input, dim=0)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _make_ix_like(input, dim=0):
    d = input.size(dim)
    rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
    view = [1] * input.dim()
    view[0] = -1
    return rho.view(view).transpose(0, dim)
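
The helper builds a 1..d index tensor shaped to broadcast along the given dimension (as used by sparsemax-style activations); a quick check:

import torch
from pytorch_tabular.utils import _make_ix_like

x = torch.zeros(3, 4)
_make_ix_like(x, dim=1)  # tensor([[1., 2., 3., 4.]])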

pytorch_tabular.utils.reset_all_weights(model)

Resets all parameters in a network.

PARAMETER DESCRIPTION
model

The model to reset the parameters of.

TYPE: nn.Module

refs
  • https://discuss.pytorch.org/t/how-to-re-set-alll-parameters-in-a-network/20819/6
  • https://stackoverflow.com/questions/63627997/reset-parameters-of-a-neural-network-in-pytorch
  • https://pytorch.org/docs/stable/generated/torch.nn.Module.html
Source code in src/pytorch_tabular/utils/nn_utils.py
def reset_all_weights(model: nn.Module) -> None:
    """Resets all parameters in a network.

    Args:
        model: The model to reset the parameters of.

    refs:
        - https://discuss.pytorch.org/t/how-to-re-set-alll-parameters-in-a-network/20819/6
        - https://stackoverflow.com/questions/63627997/reset-parameters-of-a-neural-network-in-pytorch
        - https://pytorch.org/docs/stable/generated/torch.nn.Module.html
    """

    @torch.no_grad()
    def weight_reset(m: nn.Module):
        # check if the current module has reset_parameters and, if it's callable, call it on m
        reset_parameters = getattr(m, "reset_parameters", None)
        if callable(reset_parameters):
            m.reset_parameters()

    # Applies fn recursively to every submodule see: https://pytorch.org/docs/stable/generated/torch.nn.Module.html
    model.apply(fn=weight_reset)
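
A quick sketch:

import torch.nn as nn
from pytorch_tabular.utils import reset_all_weights

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
reset_all_weights(model)  # every submodule with reset_parameters() is re-initialized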

pytorch_tabular.utils.to_one_hot(y, depth=None)

Takes an integer tensor with n dims and converts it to a one-hot representation with n + 1 dims.

The added (n+1)-th dimension is zero everywhere except at the y-th index, where it equals 1.

PARAMETER DESCRIPTION
y

input integer (IntTensor, LongTensor or Variable) of any shape

depth

the size of the one hot dimension

TYPE: int DEFAULT: None

Source code in src/pytorch_tabular/utils/nn_utils.py
def to_one_hot(y, depth=None):
    r"""Takes integer with n dims and converts it to 1-hot representation with n + 1 dims.

    The n+1'st dimension will have zeros everywhere but at y'th index, where it will be equal to 1.
    Args:
        y: input integer (IntTensor, LongTensor or Variable) of any shape
        depth (int):  the size of the one hot dimension
    """
    y_flat = y.to(torch.int64).view(-1, 1)
    depth = depth if depth is not None else int(torch.max(y_flat)) + 1
    y_one_hot = torch.zeros(y_flat.size()[0], depth, device=y.device).scatter_(1, y_flat, 1)
    y_one_hot = y_one_hot.view(*(tuple(y.shape) + (-1,)))
    return y_one_hot
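
A quick example:

import torch
from pytorch_tabular.utils import to_one_hot

y = torch.tensor([0, 2, 1])
to_one_hot(y, depth=3)
# tensor([[1., 0., 0.],
#         [0., 0., 1.],
#         [0., 1., 0.]])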

Python Utilities

pytorch_tabular.utils.getattr_nested(_module_src, _model_name)

Source code in src/pytorch_tabular/utils/python_utils.py
def getattr_nested(_module_src, _model_name):
    module = root_module
    for m in _module_src.split("."):
        module = getattr(module, m)
    return getattr(module, _model_name)
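
A sketch of the intended use, resolving a class from a dotted path relative to the package root (root_module is assumed to resolve to the pytorch_tabular package; the arguments below are illustrative):

from pytorch_tabular.utils import getattr_nested

# e.g. resolves pytorch_tabular.models.category_embedding.CategoryEmbeddingModel
model_cls = getattr_nested("models.category_embedding", "CategoryEmbeddingModel")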

pytorch_tabular.utils.ifnone(arg, default_arg)

Source code in src/pytorch_tabular/utils/python_utils.py
def ifnone(arg, default_arg):
    return default_arg if arg is None else arg

pytorch_tabular.utils.check_numpy(x)

Makes sure x is a numpy array.

Source code in src/pytorch_tabular/utils/python_utils.py
def check_numpy(x):
    """Makes sure x is a numpy array."""
    if isinstance(x, torch.Tensor):
        x = x.detach().cpu().numpy()
    x = np.asarray(x)
    assert isinstance(x, np.ndarray)
    return x
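
A quick example:

import torch
from pytorch_tabular.utils import check_numpy

t = torch.arange(3)
check_numpy(t)  # array([0, 1, 2])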

pytorch_tabular.utils.pl_load(path_or_url, map_location=None)

Loads a checkpoint.

PARAMETER DESCRIPTION
path_or_url

Path or URL of the checkpoint.

TYPE: Union[IO, _PATH]

map_location

a function, torch.device, string or a dict specifying how to remap storage locations.

TYPE: _MAP_LOCATION_TYPE DEFAULT: None

Source code in src/pytorch_tabular/utils/python_utils.py
def pl_load(
    path_or_url: Union[IO, _PATH],
    map_location: _MAP_LOCATION_TYPE = None,
) -> Any:
    """Loads a checkpoint.

    Args:
        path_or_url: Path or URL of the checkpoint.
        map_location: a function, ``torch.device``, string or a dict specifying how to remap storage locations.
    """
    if not isinstance(path_or_url, (str, Path)):
        # any sort of BytesIO or similar
        return torch.load(path_or_url, map_location=map_location)
    if str(path_or_url).startswith("http"):
        return torch.hub.load_state_dict_from_url(
            str(path_or_url),
            map_location=map_location,  # type: ignore[arg-type] # upstream annotation is not correct
        )
    fs = get_filesystem(path_or_url)
    with fs.open(path_or_url, "rb") as f:
        return torch.load(f, map_location=map_location)
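
A usage sketch (the checkpoint path is illustrative):

from pytorch_tabular.utils import pl_load

state_dict = pl_load("checkpoints/model.ckpt", map_location="cpu")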

pytorch_tabular.utils.generate_doc_dataclass(dataclass, desc=None, width=100)

Source code in src/pytorch_tabular/utils/python_utils.py
def generate_doc_dataclass(dataclass, desc=None, width=100):
    if desc is not None:
        doc_str = f"{desc}\nArgs:"
    else:
        doc_str = "Args:"
    for key in dataclass.__dataclass_fields__.keys():
        if key.startswith("_"):  # Skipping private fields
            continue
        atr = dataclass.__dataclass_fields__[key]
        if atr.init:
            type = str(atr.type).replace("<class '", "").replace("'>", "").replace("typing.", "")
            help_str = atr.metadata.get("help", "")
            if "choices" in atr.metadata.keys():
                help_str += f'. Choices are: [{",".join(["`"+str(ch)+"`" for ch in atr.metadata["choices"]])}].'
            # help_str += f'. Defaults to {atr.default}'
            h_str = textwrap.fill(
                f"{key} ({type}): {help_str}",
                width=width,
                subsequent_indent="\t\t",
                initial_indent="\t",
            )
            h_str = f"\n{h_str}\n"
            doc_str += h_str
    return doc_str
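
A sketch rendering the docstring for one of the library's config dataclasses (TrainerConfig is assumed importable from pytorch_tabular.config):

from pytorch_tabular.config import TrainerConfig
from pytorch_tabular.utils import generate_doc_dataclass

print(generate_doc_dataclass(TrainerConfig, desc="Trainer configuration"))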