Utilities

Special Feature Classes

CategoricalEmbeddingTransformer

Bases: BaseEstimator, TransformerMixin

Source code in src/pytorch_tabular/categorical_encoders.py
class CategoricalEmbeddingTransformer(BaseEstimator, TransformerMixin):
    NAN_CATEGORY = 0

    def __init__(self, tabular_model):
        """Initializes the Transformer and extracts the neural embeddings.

        Args:
            tabular_model (TabularModel): The trained TabularModel object
        """
        self._categorical_encoder = tabular_model.datamodule.categorical_encoder
        self.cols = tabular_model.model.hparams.categorical_cols
        # dict {str: np.ndarray} column name --> mapping from category (index of df) to value (column of df)
        self._mapping = {}

        self._extract_embedding(tabular_model.model)

    def _extract_embedding(self, model):
        try:
            embedding_layer = model.extract_embedding()
        except ValueError as e:
            logger.error(
                f"Extracting embedding layer from model received this error: {e}."
                f" Some models do not support this feature."
            )
            embedding_layer = None
        if embedding_layer is not None:
            for i, col in enumerate(self.cols):
                self._mapping[col] = {}
                embedding = embedding_layer[i]
                self._mapping[col][self.NAN_CATEGORY] = embedding.weight[0, :].detach().cpu().numpy().ravel()
                for key in self._categorical_encoder._mapping[col].index:
                    self._mapping[col][key] = (
                        embedding.weight[self._categorical_encoder._mapping[col].loc[key], :]
                        .detach()
                        .cpu()
                        .numpy()
                        .ravel()
                    )
        else:
            raise ValueError("Passed model doesn't support this feature.")

    def fit(self, X, y=None):
        """Just for compatibility.

        Does not do anything
        """
        return self

    def transform(self, X: DataFrame, y=None) -> DataFrame:
        """Transforms the categorical columns specified to the trained neural embedding from the model.

        Args:
            X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
            y ([type], optional): Only for compatibility. Not used. Defaults to None.

        Raises:
            ValueError: If the embedding mapping has not been extracted from the model.

        Returns:
            DataFrame: The encoded dataframe
        """
        if not self._mapping:
            raise ValueError(
                "Passed model should either have an attribute `embeddng_layers`"
                " or a method `extract_embedding` defined for `transform`."
            )
        assert all(c in X.columns for c in self.cols)

        X_encoded = X.copy(deep=True)
        for col, mapping in track(
            self._mapping.items(),
            description="Encoding the data...",
            total=len(self._mapping.values()),
        ):
            for dim in range(mapping[self.NAN_CATEGORY].shape[0]):
                X_encoded.loc[:, f"{col}_embed_dim_{dim}"] = (
                    X_encoded[col].fillna(self.NAN_CATEGORY).map({k: v[dim] for k, v in mapping.items()})
                )
                # Filling unseen categories also with NAN Embedding
                X_encoded[f"{col}_embed_dim_{dim}"].fillna(mapping[self.NAN_CATEGORY][dim], inplace=True)
        X_encoded.drop(columns=self.cols, inplace=True)
        return X_encoded

    def fit_transform(self, X: DataFrame, y=None) -> DataFrame:
        """Encode given columns of X based on the learned embedding.

        Args:
            X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
            y ([type], optional): Only for compatibility. Not used. Defaults to None.

        Returns:
            DataFrame: The encoded dataframe
        """
        self.fit(X, y)
        return self.transform(X)

    def save_as_object_file(self, path):
        if not self._mapping:
            raise ValueError("`fit` method must be called before `save_as_object_file`.")
        pickle.dump(self.__dict__, open(path, "wb"))

    def load_from_object_file(self, path):
        for k, v in pickle.load(open(path, "rb")).items():
            setattr(self, k, v)
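
A minimal usage sketch (assuming an already-trained TabularModel instance `tabular_model` and a DataFrame `train` that contains the model's categorical columns):

from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer

# Embeddings are extracted from the trained model at construction time
transformer = CategoricalEmbeddingTransformer(tabular_model)
# Each categorical column is replaced by columns named `<col>_embed_dim_<i>`
train_encoded = transformer.fit_transform(train)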

__init__(tabular_model)

Initializes the Transformer and extracts the neural embeddings.

Parameters:

  tabular_model (TabularModel): The trained TabularModel object. [required]
Source code in src/pytorch_tabular/categorical_encoders.py
def __init__(self, tabular_model):
    """Initializes the Transformer and extracts the neural embeddings.

    Args:
        tabular_model (TabularModel): The trained TabularModel object
    """
    self._categorical_encoder = tabular_model.datamodule.categorical_encoder
    self.cols = tabular_model.model.hparams.categorical_cols
    # dict {str: np.ndarray} column name --> mapping from category (index of df) to value (column of df)
    self._mapping = {}

    self._extract_embedding(tabular_model.model)

fit(X, y=None)

Just for compatibility.

Does not do anything

Source code in src/pytorch_tabular/categorical_encoders.py
def fit(self, X, y=None):
    """Just for compatibility.

    Does not do anything
    """
    return self

fit_transform(X, y=None)

Encode given columns of X based on the learned embedding.

Parameters:

  X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode. [required]
  y ([type], optional): Only for compatibility. Not used. [default: None]

Returns:

  DataFrame: The encoded dataframe.

Source code in src/pytorch_tabular/categorical_encoders.py
def fit_transform(self, X: DataFrame, y=None) -> DataFrame:
    """Encode given columns of X based on the learned embedding.

    Args:
        X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Returns:
        DataFrame: The encoded dataframe
    """
    self.fit(X, y)
    return self.transform(X)

transform(X, y=None)

Transforms the categorical columns specified to the trained neural embedding from the model.

Parameters:

  X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode. [required]
  y ([type], optional): Only for compatibility. Not used. [default: None]

Raises:

  ValueError: If the embedding mapping has not been extracted from the model.

Returns:

  DataFrame: The encoded dataframe.

Source code in src/pytorch_tabular/categorical_encoders.py
def transform(self, X: DataFrame, y=None) -> DataFrame:
    """Transforms the categorical columns specified to the trained neural embedding from the model.

    Args:
        X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Raises:
        ValueError: If the embedding mapping has not been extracted from the model.

    Returns:
        DataFrame: The encoded dataframe
    """
    if not self._mapping:
        raise ValueError(
            "Passed model should either have an attribute `embeddng_layers`"
            " or a method `extract_embedding` defined for `transform`."
        )
    assert all(c in X.columns for c in self.cols)

    X_encoded = X.copy(deep=True)
    for col, mapping in track(
        self._mapping.items(),
        description="Encoding the data...",
        total=len(self._mapping.values()),
    ):
        for dim in range(mapping[self.NAN_CATEGORY].shape[0]):
            X_encoded.loc[:, f"{col}_embed_dim_{dim}"] = (
                X_encoded[col].fillna(self.NAN_CATEGORY).map({k: v[dim] for k, v in mapping.items()})
            )
            # Filling unseen categories also with NAN Embedding
            X_encoded[f"{col}_embed_dim_{dim}"].fillna(mapping[self.NAN_CATEGORY][dim], inplace=True)
    X_encoded.drop(columns=self.cols, inplace=True)
    return X_encoded

DeepFeatureExtractor

Bases: BaseEstimator, TransformerMixin

Source code in src/pytorch_tabular/feature_extractor.py
class DeepFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, tabular_model, extract_keys=["backbone_features"], drop_original=True):
        """Initializes the Transformer and extracts the neural features.

        Args:
            tabular_model (TabularModel): The trained TabularModel object
            extract_keys (list, optional): The keys of the features to extract. Defaults to ["backbone_features"].
            drop_original (bool, optional): Whether to drop the original columns. Defaults to True.
        """
        assert not (
            isinstance(tabular_model.model, NODEModel)
            or isinstance(tabular_model.model, TabNetModel)
            or isinstance(tabular_model.model, MDNModel)
        ), "FeatureExtractor doesn't work for Mixture Density Networks, NODE Model, & Tabnet Model"
        self.tabular_model = tabular_model
        self.extract_keys = extract_keys
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """Just for compatibility.

        Does not do anything
        """
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Transforms the categorical columns specified to the trained neural features from the model.

        Args:
            X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
            y ([type], optional): Only for compatibility. Not used. Defaults to None.

        Returns:
            pd.DataFrame: The encoded dataframe
        """

        X_encoded = X.copy(deep=True)
        orig_features = X_encoded.columns
        self.tabular_model.model.eval()
        inference_dataloader = self.tabular_model.datamodule.prepare_inference_dataloader(X_encoded)
        logits_predictions = defaultdict(list)
        for batch in track(inference_dataloader, description="Generating Features..."):
            for k, v in batch.items():
                if isinstance(v, list) and (len(v) == 0):
                    # Skipping empty list
                    continue
                batch[k] = v.to(self.tabular_model.model.device)
            if self.tabular_model.config.task == "ssl":
                ret_value = {"backbone_features": self.tabular_model.model.predict(batch, ret_model_output=True)}
            else:
                _, ret_value = self.tabular_model.model.predict(batch, ret_model_output=True)
            for k in self.extract_keys:
                if k in ret_value.keys():
                    logits_predictions[k].append(ret_value[k].detach().cpu())

        for k, v in logits_predictions.items():
            v = torch.cat(v, dim=0).numpy()
            if v.ndim == 1:
                v = v.reshape(-1, 1)
            for i in range(v.shape[-1]):
                if v.shape[-1] > 1:
                    X_encoded[f"{k}_{i}"] = v[:, i]
                else:
                    X_encoded[f"{k}"] = v[:, i]

        if self.drop_original:
            X_encoded.drop(columns=orig_features, inplace=True)
        return X_encoded

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Encode given columns of X based on the learned features.

        Args:
            X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
            y ([type], optional): Only for compatibility. Not used. Defaults to None.

        Returns:
            pd.DataFrame: The encoded dataframe
        """
        self.fit(X, y)
        return self.transform(X)

    def save_as_object_file(self, path):
        """Saves the feature extractor as a pickle file.

        Args:
            path (str): The path to save the file
        """
        # DeepFeatureExtractor defines no `_mapping`; guard on the attribute set in `__init__` instead
        if getattr(self, "tabular_model", None) is None:
            raise ValueError("The extractor must be initialized before `save_as_object_file`.")
        pickle.dump(self.__dict__, open(path, "wb"))

    def load_from_object_file(self, path):
        """Loads the feature extractor from a pickle file.

        Args:
            path (str): The path to load the file from
        """
        for k, v in pickle.load(open(path, "rb")).items():
            setattr(self, k, v)
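
A minimal usage sketch (assuming a trained TabularModel `tabular_model` and a DataFrame `test`):

from pytorch_tabular.feature_extractor import DeepFeatureExtractor

extractor = DeepFeatureExtractor(tabular_model)
# Adds one column per extracted feature dimension (e.g. `backbone_features_<i>`)
# and, because drop_original=True by default, drops the input columns
test_features = extractor.fit_transform(test)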

__init__(tabular_model, extract_keys=['backbone_features'], drop_original=True)

Initializes the Transformer and extracts the neural features.

Parameters:

  tabular_model (TabularModel): The trained TabularModel object. [required]
  extract_keys (list, optional): The keys of the features to extract. [default: ["backbone_features"]]
  drop_original (bool, optional): Whether to drop the original columns. [default: True]
Source code in src/pytorch_tabular/feature_extractor.py
def __init__(self, tabular_model, extract_keys=["backbone_features"], drop_original=True):
    """Initializes the Transformer and extracts the neural features.

    Args:
        tabular_model (TabularModel): The trained TabularModel object
        extract_keys (list, optional): The keys of the features to extract. Defaults to ["backbone_features"].
        drop_original (bool, optional): Whether to drop the original columns. Defaults to True.
    """
    assert not (
        isinstance(tabular_model.model, NODEModel)
        or isinstance(tabular_model.model, TabNetModel)
        or isinstance(tabular_model.model, MDNModel)
    ), "FeatureExtractor doesn't work for Mixture Density Networks, NODE Model, & Tabnet Model"
    self.tabular_model = tabular_model
    self.extract_keys = extract_keys
    self.drop_original = drop_original

fit(X, y=None)

Just for compatibility.

Does not do anything

Source code in src/pytorch_tabular/feature_extractor.py
def fit(self, X, y=None):
    """Just for compatibility.

    Does not do anything
    """
    return self

fit_transform(X, y=None)

Encode given columns of X based on the learned features.

Parameters:

  X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode. [required]
  y ([type], optional): Only for compatibility. Not used. [default: None]

Returns:

  pd.DataFrame: The encoded dataframe.

Source code in src/pytorch_tabular/feature_extractor.py
def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Encode given columns of X based on the learned features.

    Args:
        X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Returns:
        pd.DataFrame: The encoded dataframe
    """
    self.fit(X, y)
    return self.transform(X)

load_from_object_file(path)

Loads the feature extractor from a pickle file.

Parameters:

  path (str): The path to load the file from. [required]
Source code in src/pytorch_tabular/feature_extractor.py
def load_from_object_file(self, path):
    """Loads the feature extractor from a pickle file.

    Args:
        path (str): The path to load the file from
    """
    for k, v in pickle.load(open(path, "rb")).items():
        setattr(self, k, v)

save_as_object_file(path)

Saves the feature extractor as a pickle file.

Parameters:

  path (str): The path to save the file. [required]
Source code in src/pytorch_tabular/feature_extractor.py
def save_as_object_file(self, path):
    """Saves the feature extractor as a pickle file.

    Args:
        path (str): The path to save the file
    """
    # DeepFeatureExtractor defines no `_mapping`; guard on the attribute set in `__init__` instead
    if getattr(self, "tabular_model", None) is None:
        raise ValueError("The extractor must be initialized before `save_as_object_file`.")
    pickle.dump(self.__dict__, open(path, "wb"))

transform(X, y=None)

Transforms the categorical columns specified to the trained neural features from the model.

Parameters:

  X (DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode. [required]
  y ([type], optional): Only for compatibility. Not used. [default: None]

Returns:

  pd.DataFrame: The encoded dataframe.

Source code in src/pytorch_tabular/feature_extractor.py
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
    """Transforms the categorical columns specified to the trained neural features from the model.

    Args:
        X (pd.DataFrame): DataFrame of features, shape (n_samples, n_features). Must contain columns to encode.
        y ([type], optional): Only for compatibility. Not used. Defaults to None.

    Returns:
        pd.DataFrame: The encoded dataframe
    """

    X_encoded = X.copy(deep=True)
    orig_features = X_encoded.columns
    self.tabular_model.model.eval()
    inference_dataloader = self.tabular_model.datamodule.prepare_inference_dataloader(X_encoded)
    logits_predictions = defaultdict(list)
    for batch in track(inference_dataloader, description="Generating Features..."):
        for k, v in batch.items():
            if isinstance(v, list) and (len(v) == 0):
                # Skipping empty list
                continue
            batch[k] = v.to(self.tabular_model.model.device)
        if self.tabular_model.config.task == "ssl":
            ret_value = {"backbone_features": self.tabular_model.model.predict(batch, ret_model_output=True)}
        else:
            _, ret_value = self.tabular_model.model.predict(batch, ret_model_output=True)
        for k in self.extract_keys:
            if k in ret_value.keys():
                logits_predictions[k].append(ret_value[k].detach().cpu())

    for k, v in logits_predictions.items():
        v = torch.cat(v, dim=0).numpy()
        if v.ndim == 1:
            v = v.reshape(-1, 1)
        for i in range(v.shape[-1]):
            if v.shape[-1] > 1:
                X_encoded[f"{k}_{i}"] = v[:, i]
            else:
                X_encoded[f"{k}"] = v[:, i]

    if self.drop_original:
        X_encoded.drop(columns=orig_features, inplace=True)
    return X_encoded

Data Utilities

get_balanced_sampler(y_train)

Source code in src/pytorch_tabular/utils/data_utils.py
def get_balanced_sampler(y_train):
    assert y_train.ndim == 1, "Utility function only works for binary classification"
    y_train = LabelEncoder().fit_transform(y_train)
    class_sample_counts = np.bincount(y_train)
    # compute weight for all the samples in the dataset
    # samples_weights contain the probability for each example in dataset to be sampled
    class_weights = 1.0 / torch.Tensor(class_sample_counts)
    train_samples_weight = [class_weights[class_id] for class_id in y_train]
    # now lets initialize samplers
    train_sampler = torch.utils.data.sampler.WeightedRandomSampler(train_samples_weight, len(y_train))
    return train_sampler
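
A short sketch of plugging the returned sampler into a PyTorch DataLoader (`train_dataset` and `y_train` are assumed):

from torch.utils.data import DataLoader

sampler = get_balanced_sampler(y_train)
# the sampler draws each class with equal probability, with replacement
loader = DataLoader(train_dataset, batch_size=64, sampler=sampler)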

get_class_weighted_cross_entropy(y_train, mu=1.0)

Source code in src/pytorch_tabular/utils/data_utils.py
def get_class_weighted_cross_entropy(y_train, mu=1.0):
    assert y_train.ndim == 1, "Utility function only works for binary classification"
    y_train = LabelEncoder().fit_transform(y_train)
    weights = _make_smooth_weights_for_balanced_classes(y_train, mu=mu)
    criterion = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(weights))
    return criterion
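
A sketch of using the returned criterion in a training step (`logits` and `targets` are assumed tensors):

# mu controls the smoothing applied to the class weights
criterion = get_class_weighted_cross_entropy(y_train, mu=0.1)
loss = criterion(logits, targets)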

get_gaussian_centers(y, n_components)

Source code in src/pytorch_tabular/utils/data_utils.py
def get_gaussian_centers(y, n_components):
    if isinstance(y, Series) or isinstance(y, DataFrame):
        y = y.values
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    cluster = KMeans(n_clusters=n_components, random_state=42).fit(y)
    return cluster.cluster_centers_.ravel().tolist()
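
A short sketch (`df` with a numeric "target" column is assumed; the centers could, for example, seed the mixture means of an MDN head):

centers = get_gaussian_centers(df["target"], n_components=4)
# a list of 4 floats: one K-Means cluster center per mixture component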

load_covertype_dataset(download_dir=None)

Predicting forest cover type from cartographic variables only (no remotely sensed data). The actual forest cover type for a given observation (30 x 30 meter cell) was determined from US Forest Service (USFS) Region 2 Resource Information System (RIS) data. Independent variables were derived from data originally obtained from US Geological Survey (USGS) and USFS data. Data is in raw form (not scaled) and contains binary (0 or 1) columns of data for qualitative independent variables (wilderness areas and soil types).

This study area includes four wilderness areas located in the Roosevelt National Forest of northern Colorado. These areas represent forests with minimal human-caused disturbances, so that existing forest cover types are more a result of ecological processes rather than forest management practices.

It is from the UCI ML Repository (https://archive.ics.uci.edu/ml/datasets/covertype), but with small changes: the one-hot encoded columns (Soil Type and Wilderness Area) are converted to categorical.

Parameters:

  download_dir (str): Directory to download the data to. Defaults to None, which will download to ~/.pytorch_tabular/datasets/. [default: None]
Source code in src/pytorch_tabular/utils/data_utils.py
def load_covertype_dataset(download_dir=None):
    """Predicting forest cover type from cartographic variables only (no remotely sensed data). The actual forest cover
    type for a given observation (30 x 30 meter cell) was determined from US Forest Service (USFS) Region 2 Resource
    Information System (RIS) data. Independent variables were derived from data originally obtained from US Geological
    Survey (USGS) and USFS data. Data is in raw form (not scaled) and contains binary (0 or 1) columns of data for
    qualitative independent variables (wilderness areas and soil types).

    This study area includes four wilderness areas located in the Roosevelt National Forest of northern Colorado.
    These areas represent forests with minimal human-caused disturbances, so that existing forest cover types are more a
    result of ecological processes rather than forest management practices.

    It is from [UCI ML Repository](https://archive.ics.uci.edu/ml/datasets/covertype), but with small changes:
    - The one-hot encoded columns are converted to categorical - Soil Type and Wilderness Area

    Args:
        download_dir (str): Directory to download the data to. Defaults to None, which will download
            to ~/.pytorch_tabular/datasets/
    """
    if download_dir is None:
        download_dir = os.path.join(os.path.expanduser("~"), ".pytorch_tabular", "datasets")
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    file_path = os.path.join(download_dir, "covertype.csv")
    if not os.path.exists(file_path):
        logger.info("Downloading Covertype Dataset")
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
        response = requests.get(url)
        with open(os.path.join(download_dir, "covertype.data.gz"), "wb") as f:
            f.write(response.content)
        with gzip.open(os.path.join(download_dir, "covertype.data.gz"), "rb") as f_in:
            with open(os.path.join(download_dir, "covertype.csv"), "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.remove(os.path.join(download_dir, "covertype.data.gz"))
    df = pd.read_csv(file_path, header=None)
    df.columns = (
        [
            "Elevation",
            "Aspect",
            "Slope",
            "Horizontal_Distance_To_Hydrology",
            "Vertical_Distance_To_Hydrology",
            "Horizontal_Distance_To_Roadways",
            "Hillshade_9am",
            "Hillshade_Noon",
            "Hillshade_3pm",
            "Horizontal_Distance_To_Fire_Points",
        ]
        + [f"Wilderness_Area_{i}" for i in range(4)]
        + [f"Soil_Type_{i}" for i in range(40)]
        + ["Cover_Type"]
    )
    # convert one hot encoded columns to categorical
    df["Wilderness_Area"] = df[[f"Wilderness_Area_{i}" for i in range(4)]].idxmax(axis=1).str.split("_").str[-1]
    df["Soil_Type"] = df[[f"Soil_Type_{i}" for i in range(40)]].idxmax(axis=1).str.split("_").str[-1]
    df.drop(
        [f"Wilderness_Area_{i}" for i in range(4)] + [f"Soil_Type_{i}" for i in range(40)],
        axis=1,
        inplace=True,
    )
    continuous_cols = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    categorical_cols = ["Wilderness_Area", "Soil_Type"]
    return df, categorical_cols, continuous_cols, "Cover_Type"
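
Usage sketch (downloads on the first call, then reads the cached CSV):

df, cat_cols, cont_cols, target_col = load_covertype_dataset()
# cat_cols == ["Wilderness_Area", "Soil_Type"]; target_col == "Cover_Type"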

make_mixed_dataset(task, n_samples, n_features=7, n_categories=2, n_informative=5, random_state=42, n_targets=None, **kwargs)

Creates a synthetic dataset with mixed data types.

Parameters:

  task (str): Either "classification" or "regression". [required]
  n_samples (int): Number of samples to generate. [required]
  n_features (int): Number of total features to generate. [default: 7]
  n_categories (int): Number of features to be categorical. [default: 2]
  n_informative (int): Number of informative features. [default: 5]
  random_state (int): Random seed for reproducibility. [default: 42]
  n_targets (int): Number of targets to generate. n_targets>1 will generate a multi-target dataset for regression and a multi-class dataset for classification. Defaults to 2 classes for classification and 1 for regression. [default: None]
  kwargs: Additional arguments to pass to the make_classification or make_regression function. [default: {}]
Source code in src/pytorch_tabular/utils/data_utils.py
def make_mixed_dataset(
    task,
    n_samples,
    n_features=7,
    n_categories=2,
    n_informative=5,
    random_state=42,
    n_targets=None,
    **kwargs,
):
    """Creates a synthetic dataset with mixed data types.

    Args:
        task (str): Either "classification" or "regression"
        n_samples (int): Number of samples to generate
        n_features (int): Number of total features to generate
        n_categories (int): Number of features to be categorical
        n_informative (int): Number of informative features
        random_state (int): Random seed for reproducibility
        n_targets (int): Number of targets to generate. n_targets>1 will generate a multi-target dataset
            for regression and multi-class dataset for classification.
            Defaults to 2 classes for classification and 1 for regression
        kwargs: Additional arguments to pass to the make_classification or make_regression function
    """
    assert n_features >= n_categories, "n_features must be greater than or equal to n_categories"
    assert n_informative <= n_features, "n_informative must be less than or equal to n_features"
    assert task in [
        "classification",
        "regression",
    ], "task must be either classification or regression"
    if n_targets is None:
        n_targets = 1 if task == "regression" else 2
    if task == "classification":
        X, y = make_classification(
            n_samples=n_samples,
            n_features=n_features,
            random_state=random_state,
            n_informative=n_informative,
            n_classes=n_targets,
            **kwargs,
        )
    elif task == "regression":
        X, y = make_regression(
            n_samples=n_samples,
            n_features=n_features,
            random_state=random_state,
            n_informative=n_informative,
            n_targets=n_targets,
            **kwargs,
        )
    # pick distinct columns to make categorical (random.sample avoids the duplicate
    # indices that random.choices could return)
    cat_cols = random.sample(list(range(X.shape[-1])), k=n_categories)
    num_cols = [i for i in range(X.shape[-1]) if i not in cat_cols]
    for col in cat_cols:
        X[:, col] = pd.qcut(X[:, col], q=4).codes.astype(int)
    col_names = []
    num_col_names = []
    cat_col_names = []
    for i in range(X.shape[-1]):
        if i in cat_cols:
            col_names.append(f"cat_col_{i}")
            cat_col_names.append(f"cat_col_{i}")
        if i in num_cols:
            col_names.append(f"num_col_{i}")
            num_col_names.append(f"num_col_{i}")
    X = pd.DataFrame(X, columns=col_names)
    if n_targets == 1 or task == "classification":
        y = pd.Series(y, name="target")
    else:
        y = pd.DataFrame(y, columns=[f"target_{i}" for i in range(n_targets)])
    if task == "classification":
        y = "class_" + y.astype(str)
    data = X.join(y)
    return data, cat_col_names, num_col_names
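
Usage sketch:

data, cat_col_names, num_col_names = make_mixed_dataset(
    task="classification", n_samples=10_000, n_features=8, n_categories=3
)
# `data` holds the features plus a "target" column with string labels like "class_0"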

print_metrics(metrics, y_true, y_pred, tag, return_dict=False)

Source code in src/pytorch_tabular/utils/data_utils.py
def print_metrics(metrics, y_true, y_pred, tag, return_dict=False):
    if isinstance(y_true, pd.DataFrame) or isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame) or isinstance(y_pred, pd.Series):
        y_pred = y_pred.values
    if y_true.ndim > 1:
        y_true = y_true.ravel()
    if y_pred.ndim > 1:
        y_pred = y_pred.ravel()
    print_str_l = []
    res_d = {}
    for metric, name, params in metrics:
        score = metric(y_true, y_pred, **params)
        print_str_l.append(f"{tag} {name}: {score}")
        res_d[name] = score
    print((" | ".join(print_str_l)).strip())
    if return_dict:
        return res_d
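
A sketch of the expected `metrics` format, a list of (callable, name, params) tuples (`y_true` and `y_pred` are assumed arrays):

from sklearn.metrics import accuracy_score, f1_score

metrics = [
    (accuracy_score, "Accuracy", {}),
    (f1_score, "F1", {"average": "macro"}),
]
print_metrics(metrics, y_true, y_pred, tag="Holdout")
# prints: "Holdout Accuracy: ... | Holdout F1: ..."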

NN Utilities

_initialize_layers(activation, initialization, layers)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _initialize_layers(activation, initialization, layers):
    if type(layers) is nn.Sequential:
        for layer in layers:
            if hasattr(layer, "weight"):
                _initialize_layers(activation, initialization, layer)
    else:
        if activation == "ReLU":
            nonlinearity = "relu"
        elif activation == "LeakyReLU":
            nonlinearity = "leaky_relu"
        else:
            if initialization == "kaiming":
                logger.warning("Kaiming initialization is only recommended for ReLU and" " LeakyReLU.")
                nonlinearity = "leaky_relu"
            else:
                nonlinearity = "relu"

        if initialization == "kaiming":
            nn.init.kaiming_normal_(layers.weight, nonlinearity=nonlinearity)
        elif initialization == "xavier":
            nn.init.xavier_normal_(
                layers.weight,
                gain=(nn.init.calculate_gain(nonlinearity) if activation in ["ReLU", "LeakyReLU"] else 1),
            )
        elif initialization == "random":
            nn.init.normal_(layers.weight)

_initialize_kaiming(x, initialization, d_sqrt_inv)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _initialize_kaiming(x, initialization, d_sqrt_inv):
    if initialization == "kaiming_uniform":
        nn.init.uniform_(x, a=-d_sqrt_inv, b=d_sqrt_inv)
    elif initialization == "kaiming_normal":
        nn.init.normal_(x, std=d_sqrt_inv)
    elif initialization is None:
        pass
    else:
        raise NotImplementedError("initialization should be either of `kaiming_normal`, `kaiming_uniform`," " `None`")

_linear_dropout_bn(activation, initialization, use_batch_norm, in_units, out_units, dropout)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _linear_dropout_bn(activation, initialization, use_batch_norm, in_units, out_units, dropout):
    if isinstance(activation, str):
        _activation = getattr(nn, activation)
    else:
        _activation = activation
    layers = []
    if use_batch_norm:
        from pytorch_tabular.models.common.layers.batch_norm import BatchNorm1d

        layers.append(BatchNorm1d(num_features=in_units))
    linear = nn.Linear(in_units, out_units)
    _initialize_layers(activation, initialization, linear)
    layers.extend([linear, _activation()])
    if dropout != 0:
        layers.append(nn.Dropout(dropout))
    return layers
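
A sketch of composing the returned layer list into a block (the helper is internal; the names are as in the listing above):

import torch.nn as nn

layers = _linear_dropout_bn("ReLU", "kaiming", True, 64, 32, 0.1)
block = nn.Sequential(*layers)  # BatchNorm1d -> Linear(64, 32) -> ReLU -> Dropout(0.1)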

_make_ix_like(input, dim=0)

Source code in src/pytorch_tabular/utils/nn_utils.py
def _make_ix_like(input, dim=0):
    d = input.size(dim)
    rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype)
    view = [1] * input.dim()
    view[0] = -1
    return rho.view(view).transpose(0, dim)

reset_all_weights(model)

Resets all parameters in a network.

Parameters:

  model (Module): The model to reset the parameters of. [required]

References:
  • https://discuss.pytorch.org/t/how-to-re-set-alll-parameters-in-a-network/20819/6
  • https://stackoverflow.com/questions/63627997/reset-parameters-of-a-neural-network-in-pytorch
  • https://pytorch.org/docs/stable/generated/torch.nn.Module.html
Source code in src/pytorch_tabular/utils/nn_utils.py
def reset_all_weights(model: nn.Module) -> None:
    """Resets all parameters in a network.

    Args:
        model: The model to reset the parameters of.

    refs:
        - https://discuss.pytorch.org/t/how-to-re-set-alll-parameters-in-a-network/20819/6
        - https://stackoverflow.com/questions/63627997/reset-parameters-of-a-neural-network-in-pytorch
        - https://pytorch.org/docs/stable/generated/torch.nn.Module.html
    """

    @torch.no_grad()
    def weight_reset(m: nn.Module):
        # check if the current module has reset_parameters and, if it's callable, call it on m
        reset_parameters = getattr(m, "reset_parameters", None)
        if callable(reset_parameters):
            m.reset_parameters()

    # Applies fn recursively to every submodule see: https://pytorch.org/docs/stable/generated/torch.nn.Module.html
    model.apply(fn=weight_reset)
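
A quick sketch, e.g. to re-train the same architecture across cross-validation folds:

import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
reset_all_weights(model)  # every submodule exposing reset_parameters() is re-initialized in place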

to_one_hot(y, depth=None)

Takes an integer tensor with n dims and converts it to a one-hot representation with n + 1 dims. The (n+1)'th dimension has zeros everywhere except at the y'th index, where it is 1.

Parameters:

  y: input integer (IntTensor, LongTensor or Variable) of any shape. [required]
  depth (int): the size of the one hot dimension. [default: None, inferred as max(y) + 1]

Source code in src/pytorch_tabular/utils/nn_utils.py
def to_one_hot(y, depth=None):
    r"""Takes integer with n dims and converts it to 1-hot representation with n + 1 dims.

    The n+1'st dimension will have zeros everywhere but at y'th index, where it will be equal to 1.
    Args:
        y: input integer (IntTensor, LongTensor or Variable) of any shape
        depth (int):  the size of the one hot dimension
    """
    y_flat = y.to(torch.int64).view(-1, 1)
    depth = depth or int(torch.max(y_flat)) + 1
    y_one_hot = torch.zeros(y_flat.size()[0], depth, device=y.device).scatter_(1, y_flat, 1)
    y_one_hot = y_one_hot.view(*(tuple(y.shape) + (-1,)))
    return y_one_hot
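
A quick usage sketch:

import torch

y = torch.tensor([0, 2, 1])
to_one_hot(y, depth=3)
# tensor([[1., 0., 0.],
#         [0., 0., 1.],
#         [0., 1., 0.]])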

count_parameters(model)

Source code in src/pytorch_tabular/utils/nn_utils.py
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

Python Utilities

getattr_nested(_module_src, _model_name)

Source code in src/pytorch_tabular/utils/python_utils.py
def getattr_nested(_module_src, _model_name):
    # `root_module` is defined elsewhere at module scope in python_utils.py
    module = root_module
    for m in _module_src.split("."):
        module = getattr(module, m)
    return getattr(module, _model_name)

ifnone(arg, default_arg)

Source code in src/pytorch_tabular/utils/python_utils.py
def ifnone(arg, default_arg):
    return default_arg if arg is None else arg

check_numpy(x)

Makes sure x is a numpy array.

Source code in src/pytorch_tabular/utils/python_utils.py
def check_numpy(x):
    """Makes sure x is a numpy array."""
    if isinstance(x, torch.Tensor):
        x = x.detach().cpu().numpy()
    x = np.asarray(x)
    assert isinstance(x, np.ndarray)
    return x

pl_load(path_or_url, map_location=None)

Loads a checkpoint.

Parameters:

  path_or_url (Union[IO, _PATH]): Path or URL of the checkpoint. [required]
  map_location (_MAP_LOCATION_TYPE): A function, torch.device, string or a dict specifying how to remap storage locations. [default: None]
Source code in src/pytorch_tabular/utils/python_utils.py
def pl_load(
    path_or_url: Union[IO, _PATH],
    map_location: _MAP_LOCATION_TYPE = None,
) -> Any:
    """Loads a checkpoint.

    Args:
        path_or_url: Path or URL of the checkpoint.
        map_location: a function, ``torch.device``, string or a dict specifying how to remap storage locations.
    """
    if not isinstance(path_or_url, (str, Path)):
        # any sort of BytesIO or similar
        return torch.load(path_or_url, map_location=map_location)
    if str(path_or_url).startswith("http"):
        return torch.hub.load_state_dict_from_url(
            str(path_or_url),
            map_location=map_location,  # type: ignore[arg-type] # upstream annotation is not correct
        )
    fs = get_filesystem(path_or_url)
    with fs.open(path_or_url, "rb") as f:
        return torch.load(f, map_location=map_location)
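
Usage sketch (the checkpoint path here is hypothetical):

state = pl_load("checkpoints/best_model.ckpt", map_location="cpu")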

generate_doc_dataclass(dataclass, desc=None, width=100)

Source code in src/pytorch_tabular/utils/python_utils.py
def generate_doc_dataclass(dataclass, desc=None, width=100):
    if desc is not None:
        doc_str = f"{desc}\nArgs:"
    else:
        doc_str = "Args:"
    for key in dataclass.__dataclass_fields__.keys():
        if key.startswith("_"):  # Skipping private fields
            continue
        atr = dataclass.__dataclass_fields__[key]
        if atr.init:
            type = str(atr.type).replace("<class '", "").replace("'>", "").replace("typing.", "")
            help_str = atr.metadata.get("help", "")
            if "choices" in atr.metadata.keys():
                help_str += ". Choices are:" f" [{','.join(['`'+str(ch)+'`' for ch in atr.metadata['choices']])}]."
            # help_str += f'. Defaults to {atr.default}'
            h_str = textwrap.fill(
                f"{key} ({type}): {help_str}",
                width=width,
                subsequent_indent="\t\t",
                initial_indent="\t",
            )
            h_str = f"\n{h_str}\n"
            doc_str += h_str
    return doc_str

suppress_lightning_logs(log_level=None)

Source code in src/pytorch_tabular/utils/python_utils.py
def suppress_lightning_logs(log_level=None):
    import logging

    log_level = log_level or logging.ERROR
    for logger_name in logging.root.manager.loggerDict:
        if logger_name.startswith("pytorch_lightning") or logger_name.startswith("lightning"):
            logging.getLogger(logger_name).setLevel(log_level)

enable_lightning_logs(log_level=None)

Source code in src/pytorch_tabular/utils/python_utils.py
def enable_lightning_logs(log_level=None):
    import logging

    log_level = log_level or logging.INFO

    for logger_name in logging.root.manager.loggerDict:
        if logger_name.startswith("pytorch_lightning") or logger_name.startswith("lightning"):
            logging.getLogger(logger_name).setLevel(log_level)

int_to_human_readable(number, round_number=True)

Source code in src/pytorch_tabular/utils/python_utils.py
def int_to_human_readable(number: int, round_number=True) -> str:
    # suffixes: thousand, million, billion, trillion (the original listed " T" twice,
    # making thousands and trillions indistinguishable)
    millnames = ["", " K", " M", " B", " T"]
    n = float(number)
    millidx = max(
        0,
        min(
            len(millnames) - 1,
            int(math.floor(0 if n == 0 else math.log10(abs(n)) / 3)),
        ),
    )
    if round_number:
        return f"{int(n / 10 ** (3 * millidx))}{millnames[millidx]}"
    else:
        return f"{n / 10 ** (3 * millidx):.2f}{millnames[millidx]}"