Source code for robustx.datasets.provided_datasets.ExampleDatasetLoader

from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from ..DatasetLoader import DatasetLoader



[docs]
class ExampleDatasetLoader(DatasetLoader, ABC):
    """
    An abstract extension of the DatasetLoader class for the example datasets provided within the library.

    Subclasses have to implement the abstract methods load_data() and get_default_preprocessed_features().

        ...

    Attributes / Properties
    ------------------------

    _categorical: list[str]
        Stores the list of categorical column names

    _numerical: list[str]
        Stores the list of numerical column names

    __missing_num: any
        Value representing missing numerical data (name-mangled private attribute, set in __init__)

    __missing_cat: any
        Value representing missing categorical data (name-mangled private attribute, set in __init__)

    -------

    Methods
    -------

    categorical -> list[str]:
        Returns the list of categorical features

    numerical -> list[str]:
        Returns the list of numerical features

    load_data() -> None:
        Abstract method to load data into the dataset

    get_default_preprocessed_features() -> pd.DataFrame:
        Abstract method to get the default preprocessed dataset

    get_preprocessed_features() -> pd.DataFrame:
        Returns the dataset preprocessed according to user specifications (imputing, scaling, encoding)

    default_preprocess() -> None:
        Preprocesses and updates the dataset using the default preprocessing method

    preprocess() -> None:
        Preprocesses and updates the dataset based on user-provided parameters
    -------
    """

    def __init__(self, categoricals, numericals, missing_val_num=np.nan, missing_val_cat=np.nan, seed=None):
        """
        Initializes the ExampleDatasetLoader with categorical and numerical features, as well as values for missing data.

        @param categoricals: list[str], List of categorical features
        @param numericals: list[str], List of numerical features
        @param missing_val_num: optional, Value to represent missing numerical data (default: np.nan)
        @param missing_val_cat: optional, Value to represent missing categorical data (default: np.nan)
        @param seed: optional, passed through unchanged to DatasetLoader.__init__ (default: None);
                     presumably a random seed - confirm against DatasetLoader
        """
        super().__init__(seed=seed)
        # The lists are stored as given (by reference, no copy is made)
        self._categorical = categoricals
        self._numerical = numericals
        # The missing-value markers are later handed to SimpleImputer(missing_values=...)
        # in get_preprocessed_features
        self.__missing_num = missing_val_num
        self.__missing_cat = missing_val_cat

    @property
    def categorical(self) -> list[str]:
        """
        Returns all categorical column names
        @return: list[str]
        """
        return self._categorical

    @property
    def numerical(self) -> list[str]:
        """
        Returns all numerical column names
        @return: list[str]
        """
        return self._numerical


[docs]
    @abstractmethod
    def load_data(self):
        """
        Populates the data attribute with the dataset.

        Abstract: this base implementation does nothing, subclasses provide the actual loading.
        @return: None
        """



[docs]
    @abstractmethod
    def get_default_preprocessed_features(self) -> pd.DataFrame:
        """
        Produces the dataset preprocessed with a default/standard preprocessing pipeline.

        Abstract: this base implementation does nothing, subclasses provide the actual pipeline.
        @return: pd.DataFrame
        """



[docs]
    def get_preprocessed_features(
            self,
            impute_strategy_numeric: str = 'mean',
            impute_strategy_categoric: str = 'most_frequent',
            fill_value_categoric: str = None,
            fill_value_numeric: str = None,
            scale_method: str = 'standard',
            encode_categorical: bool = True,
            selected_features: list = None
    ) -> pd.DataFrame:
        """
        Returns a preprocessed version of the dataset based on what the user inputs.

        Numerical columns are imputed and then optionally scaled; categorical columns are imputed and then
        optionally one-hot encoded (pd.get_dummies with drop_first=True). The returned frame holds the
        categorical block first, followed by the numerical block, and keeps the row index of the source data.
        This method does not assign to the data attribute.

        @param impute_strategy_numeric:  strategy for imputing missing numeric values ('mean', 'median',
                                         'constant')
        @param impute_strategy_categoric: strategy for imputing missing categoric values ('most_frequent', 'constant')
        @param fill_value_categoric: value to use for constant imputing strategy for categorical features
        @param fill_value_numeric: value to use for constant imputing strategy for numerical features
                                   (only used when impute_strategy_numeric is 'constant')
        @param scale_method: method for scaling numerical features ('standard', 'minmax', None);
                             any other value leaves the imputed values unscaled
        @param encode_categorical: whether to encode categorical features (True/False)
        @param selected_features: list of features to select, if None all features are used
        @return: pd.DataFrame
        """
        # Extract only the selected features and separate into numerical, categorical
        if selected_features is not None:
            data_selected = self.data[selected_features]
            # Filter in declared order instead of intersecting sets, so the output
            # column order is the same on every run
            wanted = set(selected_features)
            numeric_features = [col for col in self.numerical if col in wanted]
            categoric_features = [col for col in self.categorical if col in wanted]
        else:
            data_selected = self.data
            numeric_features = list(self.numerical)
            categoric_features = list(self.categorical)

        # The transformers return plain arrays; re-attaching the source index keeps the
        # rows aligned when the result is later concatenated with other columns of the dataset
        row_index = data_selected.index

        # Guard on the *selected* subset: fitting an imputer on zero columns fails
        if numeric_features:
            # Impute numerical features; the fill value only applies to the numeric 'constant' strategy
            numeric_imputer_args = {'strategy': impute_strategy_numeric, 'missing_values': self.__missing_num}
            if impute_strategy_numeric == 'constant':
                numeric_imputer_args['fill_value'] = fill_value_numeric
            numerical_imputer = SimpleImputer(**numeric_imputer_args)
            numerical_data = pd.DataFrame(numerical_imputer.fit_transform(data_selected[numeric_features]),
                                          columns=numeric_features, index=row_index)

            # Scale numerical features
            if scale_method == 'standard':
                scaler = StandardScaler()
            elif scale_method == 'minmax':
                scaler = MinMaxScaler()
            else:
                scaler = None

            if scaler is not None:
                numerical_data = pd.DataFrame(scaler.fit_transform(numerical_data),
                                              columns=numeric_features, index=row_index)
        else:
            numerical_data = pd.DataFrame(index=row_index)

        if categoric_features:
            # Impute categorical features
            categoric_imputer_args = {'strategy': impute_strategy_categoric, 'missing_values': self.__missing_cat}
            if impute_strategy_categoric == 'constant':
                categoric_imputer_args['fill_value'] = fill_value_categoric
            categorical_imputer = SimpleImputer(**categoric_imputer_args)
            categorical_data = pd.DataFrame(categorical_imputer.fit_transform(data_selected[categoric_features]),
                                            columns=categoric_features, index=row_index)

            # Encode categorical features
            if encode_categorical:
                categorical_data = pd.get_dummies(categorical_data, drop_first=True,
                                                  columns=categoric_features)
        else:
            categorical_data = pd.DataFrame(index=row_index)

        # Join preprocessed categorical and numerical features
        return pd.concat([categorical_data, numerical_data], axis=1)



[docs]
    def default_preprocess(self):
        """
        Overwrites the data attribute with the default-preprocessed features joined column-wise
        with self.y; duplicate rows of the joined frame are dropped
        @return: None
        """
        default_features = self.get_default_preprocessed_features()
        combined = pd.concat([default_features, self.y], axis=1)
        self.data = combined.drop_duplicates()



[docs]
    def preprocess(
            self,
            impute_strategy_numeric: str = 'mean',
            impute_strategy_categoric: str = 'most_frequent',
            fill_value_categoric: str = None,
            fill_value_numeric: str = None,
            scale_method: str = 'standard',
            encode_categorical: bool = True,
            selected_features: list = None
    ):
        """
        Overwrites the data attribute with the features preprocessed according to the given parameters,
        joined column-wise with self.y; duplicate rows of the joined frame are dropped
        @param impute_strategy_numeric:  strategy for imputing missing numeric values ('mean', 'median')
        @param impute_strategy_categoric: strategy for imputing missing categoric values ('most_frequent', 'constant')
        @param fill_value_categoric: value to use for constant imputing strategy for categorical features
        @param fill_value_numeric: value to use for constant imputing strategy for numerical features
        @param scale_method: method for scaling numerical features ('standard', 'minmax', None)
        @param encode_categorical: whether to encode categorical features (True/False)
        @param selected_features: list of features to select, if None all features are used
        @return: None
        """
        # Every option is forwarded by keyword, unchanged, to get_preprocessed_features
        options = dict(
            impute_strategy_numeric=impute_strategy_numeric,
            impute_strategy_categoric=impute_strategy_categoric,
            fill_value_categoric=fill_value_categoric,
            fill_value_numeric=fill_value_numeric,
            scale_method=scale_method,
            encode_categorical=encode_categorical,
            selected_features=selected_features,
        )
        features = self.get_preprocessed_features(**options)
        combined = pd.concat([features, self.y], axis=1)
        self.data = combined.drop_duplicates()