from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from ..DatasetLoader import DatasetLoader
class ExampleDatasetLoader(DatasetLoader, ABC):
    """
    An abstract extension of the DatasetLoader class which stores example datasets provided within the library.

    NOTE(review): `self.data` and `self.y` are used below but are not defined in this class; they are
    presumably provided by DatasetLoader - confirm against the base class.

    Attributes / Properties
    ------------------------
    _categorical: list[str]
        Stores the list of categorical column names
    _numerical: list[str]
        Stores the list of numerical column names
    __missing_num: any
        Value representing missing numerical data
    __missing_cat: any
        Value representing missing categorical data

    Methods
    -------
    categorical -> list[str]:
        Returns the list of categorical features
    numerical -> list[str]:
        Returns the list of numerical features
    load_data() -> None:
        Abstract method to load data into the dataset
    get_default_preprocessed_features() -> pd.DataFrame:
        Abstract method to get the default preprocessed dataset
    get_preprocessed_features() -> pd.DataFrame:
        Returns the dataset preprocessed according to user specifications (imputing, scaling, encoding)
    default_preprocess() -> None:
        Preprocesses and updates the dataset using the default preprocessing method
    preprocess() -> None:
        Preprocesses and updates the dataset based on user-provided parameters
    """

    def __init__(self, categoricals, numericals, missing_val_num=np.nan, missing_val_cat=np.nan, seed=None):
        """
        Initializes the ExampleDatasetLoader with categorical and numerical features, as well as values
        for missing data.

        @param categoricals: list[str], List of categorical features
        @param numericals: list[str], List of numerical features
        @param missing_val_num: optional, Value to represent missing numerical data (default: np.nan)
        @param missing_val_cat: optional, Value to represent missing categorical data (default: np.nan)
        @param seed: optional, forwarded unchanged to DatasetLoader.__init__ (default: None)
        """
        super().__init__(seed=seed)
        self._categorical = categoricals
        self._numerical = numericals
        # Double-underscore names are name-mangled, so these markers are private to this class.
        self.__missing_num = missing_val_num
        self.__missing_cat = missing_val_cat

    @property
    def categorical(self) -> list[str]:
        """
        Returns all categorical column names

        @return: list[str]
        """
        return self._categorical

    @property
    def numerical(self) -> list[str]:
        """
        Returns all numerical column names

        @return: list[str]
        """
        return self._numerical

    @abstractmethod
    def load_data(self):
        """
        Loads data into data attribute

        @return: None
        """

    @abstractmethod
    def get_default_preprocessed_features(self) -> pd.DataFrame:
        """
        Returns a preprocessed version of the dataset by using a default/standard preprocessing pipeline

        @return: pd.DataFrame
        """

    def get_preprocessed_features(
            self,
            impute_strategy_numeric: str = 'mean',
            impute_strategy_categoric: str = 'most_frequent',
            fill_value_categoric: str = None,
            fill_value_numeric: float = None,
            scale_method: str = 'standard',
            encode_categorical: bool = True,
            selected_features: list = None
    ) -> pd.DataFrame:
        """
        Returns a preprocessed version of the dataset based on what the user inputs.

        The result holds the (optionally encoded) categorical columns followed by the (optionally scaled)
        numerical columns, and keeps the row index of the source data so that it can be aligned with
        other frames/series derived from the same data.

        @param impute_strategy_numeric: strategy for imputing missing numeric values
                                        ('mean', 'median', 'constant')
        @param impute_strategy_categoric: strategy for imputing missing categoric values
                                          ('most_frequent', 'constant')
        @param fill_value_categoric: value to use for constant imputing strategy for categorical features
        @param fill_value_numeric: value to use for constant imputing strategy for numerical features
        @param scale_method: method for scaling numerical features ('standard', 'minmax', None);
                             any other value leaves the imputed values unscaled
        @param encode_categorical: whether to encode categorical features (True/False)
        @param selected_features: list of features to select, if None all features are used
        @return: pd.DataFrame
        """
        # Extract only the selected features and separate them into numerical and categorical.
        if selected_features is not None:
            chosen = set(selected_features)
            # Filter the declared lists instead of intersecting sets so the column order is deterministic.
            numeric_features = [col for col in self.numerical if col in chosen]
            categoric_features = [col for col in self.categorical if col in chosen]
            data_selected = self.data[selected_features]
        else:
            numeric_features = list(self.numerical)
            categoric_features = list(self.categorical)
            data_selected = self.data

        # Every frame built below reuses this index; rebuilding from raw arrays would otherwise
        # silently reset it to 0..n-1 and break label-based alignment later on.
        row_index = data_selected.index

        # Guard on the *selected* numeric columns: the loader may declare numeric features
        # that the caller chose to leave out.
        if numeric_features:
            # Impute numerical features. fill_value is only meaningful for the 'constant' strategy,
            # and the decision must depend on the numeric strategy, not the categorical one.
            numeric_imputer_args = {
                'strategy': impute_strategy_numeric,
                'missing_values': self.__missing_num,
            }
            if impute_strategy_numeric == 'constant':
                numeric_imputer_args['fill_value'] = fill_value_numeric
            numerical_imputer = SimpleImputer(**numeric_imputer_args)
            numerical_data = pd.DataFrame(
                numerical_imputer.fit_transform(data_selected[numeric_features]),
                columns=numeric_features,
                index=row_index,
            )

            # Scale numerical features; unrecognised methods (including None) mean "no scaling".
            if scale_method == 'standard':
                scaler = StandardScaler()
            elif scale_method == 'minmax':
                scaler = MinMaxScaler()
            else:
                scaler = None
            if scaler is not None:
                numerical_data = pd.DataFrame(
                    scaler.fit_transform(numerical_data),
                    columns=numeric_features,
                    index=row_index,
                )
        else:
            numerical_data = pd.DataFrame(index=row_index)

        if categoric_features:
            # Impute categorical features.
            categoric_imputer_args = {
                'strategy': impute_strategy_categoric,
                'missing_values': self.__missing_cat,
            }
            if impute_strategy_categoric == 'constant':
                categoric_imputer_args['fill_value'] = fill_value_categoric
            categorical_imputer = SimpleImputer(**categoric_imputer_args)
            categorical_data = pd.DataFrame(
                categorical_imputer.fit_transform(data_selected[categoric_features]),
                columns=categoric_features,
                index=row_index,
            )

            # Encode categorical features (one-hot, dropping the first level of each feature).
            if encode_categorical:
                categorical_data = pd.get_dummies(categorical_data, drop_first=True,
                                                  columns=categoric_features)
        else:
            categorical_data = pd.DataFrame(index=row_index)

        # Join preprocessed categorical and numerical features.
        return pd.concat([categorical_data, numerical_data], axis=1)

    def default_preprocess(self):
        """
        Changes the data attribute to be preprocessed using the default method.

        The preprocessed features are joined column-wise with `self.y` and duplicate rows are dropped.

        @return: None
        """
        preprocessed = self.get_default_preprocessed_features()
        self.data = pd.concat([preprocessed, self.y], axis=1).drop_duplicates()

    def preprocess(
            self,
            impute_strategy_numeric: str = 'mean',
            impute_strategy_categoric: str = 'most_frequent',
            fill_value_categoric: str = None,
            fill_value_numeric: float = None,
            scale_method: str = 'standard',
            encode_categorical: bool = True,
            selected_features: list = None
    ):
        """
        Changes the data attribute to be preprocessed based on parameters.

        The preprocessed features are joined column-wise with `self.y` and duplicate rows are dropped.

        @param impute_strategy_numeric: strategy for imputing missing numeric values
                                        ('mean', 'median', 'constant')
        @param impute_strategy_categoric: strategy for imputing missing categoric values
                                          ('most_frequent', 'constant')
        @param fill_value_categoric: value to use for constant imputing strategy for categorical features
        @param fill_value_numeric: value to use for constant imputing strategy for numerical features
        @param scale_method: method for scaling numerical features ('standard', 'minmax', None)
        @param encode_categorical: whether to encode categorical features (True/False)
        @param selected_features: list of features to select, if None all features are used
        @return: None
        """
        preprocessed = self.get_preprocessed_features(
            impute_strategy_numeric=impute_strategy_numeric,
            impute_strategy_categoric=impute_strategy_categoric,
            fill_value_categoric=fill_value_categoric,
            fill_value_numeric=fill_value_numeric,
            scale_method=scale_method,
            encode_categorical=encode_categorical,
            selected_features=selected_features
        )
        self.data = pd.concat([preprocessed, self.y], axis=1).drop_duplicates()