Source code for robustx.datasets.provided_datasets.AdultDatasetLoader

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from robustx.datasets.provided_datasets.ExampleDatasetLoader import ExampleDatasetLoader


class AdultDatasetLoader(ExampleDatasetLoader):
    """Dataset loader for the UCI Adult data (``adult.data``).

    The raw table is fetched by :meth:`load_data` and kept in ``self._data``.
    The ``income`` column is the label; all other columns are features.
    """

    # Label column of the raw table.
    _TARGET_COLUMN = "income"

    # Integer encoding applied to the raw income labels.
    _INCOME_ENCODING = {"<=50K": 0, ">50K": 1}

    def __init__(self, seed=None):
        """Register the categorical/numerical column names with the parent.

        Args:
            seed: Forwarded unchanged to the parent constructor
                (presumably a random seed -- confirm against
                ``ExampleDatasetLoader``).
        """
        categoricals = [
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "native-country",
        ]
        numericals = [
            "age",
            "fnlwgt",
            "education-num",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
        ]
        super().__init__(categoricals, numericals, missing_val_cat='?', seed=seed)

    @property
    def X(self) -> pd.DataFrame:
        """Feature columns: the loaded table without ``income``."""
        return self._data.drop(columns=[self._TARGET_COLUMN])

    @property
    def y(self) -> pd.DataFrame:
        """Label column, returned as a single-column DataFrame.

        The double-bracket selection yields a DataFrame rather than a Series;
        that shape is kept so existing callers are unaffected.
        """
        return self._data[[self._TARGET_COLUMN]]

    def load_data(self):
        """Download ``adult.data`` from the UCI repository into ``self._data``.

        The file has no header row, so the column names are supplied here.
        """
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        column_names = [
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education-num",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
            "native-country",
            "income",
        ]
        # NOTE(review): with skipinitialspace=True the fields presumably arrive
        # as "?" rather than " ?", so this na_values marker may never match.
        # Left unchanged because missing_val_cat='?' suggests the parent class
        # handles literal '?' values itself -- confirm before altering.
        self._data = pd.read_csv(
            url,
            names=column_names,
            na_values=" ?",
            skipinitialspace=True,
        )

    def get_default_preprocessed_features(self) -> pd.DataFrame:
        """Return standardised numeric and one-hot encoded categorical features.

        Rows of ``self._data`` containing any NaN are dropped first. The
        columns named in ``self.numerical`` are passed through a
        ``StandardScaler`` and those in ``self.categorical`` through a
        ``OneHotEncoder``.

        Side effect: the ``income`` column of ``self._data`` is encoded in
        place to 0 (``<=50K``) / 1 (``>50K``). The encoding is applied only
        while the column is still non-numeric, so calling this method again
        does not overwrite the encoded labels with NaN.

        Returns:
            A DataFrame of the transformed features with a default integer
            index and integer column labels. Rows are in the order of
            ``self._data`` after NaN rows were removed, so the index is
            positional and does not carry over the original row labels.
        """
        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler()),
        ])
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numerical),
                ('cat', categorical_transformer, self.categorical),
            ])

        complete_rows = self._data.dropna()
        features = complete_rows.drop(columns=[self._TARGET_COLUMN])
        transformed = preprocessor.fit_transform(features)
        # ColumnTransformer returns a sparse matrix only when the stacked
        # output is sparse enough; densify when needed instead of assuming it.
        if hasattr(transformed, "toarray"):
            transformed = transformed.toarray()
        features_df = pd.DataFrame(transformed)

        # Encode the labels once. Mapping already-encoded 0/1 values through
        # the string-keyed dict would turn every label into NaN.
        income = self._data[self._TARGET_COLUMN]
        if not pd.api.types.is_numeric_dtype(income):
            self._data[self._TARGET_COLUMN] = income.map(self._INCOME_ENCODING)

        return features_df