Source code for robustx.datasets.provided_datasets.TitanicDatasetLoader

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from robustx.datasets.provided_datasets.ExampleDatasetLoader import ExampleDatasetLoader


class TitanicDatasetLoader(ExampleDatasetLoader):

    def __init__(self, seed):
        categoricals = ["Pclass", "Sex", "Embarked", "Cabin"]
        numericals = ["Age", "SibSp", "Parch", "Fare"]
        super().__init__(categoricals, numericals, seed=seed)

    @property
    def X(self) -> pd.DataFrame:
        return self.data.drop(columns=["Survived"])

    @property
    def y(self) -> pd.Series:
        return self.data["Survived"]
    def load_data(self):
        url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
        self._data = pd.read_csv(url)
    def get_default_preprocessed_features(self) -> pd.DataFrame:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numerical),
                ('cat', categorical_transformer, self.categorical)
            ])

        # Impute and preprocess the features (target column excluded)
        data_features = self._data.drop(columns=["Survived"])

        # Log the presence of NaNs before preprocessing
        print("NaNs before preprocessing:")
        print(data_features.isna().sum())

        data_preprocessed = preprocessor.fit_transform(data_features)

        # The ColumnTransformer may return a sparse matrix (from one-hot encoding);
        # densify it so the result can be wrapped in a labelled DataFrame
        if not isinstance(data_preprocessed, np.ndarray):
            data_preprocessed = data_preprocessed.toarray()
        data_preprocessed = pd.DataFrame(
            data_preprocessed,
            columns=self.get_feature_names(preprocessor, self.categorical, self.numerical))

        # Log the presence of NaNs after preprocessing
        print("NaNs after preprocessing:")
        print(data_preprocessed.isna().sum())

        return data_preprocessed
    def get_feature_names(self, preprocessor, categorical_features, numerical_features):
        categorical_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(
            categorical_features)
        all_feature_names = list(numerical_features) + list(categorical_names)
        return all_feature_names
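
A minimal usage sketch of this loader is shown below. The seed value and the printed outputs are illustrative assumptions, and it assumes `ExampleDatasetLoader` exposes the loaded frame via the `data` property that the `X` and `y` properties above rely on.

# Illustrative usage sketch; seed=0 is an arbitrary choice
loader = TitanicDatasetLoader(seed=0)
loader.load_data()  # downloads the Titanic CSV into the loader

features = loader.get_default_preprocessed_features()  # imputed, scaled, one-hot encoded features
target = loader.y  # the "Survived" column

print(features.shape)          # e.g. (number of passengers, number of encoded features)
print(target.value_counts())   # class balance of the target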