Source code for robustx.generators.robust_CE_methods.STCE

import numpy as np
import pandas as pd
import torch

from robustx.generators.CEGenerator import CEGenerator
from robustx.robustness_evaluations.DeltaRobustnessEvaluator import DeltaRobustnessEvaluator
from robustx.robustness_evaluations.ModelChangesRobustnessEvaluator import ModelChangesRobustnessEvaluator


class STCE(CEGenerator):
    """
    A counterfactual explanation generator that uses the T-Rex method for finding
    robust counterfactual explanations.

    Inherits from the CEGenerator class and implements _generation_method to find
    counterfactual examples with robustness checks using a specified base method and
    evaluator. The method iterates over positively-labelled training instances,
    nearest first, and returns the first one that is robust and has a stable
    counterfactual.

    Attributes:
        None specific to this class, but utilizes the task and model from the
        CEGenerator base class.
    """

    def _generation_method(self, instance,
                           robustness_check: ModelChangesRobustnessEvaluator.__class__ = DeltaRobustnessEvaluator,
                           column_name="target", neg_value=0,
                           K=40, threshold=0.4, **kwargs):
        """
        Generates a counterfactual explanation using the T-Rex method.

        @param instance: The instance for which to generate a counterfactual. Can be a DataFrame or Series.
        @param robustness_check: The robustness evaluator used to check model changes with respect to input perturbations.
        @param column_name: The name of the target column.
        @param neg_value: The value considered negative in the target variable.
        @param K: The number of samples for stability evaluation.
        @param threshold: The threshold for counterfactual stability.
        @param kwargs: Additional keyword arguments passed to the evaluator.

        @return: A DataFrame containing the counterfactual explanation if found, otherwise the original instance.
        """
        # Candidate counterfactuals: training instances labelled with the desired
        # (positive) class, i.e. anything other than neg_value
        positives = self.task.training_data.data[
            self.task.training_data.data[column_name] != neg_value
        ].drop(columns=[column_name])

        # Compute Euclidean distances between the instance and each positive sample
        instance_values = instance.values.flatten()  # Flatten the instance to a 1-D feature vector
        positives['distance'] = positives.apply(
            lambda x: np.linalg.norm(x.values - instance_values), axis=1
        )

        # Sort positives by distance, nearest first
        positives = positives.sort_values(by='distance')
        positives = positives.drop(columns=["distance"])

        evaluator = robustness_check(self.task)

        # Return the nearest positive that passes the robustness check and whose
        # counterfactual stability exceeds the threshold
        for _, positive in positives.iterrows():
            if evaluator.evaluate(positive, desired_output=1 - neg_value, **kwargs):
                val = self.counterfactual_stability(positive)
                if val > threshold:
                    return pd.DataFrame(positive).T

        # No robust, stable candidate found: fall back to the original instance
        return pd.DataFrame(instance).T
    def counterfactual_stability(self, xp):
        """
        Evaluates the stability of a given counterfactual instance.

        @param xp: The instance for which to evaluate counterfactual stability.

        @return: A tensor representing the stability score of the counterfactual.
        """
        k = 1000

        # Predict probability for the given instance
        score_x = self.task.model.predict_proba(xp)

        # Repeat the positive-class score k times, one row per Gaussian sample
        score_x = pd.DataFrame([score_x[1]] * k)
        score_x.reset_index(drop=True, inplace=True)

        # Generate k Gaussian samples centred on the input instance (sigma = 0.1)
        gaussian_samples = np.random.normal(xp.to_numpy(), 0.1, (k, len(xp.T)))

        # Get the model's positive-class scores for the Gaussian samples
        model_scores = pd.DataFrame(self.task.model.predict_proba(gaussian_samples)[1])
        model_scores.columns = range(model_scores.shape[1])

        # Stability score: mean over samples of score - |score - score_x|, which is
        # high when the neighbourhood scores are both high and close to the
        # instance's own score
        res = torch.tensor(
            np.sum((model_scores.values - abs(model_scores.values - score_x.values))
                   / len(model_scores.values)),
            requires_grad=True
        )
        return res
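
For intuition, the stability score computed in counterfactual_stability above reduces to the following self-contained sketch. Here predict_proba is a stand-in for any callable mapping an (n, d) array to positive-class probabilities; the function name and parameters are illustrative, not part of robustx.

import numpy as np

def counterfactual_stability_score(xp, predict_proba, k=1000, sigma=0.1, rng=None):
    """Mean of s_i - |s_i - s_x| over k Gaussian perturbations of xp (a 1-D array)."""
    rng = np.random.default_rng() if rng is None else rng
    s_x = predict_proba(xp.reshape(1, -1))[0]           # score of the candidate itself
    samples = rng.normal(xp, sigma, size=(k, xp.size))  # perturb each feature of xp
    s = predict_proba(samples)                          # scores of the perturbed points
    # High when the neighbourhood scores are high AND close to the candidate's own score
    return float(np.mean(s - np.abs(s - s_x)))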
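
And a condensed, library-free sketch of the search loop in _generation_method, assuming the helper above; is_robust stands in for the DeltaRobustnessEvaluator check, and all names are hypothetical. Scanning candidates nearest-first keeps the returned counterfactual close to the query while the robustness and stability filters screen out fragile ones.

import numpy as np
import pandas as pd

def find_stable_counterfactual(instance, data, predict_proba, is_robust,
                               column_name="target", neg_value=0, threshold=0.4):
    # Candidates: positively-labelled training rows, sorted nearest-first
    positives = data[data[column_name] != neg_value].drop(columns=[column_name])
    x = instance.values.flatten().astype(float)
    order = np.linalg.norm(positives.values.astype(float) - x, axis=1).argsort()
    for _, candidate in positives.iloc[order].iterrows():
        c = candidate.to_numpy(dtype=float)
        if is_robust(c) and counterfactual_stability_score(c, predict_proba) > threshold:
            return pd.DataFrame(candidate).T
    return pd.DataFrame(instance).T  # fall back to the query instance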