Source code for robustx.robustness_evaluations.ApproximateDeltaRobustnessEvaluator

from robustx.lib.intabs.IntervalAbstractionPyTorch import IntervalAbstractionPytorch
from robustx.robustness_evaluations.ModelChangesRobustnessEvaluator import ModelChangesRobustnessEvaluator
from robustx.lib.tasks.Task import Task
import numpy as np
import torch.nn as nn

class ApproximateDeltaRobustnessEvaluator(ModelChangesRobustnessEvaluator):
    """
    A robustness evaluator that uses an Approximate Plausible Δ model shifts (APΔS)
    approach to evaluate the robustness of a model's predictions when a delta
    perturbation is applied to the model parameters.

    This class inherits from ModelChangesRobustnessEvaluator and uses a
    probabilistic approach to determine whether the model's prediction remains
    stable under model perturbations.

    Attributes:
        task (Task): The task to solve, inherited from ModelChangesRobustnessEvaluator.
        alpha (float): Confidence in the robustness guarantee.
        R (float): Fraction of model shifts for which the prediction should remain stable.
    """

    def __init__(self, ct: Task, alpha=0.999, R=0.995):
        """
        Initializes the ApproximateDeltaRobustnessEvaluator with a given task.

        @param ct: The task to solve, provided as a Task instance.
        @param alpha: Desired confidence in the robustness guarantee.
        @param R: Fraction of plausible model shifts for which the prediction
                  should remain stable.
        """
        super().__init__(ct)
        self.alpha = alpha
        self.R = R
        # Number of Monte-Carlo samples required by the (alpha, R) bound;
        # see the standalone check after the class definition.
        self.number_of_samples = np.ceil(np.log(1 - self.alpha) / np.log(self.R))
    def evaluate(self, ce, desired_outcome=0, delta=0.5, bias_delta=0):
        """
        Evaluates whether the model's prediction for a given counterfactual remains
        stable under random perturbations of the model's weights (and, optionally,
        its biases). See the usage sketch after the class definition.

        @param ce: The counterfactual explanation (input instance) to evaluate.
        @param desired_outcome: The desired output class (0 or 1). The evaluation
                                checks whether the model's output keeps matching it.
        @param delta: The maximum allowable perturbation of each weight.
        @param bias_delta: The maximum allowable perturbation of each bias;
                           biases are left unchanged when it is 0.
        @return: 1 if the prediction is robust across all sampled model shifts,
                 0 otherwise.
        """
        # Store the unperturbed weights and biases of every linear layer.
        old_weights = {}
        old_biases = {}
        i = 0
        for layer in self.task.model.get_torch_model():
            if isinstance(layer, nn.Linear):
                old_weights[i] = layer.weight.detach().numpy()
                old_biases[i] = layer.bias.detach().numpy()
                i += 1

        for _ in range(int(self.number_of_samples)):
            # Forward pass through a randomly perturbed copy of the network.
            input_features = ce.detach().numpy()
            for l in range(len(old_weights)):
                layer_weights = old_weights[l]
                layer_biases = old_biases[l]

                # Sample a uniform perturbation for this layer's parameters.
                weights_perturbation = np.random.uniform(-delta, delta, layer_weights.shape)
                layer_weights = layer_weights + weights_perturbation
                if bias_delta > 0:
                    biases_perturbation = np.random.uniform(-bias_delta, bias_delta, layer_biases.shape)
                    layer_biases = layer_biases + biases_perturbation

                preactivated_res = np.dot(input_features, layer_weights.T) + layer_biases
                if l != len(old_weights) - 1:
                    # ReLU on hidden layers.
                    activated_res = np.maximum(0.0, preactivated_res)
                else:
                    # Sigmoid on the output layer.
                    activated_res = 1 / (1 + np.exp(-preactivated_res))

                input_features = activated_res

            # A single flipped prediction is enough to declare non-robustness.
            if (input_features.item() < 0.5 and desired_outcome == 1) or \
               (input_features.item() >= 0.5 and desired_outcome == 0):
                return 0

        return 1
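# The sample count used in __init__ follows the standard probabilistic bound
# N >= log(1 - alpha) / log(R): if N independently sampled model shifts all
# preserve the desired outcome, then with confidence alpha the outcome is
# stable for at least a fraction R of plausible shifts. The standalone check
# below is illustrative only and not part of the library.
if __name__ == "__main__":
    alpha, R = 0.999, 0.995
    n_samples = int(np.ceil(np.log(1 - alpha) / np.log(R)))
    print(n_samples)  # 1379 sampled model shifts for the default alpha and R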
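# Minimal usage sketch with a stand-in Task. The real Task construction is
# library-specific; the stubs below are hypothetical and expose only the
# attribute chain that evaluate() actually touches
# (task.model.get_torch_model()), assuming the base
# ModelChangesRobustnessEvaluator merely stores the task it is given.
if __name__ == "__main__":
    import torch

    class _StubModel:
        def __init__(self, net):
            self._net = net

        def get_torch_model(self):
            return self._net

    class _StubTask:
        def __init__(self, net):
            self.model = _StubModel(net)

    # Tiny binary classifier matching the ReLU-hidden / sigmoid-output
    # structure that evaluate() assumes.
    net = nn.Sequential(nn.Linear(2, 4), nn.ReLU(), nn.Linear(4, 1), nn.Sigmoid())
    evaluator = ApproximateDeltaRobustnessEvaluator(_StubTask(net))
    ce = torch.tensor([0.5, -1.0])
    print(evaluator.evaluate(ce, desired_outcome=1, delta=0.05))  # 1 if robust, else 0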