Source code for robustx.lib.intabs.IntervalAbstractionPyTorch

from robustx.lib.models.pytorch_models.SimpleNNModel import SimpleNNModel



[docs]
class IntervalAbstractionPytorch:
    """
    Converts a provided PyTorch neural network into a representation of an Interval Neural Network

        ...

    Attributes / Properties
    -------

    bias_intervals: Dict[str, [lower, upper]]
        The interval for each bias in the NN with key 'bias_into_l{layer_idx}_n{dest_idx}', where the input layer
        has a layer_idx of 0 (so the first set of biases feeds layer 1)

    weight_intervals: Dict[str, [lower, upper]]
        The interval for each weight in the NN with key
        'weight_l{layer_idx}_n{src_idx}_to_l{layer_idx + 1}_n{dest_idx}', where the input layer has a
        layer_idx of 0

    layers: list[int]
        Stores the number of nodes in each layer in a list (input, hidden..., output)

    model: SimpleNNModel
        The original model

    delta: float
        The perturbation to each weight in the Neural Network

    bias_delta: float
        The perturbation to each bias in the Neural Network

    -------

    Methods
    -------

    create_weights_and_bias_dictionary() -> (dict, dict):
        Returns the weight intervals and bias intervals built from the model's current parameters

    -------
    """

    def __init__(self, model: "SimpleNNModel", delta: float, bias_delta=None):
        """
        @param model: SimpleNNModel, the Neural network to create an INN of
        @param delta: float, perturbation to weights
        @param bias_delta: float, perturbation to biases; when None, delta is used
        """
        self.layers = [model.input_dim] + model.hidden_dim + [model.output_dim]
        self.model = model
        self.delta = delta
        # Only fall back to delta when no bias perturbation was supplied at all,
        # so an explicit bias_delta of 0 is respected.
        self.bias_delta = delta if bias_delta is None else bias_delta
        self.weight_intervals, self.bias_intervals = self.create_weights_and_bias_dictionary()

    def create_weights_and_bias_dictionary(self):
        """
        Generates the intervals for each weight and bias in the Neural Network

        @return: (weight_dict, bias_dict), where each value is a two-element list
                 [value - perturbation, value + perturbation]; weights use self.delta
                 and biases use self.bias_delta
        """

        # Extract the weights and biases as numpy arrays for each layer.
        # .cpu() is required because Tensor.numpy() only works on CPU tensors;
        # it is a no-op when the parameters already live on the CPU.
        params = {
            name: param.detach().cpu().numpy()
            for name, param in self.model.get_torch_model().named_parameters()
        }

        weight_dict = {}
        bias_dict = {}

        # Parameters are looked up as '{2 * layer_idx}.weight' / '{2 * layer_idx}.bias',
        # i.e. one weight/bias pair per layer.
        # NOTE(review): presumably the torch model is a Sequential with the
        # parameterised layers at even positions - confirm against SimpleNNModel.
        for layer_idx in range(len(params) // 2):

            # Get weights and biases
            weights = params[f'{layer_idx * 2}.weight']
            biases = params[f'{layer_idx * 2}.bias']

            # weights is indexed as [destination node, source node]
            for dest_idx in range(weights.shape[0]):

                # Set the interval for biases
                bias = biases[dest_idx]
                bias_key = f'bias_into_l{layer_idx + 1}_n{dest_idx}'
                bias_dict[bias_key] = [bias - self.bias_delta, bias + self.bias_delta]

                for src_idx in range(weights.shape[1]):
                    # Set the interval for weights
                    weight = weights[dest_idx, src_idx]
                    weight_key = f'weight_l{layer_idx}_n{src_idx}_to_l{layer_idx + 1}_n{dest_idx}'
                    weight_dict[weight_key] = [weight - self.delta, weight + self.delta]

        return weight_dict, bias_dict