''' Random Relu Features. '''
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array, check_random_state
from sklearn.utils.validation import check_is_fitted
# Import the feature mapping base class
from MRCpy.phi import BasePhi
class RandomReLUPhi(BasePhi):
    r'''
    ReLU features

    Rectified Linear Unit (ReLU) features are given by:

    .. math:: z(x) = \max(w^t * (2\sigma^2,x), 0)

    where w is a vector (dimension d) of random weights uniformly distributed
    over a sphere of unit radius and
    :math:`\sigma` is the scaling parameter similar
    to the one in random Fourier features.

    ReLU function is defined as:

    .. math:: f(x) = \max(0, x)

    Note that when using ReLU feature mapping, training
    and testing instances are expected to be normalized.

    .. seealso:: For more information about ReLU Features check:

        [1] **ReLU Features:** `Sun, Y., Gilbert, A., & Tewari, A. (2019).
        On the approximation properties of random relu features.
        arXiv: Machine Learning.
        <https://arxiv.org/pdf/1810.04374.pdf>`_

        For more information about MRC, one can refer to the
        following resources:

        [2] `Mazuelas, S., Zanoni, A., & Pérez, A. (2020).
        Minimax Classification with 0-1 Loss and Performance Guarantees.
        Advances in Neural Information Processing Systems, 33, 302-312.
        <https://arxiv.org/abs/2010.07964>`_

        [3] `Mazuelas, S., Shen, Y., & Pérez, A. (2020).
        Generalized Maximum Entropy for Supervised Classification.
        arXiv preprint arXiv:2007.05447.
        <https://arxiv.org/abs/2007.05447>`_

        [4] `Bondugula, K., Mazuelas, S., & Pérez, A. (2021).
        MRCpy: A Library for Minimax Risk Classifiers.
        arXiv preprint arXiv:2108.01952.
        <https://arxiv.org/abs/2108.01952>`_

    Parameters
    ----------
    n_classes : `int`
        Number of classes in the dataset.

    fit_intercept : `bool`, default = `True`
        Whether to calculate the intercept.
        If set to false, no intercept will be used in calculations
        (i.e. data is expected to be already centered).

    one_hot : `bool`, default = `False`
        Controls the method used for evaluating the features of the
        given instances in the binary case.
        Only applies in the binary case, namely, only when there are two
        classes. If set to true, one-hot-encoding will be used. If set to
        false a more efficient shortcut will be performed.

    sigma : `str` or `float`, default = 'scale'
        When given a string, it defines the type of heuristic to be used
        to calculate the scaling parameter `sigma` using the data.
        For comparison its relation with parameter `gamma` used in
        other methods is :math:`\gamma=1/(2\sigma^2)`.
        When given a float, it is the value for the scaling parameter.

        'scale'
            Approximates `sigma` by
            :math:`\sqrt{\frac{\textrm{n_features} * \textrm{var}(X)}{2}}`
            so that `gamma` is
            :math:`\frac{1}{\textrm{n_features} * \textrm{var}(X)}`
            where `var` is the variance function.

        'avg_ann_50'
            Approximates `sigma` by the average distance to the
            :math:`50^{\textrm{th}}`
            nearest neighbour estimated from the dataset using
            the function `rff_sigma`.

    n_components : `int`, default = `600`
        Number of features which the transformer transforms the input into.

    random_state : `int`, `RandomState` instance, default = None
        Random seed used to produce the `random_weights_`
        used for the approximation of the gaussian kernel.

    Attributes
    ----------
    random_weights_ : `array`-like of shape (`n_features`, `n_components`)
        Random weights applied to the training samples as a step for
        computing the ReLU random features.

    is_fitted_ : `bool`
        Whether the feature mappings has learned its hyperparameters (if any)
        and the length of the feature mapping is set.

    len_ : `int`
        Length of the feature mapping vector.
    '''

    def __init__(self, n_classes, fit_intercept=True, sigma='scale',
                 n_components=600, random_state=None, one_hot=False):
        # Delegate the shared feature-mapping configuration to BasePhi.
        super().__init__(n_classes=n_classes, fit_intercept=fit_intercept,
                         one_hot=one_hot)
        self.sigma = sigma
        self.n_components = n_components
        self.random_state = random_state

    def fit(self, X, Y=None):
        '''
        Learns the set of random weights for computing the features space.
        Also, compute the scaling parameter if the value is not given.

        Parameters
        ----------
        X : `array`-like of shape (`n_samples`, `n_dimensions`)
            Unlabeled training instances
            used to learn the feature configurations.

        Y : `array`-like of shape (`n_samples`,), default = `None`
            This argument will never be used in this case.
            It is present for the consistency of signature of function
            among different feature mappings.

        Returns
        -------
        self :
            Fitted estimator

        Raises
        ------
        ValueError
            If `self.sigma` is a string other than 'scale' or 'avg_ann_50'.
        '''
        X = check_array(X, accept_sparse=True)
        d = X.shape[1]

        # Resolve the scaling parameter according to the value of self.sigma:
        # a heuristic name ('scale' / 'avg_ann_50') or an explicit number.
        if self.sigma == 'scale':
            self.sigma_val = np.sqrt((d * X.var()) / 2)
        elif self.sigma == 'avg_ann_50':
            self.sigma_val = self.rff_sigma(X)
        elif not isinstance(self.sigma, str):
            self.sigma_val = self.sigma
        else:
            raise ValueError('Unexpected value for sigma ...')

        # Obtain random weights uniformly distributed on a sphere of unit
        # radius in dimension d + 1 (the extra row pairs with the 2*sigma^2
        # component prepended to each instance by the feature mapping).
        # Step 1: draw gaussian samples with unit variance per dimension.
        self.random_state_ = check_random_state(self.random_state)
        self.random_weights_ = \
            self.random_state_.normal(0, 1, size=(d + 1, self.n_components))

        # Step 2: normalize each column so it lies on the unit sphere.
        self.random_weights_ = self.random_weights_ / \
            np.linalg.norm(self.random_weights_, axis=0)

        # Sets the length of the feature mapping.
        super().fit(X, Y)
        return self

    def rff_sigma(self, X):
        '''
        Computes the scaling parameter for the ReLU features
        using the heuristic given in the paper "Compact Nonlinear Maps
        and Circulant Extensions" :ref:`[1] <refpr>`.

        The heuristic states that the scaling parameter is obtained as
        the average distance to the 50th nearest neighbour.

        .. NOTE(review): the original docstring claimed the estimate uses
           1000 samples of the dataset, but this implementation queries all
           samples in `X` — confirm against the intended heuristic.

        .. _refpr:
        .. seealso:: [1] `Yu, F. X., Kumar, S., Rowley, H., & Chang,
                        S. F. (2015). Compact nonlinear maps and circulant
                        extensions.
                        <https://arxiv.org/pdf/1503.03893.pdf>`_

        Parameters
        ----------
        X : `array`-like of shape (`n_samples`, `n_dimensions`)
            Unlabeled instances.

        Returns
        -------
        sigma : `float` value
            Scaling parameter computed using the heuristic.

        Raises
        ------
        ValueError
            If `X` contains fewer than 2 samples, in which case no
            neighbour distance can be computed.
        '''
        n_samples = X.shape[0]
        if n_samples < 2:
            raise ValueError('At least 2 samples are required to estimate '
                             'sigma using the \'avg_ann_50\' heuristic')

        # Use the 50th neighbour when enough samples exist; otherwise fall
        # back to the farthest neighbour available (excluding the point
        # itself, which kneighbors reports at distance 0).
        neighbour_ind = 50 if n_samples >= 50 else n_samples - 2

        # n_neighbors is neighbour_ind + 1 because the query point itself
        # is returned as its own nearest neighbour.
        nbrs = NearestNeighbors(n_neighbors=(neighbour_ind + 1),
                                algorithm='ball_tree').fit(X)
        distances, _ = nbrs.kneighbors(X)

        # Average the distance to the chosen neighbour over all instances.
        sigma = np.average(distances[:, neighbour_ind])
        return sigma