''' Random Relu Features. '''
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array, check_random_state
from sklearn.utils.validation import check_is_fitted
# Import the feature mapping base class
from MRCpy.phi import BasePhi
class RandomReLUPhi(BasePhi):
    r'''
    ReLU features

    Rectified Linear Unit (ReLU) features are given by:

    .. math:: z(x) = \max(w^t * (2\sigma^2,x), 0)

    where w is a vector (dimension d) of random weights uniformly distributed
    over a sphere of unit radius and
    :math:`\sigma` is the scaling parameter similar
    to the one in random Fourier features.

    ReLU function is defined as:

    .. math:: f(x) = \max(0, x)

    Note that when using ReLU feature mapping, training
    and testing instances are expected to be normalized.

    .. seealso:: For more information about ReLU Features check:

        [1] **ReLU Features:** `Sun, Y., Gilbert, A., & Tewari, A. (2019).
        On the approximation properties of random relu features.
        arXiv: Machine Learning.
        <https://arxiv.org/pdf/1810.04374.pdf>`_

        For more information about MRC, one can refer to the
        following resources:

        [2] `Mazuelas, S., Zanoni, A., & Pérez, A. (2020).
        Minimax Classification with 0-1 Loss and Performance Guarantees.
        Advances in Neural Information Processing Systems, 33, 302-312.
        <https://arxiv.org/abs/2010.07964>`_

        [3] `Mazuelas, S., Shen, Y., & Pérez, A. (2020).
        Generalized Maximum Entropy for Supervised Classification.
        arXiv preprint arXiv:2007.05447.
        <https://arxiv.org/abs/2007.05447>`_

        [4] `Bondugula, K., Mazuelas, S., & Pérez, A. (2021).
        MRCpy: A Library for Minimax Risk Classifiers.
        arXiv preprint arXiv:2108.01952.
        <https://arxiv.org/abs/2108.01952>`_

    Parameters
    ----------
    n_classes : `int`
        Number of classes in the dataset.

    fit_intercept : `bool`, default = `True`
        Whether to calculate the intercept.
        If set to false, no intercept will be used in calculations
        (i.e. data is expected to be already centered).

    one_hot : `bool`, default = `False`
        Controls the method used for evaluating the features of the
        given instances in the binary case.
        Only applies in the binary case, namely, only when there are two
        classes. If set to true, one-hot-encoding will be used. If set to
        false a more efficient shortcut will be performed.

    sigma : `str` or `float`, default = 'scale'
        When given a string, it defines the type of heuristic to be used
        to calculate the scaling parameter `sigma` using the data.
        For comparison its relation with parameter `gamma` used in
        other methods is :math:`\gamma=1/(2\sigma^2)`.
        When given a float, it is the value for the scaling parameter.

        'scale'
            Approximates `sigma` by
            :math:`\sqrt{\frac{\textrm{n_features} * \textrm{var}(X)}{2}}`
            so that `gamma` is
            :math:`\frac{1}{\textrm{n_features} * \textrm{var}(X)}`
            where `var` is the variance function.

        'avg_ann_50'
            Approximates `sigma` by the average distance to the
            :math:`50^{\textrm{th}}`
            nearest neighbour estimated from the dataset using
            the function `rff_sigma`.

    n_components : `int`, default = `600`
        Number of features which the transformer transforms the input into.

    random_state : `int`, `RandomState` instance, default = None
        Random seed used to produce the `random_weights_`
        used for the approximation of the gaussian kernel.

    Attributes
    ----------
    random_weights_ : `array`-like of shape (`n_features`, `n_components`)
        Random weights applied to the training samples as a step for
        computing the ReLU random features.

    is_fitted_ : `bool`
        Whether the feature mappings has learned its hyperparameters (if any)
        and the length of the feature mapping is set.

    len_ : `int`
        Length of the feature mapping vector.
    '''

    def __init__(self, n_classes, fit_intercept=True, sigma='scale',
                 n_components=600, random_state=None, one_hot=False):
        # Delegate the shared feature-mapping configuration to BasePhi.
        super().__init__(n_classes=n_classes, fit_intercept=fit_intercept,
                         one_hot=one_hot)
        self.sigma = sigma
        self.n_components = n_components
        self.random_state = random_state

    def fit(self, X, Y=None):
        '''
        Learns the set of random weights for computing the features space.
        Also, compute the scaling parameter if the value is not given.

        Parameters
        ----------
        X : `array`-like of shape (`n_samples`, `n_dimensions`)
            Unlabeled training instances
            used to learn the feature configurations.

        Y : `array`-like of shape (`n_samples`,), default = `None`
            This argument will never be used in this case.
            It is present for the consistency of signature of function
            among different feature mappings.

        Returns
        -------
        self :
            Fitted estimator

        Raises
        ------
        ValueError
            If `self.sigma` is a string other than 'scale' or 'avg_ann_50'.
        '''
        X = check_array(X, accept_sparse=True)
        d = X.shape[1]

        # Resolve the scaling parameter according to the value of self.sigma:
        # a heuristic name ('scale' / 'avg_ann_50') or an explicit number.
        if self.sigma == 'scale':
            self.sigma_val = np.sqrt((d * X.var()) / 2)
        elif self.sigma == 'avg_ann_50':
            self.sigma_val = self.rff_sigma(X)
        elif not isinstance(self.sigma, str):
            self.sigma_val = self.sigma
        else:
            raise ValueError('Unexpected value for sigma ...')

        # Obtain random weights uniformly distributed on a sphere of unit
        # radius in dimension d + 1 (the extra row pairs with the 2*sigma^2
        # component prepended to each instance by the feature mapping).
        # Step 1: draw gaussian samples with unit variance per dimension.
        self.random_state_ = check_random_state(self.random_state)
        self.random_weights_ = \
            self.random_state_.normal(0, 1, size=(d + 1, self.n_components))

        # Step 2: normalize each column so it lies on the unit sphere.
        self.random_weights_ = self.random_weights_ / \
            np.linalg.norm(self.random_weights_, axis=0)

        # Sets the length of the feature mapping.
        super().fit(X, Y)
        return self

    def rff_sigma(self, X):
        '''
        Computes the scaling parameter for the ReLU features
        using the heuristic given in the paper "Compact Nonlinear Maps
        and Circulant Extensions" :ref:`[1] <refpr>`.

        The heuristic states that the scaling parameter is obtained as
        the average distance to the 50th nearest neighbour.

        .. NOTE(review): the original docstring claimed the estimate uses
           1000 samples of the dataset, but this implementation queries all
           samples in `X` — confirm against the intended heuristic.

        .. _refpr:
        .. seealso:: [1] `Yu, F. X., Kumar, S., Rowley, H., & Chang,
                        S. F. (2015). Compact nonlinear maps and circulant
                        extensions.
                        <https://arxiv.org/pdf/1503.03893.pdf>`_

        Parameters
        ----------
        X : `array`-like of shape (`n_samples`, `n_dimensions`)
            Unlabeled instances.

        Returns
        -------
        sigma : `float` value
            Scaling parameter computed using the heuristic.

        Raises
        ------
        ValueError
            If `X` contains fewer than 2 samples, in which case no
            neighbour distance can be computed.
        '''
        n_samples = X.shape[0]
        if n_samples < 2:
            raise ValueError('At least 2 samples are required to estimate '
                             'sigma using the \'avg_ann_50\' heuristic')

        # Use the 50th neighbour when enough samples exist; otherwise fall
        # back to the farthest neighbour available (excluding the point
        # itself, which kneighbors reports at distance 0).
        neighbour_ind = 50 if n_samples >= 50 else n_samples - 2

        # n_neighbors is neighbour_ind + 1 because the query point itself
        # is returned as its own nearest neighbour.
        nbrs = NearestNeighbors(n_neighbors=(neighbour_ind + 1),
                                algorithm='ball_tree').fit(X)
        distances, _ = nbrs.kneighbors(X)

        # Average the distance to the chosen neighbour over all instances.
        sigma = np.average(distances[:, neighbour_ind])
        return sigma