Skip to content

KernelRegressionNW

Nadaraya-Watson kernel regression is a non-parametric technique used for estimating the conditional expectation of a random variable. It works by placing a kernel function at each training data point and computing a weighted average of the target values, where weights are determined by the proximity of the query point to each training point.

This class is a simple implementation of the Nadaraya-Watson kernel regression estimator, designed for use with scikit-learn.

Bases: MultiOutputMixin, RegressorMixin, BaseEstimator

Source code in src/nadaraya_watson/kernel_regression.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
class KernelRegressionNW(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Nadaraya-Watson kernel regression estimator with a scikit-learn style API.

    ``fit`` memorises the training data; ``predict`` returns, for every query
    sample, an average of the training targets weighted by a kernel applied to
    the pairwise distances (scaled by the bandwidth) between the query sample
    and each training sample.
    """

    # "noqa: RUF012"  : this is the sklearn default approach to parameter constraints
    _parameter_constraints = {  # noqa: RUF012
        "bandwidth": [
            Interval(Real, 0, None, closed="neither"),
            StrOptions({"scott", "silverman"}),
        ],
        "kernel": [StrOptions(set(VALID_KERNELS))],
        "metric": [StrOptions(set(VALID_METRICS))],
    }

    def __init__(
        self,
        *,
        bandwidth: Union[float, Literal["scott", "silverman"]] = 1.0,
        kernel: str = "gaussian",
        metric: str = "euclidean",
    ):
        """Initialize the Nadaraya-Watson Kernel regression estimator.

        Parameters
        ----------
        bandwidth : float or str, default=1.0
            The bandwidth of the kernel. If a string, it must be one of "scott" or "silverman".
        kernel : str, default="gaussian"
            The kernel to use. Must be one of the valid kernels.
        metric : str, default="euclidean"
            The distance metric to use. Must be one of the valid metrics.
        """
        # Parameters are stored untouched; the effective bandwidth is derived in ``fit``.
        self.bandwidth = bandwidth
        self.kernel = kernel
        self.metric = metric

    @classmethod
    def valid_metrics(cls) -> list[str]:
        """Return a list of valid metrics.

        Please note that some names are actually synonymous.
        Please do not feed these values directly to sklearn for grid-search cross-validation.

        e.g. "euclidean" and "l2" are identical.

        Returns
        -------
        list of str
            A list of valid metric names.
        """
        return list(VALID_METRICS.keys())

    @classmethod
    def valid_kernels(cls) -> list[str]:
        """Return a list of valid kernels.

        Returns
        -------
        list of str
            A list of valid kernel names.
        """
        return list(VALID_KERNELS)

    @_fit_context(
        # Keep full parameter validation active for any estimator call made inside fit.
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, sample_weight=None):
        """Fit the Nadaraya-Watson Kernel regression estimator on the data.

        Parameters
        ----------
        X : array-like
            array of shape (n_samples, n_features)

            Training data.
        y : array-like
            array of shape (n_samples,) or (n_samples, n_targets)

            Target values.
        sample_weight : array-like
            array of shape (n_samples,)

            Individual weights for each sample; ignored if None is passed.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        # Validate first: X may be a list or another array-like without a
        # ``shape`` attribute, and the bandwidth rules below need its shape.
        X, y = validate_data(self, X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True)

        if isinstance(self.bandwidth, str):
            n_samples, n_features = X.shape
            if self.bandwidth == "scott":
                self.bandwidth_ = n_samples ** (-1 / (n_features + 4))
            elif self.bandwidth == "silverman":
                self.bandwidth_ = (n_samples * (n_features + 2) / 4) ** (-1 / (n_features + 4))
        else:
            self.bandwidth_ = self.bandwidth

        # A single check covers scalars and arrays alike: it converts to a
        # float64 array aligned with X and rejects negative weights.
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64, ensure_non_negative=True)

        # Training data is kept as-is; all the work happens at prediction time.
        self.X_fit = X
        self.y = y
        self.sample_weight = sample_weight

        return self

    def predict(self, X):
        """Predict using Nadaraya-Watson Kernel regression estimator.

        Parameters
        ----------
        X : array-like
            array of shape (n_samples, n_features)

            Samples.

        Returns
        -------
        y_prediction : ndarray
            array of shape (n_samples,) or (n_samples, n_targets), matching
            the dimensionality of the targets passed to ``fit``.

            Returns predicted values.
        """
        check_is_fitted(self)
        X = validate_data(self, X, accept_sparse=("csr", "csc"), reset=False)

        # shape (n_samples_X_fit, n_samples_X_predict)
        distance_matrix = pairwise_distances(self.X_fit, X, metric=self.metric)
        log_density = KERNELS[self.kernel](distance_matrix / self.bandwidth_)

        # Removing the max for numerical stability
        # Removing a warning when subtracting -np.inf from a line of -np.inf : this produces a NaN and is expected behavior
        with np.errstate(invalid="ignore", divide="ignore"):
            log_density -= np.max(log_density, axis=0, keepdims=True)

        # Broadcast self.sample_weight to shape (n_samples_X_fit, n_samples_X_predict) from (n_samples_X_fit, )
        weight = np.exp(log_density)
        if self.sample_weight is not None:
            weight *= self.sample_weight[:, None]
        # Normalise each query column so its weights sum to one.
        weight /= np.sum(weight, axis=0, keepdims=True)

        # Weighted sum of the training targets. The ellipsis lets the same
        # contraction handle 1-D targets (n_samples,) and 2-D targets
        # (n_samples, n_targets); a fixed "ik" subscript fails on 1-D targets.
        y_prediction = np.einsum("ij,i...->j...", weight, self.y)

        return y_prediction

__init__(*, bandwidth=1.0, kernel='gaussian', metric='euclidean')

Initialize the Nadaraya-Watson Kernel regression estimator.

Parameters:

Name Type Description Default
bandwidth float or str

The bandwidth of the kernel. If a string, it must be one of "scott" or "silverman".

1.0
kernel str

The kernel to use. Must be one of the valid kernels.

"gaussian"
metric str

The distance metric to use. Must be one of the valid metrics.

"euclidean"
Source code in src/nadaraya_watson/kernel_regression.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def __init__(
    self,
    *,
    bandwidth: Union[float, Literal["scott", "silverman"]] = 1.0,
    kernel: str = "gaussian",
    metric: str = "euclidean",
):
    """Store the hyper-parameters of the Nadaraya-Watson kernel regressor.

    Parameters
    ----------
    bandwidth : float or str, default=1.0
        Kernel bandwidth. When given as a string it has to be either
        "scott" or "silverman".
    kernel : str, default="gaussian"
        Name of the kernel; has to be one of the valid kernels.
    metric : str, default="euclidean"
        Name of the distance metric; has to be one of the valid metrics.
    """
    # The three assignments are independent; values are stored verbatim.
    self.metric = metric
    self.kernel = kernel
    self.bandwidth = bandwidth

fit(X, y, sample_weight=None)

Fit the Nadaraya-Watson Kernel regression estimator on the data.

Parameters:

Name Type Description Default
X array-like

array of shape (n_samples, n_features)

Training data.

required
y array-like

array of shape (n_samples,) or (n_samples, n_targets)

Target values.

required
sample_weight array-like

array of shape (n_samples,). Individual weights for each sample; ignored if None is passed.

None

Returns:

Name Type Description
self object

Returns the instance itself.

Source code in src/nadaraya_watson/kernel_regression.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
@_fit_context(
    # Keep full parameter validation active for any estimator call made inside fit.
    prefer_skip_nested_validation=False
)
def fit(self, X, y, sample_weight=None):
    """Fit the Nadaraya-Watson Kernel regression estimator on the data.

    Parameters
    ----------
    X : array-like
        array of shape (n_samples, n_features)

        Training data.
    y : array-like
        array of shape (n_samples,) or (n_samples, n_targets)

        Target values.
    sample_weight : array-like
        array of shape (n_samples,)

        Individual weights for each sample; ignored if None is passed.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    # Validate first: X may be a list or another array-like without a
    # ``shape`` attribute, and the bandwidth rules below need its shape.
    X, y = validate_data(self, X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True)

    if isinstance(self.bandwidth, str):
        n_samples, n_features = X.shape
        if self.bandwidth == "scott":
            self.bandwidth_ = n_samples ** (-1 / (n_features + 4))
        elif self.bandwidth == "silverman":
            self.bandwidth_ = (n_samples * (n_features + 2) / 4) ** (-1 / (n_features + 4))
    else:
        self.bandwidth_ = self.bandwidth

    # A single check covers scalars and arrays alike: it converts to a
    # float64 array aligned with X and rejects negative weights.
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64, ensure_non_negative=True)

    # Training data is kept as-is; all the work happens at prediction time.
    self.X_fit = X
    self.y = y
    self.sample_weight = sample_weight

    return self

predict(X)

Predict using Nadaraya-Watson Kernel regression estimator.

Parameters:

Name Type Description Default
X array-like

array of shape (n_samples, n_features). Samples.

required

Returns:

Name Type Description
y_prediction ndarray

array of shape (n_samples,) or (n_samples, n_targets). Returns predicted values.

Source code in src/nadaraya_watson/kernel_regression.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def predict(self, X):
    """Predict using Nadaraya-Watson Kernel regression estimator.

    Parameters
    ----------
    X : array-like
        array of shape (n_samples, n_features)

        Samples.

    Returns
    -------
    y_prediction : ndarray
        array of shape (n_samples,) or (n_samples, n_targets), matching
        the dimensionality of the stored targets ``self.y``.

        Returns predicted values.
    """
    check_is_fitted(self)
    X = validate_data(self, X, accept_sparse=("csr", "csc"), reset=False)

    # shape (n_samples_X_fit, n_samples_X_predict)
    distance_matrix = pairwise_distances(self.X_fit, X, metric=self.metric)
    log_density = KERNELS[self.kernel](distance_matrix / self.bandwidth_)

    # Removing the max for numerical stability
    # Removing a warning when subtracting -np.inf from a line of -np.inf : this produces a NaN and is expected behavior
    with np.errstate(invalid="ignore", divide="ignore"):
        log_density -= np.max(log_density, axis=0, keepdims=True)

    # Broadcast self.sample_weight to shape (n_samples_X_fit, n_samples_X_predict) from (n_samples_X_fit, )
    weight = np.exp(log_density)
    if self.sample_weight is not None:
        weight *= self.sample_weight[:, None]
    # Normalise each query column so its weights sum to one.
    weight /= np.sum(weight, axis=0, keepdims=True)

    # Weighted sum of the training targets. The ellipsis lets the same
    # contraction handle 1-D targets (n_samples,) and 2-D targets
    # (n_samples, n_targets); a fixed "ik" subscript fails on 1-D targets.
    y_prediction = np.einsum("ij,i...->j...", weight, self.y)

    return y_prediction

valid_kernels() classmethod

Return a list of valid kernels.

Returns:

Type Description
list of str

A list of valid kernel names.

Source code in src/nadaraya_watson/kernel_regression.py
64
65
66
67
68
69
70
71
72
73
@classmethod
def valid_kernels(cls) -> list[str]:
    """List the kernel names this estimator recognises.

    Returns
    -------
    list of str
        Every entry of ``VALID_KERNELS``, in iteration order.
    """
    kernel_names = [*VALID_KERNELS]
    return kernel_names

valid_metrics() classmethod

Return a list of valid metrics.

Please note that some names are actually synonymous. Please do not feed these values directly to sklearn for grid-search cross-validation.

e.g. "euclidean" and "l2" are identical.

Returns:

Type Description
list of str

A list of valid metric names.

Source code in src/nadaraya_watson/kernel_regression.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
@classmethod
def valid_metrics(cls) -> list[str]:
    """List the metric names this estimator recognises.

    Several names are synonyms of one another (for instance "euclidean"
    and "l2" are identical), so do not pass this list directly to sklearn
    for grid-search cross-validation.

    Returns
    -------
    list of str
        The keys of ``VALID_METRICS``.
    """
    metric_names = VALID_METRICS.keys()
    return [*metric_names]