You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

702 lines
23 KiB

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

{{py:
"""
Template file to easily generate fused types consistent code using Tempita
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
Generated file: _sgd_fast.pyx
Each relevant function is duplicated for the dtypes float and double.
The keywords between double braces are substituted during the build.
"""
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
# The dtypes are defined as follows (name_suffix, c_type, np_type)
dtypes = [
("64", "double", "np.float64"),
("32", "float", "np.float32"),
]
}}
"""SGD implementation"""
import numpy as np
from time import time
from cython cimport floating
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
from sklearn._loss._loss cimport CyLossFunction
from sklearn.utils._typedefs cimport uint32_t, uint8_t
from sklearn.utils._weight_vector cimport WeightVector32, WeightVector64
from sklearn.utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
cdef extern from *:
    """
    /* Penalty constants */
    #define NO_PENALTY 0
    #define L1 1
    #define L2 2
    #define ELASTICNET 3

    /* Learning rate constants */
    #define CONSTANT 1
    #define OPTIMAL 2
    #define INVSCALING 3
    #define ADAPTIVE 4
    #define PA1 5
    #define PA2 6
    """
    # Cython-level names for the C macros defined in the verbatim block
    # above, so the rest of this module can reference them directly.
    # NOTE: the values here must stay in sync with the #define lines.

    # Penalty selectors (see `penalty_type` in `_plain_sgd`).
    int NO_PENALTY = 0
    int L1 = 1
    int L2 = 2
    int ELASTICNET = 3

    # Learning-rate schedule selectors (see `learning_rate` in `_plain_sgd`).
    int CONSTANT = 1
    int OPTIMAL = 2
    int INVSCALING = 3
    int ADAPTIVE = 4
    int PA1 = 5
    int PA2 = 6
# ----------------------------------------
# Extension Types for Loss Functions
# ----------------------------------------
cdef class Regression(CyLossFunction):
    """Base class for loss functions used in regression tasks."""

    def py_loss(self, double p, double y):
        """Python-callable wrapper around `cy_loss`, for testing only.

        Pytest needs a plain Python function and cannot call cdef methods.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x + intercept`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        # Note the argument order swap: cy_loss takes (y, p).
        cdef double loss_value = self.cy_loss(y, p)
        return loss_value

    def py_dloss(self, double p, double y):
        """Python-callable wrapper around `cy_gradient`, for testing only.

        Pytest needs a plain Python function and cannot call cdef methods.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The derivative of the loss function with regards to `p`.
        """
        cdef double derivative = self.cy_gradient(y, p)
        return derivative
cdef class Classification(CyLossFunction):
    """Base class for loss functions used in classification tasks."""

    def py_loss(self, double p, double y):
        """Python-callable wrapper around `cy_loss`, for testing only."""
        # cy_loss takes (y, p), i.e. target first.
        cdef double loss_value = self.cy_loss(y, p)
        return loss_value

    def py_dloss(self, double p, double y):
        """Python-callable wrapper around `cy_gradient`, for testing only."""
        cdef double derivative = self.cy_gradient(y, p)
        return derivative
cdef class ModifiedHuber(Classification):
    """Modified Huber loss for binary classification with y in {-1, 1}

    This is equivalent to quadratically smoothed SVM with gamma = 2.

    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
    Stochastic Gradient Descent', ICML'04.
    """

    cdef double cy_loss(self, double y, double p) noexcept nogil:
        # Three regimes on the margin: zero loss, quadratic, and linear tail.
        cdef double margin = p * y
        cdef double gap
        if margin >= 1.0:
            return 0.0
        if margin >= -1.0:
            gap = 1.0 - margin
            return gap * gap
        # Badly misclassified samples get a linear (bounded-gradient) loss.
        return -4.0 * margin

    cdef double cy_gradient(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        if margin >= 1.0:
            return 0.0
        if margin >= -1.0:
            return 2.0 * (1.0 - margin) * -y
        return -4.0 * y

    def __reduce__(self):
        return ModifiedHuber, ()
cdef class Hinge(Classification):
    """Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------
    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
        When threshold=0.0, one gets the loss used by the Perceptron.
    """

    # Margin below (or at) which a sample incurs loss.
    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double cy_loss(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        # Zero loss once the margin clears the threshold.
        return self.threshold - margin if margin <= self.threshold else 0.0

    cdef double cy_gradient(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        return -y if margin <= self.threshold else 0.0

    def __reduce__(self):
        return Hinge, (self.threshold,)
cdef class SquaredHinge(Classification):
    """Squared Hinge loss for binary classification tasks with y in {-1,1}

    Parameters
    ----------
    threshold : float > 0.0
        Margin threshold. When threshold=1.0, one gets the loss used by
        (quadratically penalized) SVM.
    """

    # Margin below which a sample incurs (quadratic) loss.
    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double cy_loss(self, double y, double p) noexcept nogil:
        # gap > 0 means the margin fell short of the threshold.
        cdef double gap = self.threshold - p * y
        return gap * gap if gap > 0 else 0.0

    cdef double cy_gradient(self, double y, double p) noexcept nogil:
        cdef double gap = self.threshold - p * y
        return -2 * y * gap if gap > 0 else 0.0

    def __reduce__(self):
        return SquaredHinge, (self.threshold,)
cdef class EpsilonInsensitive(Regression):
    """Epsilon-Insensitive loss (used by SVR).

    loss = max(0, |y - p| - epsilon)
    """

    # Half-width of the zero-loss tube around the target.
    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double cy_loss(self, double y, double p) noexcept nogil:
        # Only the part of the absolute error exceeding epsilon is penalized.
        cdef double excess = fabs(y - p) - self.epsilon
        return excess if excess > 0 else 0

    cdef double cy_gradient(self, double y, double p) noexcept nogil:
        # Derivative w.r.t. p is +/-1 outside the tube, 0 inside.
        if y - p > self.epsilon:
            return -1
        if p - y > self.epsilon:
            return 1
        return 0

    def __reduce__(self):
        return EpsilonInsensitive, (self.epsilon,)
cdef class SquaredEpsilonInsensitive(Regression):
    """Squared Epsilon-Insensitive loss.

    loss = max(0, |y - p| - epsilon)^2
    """

    # Half-width of the zero-loss tube around the target.
    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double cy_loss(self, double y, double p) noexcept nogil:
        cdef double excess = fabs(y - p) - self.epsilon
        return excess * excess if excess > 0 else 0

    cdef double cy_gradient(self, double y, double p) noexcept nogil:
        cdef double residual = y - p
        if residual > self.epsilon:
            return -2 * (residual - self.epsilon)
        if residual < -self.epsilon:
            return 2 * (-residual - self.epsilon)
        return 0

    def __reduce__(self):
        return SquaredEpsilonInsensitive, (self.epsilon,)
{{for name_suffix, c_type, np_type in dtypes}}
def _plain_sgd{{name_suffix}}(
    const {{c_type}}[::1] weights,
    double intercept,
    const {{c_type}}[::1] average_weights,
    double average_intercept,
    CyLossFunction loss,
    int penalty_type,
    double alpha,
    double l1_ratio,
    SequentialDataset{{name_suffix}} dataset,
    const uint8_t[::1] validation_mask,
    bint early_stopping,
    validation_score_cb,
    int n_iter_no_change,
    unsigned int max_iter,
    double tol,
    int fit_intercept,
    int verbose,
    bint shuffle,
    uint32_t seed,
    double weight_pos,
    double weight_neg,
    int learning_rate,
    double eta0,
    double power_t,
    bint one_class,
    double t=1.0,
    double intercept_decay=1.0,
    int average=0,
):
    """SGD for generic loss functions and penalties with optional averaging

    Parameters
    ----------
    weights : ndarray[{{c_type}}, ndim=1]
        The allocated vector of weights.
    intercept : double
        The initial intercept.
    average_weights : ndarray[{{c_type}}, ndim=1]
        The average weights as computed for ASGD. Should be None if average
        is 0.
    average_intercept : double
        The average intercept for ASGD. Should be 0 if average is 0.
    loss : CyLossFunction
        A concrete ``CyLossFunction`` object.
    penalty_type : int
        The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net.
    alpha : float
        The regularization parameter.
    l1_ratio : float
        The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
        l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
    dataset : SequentialDataset
        A concrete ``SequentialDataset`` object.
    validation_mask : ndarray[uint8_t, ndim=1]
        Equal to True on the validation set.
    early_stopping : boolean
        Whether to use a stopping criterion based on the validation set.
    validation_score_cb : callable
        A callable to compute a validation score given the current
        coefficients and intercept values.
        Used only if early_stopping is True.
    n_iter_no_change : int
        Number of iteration with no improvement to wait before stopping.
    max_iter : int
        The maximum number of iterations (epochs).
    tol : double
        The tolerance for the stopping criterion.
    fit_intercept : int
        Whether or not to fit the intercept (1 or 0).
    verbose : int
        Print verbose output; 0 for quiet.
    shuffle : boolean
        Whether to shuffle the training data before each epoch.
    weight_pos : float
        The weight of the positive class.
    weight_neg : float
        The weight of the negative class.
    seed : uint32_t
        Seed of the pseudorandom number generator used to shuffle the data.
    learning_rate : int
        The learning rate:
        (1) constant, eta = eta0
        (2) optimal, eta = 1.0/(alpha * t).
        (3) inverse scaling, eta = eta0 / pow(t, power_t)
        (4) adaptive decrease
        (5) Passive Aggressive-I, eta = min(eta0, loss/norm(x)**2), see [1]
        (6) Passive Aggressive-II, eta = 1.0 / (norm(x)**2 + 0.5/eta0), see [1]
    eta0 : double
        The initial learning rate.
        For PA-1 (`learning_rate=PA1`) and PA-II (`PA2`), it specifies the
        aggressiveness parameter for the passive-aggressive algorithm, see [1]
        where it is called C:

        - For PA-I it is the maximum step size.
        - For PA-II it regularizes the step size (the smaller `eta0` the more
          it regularizes).

        As a general rule-of-thumb for PA, `eta0` should be small when the
        data is noisy.
    power_t : double
        The exponent for inverse scaling learning rate.
    one_class : boolean
        Whether to solve the One-Class SVM optimization problem.
    t : double
        Initial state of the learning rate. This value is equal to the
        iteration count except when the learning rate is set to `optimal`.
        Default: 1.0.
    average : int
        The number of iterations before averaging starts. average=1 is
        equivalent to averaging for all iterations.

    Returns
    -------
    weights : array, shape=[n_features]
        The fitted weight vector.
    intercept : float
        The fitted intercept term.
    average_weights : array shape=[n_features]
        The averaged weights across iterations. Values are valid only if
        average > 0.
    average_intercept : float
        The averaged intercept across iterations.
        Values are valid only if average > 0.
    n_iter_ : int
        The actual number of iter (epochs).

    References
    ----------
    .. [1] Online Passive-Aggressive Algorithms
        <https://jmlr.org/papers/volume7/crammer06a/crammer06a.pdf>
        K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)
    """
    # get the data information into easy vars
    cdef Py_ssize_t n_samples = dataset.n_samples
    cdef Py_ssize_t n_features = weights.shape[0]

    cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
    cdef {{c_type}} *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL

    # helper variables
    cdef int no_improvement_count = 0
    cdef bint infinity = False
    cdef int xnnz
    cdef double eta = 0.0
    cdef double p = 0.0
    cdef double update = 0.0
    cdef double intercept_update = 0.0
    cdef double sumloss = 0.0
    cdef double cur_loss_val = 0.0
    cdef double score = 0.0
    cdef double objective_sum = 0.0
    cdef double best_objective = INFINITY
    cdef double best_score = -INFINITY
    cdef {{c_type}} y = 0.0
    cdef {{c_type}} sample_weight
    cdef {{c_type}} class_weight = 1.0
    cdef unsigned int count = 0
    # number of samples actually trained on (validation samples are skipped)
    cdef unsigned int train_count = n_samples - np.sum(validation_mask)
    cdef unsigned int epoch = 0
    cdef unsigned int i = 0
    # used below to pick the sign convention of the PA update
    cdef int is_hinge = isinstance(loss, Hinge)
    cdef double optimal_init = 0.0
    cdef double dloss = 0.0
    # clipping bound for the loss gradient, see below
    cdef double MAX_DLOSS = 1e12

    cdef long long sample_index

    # q vector is only used for L1 regularization
    # q[j] tracks the cumulative L1 penalty actually applied to feature j
    cdef {{c_type}}[::1] q = None
    cdef {{c_type}} * q_data_ptr = NULL
    if penalty_type == L1 or penalty_type == ELASTICNET:
        q = np.zeros((n_features,), dtype={{np_type}}, order="c")
        q_data_ptr = &q[0]
    # u is the cumulative L1 penalty that *should* have been applied so far
    cdef double u = 0.0

    # fold pure L1/L2 into the elastic-net formulation so one code path
    # handles all penalties
    if penalty_type == L2:
        l1_ratio = 0.0
    elif penalty_type == L1:
        l1_ratio = 1.0

    eta = eta0

    if learning_rate == OPTIMAL:
        typw = np.sqrt(1.0 / np.sqrt(alpha))
        # computing eta0, the initial learning rate
        initial_eta0 = typw / max(1.0, loss.cy_gradient(1.0, -typw))
        # initialize t such that eta at first sample equals eta0
        optimal_init = 1.0 / (initial_eta0 * alpha)

    t_start = time()
    with nogil:
        for epoch in range(max_iter):
            sumloss = 0
            objective_sum = 0
            if verbose > 0:
                with gil:
                    print("-- Epoch %d" % (epoch + 1))
            if shuffle:
                dataset.shuffle(seed)
            for i in range(n_samples):
                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
                             &y, &sample_weight)

                sample_index = dataset.index_data_ptr[dataset.current_index]
                if validation_mask[sample_index]:
                    # do not learn on the validation set
                    continue

                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
                if learning_rate == OPTIMAL:
                    eta = 1.0 / (alpha * (optimal_init + t - 1))
                elif learning_rate == INVSCALING:
                    eta = eta0 / pow(t, power_t)

                # loss/objective bookkeeping is only needed for verbose
                # reporting or the training-loss stopping criterion
                if verbose or not early_stopping:
                    cur_loss_val = loss.cy_loss(y, p)
                    sumloss += cur_loss_val
                    objective_sum += cur_loss_val

                    # for PA1/PA2 (passive/aggressive model, online algorithm)
                    # use only the loss
                    if learning_rate != PA1 and learning_rate != PA2:
                        # sum up all the terms in the optimization objective
                        # function (i.e. also include regularization in
                        # addition to the loss)
                        # Note: for the L2 term SGD optimizes 0.5 * L2**2, due
                        # to using weight decay that's why the 0.5 coefficient
                        # is required
                        if penalty_type > 0:  # if regularization is enabled
                            objective_sum += alpha * (
                                (1 - l1_ratio) * 0.5 * w.norm() ** 2 +
                                l1_ratio * w.l1norm()
                            )
                        if one_class:  # specific to One-Class SVM
                            # nu is alpha * 2 (alpha is set as nu / 2 by the
                            # caller)
                            objective_sum += intercept * (alpha * 2)

                if y > 0.0:
                    class_weight = weight_pos
                else:
                    class_weight = weight_neg

                if learning_rate == PA1:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    if update == 0:
                        continue
                    update = min(eta0, loss.cy_loss(y, p) / update)
                elif learning_rate == PA2:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    update = loss.cy_loss(y, p) / (update + 0.5 / eta0)
                else:
                    dloss = loss.cy_gradient(y, p)
                    # clip dloss with large values to avoid numerical
                    # instabilities
                    if dloss < -MAX_DLOSS:
                        dloss = -MAX_DLOSS
                    elif dloss > MAX_DLOSS:
                        dloss = MAX_DLOSS
                    update = -eta * dloss

                # PA updates above are magnitudes; fix the sign here
                if learning_rate >= PA1:
                    if is_hinge:
                        # classification
                        update *= y
                    elif y - p < 0:
                        # regression
                        update *= -1

                update *= class_weight * sample_weight

                if penalty_type >= L2:
                    # do not scale to negative values when eta or alpha are
                    # too big: instead set the weights to zero
                    w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))

                if update != 0.0:
                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)

                if fit_intercept == 1:
                    intercept_update = update
                    if one_class:  # specific for One-Class SVM
                        intercept_update -= 2. * eta * alpha
                    if intercept_update != 0:
                        intercept += intercept_update * intercept_decay

                if 0 < average <= t:
                    # compute the average for the intercept and update the
                    # average weights, this is done regardless as to whether
                    # the update is 0
                    w.add_average(x_data_ptr, x_ind_ptr, xnnz,
                                  update, (t - average + 1))
                    average_intercept += ((intercept - average_intercept) /
                                          (t - average + 1))

                if penalty_type == L1 or penalty_type == ELASTICNET:
                    u += (l1_ratio * eta * alpha)
                    l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)

                t += 1
                count += 1

            # floating-point under-/overflow check.
            if (not isfinite(intercept) or any_nonfinite(weights)):
                infinity = True
                break

            # evaluate the score on the validation set
            if early_stopping:
                with gil:
                    score = validation_score_cb(weights.base, intercept)
                    if verbose > 0:  # report epoch information
                        print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
                              "Avg. loss: %f, Objective: %f, Validation score: %f"
                              % (w.norm(), np.nonzero(weights)[0].shape[0],
                                 intercept, count, sumloss / train_count,
                                 objective_sum / train_count, score))
                        print("Total training time: %.2f seconds."
                              % (time() - t_start))
                if tol > -INFINITY and score < best_score + tol:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if score > best_score:
                    best_score = score
            # or evaluate the loss on the training set
            else:
                if verbose > 0:  # report epoch information
                    with gil:
                        print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
                              "Avg. loss: %f, Objective: %f"
                              % (w.norm(), np.nonzero(weights)[0].shape[0],
                                 intercept, count, sumloss / train_count,
                                 objective_sum / train_count))
                        print("Total training time: %.2f seconds."
                              % (time() - t_start))
                # true objective = objective_sum / number of samples
                if (
                    tol > -INFINITY
                    and objective_sum / train_count > best_objective - tol
                ):
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if objective_sum / train_count < best_objective:
                    best_objective = objective_sum / train_count

            # if there is no improvement several times in a row
            if no_improvement_count >= n_iter_no_change:
                if learning_rate == ADAPTIVE and eta > 1e-6:
                    # adaptive schedule: shrink eta and keep going instead of
                    # stopping
                    eta = eta / 5
                    no_improvement_count = 0
                else:
                    if verbose:
                        with gil:
                            print("Convergence after %d epochs took %.2f "
                                  "seconds" % (epoch + 1, time() - t_start))
                    break

    if infinity:
        raise ValueError(("Floating-point under-/overflow occurred at epoch"
                          " #%d. Scaling input data with StandardScaler or"
                          " MinMaxScaler might help.") % (epoch + 1))

    # fold the lazily-maintained scale factor back into the weight array
    w.reset_wscale()

    return (
        weights.base,
        intercept,
        None if average_weights is None else average_weights.base,
        average_intercept,
        epoch + 1
    )
{{endfor}}
cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
    """Return True if any entry of ``w`` is NaN or +/-infinity.

    Used by `_plain_sgd` to detect floating-point under-/overflow of the
    weight vector after each epoch.
    """
    # Declare the index explicitly (the original relied on type inference,
    # which is fragile inside a nogil function) and return a consistent
    # boolean instead of mixing `True` with `0`.
    cdef Py_ssize_t i
    for i in range(w.shape[0]):
        if not isfinite(w[i]):
            return True
    return False
cdef inline double sqnorm(
    floating * x_data_ptr,
    int * x_ind_ptr,
    int xnnz,
) noexcept nogil:
    """Return the squared Euclidean norm of the ``xnnz`` stored values.

    ``x_ind_ptr`` (the sparse column indices) does not influence the
    result: only the stored values contribute to the norm.
    """
    cdef double total = 0.0
    cdef double value
    cdef int k
    for k in range(xnnz):
        value = x_data_ptr[k]
        total += value * value
    return total
{{for name_suffix, c_type, np_type in dtypes}}
cdef void l1penalty{{name_suffix}}(
    WeightVector{{name_suffix}} w,
    {{c_type}} * q_data_ptr,
    int *x_ind_ptr,
    int xnnz,
    double u,
) noexcept nogil:
    """Apply the L1 penalty to each updated feature

    This implements the truncated gradient approach by
    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].

    ``u`` is the total cumulative L1 penalty that should have been applied
    so far, while ``q_data_ptr[idx]`` records how much penalty has actually
    been applied to feature ``idx``; the difference is applied here, and
    each weight is clipped at zero rather than allowed to change sign.
    Only the features present in the current sample (``x_ind_ptr``) are
    touched, which keeps the update cost proportional to ``xnnz``.
    """
    cdef double z = 0.0
    cdef int j = 0
    cdef int idx = 0
    # w maintains a lazy scale factor; divide by it so the penalty is
    # applied in the true (unscaled) weight space.
    cdef double wscale = w.wscale
    cdef {{c_type}} *w_data_ptr = w.w_data_ptr
    for j in range(xnnz):
        idx = x_ind_ptr[j]
        z = w_data_ptr[idx]  # weight value before the penalty (scaled space)
        if wscale * z > 0.0:
            # positive weight: shrink towards zero, clip at zero
            w_data_ptr[idx] = max(
                0.0, w_data_ptr[idx] - ((u + q_data_ptr[idx]) / wscale))
        elif wscale * z < 0.0:
            # negative weight: grow towards zero, clip at zero
            w_data_ptr[idx] = min(
                0.0, w_data_ptr[idx] + ((u - q_data_ptr[idx]) / wscale))
        # record the penalty actually applied to this feature
        q_data_ptr[idx] += wscale * (w_data_ptr[idx] - z)
{{endfor}}