You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2284 lines
71 KiB

import pickle
import warnings
from unittest.mock import Mock
import joblib
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn import datasets, linear_model, metrics
from sklearn.base import clone, is_classifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import _sgd_fast as sgd_fast
from sklearn.linear_model import _stochastic_gradient
from sklearn.model_selection import (
RandomizedSearchCV,
ShuffleSplit,
StratifiedShuffleSplit,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, scale
from sklearn.svm import OneClassSVM
from sklearn.utils import get_tags
from sklearn.utils._testing import (
assert_allclose,
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
def _update_kwargs(kwargs):
if "random_state" not in kwargs:
kwargs["random_state"] = 42
if "tol" not in kwargs:
kwargs["tol"] = None
if "max_iter" not in kwargs:
kwargs["max_iter"] = 5
class _SparseSGDClassifier(linear_model.SGDClassifier):
def fit(self, X, y, *args, **kw):
X = sp.csr_matrix(X)
return super().fit(X, y, *args, **kw)
def partial_fit(self, X, y, *args, **kw):
X = sp.csr_matrix(X)
return super().partial_fit(X, y, *args, **kw)
def decision_function(self, X):
X = sp.csr_matrix(X)
return super().decision_function(X)
def predict_proba(self, X):
X = sp.csr_matrix(X)
return super().predict_proba(X)
class _SparseSGDRegressor(linear_model.SGDRegressor):
def fit(self, X, y, *args, **kw):
X = sp.csr_matrix(X)
return linear_model.SGDRegressor.fit(self, X, y, *args, **kw)
def partial_fit(self, X, y, *args, **kw):
X = sp.csr_matrix(X)
return linear_model.SGDRegressor.partial_fit(self, X, y, *args, **kw)
def decision_function(self, X, *args, **kw):
# XXX untested as of v0.22
X = sp.csr_matrix(X)
return linear_model.SGDRegressor.decision_function(self, X, *args, **kw)
class _SparseSGDOneClassSVM(linear_model.SGDOneClassSVM):
def fit(self, X, *args, **kw):
X = sp.csr_matrix(X)
return linear_model.SGDOneClassSVM.fit(self, X, *args, **kw)
def partial_fit(self, X, *args, **kw):
X = sp.csr_matrix(X)
return linear_model.SGDOneClassSVM.partial_fit(self, X, *args, **kw)
def decision_function(self, X, *args, **kw):
X = sp.csr_matrix(X)
return linear_model.SGDOneClassSVM.decision_function(self, X, *args, **kw)
def SGDClassifier(**kwargs):
_update_kwargs(kwargs)
return linear_model.SGDClassifier(**kwargs)
def SGDRegressor(**kwargs):
_update_kwargs(kwargs)
return linear_model.SGDRegressor(**kwargs)
def SGDOneClassSVM(**kwargs):
_update_kwargs(kwargs)
return linear_model.SGDOneClassSVM(**kwargs)
def SparseSGDClassifier(**kwargs):
_update_kwargs(kwargs)
return _SparseSGDClassifier(**kwargs)
def SparseSGDRegressor(**kwargs):
_update_kwargs(kwargs)
return _SparseSGDRegressor(**kwargs)
def SparseSGDOneClassSVM(**kwargs):
_update_kwargs(kwargs)
return _SparseSGDOneClassSVM(**kwargs)
# Test Data
# test sample 1
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
Y = [1, 1, 1, 2, 2, 2]
T = np.array([[-1, -1], [2, 2], [3, 2]])
true_result = [1, 2, 2]
# test sample 2; string class labels
X2 = np.array(
[
[-1, 1],
[-0.75, 0.5],
[-1.5, 1.5],
[1, 1],
[0.75, 0.5],
[1.5, 1.5],
[-1, -1],
[0, -0.5],
[1, -1],
]
)
Y2 = ["one"] * 3 + ["two"] * 3 + ["three"] * 3
T2 = np.array([[-1.5, 0.5], [1, 2], [0, -2]])
true_result2 = ["one", "two", "three"]
# test sample 3
X3 = np.array(
[
[1, 1, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0],
[0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 0, 1, 1],
[0, 0, 0, 1, 0, 0],
[0, 0, 0, 1, 0, 0],
]
)
Y3 = np.array([1, 1, 1, 1, 2, 2, 2, 2])
# test sample 4 - two more or less redundant feature groups
X4 = np.array(
[
[1, 0.9, 0.8, 0, 0, 0],
[1, 0.84, 0.98, 0, 0, 0],
[1, 0.96, 0.88, 0, 0, 0],
[1, 0.91, 0.99, 0, 0, 0],
[0, 0, 0, 0.89, 0.91, 1],
[0, 0, 0, 0.79, 0.84, 1],
[0, 0, 0, 0.91, 0.95, 1],
[0, 0, 0, 0.93, 1, 1],
]
)
Y4 = np.array([1, 1, 1, 1, 2, 2, 2, 2])
iris = datasets.load_iris()
# test sample 5 - test sample 1 as binary classification problem
X5 = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
Y5 = [1, 1, 1, 2, 2, 2]
true_result5 = [0, 1, 1]
###############################################################################
# Common Test Case to classification and regression
# a simple implementation of ASGD to use for testing
# uses squared loss to find the gradient
def asgd(klass, X, y, eta, alpha, weight_init=None, intercept_init=0.0):
if weight_init is None:
weights = np.zeros(X.shape[1])
else:
weights = weight_init
average_weights = np.zeros(X.shape[1])
intercept = intercept_init
average_intercept = 0.0
decay = 1.0
# sparse data has a fixed decay of .01
if klass in (SparseSGDClassifier, SparseSGDRegressor):
decay = 0.01
for i, entry in enumerate(X):
p = np.dot(entry, weights)
p += intercept
gradient = p - y[i]
weights *= 1.0 - (eta * alpha)
weights += -(eta * gradient * entry)
intercept += -(eta * gradient) * decay
average_weights *= i
average_weights += weights
average_weights /= i + 1.0
average_intercept *= i
average_intercept += intercept
average_intercept /= i + 1.0
return average_weights, average_intercept
def _test_warm_start(klass, X, Y, lr):
# Test that explicit warm restart...
clf = klass(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr)
clf.fit(X, Y)
clf2 = klass(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr)
clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy())
# ... and implicit warm restart are equivalent.
clf3 = klass(
alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr
)
clf3.fit(X, Y)
assert clf3.t_ == clf.t_
assert_array_almost_equal(clf3.coef_, clf.coef_)
clf3.set_params(alpha=0.001)
clf3.fit(X, Y)
assert clf3.t_ == clf2.t_
assert_array_almost_equal(clf3.coef_, clf2.coef_)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"])
def test_warm_start(klass, lr):
_test_warm_start(klass, X, Y, lr)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_input_format(klass):
# Input format tests.
clf = klass(alpha=0.01, shuffle=False)
clf.fit(X, Y)
Y_ = np.array(Y)[:, np.newaxis]
Y_ = np.c_[Y_, Y_]
with pytest.raises(ValueError):
clf.fit(X, Y_)
@pytest.mark.parametrize("lr", ["pa1", "pa2"])
@pytest.mark.parametrize(
["est", "loss"], [(SGDClassifier, "squared_hinge"), (SGDRegressor, "squared_error")]
)
def test_learning_rate_PA_raises(lr, est, loss):
"""Test that SGD raises with forbidden loss for passive-aggressive algo."""
est = est(loss=loss, learning_rate=lr)
with pytest.raises(ValueError):
est.fit(X, Y)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_clone(klass):
# Test whether clone works ok.
clf = klass(alpha=0.01, penalty="l1")
clf = clone(clf)
clf.set_params(penalty="l2")
clf.fit(X, Y)
clf2 = klass(alpha=0.01, penalty="l2")
clf2.fit(X, Y)
assert_array_equal(clf.coef_, clf2.coef_)
@pytest.mark.parametrize(
"klass",
[
SGDClassifier,
SparseSGDClassifier,
SGDRegressor,
SparseSGDRegressor,
SGDOneClassSVM,
SparseSGDOneClassSVM,
],
)
def test_plain_has_no_average_attr(klass):
clf = klass(average=True, eta0=0.01)
clf.fit(X, Y)
assert hasattr(clf, "_average_coef")
assert hasattr(clf, "_average_intercept")
assert hasattr(clf, "_standard_intercept")
assert hasattr(clf, "_standard_coef")
clf = klass()
clf.fit(X, Y)
assert not hasattr(clf, "_average_coef")
assert not hasattr(clf, "_average_intercept")
assert not hasattr(clf, "_standard_intercept")
assert not hasattr(clf, "_standard_coef")
@pytest.mark.parametrize(
"klass",
[
SGDClassifier,
SparseSGDClassifier,
SGDRegressor,
SparseSGDRegressor,
SGDOneClassSVM,
SparseSGDOneClassSVM,
],
)
def test_late_onset_averaging_not_reached(klass):
clf1 = klass(average=600)
clf2 = klass()
for _ in range(100):
if is_classifier(clf1):
clf1.partial_fit(X, Y, classes=np.unique(Y))
clf2.partial_fit(X, Y, classes=np.unique(Y))
else:
clf1.partial_fit(X, Y)
clf2.partial_fit(X, Y)
assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16)
if klass in [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]:
assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16)
elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]:
assert_allclose(clf1.offset_, clf2.offset_)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_late_onset_averaging_reached(klass):
eta0 = 0.001
alpha = 0.0001
Y_encode = np.array(Y)
Y_encode[Y_encode == 1] = -1.0
Y_encode[Y_encode == 2] = 1.0
clf1 = klass(
average=7,
learning_rate="constant",
loss="squared_error",
eta0=eta0,
alpha=alpha,
max_iter=2,
shuffle=False,
)
clf2 = klass(
average=False,
learning_rate="constant",
loss="squared_error",
eta0=eta0,
alpha=alpha,
max_iter=1,
shuffle=False,
)
clf1.fit(X, Y_encode)
clf2.fit(X, Y_encode)
average_weights, average_intercept = asgd(
klass,
X,
Y_encode,
eta0,
alpha,
weight_init=clf2.coef_.ravel(),
intercept_init=clf2.intercept_,
)
assert_array_almost_equal(clf1.coef_.ravel(), average_weights.ravel(), decimal=16)
assert_almost_equal(clf1.intercept_, average_intercept, decimal=16)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_early_stopping(klass):
X = iris.data[iris.target > 0]
Y = iris.target[iris.target > 0]
for early_stopping in [True, False]:
max_iter = 1000
clf = klass(early_stopping=early_stopping, tol=1e-3, max_iter=max_iter).fit(
X, Y
)
assert clf.n_iter_ < max_iter
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_adaptive_longer_than_constant(klass):
clf1 = klass(learning_rate="adaptive", eta0=0.01, tol=1e-3, max_iter=100)
clf1.fit(iris.data, iris.target)
clf2 = klass(learning_rate="constant", eta0=0.01, tol=1e-3, max_iter=100)
clf2.fit(iris.data, iris.target)
assert clf1.n_iter_ > clf2.n_iter_
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_validation_set_not_used_for_training(klass):
X, Y = iris.data, iris.target
validation_fraction = 0.4
seed = 42
shuffle = False
max_iter = 10
clf1 = klass(
early_stopping=True,
random_state=np.random.RandomState(seed),
validation_fraction=validation_fraction,
learning_rate="constant",
eta0=0.01,
tol=None,
max_iter=max_iter,
shuffle=shuffle,
)
clf1.fit(X, Y)
assert clf1.n_iter_ == max_iter
clf2 = klass(
early_stopping=False,
random_state=np.random.RandomState(seed),
learning_rate="constant",
eta0=0.01,
tol=None,
max_iter=max_iter,
shuffle=shuffle,
)
if is_classifier(clf2):
cv = StratifiedShuffleSplit(test_size=validation_fraction, random_state=seed)
else:
cv = ShuffleSplit(test_size=validation_fraction, random_state=seed)
idx_train, idx_val = next(cv.split(X, Y))
idx_train = np.sort(idx_train) # remove shuffling
clf2.fit(X[idx_train], Y[idx_train])
assert clf2.n_iter_ == max_iter
assert_array_equal(clf1.coef_, clf2.coef_)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_n_iter_no_change(klass):
X, Y = iris.data, iris.target
# test that n_iter_ increases monotonically with n_iter_no_change
for early_stopping in [True, False]:
n_iter_list = [
klass(
early_stopping=early_stopping,
n_iter_no_change=n_iter_no_change,
tol=1e-4,
max_iter=1000,
)
.fit(X, Y)
.n_iter_
for n_iter_no_change in [2, 3, 10]
]
assert_array_equal(n_iter_list, sorted(n_iter_list))
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_not_enough_sample_for_early_stopping(klass):
# test an error is raised if the training or validation set is empty
clf = klass(early_stopping=True, validation_fraction=0.99)
with pytest.raises(ValueError):
clf.fit(X3, Y3)
@pytest.mark.parametrize("Estimator", [SGDClassifier, SGDRegressor])
@pytest.mark.parametrize("l1_ratio", [0, 0.7, 1])
def test_sgd_l1_ratio_not_used(Estimator, l1_ratio):
"""Check that l1_ratio is not used when penalty is not 'elasticnet'"""
clf1 = Estimator(penalty="l1", l1_ratio=None, random_state=0).fit(X, Y)
clf2 = Estimator(penalty="l1", l1_ratio=l1_ratio, random_state=0).fit(X, Y)
assert_allclose(clf1.coef_, clf2.coef_)
@pytest.mark.parametrize(
"Estimator", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_sgd_failing_penalty_validation(Estimator):
clf = Estimator(penalty="elasticnet", l1_ratio=None)
with pytest.raises(
ValueError, match="l1_ratio must be set when penalty is 'elasticnet'"
):
clf.fit(X, Y)
# TODO(1.10): remove this test
@pytest.mark.parametrize(
"klass",
[
SGDClassifier,
SparseSGDClassifier,
SGDRegressor,
SparseSGDRegressor,
SGDOneClassSVM,
SparseSGDOneClassSVM,
],
)
def test_power_t_limits(klass):
"""Check that a warning is raised when `power_t` is negative."""
# Check that negative values of `power_t` raise a warning
clf = klass(power_t=-1.0)
with pytest.warns(
FutureWarning, match="Negative values for `power_t` are deprecated"
):
clf.fit(X, Y)
# Check that values of 'power_t in range [0, inf) do not raise a warning
with warnings.catch_warnings(record=True) as w:
clf = klass(power_t=0.5)
clf.fit(X, Y)
assert len(w) == 0
###############################################################################
# Classification Test Case
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_clf(klass):
# Check that SGD gives any results :-)
for loss in ("hinge", "squared_hinge", "log_loss", "modified_huber"):
clf = klass(
penalty="l2",
alpha=0.01,
fit_intercept=True,
loss=loss,
max_iter=10,
shuffle=True,
)
clf.fit(X, Y)
# assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7)
assert_array_equal(clf.predict(T), true_result)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_provide_coef(klass):
"""Check that the shape of `coef_init` is validated."""
with pytest.raises(ValueError, match="Provided coef_init does not match dataset"):
klass().fit(X, Y, coef_init=np.zeros((3,)))
@pytest.mark.parametrize(
"klass, fit_params",
[
(SGDClassifier, {"intercept_init": np.zeros((3,))}),
(SparseSGDClassifier, {"intercept_init": np.zeros((3,))}),
(SGDOneClassSVM, {"offset_init": np.zeros((3,))}),
(SparseSGDOneClassSVM, {"offset_init": np.zeros((3,))}),
],
)
def test_set_intercept_offset(klass, fit_params):
"""Check that `intercept_init` or `offset_init` is validated."""
sgd_estimator = klass()
with pytest.raises(ValueError, match="does not match dataset"):
sgd_estimator.fit(X, Y, **fit_params)
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDRegressor, SparseSGDRegressor]
)
def test_sgd_early_stopping_with_partial_fit(klass):
"""Check that we raise an error for `early_stopping` used with
`partial_fit`.
"""
err_msg = "early_stopping should be False with partial_fit"
with pytest.raises(ValueError, match=err_msg):
klass(early_stopping=True).partial_fit(X, Y)
@pytest.mark.parametrize(
"klass, fit_params",
[
(SGDClassifier, {"intercept_init": 0}),
(SparseSGDClassifier, {"intercept_init": 0}),
(SGDOneClassSVM, {"offset_init": 0}),
(SparseSGDOneClassSVM, {"offset_init": 0}),
],
)
def test_set_intercept_offset_binary(klass, fit_params):
"""Check that we can pass a scaler with binary classification to
`intercept_init` or `offset_init`."""
klass().fit(X5, Y5, **fit_params)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_average_binary_computed_correctly(klass):
# Checks the SGDClassifier correctly computes the average weights
eta = 0.1
alpha = 2.0
n_samples = 20
n_features = 10
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
w = rng.normal(size=n_features)
clf = klass(
loss="squared_error",
learning_rate="constant",
eta0=eta,
alpha=alpha,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
# simple linear function without noise
y = np.dot(X, w)
y = np.sign(y)
clf.fit(X, y)
average_weights, average_intercept = asgd(klass, X, y, eta, alpha)
average_weights = average_weights.reshape(1, -1)
assert_array_almost_equal(clf.coef_, average_weights, decimal=14)
assert_almost_equal(clf.intercept_, average_intercept, decimal=14)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_set_intercept_to_intercept(klass):
# Checks intercept_ shape consistency for the warm starts
# Inconsistent intercept_ shape.
clf = klass().fit(X5, Y5)
klass().fit(X5, Y5, intercept_init=clf.intercept_)
clf = klass().fit(X, Y)
klass().fit(X, Y, intercept_init=clf.intercept_)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_at_least_two_labels(klass):
# Target must have at least two labels
clf = klass(alpha=0.01, max_iter=20)
with pytest.raises(ValueError):
clf.fit(X2, np.ones(9))
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_partial_fit_weight_class_balanced(klass):
# partial_fit with class_weight='balanced' not supported"""
regex = (
r"class_weight 'balanced' is not supported for "
r"partial_fit\. In order to use 'balanced' weights, "
r"use compute_class_weight\('balanced', classes=classes, y=y\). "
r"In place of y you can use a large enough sample "
r"of the full training set target to properly "
r"estimate the class frequency distributions\. "
r"Pass the resulting weights as the class_weight "
r"parameter\."
)
with pytest.raises(ValueError, match=regex):
klass(class_weight="balanced").partial_fit(X, Y, classes=np.unique(Y))
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_multiclass(klass):
# Multi-class test case
clf = klass(alpha=0.01, max_iter=20).fit(X2, Y2)
assert clf.coef_.shape == (3, 2)
assert clf.intercept_.shape == (3,)
assert clf.decision_function([[0, 0]]).shape == (1, 3)
pred = clf.predict(T2)
assert_array_equal(pred, true_result2)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_multiclass_average(klass):
eta = 0.001
alpha = 0.01
# Multi-class average test case
clf = klass(
loss="squared_error",
learning_rate="constant",
eta0=eta,
alpha=alpha,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
np_Y2 = np.array(Y2)
clf.fit(X2, np_Y2)
classes = np.unique(np_Y2)
for i, cl in enumerate(classes):
y_i = np.ones(np_Y2.shape[0])
y_i[np_Y2 != cl] = -1
average_coef, average_intercept = asgd(klass, X2, y_i, eta, alpha)
assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16)
assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_multiclass_with_init_coef(klass):
# Multi-class test case
clf = klass(alpha=0.01, max_iter=20)
clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3))
assert clf.coef_.shape == (3, 2)
assert clf.intercept_.shape, (3,)
pred = clf.predict(T2)
assert_array_equal(pred, true_result2)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_multiclass_njobs(klass):
# Multi-class test case with multi-core support
clf = klass(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2)
assert clf.coef_.shape == (3, 2)
assert clf.intercept_.shape == (3,)
assert clf.decision_function([[0, 0]]).shape == (1, 3)
pred = clf.predict(T2)
assert_array_equal(pred, true_result2)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_set_coef_multiclass(klass):
# Checks coef_init and intercept_init shape for multi-class
# problems
# Provided coef_ does not match dataset
clf = klass()
with pytest.raises(ValueError):
clf.fit(X2, Y2, coef_init=np.zeros((2, 2)))
# Provided coef_ does match dataset
clf = klass().fit(X2, Y2, coef_init=np.zeros((3, 2)))
# Provided intercept_ does not match dataset
clf = klass()
with pytest.raises(ValueError):
clf.fit(X2, Y2, intercept_init=np.zeros((1,)))
# Provided intercept_ does match dataset.
clf = klass().fit(X2, Y2, intercept_init=np.zeros((3,)))
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_predict_proba_method_access(klass):
# Checks that SGDClassifier predict_proba and predict_log_proba methods
# can either be accessed or raise an appropriate error message
# otherwise. See
# https://github.com/scikit-learn/scikit-learn/issues/10938 for more
# details.
for loss in linear_model.SGDClassifier.loss_functions:
clf = SGDClassifier(loss=loss)
if loss in ("log_loss", "modified_huber"):
assert hasattr(clf, "predict_proba")
assert hasattr(clf, "predict_log_proba")
else:
inner_msg = "probability estimates are not available for loss={!r}".format(
loss
)
assert not hasattr(clf, "predict_proba")
assert not hasattr(clf, "predict_log_proba")
with pytest.raises(
AttributeError, match="has no attribute 'predict_proba'"
) as exec_info:
clf.predict_proba
assert isinstance(exec_info.value.__cause__, AttributeError)
assert inner_msg in str(exec_info.value.__cause__)
with pytest.raises(
AttributeError, match="has no attribute 'predict_log_proba'"
) as exec_info:
clf.predict_log_proba
assert isinstance(exec_info.value.__cause__, AttributeError)
assert inner_msg in str(exec_info.value.__cause__)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_proba(klass):
# Check SGD.predict_proba
# Hinge loss does not allow for conditional prob estimate.
# We cannot use the factory here, because it defines predict_proba
# anyway.
clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=10, tol=None).fit(X, Y)
assert not hasattr(clf, "predict_proba")
assert not hasattr(clf, "predict_log_proba")
# log and modified_huber losses can output probability estimates
# binary case
for loss in ["log_loss", "modified_huber"]:
clf = klass(loss=loss, alpha=0.01, max_iter=10)
clf.fit(X, Y)
p = clf.predict_proba([[3, 2]])
assert p[0, 1] > 0.5
p = clf.predict_proba([[-1, -1]])
assert p[0, 1] < 0.5
# If predict_proba is 0, we get "RuntimeWarning: divide by zero encountered
# in log". We avoid it here.
with np.errstate(divide="ignore"):
p = clf.predict_log_proba([[3, 2]])
assert p[0, 1] > p[0, 0]
p = clf.predict_log_proba([[-1, -1]])
assert p[0, 1] < p[0, 0]
# log loss multiclass probability estimates
clf = klass(loss="log_loss", alpha=0.01, max_iter=10).fit(X2, Y2)
d = clf.decision_function([[0.1, -0.1], [0.3, 0.2]])
p = clf.predict_proba([[0.1, -0.1], [0.3, 0.2]])
assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
assert_almost_equal(p[0].sum(), 1)
assert np.all(p[0] >= 0)
p = clf.predict_proba([[-1, -1]])
d = clf.decision_function([[-1, -1]])
assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))
lp = clf.predict_log_proba([[3, 2]])
p = clf.predict_proba([[3, 2]])
assert_array_almost_equal(np.log(p), lp)
lp = clf.predict_log_proba([[-1, -1]])
p = clf.predict_proba([[-1, -1]])
assert_array_almost_equal(np.log(p), lp)
# Modified Huber multiclass probability estimates; requires a separate
# test because the hard zero/one probabilities may destroy the
# ordering present in decision_function output.
clf = klass(loss="modified_huber", alpha=0.01, max_iter=10)
clf.fit(X2, Y2)
d = clf.decision_function([[3, 2]])
p = clf.predict_proba([[3, 2]])
if klass != SparseSGDClassifier:
assert np.argmax(d, axis=1) == np.argmax(p, axis=1)
else: # XXX the sparse test gets a different X2 (?)
assert np.argmin(d, axis=1) == np.argmin(p, axis=1)
# the following sample produces decision_function values < -1,
# which would cause naive normalization to fail (see comment
# in SGDClassifier.predict_proba)
x = X.mean(axis=0)
d = clf.decision_function([x])
if np.all(d < -1): # XXX not true in sparse test case (why?)
p = clf.predict_proba([x])
assert_array_almost_equal(p[0], [1 / 3.0] * 3)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sgd_l1(klass):
# Test L1 regularization
n = len(X4)
rng = np.random.RandomState(13)
idx = np.arange(n)
rng.shuffle(idx)
X = X4[idx, :]
Y = Y4[idx]
clf = klass(
penalty="l1",
alpha=0.2,
fit_intercept=False,
max_iter=2000,
tol=None,
shuffle=False,
)
clf.fit(X, Y)
assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,)))
pred = clf.predict(X)
assert_array_equal(pred, Y)
# test sparsify with dense inputs
clf.sparsify()
assert sp.issparse(clf.coef_)
pred = clf.predict(X)
assert_array_equal(pred, Y)
# pickle and unpickle with sparse coef_
clf = pickle.loads(pickle.dumps(clf))
assert sp.issparse(clf.coef_)
pred = clf.predict(X)
assert_array_equal(pred, Y)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_class_weights(klass):
# Test class weights.
X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]])
y = [1, 1, 1, -1, -1]
clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None)
clf.fit(X, y)
assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
# we give a small weights to class 1
clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001})
clf.fit(X, y)
# now the hyperplane should rotate clock-wise and
# the prediction on this point should shift
assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_equal_class_weight(klass):
# Test if equal class weights approx. equals no class weights.
X = [[1, 0], [1, 0], [0, 1], [0, 1]]
y = [0, 0, 1, 1]
clf = klass(alpha=0.1, max_iter=1000, class_weight=None)
clf.fit(X, y)
X = [[1, 0], [0, 1]]
y = [0, 1]
clf_weighted = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5})
clf_weighted.fit(X, y)
# should be similar up to some epsilon due to learning rate schedule
assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_wrong_class_weight_label(klass):
# ValueError due to not existing class label.
clf = klass(alpha=0.1, max_iter=1000, class_weight={0: 0.5})
with pytest.raises(ValueError):
clf.fit(X, Y)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_weights_multiplied(klass):
# Tests that class_weight and sample_weight are multiplicative
class_weights = {1: 0.6, 2: 0.3}
rng = np.random.RandomState(0)
sample_weights = rng.random_sample(Y4.shape[0])
multiplied_together = np.copy(sample_weights)
multiplied_together[Y4 == 1] *= class_weights[1]
multiplied_together[Y4 == 2] *= class_weights[2]
clf1 = klass(alpha=0.1, max_iter=20, class_weight=class_weights)
clf2 = klass(alpha=0.1, max_iter=20)
clf1.fit(X4, Y4, sample_weight=sample_weights)
clf2.fit(X4, Y4, sample_weight=multiplied_together)
assert_almost_equal(clf1.coef_, clf2.coef_)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_balanced_weight(klass):
# Test class weights for imbalanced data"""
# compute reference metrics on iris dataset that is quite balanced by
# default
X, y = iris.data, iris.target
X = scale(X)
idx = np.arange(X.shape[0])
rng = np.random.RandomState(6)
rng.shuffle(idx)
X = X[idx]
y = y[idx]
clf = klass(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y)
f1 = metrics.f1_score(y, clf.predict(X), average="weighted")
assert_almost_equal(f1, 0.96, decimal=1)
# make the same prediction using balanced class_weight
clf_balanced = klass(
alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False
).fit(X, y)
f1 = metrics.f1_score(y, clf_balanced.predict(X), average="weighted")
assert_almost_equal(f1, 0.96, decimal=1)
# Make sure that in the balanced case it does not change anything
# to use "balanced"
assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6)
# build a very very imbalanced dataset out of iris data
X_0 = X[y == 0, :]
y_0 = y[y == 0]
X_imbalanced = np.vstack([X] + [X_0] * 10)
y_imbalanced = np.concatenate([y] + [y_0] * 10)
# fit a model on the imbalanced data without class weight info
clf = klass(max_iter=1000, class_weight=None, shuffle=False)
clf.fit(X_imbalanced, y_imbalanced)
y_pred = clf.predict(X)
assert metrics.f1_score(y, y_pred, average="weighted") < 0.96
# fit a model with balanced class_weight enabled
clf = klass(max_iter=1000, class_weight="balanced", shuffle=False)
clf.fit(X_imbalanced, y_imbalanced)
y_pred = clf.predict(X)
assert metrics.f1_score(y, y_pred, average="weighted") > 0.96
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_sample_weights(klass):
# Test weights on individual samples
X = np.array([[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]])
y = [1, 1, 1, -1, -1]
clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False)
clf.fit(X, y)
assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))
# we give a small weights to class 1
clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2)
# now the hyperplane should rotate clock-wise and
# the prediction on this point should shift
assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
@pytest.mark.parametrize(
"klass", [SGDClassifier, SparseSGDClassifier, SGDOneClassSVM, SparseSGDOneClassSVM]
)
def test_wrong_sample_weights(klass):
# Test if ValueError is raised if sample_weight has wrong shape
if klass in [SGDClassifier, SparseSGDClassifier]:
clf = klass(alpha=0.1, max_iter=1000, fit_intercept=False)
elif klass in [SGDOneClassSVM, SparseSGDOneClassSVM]:
clf = klass(nu=0.1, max_iter=1000, fit_intercept=False)
# provided sample_weight too long
with pytest.raises(ValueError):
clf.fit(X, Y, sample_weight=np.arange(7))
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_partial_fit_exception(klass):
clf = klass(alpha=0.01)
# classes was not specified
with pytest.raises(ValueError):
clf.partial_fit(X3, Y3)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_partial_fit_binary(klass):
third = X.shape[0] // 3
clf = klass(alpha=0.01)
classes = np.unique(Y)
clf.partial_fit(X[:third], Y[:third], classes=classes)
assert clf.coef_.shape == (1, X.shape[1])
assert clf.intercept_.shape == (1,)
assert clf.decision_function([[0, 0]]).shape == (1,)
id1 = id(clf.coef_.data)
clf.partial_fit(X[third:], Y[third:])
id2 = id(clf.coef_.data)
# check that coef_ haven't been re-allocated
assert id1, id2
y_pred = clf.predict(T)
assert_array_equal(y_pred, true_result)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_partial_fit_multiclass(klass):
third = X2.shape[0] // 3
clf = klass(alpha=0.01)
classes = np.unique(Y2)
clf.partial_fit(X2[:third], Y2[:third], classes=classes)
assert clf.coef_.shape == (3, X2.shape[1])
assert clf.intercept_.shape == (3,)
assert clf.decision_function([[0, 0]]).shape == (1, 3)
id1 = id(clf.coef_.data)
clf.partial_fit(X2[third:], Y2[third:])
id2 = id(clf.coef_.data)
# check that coef_ haven't been re-allocated
assert id1, id2
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_partial_fit_multiclass_average(klass):
third = X2.shape[0] // 3
clf = klass(alpha=0.01, average=X2.shape[0])
classes = np.unique(Y2)
clf.partial_fit(X2[:third], Y2[:third], classes=classes)
assert clf.coef_.shape == (3, X2.shape[1])
assert clf.intercept_.shape == (3,)
clf.partial_fit(X2[third:], Y2[third:])
assert clf.coef_.shape == (3, X2.shape[1])
assert clf.intercept_.shape == (3,)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_fit_then_partial_fit(klass):
# Partial_fit should work after initial fit in the multiclass case.
# Non-regression test for #2496; fit would previously produce a
# Fortran-ordered coef_ that subsequent partial_fit couldn't handle.
clf = klass()
clf.fit(X2, Y2)
clf.partial_fit(X2, Y2) # no exception here
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"])
def test_partial_fit_equal_fit_classif(klass, lr):
for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)):
clf = klass(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False)
clf.fit(X_, Y_)
y_pred = clf.decision_function(T_)
t = clf.t_
classes = np.unique(Y_)
clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False)
for i in range(2):
clf.partial_fit(X_, Y_, classes=classes)
y_pred2 = clf.decision_function(T_)
assert clf.t_ == t
assert_array_almost_equal(y_pred, y_pred2, decimal=2)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_regression_losses(klass):
random_state = np.random.RandomState(1)
clf = klass(
alpha=0.01,
learning_rate="constant",
eta0=0.1,
loss="epsilon_insensitive",
random_state=random_state,
)
clf.fit(X, Y)
assert 1.0 == np.mean(clf.predict(X) == Y)
clf = klass(
alpha=0.01,
learning_rate="constant",
eta0=0.1,
loss="squared_epsilon_insensitive",
random_state=random_state,
)
clf.fit(X, Y)
assert 1.0 == np.mean(clf.predict(X) == Y)
clf = klass(alpha=0.01, loss="huber", random_state=random_state)
clf.fit(X, Y)
assert 1.0 == np.mean(clf.predict(X) == Y)
clf = klass(
alpha=0.01,
learning_rate="constant",
eta0=0.01,
loss="squared_error",
random_state=random_state,
)
clf.fit(X, Y)
assert 1.0 == np.mean(clf.predict(X) == Y)
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_warm_start_multiclass(klass):
_test_warm_start(klass, X2, Y2, "optimal")
@pytest.mark.parametrize("klass", [SGDClassifier, SparseSGDClassifier])
def test_multiple_fit(klass):
# Test multiple calls of fit w/ different shaped inputs.
clf = klass(alpha=0.01, shuffle=False)
clf.fit(X, Y)
assert hasattr(clf, "coef_")
# Non-regression test: try fitting with a different label set.
y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)]
clf.fit(X[:, :-1], y)
###############################################################################
# Regression Test Case
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_sgd_reg(klass):
# Check that SGD gives any results.
clf = klass(alpha=0.1, max_iter=2, fit_intercept=False)
clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
assert clf.coef_[0] == clf.coef_[1]
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_sgd_averaged_computed_correctly(klass):
# Tests the average regressor matches the naive implementation
eta = 0.001
alpha = 0.01
n_samples = 20
n_features = 10
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
w = rng.normal(size=n_features)
# simple linear function without noise
y = np.dot(X, w)
clf = klass(
loss="squared_error",
learning_rate="constant",
eta0=eta,
alpha=alpha,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
clf.fit(X, y)
average_weights, average_intercept = asgd(klass, X, y, eta, alpha)
assert_array_almost_equal(clf.coef_, average_weights, decimal=16)
assert_almost_equal(clf.intercept_, average_intercept, decimal=16)
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_sgd_averaged_partial_fit(klass):
# Tests whether the partial fit yields the same average as the fit
eta = 0.001
alpha = 0.01
n_samples = 20
n_features = 10
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
w = rng.normal(size=n_features)
# simple linear function without noise
y = np.dot(X, w)
clf = klass(
loss="squared_error",
learning_rate="constant",
eta0=eta,
alpha=alpha,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
clf.partial_fit(X[: int(n_samples / 2)][:], y[: int(n_samples / 2)])
clf.partial_fit(X[int(n_samples / 2) :][:], y[int(n_samples / 2) :])
average_weights, average_intercept = asgd(klass, X, y, eta, alpha)
assert_array_almost_equal(clf.coef_, average_weights, decimal=16)
assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16)
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_average_sparse(klass):
# Checks the average weights on data with 0s
eta = 0.001
alpha = 0.01
clf = klass(
loss="squared_error",
learning_rate="constant",
eta0=eta,
alpha=alpha,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
n_samples = Y3.shape[0]
clf.partial_fit(X3[: int(n_samples / 2)][:], Y3[: int(n_samples / 2)])
clf.partial_fit(X3[int(n_samples / 2) :][:], Y3[int(n_samples / 2) :])
average_weights, average_intercept = asgd(klass, X3, Y3, eta, alpha)
assert_array_almost_equal(clf.coef_, average_weights, decimal=16)
assert_almost_equal(clf.intercept_, average_intercept, decimal=16)
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_sgd_least_squares_fit(klass):
xmin, xmax = -5, 5
n_samples = 100
rng = np.random.RandomState(0)
X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
# simple linear function without noise
y = 0.5 * X.ravel()
clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False)
clf.fit(X, y)
score = clf.score(X, y)
assert score > 0.99
# simple linear function with noise
y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
clf = klass(loss="squared_error", alpha=0.1, max_iter=20, fit_intercept=False)
clf.fit(X, y)
score = clf.score(X, y)
assert score > 0.5
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_sgd_epsilon_insensitive(klass):
xmin, xmax = -5, 5
n_samples = 100
rng = np.random.RandomState(0)
X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
# simple linear function without noise
y = 0.5 * X.ravel()
clf = klass(
loss="epsilon_insensitive",
epsilon=0.01,
alpha=0.1,
max_iter=20,
fit_intercept=False,
)
clf.fit(X, y)
score = clf.score(X, y)
assert score > 0.99
# simple linear function with noise
y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
clf = klass(
loss="epsilon_insensitive",
epsilon=0.01,
alpha=0.1,
max_iter=20,
fit_intercept=False,
)
clf.fit(X, y)
score = clf.score(X, y)
assert score > 0.5
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_sgd_huber_fit(klass):
xmin, xmax = -5, 5
n_samples = 100
rng = np.random.RandomState(0)
X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)
# simple linear function without noise
y = 0.5 * X.ravel()
clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False)
clf.fit(X, y)
score = clf.score(X, y)
assert score > 0.99
# simple linear function with noise
y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
clf = klass(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False)
clf.fit(X, y)
score = clf.score(X, y)
assert score > 0.5
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_elasticnet_convergence(klass):
# Check that the SGD output is consistent with coordinate descent
n_samples, n_features = 1000, 5
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# ground_truth linear model that generate y from X and to which the
# models should converge if the regularizer would be set to 0.0
ground_truth_coef = rng.randn(n_features)
y = np.dot(X, ground_truth_coef)
# XXX: alpha = 0.1 seems to cause convergence problems
for alpha in [0.01, 0.001]:
for l1_ratio in [0.5, 0.8, 1.0]:
cd = linear_model.ElasticNet(
alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False
)
cd.fit(X, y)
sgd = klass(
penalty="elasticnet",
max_iter=50,
alpha=alpha,
l1_ratio=l1_ratio,
fit_intercept=False,
)
sgd.fit(X, y)
err_msg = (
"cd and sgd did not converge to comparable "
"results for alpha=%f and l1_ratio=%f" % (alpha, l1_ratio)
)
assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, err_msg=err_msg)
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_partial_fit(klass):
third = X.shape[0] // 3
clf = klass(alpha=0.01)
clf.partial_fit(X[:third], Y[:third])
assert clf.coef_.shape == (X.shape[1],)
assert clf.intercept_.shape == (1,)
assert clf.predict([[0, 0]]).shape == (1,)
id1 = id(clf.coef_.data)
clf.partial_fit(X[third:], Y[third:])
id2 = id(clf.coef_.data)
# check that coef_ haven't been re-allocated
assert id1, id2
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"])
def test_partial_fit_equal_fit(klass, lr):
clf = klass(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False)
clf.fit(X, Y)
y_pred = clf.predict(T)
t = clf.t_
clf = klass(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False)
for i in range(2):
clf.partial_fit(X, Y)
y_pred2 = clf.predict(T)
assert clf.t_ == t
assert_array_almost_equal(y_pred, y_pred2, decimal=2)
@pytest.mark.parametrize("klass", [SGDRegressor, SparseSGDRegressor])
def test_loss_function_epsilon(klass):
clf = klass(epsilon=0.9)
clf.set_params(epsilon=0.1)
assert clf.loss_functions["huber"][1] == 0.1
###############################################################################
# SGD One Class SVM Test Case
# a simple implementation of ASGD to use for testing SGDOneClassSVM
def asgd_oneclass(klass, X, eta, nu, coef_init=None, offset_init=0.0):
if coef_init is None:
coef = np.zeros(X.shape[1])
else:
coef = coef_init
average_coef = np.zeros(X.shape[1])
offset = offset_init
intercept = 1 - offset
average_intercept = 0.0
decay = 1.0
# sparse data has a fixed decay of .01
if klass == SparseSGDOneClassSVM:
decay = 0.01
for i, entry in enumerate(X):
p = np.dot(entry, coef)
p += intercept
if p <= 1.0:
gradient = -1
else:
gradient = 0
coef *= max(0, 1.0 - (eta * nu / 2))
coef += -(eta * gradient * entry)
intercept += -(eta * (nu + gradient)) * decay
average_coef *= i
average_coef += coef
average_coef /= i + 1.0
average_intercept *= i
average_intercept += intercept
average_intercept /= i + 1.0
return average_coef, 1 - average_intercept
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def _test_warm_start_oneclass(klass, X, lr):
# Test that explicit warm restart...
clf = klass(nu=0.5, eta0=0.01, shuffle=False, learning_rate=lr)
clf.fit(X)
clf2 = klass(nu=0.1, eta0=0.01, shuffle=False, learning_rate=lr)
clf2.fit(X, coef_init=clf.coef_.copy(), offset_init=clf.offset_.copy())
# ... and implicit warm restart are equivalent.
clf3 = klass(nu=0.5, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr)
clf3.fit(X)
assert clf3.t_ == clf.t_
assert_allclose(clf3.coef_, clf.coef_)
clf3.set_params(nu=0.1)
clf3.fit(X)
assert clf3.t_ == clf2.t_
assert_allclose(clf3.coef_, clf2.coef_)
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"])
def test_warm_start_oneclass(klass, lr):
_test_warm_start_oneclass(klass, X, lr)
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def test_clone_oneclass(klass):
# Test whether clone works ok.
clf = klass(nu=0.5)
clf = clone(clf)
clf.set_params(nu=0.1)
clf.fit(X)
clf2 = klass(nu=0.1)
clf2.fit(X)
assert_array_equal(clf.coef_, clf2.coef_)
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def test_partial_fit_oneclass(klass):
third = X.shape[0] // 3
clf = klass(nu=0.1)
clf.partial_fit(X[:third])
assert clf.coef_.shape == (X.shape[1],)
assert clf.offset_.shape == (1,)
assert clf.predict([[0, 0]]).shape == (1,)
previous_coefs = clf.coef_
clf.partial_fit(X[third:])
# check that coef_ haven't been re-allocated
assert clf.coef_ is previous_coefs
# raises ValueError if number of features does not match previous data
with pytest.raises(ValueError):
clf.partial_fit(X[:, 1])
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
@pytest.mark.parametrize("lr", ["constant", "optimal", "invscaling", "adaptive"])
def test_partial_fit_equal_fit_oneclass(klass, lr):
clf = klass(nu=0.05, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False)
clf.fit(X)
y_scores = clf.decision_function(T)
t = clf.t_
coef = clf.coef_
offset = clf.offset_
clf = klass(nu=0.05, eta0=0.01, max_iter=1, learning_rate=lr, shuffle=False)
for _ in range(2):
clf.partial_fit(X)
y_scores2 = clf.decision_function(T)
assert clf.t_ == t
assert_allclose(y_scores, y_scores2)
assert_allclose(clf.coef_, coef)
assert_allclose(clf.offset_, offset)
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def test_late_onset_averaging_reached_oneclass(klass):
# Test average
eta0 = 0.001
nu = 0.05
# 2 passes over the training set but average only at second pass
clf1 = klass(
average=7, learning_rate="constant", eta0=eta0, nu=nu, max_iter=2, shuffle=False
)
# 1 pass over the training set with no averaging
clf2 = klass(
average=False,
learning_rate="constant",
eta0=eta0,
nu=nu,
max_iter=1,
shuffle=False,
)
clf1.fit(X)
clf2.fit(X)
# Start from clf2 solution, compute averaging using asgd function and
# compare with clf1 solution
average_coef, average_offset = asgd_oneclass(
klass, X, eta0, nu, coef_init=clf2.coef_.ravel(), offset_init=clf2.offset_
)
assert_allclose(clf1.coef_.ravel(), average_coef.ravel())
assert_allclose(clf1.offset_, average_offset)
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def test_sgd_averaged_computed_correctly_oneclass(klass):
# Tests the average SGD One-Class SVM matches the naive implementation
eta = 0.001
nu = 0.05
n_samples = 20
n_features = 10
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
clf = klass(
learning_rate="constant",
eta0=eta,
nu=nu,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
clf.fit(X)
average_coef, average_offset = asgd_oneclass(klass, X, eta, nu)
assert_allclose(clf.coef_, average_coef)
assert_allclose(clf.offset_, average_offset)
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def test_sgd_averaged_partial_fit_oneclass(klass):
# Tests whether the partial fit yields the same average as the fit
eta = 0.001
nu = 0.05
n_samples = 20
n_features = 10
rng = np.random.RandomState(0)
X = rng.normal(size=(n_samples, n_features))
clf = klass(
learning_rate="constant",
eta0=eta,
nu=nu,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
clf.partial_fit(X[: int(n_samples / 2)][:])
clf.partial_fit(X[int(n_samples / 2) :][:])
average_coef, average_offset = asgd_oneclass(klass, X, eta, nu)
assert_allclose(clf.coef_, average_coef)
assert_allclose(clf.offset_, average_offset)
@pytest.mark.parametrize("klass", [SGDOneClassSVM, SparseSGDOneClassSVM])
def test_average_sparse_oneclass(klass):
# Checks the average coef on data with 0s
eta = 0.001
nu = 0.01
clf = klass(
learning_rate="constant",
eta0=eta,
nu=nu,
fit_intercept=True,
max_iter=1,
average=True,
shuffle=False,
)
n_samples = X3.shape[0]
clf.partial_fit(X3[: int(n_samples / 2)])
clf.partial_fit(X3[int(n_samples / 2) :])
average_coef, average_offset = asgd_oneclass(klass, X3, eta, nu)
assert_allclose(clf.coef_, average_coef)
assert_allclose(clf.offset_, average_offset)
def test_sgd_oneclass():
# Test fit, decision_function, predict and score_samples on a toy
# dataset
X_train = np.array([[-2, -1], [-1, -1], [1, 1]])
X_test = np.array([[0.5, -2], [2, 2]])
clf = SGDOneClassSVM(
nu=0.5, eta0=1, learning_rate="constant", shuffle=False, max_iter=1
)
clf.fit(X_train)
assert_allclose(clf.coef_, np.array([-0.125, 0.4375]))
assert clf.offset_[0] == -0.5
scores = clf.score_samples(X_test)
assert_allclose(scores, np.array([-0.9375, 0.625]))
dec = clf.score_samples(X_test) - clf.offset_
assert_allclose(clf.decision_function(X_test), dec)
pred = clf.predict(X_test)
assert_array_equal(pred, np.array([-1, 1]))
def test_ocsvm_vs_sgdocsvm():
# Checks SGDOneClass SVM gives a good approximation of kernelized
# One-Class SVM
nu = 0.05
gamma = 2.0
random_state = 42
# Generate train and test data
rng = np.random.RandomState(random_state)
X = 0.3 * rng.randn(500, 2)
X_train = np.r_[X + 2, X - 2]
X = 0.3 * rng.randn(100, 2)
X_test = np.r_[X + 2, X - 2]
# One-Class SVM
clf = OneClassSVM(gamma=gamma, kernel="rbf", nu=nu)
clf.fit(X_train)
y_pred_ocsvm = clf.predict(X_test)
dec_ocsvm = clf.decision_function(X_test).reshape(1, -1)
# SGDOneClassSVM using kernel approximation
max_iter = 15
transform = Nystroem(gamma=gamma, random_state=random_state)
clf_sgd = SGDOneClassSVM(
nu=nu,
shuffle=True,
fit_intercept=True,
max_iter=max_iter,
random_state=random_state,
tol=None,
)
pipe_sgd = make_pipeline(transform, clf_sgd)
pipe_sgd.fit(X_train)
y_pred_sgdocsvm = pipe_sgd.predict(X_test)
dec_sgdocsvm = pipe_sgd.decision_function(X_test).reshape(1, -1)
assert np.mean(y_pred_sgdocsvm == y_pred_ocsvm) >= 0.99
corrcoef = np.corrcoef(np.concatenate((dec_ocsvm, dec_sgdocsvm)))[0, 1]
assert corrcoef >= 0.9
def test_sgd_oneclass_convergence():
# Check that the optimization does not end early and that the stopping criterion
# is working. Non-regression test for #30027
for nu in [0.1, 0.5, 0.9]:
# no need for large max_iter
model = SGDOneClassSVM(
nu=nu, max_iter=100, tol=1e-3, learning_rate="constant", eta0=1e-3
)
model.fit(iris.data)
# 6 is the minimal number of iterations that should be surpassed, after which
# the optimization can stop
assert model.n_iter_ > 6
def test_sgd_oneclass_vs_linear_oneclass():
# Test convergence vs. liblinear `OneClassSVM` with kernel="linear"
for nu in [0.1, 0.5, 0.9]:
# allow enough iterations, small dataset
model = SGDOneClassSVM(
nu=nu, max_iter=20000, tol=None, learning_rate="constant", eta0=1e-3
)
model_ref = OneClassSVM(kernel="linear", nu=nu, tol=1e-6) # reference model
model.fit(iris.data)
model_ref.fit(iris.data)
preds = model.predict(iris.data)
dec_fn = model.decision_function(iris.data)
preds_ref = model_ref.predict(iris.data)
dec_fn_ref = model_ref.decision_function(iris.data)
dec_fn_corr = np.corrcoef(dec_fn, dec_fn_ref)[0, 1]
preds_corr = np.corrcoef(preds, preds_ref)[0, 1]
# check weights and intercept concatenated together for correlation
coef_corr = np.corrcoef(
np.concatenate([model.coef_, -model.offset_]),
np.concatenate([model_ref.coef_.flatten(), model_ref.intercept_]),
)[0, 1]
# share of predicted 1's
share_ones = (preds == 1).sum() / len(preds)
assert dec_fn_corr > 0.99
assert preds_corr > 0.95
assert coef_corr > 0.99
assert_allclose(1 - share_ones, nu)
def test_l1_ratio():
# Test if l1 ratio extremes match L1 and L2 penalty settings.
X, y = datasets.make_classification(
n_samples=1000, n_features=100, n_informative=20, random_state=1234
)
# test if elasticnet with l1_ratio near 1 gives same result as pure l1
est_en = SGDClassifier(
alpha=0.001,
penalty="elasticnet",
tol=None,
max_iter=6,
l1_ratio=0.9999999999,
random_state=42,
).fit(X, y)
est_l1 = SGDClassifier(
alpha=0.001, penalty="l1", max_iter=6, random_state=42, tol=None
).fit(X, y)
assert_array_almost_equal(est_en.coef_, est_l1.coef_)
# test if elasticnet with l1_ratio near 0 gives same result as pure l2
est_en = SGDClassifier(
alpha=0.001,
penalty="elasticnet",
tol=None,
max_iter=6,
l1_ratio=0.0000000001,
random_state=42,
).fit(X, y)
est_l2 = SGDClassifier(
alpha=0.001, penalty="l2", max_iter=6, random_state=42, tol=None
).fit(X, y)
assert_array_almost_equal(est_en.coef_, est_l2.coef_)
def test_underflow_or_overlow():
with np.errstate(all="raise"):
# Generate some weird data with hugely unscaled features
rng = np.random.RandomState(0)
n_samples = 100
n_features = 10
X = rng.normal(size=(n_samples, n_features))
X[:, :2] *= 1e300
assert np.isfinite(X).all()
# Use MinMaxScaler to scale the data without introducing a numerical
# instability (computing the standard deviation naively is not possible
# on this data)
X_scaled = MinMaxScaler().fit_transform(X)
assert np.isfinite(X_scaled).all()
# Define a ground truth on the scaled data
ground_truth = rng.normal(size=n_features)
y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32)
assert_array_equal(np.unique(y), [0, 1])
model = SGDClassifier(alpha=0.1, loss="squared_hinge", max_iter=500)
# smoke test: model is stable on scaled data
model.fit(X_scaled, y)
assert np.isfinite(model.coef_).all()
# model is numerically unstable on unscaled data
msg_regxp = (
r"Floating-point under-/overflow occurred at epoch #.*"
" Scaling input data with StandardScaler or MinMaxScaler"
" might help."
)
with pytest.raises(ValueError, match=msg_regxp):
model.fit(X, y)
def test_numerical_stability_large_gradient():
# Non regression test case for numerical stability on scaled problems
# where the gradient can still explode with some losses
model = SGDClassifier(
loss="squared_hinge",
max_iter=10,
shuffle=True,
penalty="elasticnet",
l1_ratio=0.3,
alpha=0.01,
eta0=0.001,
random_state=0,
tol=None,
)
with np.errstate(all="raise"):
model.fit(iris.data, iris.target)
assert np.isfinite(model.coef_).all()
@pytest.mark.parametrize("penalty", ["l2", "l1", "elasticnet"])
def test_large_regularization(penalty):
# Non regression tests for numerical stability issues caused by large
# regularization parameters
model = SGDClassifier(
alpha=1e5,
learning_rate="constant",
eta0=0.1,
penalty=penalty,
shuffle=False,
tol=None,
max_iter=6,
)
with np.errstate(all="raise"):
model.fit(iris.data, iris.target)
assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_))
def test_tol_parameter():
# Test that the tol parameter behaves as expected
X = StandardScaler().fit_transform(iris.data)
y = iris.target == 1
# With tol is None, the number of iteration should be equal to max_iter
max_iter = 42
model_0 = SGDClassifier(tol=None, random_state=0, max_iter=max_iter)
model_0.fit(X, y)
assert max_iter == model_0.n_iter_
# If tol is not None, the number of iteration should be less than max_iter
max_iter = 2000
model_1 = SGDClassifier(tol=0, random_state=0, max_iter=max_iter)
model_1.fit(X, y)
assert max_iter > model_1.n_iter_
assert model_1.n_iter_ > 5
# A larger tol should yield a smaller number of iteration
model_2 = SGDClassifier(tol=0.1, random_state=0, max_iter=max_iter)
model_2.fit(X, y)
assert model_1.n_iter_ > model_2.n_iter_
assert model_2.n_iter_ > 3
# Strict tolerance and small max_iter should trigger a warning
model_3 = SGDClassifier(max_iter=3, tol=1e-3, random_state=0)
warning_message = (
"Maximum number of iteration reached before "
"convergence. Consider increasing max_iter to "
"improve the fit."
)
with pytest.warns(ConvergenceWarning, match=warning_message):
model_3.fit(X, y)
assert model_3.n_iter_ == 3
def _test_loss_common(loss_function, cases):
# Test the different loss functions
# cases is a list of (p, y, expected)
for p, y, expected_loss, expected_dloss in cases:
assert_almost_equal(loss_function.py_loss(p, y), expected_loss)
assert_almost_equal(loss_function.py_dloss(p, y), expected_dloss)
def test_loss_hinge():
# Test Hinge (hinge / perceptron)
# hinge
loss = sgd_fast.Hinge(1.0)
cases = [
# (p, y, expected_loss, expected_dloss)
(1.1, 1.0, 0.0, 0.0),
(-2.0, -1.0, 0.0, 0.0),
(1.0, 1.0, 0.0, -1.0),
(-1.0, -1.0, 0.0, 1.0),
(0.5, 1.0, 0.5, -1.0),
(2.0, -1.0, 3.0, 1.0),
(-0.5, -1.0, 0.5, 1.0),
(0.0, 1.0, 1, -1.0),
]
_test_loss_common(loss, cases)
# perceptron
loss = sgd_fast.Hinge(0.0)
cases = [
# (p, y, expected_loss, expected_dloss)
(1.0, 1.0, 0.0, 0.0),
(-0.1, -1.0, 0.0, 0.0),
(0.0, 1.0, 0.0, -1.0),
(0.0, -1.0, 0.0, 1.0),
(0.5, -1.0, 0.5, 1.0),
(2.0, -1.0, 2.0, 1.0),
(-0.5, 1.0, 0.5, -1.0),
(-1.0, 1.0, 1.0, -1.0),
]
_test_loss_common(loss, cases)
def test_gradient_squared_hinge():
# Test SquaredHinge
loss = sgd_fast.SquaredHinge(1.0)
cases = [
# (p, y, expected_loss, expected_dloss)
(1.0, 1.0, 0.0, 0.0),
(-2.0, -1.0, 0.0, 0.0),
(1.0, -1.0, 4.0, 4.0),
(-1.0, 1.0, 4.0, -4.0),
(0.5, 1.0, 0.25, -1.0),
(0.5, -1.0, 2.25, 3.0),
]
_test_loss_common(loss, cases)
def test_loss_modified_huber():
# (p, y, expected_loss, expected_dloss)
loss = sgd_fast.ModifiedHuber()
cases = [
# (p, y, expected_loss, expected_dloss)
(1.0, 1.0, 0.0, 0.0),
(-1.0, -1.0, 0.0, 0.0),
(2.0, 1.0, 0.0, 0.0),
(0.0, 1.0, 1.0, -2.0),
(-1.0, 1.0, 4.0, -4.0),
(0.5, -1.0, 2.25, 3.0),
(-2.0, 1.0, 8, -4.0),
(-3.0, 1.0, 12, -4.0),
]
_test_loss_common(loss, cases)
def test_loss_epsilon_insensitive():
# Test EpsilonInsensitive
loss = sgd_fast.EpsilonInsensitive(0.1)
cases = [
# (p, y, expected_loss, expected_dloss)
(0.0, 0.0, 0.0, 0.0),
(0.1, 0.0, 0.0, 0.0),
(-2.05, -2.0, 0.0, 0.0),
(3.05, 3.0, 0.0, 0.0),
(2.2, 2.0, 0.1, 1.0),
(2.0, -1.0, 2.9, 1.0),
(2.0, 2.2, 0.1, -1.0),
(-2.0, 1.0, 2.9, -1.0),
]
_test_loss_common(loss, cases)
def test_loss_squared_epsilon_insensitive():
# Test SquaredEpsilonInsensitive
loss = sgd_fast.SquaredEpsilonInsensitive(0.1)
cases = [
# (p, y, expected_loss, expected_dloss)
(0.0, 0.0, 0.0, 0.0),
(0.1, 0.0, 0.0, 0.0),
(-2.05, -2.0, 0.0, 0.0),
(3.05, 3.0, 0.0, 0.0),
(2.2, 2.0, 0.01, 0.2),
(2.0, -1.0, 8.41, 5.8),
(2.0, 2.2, 0.01, -0.2),
(-2.0, 1.0, 8.41, -5.8),
]
_test_loss_common(loss, cases)
def test_multi_thread_multi_class_and_early_stopping():
# This is a non-regression test for a bad interaction between
# early stopping internal attribute and thread-based parallelism.
clf = SGDClassifier(
alpha=1e-3,
tol=1e-3,
max_iter=1000,
early_stopping=True,
n_iter_no_change=100,
random_state=0,
n_jobs=2,
)
clf.fit(iris.data, iris.target)
assert clf.n_iter_ > clf.n_iter_no_change
assert clf.n_iter_ < clf.n_iter_no_change + 20
assert clf.score(iris.data, iris.target) > 0.8
def test_multi_core_gridsearch_and_early_stopping():
# This is a non-regression test for a bad interaction between
# early stopping internal attribute and process-based multi-core
# parallelism.
param_grid = {
"alpha": np.logspace(-4, 4, 9),
"n_iter_no_change": [5, 10, 50],
}
clf = SGDClassifier(tol=1e-2, max_iter=1000, early_stopping=True, random_state=0)
search = RandomizedSearchCV(clf, param_grid, n_iter=5, n_jobs=2, random_state=0)
search.fit(iris.data, iris.target)
assert search.best_score_ > 0.8
@pytest.mark.parametrize("backend", ["loky", "multiprocessing", "threading"])
def test_SGDClassifier_fit_for_all_backends(backend):
# This is a non-regression smoke test. In the multi-class case,
# SGDClassifier.fit fits each class in a one-versus-all fashion using
# joblib.Parallel. However, each OvA step updates the coef_ attribute of
# the estimator in-place. Internally, SGDClassifier calls Parallel using
# require='sharedmem'. This test makes sure SGDClassifier.fit works
# consistently even when the user asks for a backend that does not provide
# sharedmem semantics.
# We further test a case where memmapping would have been used if
# SGDClassifier.fit was called from a loky or multiprocessing backend. In
# this specific case, in-place modification of clf.coef_ would have caused
# a segmentation fault when trying to write in a readonly memory mapped
# buffer.
random_state = np.random.RandomState(42)
# Create a classification problem with 50000 features and 20 classes. Using
# loky or multiprocessing this make the clf.coef_ exceed the threshold
# above which memmaping is used in joblib and loky (1MB as of 2018/11/1).
X = sp.random(500, 2000, density=0.02, format="csr", random_state=random_state)
y = random_state.choice(20, 500)
# Begin by fitting a SGD classifier sequentially
clf_sequential = SGDClassifier(max_iter=1000, n_jobs=1, random_state=42)
clf_sequential.fit(X, y)
# Fit a SGDClassifier using the specified backend, and make sure the
# coefficients are equal to those obtained using a sequential fit
clf_parallel = SGDClassifier(max_iter=1000, n_jobs=4, random_state=42)
with joblib.parallel_backend(backend=backend):
clf_parallel.fit(X, y)
assert_array_almost_equal(clf_sequential.coef_, clf_parallel.coef_)
@pytest.mark.parametrize(
"Estimator", [linear_model.SGDClassifier, linear_model.SGDRegressor]
)
def test_sgd_random_state(Estimator, global_random_seed):
# Train the same model on the same data without converging and check that we
# get reproducible results by fixing the random seed.
if Estimator == linear_model.SGDRegressor:
X, y = datasets.make_regression(random_state=global_random_seed)
else:
X, y = datasets.make_classification(random_state=global_random_seed)
# Fitting twice a model with the same hyper-parameters on the same training
# set with the same seed leads to the same results deterministically.
est = Estimator(random_state=global_random_seed, max_iter=1)
with pytest.warns(ConvergenceWarning):
coef_same_seed_a = est.fit(X, y).coef_
assert est.n_iter_ == 1
est = Estimator(random_state=global_random_seed, max_iter=1)
with pytest.warns(ConvergenceWarning):
coef_same_seed_b = est.fit(X, y).coef_
assert est.n_iter_ == 1
assert_allclose(coef_same_seed_a, coef_same_seed_b)
# Fitting twice a model with the same hyper-parameters on the same training
# set but with different random seed leads to different results after one
# epoch because of the random shuffling of the dataset.
est = Estimator(random_state=global_random_seed + 1, max_iter=1)
with pytest.warns(ConvergenceWarning):
coef_other_seed = est.fit(X, y).coef_
assert est.n_iter_ == 1
assert np.abs(coef_same_seed_a - coef_other_seed).max() > 1.0
def test_validation_mask_correctly_subsets(monkeypatch):
"""Test that data passed to validation callback correctly subsets.
Non-regression test for #23255.
"""
X, Y = iris.data, iris.target
n_samples = X.shape[0]
validation_fraction = 0.2
clf = linear_model.SGDClassifier(
early_stopping=True,
tol=1e-3,
max_iter=1000,
validation_fraction=validation_fraction,
)
mock = Mock(side_effect=_stochastic_gradient._ValidationScoreCallback)
monkeypatch.setattr(_stochastic_gradient, "_ValidationScoreCallback", mock)
clf.fit(X, Y)
X_val, y_val = mock.call_args[0][1:3]
assert X_val.shape[0] == int(n_samples * validation_fraction)
assert y_val.shape[0] == int(n_samples * validation_fraction)
def test_sgd_error_on_zero_validation_weight():
# Test that SGDClassifier raises error when all the validation samples
# have zero sample_weight. Non-regression test for #17229.
X, Y = iris.data, iris.target
sample_weight = np.zeros_like(Y)
validation_fraction = 0.4
clf = linear_model.SGDClassifier(
early_stopping=True, validation_fraction=validation_fraction, random_state=0
)
error_message = (
"The sample weights for validation set are all zero, consider using a"
" different random state."
)
with pytest.raises(ValueError, match=error_message):
clf.fit(X, Y, sample_weight=sample_weight)
@pytest.mark.parametrize("Estimator", [SGDClassifier, SGDRegressor])
def test_sgd_verbose(Estimator):
"""non-regression test for gh #25249"""
Estimator(verbose=1).fit(X, Y)
@pytest.mark.parametrize(
"SGDEstimator",
[
SGDClassifier,
SparseSGDClassifier,
SGDRegressor,
SparseSGDRegressor,
SGDOneClassSVM,
SparseSGDOneClassSVM,
],
)
@pytest.mark.parametrize("data_type", (np.float32, np.float64))
def test_sgd_dtype_match(SGDEstimator, data_type):
_X = X.astype(data_type)
_Y = np.array(Y, dtype=data_type)
sgd_model = SGDEstimator()
sgd_model.fit(_X, _Y)
assert sgd_model.coef_.dtype == data_type
@pytest.mark.parametrize(
"SGDEstimator",
[
SGDClassifier,
SparseSGDClassifier,
SGDRegressor,
SparseSGDRegressor,
SGDOneClassSVM,
SparseSGDOneClassSVM,
],
)
def test_sgd_numerical_consistency(SGDEstimator):
X_64 = X.astype(dtype=np.float64)
Y_64 = np.array(Y, dtype=np.float64)
X_32 = X.astype(dtype=np.float32)
Y_32 = np.array(Y, dtype=np.float32)
sgd_64 = SGDEstimator(max_iter=20)
sgd_64.fit(X_64, Y_64)
sgd_32 = SGDEstimator(max_iter=20)
sgd_32.fit(X_32, Y_32)
assert_allclose(sgd_64.coef_, sgd_32.coef_)
def test_sgd_one_class_svm_estimator_type():
"""Check that SGDOneClassSVM has the correct estimator type.
Non-regression test for if the mixin was not on the left.
"""
sgd_ocsvm = SGDOneClassSVM()
assert get_tags(sgd_ocsvm).estimator_type == "outlier_detector"