You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
292 lines
9.3 KiB
292 lines
9.3 KiB
# SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
import inspect
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from sklearn.base import clone, is_classifier
|
|
from sklearn.datasets import make_classification, make_low_rank_matrix, make_regression
|
|
from sklearn.linear_model import (
|
|
ARDRegression,
|
|
BayesianRidge,
|
|
ElasticNet,
|
|
ElasticNetCV,
|
|
GammaRegressor,
|
|
HuberRegressor,
|
|
Lars,
|
|
LarsCV,
|
|
Lasso,
|
|
LassoCV,
|
|
LassoLars,
|
|
LassoLarsCV,
|
|
LassoLarsIC,
|
|
LinearRegression,
|
|
LogisticRegression,
|
|
LogisticRegressionCV,
|
|
MultiTaskElasticNet,
|
|
MultiTaskElasticNetCV,
|
|
MultiTaskLasso,
|
|
MultiTaskLassoCV,
|
|
OrthogonalMatchingPursuit,
|
|
OrthogonalMatchingPursuitCV,
|
|
PassiveAggressiveClassifier,
|
|
PassiveAggressiveRegressor,
|
|
Perceptron,
|
|
PoissonRegressor,
|
|
Ridge,
|
|
RidgeClassifier,
|
|
RidgeClassifierCV,
|
|
RidgeCV,
|
|
SGDClassifier,
|
|
SGDRegressor,
|
|
TheilSenRegressor,
|
|
TweedieRegressor,
|
|
)
|
|
from sklearn.pipeline import make_pipeline
|
|
from sklearn.preprocessing import MinMaxScaler, StandardScaler
|
|
from sklearn.svm import LinearSVC, LinearSVR
|
|
from sklearn.utils._testing import assert_allclose, set_random_state
|
|
from sklearn.utils.fixes import CSR_CONTAINERS
|
|
|
|
|
|
# Note: GammaRegressor() and TweedieRegressor(power != 1) have a non-canonical link.
|
|
@pytest.mark.parametrize(
    "model",
    [
        ARDRegression(),
        BayesianRidge(),
        ElasticNet(),
        ElasticNetCV(),
        Lars(),
        LarsCV(),
        Lasso(),
        LassoCV(),
        LassoLarsCV(),
        LassoLarsIC(),
        LinearRegression(),
        # TODO: Fix SAGA, which fails badly with sample_weight.
        # This is a known limitation, see:
        # https://github.com/scikit-learn/scikit-learn/issues/21305
        pytest.param(
            LogisticRegression(l1_ratio=0.5, solver="saga", tol=1e-15),
            marks=pytest.mark.xfail(reason="Missing importance sampling scheme"),
        ),
        LogisticRegressionCV(tol=1e-6, use_legacy_attributes=False, l1_ratios=(0,)),
        MultiTaskElasticNet(),
        MultiTaskElasticNetCV(),
        MultiTaskLasso(),
        MultiTaskLassoCV(),
        OrthogonalMatchingPursuit(),
        OrthogonalMatchingPursuitCV(),
        PoissonRegressor(),
        Ridge(),
        RidgeCV(),
        pytest.param(
            SGDRegressor(tol=1e-15),
            marks=pytest.mark.xfail(reason="Insufficient precision."),
        ),
        SGDRegressor(penalty="elasticnet", max_iter=10_000),
        TweedieRegressor(power=0),  # same as Ridge
    ],
    ids=lambda x: x.__class__.__name__,
)
@pytest.mark.parametrize("with_sample_weight", [False, True])
def test_balance_property(model, with_sample_weight, global_random_seed):
    """Check that sum(y_predicted) == sum(y_observed) on the training set.

    This must hold for all linear models that use the deviance of an
    exponential dispersion family as loss together with the corresponding
    canonical link, provided ``fit_intercept=True``. Examples:

    - squared error and identity link (most linear models)
    - Poisson deviance with log link
    - log loss with logit link

    This is known as the balance property or unconditional
    calibration/unbiasedness. For reference, see Corollary 3.18, 3.20 and
    Chapter 5.1.5 of M.V. Wuthrich and M. Merz, "Statistical Foundations of
    Actuarial Learning and its Applications" (June 3, 2022).
    http://doi.org/10.2139/ssrn.3822407

    The averages are weighted by ``sample_weight`` when
    ``with_sample_weight`` is true; the test is skipped for estimators whose
    ``fit`` does not accept that argument.
    """
    # Work on a copy to avoid side effects from shared parametrized instances.
    estimator = clone(model)

    fit_signature = inspect.signature(estimator.fit)
    if with_sample_weight and "sample_weight" not in fit_signature.parameters:
        pytest.skip("Estimator does not support sample_weight.")

    # Relative test precision, loosened for the stochastic solvers.
    if isinstance(estimator, SGDRegressor):
        rel = 1e-1
    elif getattr(estimator, "solver", None) == "saga":
        rel = 1e-2
    else:
        rel = 2e-4

    rng = np.random.RandomState(global_random_seed)
    n_train, n_features = 100, 10
    is_multi_task = isinstance(
        estimator,
        (MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLasso, MultiTaskLassoCV),
    )

    X = make_low_rank_matrix(n_samples=n_train, n_features=n_features, random_state=rng)
    column_max = np.max(X, axis=0)
    if is_multi_task:
        # Multi-task estimators get a 2d target with three columns.
        raw_coef = rng.uniform(low=-2, high=2, size=(n_features, 3))
        coef = raw_coef / column_max[:, None]
    else:
        coef = rng.uniform(low=-2, high=2, size=n_features) / column_max

    expectation = np.exp(X @ coef + 0.5)
    y = rng.poisson(lam=expectation) + 1  # strictly positive, i.e. y > 0
    classification = is_classifier(estimator)
    if classification:
        # Binarize the target for classifiers.
        y = (y > expectation + 1).astype(np.float64)

    if with_sample_weight:
        sw = rng.uniform(low=1, high=10, size=y.shape[0])
        fit_kwargs = {"sample_weight": sw}
    else:
        sw = None
        fit_kwargs = {}

    estimator.set_params(fit_intercept=True)  # to be sure
    estimator.fit(X, y, **fit_kwargs)

    # Assert the balance property.
    if classification:
        mean_predicted = np.average(estimator.predict_proba(X)[:, 1], weights=sw)
        mean_observed = np.average(y, weights=sw)
    else:
        mean_predicted = np.average(estimator.predict(X), weights=sw, axis=0)
        mean_observed = np.average(y, weights=sw, axis=0)
    assert mean_predicted == pytest.approx(mean_observed, rel=rel)
|
|
|
|
|
|
@pytest.mark.filterwarnings("ignore:The default of 'normalize'")
@pytest.mark.filterwarnings("ignore:lbfgs failed to converge")
@pytest.mark.filterwarnings("ignore:A column-vector y was passed when a 1d array.*")
@pytest.mark.parametrize(
    "Regressor",
    [
        ARDRegression,
        BayesianRidge,
        ElasticNet,
        ElasticNetCV,
        GammaRegressor,
        HuberRegressor,
        Lars,
        LarsCV,
        Lasso,
        LassoCV,
        LassoLars,
        LassoLarsCV,
        LassoLarsIC,
        LinearSVR,
        LinearRegression,
        OrthogonalMatchingPursuit,
        OrthogonalMatchingPursuitCV,
        PassiveAggressiveRegressor,
        PoissonRegressor,
        Ridge,
        RidgeCV,
        SGDRegressor,
        TheilSenRegressor,
        TweedieRegressor,
    ],
)
@pytest.mark.parametrize("ndim", [1, 2])
def test_linear_model_regressor_coef_shape(Regressor, ndim):
    """Check the consistency of linear models `coef` shape.

    A default-constructed regressor is fitted on a single target, passed
    either as a 1d array (``ndim == 1``) or as a column vector
    (``ndim == 2``); in both cases ``coef_`` is expected to have shape
    ``(n_features,)``.
    """
    if Regressor is LinearRegression:
        pytest.xfail("LinearRegression does not follow `coef_` shape contract!")

    n_features = 20
    X, raw_target = make_regression(
        random_state=0, n_samples=200, n_features=n_features
    )

    # Min-max scale the target and shift it by one so that it is strictly
    # positive -- presumably required by the GLM regressors in the list.
    scaled_target = MinMaxScaler().fit_transform(raw_target.reshape(-1, 1))
    y = scaled_target[:, 0] + 1
    if ndim == 2:
        y = y[:, np.newaxis]

    regressor = Regressor()
    set_random_state(regressor)
    regressor.fit(X, y)

    assert regressor.coef_.shape == (n_features,)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ["Classifier", "params"],
    [
        (LinearSVC, {}),
        (LogisticRegression, {}),
        (
            LogisticRegressionCV,
            {
                "solver": "newton-cholesky",
                "use_legacy_attributes": False,
                "l1_ratios": (0,),
            },
        ),
        (PassiveAggressiveClassifier, {}),
        (Perceptron, {}),
        (RidgeClassifier, {}),
        (RidgeClassifierCV, {}),
        (SGDClassifier, {}),
    ],
)
@pytest.mark.parametrize("n_classes", [2, 3])
def test_linear_model_classifier_coef_shape(Classifier, params, n_classes):
    """Check the consistency of linear classifiers' `coef_` shape.

    After fitting, ``coef_`` is expected to have shape ``(1, n_features)``
    for a binary problem and ``(n_classes, n_features)`` otherwise.
    """
    if Classifier is RidgeClassifier or Classifier is RidgeClassifierCV:
        pytest.xfail(f"{Classifier} does not follow `coef_` shape contract!")

    X, y = make_classification(n_informative=10, n_classes=n_classes, random_state=0)

    classifier = Classifier(**params)
    set_random_state(classifier)
    classifier.fit(X, y)

    # Binary problems are expected to use a single row of coefficients.
    n_coef_rows = 1 if n_classes == 2 else n_classes
    assert classifier.coef_.shape == (n_coef_rows, X.shape[1])
|
|
|
|
|
|
@pytest.mark.parametrize(
    "LinearModel, params",
    [
        (Lasso, {"tol": 1e-15, "alpha": 0.01}),
        (LassoCV, {"tol": 1e-15}),
        (ElasticNetCV, {"tol": 1e-15}),
        (RidgeClassifier, {"solver": "sparse_cg", "alpha": 0.1}),
        (ElasticNet, {"tol": 1e-15, "l1_ratio": 1, "alpha": 0.01}),
        (ElasticNet, {"tol": 1e-15, "l1_ratio": 1e-5, "alpha": 0.01}),
        (Ridge, {"solver": "sparse_cg", "tol": 1e-12, "alpha": 0.1}),
        (LinearRegression, {}),
        (RidgeCV, {}),
        (RidgeClassifierCV, {}),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_model_pipeline_same_dense_and_sparse(LinearModel, params, csr_container):
    """Test that sparse and dense linear models give same results.

    Two identically configured pipelines (``StandardScaler(with_mean=False)``
    followed by the linear model) are fitted, one on a dense array and one on
    its ``csr_container`` counterpart. Their coefficients, predictions and
    intercepts are then compared.
    """

    def build_pipeline():
        # Each call returns a fresh, unfitted pipeline.
        return make_pipeline(StandardScaler(with_mean=False), LinearModel(**params))

    model_dense = build_pipeline()
    model_sparse = build_pipeline()

    # Prepare the data: Gaussian features with every entry below 0.1 zeroed
    # out, so that the sparse representation has implicit zeros.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 2
    gaussian = rng.randn(n_samples, n_features)
    X = np.where(gaussian < 0.1, 0.0, gaussian)
    X_sparse = csr_container(X)

    y = rng.rand(n_samples)
    if is_classifier(model_dense):
        # NOTE(review): rng.rand draws from [0, 1), so np.sign(y) cannot
        # produce a negative label here -- confirm that such a degenerate
        # classification target is intended.
        y = np.sign(y)

    model_dense.fit(X, y)
    model_sparse.fit(X_sparse, y)

    dense_estimator = model_dense[1]
    sparse_estimator = model_sparse[1]

    assert_allclose(sparse_estimator.coef_, dense_estimator.coef_, atol=1e-15)

    dense_predictions = model_dense.predict(X)
    sparse_predictions = model_sparse.predict(X_sparse)
    assert_allclose(dense_predictions, sparse_predictions)

    assert_allclose(dense_estimator.intercept_, sparse_estimator.intercept_)
|