You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
205 lines
7.3 KiB
205 lines
7.3 KiB
# Authors: The scikit-learn developers
|
|
# SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
import itertools
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from sklearn import datasets
|
|
from sklearn.covariance import MinCovDet, empirical_covariance, fast_mcd
|
|
from sklearn.utils._testing import assert_array_almost_equal
|
|
|
|
# Iris feature matrix, loaded once at import time and shared module-wide.
X = datasets.load_iris().data
# First column of the feature matrix.
X_1d = X[:, 0]
# Row and column counts of the feature matrix.
n_samples = X.shape[0]
n_features = X.shape[1]
|
|
|
|
|
|
def test_mcd(global_random_seed):
    """Exercise the FastMCD implementation on several synthetic data sets."""
    # Each row is forwarded positionally to ``launch_mcd_on_dataset`` as
    # (n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support).
    scenarios = [
        # Small data set without outliers (random independent normal data)
        (100, 5, 0, 0.02, 0.1, 74),
        # Small data set, medium contamination
        (100, 5, 20, 0.3, 0.3, 65),
        # Small data set, strong contamination
        (100, 5, 40, 0.1, 0.1, 50),
        # Medium data set
        (1000, 5, 450, 0.1, 0.1, 540),
        # Large data set
        (1700, 5, 800, 0.1, 0.1, 870),
        # 1D data set
        (500, 1, 100, 0.10, 0.10, 350),
        # n_samples == n_features
        (20, 20, 0, 0.1, 0.1, 15),
    ]
    for scenario in scenarios:
        launch_mcd_on_dataset(*scenario, global_random_seed)
|
|
|
|
|
|
def test_fast_mcd_on_invalid_input():
    """``fast_mcd`` must raise a ValueError when handed a 1D array."""
    flat_input = np.arange(100)
    with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
        fast_mcd(flat_input)
|
|
|
|
|
|
def test_mcd_class_on_invalid_input():
    """``MinCovDet.fit`` must raise a ValueError when handed a 1D array."""
    expected_error = "Expected 2D array, got 1D array instead"
    flat_input = np.arange(100)
    # Build the estimator outside the ``raises`` context so that only the
    # ``fit`` call is expected to produce the error.
    estimator = MinCovDet()
    with pytest.raises(ValueError, match=expected_error):
        estimator.fit(flat_input)
|
|
|
|
|
|
def launch_mcd_on_dataset(
    n_samples, n_features, n_outliers, tol_loc, tol_cov, tol_support, seed
):
    """Fit MinCovDet on contaminated Gaussian data and check its estimates.

    Standard-normal samples are drawn, ``n_outliers`` randomly chosen rows
    are shifted by +/-5 along every feature, and the fitted estimator is
    compared against statistics computed on the untouched rows only.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int
        Number of columns to generate.
    n_outliers : int
        Number of rows to contaminate.
    tol_loc : float
        Strict upper bound on the mean squared difference between the fitted
        location and the mean of the uncontaminated rows.
    tol_cov : float
        Strict upper bound on the mean squared difference between the fitted
        covariance and the empirical covariance of the uncontaminated rows.
    tol_support : int
        Minimum number of samples that must be flagged in ``support_``.
    seed : int
        Seeds both the data generator and the estimator.
    """
    rng = np.random.RandomState(seed)
    # NOTE: the order of the draws below (randn, permutation, randint)
    # determines the generated data; keep it unchanged.
    samples = rng.randn(n_samples, n_features)

    # Contaminate a random subset of rows: each entry moves by -5 or +5.
    contaminated_rows = rng.permutation(n_samples)[:n_outliers]
    shifts = 10.0 * (rng.randint(2, size=(n_outliers, n_features)) - 0.5)
    samples[contaminated_rows] += shifts

    # Boolean mask selecting the rows that were left untouched.
    is_inlier = np.ones(n_samples, dtype=bool)
    is_inlier[contaminated_rows] = False
    clean_samples = samples[is_inlier]

    # Compute the MCD estimates by fitting an estimator object.
    estimator = MinCovDet(random_state=seed).fit(samples)
    fitted_location = estimator.location_
    fitted_covariance = estimator.covariance_
    fitted_support = estimator.support_

    # Compare with the estimates learnt from the inliers alone.
    location_mse = np.mean((clean_samples.mean(0) - fitted_location) ** 2)
    assert location_mse < tol_loc

    covariance_gap = empirical_covariance(clean_samples) - fitted_covariance
    covariance_mse = np.mean(covariance_gap**2)
    assert covariance_mse < tol_cov

    assert np.sum(fitted_support) >= tol_support

    # Distances recomputed on the training data must match the stored ones.
    assert_array_almost_equal(estimator.mahalanobis(samples), estimator.dist_)
|
|
|
|
|
|
def test_mcd_issue1127():
    """Fitting must not break on data of shape (3, 1)."""
    # With this shape, n_support equals n_samples.
    tiny_sample = np.random.RandomState(0).normal(size=(3, 1))
    MinCovDet().fit(tiny_sample)
|
|
|
|
|
|
def test_mcd_issue3367(global_random_seed):
    """Fitting must complete when the covariance matrix is singular.

    The data lie in a plane, so one row and one column of the covariance
    matrix are all zeros.
    """
    rng = np.random.RandomState(global_random_seed)

    # Ten evenly spaced coordinates between -5 and 5, used for both X and Y,
    # combined into every possible (x, y) pair.
    axis_values = np.linspace(-5, 5, 10).tolist()
    grid_xy = np.array(list(itertools.product(axis_values, repeat=2)))

    # Append an all-zero third column: the points then sit in the XY plane,
    # which makes the covariance matrix singular.
    zero_column = np.zeros((grid_xy.shape[0], 1))
    planar_points = np.hstack((grid_xy, zero_column))

    # The fit below is the actual check: it must run to completion without
    # raising, despite the singular covariance.
    #
    # Possible further check (not implemented): the principal components
    # (eigenvectors) of the fitted covariance relate directly to the geometry
    # of the points. Since they form a plane, the eigenvector associated with
    # the smallest eigenvalue should be the plane normal, [0, 0, 1]:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # `normal` would then be compared with [0, 0, 1] up to floating point
    # error, e.g. by checking the difference against a small tolerance such
    # as 1e-12.
    MinCovDet(random_state=rng).fit(planar_points)
|
|
|
|
|
|
def test_mcd_support_covariance_is_zero():
    """A zero covariance on the support data must raise an informative error."""
    expected_error = (
        "The covariance matrix of the support data is equal to 0, try to "
        "increase support_fraction"
    )
    # Two single-feature data sets in which seven of the ten values are
    # identical (0.1 in the first, 0.3 in the second).
    mostly_01 = np.array(
        [0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1]
    ).reshape(-1, 1)
    mostly_03 = np.array(
        [0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3]
    ).reshape(-1, 1)

    for degenerate_column in (mostly_01, mostly_03):
        with pytest.raises(ValueError, match=expected_error):
            MinCovDet().fit(degenerate_column)
|
|
|
|
|
|
def test_mcd_increasing_det_warning(global_random_seed):
    """A RuntimeWarning must be raised when determinants increase.

    In theory the sequence of determinants observed during the c_step is
    decreasing. An increase is likely caused by ill-conditioned covariance
    matrices that result in poor precision matrices.
    """
    samples = [
        [5.1, 3.5, 1.4, 0.2],
        [4.9, 3.0, 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5.0, 3.6, 1.4, 0.2],
        [4.6, 3.4, 1.4, 0.3],
        [5.0, 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3.0, 1.4, 0.1],
        [4.3, 3.0, 1.1, 0.1],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [4.6, 3.6, 1.0, 0.2],
        [5.0, 3.0, 1.6, 0.2],
        [5.2, 3.5, 1.5, 0.2],
    ]

    estimator = MinCovDet(support_fraction=0.5, random_state=global_random_seed)
    with pytest.warns(RuntimeWarning, match="Determinant has increased"):
        estimator.fit(samples)
|
|
|
|
|
|
@pytest.mark.parametrize("n_samples,n_features", [(2000, 10)])
def test_mincovdet_bias_on_normal(n_samples, n_features, global_random_seed):
    """Check that MinCovDet does not underestimate the empirical
    variance on Gaussian data.

    A large sample size and n_features makes the test robust.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/23162
    """
    threshold = 0.985  # threshold for variance underestimation
    rng = np.random.default_rng(global_random_seed)
    # Drawn as (n_features, n_samples); the estimators receive the transpose.
    x = rng.normal(size=(n_features, n_samples))

    # Assume centered data, to reduce test complexity
    emp_cov = empirical_covariance(x.T, assume_centered=True)
    var_emp = emp_cov.diagonal()

    estimator = MinCovDet(
        support_fraction=1.0,
        store_precision=False,
        assume_centered=True,
        random_state=global_random_seed,
    )
    estimator.fit(x.T)
    var_mcd = np.diag(estimator.covariance_)

    # Ratio of the summed MCD variances to the summed empirical variances.
    mean_var_ratio = np.sum(var_mcd) / np.sum(var_emp)

    assert mean_var_ratio > threshold, "MinCovDet underestimates the Gaussian variance"
|