# ruff: noqa
"""
=======================================
Release Highlights for scikit-learn 1.5
=======================================

.. currentmodule:: sklearn

We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
and improvements were added, as well as some key new features. Below we
detail the highlights of this release. **For an exhaustive list of
all the changes**, please refer to the :ref:`release notes <release_notes_1_5>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install -c conda-forge scikit-learn

"""

# %%
# FixedThresholdClassifier: Setting the decision threshold of a binary classifier
# -------------------------------------------------------------------------------
# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
# convert probability estimates (i.e. output of `predict_proba`) into class
# predictions. However, 0.5 is almost never the desired threshold for a given problem.
# :class:`~model_selection.FixedThresholdClassifier` allows wrapping any binary
# classifier and setting a custom decision threshold.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
classifier = LogisticRegression(random_state=0).fit(X, y)

print("confusion matrix:\n", confusion_matrix(y, classifier.predict(X)))

# %%
# Lowering the threshold, i.e. allowing more samples to be classified as the positive
# class, increases the number of true positives at the cost of more false positives
# (as is well known from the concavity of the ROC curve).
from sklearn.model_selection import FixedThresholdClassifier

wrapped_classifier = FixedThresholdClassifier(classifier, threshold=0.1).fit(X, y)

print("confusion matrix:\n", confusion_matrix(y, wrapped_classifier.predict(X)))

# %%
# TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
# --------------------------------------------------------------------------------
# The decision threshold of a binary classifier can be tuned to optimize a given
# metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
from sklearn.metrics import balanced_accuracy_score

# Due to the class imbalance, the balanced accuracy is not optimal for the default
# threshold. The classifier tends to over-predict the majority class.
print(f"balanced accuracy: {balanced_accuracy_score(y, classifier.predict(X)):.2f}")

# %%
# Tuning the threshold to optimize the balanced accuracy gives a smaller threshold
# that allows more samples to be classified as the positive class.
from sklearn.model_selection import TunedThresholdClassifierCV

tuned_classifier = TunedThresholdClassifierCV(
    classifier, cv=5, scoring="balanced_accuracy"
).fit(X, y)

print(f"new threshold: {tuned_classifier.best_threshold_:.4f}")
print(
    f"balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}"
)

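# %%
# As a brief illustrative check (not part of the original highlights), the tuned
# threshold indeed classifies more samples as the positive class:
print("confusion matrix:\n", confusion_matrix(y, tuned_classifier.predict(X)))
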
# %%
# :class:`~model_selection.TunedThresholdClassifierCV` also benefits from the
# metadata routing support (:ref:`Metadata Routing User Guide<metadata_routing>`),
# which makes it possible to optimize complex business metrics, as detailed
# in :ref:`Post-tuning the decision threshold for cost-sensitive learning
# <sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py>`.

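# %%
# As an illustrative sketch (not part of the original highlights, and using a
# plain custom scorer rather than metadata routing), the threshold can be tuned
# for a hypothetical business metric where a false negative is assumed to cost
# five times as much as a false positive:
from sklearn.metrics import make_scorer


def business_gain(y_true, y_pred):
    # Hypothetical costs: each false positive costs 1, each false negative costs 5.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return -(1 * fp + 5 * fn)


cost_tuned_classifier = TunedThresholdClassifierCV(
    classifier, cv=5, scoring=make_scorer(business_gain)
).fit(X, y)

print(
    f"threshold maximizing the business gain: "
    f"{cost_tuned_classifier.best_threshold_:.4f}"
)
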
# %%
# Performance improvements in PCA
# -------------------------------
# :class:`~decomposition.PCA` has a new solver, "covariance_eigh", which is faster
# and more memory efficient than the other solvers for datasets with a large number
# of samples and a small number of features.
from sklearn.datasets import make_low_rank_matrix
from sklearn.decomposition import PCA

X = make_low_rank_matrix(
    n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
)

pca = PCA(n_components=10).fit(X)

print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")

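# %%
# As an illustrative aside (not part of the original highlights), the new solver
# can also be requested explicitly; on this tall-and-narrow dataset it is
# expected to give the same result as the default solver:
pca_cov = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
print(f"explained variance: {pca_cov.explained_variance_ratio_.sum():.2f}")
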
# %%
# The "full" solver has also been improved to use less memory and allows for
# faster transformation. The "auto" option for the solver takes advantage of the
# new solver and is now able to select an appropriate solver for sparse
# datasets.
from scipy.sparse import random

X = random(10000, 100, format="csr", random_state=0)

pca = PCA(n_components=10, svd_solver="auto").fit(X)

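# %%
# As a small illustrative addition (not in the original highlights), the model
# fitted on the sparse input exposes the usual attributes:
print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")
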
# %%
# ColumnTransformer is subscriptable
# ----------------------------------
# The transformers of a :class:`~compose.ColumnTransformer` can now be directly
# accessed using indexing by name.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

X = np.array([[0, 1, 2], [3, 4, 5]])
column_transformer = ColumnTransformer(
    [("std_scaler", StandardScaler(), [0]), ("one_hot", OneHotEncoder(), [1, 2])]
)

column_transformer.fit(X)

print(column_transformer["std_scaler"])
print(column_transformer["one_hot"])

# %%
# Custom imputation strategies for the SimpleImputer
# --------------------------------------------------
# :class:`~impute.SimpleImputer` now supports custom strategies for imputation,
# using a callable that computes a scalar value from the non-missing values of
# a column vector.
from sklearn.impute import SimpleImputer

X = np.array(
    [
        [-1.1, 1.1, 1.1],
        [3.9, -1.2, np.nan],
        [np.nan, 1.3, np.nan],
        [-0.1, -1.4, -1.4],
        [-4.9, 1.5, -1.5],
        [np.nan, 1.6, 1.6],
    ]
)


def smallest_abs(arr):
    """Return the smallest absolute value of a 1D array."""
    return np.min(np.abs(arr))


imputer = SimpleImputer(strategy=smallest_abs)

imputer.fit_transform(X)

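# %%
# As an illustrative follow-up (not part of the original highlights), the fill
# value computed by the callable for each column is stored in the fitted
# `statistics_` attribute:
print(imputer.statistics_)
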
# %%
# Pairwise distances with non-numeric arrays
# ------------------------------------------
# :func:`~metrics.pairwise_distances` can now compute distances between
# non-numeric arrays using a callable metric.
from sklearn.metrics import pairwise_distances

X = ["cat", "dog"]
Y = ["cat", "fox"]


def levenshtein_distance(x, y):
    """Return the Levenshtein distance between two strings."""
    if x == "" or y == "":
        return max(len(x), len(y))
    if x[0] == y[0]:
        return levenshtein_distance(x[1:], y[1:])
    return 1 + min(
        levenshtein_distance(x[1:], y),
        levenshtein_distance(x, y[1:]),
        levenshtein_distance(x[1:], y[1:]),
    )


pairwise_distances(X, Y, metric=levenshtein_distance)