# ruff: noqa
"""
=======================================
Release Highlights for scikit-learn 1.5
=======================================

.. currentmodule:: sklearn

We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
and improvements were added, as well as some key new features. Below we
detail the highlights of this release. **For an exhaustive list of
all the changes**, please refer to the :ref:`release notes <release_notes_1_5>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install -c conda-forge scikit-learn

"""

# %%
# FixedThresholdClassifier: Setting the decision threshold of a binary classifier
# --------------------------------------------------------------------------------
# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5 to
# convert probability estimates (i.e. the output of `predict_proba`) into class
# predictions. However, 0.5 is almost never the desired threshold for a given problem.
# :class:`~model_selection.FixedThresholdClassifier` allows wrapping any binary
# classifier and setting a custom decision threshold.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

X, y = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)
classifier = LogisticRegression(random_state=0).fit(X, y)

print("confusion matrix:\n", confusion_matrix(y, classifier.predict(X)))

# %%
# Lowering the threshold, i.e. allowing more samples to be classified as the positive
# class, increases the number of true positives at the cost of more false positives
# (as is well known from the concavity of the ROC curve).
from sklearn.model_selection import FixedThresholdClassifier

wrapped_classifier = FixedThresholdClassifier(classifier, threshold=0.1).fit(X, y)

print("confusion matrix:\n", confusion_matrix(y, wrapped_classifier.predict(X)))

# %%
# TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
# ---------------------------------------------------------------------------------
# The decision threshold of a binary classifier can be tuned to optimize a given
# metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
from sklearn.metrics import balanced_accuracy_score

# Due to the class imbalance, the balanced accuracy is not optimal for the default
# threshold. The classifier tends to over-predict the majority class.
print(f"balanced accuracy: {balanced_accuracy_score(y, classifier.predict(X)):.2f}")

# %%
# Tuning the threshold to optimize the balanced accuracy gives a smaller threshold
# that allows more samples to be classified as the positive class.
from sklearn.model_selection import TunedThresholdClassifierCV

tuned_classifier = TunedThresholdClassifierCV(
    classifier, cv=5, scoring="balanced_accuracy"
).fit(X, y)

print(f"new threshold: {tuned_classifier.best_threshold_:.4f}")
print(
    f"balanced accuracy: {balanced_accuracy_score(y, tuned_classifier.predict(X)):.2f}"
)

# %%
# :class:`~model_selection.TunedThresholdClassifierCV` also benefits from
# metadata routing support (:ref:`Metadata Routing User Guide <metadata_routing>`),
# which allows optimizing complex business metrics, as detailed
# in :ref:`Post-tuning the decision threshold for cost-sensitive learning
# <sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py>`.

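# %%
# As a minimal illustrative sketch (the gain values below are hypothetical and no
# metadata routing is involved here), a custom business metric can be written as a
# plain scoring function, wrapped with :func:`~metrics.make_scorer`, and passed as
# the `scoring` parameter of :class:`~model_selection.TunedThresholdClassifierCV`.
from sklearn.metrics import make_scorer


def business_gain(y_true, y_pred):
    # Hypothetical payoff: each true positive is worth 5 units, each false
    # positive costs 1 unit; the other outcomes are neutral.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return 5 * tp - fp


gain_classifier = TunedThresholdClassifierCV(
    classifier, cv=5, scoring=make_scorer(business_gain)
).fit(X, y)

print(f"best threshold for the business gain: {gain_classifier.best_threshold_:.3f}")
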
# %%
# Performance improvements in PCA
# -------------------------------
# :class:`~decomposition.PCA` has a new solver, "covariance_eigh", which is faster
# and more memory efficient than the other solvers for datasets with a large number
# of samples and a small number of features.
from sklearn.datasets import make_low_rank_matrix
from sklearn.decomposition import PCA

X = make_low_rank_matrix(
    n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
)

pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)

print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")

# %%
# The "full" solver has also been improved to use less memory and to transform
# faster. The "auto" option for the solver takes advantage of the new solver and
# is now able to select an appropriate solver for sparse datasets.
from scipy.sparse import random

X = random(10000, 100, format="csr", random_state=0)

pca = PCA(n_components=10, svd_solver="auto").fit(X)

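# %%
# As a quick, purely illustrative check, the fitted attributes are available as
# with dense input, e.g. the fraction of variance explained by the kept components.
print(f"explained variance: {pca.explained_variance_ratio_.sum():.2f}")
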
# %%
# ColumnTransformer is subscriptable
# ----------------------------------
# The transformers of a :class:`~compose.ColumnTransformer` can now be accessed
# directly by indexing with their name.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

X = np.array([[0, 1, 2], [3, 4, 5]])
column_transformer = ColumnTransformer(
    [("std_scaler", StandardScaler(), [0]), ("one_hot", OneHotEncoder(), [1, 2])]
)

column_transformer.fit(X)

print(column_transformer["std_scaler"])
print(column_transformer["one_hot"])

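# %%
# This is handy for inspecting fitted sub-transformers. A small sketch, assuming
# that indexing returns the fitted transformer (as `named_transformers_` does):
print(column_transformer["std_scaler"].mean_)
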
# %%
# Custom imputation strategies for the SimpleImputer
# --------------------------------------------------
# :class:`~impute.SimpleImputer` now supports custom strategies for imputation,
# using a callable that computes a scalar value from the non-missing values of
# a column vector.
from sklearn.impute import SimpleImputer

X = np.array(
    [
        [-1.1, 1.1, 1.1],
        [3.9, -1.2, np.nan],
        [np.nan, 1.3, np.nan],
        [-0.1, -1.4, -1.4],
        [-4.9, 1.5, -1.5],
        [np.nan, 1.6, 1.6],
    ]
)


def smallest_abs(arr):
    """Return the smallest absolute value of a 1D array."""
    return np.min(np.abs(arr))


imputer = SimpleImputer(strategy=smallest_abs)

imputer.fit_transform(X)

# %%
# Pairwise distances with non-numeric arrays
# ------------------------------------------
# :func:`~metrics.pairwise_distances` can now compute distances between
# non-numeric arrays using a callable metric.
from sklearn.metrics import pairwise_distances

X = ["cat", "dog"]
Y = ["cat", "fox"]


def levenshtein_distance(x, y):
    """Return the Levenshtein distance between two strings."""
    if x == "" or y == "":
        return max(len(x), len(y))
    if x[0] == y[0]:
        return levenshtein_distance(x[1:], y[1:])
    return 1 + min(
        levenshtein_distance(x[1:], y),
        levenshtein_distance(x, y[1:]),
        levenshtein_distance(x[1:], y[1:]),
    )


pairwise_distances(X, Y, metric=levenshtein_distance)
