Skip to content

Commit f787529

Browse files
ENH: Add minmax_scale and standardize functions
1 parent 0477f80 commit f787529

File tree

2 files changed

+153
-1
lines changed

2 files changed

+153
-1
lines changed
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
"""Preprocessing module."""
22

3-
__all__ = []
3+
from .scalers import minmax_scale, standardize
4+
5+
# Public API of the preprocessing package, re-exported from the scalers module.
__all__ = ["minmax_scale", "standardize"]
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
"""Data scaling functions."""
2+
3+
from scipy import sparse
4+
from sklearn.preprocessing import MinMaxScaler, StandardScaler
5+
from sklearn.utils.validation import check_array
6+
7+
8+
def _validate_and_align(X_train, X_test):
    """Run both matrices through ``check_array`` and compare their widths.

    Parameters
    ----------
    X_train : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training feature matrix; its column count is the reference.

    X_test : {array-like, sparse matrix} of shape (m_samples, n_features) or None
        Test feature matrix compared against ``X_train``. Passing ``None``
        skips every test-side check.

    Returns
    -------
    (X_train_valid, X_test_valid) : tuple
        The validated inputs. X_test_valid is None if X_test is None.

    Raises
    ------
    ValueError
        If X_test has different number of features than X_train.

    """
    X_train = check_array(X_train, dtype="numeric", accept_sparse=True)

    # Nothing to align against when no test matrix was supplied.
    if X_test is None:
        return X_train, None

    X_test = check_array(X_test, dtype="numeric", accept_sparse=True)
    widths_match = X_test.shape[1] == X_train.shape[1]
    if not widths_match:
        raise ValueError(
            f"X_test has {X_test.shape[1]} features but X_train has {X_train.shape[1]}."
        )
    return X_train, X_test
38+
39+
40+
def minmax_scale(X_train, X_test=None, return_transformer=False):
    """Scale features to a fixed range between 0 and 1.

    Fits scaling parameters on training data and applies the same transformation
    to both training and test sets. Only dense input is supported.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix used to fit scaling parameters.

    X_test : array-like of shape (m_samples, n_features), optional
        Test feature matrix to transform using fitted parameters.

    return_transformer : bool, default=False
        If True, also return the fitted scaling object.

    Returns
    -------
    (X_train_scaled, X_test_scaled) or (X_train_scaled, X_test_scaled, scaler)
        Scaled arrays; X_test_scaled is None if X_test is None.

    Raises
    ------
    ValueError
        If X_test has a different number of features than X_train.
    TypeError
        If X_train or X_test is a sparse matrix.

    Examples
    --------
    >>> import numpy as np
    >>> from preprocessing.scalers import minmax_scale
    >>> X_train = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
    >>> X_test = np.array([[2.5, 5.0]])
    >>> X_train_scaled, X_test_scaled = minmax_scale(X_train, X_test)
    >>> X_train_scaled
    array([[0. , 0. ],
           [0.5, 0.5],
           [1. , 1. ]])
    >>> X_test_scaled
    array([[0.75, 0.75]])

    """
    X_train, X_test = _validate_and_align(X_train, X_test)

    # The shared validator lets sparse matrices through, but MinMaxScaler
    # rejects them. Fail here, before any fitting happens, so a sparse X_test
    # does not surface as an opaque error after the scaler was already fit.
    # ``issparse(None)`` is False, so a missing X_test is fine.
    if sparse.issparse(X_train) or sparse.issparse(X_test):
        raise TypeError(
            "minmax_scale does not support sparse input. "
            "Consider using MaxAbsScaler instead."
        )

    scaler = MinMaxScaler(feature_range=(0.0, 1.0))
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test) if X_test is not None else None

    if return_transformer:
        return X_train_scaled, X_test_scaled, scaler
    return X_train_scaled, X_test_scaled
92+
93+
94+
def standardize(X_train, X_test=None, return_transformer=False):
    """Standardize features to have zero mean and unit variance.

    Fits scaling parameters on training data and applies the same transformation
    to both training and test sets. If either matrix is sparse, centering is
    disabled (features are only scaled to unit variance) to preserve sparsity.

    Parameters
    ----------
    X_train : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training feature matrix used to fit scaling parameters.

    X_test : {array-like, sparse matrix} of shape (m_samples, n_features), optional
        Test feature matrix to transform using fitted parameters.

    return_transformer : bool, default=False
        If True, also return the fitted scaling object.

    Returns
    -------
    (X_train_scaled, X_test_scaled) or (X_train_scaled, X_test_scaled, scaler)
        Scaled arrays; X_test_scaled is None if X_test is None.

    Raises
    ------
    ValueError
        If X_test has a different number of features than X_train.

    Examples
    --------
    >>> import numpy as np
    >>> from preprocessing.scalers import standardize
    >>> X_train = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
    >>> X_test = np.array([[2.5, 5.0]])
    >>> X_train_scaled, X_test_scaled = standardize(X_train, X_test)
    >>> X_train_scaled.round(3)
    array([[-1.225, -1.225],
           [ 0.   ,  0.   ],
           [ 1.225,  1.225]])
    >>> X_test_scaled.round(3)
    array([[0.612, 0.612]])

    """
    X_train, X_test = _validate_and_align(X_train, X_test)

    # StandardScaler cannot center sparse matrices. Looking only at X_train
    # would build a centering scaler for dense-train/sparse-test input and
    # then fail inside ``transform``; so centering is dropped when either
    # matrix is sparse. ``issparse(None)`` is False, so a missing X_test is fine.
    any_sparse = sparse.issparse(X_train) or sparse.issparse(X_test)
    scaler = StandardScaler(with_mean=not any_sparse)

    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test) if X_test is not None else None

    if return_transformer:
        return X_train_scaled, X_test_scaled, scaler
    return X_train_scaled, X_test_scaled

0 commit comments

Comments
 (0)