ENH: Add minmax_scale and standardize functions

angelsevillamol · angelsevillamol · commit f787529402d4 · 2025-09-02T10:39:31.000+02:00
diff --git a/orca_python/preprocessing/__init__.py b/orca_python/preprocessing/__init__.py
@@ -1,3 +1,5 @@
 """Preprocessing module."""
 
-__all__ = []
+from .scalers import minmax_scale, standardize
+
+__all__ = ["minmax_scale", "standardize"]
diff --git a/orca_python/preprocessing/scalers.py b/orca_python/preprocessing/scalers.py
@@ -0,0 +1,150 @@
+"""Data scaling functions."""
+
+from scipy import sparse
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.utils.validation import check_array
+
+
+def _validate_and_align(X_train, X_test):
+    """Validate arrays as numeric 2D matrices and ensure matching feature counts.
+
+    Parameters
+    ----------
+    X_train : array-like of shape (n_samples, n_features)
+        Training feature matrix; dense or sparse input is accepted.
+
+    X_test : array-like of shape (m_samples, n_features) or None
+        Test feature matrix checked against X_train; skipped when None.
+
+    Returns
+    -------
+    (X_train_valid, X_test_valid) : tuple
+        Validated arrays. X_test_valid is None if X_test is None.
+
+    Raises
+    ------
+    ValueError
+        If X_test has a different number of features than X_train.
+
+    """
+    X_train = check_array(X_train, accept_sparse=True, dtype="numeric")
+    if X_test is not None:
+        X_test = check_array(X_test, accept_sparse=True, dtype="numeric")
+        if X_test.shape[1] != X_train.shape[1]:
+            raise ValueError(
+                f"X_test has {X_test.shape[1]} features but X_train has {X_train.shape[1]}."
+            )
+    return X_train, X_test
+
+
+def minmax_scale(X_train, X_test=None, return_transformer=False):
+    """Scale features to a fixed range between 0 and 1.
+
+    Fits scaling parameters on training data and applies the same transformation
+    to both training and test sets.
+
+    Parameters
+    ----------
+    X_train : array-like of shape (n_samples, n_features)
+        Training feature matrix used to fit scaling parameters.
+
+    X_test : array-like of shape (m_samples, n_features), optional
+        Test feature matrix to transform using fitted parameters.
+
+    return_transformer : bool, default=False
+        If True, also return the fitted MinMaxScaler instance.
+
+    Returns
+    -------
+    (X_train_scaled, X_test_scaled) or (X_train_scaled, X_test_scaled, scaler)
+        Scaled arrays; X_test_scaled is None if X_test is None.
+
+    Raises
+    ------
+    ValueError
+        If X_test has a different number of features than X_train.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from orca_python.preprocessing import minmax_scale
+    >>> X_train = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
+    >>> X_test = np.array([[2.5, 5.0]])
+    >>> X_train_scaled, X_test_scaled = minmax_scale(X_train, X_test)
+    >>> X_train_scaled
+    array([[0. , 0. ],
+           [0.5, 0.5],
+           [1. , 1. ]])
+    >>> X_test_scaled
+    array([[0.75, 0.75]])
+
+    """
+    X_train, X_test = _validate_and_align(X_train, X_test)
+
+    scaler = MinMaxScaler(feature_range=(0.0, 1.0))
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test) if X_test is not None else None
+
+    if return_transformer:
+        return X_train_scaled, X_test_scaled, scaler
+    else:
+        return X_train_scaled, X_test_scaled
+
+
+def standardize(X_train, X_test=None, return_transformer=False):
+    """Standardize features to have zero mean and unit variance.
+
+    Fits scaling parameters on training data and applies the same transformation
+    to both training and test sets. When X_train is sparse, centering is
+    disabled (with_mean=False) to preserve sparsity.
+
+    Parameters
+    ----------
+    X_train : array-like of shape (n_samples, n_features)
+        Training feature matrix used to fit scaling parameters.
+
+    X_test : array-like of shape (m_samples, n_features), optional
+        Test feature matrix to transform using fitted parameters.
+
+    return_transformer : bool, default=False
+        If True, also return the fitted StandardScaler instance.
+
+    Returns
+    -------
+    (X_train_scaled, X_test_scaled) or (X_train_scaled, X_test_scaled, scaler)
+        Scaled arrays; X_test_scaled is None if X_test is None.
+
+    Raises
+    ------
+    ValueError
+        If X_test has a different number of features than X_train.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from orca_python.preprocessing import standardize
+    >>> X_train = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
+    >>> X_test = np.array([[2.5, 5.0]])
+    >>> X_train_scaled, X_test_scaled = standardize(X_train, X_test)
+    >>> X_train_scaled.round(3)
+    array([[-1.225, -1.225],
+           [ 0.   ,  0.   ],
+           [ 1.225,  1.225]])
+    >>> X_test_scaled.round(3)
+    array([[0.612, 0.612]])
+
+    """
+    X_train, X_test = _validate_and_align(X_train, X_test)
+
+    scaler = (
+        StandardScaler(with_mean=False)
+        if sparse.issparse(X_train)
+        else StandardScaler()
+    )
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test) if X_test is not None else None
+
+    if return_transformer:
+        return X_train_scaled, X_test_scaled, scaler
+    else:
+        return X_train_scaled, X_test_scaled