perf: try out new cliping numba kernel

ilan-gold · ilan-gold · commit bd5a121cb33a · 2026-01-23T13:48:47.000+01:00
diff --git a/benchmarks/benchmarks/preprocessing_log.py b/benchmarks/benchmarks/preprocessing_log.py
@@ -34,7 +34,7 @@ class PreprocessingSuite:  # noqa: D101
 
     def setup_cache(self) -> None:
         """Without this caching, asv was running several processes which meant the data was repeatedly downloaded."""
-        for dataset, layer in product(*self.params):
+        for dataset, layer in product(*self.params[:2]):
             adata, _ = get_dataset(dataset, layer=layer)
             adata.write_h5ad(f"{dataset}_{layer}.h5ad")
 
@@ -47,17 +47,6 @@ def time_pca(self, *_) -> None:
     def peakmem_pca(self, *_) -> None:
         sc.pp.pca(self.adata, svd_solver="arpack")
 
-    def time_highly_variable_genes(self, *_) -> None:
-        # the default flavor runs on log-transformed data
-        sc.pp.highly_variable_genes(
-            self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5
-        )
-
-    def peakmem_highly_variable_genes(self, *_) -> None:
-        sc.pp.highly_variable_genes(
-            self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5
-        )
-
     # regress_out is very slow for this dataset
     @skip_when(dataset={"pbmc3k"})
     def time_regress_out(self, *_) -> None:
@@ -72,3 +61,23 @@ def time_scale(self, *_) -> None:
 
     def peakmem_scale(self, *_) -> None:
         sc.pp.scale(self.adata, max_value=10)
+
+
+class HVGSuite(PreprocessingSuite):  # noqa: D101
+    params = (*params, ["seurat_v3", "cell_ranger", "seurat"])
+    param_names = (*param_names, "flavor")
+
+    def setup(self, dataset, layer, flavor) -> None:
+        self.adata = ad.read_h5ad(f"{dataset}_{layer}.h5ad")
+        self.flavor = flavor
+
+    def time_highly_variable_genes(self, *_) -> None:
+        # the default flavor runs on log-transformed data
+        sc.pp.highly_variable_genes(
+            self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor
+        )
+
+    def peakmem_highly_variable_genes(self, *_) -> None:
+        sc.pp.highly_variable_genes(
+            self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5, flavor=self.flavor
+        )
diff --git a/pyproject.toml b/pyproject.toml
@@ -198,6 +198,8 @@ filterwarnings = [
     "ignore:.*'(parseAll)'.*'(parse_all)':DeprecationWarning",
     # igraph vs leidenalg warning
     "ignore:The `igraph` implementation of leiden clustering:UserWarning",
+    "ignore:Detected unsupported threading environment:UserWarning",
+    "ignore:Cannot cache compiled function",
 ]
 
 [tool.coverage.run]
diff --git a/src/scanpy/preprocessing/_highly_variable_genes.py b/src/scanpy/preprocessing/_highly_variable_genes.py
@@ -13,7 +13,7 @@
 from fast_array_utils import stats
 
 from .. import logging as logg
-from .._compat import CSBase, CSRBase, DaskArray, old_positionals, warn
+from .._compat import CSBase, CSRBase, DaskArray, njit, old_positionals, warn
 from .._settings import Verbosity, settings
 from .._utils import (
     check_nonnegative_integers,
@@ -98,8 +98,7 @@ def _(data_batch: CSBase, clip_val: np.ndarray) -> tuple[np.ndarray, np.ndarray]
     )
 
 
-# parallel=False needed for accuracy
-@numba.njit(cache=True, parallel=False)  # noqa: TID251
+@njit
 def _sum_and_sum_squares_clipped(
     indices: NDArray[np.integer],
     data: NDArray[np.floating],
@@ -108,13 +107,34 @@ def _sum_and_sum_squares_clipped(
     clip_val: NDArray[np.float64],
     nnz: int,
 ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
-    squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64)
-    batch_counts_sum = np.zeros(n_cols, dtype=np.float64)
+    """
+    Parallel implementation using thread-local buffers to avoid race conditions.
+
+    Previous implementation used parallel=False due to race condition on shared arrays.
+    This version uses explicit thread-local reduction to restore both correctness
+    and parallelism.
+    """
+    # Thread-local accumulators for parallel reduction
+    n_threads = numba.get_num_threads()
+    squared_local = np.zeros((n_threads, n_cols), dtype=np.float64)
+    sum_local = np.zeros((n_threads, n_cols), dtype=np.float64)
+
+    # Parallel accumulation into thread-local buffers (no race condition)
     for i in numba.prange(nnz):
+        tid = numba.get_thread_id()
         idx = indices[i]
         element = min(np.float64(data[i]), clip_val[idx])
-        squared_batch_counts_sum[idx] += element**2
-        batch_counts_sum[idx] += element
+        squared_local[tid, idx] += element**2
+        sum_local[tid, idx] += element
+
+    # Reduction phase: combine thread-local results
+    squared_batch_counts_sum = np.zeros(n_cols, dtype=np.float64)
+    batch_counts_sum = np.zeros(n_cols, dtype=np.float64)
+
+    for t in range(n_threads):
+        for j in range(n_cols):
+            squared_batch_counts_sum[j] += squared_local[t, j]
+            batch_counts_sum[j] += sum_local[t, j]
 
     return squared_batch_counts_sum, batch_counts_sum
 

Original file line number	Diff line number	Diff line change
`@@ -198,6 +198,8 @@ filterwarnings = [`
`198`	`198`	`"ignore:.'(parseAll)'.'(parse_all)':DeprecationWarning",`
`199`	`199`	`# igraph vs leidenalg warning`
`200`	`200`	"ignore:The `igraph` implementation of leiden clustering:UserWarning",
	`201`	`+ "ignore:Detected unsupported threading environment:UserWarning",`
	`202`	`+ "ignore:Cannot cache compiled function",`
`201`	`203`	`]`
`202`	`204`
`203`	`205`	`[tool.coverage.run]`