diff --git a/docs/release-notes/3616.feat.md b/docs/release-notes/3616.feat.md
new file mode 100644
index 0000000000..78502887a1
--- /dev/null
+++ b/docs/release-notes/3616.feat.md
@@ -0,0 +1 @@
+Add modularity scoring via {func}`scanpy.metrics.modularity` with support for directed/undirected graphs {smaller}`A. Karesh`
diff --git a/src/scanpy/_utils/__init__.py b/src/scanpy/_utils/__init__.py
index 5cf70ffde3..4c5e5ec1d1 100644
--- a/src/scanpy/_utils/__init__.py
+++ b/src/scanpy/_utils/__init__.py
@@ -883,20 +883,22 @@ class NeighborsView:
     This defines where to look for neighbors dictionary,
     connectivities, distances.

-    neigh = NeighborsView(adata, key)
-    neigh['distances']
-    neigh['connectivities']
-    neigh['params']
-    'connectivities' in neigh
-    'params' in neigh
-
-    is the same as
-
-    adata.obsp[adata.uns[key]['distances_key']]
-    adata.obsp[adata.uns[key]['connectivities_key']]
-    adata.uns[key]['params']
-    adata.uns[key]['connectivities_key'] in adata.obsp
-    'params' in adata.uns[key]
+    Examples
+    --------
+    >>> neigh = NeighborsView(adata, key)
+    >>> neigh["distances"]
+    >>> neigh["connectivities"]
+    >>> neigh["params"]
+    >>> "connectivities" in neigh
+    >>> "params" in neigh
+
+    is the same as
+
+    >>> adata.obsp[adata.uns[key]["distances_key"]]
+    >>> adata.obsp[adata.uns[key]["connectivities_key"]]
+    >>> adata.uns[key]["params"]
+    >>> adata.uns[key]["connectivities_key"] in adata.obsp
+    >>> "params" in adata.uns[key]
     """
diff --git a/src/scanpy/metrics/__init__.py b/src/scanpy/metrics/__init__.py
index 9cdf82bdf2..03de5baf44 100644
--- a/src/scanpy/metrics/__init__.py
+++ b/src/scanpy/metrics/__init__.py
@@ -3,7 +3,7 @@
 from __future__ import annotations

 from ._gearys_c import gearys_c
-from ._metrics import confusion_matrix
+from ._metrics import confusion_matrix, modularity
 from ._morans_i import morans_i

-__all__ = ["confusion_matrix", "gearys_c", "morans_i"]
+__all__ = ["confusion_matrix", "gearys_c", "modularity", "morans_i"]
diff --git a/src/scanpy/metrics/_metrics.py b/src/scanpy/metrics/_metrics.py
index 83957fcd4a..7b233cf237 100644
--- a/src/scanpy/metrics/_metrics.py
+++ b/src/scanpy/metrics/_metrics.py
@@ -2,15 +2,23 @@

 from __future__ import annotations

-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, overload

 import numpy as np
 import pandas as pd
+from anndata import AnnData
 from natsort import natsorted
 from pandas.api.types import CategoricalDtype

+from .._utils import NeighborsView
+
 if TYPE_CHECKING:
     from collections.abc import Sequence
+    from typing import Literal
+
+    from numpy.typing import ArrayLike
+
+    from .._compat import SpBase


 def confusion_matrix(
@@ -89,3 +97,127 @@ def confusion_matrix(
         df = df.loc[np.array(orig_idx), np.array(new_idx)]

     return df
+
+
+@overload
+def modularity(
+    connectivities: ArrayLike | SpBase,
+    /,
+    labels: pd.Series | ArrayLike,
+    *,
+    is_directed: bool,
+) -> float: ...
+
+
+@overload
+def modularity(
+    adata: AnnData,
+    /,
+    labels: str | pd.Series | ArrayLike = "leiden",
+    *,
+    neighbors_key: str | None = None,
+    is_directed: bool | None = None,
+    mode: Literal["calculate", "update", "retrieve"] = "calculate",
+) -> float: ...
+
+
+def modularity(
+    adata_or_connectivities: AnnData | ArrayLike | SpBase,
+    /,
+    labels: str | pd.Series | ArrayLike = "leiden",
+    *,
+    neighbors_key: str | None = None,
+    is_directed: bool | None = None,
+    mode: Literal["calculate", "update", "retrieve"] = "calculate",
+) -> float:
+    """Compute the modularity of a graph given its connectivities and labels.
+
+    Parameters
+    ----------
+    adata_or_connectivities
+        The AnnData object containing the data or a weighted adjacency matrix representing the graph.
+    labels
+        Cluster labels for each node in the graph.
+        When `AnnData` is provided, this can be the key in `adata.obs` that contains the clustering labels and defaults to `"leiden"`.
+    neighbors_key
+        When `AnnData` is provided, the key in `adata.obsp` that contains the connectivities.
+    is_directed
+        Whether the graph is directed or undirected.
+        Optional when an `AnnData` object has been passed.
+    mode
+        When `AnnData` is provided, this controls whether the stored modularity is retrieved (`'retrieve'`),
+        calculated (`'calculate'`), or calculated and written back to `adata.uns[labels]` (`'update'`).
+
+    Returns
+    -------
+    The modularity of the graph based on the provided clustering.
+    """
+    if isinstance(adata_or_connectivities, AnnData):
+        return modularity_adata(
+            adata_or_connectivities,
+            labels=labels,
+            neighbors_key=neighbors_key,
+            is_directed=is_directed,
+            mode=mode,
+        )
+    if isinstance(labels, str):
+        msg = "`labels` must be provided as an array when passing a connectivities array"
+        raise TypeError(msg)
+    if is_directed is None:
+        msg = "`is_directed` must be provided when passing a connectivities array"
+        raise TypeError(msg)
+    return modularity_array(
+        adata_or_connectivities, labels=labels, is_directed=is_directed
+    )
+
+
+def modularity_adata(
+    adata: AnnData,
+    /,
+    *,
+    labels: str | pd.Series | ArrayLike,
+    neighbors_key: str | None,
+    is_directed: bool | None,
+    mode: Literal["calculate", "update", "retrieve"],
+) -> float:
+    if mode in {"retrieve", "update"} and not isinstance(labels, str):
+        msg = "`labels` must be a string when `mode` is `'retrieve'` or `'update'`"
+        raise ValueError(msg)
+    if mode == "retrieve":
+        return adata.uns[labels]["modularity"]
+
+    nv = NeighborsView(adata, neighbors_key)
+    connectivities = nv["connectivities"]
+
+    if is_directed is None and (is_directed := nv["params"].get("is_directed")) is None:
+        msg = "`adata` has no `'is_directed'` in `adata.uns[neighbors_key]['params']`, need to specify `is_directed`"
+        raise ValueError(msg)
+
+    m = modularity(
+        connectivities,
+        adata.obs[labels] if isinstance(labels, str) else labels,
+        is_directed=is_directed,
+    )
+    if mode == "update":
+        adata.uns[labels]["modularity"] = m
+    return m
+
+
+def modularity_array(
+    connectivities: ArrayLike | SpBase,
+    /,
+    *,
+    labels: pd.Series | ArrayLike,
+    is_directed: bool,
+) -> float:
+    try:
+        import igraph as ig
+    except ImportError as e:
+        msg = "igraph is required for computing modularity"
+        raise ImportError(msg) from e
+    igraph_mode = ig.ADJ_DIRECTED if is_directed else ig.ADJ_UNDIRECTED
+    graph = ig.Graph.Weighted_Adjacency(connectivities, mode=igraph_mode)
+    # convert cluster labels to the integer codes required by igraph
+    labels = pd.Categorical(np.asarray(labels)).codes
+    return graph.modularity(labels)
diff --git a/src/scanpy/neighbors/__init__.py b/src/scanpy/neighbors/__init__.py
index 3efb392a1d..f2695d67bb 100644
--- a/src/scanpy/neighbors/__init__.py
+++ b/src/scanpy/neighbors/__init__.py
@@ -75,6 +75,7 @@ class NeighborsParams(TypedDict):  # noqa: D101
     metric_kwds: NotRequired[Mapping[str, Any]]
     use_rep: NotRequired[str]
     n_pcs: NotRequired[int]
+    is_directed: NotRequired[bool]


 @_doc_params(n_pcs=doc_n_pcs, use_rep=doc_use_rep)
@@ -92,6 +93,7 @@ def neighbors(  # noqa: PLR0913
     metric_kwds: Mapping[str, Any] = MappingProxyType({}),
     random_state: _LegacyRandom = 0,
     key_added: str | None = None,
+    is_directed: bool = False,
     copy: bool = False,
 ) -> AnnData | None:
     """Compute the nearest neighbors distance matrix and a neighborhood graph of observations :cite:p:`McInnes2018`.
@@ -169,6 +171,8 @@ def neighbors(  # noqa: PLR0913
         If specified, the neighbors data is added to .uns[key_added],
         distances are stored in `.obsp[f'{{key_added}}_distances']` and
         connectivities in `.obsp[f'{{key_added}}_connectivities']`.
+    is_directed
+        If `True`, the connectivity matrix is expected to be a directed graph.
     copy
         Return a copy instead of writing to adata.

@@ -264,6 +268,7 @@ def neighbors(  # noqa: PLR0913
         method=method,
         random_state=random_state,
         metric=metric,
+        is_directed=is_directed,
         **({} if not metric_kwds else dict(metric_kwds=metric_kwds)),
         **({} if use_rep is None else dict(use_rep=use_rep)),
         **({} if n_pcs is None else dict(n_pcs=n_pcs)),
diff --git a/src/scanpy/tools/_leiden.py b/src/scanpy/tools/_leiden.py
index ce0c26337a..69ff3eba5f 100644
--- a/src/scanpy/tools/_leiden.py
+++ b/src/scanpy/tools/_leiden.py
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast

 import numpy as np
 import pandas as pd
@@ -47,7 +47,7 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
     flavor: Literal["leidenalg", "igraph"] | None = None,
     **clustering_args,
 ) -> AnnData | None:
-    """Cluster cells into subgroups :cite:p:`Traag2019`.
+    r"""Cluster cells into subgroups :cite:p:`Traag2019`.

     Cluster cells using the Leiden algorithm :cite:p:`Traag2019`,
     an improved version of the Louvain algorithm :cite:p:`Blondel2008`.
@@ -120,6 +120,12 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
         A dict with the values for the parameters `resolution`, `random_state`,
         and `n_iterations`.

+    `adata.uns['leiden' | key_added]['modularity']` : :class:`float`
+        The modularity score of the final clustering,
+        as calculated by the `flavor`.
+        Use :func:`scanpy.metrics.modularity`\ `(adata, mode='calculate' | 'update')`
+        to calculate a score independent of `flavor`.
+
     """
     if flavor is None:
         flavor = "leidenalg"
@@ -178,7 +184,10 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
         if use_weights:
             clustering_args["weights"] = np.array(g.es["weight"]).astype(np.float64)
         clustering_args["seed"] = random_state
-        part = leidenalg.find_partition(g, partition_type, **clustering_args)
+        part = cast(
+            "MutableVertexPartition",
+            leidenalg.find_partition(g, partition_type, **clustering_args),
+        )
     else:
         g = _utils.get_igraph_from_adjacency(adjacency, directed=False)
         if use_weights:
@@ -212,6 +221,7 @@ def leiden(  # noqa: PLR0912, PLR0913, PLR0915
         random_state=random_state,
         n_iterations=n_iterations,
     )
+    adata.uns[key_added]["modularity"] = part.modularity
     logg.info(
         "    finished",
         time=start,
diff --git a/tests/test_clustering.py b/tests/test_clustering.py
index e3780595af..4c6902457c 100644
--- a/tests/test_clustering.py
+++ b/tests/test_clustering.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from functools import partial
+from typing import TYPE_CHECKING

 import pandas as pd
 import pytest
@@ -10,21 +11,27 @@
 from testing.scanpy._helpers.data import pbmc68k_reduced
 from testing.scanpy._pytest.marks import needs

+if TYPE_CHECKING:
+    from typing import Literal
+

 @pytest.fixture
 def adata_neighbors():
     return pbmc68k_reduced()


-FLAVORS = [
-    pytest.param("igraph", marks=needs.igraph),
-    pytest.param("leidenalg", marks=needs.leidenalg),
-]
+@pytest.fixture(
+    params=[
+        pytest.param("igraph", marks=needs.igraph),
+        pytest.param("leidenalg", marks=needs.leidenalg),
+    ]
+)
+def flavor(request: pytest.FixtureRequest) -> Literal["igraph", "leidenalg"]:
+    return request.param


 @needs.leidenalg
 @needs.igraph
-@pytest.mark.parametrize("flavor", FLAVORS)
 @pytest.mark.parametrize("resolution", [1, 2])
 @pytest.mark.parametrize("n_iterations", [-1, 3])
 def test_leiden_basic(adata_neighbors, flavor, resolution, n_iterations):
@@ -44,7 +51,6 @@ def test_leiden_basic(adata_neighbors, flavor, resolution, n_iterations):

 @needs.leidenalg
 @needs.igraph
-@pytest.mark.parametrize("flavor", FLAVORS)
 def test_leiden_random_state(adata_neighbors, flavor):
     is_leiden_alg = flavor == "leidenalg"
     n_iterations = 2 if is_leiden_alg else -1
@@ -72,8 +78,18 @@ def test_leiden_random_state(adata_neighbors, flavor):
         directed=is_leiden_alg,
         n_iterations=n_iterations,
     )
+    # reproducible
     pd.testing.assert_series_equal(adata_1.obs["leiden"], adata_1_again.obs["leiden"])
+    assert (
+        pytest.approx(adata_1.uns["leiden"]["modularity"])
+        == adata_1_again.uns["leiden"]["modularity"]
+    )
+    # different clustering
     assert not adata_2.obs["leiden"].equals(adata_1_again.obs["leiden"])
+    assert (
+        pytest.approx(adata_2.uns["leiden"]["modularity"])
+        != adata_1_again.uns["leiden"]["modularity"]
+    )


 @needs.igraph
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index c644839670..4e371cc6b5 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -2,6 +2,7 @@

 import warnings
 from functools import partial
+from itertools import combinations
 from string import ascii_letters
 from typing import TYPE_CHECKING

@@ -13,7 +14,9 @@
 from scipy import sparse

 import scanpy as sc
-from testing.scanpy._helpers.data import pbmc68k_reduced
+from scanpy.metrics import modularity
+from testing.scanpy._helpers.data import pbmc3k, pbmc68k_reduced
+from testing.scanpy._pytest.marks import needs
 from testing.scanpy._pytest.params import ARRAY_TYPES

 if TYPE_CHECKING:
@@ -241,3 +244,109 @@ def test_confusion_matrix_api() -> None:
     pd.testing.assert_frame_equal(
         expected, sc.metrics.confusion_matrix(data["a"], "b", data)
     )
+
+
+@pytest.mark.parametrize("is_directed", [False, True], ids=["undirected", "directed"])
+@pytest.mark.parametrize("use_sparse", [False, True], ids=["dense", "sparse"])
+@needs.igraph
+def test_modularity_sample_structure(*, use_sparse: bool, is_directed: bool) -> None:
+    """Sample graph with clear community structure (dense & sparse, directed & undirected)."""
+    # 4-node adjacency matrix with two separate 2-node communities
+    mat = np.array([
+        [1, 1, 0, 0],
+        [1, 1, 0, 0],
+        [0, 0, 1, 1],
+        [0, 0, 1, 1],
+    ])
+    labels = ["A", "A", "B", "B"]
+    adj = sparse.csr_matrix(mat) if use_sparse else mat  # noqa: TID251
+
+    score = modularity(adj, labels, is_directed=is_directed)
+
+    # Modularity should be between 0 and 1
+    assert 0 <= score <= 1
+
+
+@needs.igraph
+def test_modularity_single_community() -> None:
+    """Edge case when all nodes belong to the same community/cluster."""
+    # fully connected sample graph
+    adj = np.ones((4, 4)) - np.eye(4)
+    labels = ["A", "A", "A", "A"]
+
+    score = modularity(adj, labels, is_directed=False)
+
+    # modularity should be 0
+    assert score == pytest.approx(0.0, rel=1e-6)
+
+
+@needs.igraph
+def test_modularity_invalid_labels() -> None:
+    """Invalid input: labels length does not match adjacency matrix size."""
+    import igraph as ig
+
+    adj = np.eye(4)
+    labels = ["A", "A", "B"]
+
+    with pytest.raises(ig.InternalError, match=r"Membership vector size differs"):
+        modularity(adj, labels, is_directed=False)
+
+
+@needs.igraph
+def test_modularity_adata(
+    monkeypatch: pytest.MonkeyPatch, subtests: pytest.Subtests
+) -> None:
+    """Test domain and API of the modularity score."""
+    adata = pbmc3k()
+    sc.pp.pca(adata)
+    sc.pp.neighbors(adata)
+    sc.tl.leiden(adata, flavor="igraph")
+
+    scores = {}
+    with monkeypatch.context() as m:
+        # retrieve pre-calculated modularity
+        m.delattr(sc.metrics._metrics, "modularity_array")
+        scores["retrieve"] = modularity(adata, labels="leiden", mode="retrieve")
+    scores["calculate"] = modularity(adata, labels="leiden", mode="calculate")
+    del adata.uns["leiden"]["modularity"]
+    scores["update"] = modularity(adata, labels="leiden", mode="update")
+
+    for name, s in scores.items():
+        with subtests.test("bounds", score=name):
+            assert 0 <= s <= 1
+    for (n0, s0), (n1, s1) in combinations(scores.items(), 2):
+        rel = 1e-6 if {n0, n1} == {"update", "calculate"} else 1e-3
+        with subtests.test("equality", l=n0, r=n1):
+            assert pytest.approx(s0, rel=rel) == s1
+    with subtests.test("update"):
+        assert adata.uns["leiden"]["modularity"] is scores["update"]
+
+
+@needs.igraph
+def test_modularity_order() -> None:
+    """Modularity should be the same no matter the order of the labels."""
+    adj = np.array([
+        [1, 1, 0, 0],
+        [1, 1, 0, 0],
+        [0, 0, 1, 1],
+        [0, 0, 1, 1],
+    ])
+    labels1 = ["A", "A", "B", "B"]
+    labels2 = ["B", "B", "A", "A"]
+
+    score_1 = modularity(adj, labels1, is_directed=False)
+    score_2 = modularity(adj, labels2, is_directed=False)
+
+    assert score_1 == score_2
+
+
+@needs.igraph
+def test_modularity_disconnected_graph() -> None:
+    """Modularity on a fully disconnected graph, an edge case for some algorithms."""
+    adj = np.zeros((4, 4))
+    labels = ["A", "B", "C", "D"]
+
+    score = modularity(adj, labels, is_directed=False)
+
+    # Modularity should be undefined for disconnected graphs
+    assert np.isnan(score)
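
Not part of the patch, but a minimal usage sketch of how the AnnData-facing pieces above fit together, assuming `igraph` is installed; the dataset is only illustrative and mirrors what the new tests do.

```python
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# is_directed is recorded in .uns["neighbors"]["params"] and later picked up
# as the default by sc.metrics.modularity
sc.pp.neighbors(adata, is_directed=False)
# leiden now stores its flavor's score in .uns["leiden"]["modularity"]
sc.tl.leiden(adata, flavor="igraph")

# read back the stored score without recomputing anything
stored = sc.metrics.modularity(adata, labels="leiden", mode="retrieve")
# recompute independently of the clustering flavor; "update" also writes it back
recomputed = sc.metrics.modularity(adata, labels="leiden", mode="update")
```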
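
The array-based overload can be exercised directly as well; a sketch mirroring the toy graph from the new tests (when a raw connectivities matrix is passed, `labels` must be array-like and `is_directed` is required):

```python
import numpy as np
import scanpy as sc

# two disjoint 2-node communities, as in test_modularity_sample_structure
adj = np.array([
    [1, 1, 0, 0],
    [1, 1, 0, 0],
    [0, 0, 1, 1],
    [0, 0, 1, 1],
])
labels = ["A", "A", "B", "B"]

score = sc.metrics.modularity(adj, labels, is_directed=False)
print(score)  # positive for this clearly separated toy graph
```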