MarioniLab
diff --git a/‎.github/workflows/test.yaml‎
Lines changed: 7 additions & 4 deletions b/‎.github/workflows/test.yaml‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎.readthedocs.yaml‎
Lines changed: 5 additions & 7 deletions b/‎.readthedocs.yaml‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/oor_benchmark/api.py‎
Lines changed: 6 additions & 6 deletions b/‎src/oor_benchmark/api.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎src/oor_benchmark/datasets/simulation.py‎
Lines changed: 70 additions & 23 deletions b/‎src/oor_benchmark/datasets/simulation.py‎
Lines changed: 70 additions & 23 deletions
diff --git a/‎src/oor_benchmark/methods/_cna.py‎
Lines changed: 34 additions & 0 deletions b/‎src/oor_benchmark/methods/_cna.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎src/oor_benchmark/methods/_latent_embedding.py‎
Lines changed: 7 additions & 7 deletions b/‎src/oor_benchmark/methods/_latent_embedding.py‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎src/oor_benchmark/methods/_meld.py‎
Lines changed: 57 additions & 0 deletions b/‎src/oor_benchmark/methods/_meld.py‎
Lines changed: 57 additions & 0 deletions
@@ -4,7 +4,7 @@ on:
     push:
         branches: [master]
     pull_request:
-        branches: [master]
+        branches: [master, devel]
 
 jobs:
     test:
@@ -25,7 +25,7 @@ jobs:
 
         steps:
             - uses: actions/checkout@v2
-            - uses: r-lib/actions/setup-r@v1
+            - uses: r-lib/actions/setup-r@v2
             - name: Set up Python ${{ matrix.python }}
               uses: actions/setup-python@v2
               with:
@@ -62,5 +62,8 @@ jobs:
             - name: Upload coverage
               env:
                   CODECOV_NAME: ${{ matrix.python }}-${{ matrix.os }}
-              run: |
-                  codecov --required --flags=unittests
+              #   run: |
+              #       codecov --required --flags=unittests
+              uses: codecov/codecov-action@v3
+              with:
+                  token: ${{secrets.CODECOV_TOKEN}}
@@ -1,18 +1,16 @@
 # https://docs.readthedocs.io/en/stable/config-file/v2.html
 version: 2
-
-conda:
-    environment: environment.yaml
-
 build:
     os: ubuntu-20.04
     tools:
         python: "3.10"
-
 sphinx:
     configuration: docs/conf.py
+    # disable this for more lenient docs builds
     fail_on_warning: true
-
 python:
     install:
-        - requirements: docs/requirements.txt
+        - method: pip
+          path: .
+          extra_requirements:
+              - doc
@@ -26,7 +26,9 @@ dependencies = [
     "scanpy",
     "scvi-tools",
     "milopy @ git+https://github.com/emdann/milopy.git@master",
-    "sklearn"
+    "scikit-learn",  # canonical PyPI name; "sklearn" is a deprecated alias
+    "meld",
+    "cna"
     ]
 
 [project.optional-dependencies]
 
@@ -23,8 +23,8 @@ def check_method(adata: AnnData):
     assert "OOR_score" in adata.uns["sample_adata"].var
     assert "OOR_signif" in adata.uns["sample_adata"].var
     assert all(adata.uns["sample_adata"].var["OOR_signif"].isin([0, 1]))
-    assert "groups" in adata.uns["sample_adata"].varm
-    assert isinstance(adata.uns["sample_adata"].varm["groups"], csc_matrix)
+    if "groups" in adata.uns["sample_adata"].varm:
+        assert isinstance(adata.uns["sample_adata"].varm["groups"], csc_matrix)
     return True
 
 
@@ -49,8 +49,8 @@ def sample_dataset():
     adata.obs.loc[adata.obs["sample_id"].isin([f"S{n}" for n in range(8)]), "dataset_group"] = "atlas"
     adata.obs.loc[adata.obs["sample_id"].isin([f"S{n}" for n in range(8, 12)]), "dataset_group"] = "ctrl"
     adata.obs.loc[adata.obs["sample_id"].isin([f"S{n}" for n in range(12, 16)]), "dataset_group"] = "query"
-    # # Make out-of-reference cell state
-    # adata.obs["OOR_state"] = np.where(adata.obs["louvain"] == "B cells", 1, 0)
-    # remove_cells = adata.obs_names[(adata.obs["OOR_state"] == 1) & (adata.obs["dataset_group"] != "query")]
-    # adata = adata[~adata.obs_names.isin(remove_cells)].copy()
+    # Make out-of-reference cell state
+    adata.obs["OOR_state"] = np.where(adata.obs["louvain"] == "B cells", 1, 0)
+    remove_cells = adata.obs_names[(adata.obs["OOR_state"] == 1) & (adata.obs["dataset_group"] != "query")]
+    adata = adata[~adata.obs_names.isin(remove_cells)].copy()
     return adata
@@ -1,7 +1,9 @@
 from typing import List, Union
 
 import numpy as np
+import scanpy as sc
 from anndata import AnnData
+from sklearn.neighbors import KNeighborsClassifier
 
 
 def _split_train_test(adata: AnnData, annotation_col: str = "leiden", test_frac: float = 0.2):
@@ -23,10 +25,12 @@ def simulate_query_reference(
     ctrl_batch: Union[List[str], None] = None,
     annotation_col: str = "leiden",
     query_annotation: Union[List[str], None] = None,
-    perturbation_type: str = "remove",
+    perturbation_type: Union[str, List[str]] = "remove",
     test_frac: float = 0.2,
-    DA_frac: float = 0.2,
+    # DA_frac: float = 0.2,
+    split_pc: int = 0,
     seed=42,
+    use_rep_shift: str = "X_scVI",
 ):
     """
     Split single-cell dataset in a atlas, control and query dataset.
@@ -56,12 +60,18 @@ def simulate_query_reference(
         will be removed from the samples in ctrl_batch (the fraction specified by DA_test)
         if equal to 'depletion' a fraction of the cells in population specified in query_annotation
         will be removed from the samples in query_batch (the fraction specified by DA_test)
+        if equal to 'shift', the population is split along a principal component and one half is removed from ctrl and atlas
+
     test_frac:
         fraction of cells in each population to be included in the query group (only used if batch_col is None)
     DA_frac:
         the fraction of cells of query_annotation to keep in control if perturbation_type is 'expansion', or in query if perturbation_type is 'depletion'
+    split_pc:
+        index of PC to use for splitting (default: 0, using PC1) (only used if perturbation_type=shift)
     seed:
         random seed for sampling
+    use_rep_shift:
+        representation to use to find neighbors in atlas dataset for shift perturbation (default: 'X_scVI')
 
     Returns:
     --------
@@ -99,29 +109,63 @@ def simulate_query_reference(
         query_annotation = np.random.choice(adata.obs[annotation_col].unique(), size=1)
 
     #  Apply perturbation
-    if perturbation_type == "remove":
-        adata.obs.loc[(adata.obs[annotation_col].isin(query_annotation)), "is_train"] = 0
-        if ctrl_batch is not None:
-            adata.obs.loc[(adata.obs[annotation_col].isin(query_annotation)), "is_ctrl"] = 0
-
-    elif perturbation_type == "expansion":
-        for b in ctrl_batch:
-            query_pop_cells = adata.obs_names[
-                (adata.obs[batch_col] == b) & (adata.obs[annotation_col].isin(query_annotation))
+    if isinstance(perturbation_type, str):
+        perturb_types = [perturbation_type] * len(query_annotation)
+    elif isinstance(perturbation_type, list):
+        assert len(perturbation_type) == len(
+            query_annotation
+        ), "If perturbation_type is a list, it should be the same length as query_annotation"
+        perturb_types = perturbation_type.copy()
+    else:
+        raise TypeError(
+            "perturbation_type should be a string or a list of strings of the same length as query_annotation"
+        )
+
+    perturb_annotations = query_annotation.copy()
+    oor_cells = []
+
+    for query_annotation, perturbation_type in zip(perturb_annotations, perturb_types):
+        if perturbation_type == "remove":
+            adata.obs.loc[(adata.obs[annotation_col] == query_annotation), "is_train"] = 0
+            if ctrl_batch is not None:
+                adata.obs.loc[(adata.obs[annotation_col] == query_annotation), "is_ctrl"] = 0
+            oor_cells_p = adata.obs_names[adata.obs[annotation_col] == query_annotation].tolist()
+            oor_cells.extend(oor_cells_p)
+
+        elif perturbation_type == "shift":
+            split_pop_cells = adata.obs_names[
+                (adata.obs[annotation_col] == query_annotation) & (adata.obs["is_train"] == 0)
             ]
-            cells2remove = np.random.choice(query_pop_cells, size=int(np.round(len(query_pop_cells) * (1 - DA_frac))))
-            adata.obs.loc[cells2remove, "is_ctrl"] = 0
-
-    elif perturbation_type == "depletion":
-        for b in query_batch:
-            query_pop_cells = adata.obs_names[
-                (adata.obs[batch_col] == b) & (adata.obs[annotation_col].isin(query_annotation))
+            # Run PCA on perturbation population (just query dataset to avoid batch effects)
+            split_pop_adata = adata[adata.obs_names.isin(split_pop_cells)].copy()
+            sc.pp.normalize_per_cell(split_pop_adata)
+            sc.pp.log1p(split_pop_adata)
+            sc.pp.pca(split_pop_adata)
+            pc2split = split_pop_adata.obsm["X_pca"][:, split_pc]
+            test_size = int(np.round(len(split_pop_cells) * 0.5))
+            idx = np.argpartition(pc2split, test_size)
+            cells2remove = split_pop_cells[idx[:test_size]].values
+
+            # Find neighbors in atlas cells
+            split_pop_adata.obs["remove"] = split_pop_adata.obs_names.isin(cells2remove).astype(int)
+            split_pop_cells_atlas = adata.obs_names[
+                (adata.obs[annotation_col] == query_annotation) & (adata.obs["is_train"] == 1)
             ]
-            cells2remove = np.random.choice(query_pop_cells, size=int(np.round(len(query_pop_cells) * (1 - DA_frac))))
-            adata.obs.loc[cells2remove, "is_query"] = 0
+            X_train = adata[split_pop_cells].obsm[use_rep_shift]
+            Y_train = split_pop_adata.obs["remove"]
+            X_atlas = adata[split_pop_cells_atlas].obsm[use_rep_shift]
 
-    else:
-        raise ValueError("perturbation type should be one of 'remove' or 'perturb_pc'")
+            neigh = KNeighborsClassifier(n_neighbors=10)
+            neigh = neigh.fit(X_train, Y_train)
+            atlas_cells2remove = split_pop_cells_atlas[neigh.predict(X_atlas) == 1]
+
+            adata.obs.loc[cells2remove, "is_ctrl"] = 0
+            adata.obs.loc[atlas_cells2remove, "is_train"] = 0
+            oor_cells_p = adata.obs_names[(adata.obs["is_test"] == 1) & (adata.obs_names.isin(cells2remove))].tolist()
+            oor_cells.extend(oor_cells_p)
+
+        else:
+            raise ValueError("perturbation type should be one of 'remove' or 'shift'")
     adata.uns["perturbation"] = {
         "annotation_col": annotation_col,
         "batch_col": batch_col,
@@ -137,7 +181,10 @@ def simulate_query_reference(
     adata.obs["dataset_group"] = np.where(adata.obs["is_train"] == 1, "atlas", adata.obs["dataset_group"])
     adata = adata[adata.obs["dataset_group"] != "exclude"].copy()  # remove cells that are not in any group
 
-    adata.obs["OOR_state"] = (adata.obs[annotation_col].isin(query_annotation)).astype(int)
+    # if perturbation_type == "remove":
+    #     adata.obs["OOR_state"] = (adata.obs[annotation_col].isin(query_annotation)).astype(int)
+    # elif perturbation_type == "shift":
+    adata.obs["OOR_state"] = (adata.obs_names.isin(oor_cells)).astype(int)
 
     adata.obs["cell_annotation"] = adata.obs[annotation_col].copy()
     adata.obs["sample_id"] = adata.obs[batch_col].copy()
 
@@ -0,0 +1,34 @@
+import cna
+from anndata import AnnData
+from multianndata import MultiAnnData
+
+
+def run_cna(adata_design: AnnData, query_group: str, reference_group: str, sample_col: str = "sample_id"):
+    """
+    Run CNA association testing between the query and reference groups.
+
+    Following tutorial in https://nbviewer.org/github/yakirr/cna/blob/master/demo/demo.ipynb
+
+    Parameters:
+    ------------
+    adata_design : AnnData
+        AnnData object of disease and reference cells to compare
+    query_group : str
+        Name of query group in adata_design.obs['dataset_group']
+    reference_group : str
+        Name of reference group in adata_design.obs['dataset_group']
+    sample_col : str
+        Name of column in adata_design.obs to use as sample ID
+    """
+    adata_design = MultiAnnData(adata_design, sampleid=sample_col)  # rebinds the local name to a new object
+    adata_design.obs["dataset_group"] = adata_design.obs["dataset_group"].astype("category")
+    adata_design.obs["dataset_group_code"] = (
+        adata_design.obs["dataset_group"].cat.reorder_categories([reference_group, query_group]).cat.codes
+    )  # reference_group -> 0, query_group -> 1
+    adata_design.obs_to_sample(["dataset_group_code"])  # presumably fills the per-sample table read below
+
+    res = cna.tl.association(adata_design, adata_design.samplem.dataset_group_code)
+
+    adata_design.obs["CNA_ncorrs"] = res.ncorrs  # NOTE(review): set on the rebound object - confirm caller sees it
+
+    return None
@@ -27,13 +27,11 @@ def embedding_scvi(adata_merge: AnnData, n_hvgs: int = 5000, outdir: str = None,
     dataset_groups = adata_merge.obs["dataset_group"].unique().tolist()
     dataset_groups.sort()
     ref_dataset = "".join(dataset_groups)
-    # adata_merge = anndata.concat([adata_query, adata_ref])
-    # adata_merge.layers["counts"] = adata_merge.X.copy()
     adata_merge_train = adata_merge.copy()
 
     # Filter genes
     adata_merge_train.layers["counts"] = adata_merge_train.X.copy()
-    _filter_genes_scvi(adata_merge_train)
+    _filter_genes_scvi(adata_merge_train, n_hvgs=n_hvgs)
 
     # Train scVI model
     if outdir is not None:
@@ -42,7 +40,6 @@ def embedding_scvi(adata_merge: AnnData, n_hvgs: int = 5000, outdir: str = None,
 
     # Get latent embeddings
     adata_merge.obsm["X_scVI"] = model_scvi.get_latent_representation()
-    # return adata_merge
 
 
 def embedding_scArches(
@@ -79,7 +76,7 @@ def embedding_scArches(
     assert ref_dataset in adata_merge.obs["dataset_group"].unique().tolist()
     adata_merge.layers["counts"] = adata_merge.X.copy()
     adata_ref_train = adata_merge[adata_merge.obs["dataset_group"] == ref_dataset].copy()
-    _filter_genes_scvi(adata_ref_train)
+    _filter_genes_scvi(adata_ref_train, n_hvgs=n_hvgs)
 
     # Train scVI model
     if outdir is not None:
@@ -122,6 +119,9 @@ def _train_scVI(train_adata: AnnData, train_params: dict = None, outfile: str =
     \**kwargs : dict, optional
         Extra arguments to `scvi.model.SCVI.setup_anndata` (specifying batch etc)
     """
+    if train_params is None:
+        train_params = {}
+
     scvi.model.SCVI.setup_anndata(train_adata, layer="counts", **kwargs)
 
     arches_params = {
@@ -173,7 +173,7 @@ def _fit_scVI(
 # --- Latent embedding utils --- #
 
 
-def _filter_genes_scvi(adata: AnnData):
+def _filter_genes_scvi(adata: AnnData, n_hvgs: int = 5000) -> None:
     """Filter genes for latent embedding."""
     # Filter genes not expressed anywhere
     sc.pp.filter_genes(adata, min_cells=1)
@@ -183,4 +183,4 @@ def _filter_genes_scvi(adata: AnnData):
         sc.pp.normalize_per_cell(adata)
         sc.pp.log1p(adata)
 
-    sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=True)
+    sc.pp.highly_variable_genes(adata, n_top_genes=n_hvgs, subset=True)
@@ -0,0 +1,57 @@
+import meld
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+
+
+def run_meld(
+    adata_design: AnnData, query_group: str, reference_group: str, sample_col: str = "sample_id", n_neighbors=10
+):
+    """
+    Run MELD on adata_design.obsm['X_scVI'] and store per-condition probability estimates in adata_design.obsm.
+
+    Parameters:
+    ------------
+    adata_design : AnnData
+        AnnData object of disease and reference cells to compare (modified in place, nothing is returned)
+    query_group : str
+        Name of query group in adata_design.obs['dataset_group']
+    reference_group : str
+        Name of reference group in adata_design.obs['dataset_group']
+    sample_col : str
+        Name of column in adata_design.obs to use as sample ID (MELD labels and per-condition sample lookup)
+    n_neighbors : int
+        Number of neighbors to use for MELD KNN graph (default: 10)
+    """
+    adata_design.obs["is_query"] = adata_design.obs["dataset_group"] == query_group
+    adata_design.uns["n_conditions"] = 2
+
+    # Per-sample densities (columns are later selected by sample ID), re-indexed by cell name, stored in place
+    meld_op = meld.MELD(knn=n_neighbors, verbose=True)
+    adata_design.obsm["sample_densities"] = meld_op.fit_transform(
+        adata_design.obsm["X_scVI"], sample_labels=adata_design.obs[sample_col]
+    ).set_index(adata_design.obs_names)
+
+    # Mean sample density per condition, normalized below with meld.utils.normalize_densities
+    adata_design.obsm["probability_estimate"] = pd.DataFrame(
+        np.zeros(shape=(adata_design.n_obs, adata_design.uns["n_conditions"])),
+        index=adata_design.obs_names,
+        columns=["query", "reference"],
+    )
+
+    query_samples = adata_design.obs[sample_col][adata_design.obs["dataset_group"] == query_group].unique().tolist()
+    reference_samples = (
+        adata_design.obs[sample_col][adata_design.obs["dataset_group"] == reference_group].unique().tolist()
+    )
+
+    adata_design.obsm["probability_estimate"]["query"] = adata_design.obsm["sample_densities"][query_samples].mean(
+        axis=1
+    )
+    adata_design.obsm["probability_estimate"]["reference"] = adata_design.obsm["sample_densities"][
+        reference_samples
+    ].mean(axis=1)
+    adata_design.obsm["probability_estimate"] = meld.utils.normalize_densities(
+        adata_design.obsm["probability_estimate"]
+    )
+
+    return None