diff --git a/gemmapy/gemmapy_api.py b/gemmapy/gemmapy_api.py
index 72a0993..03636e5 100644
--- a/gemmapy/gemmapy_api.py
+++ b/gemmapy/gemmapy_api.py
@@ -7,14 +7,18 @@
 import logging
 import os
 import subprocess
+import tarfile
+import tempfile
 import warnings
 from getpass import getpass
-from io import StringIO
-from typing import Optional, List, Callable
+from io import StringIO, BytesIO
+from os.path import join
+from typing import Optional, List, Callable, Any
 
 import anndata as ad
 import numpy as np
 import pandas as pd
+import scanpy
 from anndata import AnnData
 from pandas import DataFrame
 
@@ -1667,7 +1671,63 @@ def make_anndata(pack):
             pass
         return out
 
-    def get_differential_expression_values(self,
+    def get_single_cell_dataset_object(self, dataset: str | int,
+                                       download_dir=None) -> AnnData:
+        """
+        Retrieve the single-cell expression data of a dataset as a single
+        AnnData object.
+
+        The data is obtained as a TAR archive, extracted to a temporary
+        directory, and each top-level entry of the archive is read as MEX
+        data with scanpy. The resulting objects are then concatenated.
+
+        :param dataset: Identifier of the dataset to retrieve
+        :param download_dir: Directory where datasets can be downloaded, or else
+        the data will be retrieved in-memory. An archive already present in
+        that directory is reused instead of being downloaded again.
+        :return: An AnnData object with the samples of the dataset
+        """
+
+        def resolve():
+            # Open the TAR archive from download_dir if set, else from memory
+            if download_dir:
+                # dataset may be an int, convert it to build the file name
+                dest = join(download_dir, str(dataset) + '.tar')
+                if not os.path.exists(dest):
+                    logger.info('Downloading single-cell data for %s to %s...',
+                                dataset, download_dir)
+                    # Fetch before creating the file so that a failed request
+                    # does not leave an empty archive that would be reused
+                    content = self.raw.get_dataset_single_cell_expression(
+                        dataset)
+                    with open(dest, 'wb') as f:
+                        f.write(content)
+                return open(dest, 'rb')
+            else:
+                logger.info("Downloading single-cell data for %s...",
+                            str(dataset))
+                return BytesIO(
+                    self.raw.get_dataset_single_cell_expression(dataset))
+
+        with (resolve() as f, tarfile.open(fileobj=f) as tf,
+              tempfile.TemporaryDirectory() as tmpdir):
+            logger.info('Extracting TAR file for %s to %s...', str(dataset),
+                        tmpdir)
+            # The archive comes from a remote server: the 'data' filter
+            # rejects members that would be written outside of tmpdir
+            tf.extractall(tmpdir, filter='data')
+            samples = []
+            for sample_dir in os.listdir(tmpdir):
+                logger.info('Reading MEX data for %s...', sample_dir)
+                # Gemma already guarantees unicity of cell identifiers and
+                # scanpy cannot deal with numeric gene identifiers when
+                # make_unique is True, so we skip that part
+                # NOTE(review): make_unique is not passed below, confirm that
+                # the default of read_10x_mtx is the intended one
+                samples.append(scanpy.read_10x_mtx(join(tmpdir, sample_dir)))
+            return scanpy.concat(samples)
+
+    def get_differential_expression_values(self,
                                            dataset:Optional[str|int] = None,
                                            keep_non_specific:bool = False,
                                            result_sets:Optional[List[str|int]] = None,
diff --git a/pyproject.toml b/pyproject.toml
index 63cdda1..bdd8ff3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ description = "a Python Wrapper for the Gemma API"
 keywords = ["gemma", "bioinformatics"]
 readme = "README.rst"
 version = "2.0.4"
-requires-python = ">=3.10"
+requires-python = ">=3.12"
 dependencies = [
     'certifi >= 14.05.14',
     'six >= 1.10',
@@ -18,6 +18,7 @@ dependencies = [
     'pandas',
     'numpy',
     'anndata',
+    'scanpy >= 1.12.0rc1',
     'typing'
 ]
 
@@ -25,4 +26,4 @@ dependencies = [
 dev = ["pytest"]
 
 [tool.setuptools.packages]
-find = {}
\ No newline at end of file
+find = {}
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 9bc468a..fc27855 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -70,6 +70,19 @@ def test_auth(monkeypatch):
     monkeypatch.setitem(os.environ, 'GEMMA_PASSWORD_CMD', '')
     gemmapy.GemmaPy()
 
+def test_get_single_cell_data():
+    # TODO: use a publicly available dataset
+    client = gemmapy.GemmaPy()
+    ad = client.get_single_cell_dataset_object('GSE227313', download_dir='.')
+    print(ad)
+
+def test_get_genes():
+    assert len(api.get_genes('BRCA1')) > 0
+    assert len(api.get_genes(['BRCA1'])) > 0
+    assert len(api.get_genes(672)) > 0
+    assert len(api.get_genes([672])) > 0
+    assert len(api.get_genes([672, 'BRCA1'])) > 0
+
 def test_get_result_sets():
     res = api.get_result_sets([200])