diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..698dd01 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,59 @@ +# Bytecode files +__pycache__/ +*.pyc +*.pyo + +# Virtual environment +venv/ +.venv/ + +# Debug and log files +*.log +debug*.txt + +# Data and media +data/ +media/ + +# Temporary files and directories +tmp/ +temp/ +*.swp + +# IDE and OS files +.idea/ +.vscode/ +.DS_Store + +# Distribution files +dist/ +build/ +*.egg-info/ + +# Secrets +.env +.env.* +*.pem +*.key +*.crt +*.p12 +*.jks +secrets/ + +# Git +.git/ +.gitignore + +# Tests & coverage +tests/ +test/ +.pytest_cache/ +.coverage +htmlcov/ + +# Docs / misc +README* +docs/ + +# CI +.github/ diff --git a/.gitignore b/.gitignore index b32ab81..74540cc 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,6 @@ tpm3.fasta # VSCode .vscode + +# UTA download +uta_*pgd.gz diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..de2f586 --- /dev/null +++ b/compose.yaml @@ -0,0 +1,23 @@ +services: + uta: + # Test: + # psql -XAt postgres://anonymous@localhost/uta -c 'select count(*) from uta_20241220.transcript' + # 329090 + image: biocommons/uta:uta_20241220 + environment: + - POSTGRES_PASSWORD=some-password-that-you-make-up + volumes: + - uta_vol:/var/lib/postgresql/data + - type: bind + source: ./uta_20241220.pgd.gz + target: /tmp/uta_20241220.pgd.gz + read_only: true + bind: + create_host_path: false + - ./uta-setup.sql:/docker-entrypoint-initdb.d/uta-setup.sql + ports: + - 127.0.0.1:5432:5432 + +volumes: + uta_vol: + external: true diff --git a/docker-desktop-container.png b/docker-desktop-container.png new file mode 100644 index 0000000..500e5fb Binary files /dev/null and b/docker-desktop-container.png differ diff --git a/docs/source/install.rst b/docs/source/install.rst index 07e4f8c..110535d 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -27,25 +27,6 @@ Install Cool-Seq-Tool from `PyPI `_: * ``tests`` includes packages for running 
tests * ``docs`` includes packages for writing and building documentation -Set up UTA ----------- - -Cool-Seq-Tool requires an available instance of the Universal Transcript Archive (UTA) database. Complete installation instructions (via Docker or a local server) are available at the `UTA GitHub repository `_. For local usage, we recommend the following: - -.. long-term, it would be best to move this over to the UTA repo to avoid duplication - -.. code-block:: - - createuser -U postgres uta_admin - createuser -U postgres anonymous - createdb -U postgres -O uta_admin uta - - export UTA_VERSION=uta_20241220.pgd.gz # most recent as of 2025/03/10 - curl -O https://dl.biocommons.org/uta/$UTA_VERSION - gzip -cdq ${UTA_VERSION} | psql -h localhost -U uta_admin --echo-errors --single-transaction -v ON_ERROR_STOP=1 -d uta -p 5432 - -By default, Cool-Seq-Tool expects to connect to the UTA database via a PostgreSQL connection served local on port 5432, under the PostgreSQL username ``uta_admin`` and the schema ``uta_20241220``. - Set up SeqRepo -------------- @@ -79,6 +60,49 @@ Try moving data manually with ``sudo``: See `mirroring documentation `_ on the SeqRepo GitHub repo for instructions and additional troubleshooting. +Set up using Docker +------------------- + +Cool-Seq-Tool's dependencies can be installed using a Docker container. We only provide guidance on setting up external dependencies using Docker. + +.. important:: + + This section assumes you have a local + `SeqRepo `_ + installed at ``/usr/local/share/seqrepo/2024-12-20``. + See the `SeqRepo setup section <#set-up-seqrepo>`_ for additional information. + + You must download ``uta_20241220.pgd.gz`` from + https://dl.biocommons.org/uta/uta_20241220.pgd.gz using a web browser and + move it to the root of the repository. + + If you're using Docker Desktop, you must go to + **Settings → Resources → File sharing** and add + ``/usr/local/share/seqrepo`` under the *Virtual file shares* + section. 
Otherwise, you will get the following error:: + + OSError: Unable to open SeqRepo directory /usr/local/share/seqrepo/2024-12-20 + +To build, (re)create, and start containers: + +.. code-block:: shell + + docker volume create uta_vol + docker compose up + +.. tip:: + + If you want a clean slate, run ``docker compose down -v`` to remove + containers and volumes. Because ``uta_vol`` is an external volume, ``down -v`` does not remove it; run ``docker volume rm uta_vol`` followed by ``docker volume create uta_vol`` to reset it. Then run + ``docker compose up --build`` to rebuild and start fresh containers. + +In Docker Desktop, you should see the following for a successful setup: + +.. figure:: ../../docker-desktop-container.png + :alt: Docker Desktop Container + :align: center + + Check data availability ----------------------- diff --git a/docs/source/usage.rst b/docs/source/usage.rst index d335c14..8aff261 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -80,7 +80,7 @@ Individual classes will accept arguments upon initialization to set parameters r * - ``SEQREPO_ROOT_DIR`` - Path to SeqRepo directory (i.e. contains ``aliases.sqlite3`` database file, and ``sequences`` directory). Used by :py:class:`SeqRepoAccess `. If not defined, defaults to ``/usr/local/share/seqrepo/latest``. * - ``UTA_DB_URL`` - - A `libpq connection string `_, i.e. of the form ``postgresql://<user>:<password>@<host>:<port>/<database>/<schema>``, used by the :py:class:`UtaDatabase ` class. By default, it is set to ``postgresql://uta_admin:uta@localhost:5432/uta/uta_20241220``. + - A `libpq connection string `_, i.e. of the form ``postgresql://<user>:<password>@<host>:<port>/<database>/<schema>``, used by the :py:class:`UtaDatabase ` class. By default, it is set to ``postgresql://anonymous@localhost:5432/uta/uta_20241220``. * - ``LIFTOVER_CHAIN_37_TO_38`` - A path to a `chainfile `_ for lifting from GRCh37 to GRCh38. Used by the :py:class:`LiftOver ` class as input to `agct `_. If not provided, agct will fetch it automatically from UCSC. 
* - ``LIFTOVER_CHAIN_38_TO_37`` diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index 45aba11..e993ef6 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -27,7 +27,7 @@ UTADatabaseType = TypeVar("UTADatabaseType", bound="UtaDatabase") UTA_DB_URL = environ.get( - "UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5432/uta/uta_20241220" + "UTA_DB_URL", "postgresql://anonymous@localhost:5432/uta/uta_20241220" ) _logger = logging.getLogger(__name__) diff --git a/uta-setup.sql b/uta-setup.sql new file mode 100644 index 0000000..a902f7b --- /dev/null +++ b/uta-setup.sql @@ -0,0 +1,27 @@ +\c uta; +CREATE TABLE uta_20241220.genomic AS +SELECT t.hgnc, aes.alt_ac, aes.alt_aln_method, + aes.alt_strand, ae.start_i AS alt_start_i, + ae.end_i AS alt_end_i +FROM (((((uta_20241220.transcript t + JOIN uta_20241220.exon_set tes ON (((t.ac = tes.tx_ac) + AND (tes.alt_aln_method = 'transcript'::text)))) + JOIN uta_20241220.exon_set aes ON (((t.ac = aes.tx_ac) + AND (aes.alt_aln_method <> 'transcript'::text)))) + JOIN uta_20241220.exon te ON + ((tes.exon_set_id = te.exon_set_id))) + JOIN uta_20241220.exon ae ON + (((aes.exon_set_id = ae.exon_set_id) + AND (te.ord = ae.ord)))) + LEFT JOIN uta_20241220.exon_aln ea ON + (((te.exon_id = ea.tx_exon_id) AND + (ae.exon_id = ea.alt_exon_id)))); + +CREATE INDEX alt_pos_index ON uta_20241220.genomic (alt_ac, alt_start_i, alt_end_i); +CREATE INDEX gene_alt_index ON uta_20241220.genomic (hgnc, alt_ac); +CREATE INDEX alt_ac_index ON uta_20241220.genomic (alt_ac); + +GRANT CONNECT ON DATABASE uta TO anonymous; +GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA uta_20241220 TO anonymous; +ALTER DATABASE uta OWNER TO anonymous; +ALTER SCHEMA uta_20241220 OWNER to anonymous;