diff --git a/.gitignore b/.gitignore
index 73aa25e..b3e5888 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,3 @@ __pycache__/
 
 # MAC tmp files
 .DS_Store
-
-Test/*
-!Test/Github_version
diff --git a/docs/atlas.md b/docs/atlas.md
index aebb18e..f02bb3c 100644
--- a/docs/atlas.md
+++ b/docs/atlas.md
@@ -1,63 +1,62 @@
-# ATLAS.
+# ATLAS
 
 ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures, accompanied by their analysis in the form of interactive diagrams and trajectory visualisation. All raw trajectories as well as the results of analysis are available for download.
 
-- web site: https://www.dsimb.inserm.fr/ATLAS/
-- documentation: https://www.dsimb.inserm.fr/ATLAS/api/redoc
-- API: https://www.dsimb.inserm.fr/ATLAS/api/
+- web site: <https://www.dsimb.inserm.fr/ATLAS/>
+- publication: [ATLAS: protein flexibility description from atomistic molecular dynamics simulations](https://academic.oup.com/nar/article/52/D1/D384/7438909), Nucleic Acids Research, 2024.
 
-No account / token is needed to access ATLAS API.
+## API
 
----
+- Base URL: <https://www.dsimb.inserm.fr/ATLAS/api/>
+- [documentation](https://www.dsimb.inserm.fr/ATLAS/api/redoc)
 
-## Finding molecular dynamics datasets and files
+No account / token is needed to access the ATLAS API.
 
 ### Datasets
 
 In ATLAS, each dataset corresponds to a molecular dynamics simulation of a **protein chain** and is uniquely identified by a **PDB ID and chain identifier** (`pdb_chain`).
 
-The list of all available datasets can be obtained from the ATLAS HTML index:
-
-https://www.dsimb.inserm.fr/ATLAS/
-
-This page is used as the **discovery layer** to extract all available PDB chain identifiers.
+The list of all available datasets can be obtained from the ATLAS index page: <https://www.dsimb.inserm.fr/ATLAS/>
 
----
+All datasets (PDB chains) are extracted from this page with a regular expression.
 
-### API entrypoint to search for entries
+### Metadata for a given dataset
 
 API endpoint to retrieve metadata for a given dataset:
 
-- Path: `/ATLAS/metadata/{pdb_chain}`
-- documentation: https://www.dsimb.inserm.fr/ATLAS/api/redoc
+- Endpoint: `/ATLAS/metadata/{pdb_chain}`
+- HTTP method: GET
+- documentation: <https://www.dsimb.inserm.fr/ATLAS/api/redoc>
 
-This endpoint returns structured JSON metadata describing the protein and its molecular dynamics simulation.
+This endpoint returns structured JSON metadata describing the simulated protein.
 
----
+Example with dataset id `1k5n_A`:
 
-### Files
+- [web page](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html)
+- [API view](https://www.dsimb.inserm.fr/ATLAS/api/ATLAS/metadata/1k5n_A)
 
-Files associated with a given dataset are hosted in a public directory.
-
-- Base path: `/database/ATLAS/{pdb_chain}/`
+Remarks:
 
-These directories contain structure files (PDB, CIF), molecular dynamics trajectories, and precomputed analysis results.
+- The title of the dataset is the protein name.
+- No comment or description is provided. We use the organism as the description.
 
----
+### Metadata for files
 
-## Examples
-
-### 1k5n_A
+Files associated with a given dataset are hosted in a public directory.
 
-- entry id: `1k5n_A`
-- entry on ATLAS GUI: https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html
-- entry on ATLAS API: https://www.dsimb.inserm.fr/ATLAS/api/ATLAS/metadata/1k5n_A
+For each dataset, 3 zip files are provided.
+They are accessible through the web page of each individual dataset:
 
-### Description (called "Comment") :
+Zip file URLs follow these patterns:
 
-HLA class I histocompatibility antigen, B alpha chain
+- Analysis & MDs (1,000 frames, only protein): `https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{pdb_chain}/{pdb_chain}_analysis.zip`
+- MDs (10,000 frames, only protein): `https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{pdb_chain}/{pdb_chain}_protein.zip`
+- MDs (10,000 frames, total system): `https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{pdb_chain}/{pdb_chain}_total.zip`
 
-### Files
+Example with dataset id `1k5n_A`:
 
-- files on ATLAS GUI: https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html
+- [web page](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html)
+- [1k5n_A_analysis.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_analysis.zip)
+- [1k5n_A_protein.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_protein.zip)
+- [1k5n_A_total.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_total.zip)
+We parse the HTML content of the dataset page and use regular expressions to extract URLs, file names and file sizes.
diff --git a/docs/zenodo.md b/docs/zenodo.md
index 2f84de7..334928d 100644
--- a/docs/zenodo.md
+++ b/docs/zenodo.md
@@ -10,8 +10,7 @@ So we don't expect much files to have an individual size above 50 GB.
 
 ## API
 
-### Documentation
-
+- Base URL: <https://zenodo.org/api>
 - [REST API](https://developers.zenodo.org/)
 - List of [HTTP status codes](https://developers.zenodo.org/#http-status-codes)
 
@@ -21,10 +20,6 @@ Zenodo requires a token to access its API with higher rate limits. See "[Authent
 
 Example of direct API link for a given dataset:
 
-### Base ULR
-
-
-
 ### Query
 
 [Search guide](https://help.zenodo.org/guides/search/)
diff --git a/pyproject.toml b/pyproject.toml
index 5620d55..a0dc528 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,6 +3,27 @@ name = "mdverse-scrapers"
 version = "0.1.0"
 description = "MDverse scrapers"
 readme = "README.md"
+license = "BSD-3-Clause"
+authors = [
+    { name = "Pierre Poulain", email = "pierre.poulain@cupnet.net" },
+    { name = "Essmay Touami", email = "essmay.touami@etu.u-paris.fr" },
+    { name = "Salahudin Sheikh", email = "sheikh@ibpc.fr"}
+]
+maintainers = [
+    { name = "Pierre Poulain", email = "pierre.poulain@cupnet.net" }
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: BSD License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Intended Audience :: Science/Research",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+    "Topic :: Scientific/Engineering :: Chemistry",
+]
 requires-python = ">=3.12"
 dependencies = [
     "beautifulsoup4>=4.13.3",
@@ -50,3 +71,4 @@ build-backend = "uv_build"
 scrape-zenodo = "mdverse_scrapers.scrapers.zenodo:main"
 scrape-figshare = "mdverse_scrapers.scrapers.figshare:main"
 scrape-nomad = "mdverse_scrapers.scrapers.nomad:main"
+scrape-atlas = "mdverse_scrapers.scrapers.atlas:main"
diff --git a/scripts/scrap_atlas.py b/scripts/scrap_atlas.py
deleted file mode 100644
index e2fc7a7..0000000
--- a/scripts/scrap_atlas.py
+++ /dev/null
@@ -1,406 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-
-
-INSTALL them in myenv if there was some error in running this and some packages are not installed #######(pip install "pydantic>=2.0" httpx pandas pyarrow tenacity)######
-
-
-ATLAS MD scraper — cleaned, strict Pydantic-integrated single-file script.
- -- Strict typing (no "null" strings) -- Pydantic validation for external data -- Async HTTP fetching with retries -- Concurrency control and polite delays -- Atomic parquet writes (two files: files + metadata) - -Scrape molecular dynamics metadata and files from ATLAS of proTein moLecular dynAmicS . - -This script fetches molecular dynamics (MD) metadata from the ATLAS repository (https://www.dsimb.inserm.fr/ATLAS/). -It collects metadata such as dataset names, organisms, sequences, authors, DOIs, and file information for protein MDs. - -The scraped data is saved locally in Parquet format: - - "ATLAS_files.parquet" : file-level metadata (file names, file sizes, number of files) - - "ATLAS_metadata.parquet" : dataset metadata (source, title, organism, DOI, sequence, etc.) - -Usage : -======= - python3 fetch_atlas.py - -Ensure required packages are installed: - - httpx - - pandas - - pyarrow - - -FIELD DESCRIPTIONS: -------------------- -source: Source of the dataset (here: ATLAS) -source_id: Unique identifier for the PDB chain or entry -data_set_url: URL to the dataset metadata API endpoint -title: Protein name or dataset title -organism: Organism from which the protein originates -length: Length of the protein sequence (number of residues) -sequence: Amino acid sequence of the protein -crawling_date: Date when this metadata was collected -date_creation: Original creation date of the dataset (if available) -date_last_modification: Last modification date of the dataset (if available) -nb_files: Number of files available for the dataset -file_names: Comma-separated list of available file names -file_sizes: Comma-separated list of file sizes corresponding to file_names -license: License under which the dataset is shared -authors: Names of authors or contributors -doi: DOI of the publication describing the dataset - -""" - - - -import asyncio -import logging -import re -import shutil -import tempfile -import time -from asyncio import Semaphore -from datetime import datetime, timezone -from pathlib import Path -from typing import Dict, List, Optional, Annotated - -import httpx -import pandas as pd -from pydantic import BaseModel, Field, HttpUrl, StringConstraints, field_validator, conint, constr -from tenacity import ( - RetryError, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - - -# ------------------------ -# Metadata / Authors -# ------------------------ -__authors__ = ("Pierre Poulain", "Salahudin Sheikh") -__contact__ = ("pierre.poulain@u-paris.fr", "sheikh@ibpc.fr") -__copyright__ = "AGPL-3.0" -__date__ = "2025" -__version__ = "1.0.0" - -# ------------------------ -# Configuration -# ------------------------ -HTML_LIST_URL: str = "https://www.dsimb.inserm.fr/ATLAS/" -API_BASE: str = "https://www.dsimb.inserm.fr/ATLAS/api/ATLAS/metadata/" -BASE_URL: str = "https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/" - -FILES_PARQUET = "atlas_files.parquet" -METADATA_PARQUET = "atlas_datasets.parquet" -OUTPUT_DIR = "data/atlas" - -MAX_CONCURRENT_REQUESTS: int = 10 -REQUEST_DELAY: float = 0.05 # polite delay (seconds) -HTTP_TIMEOUT: float = 30.0 -RETRY_ATTEMPTS: int = 3 - -HEADERS = { - "User-Agent": "atlas-scraper/1.0 (+https://example.org)", -} - -# ------------------------ -# Logging -# ------------------------ -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(message)s", - level=logging.INFO, -) -logger = logging.getLogger("atlas_scraper") - -# ------------------------ -# Pydantic models -# ------------------------ - -# Strict sequence: only 
amino-acid letters (uppercase) -SequenceStr = Annotated[str, StringConstraints(pattern=r"^[ACDEFGHIKLMNPQRSTVWY]+$")] - -class DatasetRecord(BaseModel): - source: str = "ATLAS" - source_id: constr(min_length=1) - data_set_url: HttpUrl - title: constr(min_length=1) - organism: Optional[constr(min_length=1)] = None - length: Optional[conint(ge=0)] = None - sequence: Optional[SequenceStr] = None - - crawling_date: constr(min_length=1) - date_creation: Optional[constr(min_length=1)] = None - date_last_modification: Optional[constr(min_length=1)] = None - - nb_files: conint(ge=0) = 0 - file_names: List[constr(min_length=1)] = Field(default_factory=list) - file_sizes: List[Optional[constr(min_length=1)]] = Field(default_factory=list) - - license: Optional[constr(min_length=1)] = None - authors: Optional[constr(min_length=1)] = None - doi: Optional[constr(min_length=1)] = None - - @field_validator("file_sizes", mode="before") - def ensure_three_sizes(cls, v): - if v is None: - return [None, None, None] - if isinstance(v, str): - parts = [p.strip() for p in v.split(",")] - return (parts + [None, None, None])[:3] - if isinstance(v, list): - parts = [p if p is not None else None for p in v] - return (parts + [None, None, None])[:3] - return v - - @field_validator("file_names", mode="before") - def ensure_file_names_list(cls, v): - if v is None: - return [] - if isinstance(v, str): - return [s.strip() for s in v.split(",") if s.strip()] - if isinstance(v, list): - return [s for s in v if s] - return v - -# ------------------------ -# HTTP helpers with retries -# ------------------------ - -def retry_decorator(): - return retry( - reraise=True, - stop=stop_after_attempt(RETRY_ATTEMPTS), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=retry_if_exception_type((httpx.HTTPError, httpx.ReadTimeout, httpx.ConnectError, asyncio.TimeoutError)), - ) - -@retry_decorator() -async def _get_json(client: httpx.AsyncClient, url: str) -> Dict: - resp = await client.get(url, timeout=HTTP_TIMEOUT) - resp.raise_for_status() - return resp.json() - -@retry_decorator() -async def _get_text(client: httpx.AsyncClient, url: str) -> str: - resp = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) - resp.raise_for_status() - return resp.text - -# ------------------------ -# Parsers -# ------------------------ - -PDB_PATTERN = re.compile(r"\b([0-9][A-Za-z0-9]{3}_[A-Za-z])\b") -DOWNLOAD_SIZE_RE = re.compile(r"Download.*?\(([^)]+)\)", re.IGNORECASE) - -def extract_pdb_chains(html: str) -> List[str]: - chains = sorted(set(PDB_PATTERN.findall(html))) - logger.info("extract_pdb_chains: found %d chains", len(chains)) - return chains - -def extract_file_sizes_from_html(html: str) -> List[Optional[str]]: - sizes = DOWNLOAD_SIZE_RE.findall(html) - return (sizes + [None, None, None])[:3] - -# ------------------------ -# Fetch functions (async) -# ------------------------ - -async def fetch_index_html_sync() -> str: - """Synchronous fetch used at startup for the index page.""" - def _sync(): - with httpx.Client(timeout=HTTP_TIMEOUT) as client: - r = client.get(HTML_LIST_URL, headers=HEADERS) - r.raise_for_status() - return r.text - - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, _sync) - -async def fetch_metadata_for_chain( - client: httpx.AsyncClient, sem: Semaphore, pdb_chain: str -) -> Optional[Dict]: - api_url = f"{API_BASE}{pdb_chain}" - html_url = f"{BASE_URL}{pdb_chain}/{pdb_chain}.html" - - async with sem: - await asyncio.sleep(REQUEST_DELAY) - try: - api_data = await 
_get_json(client, api_url) - except Exception as exc: - logger.warning("API fetch failed for %s: %s", pdb_chain, exc) - return None - - try: - html_text = await _get_text(client, html_url) - sizes = extract_file_sizes_from_html(html_text) - except Exception as exc: - logger.warning("HTML fetch/parse failed for %s: %s", pdb_chain, exc) - sizes = [None, None, None] - - chain_key = pdb_chain if pdb_chain in api_data else pdb_chain.upper() - chain_data = api_data.get(chain_key, api_data if isinstance(api_data, dict) else {}) - - files = chain_data.get("files") if isinstance(chain_data.get("files"), list) else None - nb_files = len(files) if files else 3 - file_names = files if files else [ - "Analysis & MDs (only protein)", - "MDs (only protein)", - "MDs (total system)", - ] - - record = { - "source": "ATLAS", - "source_id": chain_data.get("PDB") or pdb_chain, - "data_set_url": api_url, - "title": chain_data.get("protein_name") or f"ATLAS dataset for {pdb_chain}", - "organism": chain_data.get("organism"), - "length": int(chain_data.get("length")) if chain_data.get("length") is not None else None, - "sequence": chain_data.get("sequence") if isinstance(chain_data.get("sequence"), str) else None, - "crawling_date": datetime.now(timezone.utc).date().isoformat(), - "date_creation": chain_data.get("date_creation"), - "date_last_modification": chain_data.get("date_last_modification"), - "nb_files": int(nb_files), - "file_names": file_names, - "file_sizes": sizes, - "license": chain_data.get("license") or "CC-BY-NC 4.0", - "authors": chain_data.get("authors") or "Yann Vander Meersche et al.", - "doi": chain_data.get("doi") or "https://doi.org/10.1093/nar/gkad1084", - } - - try: - validated = DatasetRecord(**record) - return validated.dict() - except Exception as exc: - logger.warning("Validation failed for %s: %s", pdb_chain, exc) - return None - -async def fetch_all(pdb_chains: List[str]) -> List[Dict]: - sem = Semaphore(MAX_CONCURRENT_REQUESTS) - results: List[Dict] = [] - async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client: - tasks = [fetch_metadata_for_chain(client, sem, c) for c in pdb_chains] - for coro in asyncio.as_completed(tasks): - try: - res = await coro - if res: - results.append(res) - except RetryError as exc: - logger.warning("Fetch task failed after retries: %s", exc) - except Exception as exc: - logger.warning("Unhandled error in fetch task: %s", exc) - return results - -# ------------------------ -# Storage utilities (fixed for Pydantic types) -# ------------------------ - -def ensure_output_dir(path: str) -> Path: - p = Path(path) - p.mkdir(parents=True, exist_ok=True) - return p - - -def atomic_parquet_write(df: pd.DataFrame, path: Path) -> None: - # write to temp dir then atomically move - tmp_dir = tempfile.mkdtemp(dir=str(path.parent)) - tmp_file = Path(tmp_dir) / (path.name + ".tmp") - try: - df.to_parquet(tmp_file, index=False) - shutil.move(str(tmp_file), str(path)) - logger.info("Wrote parquet: %s", path) - finally: - try: - shutil.rmtree(tmp_dir) - except Exception: - pass - - -def convert_pydantic_to_native(df: pd.DataFrame) -> pd.DataFrame: - """ - Convert Pydantic-specific types (HttpUrl, SequenceStr, etc.) to native Python types - so PyArrow / pandas can write them to Parquet. 
- """ - for col in df.columns: - df[col] = df[col].apply(lambda x: str(x) if hasattr(x, "__str__") and not isinstance(x, str) else x) - return df - - -def save_results(validated_records: List[Dict], out_dir: str = OUTPUT_DIR) -> None: - if not validated_records: - logger.warning("No valid records to save.") - return - - out_path = ensure_output_dir(out_dir) - df = pd.DataFrame(validated_records) - - # ---------------- Files Parquet ---------------- - df_files = df[["source", "source_id", "nb_files", "file_names", "file_sizes"]].copy() - - # convert list columns to comma-separated strings - df_files["file_names"] = df_files["file_names"].apply(lambda x: ",".join(x) if isinstance(x, list) else x) - df_files["file_sizes"] = df_files["file_sizes"].apply(lambda x: ",".join([s for s in x if s]) if isinstance(x, list) else x) - - df_files = convert_pydantic_to_native(df_files) - atomic_parquet_write(df_files, out_path / FILES_PARQUET) - - # ---------------- Metadata Parquet ---------------- - meta_cols = [ - "source", "source_id", "data_set_url", "doi", "authors", "crawling_date", - "title", "organism", "date_creation", "date_last_modification", - "license", "length", "sequence" - ] - df_meta = df[meta_cols].copy() - - df_meta = convert_pydantic_to_native(df_meta) - atomic_parquet_write(df_meta, out_path / METADATA_PARQUET) - - logger.info("Saved all Parquet files successfully.") - -# ------------------------ -# Orchestration / CLI -# ------------------------ - -async def _run_pipeline(limit: Optional[int] = None, out_dir: str = OUTPUT_DIR) -> None: - logger.info("Fetching index page...") - try: - index_html = await fetch_index_html_sync() - except Exception as exc: - logger.error("Failed to fetch index page: %s", exc) - return - - chains = extract_pdb_chains(index_html) - if limit and limit > 0: - chains = chains[:limit] - logger.info("Found %d chains (limit=%s)", len(chains), limit) - - logger.info("Starting async fetch of metadata...") - results = await fetch_all(chains) - logger.info("Fetched %d valid records", len(results)) - - save_results(results, out_dir) - -def main(limit: Optional[int] = None, out_dir: str = OUTPUT_DIR) -> None: - start = time.time() - try: - asyncio.run(_run_pipeline(limit=limit, out_dir=out_dir)) - except Exception as exc: - logger.exception("Pipeline failed: %s", exc) - finally: - elapsed_minutes = (time.time() - start) / 60.0 - logger.info("Done. Elapsed time: %.2f minutes", elapsed_minutes) - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="ATLAS metadata scraper (Pydantic-validated, strict)") - parser.add_argument("--limit", type=int, default=0, help="Limit number of chains to fetch (0 = all)") - parser.add_argument("--output-dir", type=str, default=OUTPUT_DIR, help="Output directory for parquet files") - args = parser.parse_args() - - main(limit=args.limit or None, out_dir=args.output_dir) diff --git a/src/mdverse_scrapers/core/network.py b/src/mdverse_scrapers/core/network.py index 05f1024..e376e31 100644 --- a/src/mdverse_scrapers/core/network.py +++ b/src/mdverse_scrapers/core/network.py @@ -51,6 +51,37 @@ def create_httpx_client( return httpx.Client(base_url=base_url, headers=headers) +def is_connection_to_server_working( + client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger +) -> bool: + """Test connection to a web server. + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + url : str + The URL endpoint. 
+ logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + bool + True if the connection is successful, False otherwise. + """ + logger.debug("Testing connection to server...") + response = make_http_request_with_retries( + client, url, method=HttpMethod.GET, max_attempts=2, logger=logger + ) + if not response: + logger.error("Cannot connect to server.") + return False + if response and hasattr(response, "headers"): + logger.debug(response.headers) + return True + + def make_http_request_with_retries( client: httpx.Client, url: str, diff --git a/src/mdverse_scrapers/models/dataset.py b/src/mdverse_scrapers/models/dataset.py index 785c7ee..dfb0028 100644 --- a/src/mdverse_scrapers/models/dataset.py +++ b/src/mdverse_scrapers/models/dataset.py @@ -17,7 +17,7 @@ DOI = Annotated[ str, - StringConstraints(pattern=r"^10\.\d{4,9}/[\w\-.]+$"), + StringConstraints(pattern=r"^10\.\d{4,9}/[\w\-./]+$"), ] diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py new file mode 100644 index 0000000..e5124fe --- /dev/null +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -0,0 +1,374 @@ +"""Scrape metadata of molecular dynamics datasets and files from ATLAS.""" + +import json +import re +import sys +from pathlib import Path + +import click +import httpx +import loguru +from bs4 import BeautifulSoup + +from ..core.logger import create_logger +from ..core.network import ( + HttpMethod, + create_httpx_client, + is_connection_to_server_working, + make_http_request_with_retries, +) +from ..core.toolbox import print_statistics +from ..models.dataset import DatasetMetadata +from ..models.enums import DatasetSourceName +from ..models.scraper import ScraperContext +from ..models.utils import ( + export_list_of_models_to_parquet, + normalize_datasets_metadata, + normalize_files_metadata, +) + +INDEX_URL = "https://www.dsimb.inserm.fr/ATLAS/" +BASE_API_URL = "https://www.dsimb.inserm.fr/ATLAS/api" +ATLAS_METADATA = { + "license": "CC-BY-NC", # https://www.dsimb.inserm.fr/ATLAS/download.html + "author_name": [ # https://academic.oup.com/nar/article/52/D1/D384/7438909 + "Yann Vander Meersche", + "Gabriel Cretin", + "Aria Gheeraert", + "Jean-Christophe Gelly", + "Tatiana Galochkina", + ], + "doi": "10.1093/nar/gkad1084", # https://academic.oup.com/nar/article/52/D1/D384/7438909 + "external_link": ["https://www.dsimb.inserm.fr/ATLAS/"], +} + + +def extract_pdb_chains_from_html( + html: str, logger: "loguru.Logger" = loguru.logger +) -> set[str]: + """Extract PDB chain identifiers from ATLAS index page. + + Parameters + ---------- + html : str + HTML content of the ATLAS index page. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + set[str] + Set of PDB chain identifiers found. + """ + pdb_chains = [] + pdb_chain_pattern = re.compile( + r"/ATLAS/database/ATLAS/([A-Za-z0-9]{4}_[A-Za-z])/.*html" + ) + soup = BeautifulSoup(html, "html.parser") + for link in soup.find_all("a", href=True): + href = link.get("href", "") + match = pdb_chain_pattern.search(href) + if match: + pdb_chains.append(match.group(1)) + return set(pdb_chains) + + +def extract_file_sizes_from_html( + html: str, logger: "loguru.Logger" = loguru.logger +) -> list[dict]: + """Extract file sizes from ATLAS dataset HTML page. + + Parameters + ---------- + html : str + HTML content of the ATLAS dataset page. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + list[dict] + List of file names, sizes and urls found. 
+ + """ + files_metadata = [] + download_link_pattern = re.compile( + r"https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/[A-Za-z0-9]{4}_[A-Za-z]/.*zip" + ) + file_size_pattern = re.compile(r"Download \(([A-Za-z0-9,\. ]+)\)") + soup = BeautifulSoup(html, "html.parser") + for link in soup.find_all("a", href=True): + href = link.get("href", "") + match_link = download_link_pattern.search(href) + match_size = file_size_pattern.search(link.text) + if match_link and match_size: + files_metadata.append( + { + "file_name": Path(href).name, + "file_url_in_repository": href, + # File sizes are sometimes expressed with comma + # as decimal separator. + "file_size_in_bytes": match_size.group(1).replace(",", "."), + } + ) + logger.info(f"Found {len(files_metadata)} files in the HTML page.") + return files_metadata + + +def scrape_metadata_for_a_dataset( + client: httpx.Client, + chain_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> dict | None: + """Fetch metadata for a single ATLAS dataset (PDB chain). + + Parameters + ---------- + client : httpx.Client + HTTPX client for making requests. + chain_id : str + PDB chain identifier. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + dict | None + Scraped dataset metadata, or None if failed. + """ + logger.info(f"Scraping metadata for dataset: {chain_id}") + api_url = f"{BASE_API_URL}/ATLAS/metadata/{chain_id}" + dataset_url = ( + f"https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{chain_id}/{chain_id}.html" + ) + response = make_http_request_with_retries( + client, api_url, HttpMethod.GET, delay_before_request=0.5, logger=logger + ) + if not response: + logger.warning(f"Failed to fetch API data for {chain_id}. Skipping.") + return None + meta_json = None + try: + meta_json = response.json().get(f"{chain_id}") + except (json.decoder.JSONDecodeError, KeyError) as exc: + logger.warning("Failed to decode JSON response from the ATLAS API.") + logger.warning(f"Error: {exc}") + return None + metadata = { + "dataset_repository_name": DatasetSourceName.ATLAS, + "dataset_id_in_repository": chain_id, + "dataset_url_in_repository": dataset_url, + "title": meta_json.get("protein_name"), + "description": meta_json.get("organism"), + "license": ATLAS_METADATA["license"], + "author_names": ATLAS_METADATA["author_name"], + "doi": ATLAS_METADATA["doi"], + "external_links": ATLAS_METADATA["external_link"], + } + logger.info("Done.") + return metadata + + +def search_all_datasets(client: httpx.Client, logger: "loguru.Logger") -> set[str]: + """Search for ATLAS datasets (1 dataset = 1 PDB chain). + + Parameters + ---------- + client : httpx.Client + HTTPX client for making requests. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + set[str] + Set of PDB chains (datasets) found. + """ + logger.info("Fetching index page listing ATLAS datasets...") + response = make_http_request_with_retries( + client, INDEX_URL, HttpMethod.GET, delay_before_request=0.5, logger=logger + ) + if not response: + logger.critical("Failed to fetch index page.") + logger.critical("Cannot list available datasets. Aborting!") + sys.exit(1) + if not hasattr(response, "text") or not response.text: + logger.critical("Index page response is empty.") + logger.critical("Cannot list available datasets. 
Aborting!") + sys.exit(1) + chain_ids = extract_pdb_chains_from_html(response.text, logger=logger) + logger.info(f"Found {len(chain_ids)} datasets.") + return chain_ids + + +def scrape_all_datasets( + client: httpx.Client, + pdb_chains: set[str], + logger: "loguru.Logger", +) -> list[dict]: + """Scrape all ATLAS datasets given a set of PDB chains. + + Parameters + ---------- + pdb_chains : set[str] + Set of PDB chains to scrape. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + list[dict] + List of scraped dataset metadata. + """ + datasets_meta = [] + logger.info("Starting scraping of all datasets...") + for pdb_counter, pdb_chain in enumerate(pdb_chains, start=1): + metadata = scrape_metadata_for_a_dataset(client, pdb_chain, logger=logger) + if metadata: + datasets_meta.append(metadata) + logger.info( + f"Scraped {pdb_counter:,}/{len(pdb_chains):,} " + f"({pdb_counter / len(pdb_chains):.0%}) datasets" + ) + return datasets_meta + + +def scrape_all_files( + client: httpx.Client, + datasets_metadata: list[DatasetMetadata], + logger: "loguru.Logger", +) -> list[dict]: + """Scrape ATLAS files. + + Parameters + ---------- + datasets_metadata : list[DatasetMetadata] + List of datasets metadata. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + list[dict] + List of scraped files metadata. + """ + files_metadata = [] + for dataset_counter, dataset_meta in enumerate(datasets_metadata, start=1): + pdb_chain = dataset_meta.dataset_id_in_repository + logger.info(f"Scraping files metadata for dataset: {pdb_chain}") + url = dataset_meta.dataset_url_in_repository + response = make_http_request_with_retries( + client, url, HttpMethod.GET, delay_before_request=0.5, logger=logger + ) + if not response: + logger.warning(f"Failed to fetch HTML page for {pdb_chain}. Skipping.") + continue + files_meta = extract_file_sizes_from_html(response.text, logger=logger) + for meta in files_meta: + metadata = { + "dataset_repository_name": dataset_meta.dataset_repository_name, + "dataset_id_in_repository": dataset_meta.dataset_id_in_repository, + "dataset_url_in_repository": dataset_meta.dataset_url_in_repository, + "file_name": meta["file_name"], + "file_url_in_repository": meta["file_url_in_repository"], + "file_size_in_bytes": meta["file_size_in_bytes"], + } + files_metadata.append(metadata) + logger.info( + "Scraped metadata files for " + f"{dataset_counter:,}/{len(datasets_metadata):,} " + f"({dataset_counter / len(datasets_metadata):.0%}) datasets" + ) + logger.info(f"Total files scraped so far: {len(files_metadata):,}") + return files_metadata + + +@click.command( + help="Command line interface for MDverse scrapers", + epilog="Happy scraping!", +) +@click.option( + "--output-dir", + "output_dir_path", + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), + required=True, + help="Output directory path to save results.", +) +@click.option( + "--debug", + "is_in_debug_mode", + is_flag=True, + default=False, + help="Enable debug mode.", +) +def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: + """Scrape metadata of molecular dynamics datasets and files from ATLAS.""" + # Create scraper context. + scraper = ScraperContext( + data_source_name=DatasetSourceName.ATLAS, + output_dir_path=output_dir_path, + is_in_debug_mode=is_in_debug_mode, + ) + # Create logger. 
+ level = "INFO" + if scraper.is_in_debug_mode: + level = "DEBUG" + logger = create_logger(logpath=scraper.log_file_path, level=level) + # Print scraper configuration. + logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) + logger.info("Starting ATLAS data scraping...") + # Create HTTPX client + client = create_httpx_client() + # Check connection to the ATLAS API + if is_connection_to_server_working( + client, f"{BASE_API_URL}/ATLAS/metadata/16pk_A", logger=logger + ): + logger.success("Connection to ATLAS API successful!") + else: + logger.critical("Connection to ATLAS API failed.") + logger.critical("Aborting.") + sys.exit(1) + # Scrape datasets metadata. + datasets_ids = search_all_datasets(client=client, logger=logger) + if scraper.is_in_debug_mode: + datasets_ids = set(list(datasets_ids)[:10]) + logger.warning("Debug mode is ON: limiting to first 10 datasets.") + datasets_metadata = scrape_all_datasets( + client, + datasets_ids, + logger=logger, + ) + # Normalize datasets metadata. + datasets_metadata_normalized = normalize_datasets_metadata( + datasets_metadata, + logger=logger, + ) + # Scrape files metadata. + files_metadata = scrape_all_files( + client, + datasets_metadata_normalized, + logger=logger, + ) + # Normalize datasets metadata. + files_metadata_normalized = normalize_files_metadata( + files_metadata, + logger=logger, + ) + # Save datasets metadata to parquet file. + scraper.number_of_datasets_scraped = export_list_of_models_to_parquet( + scraper.datasets_parquet_file_path, + datasets_metadata_normalized, + logger=logger, + ) + # Save files metadata to parquet file. + scraper.number_of_files_scraped = export_list_of_models_to_parquet( + scraper.files_parquet_file_path, + files_metadata_normalized, + logger=logger, + ) + # Print scraping statistics. + print_statistics(scraper, logger=logger) + + +if __name__ == "__main__": + main() diff --git a/src/mdverse_scrapers/scrapers/nomad.py b/src/mdverse_scrapers/scrapers/nomad.py index be638a0..d1f8ecd 100644 --- a/src/mdverse_scrapers/scrapers/nomad.py +++ b/src/mdverse_scrapers/scrapers/nomad.py @@ -17,6 +17,7 @@ from ..core.network import ( HttpMethod, create_httpx_client, + is_connection_to_server_working, make_http_request_with_retries, ) from ..core.toolbox import print_statistics @@ -40,35 +41,6 @@ } -def is_nomad_connection_working( - client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger -) -> bool | None: - """Test connection to the NOMAD API. - - Parameters - ---------- - client : httpx.Client - The HTTPX client to use for making requests. - url : str - The URL endpoint. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - bool - True if the connection is successful, False otherwise. - """ - logger.debug("Testing connection to NOMAD API...") - response = make_http_request_with_retries(client, url, method=HttpMethod.GET) - if not response: - logger.error("Cannot connect to the NOMAD API.") - return False - if response and hasattr(response, "headers"): - logger.debug(response.headers) - return True - - def scrape_all_datasets( client: httpx.Client, query_entry_point: str, @@ -531,13 +503,20 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: output_dir_path=output_dir_path, is_in_debug_mode=is_in_debug_mode, ) - logger = create_logger(logpath=scraper.log_file_path, level="INFO") + # Create logger. 
+ level = "INFO" + if scraper.is_in_debug_mode: + level = "DEBUG" + logger = create_logger(logpath=scraper.log_file_path, level=level) + # Print scraper configuration. logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) - logger.info("Starting Nomad data scraping...") + logger.info("Starting NOMAD data scraping...") # Create HTTPX client client = create_httpx_client() # Check connection to NOMAD API - if is_nomad_connection_working(client, f"{BASE_NOMAD_URL}/entries"): + if is_connection_to_server_working( + client, f"{BASE_NOMAD_URL}/entries", logger=logger + ): logger.success("Connection to NOMAD API successful!") else: logger.critical("Connection to NOMAD API failed.")
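
For reference, here is a minimal standalone sketch (not part of the patch) of the ATLAS metadata call that the new `scrape_metadata_for_a_dataset()` builds on: it queries `/ATLAS/metadata/{pdb_chain}` and reads the `protein_name` and `organism` fields that the scraper maps to the dataset title and description. The endpoint and the nesting of the JSON payload under the chain identifier follow docs/atlas.md and the new scraper; treat the field names as assumptions if the API evolves.

```python
"""Minimal sketch: fetch ATLAS metadata for one PDB chain.

Assumes the endpoint and response shape described in docs/atlas.md and used
by src/mdverse_scrapers/scrapers/atlas.py.
"""
import httpx

BASE_API_URL = "https://www.dsimb.inserm.fr/ATLAS/api"


def fetch_atlas_metadata(chain_id: str) -> dict | None:
    """Return the metadata block for one PDB chain, or None on failure."""
    url = f"{BASE_API_URL}/ATLAS/metadata/{chain_id}"
    response = httpx.get(url, timeout=30.0)
    if response.status_code != 200:
        return None
    # The API nests the payload under the chain identifier.
    return response.json().get(chain_id)


if __name__ == "__main__":
    meta = fetch_atlas_metadata("1k5n_A")
    if meta:
        # protein_name / organism are the fields the scraper maps
        # to the dataset title / description.
        print(meta.get("protein_name"), "-", meta.get("organism"))
```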