From c2717ac355eca65a8f17db3c01578e1bae559059 Mon Sep 17 00:00:00 2001
From: Pierre Poulain
Date: Mon, 26 Jan 2026 17:44:07 +0100
Subject: [PATCH 1/6] BREAKING CHANGES: Update ATLAS (and NOMAD) scrapers

---
 .gitignore                             |   3 -
 docs/atlas.md                          |  67 ++--
 docs/zenodo.md                         |   7 +-
 pyproject.toml                         |  22 ++
 scripts/scrap_atlas.py                 | 406 -------------------------
 src/mdverse_scrapers/core/network.py   |  31 ++
 src/mdverse_scrapers/models/dataset.py |   2 +-
 src/mdverse_scrapers/scrapers/atlas.py | 378 +++++++++++++++++++++++
 src/mdverse_scrapers/scrapers/nomad.py |  43 +--
 9 files changed, 476 insertions(+), 483 deletions(-)
 delete mode 100644 scripts/scrap_atlas.py
 create mode 100644 src/mdverse_scrapers/scrapers/atlas.py

diff --git a/.gitignore b/.gitignore
index 73aa25e..b3e5888 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,3 @@ __pycache__/
 
 # MAC tmp files
 .DS_Store
-
-Test/*
-!Test/Github_version
diff --git a/docs/atlas.md b/docs/atlas.md
index aebb18e..3bfb932 100644
--- a/docs/atlas.md
+++ b/docs/atlas.md
@@ -1,63 +1,60 @@
-# ATLAS.
+# ATLAS
 
 ATLAS (Atlas of proTein moLecular dynAmicS) is an open-access data repository that gathers standardized molecular dynamics simulations of protein structures, accompanied by their analysis in the form of interactive diagrams and trajectory visualisation. All raw trajectories as well as the results of analysis are available for download.
 
-- web site: https://www.dsimb.inserm.fr/ATLAS/
-- documentation: https://www.dsimb.inserm.fr/ATLAS/api/redoc
-- API: https://www.dsimb.inserm.fr/ATLAS/api/
+- web site: <https://www.dsimb.inserm.fr/ATLAS/>
+- publication: [ATLAS: protein flexibility description from atomistic molecular dynamics simulations](https://academic.oup.com/nar/article/52/D1/D384/7438909), Nucleic Acids Research, 2024.
 
-No account / token is needed to access ATLAS API.
+## API
 
----
+- Base URL: <https://www.dsimb.inserm.fr/ATLAS/api/>
+- [documentation](https://www.dsimb.inserm.fr/ATLAS/api/redoc)
 
-## Finding molecular dynamics datasets and files
+No account or token is needed to access the ATLAS API.
 
 ### Datasets
 
 In ATLAS, each dataset corresponds to a molecular dynamics simulation of a **protein chain** and is uniquely identified by a **PDB ID and chain identifier** (`pdb_chain`).
 
-The list of all available datasets can be obtained from the ATLAS HTML index:
-
-https://www.dsimb.inserm.fr/ATLAS/
+The list of all available datasets can be obtained from the ATLAS index page:
 
-This page is used as the **discovery layer** to extract all available PDB chain identifiers.
+All datasets (pdb chains) are extracted from this page.
 
----
-
-### API entrypoint to search for entries
+### Metadata for a given dataset
 
 API endpoint to retrieve metadata for a given dataset:
 
-- Path: `/ATLAS/metadata/{pdb_chain}`
-- documentation: https://www.dsimb.inserm.fr/ATLAS/api/redoc
+- Endpoint: `/ATLAS/metadata/{pdb_chain}`
+- HTTP method: GET
+- documentation: <https://www.dsimb.inserm.fr/ATLAS/api/redoc>
 
-This endpoint returns structured JSON metadata describing the protein and its molecular dynamics simulation.
+This endpoint returns structured JSON metadata describing the simulated protein.
 
----
+Example with dataset id `1k5n_A`:
 
-### Files
+- [web page](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html)
+- [API view](https://www.dsimb.inserm.fr/ATLAS/api/ATLAS/metadata/1k5n_A)
 
-Files associated with a given dataset are hosted in a public directory.
+Remarks:
 
-- Base path: `/database/ATLAS/{pdb_chain}/`
+- The title of the dataset is the protein name.
+- No comment or description is provided. We use the organism as the description.
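+
+For example, the metadata endpoint can be queried with a few lines of Python. The snippet below is an illustrative sketch only (it is not part of the scraper); it uses the `httpx` package that the project already depends on:
+
+```python
+import httpx
+
+# Query the ATLAS metadata endpoint for a single PDB chain.
+chain_id = "1k5n_A"
+url = f"https://www.dsimb.inserm.fr/ATLAS/api/ATLAS/metadata/{chain_id}"
+response = httpx.get(url, timeout=30)
+response.raise_for_status()
+
+# The JSON payload is keyed by the chain identifier.
+metadata = response.json()[chain_id]
+print(metadata["protein_name"], "-", metadata["organism"])
+```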
-These directories contain structure files (PDB, CIF), molecular dynamics trajectories, and precomputed analysis results.
+### Metadata for files
 
----
-
-## Examples
-
-### 1k5n_A
-
-- entry id: `1k5n_A`
-- entry on ATLAS GUI: https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html
-- entry on ATLAS API: https://www.dsimb.inserm.fr/ATLAS/api/ATLAS/metadata/1k5n_A
+Files associated with a given dataset are hosted in a public directory.
 
-### Description (called "Comment") :
+For each dataset, 3 zip files are provided. They are accessible through the web page of each individual dataset: `https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{pdb_chain}/{pdb_chain}.html`
 
-HLA class I histocompatibility antigen, B alpha chain
+Zip file URLs follow these patterns:
 
-### Files
+- Analysis & MDs (1,000 frames, only protein): `https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{pdb_chain}/{pdb_chain}_analysis.zip`
+- MDs (10,000 frames, only protein): `https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{pdb_chain}/{pdb_chain}_protein.zip`
+- MDs (10,000 frames, total system): `https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{pdb_chain}/{pdb_chain}_total.zip`
 
-- files on ATLAS GUI: https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html
+Example with dataset id `1k5n_A`:
+- [web page](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A.html)
+- [1k5n_A_analysis.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_analysis.zip)
+- [1k5n_A_protein.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_protein.zip)
+- [1k5n_A_total.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_total.zip)
diff --git a/docs/zenodo.md b/docs/zenodo.md
index 2f84de7..334928d 100644
--- a/docs/zenodo.md
+++ b/docs/zenodo.md
@@ -10,8 +10,7 @@ So we don't expect much files to have an individual size above 50 GB.
 
 ## API
 
-### Documentation
-
+- Base URL: <https://zenodo.org/api>
 - [REST API](https://developers.zenodo.org/)
 - List of [HTTP status codes](https://developers.zenodo.org/#http-status-codes)
 
@@ -21,10 +20,6 @@ Zenodo requires a token to access its API with higher rate limits. 
See "[Authent Example of direct API link for a given dataset: -### Base ULR - - - ### Query [Search guide](https://help.zenodo.org/guides/search/) diff --git a/pyproject.toml b/pyproject.toml index 5620d55..a0dc528 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,6 +3,27 @@ name = "mdverse-scrapers" version = "0.1.0" description = "MDverse scrapers" readme = "README.md" +license = "BSD-3-Clause" +authors = [ + { name = "Pierre Poulain", email = "pierre.poulain@cupnet.net" }, + { name = "Essmay Touami", email = "essmay.touami@etu.u-paris.fr" }, + { name = "Salahudin Sheikh", email = "sheikh@ibpc.fr"} +] +maintainers = [ + { name = "Pierre Poulain", email = "pierre.poulain@cupnet.net" } +] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Intended Audience :: Science/Research", + "Topic :: Database", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Scientific/Engineering :: Chemistry", +] requires-python = ">=3.12" dependencies = [ "beautifulsoup4>=4.13.3", @@ -50,3 +71,4 @@ build-backend = "uv_build" scrape-zenodo = "mdverse_scrapers.scrapers.zenodo:main" scrape-figshare = "mdverse_scrapers.scrapers.figshare:main" scrape-nomad = "mdverse_scrapers.scrapers.nomad:main" +scrape-atlas = "mdverse_scrapers.scrapers.atlas:main" diff --git a/scripts/scrap_atlas.py b/scripts/scrap_atlas.py deleted file mode 100644 index e2fc7a7..0000000 --- a/scripts/scrap_atlas.py +++ /dev/null @@ -1,406 +0,0 @@ -#!/usr/bin/env python3 - -""" - - -INSTALL them in myenv if there was some error in running this and some packages are not installed #######(pip install "pydantic>=2.0" httpx pandas pyarrow tenacity)###### - - -ATLAS MD scraper — cleaned, strict Pydantic-integrated single-file script. - -- Strict typing (no "null" strings) -- Pydantic validation for external data -- Async HTTP fetching with retries -- Concurrency control and polite delays -- Atomic parquet writes (two files: files + metadata) - -Scrape molecular dynamics metadata and files from ATLAS of proTein moLecular dynAmicS . - -This script fetches molecular dynamics (MD) metadata from the ATLAS repository (https://www.dsimb.inserm.fr/ATLAS/). -It collects metadata such as dataset names, organisms, sequences, authors, DOIs, and file information for protein MDs. - -The scraped data is saved locally in Parquet format: - - "ATLAS_files.parquet" : file-level metadata (file names, file sizes, number of files) - - "ATLAS_metadata.parquet" : dataset metadata (source, title, organism, DOI, sequence, etc.) 
- -Usage : -======= - python3 fetch_atlas.py - -Ensure required packages are installed: - - httpx - - pandas - - pyarrow - - -FIELD DESCRIPTIONS: -------------------- -source: Source of the dataset (here: ATLAS) -source_id: Unique identifier for the PDB chain or entry -data_set_url: URL to the dataset metadata API endpoint -title: Protein name or dataset title -organism: Organism from which the protein originates -length: Length of the protein sequence (number of residues) -sequence: Amino acid sequence of the protein -crawling_date: Date when this metadata was collected -date_creation: Original creation date of the dataset (if available) -date_last_modification: Last modification date of the dataset (if available) -nb_files: Number of files available for the dataset -file_names: Comma-separated list of available file names -file_sizes: Comma-separated list of file sizes corresponding to file_names -license: License under which the dataset is shared -authors: Names of authors or contributors -doi: DOI of the publication describing the dataset - -""" - - - -import asyncio -import logging -import re -import shutil -import tempfile -import time -from asyncio import Semaphore -from datetime import datetime, timezone -from pathlib import Path -from typing import Dict, List, Optional, Annotated - -import httpx -import pandas as pd -from pydantic import BaseModel, Field, HttpUrl, StringConstraints, field_validator, conint, constr -from tenacity import ( - RetryError, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - - -# ------------------------ -# Metadata / Authors -# ------------------------ -__authors__ = ("Pierre Poulain", "Salahudin Sheikh") -__contact__ = ("pierre.poulain@u-paris.fr", "sheikh@ibpc.fr") -__copyright__ = "AGPL-3.0" -__date__ = "2025" -__version__ = "1.0.0" - -# ------------------------ -# Configuration -# ------------------------ -HTML_LIST_URL: str = "https://www.dsimb.inserm.fr/ATLAS/" -API_BASE: str = "https://www.dsimb.inserm.fr/ATLAS/api/ATLAS/metadata/" -BASE_URL: str = "https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/" - -FILES_PARQUET = "atlas_files.parquet" -METADATA_PARQUET = "atlas_datasets.parquet" -OUTPUT_DIR = "data/atlas" - -MAX_CONCURRENT_REQUESTS: int = 10 -REQUEST_DELAY: float = 0.05 # polite delay (seconds) -HTTP_TIMEOUT: float = 30.0 -RETRY_ATTEMPTS: int = 3 - -HEADERS = { - "User-Agent": "atlas-scraper/1.0 (+https://example.org)", -} - -# ------------------------ -# Logging -# ------------------------ -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(message)s", - level=logging.INFO, -) -logger = logging.getLogger("atlas_scraper") - -# ------------------------ -# Pydantic models -# ------------------------ - -# Strict sequence: only amino-acid letters (uppercase) -SequenceStr = Annotated[str, StringConstraints(pattern=r"^[ACDEFGHIKLMNPQRSTVWY]+$")] - -class DatasetRecord(BaseModel): - source: str = "ATLAS" - source_id: constr(min_length=1) - data_set_url: HttpUrl - title: constr(min_length=1) - organism: Optional[constr(min_length=1)] = None - length: Optional[conint(ge=0)] = None - sequence: Optional[SequenceStr] = None - - crawling_date: constr(min_length=1) - date_creation: Optional[constr(min_length=1)] = None - date_last_modification: Optional[constr(min_length=1)] = None - - nb_files: conint(ge=0) = 0 - file_names: List[constr(min_length=1)] = Field(default_factory=list) - file_sizes: List[Optional[constr(min_length=1)]] = Field(default_factory=list) - - license: Optional[constr(min_length=1)] = None 
- authors: Optional[constr(min_length=1)] = None - doi: Optional[constr(min_length=1)] = None - - @field_validator("file_sizes", mode="before") - def ensure_three_sizes(cls, v): - if v is None: - return [None, None, None] - if isinstance(v, str): - parts = [p.strip() for p in v.split(",")] - return (parts + [None, None, None])[:3] - if isinstance(v, list): - parts = [p if p is not None else None for p in v] - return (parts + [None, None, None])[:3] - return v - - @field_validator("file_names", mode="before") - def ensure_file_names_list(cls, v): - if v is None: - return [] - if isinstance(v, str): - return [s.strip() for s in v.split(",") if s.strip()] - if isinstance(v, list): - return [s for s in v if s] - return v - -# ------------------------ -# HTTP helpers with retries -# ------------------------ - -def retry_decorator(): - return retry( - reraise=True, - stop=stop_after_attempt(RETRY_ATTEMPTS), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=retry_if_exception_type((httpx.HTTPError, httpx.ReadTimeout, httpx.ConnectError, asyncio.TimeoutError)), - ) - -@retry_decorator() -async def _get_json(client: httpx.AsyncClient, url: str) -> Dict: - resp = await client.get(url, timeout=HTTP_TIMEOUT) - resp.raise_for_status() - return resp.json() - -@retry_decorator() -async def _get_text(client: httpx.AsyncClient, url: str) -> str: - resp = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) - resp.raise_for_status() - return resp.text - -# ------------------------ -# Parsers -# ------------------------ - -PDB_PATTERN = re.compile(r"\b([0-9][A-Za-z0-9]{3}_[A-Za-z])\b") -DOWNLOAD_SIZE_RE = re.compile(r"Download.*?\(([^)]+)\)", re.IGNORECASE) - -def extract_pdb_chains(html: str) -> List[str]: - chains = sorted(set(PDB_PATTERN.findall(html))) - logger.info("extract_pdb_chains: found %d chains", len(chains)) - return chains - -def extract_file_sizes_from_html(html: str) -> List[Optional[str]]: - sizes = DOWNLOAD_SIZE_RE.findall(html) - return (sizes + [None, None, None])[:3] - -# ------------------------ -# Fetch functions (async) -# ------------------------ - -async def fetch_index_html_sync() -> str: - """Synchronous fetch used at startup for the index page.""" - def _sync(): - with httpx.Client(timeout=HTTP_TIMEOUT) as client: - r = client.get(HTML_LIST_URL, headers=HEADERS) - r.raise_for_status() - return r.text - - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, _sync) - -async def fetch_metadata_for_chain( - client: httpx.AsyncClient, sem: Semaphore, pdb_chain: str -) -> Optional[Dict]: - api_url = f"{API_BASE}{pdb_chain}" - html_url = f"{BASE_URL}{pdb_chain}/{pdb_chain}.html" - - async with sem: - await asyncio.sleep(REQUEST_DELAY) - try: - api_data = await _get_json(client, api_url) - except Exception as exc: - logger.warning("API fetch failed for %s: %s", pdb_chain, exc) - return None - - try: - html_text = await _get_text(client, html_url) - sizes = extract_file_sizes_from_html(html_text) - except Exception as exc: - logger.warning("HTML fetch/parse failed for %s: %s", pdb_chain, exc) - sizes = [None, None, None] - - chain_key = pdb_chain if pdb_chain in api_data else pdb_chain.upper() - chain_data = api_data.get(chain_key, api_data if isinstance(api_data, dict) else {}) - - files = chain_data.get("files") if isinstance(chain_data.get("files"), list) else None - nb_files = len(files) if files else 3 - file_names = files if files else [ - "Analysis & MDs (only protein)", - "MDs (only protein)", - "MDs (total system)", - ] - - record = { - 
"source": "ATLAS", - "source_id": chain_data.get("PDB") or pdb_chain, - "data_set_url": api_url, - "title": chain_data.get("protein_name") or f"ATLAS dataset for {pdb_chain}", - "organism": chain_data.get("organism"), - "length": int(chain_data.get("length")) if chain_data.get("length") is not None else None, - "sequence": chain_data.get("sequence") if isinstance(chain_data.get("sequence"), str) else None, - "crawling_date": datetime.now(timezone.utc).date().isoformat(), - "date_creation": chain_data.get("date_creation"), - "date_last_modification": chain_data.get("date_last_modification"), - "nb_files": int(nb_files), - "file_names": file_names, - "file_sizes": sizes, - "license": chain_data.get("license") or "CC-BY-NC 4.0", - "authors": chain_data.get("authors") or "Yann Vander Meersche et al.", - "doi": chain_data.get("doi") or "https://doi.org/10.1093/nar/gkad1084", - } - - try: - validated = DatasetRecord(**record) - return validated.dict() - except Exception as exc: - logger.warning("Validation failed for %s: %s", pdb_chain, exc) - return None - -async def fetch_all(pdb_chains: List[str]) -> List[Dict]: - sem = Semaphore(MAX_CONCURRENT_REQUESTS) - results: List[Dict] = [] - async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as client: - tasks = [fetch_metadata_for_chain(client, sem, c) for c in pdb_chains] - for coro in asyncio.as_completed(tasks): - try: - res = await coro - if res: - results.append(res) - except RetryError as exc: - logger.warning("Fetch task failed after retries: %s", exc) - except Exception as exc: - logger.warning("Unhandled error in fetch task: %s", exc) - return results - -# ------------------------ -# Storage utilities (fixed for Pydantic types) -# ------------------------ - -def ensure_output_dir(path: str) -> Path: - p = Path(path) - p.mkdir(parents=True, exist_ok=True) - return p - - -def atomic_parquet_write(df: pd.DataFrame, path: Path) -> None: - # write to temp dir then atomically move - tmp_dir = tempfile.mkdtemp(dir=str(path.parent)) - tmp_file = Path(tmp_dir) / (path.name + ".tmp") - try: - df.to_parquet(tmp_file, index=False) - shutil.move(str(tmp_file), str(path)) - logger.info("Wrote parquet: %s", path) - finally: - try: - shutil.rmtree(tmp_dir) - except Exception: - pass - - -def convert_pydantic_to_native(df: pd.DataFrame) -> pd.DataFrame: - """ - Convert Pydantic-specific types (HttpUrl, SequenceStr, etc.) to native Python types - so PyArrow / pandas can write them to Parquet. 
- """ - for col in df.columns: - df[col] = df[col].apply(lambda x: str(x) if hasattr(x, "__str__") and not isinstance(x, str) else x) - return df - - -def save_results(validated_records: List[Dict], out_dir: str = OUTPUT_DIR) -> None: - if not validated_records: - logger.warning("No valid records to save.") - return - - out_path = ensure_output_dir(out_dir) - df = pd.DataFrame(validated_records) - - # ---------------- Files Parquet ---------------- - df_files = df[["source", "source_id", "nb_files", "file_names", "file_sizes"]].copy() - - # convert list columns to comma-separated strings - df_files["file_names"] = df_files["file_names"].apply(lambda x: ",".join(x) if isinstance(x, list) else x) - df_files["file_sizes"] = df_files["file_sizes"].apply(lambda x: ",".join([s for s in x if s]) if isinstance(x, list) else x) - - df_files = convert_pydantic_to_native(df_files) - atomic_parquet_write(df_files, out_path / FILES_PARQUET) - - # ---------------- Metadata Parquet ---------------- - meta_cols = [ - "source", "source_id", "data_set_url", "doi", "authors", "crawling_date", - "title", "organism", "date_creation", "date_last_modification", - "license", "length", "sequence" - ] - df_meta = df[meta_cols].copy() - - df_meta = convert_pydantic_to_native(df_meta) - atomic_parquet_write(df_meta, out_path / METADATA_PARQUET) - - logger.info("Saved all Parquet files successfully.") - -# ------------------------ -# Orchestration / CLI -# ------------------------ - -async def _run_pipeline(limit: Optional[int] = None, out_dir: str = OUTPUT_DIR) -> None: - logger.info("Fetching index page...") - try: - index_html = await fetch_index_html_sync() - except Exception as exc: - logger.error("Failed to fetch index page: %s", exc) - return - - chains = extract_pdb_chains(index_html) - if limit and limit > 0: - chains = chains[:limit] - logger.info("Found %d chains (limit=%s)", len(chains), limit) - - logger.info("Starting async fetch of metadata...") - results = await fetch_all(chains) - logger.info("Fetched %d valid records", len(results)) - - save_results(results, out_dir) - -def main(limit: Optional[int] = None, out_dir: str = OUTPUT_DIR) -> None: - start = time.time() - try: - asyncio.run(_run_pipeline(limit=limit, out_dir=out_dir)) - except Exception as exc: - logger.exception("Pipeline failed: %s", exc) - finally: - elapsed_minutes = (time.time() - start) / 60.0 - logger.info("Done. Elapsed time: %.2f minutes", elapsed_minutes) - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="ATLAS metadata scraper (Pydantic-validated, strict)") - parser.add_argument("--limit", type=int, default=0, help="Limit number of chains to fetch (0 = all)") - parser.add_argument("--output-dir", type=str, default=OUTPUT_DIR, help="Output directory for parquet files") - args = parser.parse_args() - - main(limit=args.limit or None, out_dir=args.output_dir) diff --git a/src/mdverse_scrapers/core/network.py b/src/mdverse_scrapers/core/network.py index 05f1024..8c81a83 100644 --- a/src/mdverse_scrapers/core/network.py +++ b/src/mdverse_scrapers/core/network.py @@ -51,6 +51,37 @@ def create_httpx_client( return httpx.Client(base_url=base_url, headers=headers) +def is_connection_to_server_working( + client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger +) -> bool | None: + """Test connection to a web server. + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + url : str + The URL endpoint. 
+ logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + bool + True if the connection is successful, False otherwise. + """ + logger.debug("Testing connection to server...") + response = make_http_request_with_retries( + client, url, method=HttpMethod.GET, max_attempts=2, logger=logger + ) + if not response: + logger.error("Cannot connect to server.") + return False + if response and hasattr(response, "headers"): + logger.debug(response.headers) + return True + + def make_http_request_with_retries( client: httpx.Client, url: str, diff --git a/src/mdverse_scrapers/models/dataset.py b/src/mdverse_scrapers/models/dataset.py index 785c7ee..dfb0028 100644 --- a/src/mdverse_scrapers/models/dataset.py +++ b/src/mdverse_scrapers/models/dataset.py @@ -17,7 +17,7 @@ DOI = Annotated[ str, - StringConstraints(pattern=r"^10\.\d{4,9}/[\w\-.]+$"), + StringConstraints(pattern=r"^10\.\d{4,9}/[\w\-./]+$"), ] diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py new file mode 100644 index 0000000..d9438e3 --- /dev/null +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -0,0 +1,378 @@ +"""Scrape metadata of molecular dynamics datasets and files from ATLAS.""" + +import json +import re +import sys +from pathlib import Path + +import click +import httpx +import loguru +from bs4 import BeautifulSoup + +from ..core.logger import create_logger +from ..core.network import ( + HttpMethod, + create_httpx_client, + is_connection_to_server_working, + make_http_request_with_retries, +) +from ..core.toolbox import print_statistics +from ..models.dataset import DatasetMetadata +from ..models.enums import DatasetSourceName +from ..models.scraper import ScraperContext +from ..models.utils import ( + export_list_of_models_to_parquet, + normalize_datasets_metadata, + normalize_files_metadata, +) + +INDEX_URL = "https://www.dsimb.inserm.fr/ATLAS/" +BASE_API_URL = "https://www.dsimb.inserm.fr/ATLAS/api" +ATLAS_METADATA = { + "license": "CC-BY-NC", # https://www.dsimb.inserm.fr/ATLAS/download.html + "author_name": [ # https://academic.oup.com/nar/article/52/D1/D384/7438909 + "Yann Vander Meersche", + "Gabriel Cretin", + "Aria Gheeraert", + "Jean-Christophe Gelly", + "Tatiana Galochkina", + ], + "doi": "10.1093/nar/gkad1084", # https://academic.oup.com/nar/article/52/D1/D384/7438909 + "external_link": ["https://www.dsimb.inserm.fr/ATLAS/"], +} + + +def extract_pdb_chains_from_html( + html: str, logger: "loguru.Logger" = loguru.logger +) -> set[str]: + """Extract PDB chain identifiers from ATLAS index page. + + Parameters + ---------- + html : str + HTML content of the ATLAS index page. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + set[str] + List of PDB chain identifiers found. + """ + pdb_chains = [] + pdb_chain_pattern = re.compile( + r"/ATLAS/database/ATLAS/([A-Za-z0-9]{4}_[A-Za-z])/.*html" + ) + soup = BeautifulSoup(html, "html.parser") + for link in soup.find_all("a", href=True): + href = link.get("href", "") + match = pdb_chain_pattern.search(href) + if match: + pdb_chains.append(match.group(1)) + return set(pdb_chains) + + +def extract_file_sizes_from_html( + html: str, logger: "loguru.Logger" = loguru.logger +) -> list[dict]: + """Extract file sizes from ATLAS dataset HTML page. + + Parameters + ---------- + html : str + HTML content of the ATLAS dataset page. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + list[dict] + List of file names, sizes and urls found. 
+ + """ + files_metadata = [] + download_link_pattern = re.compile( + r"https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/[A-Za-z0-9]{4}_[A-Za-z]/.*zip" + ) + file_size_pattern = re.compile(r"Download \(([A-Za-z0-9,\. ]+)\)") + soup = BeautifulSoup(html, "html.parser") + for link in soup.find_all("a", href=True): + href = link.get("href", "") + match_link = download_link_pattern.search(href) + match_size = file_size_pattern.search(link.text) + if match_link and match_size: + files_metadata.append( + { + "file_name": Path(href).name, + "file_url_in_repository": href, + # File size are sometimes expressed with comma as decimal separator. + "file_size_in_bytes": match_size.group(1).replace(",", "."), + } + ) + logger.info(f"Found {len(files_metadata)} files in the HTML page.") + return files_metadata + + +def scrape_metadata_for_a_dataset( + client: httpx.Client, + chain_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> dict | None: + """Fetch metadata for a single ATLAS dataset (PDB chain). + + Parameters + ---------- + client : httpx.Client + HTTPX client for making requests. + chain_id : str + PDB chain identifier. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + dict | None + Scraped dataset metadata, or None if failed. + """ + logger.info(f"Scraping metadata for dataset: {chain_id}") + api_url = f"{BASE_API_URL}/ATLAS/metadata/{chain_id}" + dataset_url = ( + f"https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{chain_id}/{chain_id}.html" + ) + response = make_http_request_with_retries( + client, api_url, HttpMethod.GET, logger=logger + ) + if not response: + logger.warning(f"Failed to fetch API data for {chain_id}. Skipping.") + return None + # html = make_http_request_with_retries( + # client, dataset_url, HttpMethod.GET, logger=logger + # ) + # if not html: + # logger.warning(f"Failed to fetch HTML page for {chain_id}. Skipping.") + # return None + meta_json = None + try: + meta_json = response.json().get(f"{chain_id}") + except (json.decoder.JSONDecodeError, KeyError) as exc: + logger.warning("Failed to deconde JSON response fromthe ATLAS API.") + logger.warning(f"Error: {exc}") + return None + metadata = { + "dataset_repository_name": DatasetSourceName.ATLAS, + "dataset_id_in_repository": chain_id, + "dataset_url_in_repository": dataset_url, + "title": meta_json.get("protein_name"), + "description": meta_json.get("organism"), + "license": ATLAS_METADATA["license"], + "author_names": ATLAS_METADATA["author_name"], + "doi": ATLAS_METADATA["doi"], + "external_links": ATLAS_METADATA["external_link"], + } + logger.info("Done.") + return metadata + + +def search_all_datasets(client: httpx.Client, logger: "loguru.Logger") -> set[str]: + """Search for ATLAS datasets (1 dataset = 1 PDB chain). + + Parameters + ---------- + client : httpx.Client + HTTPX client for making requests. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + set[str] + List of PDB chains (datasets) found. + """ + logger.info("Fetching index page listing ATLAS datasets...") + response = make_http_request_with_retries( + client, INDEX_URL, HttpMethod.GET, logger=logger + ) + if not response: + logger.critical("Failed to fetch index page.") + logger.critical("Cannot list available datasets. Aborting.") + sys.exit(1) + if not hasattr(response, "text") or not response.text: + logger.critical("Index page response is empty.") + logger.critical("Cannot list available datasets. 
Aborting.") + sys.exit(1) + chain_ids = extract_pdb_chains_from_html(response.text, logger=logger) + logger.info(f"Found {len(chain_ids)} datasets.") + return chain_ids + + +def scrape_all_datasets( + client: httpx.Client, + pdb_chains: set[str], + logger: "loguru.Logger", +) -> list[dict]: + """Scrape all ATLAS datasets given a set of PDB chains. + + Parameters + ---------- + pdb_chains : set[str] + Set of PDB chains to scrape. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + list[dict] + List of scraped dataset metadata. + """ + datasets_meta = [] + logger.info("Starting scraping of all datasets...") + for pdb_counter, pdb_chain in enumerate(pdb_chains, start=1): + logger.info(f"Scraping dataset: {pdb_chain}") + metadata = scrape_metadata_for_a_dataset(client, pdb_chain, logger=logger) + if metadata: + datasets_meta.append(metadata) + logger.info(f"Scraped {pdb_counter}/{len(pdb_chains)} datasets") + return datasets_meta + + +def scrape_all_files( + client: httpx.Client, + datasets_metadata: list[DatasetMetadata], + logger: "loguru.Logger", +) -> list[dict]: + """Scrape ATLAS files. + + Parameters + ---------- + datasets_metadata : list[DatasetMetadata] + List of datasets metadata. + logger : loguru.Logger + Logger for logging messages. + + Returns + ------- + list[dict] + List of scraped files metadata. + """ + files_metadata = [] + for dataset_counter, dataset_meta in enumerate(datasets_metadata, start=1): + pdb_chain = dataset_meta.dataset_id_in_repository + logger.info(f"Scraping of files metadata for dataset: {pdb_chain}") + url = dataset_meta.dataset_url_in_repository + logger.info(url) + response = make_http_request_with_retries( + client, url, HttpMethod.GET, logger=logger + ) + if not response: + logger.warning(f"Failed to fetch HTML page for {pdb_chain}. Skipping.") + continue + files_meta = extract_file_sizes_from_html(response.text) + for meta in files_meta: + metadata = { + "dataset_repository_name": dataset_meta.dataset_repository_name, + "dataset_id_in_repository": dataset_meta.dataset_id_in_repository, + "dataset_url_in_repository": dataset_meta.dataset_url_in_repository, + "file_name": meta["file_name"], + "file_url_in_repository": meta["file_url_in_repository"], + "file_size_in_bytes": meta["file_size_in_bytes"], + } + files_metadata.append(metadata) + logger.info( + "Scraped metadata files for " + f"{dataset_counter:,}/{len(datasets_metadata):,} " + f"({dataset_counter / len(datasets_metadata):.0%}) datasets" + ) + logger.info(f"Total files scraped so far: {len(files_metadata):,}") + return files_metadata + + +@click.command( + help="Command line interface for MDverse scrapers", + epilog="Happy scraping!", +) +@click.option( + "--output-dir", + "output_dir_path", + type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path), + required=True, + help="Output directory path to save results.", +) +@click.option( + "--debug", + "is_in_debug_mode", + is_flag=True, + default=False, + help="Enable debug mode.", +) +def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: + """Scrape metadata of molecular dynamics datasets and files from ATLAS.""" + # Create scraper context. + scraper = ScraperContext( + data_source_name=DatasetSourceName.ATLAS, + output_dir_path=output_dir_path, + is_in_debug_mode=is_in_debug_mode, + ) + # Create logger. + level = "INFO" + if scraper.is_in_debug_mode: + level = "DEBUG" + logger = create_logger(logpath=scraper.log_file_path, level=level) + # Print scraper configuration. 
+ logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) + logger.info("Starting ATLAS data scraping...") + # Create HTTPX client + client = create_httpx_client() + # Check connection to NOMAD API + if is_connection_to_server_working( + client, f"{BASE_API_URL}/ATLAS/metadata/16pk_A", logger=logger + ): + logger.success("Connection to ATLAS API successful!") + else: + logger.critical("Connection to ATLAS API failed.") + logger.critical("Aborting.") + sys.exit(1) + # Scrape datasets metadata. + datasets_ids = search_all_datasets(client=client, logger=logger) + if scraper.is_in_debug_mode: + datasets_ids = set(list(datasets_ids)[:10]) + logger.warning("Debug mode is ON: limiting to first 10 datasets.") + datasets_metadata = scrape_all_datasets( + client, + datasets_ids, + logger=logger, + ) + # Normalize datasets metadata. + datasets_metadata_normalized = normalize_datasets_metadata( + datasets_metadata, + logger=logger, + ) + # Scrape files metadata. + files_metadata = scrape_all_files( + client, + datasets_metadata_normalized, + logger=logger, + ) + # Normalize datasets metadata. + files_metadata_normalized = normalize_files_metadata( + files_metadata, + logger=logger, + ) + # Save datasets metadata to parquet file. + scraper.number_of_datasets_scraped = export_list_of_models_to_parquet( + scraper.datasets_parquet_file_path, + datasets_metadata_normalized, + logger=logger, + ) + # Save files metadata to parquet file. + scraper.number_of_files_scraped = export_list_of_models_to_parquet( + scraper.files_parquet_file_path, + files_metadata_normalized, + logger=logger, + ) + # Print scraping statistics. + print_statistics(scraper, logger=logger) + + +if __name__ == "__main__": + main() diff --git a/src/mdverse_scrapers/scrapers/nomad.py b/src/mdverse_scrapers/scrapers/nomad.py index be638a0..d1f8ecd 100644 --- a/src/mdverse_scrapers/scrapers/nomad.py +++ b/src/mdverse_scrapers/scrapers/nomad.py @@ -17,6 +17,7 @@ from ..core.network import ( HttpMethod, create_httpx_client, + is_connection_to_server_working, make_http_request_with_retries, ) from ..core.toolbox import print_statistics @@ -40,35 +41,6 @@ } -def is_nomad_connection_working( - client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger -) -> bool | None: - """Test connection to the NOMAD API. - - Parameters - ---------- - client : httpx.Client - The HTTPX client to use for making requests. - url : str - The URL endpoint. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - bool - True if the connection is successful, False otherwise. - """ - logger.debug("Testing connection to NOMAD API...") - response = make_http_request_with_retries(client, url, method=HttpMethod.GET) - if not response: - logger.error("Cannot connect to the NOMAD API.") - return False - if response and hasattr(response, "headers"): - logger.debug(response.headers) - return True - - def scrape_all_datasets( client: httpx.Client, query_entry_point: str, @@ -531,13 +503,20 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: output_dir_path=output_dir_path, is_in_debug_mode=is_in_debug_mode, ) - logger = create_logger(logpath=scraper.log_file_path, level="INFO") + # Create logger. + level = "INFO" + if scraper.is_in_debug_mode: + level = "DEBUG" + logger = create_logger(logpath=scraper.log_file_path, level=level) + # Print scraper configuration. 
logger.debug(scraper.model_dump_json(indent=4, exclude={"token"})) - logger.info("Starting Nomad data scraping...") + logger.info("Starting NOMAD data scraping...") # Create HTTPX client client = create_httpx_client() # Check connection to NOMAD API - if is_nomad_connection_working(client, f"{BASE_NOMAD_URL}/entries"): + if is_connection_to_server_working( + client, f"{BASE_NOMAD_URL}/entries", logger=logger + ): logger.success("Connection to NOMAD API successful!") else: logger.critical("Connection to NOMAD API failed.") From c8d0911cbfea7ee1bf47ad63b27ec6a19958040d Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Mon, 26 Jan 2026 18:25:20 +0100 Subject: [PATCH 2/6] docs: Explain how we extract metadata from HTML pages --- docs/atlas.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/atlas.md b/docs/atlas.md index 3bfb932..f02bb3c 100644 --- a/docs/atlas.md +++ b/docs/atlas.md @@ -18,7 +18,7 @@ In ATLAS, each dataset corresponds to a molecular dynamics simulation of a **pro The list of all available datasets can be obtained from the ATLAS index page: -All datasets (pdb chains) are extracted from this page. +All datasets (pdb chains) are extracted from this page with a regular expression. ### Metadata for a given dataset @@ -58,3 +58,5 @@ Example with dataset id `1k5n_A`: - [1k5n_A_analysis.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_analysis.zip) - [1k5n_A_protein.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_protein.zip) - [1k5n_A_total.zip](https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/1k5n_A/1k5n_A_total.zip) + +We parse HTML content of dataset page and use regular expressions to extract URLs, file names and file sizes. From 2b1e94af1a0406a12cd0264740d6e8f53dd4e3f1 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Mon, 26 Jan 2026 18:26:00 +0100 Subject: [PATCH 3/6] feat: Add more logger and decrease delay before HTTP GET request --- src/mdverse_scrapers/scrapers/atlas.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py index d9438e3..e07ca59 100644 --- a/src/mdverse_scrapers/scrapers/atlas.py +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -141,17 +141,11 @@ def scrape_metadata_for_a_dataset( f"https://www.dsimb.inserm.fr/ATLAS/database/ATLAS/{chain_id}/{chain_id}.html" ) response = make_http_request_with_retries( - client, api_url, HttpMethod.GET, logger=logger + client, api_url, HttpMethod.GET, delay_before_request=0.5, logger=logger ) if not response: logger.warning(f"Failed to fetch API data for {chain_id}. Skipping.") return None - # html = make_http_request_with_retries( - # client, dataset_url, HttpMethod.GET, logger=logger - # ) - # if not html: - # logger.warning(f"Failed to fetch HTML page for {chain_id}. 
Skipping.") - # return None meta_json = None try: meta_json = response.json().get(f"{chain_id}") @@ -191,7 +185,7 @@ def search_all_datasets(client: httpx.Client, logger: "loguru.Logger") -> set[st """ logger.info("Fetching index page listing ATLAS datasets...") response = make_http_request_with_retries( - client, INDEX_URL, HttpMethod.GET, logger=logger + client, INDEX_URL, HttpMethod.GET, delay_before_request=0.5, logger=logger ) if not response: logger.critical("Failed to fetch index page.") @@ -228,11 +222,13 @@ def scrape_all_datasets( datasets_meta = [] logger.info("Starting scraping of all datasets...") for pdb_counter, pdb_chain in enumerate(pdb_chains, start=1): - logger.info(f"Scraping dataset: {pdb_chain}") metadata = scrape_metadata_for_a_dataset(client, pdb_chain, logger=logger) if metadata: datasets_meta.append(metadata) - logger.info(f"Scraped {pdb_counter}/{len(pdb_chains)} datasets") + logger.info( + f"Scraped {pdb_counter:,}/{len(pdb_chains):,} " + f"({pdb_counter / len(pdb_chains):.0%}) datasets" + ) return datasets_meta @@ -262,12 +258,12 @@ def scrape_all_files( url = dataset_meta.dataset_url_in_repository logger.info(url) response = make_http_request_with_retries( - client, url, HttpMethod.GET, logger=logger + client, url, HttpMethod.GET, delay_before_request=0.5, logger=logger ) if not response: logger.warning(f"Failed to fetch HTML page for {pdb_chain}. Skipping.") continue - files_meta = extract_file_sizes_from_html(response.text) + files_meta = extract_file_sizes_from_html(response.text, logger=logger) for meta in files_meta: metadata = { "dataset_repository_name": dataset_meta.dataset_repository_name, From 3122d7b3e9a4ecae50b1ceb6be72cace127ad026 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Mon, 26 Jan 2026 18:56:32 +0100 Subject: [PATCH 4/6] refactor: Remove unnecessary logs --- src/mdverse_scrapers/scrapers/atlas.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py index e07ca59..370c395 100644 --- a/src/mdverse_scrapers/scrapers/atlas.py +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -254,9 +254,8 @@ def scrape_all_files( files_metadata = [] for dataset_counter, dataset_meta in enumerate(datasets_metadata, start=1): pdb_chain = dataset_meta.dataset_id_in_repository - logger.info(f"Scraping of files metadata for dataset: {pdb_chain}") + logger.info(f"Scraping files metadata for dataset: {pdb_chain}") url = dataset_meta.dataset_url_in_repository - logger.info(url) response = make_http_request_with_retries( client, url, HttpMethod.GET, delay_before_request=0.5, logger=logger ) From c02ceba2c47352eef05cdd59d520389a2080d6f4 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Tue, 27 Jan 2026 08:41:34 +0100 Subject: [PATCH 5/6] docs: Fix types in doctrings and typos in comments --- src/mdverse_scrapers/core/network.py | 2 +- src/mdverse_scrapers/scrapers/atlas.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/mdverse_scrapers/core/network.py b/src/mdverse_scrapers/core/network.py index 8c81a83..e376e31 100644 --- a/src/mdverse_scrapers/core/network.py +++ b/src/mdverse_scrapers/core/network.py @@ -53,7 +53,7 @@ def create_httpx_client( def is_connection_to_server_working( client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger -) -> bool | None: +) -> bool: """Test connection to a web server. 
Parameters diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py index 370c395..e46da96 100644 --- a/src/mdverse_scrapers/scrapers/atlas.py +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -58,7 +58,7 @@ def extract_pdb_chains_from_html( Returns ------- set[str] - List of PDB chain identifiers found. + Set of PDB chain identifiers found. """ pdb_chains = [] pdb_chain_pattern = re.compile( @@ -106,7 +106,8 @@ def extract_file_sizes_from_html( { "file_name": Path(href).name, "file_url_in_repository": href, - # File size are sometimes expressed with comma as decimal separator. + # File sizes are sometimes expressed with comma + # as decimal separator. "file_size_in_bytes": match_size.group(1).replace(",", "."), } ) @@ -150,7 +151,7 @@ def scrape_metadata_for_a_dataset( try: meta_json = response.json().get(f"{chain_id}") except (json.decoder.JSONDecodeError, KeyError) as exc: - logger.warning("Failed to deconde JSON response fromthe ATLAS API.") + logger.warning("Failed to decode JSON response from the ATLAS API.") logger.warning(f"Error: {exc}") return None metadata = { @@ -181,7 +182,7 @@ def search_all_datasets(client: httpx.Client, logger: "loguru.Logger") -> set[st Returns ------- set[str] - List of PDB chains (datasets) found. + Set of PDB chains (datasets) found. """ logger.info("Fetching index page listing ATLAS datasets...") response = make_http_request_with_retries( @@ -318,7 +319,7 @@ def main(output_dir_path: Path, *, is_in_debug_mode: bool = False) -> None: logger.info("Starting ATLAS data scraping...") # Create HTTPX client client = create_httpx_client() - # Check connection to NOMAD API + # Check connection to the ATLAS API if is_connection_to_server_working( client, f"{BASE_API_URL}/ATLAS/metadata/16pk_A", logger=logger ): From 76afd1d29ccd4762eb55f2893caa6b1c970ed1e8 Mon Sep 17 00:00:00 2001 From: Pierre Poulain Date: Tue, 27 Jan 2026 11:26:31 +0100 Subject: [PATCH 6/6] docs: Fix log message --- src/mdverse_scrapers/scrapers/atlas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/atlas.py b/src/mdverse_scrapers/scrapers/atlas.py index e46da96..e5124fe 100644 --- a/src/mdverse_scrapers/scrapers/atlas.py +++ b/src/mdverse_scrapers/scrapers/atlas.py @@ -190,11 +190,11 @@ def search_all_datasets(client: httpx.Client, logger: "loguru.Logger") -> set[st ) if not response: logger.critical("Failed to fetch index page.") - logger.critical("Cannot list available datasets. Aborting.") + logger.critical("Cannot list available datasets. Aborting!") sys.exit(1) if not hasattr(response, "text") or not response.text: logger.critical("Index page response is empty.") - logger.critical("Cannot list available datasets. Aborting.") + logger.critical("Cannot list available datasets. Aborting!") sys.exit(1) chain_ids = extract_pdb_chains_from_html(response.text, logger=logger) logger.info(f"Found {len(chain_ids)} datasets.")
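
For reference, the complete flow introduced by this series — discover the PDB chains from the ATLAS index page, query `/ATLAS/metadata/{pdb_chain}` for each chain, then read the dataset page to collect the three zip archives and their displayed sizes — can be condensed into a short standalone sketch. The helper names below are illustrative only (they do not exist in the code base); the snippet simply mirrors the `httpx` + BeautifulSoup + regular-expression approach of `src/mdverse_scrapers/scrapers/atlas.py`:

```python
import re

import httpx
from bs4 import BeautifulSoup

BASE_URL = "https://www.dsimb.inserm.fr/ATLAS"


def list_chains(client: httpx.Client) -> set[str]:
    """Collect PDB chain ids from links such as /ATLAS/database/ATLAS/1k5n_A/1k5n_A.html."""
    pattern = re.compile(r"/ATLAS/database/ATLAS/([A-Za-z0-9]{4}_[A-Za-z])/.*html")
    soup = BeautifulSoup(client.get(f"{BASE_URL}/").text, "html.parser")
    return {
        match.group(1)
        for link in soup.find_all("a", href=True)
        if (match := pattern.search(link["href"]))
    }


def chain_metadata(client: httpx.Client, chain_id: str) -> dict:
    """Return the metadata record for one chain; the JSON response is keyed by the chain id."""
    response = client.get(f"{BASE_URL}/api/ATLAS/metadata/{chain_id}")
    response.raise_for_status()
    return response.json()[chain_id]


def zip_files(client: httpx.Client, chain_id: str) -> list[tuple[str, str]]:
    """List (url, displayed size) pairs for the zip archives linked on the dataset page."""
    page = client.get(f"{BASE_URL}/database/ATLAS/{chain_id}/{chain_id}.html").text
    soup = BeautifulSoup(page, "html.parser")
    size_pattern = re.compile(r"Download \(([A-Za-z0-9,\. ]+)\)")
    files = []
    for link in soup.find_all("a", href=True):
        match = size_pattern.search(link.text)
        if link["href"].endswith(".zip") and match:
            files.append((link["href"], match.group(1)))
    return files


if __name__ == "__main__":
    with httpx.Client(timeout=30) as client:
        print(f"{len(list_chains(client))} datasets listed on the index page")
        chain = "1k5n_A"
        record = chain_metadata(client, chain)
        print(record["protein_name"], "/", record["organism"])
        for url, size in zip_files(client, chain):
            print(url, size)
```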