From 420ff8c0b5349c45f7d424e20369d7e5559d491f Mon Sep 17 00:00:00 2001 From: essmaw Date: Fri, 16 Jan 2026 19:09:42 +0100 Subject: [PATCH 1/5] [WIP] - feat: add GPCRmd scraper to fetch molecular dynamics datasets. --- pyproject.toml | 1 + src/mdverse_scrapers/scrapers/gpcrmd.py | 538 ++++++++++++++++++++++++ 2 files changed, 539 insertions(+) create mode 100644 src/mdverse_scrapers/scrapers/gpcrmd.py diff --git a/pyproject.toml b/pyproject.toml index 96ee2d6..983a7b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,3 +47,4 @@ build-backend = "uv_build" scrape-zenodo = "mdverse_scrapers.scrapers.zenodo:main" scrape-figshare = "mdverse_scrapers.scrapers.figshare:main" scrape-nomad = "mdverse_scrapers.scrapers.nomad:main" +scrape-gpcrmd = "mdverse_scrapers.scrapers.gpcrmd:main" diff --git a/src/mdverse_scrapers/scrapers/gpcrmd.py b/src/mdverse_scrapers/scrapers/gpcrmd.py new file mode 100644 index 0000000..b489f07 --- /dev/null +++ b/src/mdverse_scrapers/scrapers/gpcrmd.py @@ -0,0 +1,538 @@ +"""Scrape molecular dynamics simulation datasets and files from GPCRmd. + +This script scrapes molecular dynamics datasets from the GPCRmd repository +(https://www.gpcrmd.org/), a platform dedicated to simulations of +G-protein-coupled receptors (GPCRs), a major family of membrane proteins and +frequent drug targets. +""" +import json +import sys +import time +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any + +import click +import httpx +import loguru +from bs4 import BeautifulSoup, Tag + +from ..core.logger import create_logger +from ..core.network import ( + HttpMethod, + create_httpx_client, + make_http_request_with_retries, +) +from ..core.toolbox import export_list_of_models_to_parquet +from ..models.dataset import DatasetMetadata +from ..models.enums import DatasetProjectName, DatasetRepositoryName, DataType +from ..models.file import FileMetadata +from ..models.utils import validate_metadata_against_model + +BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/" + + +def is_gpcrmd_connection_working( + client: httpx.Client, url: str, logger: "loguru.Logger" = loguru.logger +) -> bool | None: + """Test connection to the GPCRmd API. + + Returns + ------- + bool + True if the connection is successful, False otherwise. + """ + logger.debug("Testing connection to GPCRmd API...") + response = make_http_request_with_retries(client, url, method=HttpMethod.GET) + if not response: + logger.error("Cannot connect to the GPCRmd API.") + return False + if response and hasattr(response, "headers"): + logger.debug(response.headers) + return True + + +def scrape_all_datasets( + client: httpx.Client, + query_entry_point: str, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Scrape Molecular Dynamics-related datasets from the GPCRmd API. + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + query_entry_point : str + The entry point of the API request. + + Returns + ------- + list[dict]: + A list of GPCRmd entries. 
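+        Each entry is the raw JSON record returned by the API; the fields
+        used downstream include, for example, ``dyn_id``, ``modelname``,
+        ``atom_num``, ``creation_timestamp`` and ``url``.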
+ """ + logger.info("Scraping molecular dynamics datasets from GPCRmd.") + logger.info("Requesting all datasets in a single fetch...") + + response = make_http_request_with_retries( + client, + query_entry_point, + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.2, + ) + if not response: + logger.critical("Failed to fetch data from GPCRmd API.") + sys.exit(1) + try: + # Get the formatted response with request metadatas in JSON format + all_datasets = response.json() + except (json.decoder.JSONDecodeError, ValueError) as exc: + logger.error(f"Error while parsing GPCRmd response: {exc}") + logger.error("Cannot find datasets.") + logger.critical("Aborting.") + sys.exit(1) + + logger.success(f"Scraped {len(all_datasets)} datasets in GPCRmd.") + return all_datasets + + +def scrape_files_for_all_datasets( + client: httpx.Client, + datasets: list[DatasetMetadata], + logger: "loguru.Logger" = loguru.logger, +) -> list[FileMetadata]: + """Scrape files metadata for all datasets in GPCRmd. + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + datasets : list[DatasetMetadata] + List of datasets to scrape files metadata for. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[FileMetadata] + List of successfully validated `FileMetadata` objects. + """ + all_files_metadata = [] + for dataset_count, dataset in enumerate(datasets, start=1): + dataset_id = dataset.dataset_id_in_repository + files_metadata = scrape_files_for_one_dataset( + client, + url=f"{BASE_GPCRMD_URL}/entries/{dataset_id}/rawdir", + dataset_id=dataset_id, + logger=logger, + ) + if not files_metadata: + continue + # Extract relevant files metadata. + files_selected_metadata = extract_files_metadata(files_metadata, logger=logger) + # Normalize files metadata with pydantic model (FileMetadata) + logger.info(f"Validating files metadata for dataset: {dataset_id}") + for file_metadata in files_selected_metadata: + normalized_metadata = validate_metadata_against_model( + file_metadata, + FileMetadata, + logger=logger, + ) + if not normalized_metadata: + logger.error( + f"Normalization failed for metadata of file " + f"{file_metadata.get('file_name')} " + f"in dataset {dataset_id}" + ) + continue + all_files_metadata.append(normalized_metadata) + logger.info("Done.") + logger.info(f"Total files: {len(all_files_metadata):,}") + logger.info( + "Extracted and validated files metadata for " + f"{dataset_count:,}/{len(datasets):,} " + f"({dataset_count / len(datasets):.0%}) datasets." + ) + return all_files_metadata + + +def scrape_files_for_one_dataset( + client: httpx.Client, + url: str, + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> dict | None: + """ + Scrape files metadata for a given GPCRmd dataset. + + Doc: https://gpcrmd-lab.eu/prod/v1/api/v1/extensions/docs#/entries/metadata + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + url : str + The URL endpoint. + dataset_id : str + The unique identifier of the dataset in GPCRmd. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + dict | None + File metadata dictionary for the dataset. 
+ """ + logger.info(f"Scraping files for dataset ID: {dataset_id}") + response = make_http_request_with_retries( + client, + url, + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.1, + ) + if not response: + logger.error("Failed to fetch files metadata.") + return None + return response.json() + + +def fetch_dataset_page(url: str | None, + dataset_id: str | None, + client: httpx.Client, + logger: "loguru.Logger" = loguru.logger +) -> str | None: + """Fetch an dataset page and return its HTML content. + + Parameters + ---------- + url : str + The URL of the dataset page to fetch. + client : httpx.Client + The HTTPX client to use for making requests. + + Returns + ------- + str | None + The HTML content of the page if the request is successful, otherwise None. + """ + if url: + html_header = make_http_request_with_retries( + client, + url, + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.2, + ) + if html_header: + html_content = html_header.text + + return html_content + + +def retrieve_metadata(html: str, field_name: str) -> str | None: + """ + Retrieve a specific metadata field from a webpage. + + Parameters + ---------- + html : str + The HTML content of the page. + field_name : str + The name of the metadata field to extract (case-sensitive). + + Returns + ------- + str | None + The value of the metadata field if found, otherwise None. + + """ + # Parse the HTML content of the page using BeautifulSoup + soup = BeautifulSoup(html, "html.parser") + bold_tag = next( + (b for b in soup.find_all("b") if b.get_text(strip=True) == field_name), + None, + ) + if not bold_tag: + return None + # Get all the text from the parent element of the tag + parent = bold_tag.parent + if not isinstance(parent, Tag): + return None + parent_text = parent.get_text(strip=True) + if ":" not in parent_text: + return None + # Get only what is after the "field_name:" + metadata = parent_text.split(":", 1)[1].strip() + return metadata + + +def extract_datasets_metadata( + datasets: list[dict[str, Any]], + client: httpx.Client, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Extract relevant metadata from raw GPCRmd datasets metadata. + + Parameters + ---------- + datasets : List[Dict[str, Any]] + List of raw GPCRmd datasets metadata. + client : httpx.Client + The HTTPX client to use for making requests. + + Returns + ------- + list[dict] + List of dataset metadata dictionaries. + """ + datasets_metadata = [] + for dataset in datasets: + dataset_id = str(dataset.get("dyn_id")) + logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") + dataset_url = dataset.get("url") + metadata = { + "dataset_repository_name": DatasetRepositoryName.GPCRMD, + "dataset_project_name": DatasetProjectName.GPCRMD, + "dataset_id_in_repository": dataset_id, + "dataset_id_in_project": dataset_id, + "dataset_url_in_repository": dataset_url, + "dataset_url_in_project": dataset_url, + "title": dataset.get("modelname"), + "date_created": dataset.get("creation_timestamp"), + "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), + "simulation_program_name": dataset.get("mysoftware"), + "simulation_program_version": dataset.get("software_version"), + "forcefield_model_name": dataset.get("forcefield"), + "forcefield_model_version": dataset.get("forcefield_version"), + "timestep": dataset.get("timestep"), + "delta": dataset.get("delta"), + "nb_atoms": dataset.get("atom_num") + } + # Extract other metadata from dataset url page if available. 
+ # Fetch dataset page with url + html_content = fetch_dataset_page(dataset_url, dataset_id, client, logger) + if html_content is None: + logger.warning("Error parsing additionnal metadatas from web page for entry" + f" {dataset_id} ({dataset_url})") + logger.warning("Skipping this step.") + continue + + # Author names. + author_names = None + try: + author_names = retrieve_metadata(html_content, "Submitted by") + except (ValueError, KeyError) as e: + logger.warning(f"Error parsing software name for entry {dataset_id}: {e}") + metadata["author_names"] = (author_names if author_names + is None else [author_names]) + + """# Software version + software_version = None + try: + software_version = ( + dataset.get("results", {}) + .get("method", {}) + .get("simulation", {}) + .get("program_version") + ) + except (ValueError, KeyError) as e: + logger.warning( + f"Error parsing software version for entry {dataset_id}: {e}") + metadata["software_version"] = software_version + # Molecules and number total of atoms. + total_atoms = None + molecules = None + try: + topology = ( + dataset.get("results", {}).get("material", {}).get("topology", []) + ) + if isinstance(topology, list): + total_atoms = next( + ( + t.get("n_atoms") + for t in topology + if t.get("label") == "original" + ), + None, + ) + molecules = [ + f"{t.get('label')} ({t.get('n_atoms')} atoms)" + for t in topology + if t.get("structural_type") == "molecule" + ] + except (ValueError, KeyError) as e: + logger.warning(f"Error parsing molecules for entry {dataset_id}: {e}") + metadata["nb_atoms"] = total_atoms + metadata["molecule_names"] = molecules + datasets_metadata.append(metadata)""" + logger.info(f"Extracted metadata for {len(datasets_metadata)} datasets.") + return datasets_metadata + + +def normalize_datasets_metadata( + datasets: list[dict], + logger: "loguru.Logger" = loguru.logger, +) -> list[DatasetMetadata]: + """ + Normalize dataset metadata with a Pydantic model. + + Parameters + ---------- + datasets : list[dict] + List of dataset metadata dictionaries. + + Returns + ------- + list[DatasetMetadata] + List of successfully validated `DatasetMetadata` objects. + """ + datasets_metadata = [] + for dataset in datasets: + logger.info( + f"Normalizing metadata for dataset: {dataset['dataset_id_in_repository']}" + ) + normalized_metadata = validate_metadata_against_model( + dataset, DatasetMetadata, logger=logger + ) + if not normalized_metadata: + logger.error( + f"Normalization failed for metadata of dataset " + f"{dataset['dataset_id_in_repository']}" + ) + continue + datasets_metadata.append(normalized_metadata) + logger.info( + "Normalized metadata for " + f"{len(datasets_metadata)}/{len(datasets)} " + f"({len(datasets_metadata) / len(datasets):.0%}) datasets." + ) + return datasets_metadata + + +def extract_files_metadata( + raw_metadata: dict, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict]: + """ + Extract relevant metadata from raw GPCRmd files metadata. + + Parameters + ---------- + raw_metadata: dict + Raw files metadata. + + Returns + ------- + list[dict] + List of select files metadata. 
+ """ + logger.info("Extracting files metadata...") + files_metadata = [] + entry_id = raw_metadata["entry_id"] + for nomad_file in raw_metadata.get("data", {}).get("files", []): + file_path = Path(nomad_file.get("path", "")) + file_name = file_path.name + file_type = file_path.suffix.lstrip(".") + file_path_url = ( + f"https://gpcrmd-lab.eu/prod/v1/gui/search/entries/entry/id/" + f"{entry_id}/files/{file_name}" + ) + size = nomad_file.get("size", None) + + parsed_file = { + "dataset_repository_name": DatasetRepositoryName.GPCRMD, + "dataset_id_in_repository": entry_id, + "file_name": file_name, + "file_type": file_type, + "file_size_in_bytes": size, + "file_url_in_repository": file_path_url, + "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), + } + files_metadata.append(parsed_file) + logger.info(f"Extracted metadata for {len(files_metadata)} files.") + return files_metadata + + +@click.command( + help="Command line interface for MDverse scrapers", + epilog="Happy scraping!", +) +@click.option( + "--output-dir", + "output_dir_path", + type=click.Path(exists=False, file_okay=False, dir_okay=True, path_type=Path), + required=True, + help="Output directory path to save results.", +) +def main(output_dir_path: Path) -> None: + """Scrape molecular dynamics datasets and files from GPCRmd.""" + # Create directories and logger. + output_dir_path = output_dir_path / DatasetProjectName.GPCRMD.value + output_dir_path.mkdir(parents=True, exist_ok=True) + logfile_path = output_dir_path / f"{DatasetProjectName.GPCRMD.value}_scraper.log" + logger = create_logger(logpath=logfile_path, level="INFO") + logger.info("Starting GPCRmd data scraping...") + start_time = time.perf_counter() + # Create HTTPX client + client = create_httpx_client() + # Check connection to GPCRmd API + if is_gpcrmd_connection_working(client, f"{BASE_GPCRMD_URL}"): + logger.success("Connection to GPCRmd API successful!") + else: + logger.critical("Connection to GPCRmd API failed.") + logger.critical("Aborting.") + sys.exit(1) + + # Scrape GPCRmd datasets metadata. + datasets_raw_metadata = scrape_all_datasets( + client, + query_entry_point="entries/query", + logger=logger, + ) + if not datasets_raw_metadata: + logger.critical("No datasets found in GPCRmd.") + logger.critical("Aborting.") + sys.exit(1) + + # Select datasets metadata + datasets_selected_metadata = extract_datasets_metadata( + datasets_raw_metadata, client, logger=logger + ) + """ + # Parse and validate GPCRmd dataset metadata with a pydantic model (DatasetMetadata) + datasets_normalized_metadata = normalize_datasets_metadata( + datasets_selected_metadata, + logger=logger + ) + # Save datasets metadata to parquet file. + export_list_of_models_to_parquet( + output_dir_path + / f"{DatasetProjectName.GPCRMD.value}_{DataType.DATASETS.value}.parquet", + datasets_normalized_metadata, + logger=logger, + ) + # Scrape GPCRmd files metadata. + files_normalized_metadata = scrape_files_for_all_datasets( + client, datasets_normalized_metadata, logger=logger + ) + + # Save files metadata to parquet file. + export_list_of_models_to_parquet( + output_dir_path + / f"{DatasetProjectName.GPCRMD.value}_{DataType.FILES.value}.parquet", + files_normalized_metadata, + logger=logger, + ) + + # Print script duration. 
+ elapsed_time = int(time.perf_counter() - start_time) + logger.success(f"Scraped GPCRmd in: {timedelta(seconds=elapsed_time)} 🎉") + """ + + +if __name__ == "__main__": + main() From 59dbfa4ea5f9397d63a9e70917c287a2c69f270b Mon Sep 17 00:00:00 2001 From: essmaw Date: Mon, 19 Jan 2026 18:01:17 +0100 Subject: [PATCH 2/5] refactor(gpcrmd): finalize scraper integration as a reusable module. --- src/mdverse_scrapers/scrapers/gpcrmd.py | 325 ++++++++++++++++-------- 1 file changed, 226 insertions(+), 99 deletions(-) diff --git a/src/mdverse_scrapers/scrapers/gpcrmd.py b/src/mdverse_scrapers/scrapers/gpcrmd.py index b489f07..b0afa55 100644 --- a/src/mdverse_scrapers/scrapers/gpcrmd.py +++ b/src/mdverse_scrapers/scrapers/gpcrmd.py @@ -6,6 +6,7 @@ frequent drug targets. """ import json +import os import sys import time from datetime import datetime, timedelta @@ -85,14 +86,15 @@ def scrape_all_datasets( if not response: logger.critical("Failed to fetch data from GPCRmd API.") sys.exit(1) - try: - # Get the formatted response with request metadatas in JSON format - all_datasets = response.json() - except (json.decoder.JSONDecodeError, ValueError) as exc: - logger.error(f"Error while parsing GPCRmd response: {exc}") - logger.error("Cannot find datasets.") - logger.critical("Aborting.") - sys.exit(1) + else: + try: + # Get the formatted response with request metadatas in JSON format + all_datasets = response.json() + except (json.decoder.JSONDecodeError, ValueError) as exc: + logger.error(f"Error while parsing GPCRmd response: {exc}") + logger.error("Cannot find datasets.") + logger.critical("Aborting.") + sys.exit(1) logger.success(f"Scraped {len(all_datasets)} datasets in GPCRmd.") return all_datasets @@ -124,17 +126,15 @@ def scrape_files_for_all_datasets( dataset_id = dataset.dataset_id_in_repository files_metadata = scrape_files_for_one_dataset( client, - url=f"{BASE_GPCRMD_URL}/entries/{dataset_id}/rawdir", + url=dataset.dataset_url_in_repository, dataset_id=dataset_id, logger=logger, ) if not files_metadata: continue - # Extract relevant files metadata. - files_selected_metadata = extract_files_metadata(files_metadata, logger=logger) # Normalize files metadata with pydantic model (FileMetadata) logger.info(f"Validating files metadata for dataset: {dataset_id}") - for file_metadata in files_selected_metadata: + for file_metadata in files_metadata: normalized_metadata = validate_metadata_against_model( file_metadata, FileMetadata, @@ -163,12 +163,10 @@ def scrape_files_for_one_dataset( url: str, dataset_id: str, logger: "loguru.Logger" = loguru.logger, -) -> dict | None: +) -> list[dict] | None: """ Scrape files metadata for a given GPCRmd dataset. - Doc: https://gpcrmd-lab.eu/prod/v1/api/v1/extensions/docs#/entries/metadata - Parameters ---------- client : httpx.Client @@ -186,17 +184,34 @@ def scrape_files_for_one_dataset( File metadata dictionary for the dataset. """ logger.info(f"Scraping files for dataset ID: {dataset_id}") - response = make_http_request_with_retries( - client, - url, - method=HttpMethod.GET, - timeout=60, - delay_before_request=0.1, - ) - if not response: + files_metadata: list[dict] = [] + dataset_metadatas = { + "dataset_repository_name": DatasetRepositoryName.GPCRMD, + "dataset_id_in_repository": dataset_id, + "dataset_url_in_repository": url, + } + + # Extract metadata from dataset url page if available. 
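+    # GPCRmd exposes no file-listing API endpoint, so files are discovered by
+    # scraping the dataset page itself.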
+ # Fetch dataset page with url + html_content = fetch_dataset_page(url, dataset_id, client, logger) + if not html_content: logger.error("Failed to fetch files metadata.") return None - return response.json() + + for file_name, file_type, file_size, file_url in extract_files_metadata_from_html( + client, html_content, logger + ): + file_metadata = { + **dataset_metadatas, + "file_name": file_name, + "file_type": file_type, + "file_size_in_bytes": file_size, + "file_url_in_repository": file_url, + "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S") + } + files_metadata.append(file_metadata) + + return files_metadata def fetch_dataset_page(url: str | None, @@ -218,6 +233,7 @@ def fetch_dataset_page(url: str | None, str | None The HTML content of the page if the request is successful, otherwise None. """ + html_content = None if url: html_header = make_http_request_with_retries( client, @@ -269,6 +285,91 @@ def retrieve_metadata(html: str, field_name: str) -> str | None: return metadata +def retrieve_reference_links(html: str) -> list[str]: + """ + Retrieve reference URLs from the References section of a GPCRMD dataset page. + + Parameters + ---------- + html : str + The HTML content of the page. + + Returns + ------- + list[str] | None + List of reference URLs (starting with http:// or https://) if found, + otherwise None. + """ + soup = BeautifulSoup(html, "html.parser") + + header = next( + ( + h + for h in soup.find_all("h3") + if h.get_text(strip=True) == "References" + ), + None, + ) + if not isinstance(header, Tag): + return [] + + content_div = header.find_next_sibling("div", class_="techinfo_content") + if not isinstance(content_div, Tag): + return [] + + return [ + a["href"].strip() + for a in content_div.find_all("a", href=True) + if isinstance(a, Tag) + and a["href"].strip().startswith(("http://", "https://")) + ] + + +def count_simulation_files(html: str) -> int | None: + """ + Count files in the dataset webpage. + + Especially in 'Simulation output files' and 'Simulation protocol \ + & starting files' sections. + + Parameters + ---------- + html : str + The HTML content of the page. + + Returns + ------- + int | None + The number of files related to this dataset. + """ + if html: + # Parse the HTML content + soup = BeautifulSoup(html, "html.parser") + + # Helper function to count unique links in a container div + def count_links(container_id: str) -> int: + # Find the container
by ID + container = soup.find("div", id=container_id) + # Ensure the container is actually a Tag + if not isinstance(container, Tag): + return 0 + + # Collect all hrefs in tags, stripping whitespace + links = [ + str(a.get("href", "")).strip() + for a in container.find_all("a", href=True) + if isinstance(a, Tag) and str(a.get("href", "")).strip() + ] + + # Remove duplicates while preserving order + return len(dict.fromkeys(links)) + + output_files_count = count_links("allfiles") + protocol_files_count = count_links("paramfiles") + return output_files_count + protocol_files_count + return None + + def extract_datasets_metadata( datasets: list[dict[str, Any]], client: httpx.Client, @@ -304,21 +405,22 @@ def extract_datasets_metadata( "title": dataset.get("modelname"), "date_created": dataset.get("creation_timestamp"), "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), - "simulation_program_name": dataset.get("mysoftware"), - "simulation_program_version": dataset.get("software_version"), + "software_name": dataset.get("mysoftware"), + "software_version": dataset.get("software_version"), "forcefield_model_name": dataset.get("forcefield"), "forcefield_model_version": dataset.get("forcefield_version"), - "timestep": dataset.get("timestep"), - "delta": dataset.get("delta"), + "simulation_timestep": dataset.get("timestep"), "nb_atoms": dataset.get("atom_num") } # Extract other metadata from dataset url page if available. # Fetch dataset page with url html_content = fetch_dataset_page(dataset_url, dataset_id, client, logger) if html_content is None: - logger.warning("Error parsing additionnal metadatas from web page for entry" - f" {dataset_id} ({dataset_url})") + logger.warning( + "Error parsing additionnal metadatas from web page for dataset" + f" {dataset_id} ({dataset_url})") logger.warning("Skipping this step.") + datasets_metadata.append(metadata) continue # Author names. @@ -326,49 +428,50 @@ def extract_datasets_metadata( try: author_names = retrieve_metadata(html_content, "Submitted by") except (ValueError, KeyError) as e: - logger.warning(f"Error parsing software name for entry {dataset_id}: {e}") + logger.warning(f"Error parsing author names for entry {dataset_id}: {e}") metadata["author_names"] = (author_names if author_names is None else [author_names]) - - """# Software version - software_version = None + # Description. + description = None try: - software_version = ( - dataset.get("results", {}) - .get("method", {}) - .get("simulation", {}) - .get("program_version") - ) + description = retrieve_metadata(html_content, "Description") except (ValueError, KeyError) as e: - logger.warning( - f"Error parsing software version for entry {dataset_id}: {e}") - metadata["software_version"] = software_version - # Molecules and number total of atoms. - total_atoms = None - molecules = None + logger.warning(f"Error parsing description for entry {dataset_id}: {e}") + metadata["description"] = description + # Simulation time. 
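+        # Parsed from the page label "Accumulated simulation time" and kept as
+        # a one-element list (e.g. ["3.0 µs"]), or an empty list when absent.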
+ stime_list = None try: - topology = ( - dataset.get("results", {}).get("material", {}).get("topology", []) - ) - if isinstance(topology, list): - total_atoms = next( - ( - t.get("n_atoms") - for t in topology - if t.get("label") == "original" - ), - None, - ) - molecules = [ - f"{t.get('label')} ({t.get('n_atoms')} atoms)" - for t in topology - if t.get("structural_type") == "molecule" - ] + stime = retrieve_metadata(html_content, "Accumulated simulation time") + stime_list = [stime] if stime else [] except (ValueError, KeyError) as e: - logger.warning(f"Error parsing molecules for entry {dataset_id}: {e}") - metadata["nb_atoms"] = total_atoms - metadata["molecule_names"] = molecules - datasets_metadata.append(metadata)""" + logger.warning(f"Error parsing simulation time for entry {dataset_id}: {e}") + metadata["simulation_time"] = stime_list + # Reference links. + refs = None + try: + refs = retrieve_reference_links(html_content) + except (ValueError, KeyError) as e: + logger.warning(f"Error parsing reference links for entry {dataset_id}: {e}") + metadata["external_links"] = refs + # Number of files. + nb_files = None + try: + nb_files: int | None = count_simulation_files(html_content) + except (ValueError, KeyError) as e: + logger.warning(f"Error parsing number of files for entry {dataset_id}: {e}") + metadata["nb_files"] = nb_files + # Molecule names. + molecule_names = None + try: + dyncomp: list[dict[str, Any]] = dataset.get("dyncomp", []) + molecule_names: list[str] = ( + [comp.get("resname") for comp in dyncomp if comp.get("resname")]) + except (ValueError, KeyError) as e: + logger.warning(f"Error parsing molecule names for entry {dataset_id}: {e}") + metadata["molecule_names"] = molecule_names + # Adding full metadatas of the dataset + datasets_metadata.append(metadata) + logger.info(f"Extracted metadata for {len(datasets_metadata)} datasets.") return datasets_metadata @@ -413,47 +516,73 @@ def normalize_datasets_metadata( return datasets_metadata -def extract_files_metadata( - raw_metadata: dict, +def extract_files_metadata_from_html( + client: httpx.Client, + html_content: str, logger: "loguru.Logger" = loguru.logger, -) -> list[dict]: +) -> list[tuple[str, str, int, str]]: """ Extract relevant metadata from raw GPCRmd files metadata. Parameters ---------- - raw_metadata: dict - Raw files metadata. + client : httpx.Client + The HTTPX client to use for making requests. + html_content : str + HTML content of the dataset page. + logger: "loguru.Logger" + Logger for logging messages. Returns ------- - list[dict] - List of select files metadata. + list[tuple[str, str, int, str]] + Tuples of (file_name, file_type, int, file_url). + Empty if none found. 
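+        The third element is the file size in bytes (0 when the HEAD request
+        does not return a usable Content-Length header).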
""" logger.info("Extracting files metadata...") files_metadata = [] - entry_id = raw_metadata["entry_id"] - for nomad_file in raw_metadata.get("data", {}).get("files", []): - file_path = Path(nomad_file.get("path", "")) - file_name = file_path.name - file_type = file_path.suffix.lstrip(".") - file_path_url = ( - f"https://gpcrmd-lab.eu/prod/v1/gui/search/entries/entry/id/" - f"{entry_id}/files/{file_name}" - ) - size = nomad_file.get("size", None) + soup = BeautifulSoup(html_content, "html.parser") + + # Loop over the two possible sections: dataset files and parameter files + for sec_id in ("allfiles", "paramfiles"): + container = soup.find("div", id=sec_id) + # Warn if mandatory section is missing () + if not isinstance(container, Tag): + if sec_id == "allfiles": + logger.warning( + f"Mandatory section `{sec_id}` is missing or invalid." + "Files cannot be retrieved, skipping this step.") + continue + + for link in container.find_all("a", href=True): + # Encountered a non-Tag element in links + if not isinstance(link, Tag): + continue + href_value = str(link.get("href", "")).strip() + # Found link without 'href' + if not href_value: + continue + + file_url = f"https://www.gpcrmd.org/{href_value}" + file_name = os.path.basename(file_url) + file_type = os.path.splitext(file_name)[1].lstrip(".").lower() + + # Fetch the file size using a HEAD request + response = make_http_request_with_retries( + client, + file_url, + method=HttpMethod.HEAD, + timeout=60, + delay_before_request=0.2, + ) + if response and response.headers: + file_size = int(response.headers.get("Content-Length", 0)) + else: + file_size = 0 + logger.warning(f"Could not retrieve file size for '{file_name}'") + + files_metadata.append((file_name, file_type, file_size, file_url)) - parsed_file = { - "dataset_repository_name": DatasetRepositoryName.GPCRMD, - "dataset_id_in_repository": entry_id, - "file_name": file_name, - "file_type": file_type, - "file_size_in_bytes": size, - "file_url_in_repository": file_path_url, - "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), - } - files_metadata.append(parsed_file) - logger.info(f"Extracted metadata for {len(files_metadata)} files.") return files_metadata @@ -490,19 +619,18 @@ def main(output_dir_path: Path) -> None: # Scrape GPCRmd datasets metadata. datasets_raw_metadata = scrape_all_datasets( client, - query_entry_point="entries/query", + query_entry_point=BASE_GPCRMD_URL, logger=logger, ) if not datasets_raw_metadata: logger.critical("No datasets found in GPCRmd.") logger.critical("Aborting.") sys.exit(1) - # Select datasets metadata datasets_selected_metadata = extract_datasets_metadata( datasets_raw_metadata, client, logger=logger ) - """ + # Parse and validate GPCRmd dataset metadata with a pydantic model (DatasetMetadata) datasets_normalized_metadata = normalize_datasets_metadata( datasets_selected_metadata, @@ -531,7 +659,6 @@ def main(output_dir_path: Path) -> None: # Print script duration. elapsed_time = int(time.perf_counter() - start_time) logger.success(f"Scraped GPCRmd in: {timedelta(seconds=elapsed_time)} 🎉") - """ if __name__ == "__main__": From 438823d8040066e47ea902f3146e00d65d724857 Mon Sep 17 00:00:00 2001 From: essmaw Date: Mon, 19 Jan 2026 18:02:27 +0100 Subject: [PATCH 3/5] docs(readme): update command for GPCRmd scraper module. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 41cc932..c6ed3d8 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ The scraping takes about 2 h. 
Scrape GPCRmd to collect molecular dynamics (MD) datasets and files related to G-protein-coupled receptors (GPCRs), a major family of membrane proteins and common drug targets.

```bash
-uv run -m scripts.scrape_gpcrmd
+uv run scrape-gpcrmd --output-dir data
```

This command will:

From ef7dd7ed1df654963866037408b130272043596a Mon Sep 17 00:00:00 2001
From: essmaw
Date: Tue, 20 Jan 2026 18:52:15 +0100
Subject: [PATCH 4/5] docs: Adding the documentation for scraping GPCRmd.

---
 docs/gpcrmd.md | 113 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 docs/gpcrmd.md

diff --git a/docs/gpcrmd.md b/docs/gpcrmd.md
new file mode 100644
index 0000000..5c4f228
--- /dev/null
+++ b/docs/gpcrmd.md
@@ -0,0 +1,113 @@
# GPCRmd

> GPCRmd is an online platform for visualizing, analyzing, and sharing molecular dynamics simulations of G-protein-coupled receptors (GPCRs), a key family of membrane proteins and common drug targets.

- website: https://www.gpcrmd.org/
- documentation: https://gpcrmd-docs.readthedocs.io/en/latest/index.html
- API: https://www.gpcrmd.org/api/
  - `version v1.3`

No account or token is needed to access the GPCRmd API.


## Finding molecular dynamics datasets and files

Although GPCRmd provides a public API to discover molecular dynamics datasets, **some important metadata fields and all file-level information are not exposed via the API**. For this reason, web scraping of the dataset page is required to retrieve complete dataset descriptions and file metadata.

### Datasets

In GPCRmd, a dataset (a simulation and its related files) is called a "dynamic".

API entry point to search for all datasets at once:

- Path: /search_all/info/
- [documentation](https://gpcrmd-docs.readthedocs.io/en/latest/api.html#main-gpcrmd-api)


#### Dataset metadata retrieved via API:

| Field | Description |
| ------------------ | ----------------------------------- |
| `dyn_id` | *Unique dynamic (dataset) identifier* |
| `modelname` | *Name of the simulated model* |
| `timestep` | *MD integration timestep* |
| `atom_num` | *Number of atoms* |
| `mysoftware` | *MD engine used* |
| `software_version` | *Version of the MD engine* |
| `forcefield` | *Force field and model name* |
| `forcefield_version` | *Force field and model version* |
| `creation_timestamp` | *Dataset creation date* |
| `dataset_url` | *URL of the dataset web page* |

#### Dataset metadata retrieved via web scraping (URL provided by the API):

| Field | Description |
| -------------------- | ------------------------------------------ |
| `description` | *Full textual description of the simulation* |
| `authors` | *Dataset authors* |
| `simulation_time` | *Total simulation length* |


### Files

The GPCRmd API does not provide any endpoint to access file-level metadata. All file information must therefore be extracted from the dataset web page. Two file categories are available: **simulation output files** and **simulation protocol and starting files**.
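The sketch below shows how this page scraping can be done with `httpx` and BeautifulSoup (the libraries used by the scraper itself). It is a minimal illustration, assuming that links to simulation files contain the `/dynadb/files/Dynamics/` path segment, as in the example URLs listed next.

```python
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup

DATASET_URL = "https://www.gpcrmd.org/dynadb/dynamics/id/7/"

with httpx.Client(follow_redirects=True, timeout=60) as client:
    # Fetch the dataset page and collect the links that point to simulation files.
    soup = BeautifulSoup(client.get(DATASET_URL).text, "html.parser")
    file_urls = sorted(
        {
            urljoin("https://www.gpcrmd.org/", a["href"].strip())
            for a in soup.find_all("a", href=True)
            if "/dynadb/files/Dynamics/" in a["href"]
        }
    )
    for url in file_urls:
        # A HEAD request exposes the size (Content-Length) without downloading the file.
        size = client.head(url).headers.get("Content-Length")
        print(url.rsplit("/", 1)[-1], size)
```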
For example, the files for dataset `7` (https://www.gpcrmd.org/dynadb/dynamics/id/7/) include:
- https://www.gpcrmd.org/dynadb/files/Dynamics/10166_trj_7.dcd
- https://www.gpcrmd.org/dynadb/files/Dynamics/10167_dyn_7.psf
- https://www.gpcrmd.org/dynadb/files/Dynamics/10168_dyn_7.pdb


#### File metadata retrieved via web scraping (URL provided by the API):

| Field | Description |
| ---------- | ---------------------- |
| `file_name` | *Name of the file* |
| `file_type` | *File extension* |
| `file_path` | *Public download URL* |
| `file_size` | *File size in bytes* |

> 💡 File size is obtained with an HTTP `HEAD` request on the file path, **avoiding a full file download**.


## Examples

### Dataset ID 2316

- [Dataset on GPCRmd GUI](https://www.gpcrmd.org/dynadb/dynamics/id/2316/)
- [Dataset on GPCRmd API](https://www.gpcrmd.org/api/search_dyn/info/2316)


#### Dataset metadata (API + scraping)

| Field | Value |
| ------------------ | ----------------------------------- |
| `dyn_id` | *2316* |
| `modelname` | *FFA2_TUG1375_Gi1-TUG1375* |
| `timestep` | *2* |
| `atom_num` | *4829* |
| `mysoftware` | *AMBER PMEMD.CUDA* |
| `software_version` | *2020* |
| `forcefield` | *ff19SB/lipid21/GAFF2* |
| `forcefield_version` | *ff19SB/lipid21* |
| `creation_timestamp` | *2025-05-13* |
| `dataset_url` | *https://www.gpcrmd.org/dynadb/dynamics/id/2316/* |
| `description` | *Simulation aims to observe structural features of FFA2 without an orthosteric agonist and G-protein, which will be compared to docking-based simulations of allosteric activators...* |
| `authors` | *Abdul-Akim Guseinov, University of Glasgow* |
| `simulation_time` | *3.0 µs* |


- [Files on GPCRmd GUI](https://www.gpcrmd.org/dynadb/dynamics/id/2316/) (accessible via the *Technical Information* section)

#### Example file from the dataset

| Field | Value |
| ---------- | ---------------------- |
| `file_name` | *tmp_dyn_0_2667.pdb* |
| `file_type` | *pdb* |
| `file_path` | *https://www.gpcrmd.org/dynadb/files/Dynamics/dyn2667/tmp_dyn_0_2667.pdb* |
| `file_size` | *1 024 bytes* |


## References

Rodríguez-Espigares, I., Torrens-Fontanals, M., Tiemann, J.K.S. et al. GPCRmd uncovers the dynamics of the 3D-GPCRome. Nat Methods. 2020;17(8):777-787. doi:[10.1038/s41592-020-0884-y](https://www.nature.com/articles/s41592-020-0884-y)
\ No newline at end of file

From dc8eb03a5253b2cbb0f8a45ad1431e3734efec4c Mon Sep 17 00:00:00 2001
From: essmaw
Date: Tue, 20 Jan 2026 18:55:50 +0100
Subject: [PATCH 5/5] refactor(gpcrmd): incorporate @pierrepo review, streamline dataset scraping and metadata extraction.

---
 src/mdverse_scrapers/scrapers/gpcrmd.py | 371 ++++++++++++------------
 1 file changed, 187 insertions(+), 184 deletions(-)

diff --git a/src/mdverse_scrapers/scrapers/gpcrmd.py b/src/mdverse_scrapers/scrapers/gpcrmd.py
index b0afa55..8da6a2c 100644
--- a/src/mdverse_scrapers/scrapers/gpcrmd.py
+++ b/src/mdverse_scrapers/scrapers/gpcrmd.py
@@ -1,12 +1,7 @@
-"""Scrape molecular dynamics simulation datasets and files from GPCRmd.
+"""Scrape molecular dynamics simulation datasets and files from GPCRmd."""
+

-This script scrapes molecular dynamics datasets from the GPCRmd repository
-(https://www.gpcrmd.org/), a platform dedicated to simulations of
-G-protein-coupled receptors (GPCRs), a major family of membrane proteins and
-frequent drug targets.
-""" import json -import os import sys import time from datetime import datetime, timedelta @@ -30,7 +25,7 @@ from ..models.file import FileMetadata from ..models.utils import validate_metadata_against_model -BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/info/" +BASE_GPCRMD_URL = "https://www.gpcrmd.org/api/search_all/" def is_gpcrmd_connection_working( @@ -100,152 +95,42 @@ def scrape_all_datasets( return all_datasets -def scrape_files_for_all_datasets( - client: httpx.Client, - datasets: list[DatasetMetadata], - logger: "loguru.Logger" = loguru.logger, -) -> list[FileMetadata]: - """Scrape files metadata for all datasets in GPCRmd. - - Parameters - ---------- - client : httpx.Client - The HTTPX client to use for making requests. - datasets : list[DatasetMetadata] - List of datasets to scrape files metadata for. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - list[FileMetadata] - List of successfully validated `FileMetadata` objects. - """ - all_files_metadata = [] - for dataset_count, dataset in enumerate(datasets, start=1): - dataset_id = dataset.dataset_id_in_repository - files_metadata = scrape_files_for_one_dataset( - client, - url=dataset.dataset_url_in_repository, - dataset_id=dataset_id, - logger=logger, - ) - if not files_metadata: - continue - # Normalize files metadata with pydantic model (FileMetadata) - logger.info(f"Validating files metadata for dataset: {dataset_id}") - for file_metadata in files_metadata: - normalized_metadata = validate_metadata_against_model( - file_metadata, - FileMetadata, - logger=logger, - ) - if not normalized_metadata: - logger.error( - f"Normalization failed for metadata of file " - f"{file_metadata.get('file_name')} " - f"in dataset {dataset_id}" - ) - continue - all_files_metadata.append(normalized_metadata) - logger.info("Done.") - logger.info(f"Total files: {len(all_files_metadata):,}") - logger.info( - "Extracted and validated files metadata for " - f"{dataset_count:,}/{len(datasets):,} " - f"({dataset_count / len(datasets):.0%}) datasets." - ) - return all_files_metadata - - -def scrape_files_for_one_dataset( - client: httpx.Client, - url: str, - dataset_id: str, - logger: "loguru.Logger" = loguru.logger, -) -> list[dict] | None: - """ - Scrape files metadata for a given GPCRmd dataset. - - Parameters - ---------- - client : httpx.Client - The HTTPX client to use for making requests. - url : str - The URL endpoint. - dataset_id : str - The unique identifier of the dataset in GPCRmd. - logger: "loguru.Logger" - Logger for logging messages. - - Returns - ------- - dict | None - File metadata dictionary for the dataset. - """ - logger.info(f"Scraping files for dataset ID: {dataset_id}") - files_metadata: list[dict] = [] - dataset_metadatas = { - "dataset_repository_name": DatasetRepositoryName.GPCRMD, - "dataset_id_in_repository": dataset_id, - "dataset_url_in_repository": url, - } - - # Extract metadata from dataset url page if available. 
- # Fetch dataset page with url - html_content = fetch_dataset_page(url, dataset_id, client, logger) - if not html_content: - logger.error("Failed to fetch files metadata.") - return None - - for file_name, file_type, file_size, file_url in extract_files_metadata_from_html( - client, html_content, logger - ): - file_metadata = { - **dataset_metadatas, - "file_name": file_name, - "file_type": file_type, - "file_size_in_bytes": file_size, - "file_url_in_repository": file_url, - "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S") - } - files_metadata.append(file_metadata) - - return files_metadata - - -def fetch_dataset_page(url: str | None, - dataset_id: str | None, +def fetch_all_datasets_page( client: httpx.Client, + datasets: list[dict], logger: "loguru.Logger" = loguru.logger -) -> str | None: +) -> list[str | None]: """Fetch an dataset page and return its HTML content. Parameters ---------- - url : str - The URL of the dataset page to fetch. client : httpx.Client The HTTPX client to use for making requests. + datasets : List[Dict[str, Any]] + List of raw GPCRmd datasets metadata. Returns ------- str | None The HTML content of the page if the request is successful, otherwise None. """ - html_content = None - if url: - html_header = make_http_request_with_retries( - client, - url, - method=HttpMethod.GET, - timeout=60, - delay_before_request=0.2, - ) - if html_header: - html_content = html_header.text + datasets_html_page = [] + for dataset in datasets: + url = dataset.get("url") + html_content = None + if url: + html_header = make_http_request_with_retries( + client, + url, + method=HttpMethod.GET, + timeout=60, + delay_before_request=0.2, + ) + if html_header: + html_content = html_header.text + datasets_html_page.append(html_content) - return html_content + return datasets_html_page def retrieve_metadata(html: str, field_name: str) -> str | None: @@ -371,8 +256,9 @@ def count_links(container_id: str) -> int: def extract_datasets_metadata( - datasets: list[dict[str, Any]], client: httpx.Client, + datasets: list[dict[str, Any]], + datasets_html_page: list[str | None], logger: "loguru.Logger" = loguru.logger, ) -> list[dict]: """ @@ -380,8 +266,10 @@ def extract_datasets_metadata( Parameters ---------- - datasets : List[Dict[str, Any]] + datasets : list[dict[str, Any]] List of raw GPCRmd datasets metadata. + datasets_html_page: list[str | None] + List of html content of the dataset web page. client : httpx.Client The HTTPX client to use for making requests. @@ -391,7 +279,7 @@ def extract_datasets_metadata( List of dataset metadata dictionaries. """ datasets_metadata = [] - for dataset in datasets: + for dataset, html_content in zip(datasets, datasets_html_page, strict=True): dataset_id = str(dataset.get("dyn_id")) logger.info(f"Extracting relevant metadata for dataset: {dataset_id}") dataset_url = dataset.get("url") @@ -413,8 +301,6 @@ def extract_datasets_metadata( "nb_atoms": dataset.get("atom_num") } # Extract other metadata from dataset url page if available. 
- # Fetch dataset page with url - html_content = fetch_dataset_page(dataset_url, dataset_id, client, logger) if html_content is None: logger.warning( "Error parsing additionnal metadatas from web page for dataset" @@ -543,49 +429,160 @@ def extract_files_metadata_from_html( files_metadata = [] soup = BeautifulSoup(html_content, "html.parser") - # Loop over the two possible sections: dataset files and parameter files - for sec_id in ("allfiles", "paramfiles"): - container = soup.find("div", id=sec_id) - # Warn if mandatory section is missing () - if not isinstance(container, Tag): - if sec_id == "allfiles": - logger.warning( - f"Mandatory section `{sec_id}` is missing or invalid." - "Files cannot be retrieved, skipping this step.") + # Find all tags with href containing the files path + for link in soup.find_all("a", href=True): + href_value = link.get("href", "").strip() + if not href_value or "/dynadb/files/Dynamics/" not in href_value: continue - for link in container.find_all("a", href=True): - # Encountered a non-Tag element in links - if not isinstance(link, Tag): - continue - href_value = str(link.get("href", "")).strip() - # Found link without 'href' - if not href_value: - continue + file_url = f"https://www.gpcrmd.org/{href_value}" + # Example of file urls: + # From dataset ID: 2316 (https://www.gpcrmd.org/dynadb/dynamics/id/2316/) + # 1. https://www.gpcrmd.org/dynadb/files/Dynamics/dyn2667/tmp_dyn_0_2667.pdb + # 2. https://www.gpcrmd.org/dynadb/files/Dynamics/dyn2667/25400_trj_2316.dcd - file_url = f"https://www.gpcrmd.org/{href_value}" - file_name = os.path.basename(file_url) - file_type = os.path.splitext(file_name)[1].lstrip(".").lower() + file_name = Path(file_url).name + file_type = Path(file_name).suffix.lstrip(".").lower() - # Fetch the file size using a HEAD request - response = make_http_request_with_retries( - client, - file_url, - method=HttpMethod.HEAD, - timeout=60, - delay_before_request=0.2, - ) - if response and response.headers: - file_size = int(response.headers.get("Content-Length", 0)) - else: - file_size = 0 - logger.warning(f"Could not retrieve file size for '{file_name}'") + # Fetch the file size using a HEAD request + response = make_http_request_with_retries( + client, + file_url, + method=HttpMethod.HEAD, + timeout=60, + delay_before_request=0.2, + ) + if response and response.headers: + file_size = int(response.headers.get("Content-Length", 0)) + else: + file_size = None + logger.warning(f"Could not retrieve file size for '{file_name}'") - files_metadata.append((file_name, file_type, file_size, file_url)) + files_metadata.append((file_name, file_type, file_size, file_url)) return files_metadata +def scrape_files_for_one_dataset( + client: httpx.Client, + url: str, + html_content: str | None, + dataset_id: str, + logger: "loguru.Logger" = loguru.logger, +) -> list[dict] | None: + """ + Scrape files metadata for a given GPCRmd dataset. + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + url : str + The URL endpoint. + html_content: str | None + Html content of the dataset web page. + dataset_id : str + The unique identifier of the dataset in GPCRmd. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + dict | None + File metadata dictionary for the dataset. 
+ """ + logger.info(f"Scraping files for dataset ID: {dataset_id}") + files_metadata: list[dict] = [] + datasets_metadata = { + "dataset_repository_name": DatasetRepositoryName.GPCRMD, + "dataset_id_in_repository": dataset_id, + "dataset_url_in_repository": url, + } + + # Extract metadata from dataset url page if available. + if not html_content: + logger.error("Failed to fetch files metadata.") + return None + + for file_name, file_type, file_size, file_url in extract_files_metadata_from_html( + client, html_content, logger + ): + file_metadata = { + **datasets_metadata, + "file_name": file_name, + "file_type": file_type, + "file_size_in_bytes": file_size, + "file_url_in_repository": file_url, + "date_last_fetched": datetime.now().strftime("%Y-%m-%dT%H:%M:%S") + } + files_metadata.append(file_metadata) + + return files_metadata + + +def scrape_files_for_all_datasets( + client: httpx.Client, + datasets: list[DatasetMetadata], + datasets_html_page: list[str | None], + logger: "loguru.Logger" = loguru.logger, +) -> list[FileMetadata]: + """Scrape files metadata for all datasets in GPCRmd. + + Parameters + ---------- + client : httpx.Client + The HTTPX client to use for making requests. + datasets : list[DatasetMetadata] + List of datasets to scrape files metadata for. + datasets_html_page: list[str | None] + List of html content of the dataset web page. + logger: "loguru.Logger" + Logger for logging messages. + + Returns + ------- + list[FileMetadata] + List of successfully validated `FileMetadata` objects. + """ + all_files_metadata = [] + for dataset_count, dataset in enumerate(datasets, start=1): + dataset_id = dataset.dataset_id_in_repository + dataset_html_page = datasets_html_page[dataset_count - 1] + files_metadata = scrape_files_for_one_dataset( + client, + url=dataset.dataset_url_in_repository, + html_content=dataset_html_page, + dataset_id=dataset_id, + logger=logger, + ) + if not files_metadata: + continue + # Normalize files metadata with pydantic model (FileMetadata) + logger.info(f"Validating files metadata for dataset: {dataset_id}") + for file_metadata in files_metadata: + normalized_metadata = validate_metadata_against_model( + file_metadata, + FileMetadata, + logger=logger, + ) + if not normalized_metadata: + logger.error( + f"Normalization failed for metadata of file " + f"{file_metadata.get('file_name')} " + f"in dataset {dataset_id}" + ) + continue + all_files_metadata.append(normalized_metadata) + logger.info("Done.") + logger.info(f"Total files: {len(all_files_metadata):,}") + logger.info( + "Extracted and validated files metadata for " + f"{dataset_count:,}/{len(datasets):,} " + f"({dataset_count / len(datasets):.0%}) datasets." + ) + return all_files_metadata + + @click.command( help="Command line interface for MDverse scrapers", epilog="Happy scraping!", @@ -600,7 +597,8 @@ def extract_files_metadata_from_html( def main(output_dir_path: Path) -> None: """Scrape molecular dynamics datasets and files from GPCRmd.""" # Create directories and logger. 
- output_dir_path = output_dir_path / DatasetProjectName.GPCRMD.value + output_dir_path = (output_dir_path / DatasetProjectName.GPCRMD.value + / datetime.now().strftime("%Y-%m-%d")) output_dir_path.mkdir(parents=True, exist_ok=True) logfile_path = output_dir_path / f"{DatasetProjectName.GPCRMD.value}_scraper.log" logger = create_logger(logpath=logfile_path, level="INFO") @@ -609,7 +607,7 @@ def main(output_dir_path: Path) -> None: # Create HTTPX client client = create_httpx_client() # Check connection to GPCRmd API - if is_gpcrmd_connection_working(client, f"{BASE_GPCRMD_URL}"): + if is_gpcrmd_connection_working(client, f"{BASE_GPCRMD_URL}pdbs/"): logger.success("Connection to GPCRmd API successful!") else: logger.critical("Connection to GPCRmd API failed.") @@ -619,18 +617,23 @@ def main(output_dir_path: Path) -> None: # Scrape GPCRmd datasets metadata. datasets_raw_metadata = scrape_all_datasets( client, - query_entry_point=BASE_GPCRMD_URL, + query_entry_point=f"{BASE_GPCRMD_URL}info/", logger=logger, ) if not datasets_raw_metadata: logger.critical("No datasets found in GPCRmd.") logger.critical("Aborting.") sys.exit(1) + datasets_raw_metadata = datasets_raw_metadata[:4] + + # Fetch the dataset page for all datasets + datasets_html_page = fetch_all_datasets_page( + client, datasets_raw_metadata, logger + ) # Select datasets metadata datasets_selected_metadata = extract_datasets_metadata( - datasets_raw_metadata, client, logger=logger + client, datasets_raw_metadata, datasets_html_page, logger=logger ) - # Parse and validate GPCRmd dataset metadata with a pydantic model (DatasetMetadata) datasets_normalized_metadata = normalize_datasets_metadata( datasets_selected_metadata, @@ -645,7 +648,7 @@ def main(output_dir_path: Path) -> None: ) # Scrape GPCRmd files metadata. files_normalized_metadata = scrape_files_for_all_datasets( - client, datasets_normalized_metadata, logger=logger + client, datasets_normalized_metadata, datasets_html_page, logger=logger ) # Save files metadata to parquet file.