Skip to content

Commit 1ebfb4f

Browse files
authored
Map biotools ID and name gh->bt (#115)
* feat: implement name mapping function
* feat: wire in the mapping
* feat: implement and wire the mapping for bt id
* feat: add optional equality func param to policy
* chore: use the new equality param in name mapping
* doc: add placeholders for process diagrams for new maps
1 parent b68a3c7 commit 1ebfb4f

File tree

9 files changed

+225
-5
lines changed

9 files changed

+225
-5
lines changed

docs/source/api_reference/diagrams.rst

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,17 @@ Map publications
5757
Map documentation
5858
~~~~~~~~~~~~~~~~~~~~
5959

60-
TODO: add diagram here once errors in the mapping function are resolved.
60+
TODO: add diagram here.
61+
62+
Map biotools ID
63+
~~~~~~~~~~~~~~~~~~
64+
65+
TODO: add diagram here.
66+
67+
Map name
68+
~~~~~~~~~~
69+
70+
TODO: add diagram here.
6171

6272
Map homepage
6373
~~~~~~~~~~~~~~
@@ -114,7 +124,7 @@ Map topics
114124
Map version
115125
~~~~~~~~~~~~~~
116126

117-
TODO: add diagram here once errors in the mapping function are resolved.
127+
TODO: add diagram here.
118128

119129

120130
bio.tools → GitHub mapping for pull requests

src/bridge/pipelines/gh2bt_for_meta/main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ async def run(args: GitHubToBiotoolsForMetaPipelineArgs) -> BiotoolsToolModel:
7777
)
7878

7979
biotools_metadata = BiotoolsToolModel(
80-
name=github_repo.repo.name,
80+
biotoolsID=await mapper.map["biotools_id"].run(),
81+
name=await mapper.map["name"].run(),
8182
description=((await mapper.map["description"].run()) or "***Please change this description***"),
8283
homepage=await mapper.map["homepage"].run(),
8384
maturity=await mapper.map["maturity"].run(),

src/bridge/pipelines/gh2bt_for_meta/map.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,14 @@
88
from bridge.pipelines.utils import load_dict_from_yaml_file
99

1010
from .map_funcs import (
11+
map_biotools_id,
1112
map_description,
1213
map_documentation,
1314
map_homepage,
1415
map_language,
1516
map_license,
1617
map_maturity,
18+
map_name,
1719
map_publication,
1820
map_version,
1921
)
@@ -34,7 +36,15 @@ def map(self) -> dict[str, MapItem]:
3436
Map GitHub metadata property to corresponding bio.tools property.
3537
"""
3638
return {
37-
"name": MapItem(schema_entry=self.metadata.name, repo_entry=self.repo.repo.name, method=Method.EXACT),
39+
"biotools_id": MapItem(
40+
schema_entry=self.metadata.biotoolsID,
41+
repo_entry=self.repo.repo.name,
42+
method=Method.FUZZY,
43+
fn=map_biotools_id,
44+
),
45+
"name": MapItem(
46+
schema_entry=self.metadata.name, repo_entry=self.repo.repo.name, method=Method.EXACT, fn=map_name
47+
),
3848
# Languages are both lists
3949
"language": MapItem(
4050
schema_entry=self.metadata.language,

src/bridge/pipelines/gh2bt_for_meta/map_funcs/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
Individual mapping functions for GitHub to bio.tools.
33
"""
44

5+
from .biotools_id import map_biotools_id
56
from .description import map_description
67
from .documentation import map_documentation
78
from .homepage import map_homepage
89
from .language import map_language
910
from .license import map_license
1011
from .maturity import map_maturity
12+
from .name import map_name
1113
from .publication import map_publication
1214
from .version import map_version
1315

@@ -20,4 +22,6 @@
2022
"map_maturity",
2123
"map_version",
2224
"map_publication",
25+
"map_name",
26+
"map_biotools_id",
2327
]
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""
2+
Map GitHub repository name to bio.tools ID.
3+
4+
This module reconciles the GitHub repository name with the bio.tools ID.
5+
It applies a policy that prefers the GitHub name while ensuring uniqueness
6+
within bio.tools, generating alternative IDs if necessary.
7+
"""
8+
9+
import httpx
10+
11+
from bridge.builders import compose_biotools_metadata
12+
from bridge.logging import get_user_logger
13+
from bridge.pipelines.utils import normalize_text, str_contain_each_other
14+
15+
logger = get_user_logger()
16+
17+
18+
async def _matching_biotools_id_exists(biotools_id: str) -> bool:
    """
    Check if a bio.tools entry with the given ID already exists.

    The check is performed by attempting to fetch bio.tools metadata
    for the given ID. If the fetch raises an ``httpx.HTTPStatusError``
    whose response status is 404, the ID is reported as not existing.
    Any other error is logged as a note and treated as an indication
    that the ID may exist, so that a possibly taken ID is never handed out.

    Parameters
    ----------
    biotools_id : str
        Candidate bio.tools ID to check.

    Returns
    -------
    bool
        True if a matching bio.tools entry exists or the check itself
        failed, False if the lookup returned nothing or a 404.
    """
    try:
        matching_bt_metadata = await compose_biotools_metadata(identifier=biotools_id)
    except Exception as e:
        # Deliberately broad: any failure other than a clean 404 must err on
        # the side of "ID is taken" instead of propagating to the caller.
        if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
            return False

        # A failed lookup adds nothing to the metadata, so it is reported as a
        # note rather than through the "added" log category.
        logger.note(f"Failed to check existing bio.tools ID '{biotools_id}': {e}. Assuming it exists.")
        return True

    return matching_bt_metadata is not None
47+
48+
49+
async def map_biotools_id(gh_name: str | None, bt_id: str | None) -> str | None:
    """
    Map and reconcile GitHub repository name to bio.tools ID.

    Policy:
    1. If no GitHub repo name is available (``None`` or empty after
       normalization), preserve existing bio.tools ID (even if None).
    2. If a non-empty existing bio.tools ID and the GitHub repo name contain
       each other (case-insensitive), preserve the existing bio.tools ID.
    3. If they do not contain each other, log a conflict but continue.
    4. If the GitHub repo name is not already used as a bio.tools ID,
       use it as the new bio.tools ID.
    5. If the GitHub repo name is already used as a bio.tools ID,
       attempt to generate a unique ID by appending suffixes ``-1``, ``-2``, ...
       up to ``-99``. If a unique ID is found, use it.
    6. If no unique ID can be generated, log a note and return ``None``,
       requiring manual intervention.

    Parameters
    ----------
    gh_name : str | None
        GitHub repository name.
    bt_id : str | None
        Existing bio.tools ID.

    Returns
    -------
    str | None
        Mapped bio.tools ID (the normalized GitHub name, possibly suffixed,
        or the preserved existing ID), or ``None`` if mapping failed.
    """
    gh_norm = normalize_text(gh_name) if gh_name is not None else ""

    # An empty normalized name cannot serve as an ID and would trivially be
    # "contained" in any existing ID, so it is handled like a missing name.
    if not gh_norm:
        logger.unchanged("No GitHub repo name found, nothing to map.")
        return bt_id

    # An empty existing ID is treated as absent: the empty string is a
    # substring of every name and would otherwise be preserved as a match.
    bt_norm = normalize_text(bt_id) if bt_id else ""

    if bt_norm:
        # Evaluate the containment check once and branch on the result.
        if str_contain_each_other(gh_norm, bt_norm):
            logger.exact(f"bio.tools ID '{bt_id}' and GitHub repo name '{gh_name}' contain each other")
            return bt_id

        logger.conflict(f"bio.tools ID '{bt_id}' and GitHub repo name '{gh_name}' do not contain each other")

    if not await _matching_biotools_id_exists(gh_norm):
        logger.added(f"Using GitHub repo name '{gh_name}' as bio.tools ID")
        return gh_norm

    logger.conflict(
        f"GitHub repo name '{gh_name}' cannot be used as bio.tools ID because it matches an existing entry. "
        "Trying to generate a unique bio.tools ID."
    )

    # Probe candidates sequentially; the first free suffix (1..99) wins.
    for suffix in range(1, 100):
        candidate_id = f"{gh_norm}-{suffix}"
        if not await _matching_biotools_id_exists(candidate_id):
            logger.added(f"Using generated bio.tools ID '{candidate_id}'")
            return candidate_id

    logger.note(
        f"Failed to generate unique bio.tools ID based on GitHub repo name '{gh_name}'. "
        "Please set bio.tools ID manually."
    )
    return None
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Mapping name from GitHub to bio.tools.
3+
4+
This module reconciles the GitHub repository name with the bio.tools name.
5+
It compares the two names and applies a policy that prefers the GitHub name
6+
while preserving the bio.tools name when GitHub is silent or ambiguous.
7+
"""
8+
9+
from bridge.logging import get_user_logger
10+
from bridge.pipelines.policies.gh2bt import reconcile_gh_over_bt
11+
from bridge.pipelines.utils import normalize_text, str_contain_each_other
12+
13+
logger = get_user_logger()
14+
15+
16+
def map_name(gh_name: str | None, bt_name: str | None) -> str | None:
    """
    Reconcile the GitHub repository name with the bio.tools name.

    When a GitHub name is present, both names are normalized and handed to
    the generic GitHub-over-bio.tools policy, with substring containment
    (``str_contain_each_other``) used as the equality test. When no GitHub
    name is present, the bio.tools name is returned untouched.

    Parameters
    ----------
    gh_name : str | None
        GitHub repository name.
    bt_name : str | None
        Existing bio.tools name.

    Returns
    -------
    str | None
        Mapped bio.tools name.
    """
    if gh_name is not None:
        normalized_gh = normalize_text(gh_name)
        normalized_bt = normalize_text(bt_name)

        def _keep_as_is(name):
            # The normalized GitHub name is used directly as the bio.tools value.
            return name

        return reconcile_gh_over_bt(
            gh_norm=normalized_gh,
            bt_norm=normalized_bt,
            bt_value=bt_name,
            build_bt_from_gh=_keep_as_is,
            log_label="name",
            equality_fn=str_contain_each_other,
        )

    logger.unchanged("No GitHub name found, nothing to map.")
    return bt_name

src/bridge/pipelines/policies/gh2bt/reconcile_gh_over_bt.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def reconcile_gh_over_bt(
2525
bt_value: BT | None,
2626
build_bt_from_gh: Callable[[GHN], BT],
2727
log_label: str,
28+
equality_fn: Callable[[GHN, BTN], bool] | None = None,
2829
) -> BT | None:
2930
"""
3031
Apply a generic GitHub-over-bio.tools reconciliation policy.
@@ -66,6 +67,11 @@ def reconcile_gh_over_bt(
6667
log_label : str
6768
Short label used in log messages to identify the reconciled field
6869
(e.g., ``"license"``, ``"languages"``, ``"homepage"``).
70+
equality_fn : Callable[[GHN, BTN], bool] | None, optional
71+
Optional callable to determine equality between normalized GitHub and
72+
bio.tools values. If ``None``, the default equality operator (``==``)
73+
is used. This parameter is useful when the normalized representations
74+
require custom comparison logic (e.g., set equality for lists).
6975
7076
Returns
7177
-------
@@ -87,7 +93,12 @@ def reconcile_gh_over_bt(
8793
logger.added(f"{log_label} from GitHub: {gh_norm!r}")
8894
return gh_from_bt
8995

90-
if gh_from_bt == bt_norm:
96+
if equality_fn is not None:
97+
equal = equality_fn(gh_from_bt, bt_norm)
98+
else:
99+
equal = gh_from_bt == bt_norm
100+
101+
if equal:
91102
logger.exact(f"GitHub {log_label} matches bio.tools {log_label}.")
92103
return bt_value
93104

src/bridge/pipelines/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
normalize_pydantic_model_strings,
1313
normalize_text,
1414
)
15+
from .comparisons import str_contain_each_other
1516
from .conversions import find_matching_enum_member, object_to_primitive, svg_to_base64
1617
from .files import check_file_with_extension_exists, get_file_content, load_dict_from_yaml_file
1718
from .templating import fill_template, remove_first_snippet_from_text
@@ -34,4 +35,5 @@
3435
"svg_to_base64",
3536
"find_matching_enum_member",
3637
"object_to_primitive",
38+
"str_contain_each_other",
3739
]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""
2+
Utility functions for comparisons.
3+
"""
4+
5+
6+
def str_contain_each_other(str1: str, str2: str) -> bool:
    """
    Check whether either of two strings contains the other (case-insensitive).

    An empty string is a substring of every string, so it is handled
    explicitly: an empty string only matches another empty string.

    Parameters
    ----------
    str1 : str
        First string.
    str2 : str
        Second string.

    Returns
    -------
    bool
        True if either string contains the other, False otherwise.
    """
    # casefold() is the Unicode-aware form of lower() for caseless matching.
    folded1 = str1.casefold()
    folded2 = str2.casefold()

    # Without this guard, "" in folded2 would be True for any folded2.
    if not folded1 or not folded2:
        return folded1 == folded2

    return folded1 in folded2 or folded2 in folded1

0 commit comments

Comments
 (0)