Skip to content

Commit 884f905

Browse files
Fix allele translator to_spdi issue for RLE deletion. Add RLE reconstruction when longer than rle_seq_limit. (#542)
This fixes a bug for `_to_spdi` for ReferenceLengthExpressions. It also adds an optional argument to `translate_to` called `ref_seq_limit` which works similarly to `rle_seq_limit`, but for SPDI expressions. If `ref_seq_limit` is nonzero, the 3rd term of the SPDI expression will be the actual reference sequence, not the length. To round-trip a clinvar SPDI expression which contains the reference sequence: ``` spdi = "NC_000019.10:44908821:C:T" vrs = translator.translate_from(spdi, "spdi") to_spdi = translator.translate_to(vrs, "spdi", ref_seq_limit=None) # or some number >=1 since len("C")==1 assert spdi == to_spdi ``` To continue the existing behavior of using the reference sequence length: ``` spdi = "NC_000019.10:44908821:1:T" vrs = translator.translate_from(spdi, "spdi") to_spdi = translator.translate_to(vrs, "spdi") # ref_seq_limit defaults to 0 assert spdi == to_spdi ``` --------- Co-authored-by: Kori Kuzma <korikuzma@gmail.com>
1 parent 70ec42f commit 884f905

File tree

7 files changed

+5825
-17
lines changed

7 files changed

+5825
-17
lines changed

src/ga4gh/vrs/extras/translator.py

Lines changed: 83 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,40 @@
99
import re
1010
from abc import ABC
1111
from collections.abc import Mapping
12+
from typing import Protocol
1213

1314
from ga4gh.core import ga4gh_identify
1415
from ga4gh.vrs import models, normalize
15-
from ga4gh.vrs.dataproxy import _DataProxy
16+
from ga4gh.vrs.dataproxy import SequenceProxy, _DataProxy
1617
from ga4gh.vrs.extras.decorators import lazy_property
18+
from ga4gh.vrs.normalize import denormalize_reference_length_expression
1719
from ga4gh.vrs.utils.hgvs_tools import HgvsTools
1820

1921
_logger = logging.getLogger(__name__)
2022

2123

24+
class VariationToStrProtocol(Protocol):
25+
"""Protocol for translating VRS objects to other string expressions.
26+
27+
This protocol defines a callable interface for translating a VRS object
28+
into variation strings, with optional keyword arguments for customization.
29+
"""
30+
31+
def __call__(self, vo: models._VariationBase, **kwargs) -> list[str]:
32+
"""Translate vrs object `vo` to variation string expressions"""
33+
34+
35+
class VariationFromStrProtocol(Protocol):
36+
"""Protocol for translating variation strings to VRS objects.
37+
38+
This protocol defines a callable interface for translating a variation
39+
string into a VRS object, with optional keyword arguments for customization.
40+
"""
41+
42+
def __call__(self, expr: str, **kwargs) -> models._VariationBase | None:
43+
"""Translate variation string `expr` to a VRS object"""
44+
45+
2246
class _Translator(ABC): # noqa: B024
2347
"""abstract class / interface for VRS to/from translation needs
2448
@@ -54,8 +78,8 @@ def __init__(
5478
self.data_proxy = data_proxy
5579
self.identify = identify
5680
self.rle_seq_limit = rle_seq_limit
57-
self.from_translators = {}
58-
self.to_translators = {}
81+
self.from_translators: dict[str, VariationFromStrProtocol] = {}
82+
self.to_translators: dict[str, VariationToStrProtocol] = {}
5983

6084
def translate_from(
6185
self, var: str, fmt: str | None = None, **kwargs
@@ -110,10 +134,15 @@ def translate_from(
110134
msg = f"Unable to parse data as {', '.join(formats)}"
111135
raise ValueError(msg)
112136

113-
def translate_to(self, vo: models._VariationBase, fmt: str) -> str:
114-
"""Translate vrs object `vo` to named format `fmt`"""
137+
def translate_to(self, vo: models._VariationBase, fmt: str, **kwargs) -> list[str]:
138+
"""Translate vrs object `vo` to named format `fmt`
139+
140+
kwargs:
141+
ref_seq_limit Optional(int):
142+
If vo.state is a ReferenceLengthExpression, and `ref_seq_limit` is specified, and `fmt` is `spdi`, the reference sequence is included in the SPDI expression if it is below the limit Otherwise only the length of the reference sequence is included. If the limit is None, the reference sequence is always included. In all cases, the alt sequence is included. Default is 0 (never include reference sequence).
143+
"""
115144
t = self.to_translators[fmt]
116-
return t(vo)
145+
return t(vo, **kwargs)
117146

118147
############################################################################
119148
# INTERNAL
@@ -385,12 +414,15 @@ def _from_spdi(self, spdi_expr: str, **kwargs) -> models.Allele | None:
385414
return self._create_allele(values, **kwargs)
386415

387416
def _to_hgvs(
388-
self, vo: models.Allele, namespace: str | None = "refseq"
417+
self,
418+
vo: models.Allele,
419+
namespace: str | None = "refseq",
420+
**kwargs, # noqa: ARG002
389421
) -> list[str]:
390422
return self.hgvs_tools.from_allele(vo, namespace)
391423

392424
def _to_spdi(
393-
self, vo: models.Allele, namespace: str | None = "refseq"
425+
self, vo: models.Allele, namespace: str | None = "refseq", **kwargs
394426
) -> list[str]:
395427
"""Generate a *list* of SPDI expressions for VRS Allele.
396428
@@ -400,6 +432,13 @@ def _to_spdi(
400432
If `namespace` is None, returns SPDI strings for all alias
401433
translations.
402434
435+
If `ref_seq_limit` is specified, the reference sequence is
436+
included in the SPDI expression only if it is below the limit.
437+
Otherwise only the length of the reference sequence is
438+
included. If the limit is None, the reference sequence is
439+
always included. In all cases, the alt sequence is included.
440+
Default is 0 (never include reference sequence).
441+
403442
If no alias translations are available, an empty list is
404443
returned.
405444
@@ -411,9 +450,43 @@ def _to_spdi(
411450
sequence = f"ga4gh:{vo.location.get_refget_accession()}"
412451
aliases = self.data_proxy.translate_sequence_identifier(sequence, namespace)
413452
aliases = [a.split(":")[1] for a in aliases]
453+
seq_proxies = {a: SequenceProxy(self.data_proxy, a) for a in aliases}
414454
start, end = vo.location.start, vo.location.end
415-
spdi_tail = f":{start}:{end - start}:{vo.state.sequence.root}"
416-
return [a + spdi_tail for a in aliases]
455+
spdi_exprs = []
456+
457+
for alias in aliases:
458+
# Get the reference sequence
459+
seq_proxy = seq_proxies[alias]
460+
ref_seq = seq_proxy[start:end]
461+
462+
if vo.state.type == models.VrsType.REF_LEN_EXPR.value:
463+
# Derived from reference. sequence included if under limit, but
464+
# we can derive it again from the reference.
465+
alt_seq = denormalize_reference_length_expression(
466+
ref_seq=ref_seq,
467+
repeat_subunit_length=vo.state.repeatSubunitLength,
468+
alt_length=vo.state.length,
469+
)
470+
# Warn if the derived sequence is different from the one in the object
471+
if vo.state.sequence and vo.state.sequence != alt_seq:
472+
_logger.warning(
473+
"Derived sequence '%s' is different from provided state.sequence '%s'",
474+
alt_seq,
475+
vo.state.sequence,
476+
)
477+
else:
478+
alt_seq = vo.state.sequence.root
479+
480+
# Optionally allow using the length of the reference sequence
481+
# instead of the sequence itself.
482+
ref_seq_limit = kwargs.get("ref_seq_limit", 0)
483+
if ref_seq_limit is not None and len(ref_seq) > int(ref_seq_limit):
484+
ref_seq = len(ref_seq)
485+
486+
spdi_expr = f"{alias}:{start}:{ref_seq}:{alt_seq}"
487+
spdi_exprs.append(spdi_expr)
488+
489+
return spdi_exprs
417490

418491
def _post_process_imported_allele(
419492
self, allele: models.Allele, **kwargs

src/ga4gh/vrs/normalize.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,33 @@ def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50):
221221
return new_allele
222222

223223

224+
def denormalize_reference_length_expression(
225+
ref_seq: str,
226+
repeat_subunit_length: int,
227+
alt_length: int,
228+
) -> str:
229+
"""Reverse the process of repeat subunit compaction for a ReferenceLengthExpression.
230+
The alt is reference-derived so repeat the repeat subunit until it is it to alt_length
231+
characters long. May include a trailing partial copy of the repeat subunit.
232+
233+
e.g. "ACGT" (repeat_subunit_length=4, length=10) -> "ACGTACGTAC" (ACGT ACGT AC)
234+
235+
:param ref_seq: The reference sequence to be denormalized.
236+
:param repeat_subunit_length: The length of the repeat subunit.
237+
:param alt_length: The length of the alternate sequence that was compacted during normalization.
238+
"""
239+
repeat_subunit = ref_seq[:repeat_subunit_length]
240+
if len(repeat_subunit) != repeat_subunit_length:
241+
raise ValueError( # noqa: TRY003
242+
f"Repeat subunit length {repeat_subunit_length} is not equal to the length of the repeat subunit {len(repeat_subunit)}" # noqa: EM102
243+
)
244+
repeat_count = alt_length // repeat_subunit_length
245+
remainder = alt_length % repeat_subunit_length
246+
alt = repeat_subunit * repeat_count
247+
alt += repeat_subunit[:remainder]
248+
return alt
249+
250+
224251
def _factor_gen(n):
225252
"""Yield all factors of an integer `n`, in descending order"""
226253
lower_factors = []
@@ -259,7 +286,10 @@ def _is_valid_cycle(template_start, template, target):
259286
# TODO _normalize_genotype?
260287

261288

262-
def _normalize_cis_phased_block(o, data_proxy: _DataProxy | None = None): # noqa: ARG001
289+
def _normalize_cis_phased_block(
290+
o,
291+
data_proxy: _DataProxy | None = None, # noqa: ARG001
292+
):
263293
o.members = sorted(o.members, key=ga4gh_digest)
264294
return o
265295

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers:
5+
Accept:
6+
- '*/*'
7+
Accept-Encoding:
8+
- gzip, deflate
9+
Connection:
10+
- keep-alive
11+
User-Agent:
12+
- python-requests/2.32.3
13+
method: GET
14+
uri: http://localhost:5001/seqrepo/1/metadata/refseq:NC_000013.11
15+
response:
16+
body:
17+
string: "{\n \"added\": \"2016-08-27T23:50:14Z\",\n \"aliases\": [\n \"GRCh38:13\",\n
18+
\ \"GRCh38:chr13\",\n \"GRCh38.p1:13\",\n \"GRCh38.p1:chr13\",\n \"GRCh38.p10:13\",\n
19+
\ \"GRCh38.p10:chr13\",\n \"GRCh38.p11:13\",\n \"GRCh38.p11:chr13\",\n
20+
\ \"GRCh38.p12:13\",\n \"GRCh38.p12:chr13\",\n \"GRCh38.p2:13\",\n
21+
\ \"GRCh38.p2:chr13\",\n \"GRCh38.p3:13\",\n \"GRCh38.p3:chr13\",\n
22+
\ \"GRCh38.p4:13\",\n \"GRCh38.p4:chr13\",\n \"GRCh38.p5:13\",\n \"GRCh38.p5:chr13\",\n
23+
\ \"GRCh38.p6:13\",\n \"GRCh38.p6:chr13\",\n \"GRCh38.p7:13\",\n \"GRCh38.p7:chr13\",\n
24+
\ \"GRCh38.p8:13\",\n \"GRCh38.p8:chr13\",\n \"GRCh38.p9:13\",\n \"GRCh38.p9:chr13\",\n
25+
\ \"MD5:a5437debe2ef9c9ef8f3ea2874ae1d82\",\n \"NCBI:NC_000013.11\",\n
26+
\ \"refseq:NC_000013.11\",\n \"SEGUID:2oDBty0yKV9wHo7gg+Bt+fPgi5o\",\n
27+
\ \"SHA1:da80c1b72d32295f701e8ee083e06df9f3e08b9a\",\n \"VMC:GS__0wi-qoDrvram155UmcSC-zA5ZK4fpLT\",\n
28+
\ \"sha512t24u:_0wi-qoDrvram155UmcSC-zA5ZK4fpLT\",\n \"ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT\"\n
29+
\ ],\n \"alphabet\": \"ACGKNTY\",\n \"length\": 114364328\n}\n"
30+
headers:
31+
Connection:
32+
- close
33+
Content-Length:
34+
- '1002'
35+
Content-Type:
36+
- application/json
37+
Date:
38+
- Wed, 09 Apr 2025 19:40:39 GMT
39+
Server:
40+
- Werkzeug/2.2.3 Python/3.10.12
41+
status:
42+
code: 200
43+
message: OK
44+
version: 1

tests/extras/cassettes/test_hgvs[NC_000013.11:g.32936732=-expected0].yaml

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,37 @@ interactions:
2323
Content-Type:
2424
- text/plain; charset=utf-8
2525
Date:
26-
- Mon, 10 Mar 2025 16:22:49 GMT
26+
- Wed, 16 Apr 2025 22:49:30 GMT
27+
Server:
28+
- Werkzeug/2.2.3 Python/3.10.12
29+
status:
30+
code: 200
31+
message: OK
32+
- request:
33+
body: null
34+
headers:
35+
Accept:
36+
- '*/*'
37+
Accept-Encoding:
38+
- gzip, deflate
39+
Connection:
40+
- keep-alive
41+
User-Agent:
42+
- python-requests/2.32.3
43+
method: GET
44+
uri: http://localhost:5000/seqrepo/1/sequence/ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT?start=32936731&end=32936732
45+
response:
46+
body:
47+
string: C
48+
headers:
49+
Connection:
50+
- close
51+
Content-Length:
52+
- '1'
53+
Content-Type:
54+
- text/plain; charset=utf-8
55+
Date:
56+
- Wed, 16 Apr 2025 22:49:30 GMT
2757
Server:
2858
- Werkzeug/2.2.3 Python/3.10.12
2959
status:
@@ -67,20 +97,20 @@ interactions:
6797
Content-Type:
6898
- text/plain
6999
Date:
70-
- Mon, 10 Mar 2025 16:22:50 GMT
100+
- Wed, 16 Apr 2025 22:49:30 GMT
71101
Keep-Alive:
72102
- timeout=4, max=40
73103
NCBI-PHID:
74-
- D0BDD793D08B96450000396AA7F60D6E.1.1.m_5
104+
- 322CF8979F5E85B5000050CD47341D44.1.1.m_5
75105
NCBI-SID:
76-
- A10F065EB442C5FB_D344SID
106+
- B4DD900AE926F06D_8D6DSID
77107
Referrer-Policy:
78108
- origin-when-cross-origin
79109
Server:
80110
- Finatra
81111
Set-Cookie:
82-
- ncbi_sid=A10F065EB442C5FB_D344SID; domain=.nih.gov; path=/; expires=Tue, 10
83-
Mar 2026 16:22:50 GMT
112+
- ncbi_sid=B4DD900AE926F06D_8D6DSID; domain=.nih.gov; path=/; expires=Thu, 16
113+
Apr 2026 22:49:30 GMT
84114
Strict-Transport-Security:
85115
- max-age=31536000; includeSubDomains; preload
86116
Transfer-Encoding:

0 commit comments

Comments
 (0)