Skip to content

Commit 70ec42f

Browse files
authored
feat: add optional suppression of allele collection (#545)
* The VCF annotator should not collect alleles if no output is requested by the user. Collecting them anyway has major implications for memory usage, apparently.
* Provide this in a relatively OOP-friendly way without introducing a breaking change. At the next major version there is a cleaner way to do this, and it should be implemented at that point.
1 parent 9f74acf commit 70ec42f

File tree

1 file changed

+29
-4
lines changed
  • src/ga4gh/vrs/extras/annotator

1 file changed

+29
-4
lines changed

src/ga4gh/vrs/extras/annotator/vcf.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,18 @@ def __init__(self, data_proxy: _DataProxy, **kwargs) -> None: # noqa: ARG002
109109
self.data_proxy = data_proxy
110110
self.tlr = AlleleTranslator(self.data_proxy)
111111

112+
def should_collect_alleles(self, **kwargs) -> bool:  # noqa: ARG002
    """Report whether alleles need to be aggregated during annotation.

    The result is used to decide whether an allele collection is set up for
    downstream use. This default implementation returns the ``collect_alleles``
    attribute unchanged and ignores ``kwargs``; implementing classes can override
    it to take the provided annotation arguments into account.

    :param kwargs: annotation arguments (unused by this implementation)
    :return: ``True`` if alleles should be collected
    """
    collect_flag = self.collect_alleles
    return collect_flag
123+
112124
@abc.abstractmethod
113125
def raise_for_output_args(self, output_vcf_path: Path | None, **kwargs) -> None:
114126
"""Raise an exception if no output appears to be configured or declared.
@@ -215,7 +227,11 @@ def annotate(
215227
else:
216228
vcf_out = None
217229

218-
allele_collection = [] if self.collect_alleles else None
230+
allele_collection = (
231+
[]
232+
if (self.collect_alleles and self.should_collect_alleles(**kwargs))
233+
else None
234+
)
219235
for record in vcf:
220236
if vcf_out:
221237
additional_info_fields = [FieldName.IDS_FIELD]
@@ -265,7 +281,7 @@ def annotate(
265281
if vcf_out:
266282
vcf_out.close()
267283

268-
if self.collect_alleles:
284+
if allele_collection is not None:
269285
self.on_vrs_object_collection(allele_collection, **kwargs)
270286

271287
@abc.abstractmethod
@@ -275,8 +291,8 @@ def on_vrs_object(
275291
"""Perform side-effects (eg additional annotation or storage) or additional
276292
filtering on VRS alleles as they are constructed during VCF annotation.
277293
278-
Reimplement in a child class to add custom logic. Otherwise, this method simply
279-
passes through ``vrs_allele`` without altering it further or storing it.
294+
Reimplement in a child class to add custom logic. Otherwise, simply pass through
295+
``vrs_allele`` without altering it further or storing it.
280296
281297
:param vcf_coords: CHR-POS-REF-ALT from VCF for this allele
282298
:param vrs_allele: allele translated from coords
@@ -444,6 +460,15 @@ class VcfAnnotator(AbstractVcfAnnotator):
444460
pkl_arg_name = "output_pkl_path"
445461
ndjson_arg_name = "output_ndjson_path"
446462

463+
def should_collect_alleles(self, **kwargs) -> bool:
    """Suppress allele collection when no collection output is requested.

    The keyword names are taken from the ``pkl_arg_name`` and ``ndjson_arg_name``
    class attributes instead of being repeated as string literals, so this check
    keeps matching if those names are ever changed.

    :kwparam output_pkl_path: Optional path to output PKL dump of all alleles
    :kwparam output_ndjson_path: Optional path to output NDJSON dump of all alleles
    :return: ``True`` if at least one of the output path args is provided
        (i.e. is neither ``None`` nor empty)
    """
    # Truthiness rather than ``is not None``: an empty path counts as "not given".
    requested_outputs = (
        kwargs.get(self.pkl_arg_name),
        kwargs.get(self.ndjson_arg_name),
    )
    return any(requested_outputs)
471+
447472
@use_ga4gh_compute_identifier_when(VrsObjectIdentifierIs.MISSING)
448473
def annotate(
449474
self,

0 commit comments

Comments
 (0)