Skip to content

Commit 6114552

Browse files
authored
Merge pull request #110 from Rfam/bugfix-genbank-md5-lookup
If id looks like rnacentral, don't fetch from genbank
2 parents e0d6fa2 + 8f9f8e8 commit 6114552

File tree

1 file changed

+129
-128
lines changed

1 file changed

+129
-128
lines changed

Rfam/Lib/Bio/Rfam/Utils.pm

Lines changed: 129 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -2494,6 +2494,10 @@ sub genbank_nse_lookup_and_md5 {
24942494
if(! defined $nattempts) { $nattempts = 1; }
24952495
if(! defined $nseconds) { $nseconds = 3; }
24962496

2497+
if(id_looks_like_rnacentral($name)) {
2498+
return (0, 0, undef); # RNAcentral seq, just return
2499+
}
2500+
24972501
# $nse will have end < start if it is negative strand, but we can't fetch from ENA
24982502
# with an end coord less than start, so if we are negative strand, we need to fetch
24992503
# the positive strand, and then revcomp it later.
@@ -2511,25 +2515,22 @@ sub genbank_nse_lookup_and_md5 {
25112515
my $api_key = "472570bf7f5d4d9d52023765697b4957fa08";
25122516
my $url = sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s&rettype=fasta&retmode=text&from=%d&to=%d&api_key=%s", $name, $qstart, $qend, $api_key);
25132517
my $got_url = get($url);
2514-
my $looks_like_rnacentral = id_looks_like_rnacentral($name);
25152518

25162519
if(! defined $got_url) {
2517-
if(! $looks_like_rnacentral) {
2518-
# if NCBI is being hit by a bunch of requests, the get() command
2519-
# may fail in that $got_url may be undefined. If that happens we
2520-
# wait a few seconds ($nseconds) and try again (up to
2521-
# $nattempts) times BUT we only do this for sequences that
2522-
# don't look like they are RNAcentral ids. For sequences that
2523-
# look like they are RNAcentral ids we do not do more attempts.
2524-
my $attempt_ctr = 1;
2525-
while((! defined $got_url) && ($attempt_ctr < $nattempts)) {
2526-
sleep($nseconds);
2527-
$got_url = get($url);
2528-
$attempt_ctr++;
2529-
}
2530-
if(($attempt_ctr >= $nattempts) && (! defined $got_url)) {
2531-
croak "ERROR trying to fetch sequence info for $name from genbank, reached maximum allowed number of attempts ($nattempts)";
2532-
}
2520+
# if NCBI is being hit by a bunch of requests, the get() command
2521+
# may fail in that $got_url may be undefined. If that happens we
2522+
# wait a few seconds ($nseconds) and try again (up to
2523+
# $nattempts) times BUT we only do this for sequences that
2524+
# don't look like they are RNAcentral ids. For sequences that
2525+
# look like they are RNAcentral ids we do not do more attempts.
2526+
my $attempt_ctr = 1;
2527+
while((! defined $got_url) && ($attempt_ctr < $nattempts)) {
2528+
sleep($nseconds);
2529+
$got_url = get($url);
2530+
$attempt_ctr++;
2531+
}
2532+
if(($attempt_ctr >= $nattempts) && (! defined $got_url)) {
2533+
croak "ERROR trying to fetch sequence info for $name from genbank, reached maximum allowed number of attempts ($nattempts)";
25332534
}
25342535
}
25352536
elsif($got_url !~ m/^>/) {
@@ -2620,34 +2621,34 @@ sub genbank_fetch_seq_info {
26202621
$info_HHR->{$name}{"length"} = "-";
26212622
$info_HHR->{$name}{"mol_type"} = "-";
26222623

2623-
my $api_key = "472570bf7f5d4d9d52023765697b4957fa08";
2624-
my $genbank_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&retmode=xml&id=" . $name_str . "&api_key=" . $api_key;
2625-
my $xml = undef;
2626-
my $xml_string = get($genbank_url);
2627-
my $xml_valid = 0;
2628-
if(defined $xml_string) {
2629-
# Previously, we tried to substitute out the GBSeq_sequence and translation lines
2630-
# but in Sept 2022 this was identified as a bottleneck for at least some families
2631-
# these substitution commands are now commented out but left here for reference.
2632-
# The motivation for them in the first place was to save memory so if memory
2633-
# does not become an issue it should be fine to leave them commented out.
2634-
# However, if we do want to put the substitution commands back in, an alternative
2635-
# strategy might be to add a 'usleep(0.1)' call just prior to the substitution
2636-
# commands. In 2019 testing, this seemed to work when I encountered flakiness
2637-
# related to these substitution commands (see git commits on 7/9/2019, e.g. d1547f8)
2638-
# --------------
2639-
## to save memory, remove sequence info from the xml_string since we don't need it
2640-
## remove <GBSeq_sequence> lines
2641-
#$xml_string =~ s/[^\n]+\<GBSeq\_sequence\>\w+\<\/GBSeq\_sequence\>\n//g;
2642-
## remove <GBQualifier>\n<GBQualifer_name>translation\nGBQualifier_value\n<\GBQualifier> sets of 4 lines
2643-
#$xml_string =~ s/[^\n]+\<GBQualifier\>\n[^\n]+\<GBQualifier\_name\>translation\<\/GBQualifier\_name\>\n[^\n]+\<GBQualifier\_value\>\w+\<\/GBQualifier\_value\>\n[^\n]+\<\/GBQualifier\>\n//g;
2644-
$xml = eval { XML::LibXML->load_xml(string => $xml_string); };
2645-
if($@) { $xml_valid = 0; }
2646-
else { $xml_valid = 1; }
2647-
}
2624+
if(! $looks_like_rnacentral) {
2625+
my $api_key = "472570bf7f5d4d9d52023765697b4957fa08";
2626+
my $genbank_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&retmode=xml&id=" . $name_str . "&api_key=" . $api_key;
2627+
my $xml = undef;
2628+
my $xml_string = get($genbank_url);
2629+
my $xml_valid = 0;
2630+
if(defined $xml_string) {
2631+
# Previously, we tried to substitute out the GBSeq_sequence and translation lines
2632+
# but in Sept 2022 this was identified as a bottleneck for at least some families
2633+
# these substitution commands are now commented out but left here for reference.
2634+
# The motivation for them in the first place was to save memory so if memory
2635+
# does not become an issue it should be fine to leave them commented out.
2636+
# However, if we do want to put the substitution commands back in, an alternative
2637+
# strategy might be to add a 'usleep(0.1)' call just prior to the substitution
2638+
# commands. In 2019 testing, this seemed to work when I encountered flakiness
2639+
# related to these substitution commands (see git commits on 7/9/2019, e.g. d1547f8)
2640+
# --------------
2641+
## to save memory, remove sequence info from the xml_string since we don't need it
2642+
## remove <GBSeq_sequence> lines
2643+
#$xml_string =~ s/[^\n]+\<GBSeq\_sequence\>\w+\<\/GBSeq\_sequence\>\n//g;
2644+
## remove <GBQualifier>\n<GBQualifer_name>translation\nGBQualifier_value\n<\GBQualifier> sets of 4 lines
2645+
#$xml_string =~ s/[^\n]+\<GBQualifier\>\n[^\n]+\<GBQualifier\_name\>translation\<\/GBQualifier\_name\>\n[^\n]+\<GBQualifier\_value\>\w+\<\/GBQualifier\_value\>\n[^\n]+\<\/GBQualifier\>\n//g;
2646+
$xml = eval { XML::LibXML->load_xml(string => $xml_string); };
2647+
if($@) { $xml_valid = 0; }
2648+
else { $xml_valid = 1; }
2649+
}
26482650

2649-
if(! $xml_valid) {
2650-
if(! $looks_like_rnacentral) {
2651+
if(! $xml_valid) {
26512652
# the get() command either failed (returned undef) or
26522653
# returned an invalid xml string, either way we
26532654
# wait a few seconds ($nseconds) and try again (up to
@@ -2680,95 +2681,95 @@ sub genbank_fetch_seq_info {
26802681
croak "ERROR trying to fetch sequence data for sequence $name from genbank, reached maximum allowed number of attempts ($attempt_ctr)";
26812682
}
26822683
}
2683-
}
2684-
else {
2685-
# if we get here: we know that $xml_string is defined and valid
2686-
# and $xml is ready for parsing
2687-
foreach my $gbseq ($xml->findnodes('//GBSeq')) {
2688-
my $accver = $gbseq->findvalue('./GBSeq_accession-version');
2689-
if(! defined $accver) {
2690-
croak "ERROR in $sub_name problem parsing XML, no accession-version read";
2691-
}
2692-
if(! exists $info_HHR->{$accver}) {
2693-
croak "ERROR in $sub_name problem parsing XML, unexpected accession.version $accver";
2694-
}
2695-
2696-
my $description = $gbseq->findvalue('./GBSeq_definition');
2697-
if(! defined $description) {
2698-
croak "ERROR in $sub_name problem parsing XML, no definition (description) read";
2699-
}
2700-
$info_HHR->{$accver}{"description"} = $description;
2701-
2702-
my $length = $gbseq->findvalue('./GBSeq_length');
2703-
if(! defined $length) {
2704-
croak "ERROR in $sub_name problem parsing XML, no length read";
2705-
}
2706-
$info_HHR->{$accver}{"length"} = $length;
2707-
2708-
# for taxid and mol_type, we have to fetch from Qualifier_values, and may have more than 1
2709-
# in that case, they will be concatenated together
2710-
my $taxid_val = $gbseq->findvalue('./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_value[starts-with(text(), "taxon:")]');
2711-
my $taxid = undef;
2712-
my $orig_taxid_val = $taxid_val;
2713-
if(! defined $taxid_val) {
2714-
croak "ERROR in $sub_name did not read taxon info for $accver";
2715-
}
2716-
# $taxid_val will be concatenation of taxon:<\d+> N >= 1 times, we want to make sure <\d+> is equivalent all N instances
2717-
while($taxid_val =~ /^taxon\:(\d+)/) {
2718-
my $cur_taxid = $1;
2719-
if(! defined $taxid) { # first taxid
2720-
$taxid = $cur_taxid;
2684+
else {
2685+
# if we get here: we know that $xml_string is defined and valid
2686+
# and $xml is ready for parsing
2687+
foreach my $gbseq ($xml->findnodes('//GBSeq')) {
2688+
my $accver = $gbseq->findvalue('./GBSeq_accession-version');
2689+
if(! defined $accver) {
2690+
croak "ERROR in $sub_name problem parsing XML, no accession-version read";
27212691
}
2722-
elsif($cur_taxid != $taxid) {
2723-
; # do nothing, see comment below
2724-
# Change Jan 29, 2021: if multiple taxids are returned for a single accession,
2725-
# always use the first one and don't complain. Previously we would fail here
2726-
# with following croak:
2727-
#croak "ERROR in $sub_name for $accver, > 1 taxids read: $taxid and $cur_taxid\nFull taxon values read: $orig_taxid_val\n";
2692+
if(! exists $info_HHR->{$accver}) {
2693+
croak "ERROR in $sub_name problem parsing XML, unexpected accession.version $accver";
27282694
}
2729-
$taxid_val =~ s/^taxon\:(\d+)//;
2730-
}
2731-
if($taxid_val ne "") {
2732-
croak "ERROR in $sub_name could not parse taxon info $accver\nFull taxon values read: $orig_taxid_val\n";
2733-
}
2734-
$info_HHR->{$accver}{"ncbi_id"} = $taxid;
2735-
2736-
# mol_type is like taxid in that we may fetch more than one value concatenated together
2737-
# but more complicated because we don't have the 'taxon:' at the beginning to use to parse
2738-
# to determine the single value that we want
2739-
# For example if we have more than 3 mol_type qualifiers, they will just be concatenated
2740-
# together like "genomic DNAgenomic DNAgenomic DNA" and the single value we want is
2741-
# "genomic DNA". To figure out the single value we assume it is repeated N times,
2742-
# determine N, then determine its length, use substr to get it, and then verify
2743-
# we have that same single value concatenated N times.
2744-
my $mol_type_val = $gbseq->findvalue('./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value');
2745-
my $mol_type = undef;
2746-
my $orig_mol_type_val = $mol_type_val;
2747-
my $nmol_type = $gbseq->findvalue('count(./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value)');
2748-
# the value we want ($mol_type) is concatenated $nmol_type times together in $mol_type_val, determine what it is, croaking if we can't
2749-
if((length($mol_type_val) % ($nmol_type)) != 0) {
2750-
croak "ERROR in $sub_name could not parse mol_type info $mol_type_val\n";
2751-
}
2752-
my $mol_type_len = int((length($mol_type_val) / $nmol_type) + 0.01);
2753-
my $mol_type_val_start = 0;
2754-
while($mol_type_val_start < $mol_type_len) {
2755-
my $cur_mol_type = substr($mol_type_val, $mol_type_val_start, $mol_type_len);
2756-
if(! defined $mol_type) {
2757-
$mol_type = $cur_mol_type;
2695+
2696+
my $description = $gbseq->findvalue('./GBSeq_definition');
2697+
if(! defined $description) {
2698+
croak "ERROR in $sub_name problem parsing XML, no definition (description) read";
27582699
}
2759-
elsif($cur_mol_type ne $mol_type) {
2760-
croak "ERROR in $sub_name for $accver, > 1 mol_types read: $mol_type and $cur_mol_type\nFull mol_type values read: $orig_mol_type_val\n";
2700+
$info_HHR->{$accver}{"description"} = $description;
2701+
2702+
my $length = $gbseq->findvalue('./GBSeq_length');
2703+
if(! defined $length) {
2704+
croak "ERROR in $sub_name problem parsing XML, no length read";
27612705
}
2762-
$mol_type_val_start += $mol_type_len;
2763-
}
2764-
if($mol_type_val_start != $mol_type_len) {
2765-
croak "ERROR in $sub_name could not parse mol_type value for $accver\nFull mol_type values read: $orig_mol_type_val\n";
2706+
$info_HHR->{$accver}{"length"} = $length;
2707+
2708+
# for taxid and mol_type, we have to fetch from Qualifier_values, and may have more than 1
2709+
# in that case, they will be concatenated together
2710+
my $taxid_val = $gbseq->findvalue('./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_value[starts-with(text(), "taxon:")]');
2711+
my $taxid = undef;
2712+
my $orig_taxid_val = $taxid_val;
2713+
if(! defined $taxid_val) {
2714+
croak "ERROR in $sub_name did not read taxon info for $accver";
2715+
}
2716+
# $taxid_val will be concatenation of taxon:<\d+> N >= 1 times, we want to make sure <\d+> is equivalent all N instances
2717+
while($taxid_val =~ /^taxon\:(\d+)/) {
2718+
my $cur_taxid = $1;
2719+
if(! defined $taxid) { # first taxid
2720+
$taxid = $cur_taxid;
2721+
}
2722+
elsif($cur_taxid != $taxid) {
2723+
; # do nothing, see comment below
2724+
# Change Jan 29, 2021: if multiple taxids are returned for a single accession,
2725+
# always use the first one and don't complain. Previously we would fail here
2726+
# with following croak:
2727+
#croak "ERROR in $sub_name for $accver, > 1 taxids read: $taxid and $cur_taxid\nFull taxon values read: $orig_taxid_val\n";
2728+
}
2729+
$taxid_val =~ s/^taxon\:(\d+)//;
2730+
}
2731+
if($taxid_val ne "") {
2732+
croak "ERROR in $sub_name could not parse taxon info $accver\nFull taxon values read: $orig_taxid_val\n";
2733+
}
2734+
$info_HHR->{$accver}{"ncbi_id"} = $taxid;
2735+
2736+
# mol_type is like taxid in that we may fetch more than one value concatenated together
2737+
# but more complicated because we don't have the 'taxon:' at the beginning to use to parse
2738+
# to determine the single value that we want
2739+
# For example if we have more than 3 mol_type qualifiers, they will just be concatenated
2740+
# together like "genomic DNAgenomic DNAgenomic DNA" and the single value we want is
2741+
# "genomic DNA". To figure out the single value we assume it is repeated N times,
2742+
# determine N, then determine its length, use substr to get it, and then verify
2743+
# we have that same single value concatenated N times.
2744+
my $mol_type_val = $gbseq->findvalue('./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value');
2745+
my $mol_type = undef;
2746+
my $orig_mol_type_val = $mol_type_val;
2747+
my $nmol_type = $gbseq->findvalue('count(./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value)');
2748+
# the value we want ($mol_type) is concatenated $nmol_type times together in $mol_type_val, determine what it is, croaking if we can't
2749+
if((length($mol_type_val) % ($nmol_type)) != 0) {
2750+
croak "ERROR in $sub_name could not parse mol_type info $mol_type_val\n";
2751+
}
2752+
my $mol_type_len = int((length($mol_type_val) / $nmol_type) + 0.01);
2753+
my $mol_type_val_start = 0;
2754+
while($mol_type_val_start < $mol_type_len) {
2755+
my $cur_mol_type = substr($mol_type_val, $mol_type_val_start, $mol_type_len);
2756+
if(! defined $mol_type) {
2757+
$mol_type = $cur_mol_type;
2758+
}
2759+
elsif($cur_mol_type ne $mol_type) {
2760+
croak "ERROR in $sub_name for $accver, > 1 mol_types read: $mol_type and $cur_mol_type\nFull mol_type values read: $orig_mol_type_val\n";
2761+
}
2762+
$mol_type_val_start += $mol_type_len;
2763+
}
2764+
if($mol_type_val_start != $mol_type_len) {
2765+
croak "ERROR in $sub_name could not parse mol_type value for $accver\nFull mol_type values read: $orig_mol_type_val\n";
2766+
}
2767+
$info_HHR->{$accver}{"mol_type"} = $mol_type;
2768+
2769+
$info_HHR->{$accver}{"source"} = "SEED:GenBank";
27662770
}
2767-
$info_HHR->{$accver}{"mol_type"} = $mol_type;
2768-
2769-
$info_HHR->{$accver}{"source"} = "SEED:GenBank";
2770-
}
2771-
} # end of 'else' entered if $xml_string is defined
2771+
} # end of 'else' entered if $xml_string is defined
2772+
} # end of 'if(! $looks_like_rnacentral)'
27722773
} # end of 'for' loop over seq names for fetching and adding data per seq name
27732774

27742775
return;

0 commit comments

Comments
 (0)