@@ -2494,6 +2494,10 @@ sub genbank_nse_lookup_and_md5 {
24942494 if (! defined $nattempts ) { $nattempts = 1; }
24952495 if (! defined $nseconds ) { $nseconds = 3; }
24962496
2497+ if (id_looks_like_rnacentral($name )) {
2498+ return (0, 0, undef ); # RNAcentral seq, just return
2499+ }
2500+
24972501 # $nse will have end < start if it is negative strand, but we can't fetch from ENA
24982502 # with an end coord less than start, so if we are negative strand, we need to fetch
24992503 # the positive strand, and then revcomp it later.
@@ -2511,25 +2515,22 @@ sub genbank_nse_lookup_and_md5 {
25112515 my $api_key = " 472570bf7f5d4d9d52023765697b4957fa08" ;
25122516 my $url = sprintf (" https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s &rettype=fasta&retmode=text&from=%d &to=%d &api_key=%s " , $name , $qstart , $qend , $api_key );
25132517 my $got_url = get($url );
2514- my $looks_like_rnacentral = id_looks_like_rnacentral($name );
25152518
25162519 if (! defined $got_url ) {
2517- if (! $looks_like_rnacentral ) {
2518- # if NCBI is being hit by a bunch of requests, the get() command
2519- # may fail in that $got_url may be undefined. If that happens we
2520- # wait a few seconds ($nseconds) and try again (up to
2521- # $nattempts) times BUT we only do this for sequences that
2522- # don't look like they are RNAcentral ids. For sequences that
2523- # look like they are RNAcentral ids we do not do more attempts.
2524- my $attempt_ctr = 1;
2525- while ((! defined $got_url ) && ($attempt_ctr < $nattempts )) {
2526- sleep ($nseconds );
2527- $got_url = get($url );
2528- $attempt_ctr ++;
2529- }
2530- if (($attempt_ctr >= $nattempts ) && (! defined $got_url )) {
2531- croak " ERROR trying to fetch sequence info for $name from genbank, reached maximum allowed number of attempts ($nattempts )" ;
2532- }
2520+ # if NCBI is being hit by a bunch of requests, the get() command
2521+ # may fail in that $got_url may be undefined. If that happens we
2522+ # wait a few seconds ($nseconds) and try again (up to
2523+ # $nattempts) times BUT we only do this for sequences that
2524+ # don't look like they are RNAcentral ids. For sequences that
2525+ # look like they are RNAcentral ids we do not do more attempts.
2526+ my $attempt_ctr = 1;
2527+ while ((! defined $got_url ) && ($attempt_ctr < $nattempts )) {
2528+ sleep ($nseconds );
2529+ $got_url = get($url );
2530+ $attempt_ctr ++;
2531+ }
2532+ if (($attempt_ctr >= $nattempts ) && (! defined $got_url )) {
2533+ croak " ERROR trying to fetch sequence info for $name from genbank, reached maximum allowed number of attempts ($nattempts )" ;
25332534 }
25342535 }
25352536 elsif ($got_url !~ m / ^>/ ) {
@@ -2620,34 +2621,34 @@ sub genbank_fetch_seq_info {
26202621 $info_HHR -> {$name }{" length" } = " -" ;
26212622 $info_HHR -> {$name }{" mol_type" } = " -" ;
26222623
2623- my $api_key = " 472570bf7f5d4d9d52023765697b4957fa08" ;
2624- my $genbank_url = " https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&retmode=xml&id=" . $name_str . " &api_key=" . $api_key ;
2625- my $xml = undef ;
2626- my $xml_string = get($genbank_url );
2627- my $xml_valid = 0;
2628- if (defined $xml_string ) {
2629- # Previously, we tried to substitute out the GBSeq_sequence and translation lines
2630- # but in Sept 2022 this was identified as a bottleneck for at least some families
2631- # these substitution commands are now commented out but left here for reference.
2632- # The motivation for them in the first place was to save memory so if memory
2633- # does not become an issue it should be fine to leave them commented out.
2634- # However, if we do want to put the substitution commands back in, an alternative
2635- # strategy might be to add a 'usleep(0.1)' call just prior to the substitution
2636- # commands. In 2019 testing, this seemed to work when I encountered flakiness
2637- # related to these substitution commands (see git commits on 7/9/2019, e.g. d1547f8)
2638- # --------------
2639- # # to save memory, remove sequence info from the xml_string since we don't need it
2640- # # remove <GBSeq_sequence> lines
2641- # $xml_string =~ s/[^\n]+\<GBSeq\_sequence\>\w+\<\/GBSeq\_sequence\>\n//g;
2642- # # remove <GBQualifier>\n<GBQualifer_name>translation\nGBQualifier_value\n<\GBQualifier> sets of 4 lines
2643- # $xml_string =~ s/[^\n]+\<GBQualifier\>\n[^\n]+\<GBQualifier\_name\>translation\<\/GBQualifier\_name\>\n[^\n]+\<GBQualifier\_value\>\w+\<\/GBQualifier\_value\>\n[^\n]+\<\/GBQualifier\>\n//g;
2644- $xml = eval { XML::LibXML-> load_xml(string => $xml_string ); };
2645- if ($@ ) { $xml_valid = 0; }
2646- else { $xml_valid = 1; }
2647- }
2624+ if (! $looks_like_rnacentral ) {
2625+ my $api_key = " 472570bf7f5d4d9d52023765697b4957fa08" ;
2626+ my $genbank_url = " https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&retmode=xml&id=" . $name_str . " &api_key=" . $api_key ;
2627+ my $xml = undef ;
2628+ my $xml_string = get($genbank_url );
2629+ my $xml_valid = 0;
2630+ if (defined $xml_string ) {
2631+ # Previously, we tried to substitute out the GBSeq_sequence and translation lines
2632+ # but in Sept 2022 this was identified as a bottleneck for at least some families
2633+ # these substitution commands are now commented out but left here for reference.
2634+ # The motivation for them in the first place was to save memory so if memory
2635+ # does not become an issue it should be fine to leave them commented out.
2636+ # However, if we do want to put the substitution commands back in, an alternative
2637+ # strategy might be to add a 'usleep(0.1)' call just prior to the substitution
2638+ # commands. In 2019 testing, this seemed to work when I encountered flakiness
2639+ # related to these substitution commands (see git commits on 7/9/2019, e.g. d1547f8)
2640+ # --------------
2641+ # # to save memory, remove sequence info from the xml_string since we don't need it
2642+ # # remove <GBSeq_sequence> lines
2643+ # $xml_string =~ s/[^\n]+\<GBSeq\_sequence\>\w+\<\/GBSeq\_sequence\>\n//g;
2644+ # # remove <GBQualifier>\n<GBQualifer_name>translation\nGBQualifier_value\n<\GBQualifier> sets of 4 lines
2645+ # $xml_string =~ s/[^\n]+\<GBQualifier\>\n[^\n]+\<GBQualifier\_name\>translation\<\/GBQualifier\_name\>\n[^\n]+\<GBQualifier\_value\>\w+\<\/GBQualifier\_value\>\n[^\n]+\<\/GBQualifier\>\n//g;
2646+ $xml = eval { XML::LibXML-> load_xml(string => $xml_string ); };
2647+ if ($@ ) { $xml_valid = 0; }
2648+ else { $xml_valid = 1; }
2649+ }
26482650
2649- if (! $xml_valid ) {
2650- if (! $looks_like_rnacentral ) {
2651+ if (! $xml_valid ) {
26512652 # the get() command either failed (returned undef) or
26522653 # returned an invalid xml string, either way we
26532654 # wait a few seconds ($nseconds) and try again (up to
@@ -2680,95 +2681,95 @@ sub genbank_fetch_seq_info {
26802681 croak " ERROR trying to fetch sequence data for sequence $name from genbank, reached maximum allowed number of attempts ($attempt_ctr )" ;
26812682 }
26822683 }
2683- }
2684- else {
2685- # if we get here: we know that $xml_string is defined and valid
2686- # and $xml is ready for parsing
2687- foreach my $gbseq ($xml -> findnodes(' //GBSeq' )) {
2688- my $accver = $gbseq -> findvalue(' ./GBSeq_accession-version' );
2689- if (! defined $accver ) {
2690- croak " ERROR in $sub_name problem parsing XML, no accession-version read" ;
2691- }
2692- if (! exists $info_HHR -> {$accver }) {
2693- croak " ERROR in $sub_name problem parsing XML, unexpected accession.version $accver " ;
2694- }
2695-
2696- my $description = $gbseq -> findvalue(' ./GBSeq_definition' );
2697- if (! defined $description ) {
2698- croak " ERROR in $sub_name problem parsing XML, no definition (description) read" ;
2699- }
2700- $info_HHR -> {$accver }{" description" } = $description ;
2701-
2702- my $length = $gbseq -> findvalue(' ./GBSeq_length' );
2703- if (! defined $length ) {
2704- croak " ERROR in $sub_name problem parsing XML, no length read" ;
2705- }
2706- $info_HHR -> {$accver }{" length" } = $length ;
2707-
2708- # for taxid and mol_type, we have to fetch from Qualifier_values, and may have more than 1
2709- # in that case, they will be concatenated together
2710- my $taxid_val = $gbseq -> findvalue(' ./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_value[starts-with(text(), "taxon:")]' );
2711- my $taxid = undef ;
2712- my $orig_taxid_val = $taxid_val ;
2713- if (! defined $taxid_val ) {
2714- croak " ERROR in $sub_name did not read taxon info for $accver " ;
2715- }
2716- # $taxid_val will be concatenation of taxon:<\d+> N >= 1 times, we want to make sure <\d+> is equivalent all N instances
2717- while ($taxid_val =~ / ^taxon\: (\d +)/ ) {
2718- my $cur_taxid = $1 ;
2719- if (! defined $taxid ) { # first taxid
2720- $taxid = $cur_taxid ;
2684+ else {
2685+ # if we get here: we know that $xml_string is defined and valid
2686+ # and $xml is ready for parsing
2687+ foreach my $gbseq ($xml -> findnodes(' //GBSeq' )) {
2688+ my $accver = $gbseq -> findvalue(' ./GBSeq_accession-version' );
2689+ if (! defined $accver ) {
2690+ croak " ERROR in $sub_name problem parsing XML, no accession-version read" ;
27212691 }
2722- elsif ($cur_taxid != $taxid ) {
2723- ; # do nothing, see comment below
2724- # Change Jan 29, 2021: if multiple taxids are returned for a single accession,
2725- # always use the first one and don't complain. Previously we would fail here
2726- # with following croak:
2727- # croak "ERROR in $sub_name for $accver, > 1 taxids read: $taxid and $cur_taxid\nFull taxon values read: $orig_taxid_val\n";
2692+ if (! exists $info_HHR -> {$accver }) {
2693+ croak " ERROR in $sub_name problem parsing XML, unexpected accession.version $accver " ;
27282694 }
2729- $taxid_val =~ s / ^taxon\: (\d +)// ;
2730- }
2731- if ($taxid_val ne " " ) {
2732- croak " ERROR in $sub_name could not parse taxon info $accver \n Full taxon values read: $orig_taxid_val \n " ;
2733- }
2734- $info_HHR -> {$accver }{" ncbi_id" } = $taxid ;
2735-
2736- # mol_type is like taxid in that we may fetch more than one value concatenated together
2737- # but more complicated because we don't have the 'taxon:' at the beginning to use to parse
2738- # to determine the single value that we want
2739- # For example if we have more than 3 mol_type qualifiers, they will just be concatenated
2740- # together like "genomic DNAgenomic DNAgenomic DNA" and the single value we want is
2741- # "genomic DNA". To figure out the single value we assume it is repeated N times,
2742- # determine N, then determine its length, use substr to get it, and then verify
2743- # we have that same single value concatenated N times.
2744- my $mol_type_val = $gbseq -> findvalue(' ./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value' );
2745- my $mol_type = undef ;
2746- my $orig_mol_type_val = $mol_type_val ;
2747- my $nmol_type = $gbseq -> findvalue(' count(./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value)' );
2748- # the value we want ($mol_type) is concatenated $nmol_type times together in $mol_type_val, determine what it is, croaking if we can't
2749- if ((length ($mol_type_val ) % ($nmol_type )) != 0) {
2750- croak " ERROR in $sub_name could not parse mol_type info $mol_type_val \n " ;
2751- }
2752- my $mol_type_len = int ((length ($mol_type_val ) / $nmol_type ) + 0.01);
2753- my $mol_type_val_start = 0;
2754- while ($mol_type_val_start < $mol_type_len ) {
2755- my $cur_mol_type = substr ($mol_type_val , $mol_type_val_start , $mol_type_len );
2756- if (! defined $mol_type ) {
2757- $mol_type = $cur_mol_type ;
2695+
2696+ my $description = $gbseq -> findvalue(' ./GBSeq_definition' );
2697+ if (! defined $description ) {
2698+ croak " ERROR in $sub_name problem parsing XML, no definition (description) read" ;
27582699 }
2759- elsif ($cur_mol_type ne $mol_type ) {
2760- croak " ERROR in $sub_name for $accver , > 1 mol_types read: $mol_type and $cur_mol_type \n Full mol_type values read: $orig_mol_type_val \n " ;
2700+ $info_HHR -> {$accver }{" description" } = $description ;
2701+
2702+ my $length = $gbseq -> findvalue(' ./GBSeq_length' );
2703+ if (! defined $length ) {
2704+ croak " ERROR in $sub_name problem parsing XML, no length read" ;
27612705 }
2762- $mol_type_val_start += $mol_type_len ;
2763- }
2764- if ($mol_type_val_start != $mol_type_len ) {
2765- croak " ERROR in $sub_name could not parse mol_type value for $accver \n Full mol_type values read: $orig_mol_type_val \n " ;
2706+ $info_HHR -> {$accver }{" length" } = $length ;
2707+
2708+ # for taxid and mol_type, we have to fetch from Qualifier_values, and may have more than 1
2709+ # in that case, they will be concatenated together
2710+ my $taxid_val = $gbseq -> findvalue(' ./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_value[starts-with(text(), "taxon:")]' );
2711+ my $taxid = undef ;
2712+ my $orig_taxid_val = $taxid_val ;
2713+ if (! defined $taxid_val ) {
2714+ croak " ERROR in $sub_name did not read taxon info for $accver " ;
2715+ }
2716+ # $taxid_val will be concatenation of taxon:<\d+> N >= 1 times, we want to make sure <\d+> is equivalent all N instances
2717+ while ($taxid_val =~ / ^taxon\: (\d +)/ ) {
2718+ my $cur_taxid = $1 ;
2719+ if (! defined $taxid ) { # first taxid
2720+ $taxid = $cur_taxid ;
2721+ }
2722+ elsif ($cur_taxid != $taxid ) {
2723+ ; # do nothing, see comment below
2724+ # Change Jan 29, 2021: if multiple taxids are returned for a single accession,
2725+ # always use the first one and don't complain. Previously we would fail here
2726+ # with following croak:
2727+ # croak "ERROR in $sub_name for $accver, > 1 taxids read: $taxid and $cur_taxid\nFull taxon values read: $orig_taxid_val\n";
2728+ }
2729+ $taxid_val =~ s / ^taxon\: (\d +)// ;
2730+ }
2731+ if ($taxid_val ne " " ) {
2732+ croak " ERROR in $sub_name could not parse taxon info $accver \n Full taxon values read: $orig_taxid_val \n " ;
2733+ }
2734+ $info_HHR -> {$accver }{" ncbi_id" } = $taxid ;
2735+
2736+ # mol_type is like taxid in that we may fetch more than one value concatenated together
2737+ # but more complicated because we don't have the 'taxon:' at the beginning to use to parse
2738+ # to determine the single value that we want
2739+ # For example if we have more than 3 mol_type qualifiers, they will just be concatenated
2740+ # together like "genomic DNAgenomic DNAgenomic DNA" and the single value we want is
2741+ # "genomic DNA". To figure out the single value we assume it is repeated N times,
2742+ # determine N, then determine its length, use substr to get it, and then verify
2743+ # we have that same single value concatenated N times.
2744+ my $mol_type_val = $gbseq -> findvalue(' ./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value' );
2745+ my $mol_type = undef ;
2746+ my $orig_mol_type_val = $mol_type_val ;
2747+ my $nmol_type = $gbseq -> findvalue(' count(./GBSeq_feature-table/GBFeature/GBFeature_quals/GBQualifier/GBQualifier_name[text()="mol_type"]/following-sibling::GBQualifier_value)' );
2748+ # the value we want ($mol_type) is concatenated $nmol_type times together in $mol_type_val, determine what it is, croaking if we can't
2749+ if ((length ($mol_type_val ) % ($nmol_type )) != 0) {
2750+ croak " ERROR in $sub_name could not parse mol_type info $mol_type_val \n " ;
2751+ }
2752+ my $mol_type_len = int ((length ($mol_type_val ) / $nmol_type ) + 0.01);
2753+ my $mol_type_val_start = 0;
2754+ while ($mol_type_val_start < $mol_type_len ) {
2755+ my $cur_mol_type = substr ($mol_type_val , $mol_type_val_start , $mol_type_len );
2756+ if (! defined $mol_type ) {
2757+ $mol_type = $cur_mol_type ;
2758+ }
2759+ elsif ($cur_mol_type ne $mol_type ) {
2760+ croak " ERROR in $sub_name for $accver , > 1 mol_types read: $mol_type and $cur_mol_type \n Full mol_type values read: $orig_mol_type_val \n " ;
2761+ }
2762+ $mol_type_val_start += $mol_type_len ;
2763+ }
2764+ if ($mol_type_val_start != $mol_type_len ) {
2765+ croak " ERROR in $sub_name could not parse mol_type value for $accver \n Full mol_type values read: $orig_mol_type_val \n " ;
2766+ }
2767+ $info_HHR -> {$accver }{" mol_type" } = $mol_type ;
2768+
2769+ $info_HHR -> {$accver }{" source" } = " SEED:GenBank" ;
27662770 }
2767- $info_HHR -> {$accver }{" mol_type" } = $mol_type ;
2768-
2769- $info_HHR -> {$accver }{" source" } = " SEED:GenBank" ;
2770- }
2771- } # end of 'else' entered if $xml_string is defined
2771+ } # end of 'else' entered if $xml_string is defined
2772+ } # end of 'if(! $looks_like_rnacentral)'
27722773 } # end of 'for' loop over seq names for fetching and adding data per seq name
27732774
27742775 return ;
0 commit comments