Skip to content

Commit ac7e588

Browse files
committed
More Entrez improvements
Merge PubMedSearch and PubMedXMLFetcher. Make PubMedXMLParser fully static. Coordinate calls to the Entrez API instead of blindly waiting between calls to respect usage policies.
1 parent 0a24f2b commit ac7e588

37 files changed

+505
-712
lines changed

gemma-cli/src/main/java/ubic/gemma/apps/ExpressionExperimentPrimaryPubCli.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import org.springframework.beans.factory.annotation.Autowired;
2828
import org.springframework.beans.factory.annotation.Value;
2929
import ubic.gemma.core.loader.entrez.pubmed.ExpressionExperimentBibRefFinder;
30-
import ubic.gemma.core.loader.entrez.pubmed.PubMedXMLFetcher;
30+
import ubic.gemma.core.loader.entrez.pubmed.PubMedSearch;
3131
import ubic.gemma.model.common.description.BibliographicReference;
3232
import ubic.gemma.model.expression.experiment.BioAssaySet;
3333
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
@@ -53,7 +53,7 @@ public class ExpressionExperimentPrimaryPubCli extends ExpressionExperimentManip
5353
private ExpressionExperimentService ees;
5454
@Autowired
5555
private PersisterHelper persisterHelper;
56-
private PubMedXMLFetcher fetcher;
56+
private PubMedSearch fetcher;
5757
private ExpressionExperimentBibRefFinder finder;
5858

5959
@Value("${entrez.efetch.apikey}")
@@ -62,7 +62,7 @@ public class ExpressionExperimentPrimaryPubCli extends ExpressionExperimentManip
6262
@Override
6363
public void afterPropertiesSet() throws Exception {
6464
super.afterPropertiesSet();
65-
this.fetcher = new PubMedXMLFetcher( ncbiApiKey );
65+
this.fetcher = new PubMedSearch( ncbiApiKey );
6666
this.finder = new ExpressionExperimentBibRefFinder( ncbiApiKey );
6767
}
6868

@@ -146,7 +146,7 @@ protected void processExpressionExperiment( ExpressionExperiment experiment ) {
146146
experiment = ees.thawLite( experiment );
147147

148148
// get from GEO or get from a file
149-
BibliographicReference ref = fetcher.retrieveByHTTP( pubmedIds.get( experiment.getShortName() ) );
149+
BibliographicReference ref = fetcher.fetchById( pubmedIds.get( experiment.getShortName() ) );
150150

151151
if ( ref == null ) {
152152
if ( this.pubmedIdFilename != null ) {

gemma-cli/src/main/java/ubic/gemma/apps/MeshTermFetcherCli.java

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@
2121
import org.apache.commons.cli.CommandLine;
2222
import org.apache.commons.cli.Option;
2323
import org.apache.commons.cli.Options;
24+
import org.apache.commons.collections4.ListUtils;
2425
import org.apache.commons.lang3.StringUtils;
2526
import org.springframework.beans.factory.annotation.Value;
26-
import ubic.gemma.core.loader.entrez.pubmed.PubMedXMLFetcher;
2727
import ubic.gemma.cli.util.AbstractCLI;
2828
import ubic.gemma.cli.util.CLI;
29+
import ubic.gemma.core.loader.entrez.pubmed.PubMedSearch;
2930
import ubic.gemma.model.common.description.BibliographicReference;
3031
import ubic.gemma.model.common.description.MedicalSubjectHeading;
3132

@@ -36,6 +37,7 @@
3637
import java.util.Collection;
3738
import java.util.Collections;
3839
import java.util.List;
40+
import java.util.stream.Collectors;
3941

4042
/**
4143
* @author pavlidis
@@ -91,45 +93,29 @@ protected void processOptions( CommandLine commandLine ) {
9193

9294
@Override
9395
protected void doWork() throws Exception {
94-
PubMedXMLFetcher fetcher = new PubMedXMLFetcher( ncbiApiKey );
95-
96-
Collection<Integer> ids = this.readIdsFromFile( file );
97-
Collection<Integer> chunk = new ArrayList<>();
98-
for ( Integer i : ids ) {
99-
100-
chunk.add( i );
101-
102-
if ( chunk.size() == MeshTermFetcherCli.CHUNK_SIZE ) {
103-
104-
this.processChunk( fetcher, chunk );
105-
chunk.clear();
106-
}
107-
}
108-
109-
if ( !chunk.isEmpty() ) {
96+
PubMedSearch fetcher = new PubMedSearch( ncbiApiKey );
97+
List<String> ids = this.readIdsFromFile( file ).stream().distinct().collect( Collectors.toList() );
98+
for ( List<String> chunk : ListUtils.partition( ids, MeshTermFetcherCli.CHUNK_SIZE ) ) {
11099
this.processChunk( fetcher, chunk );
111100
}
112101
}
113102

114-
private Collection<Integer> readIdsFromFile( String inFile ) throws IOException {
103+
private Collection<String> readIdsFromFile( String inFile ) throws IOException {
115104
log.info( "Reading " + inFile );
116-
117-
Collection<Integer> ids = new ArrayList<>();
105+
Collection<String> ids = new ArrayList<>();
118106
try ( BufferedReader in = new BufferedReader( new FileReader( file ) ) ) {
119107
String line;
120108
while ( ( line = in.readLine() ) != null ) {
121109
if ( line.startsWith( "#" ) )
122110
continue;
123-
124-
ids.add( Integer.parseInt( line ) );
125-
111+
ids.add( StringUtils.strip( line ) );
126112
}
127113
}
128114
return ids;
129115
}
130116

131-
private void processChunk( PubMedXMLFetcher fetcher, Collection<Integer> ids ) throws IOException {
132-
Collection<BibliographicReference> refs = fetcher.retrieveByHTTP( ids );
117+
private void processChunk( PubMedSearch fetcher, Collection<String> pubMedIds ) throws IOException {
118+
Collection<BibliographicReference> refs = fetcher.fetchById( pubMedIds );
133119

134120
for ( BibliographicReference r : refs ) {
135121
getCliContext().getOutputStream().print( r.getPubAccession().getAccession() + "\t" );

gemma-cli/src/main/java/ubic/gemma/apps/PubMedSearcher.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ protected void processOptions( CommandLine commandLine ) {
7474
protected void doAuthenticatedWork() throws Exception {
7575
PubMedSearch pubMedSearcher = new PubMedSearch( ncbiApiKey );
7676
Collection<BibliographicReference> refs = pubMedSearcher
77-
.searchAndRetrieveByHTTP( this.args );
77+
.searchAndRetrieve( this.args );
7878

7979
getCliContext().getOutputStream().println( refs.size() + " references found" );
8080

gemma-cli/src/main/java/ubic/gemma/apps/UpdatePubMedCli.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ private BibliographicReference getBibliographicReference( String pubmedId ) {
161161
searchTerms.add( pubmedId );
162162
Collection<BibliographicReference> publications;
163163
try {
164-
publications = pms.searchAndRetrieveIdByHTTP( searchTerms );
164+
publications = pms.fetchById( searchTerms );
165165
} catch ( IOException e ) {
166166
throw new RuntimeException( e );
167167
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package ubic.gemma.core.loader.entrez;
2+
3+
import java.util.List;
4+
5+
/**
 * Unchecked exception carrying the error messages attached to a failed Entrez call.
 * <p>
 * The list of errors is copied when the exception is created and is exposed as an unmodifiable list, so neither
 * later changes to the list supplied by the caller nor changes attempted through {@link #getErrors()} can alter
 * the content of this exception.
 */
public class EntrezException extends RuntimeException {

    private final List<String> errors;

    /**
     * @param message the exception message
     * @param errors  all the error messages to attach to this exception; {@code null} is treated as an empty list
     */
    public EntrezException( String message, List<String> errors ) {
        super( message );
        if ( errors == null ) {
            this.errors = java.util.Collections.emptyList();
        } else {
            // defensive copy: the caller may keep mutating its own list after throwing
            this.errors = java.util.Collections.unmodifiableList( new java.util.ArrayList<>( errors ) );
        }
    }

    /**
     * @return an unmodifiable list of the error messages attached to this exception, never {@code null}
     */
    public List<String> getErrors() {
        return errors;
    }
}

gemma-core/src/main/java/ubic/gemma/core/loader/entrez/EntrezUtils.java

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,26 @@
11
package ubic.gemma.core.loader.entrez;
22

3+
import lombok.extern.apachecommons.CommonsLog;
34
import org.apache.commons.lang3.StringUtils;
4-
import org.springframework.util.Assert;
55
import ubic.gemma.core.config.Settings;
6+
import ubic.gemma.core.util.SimpleRetryCallable;
67

78
import javax.annotation.Nullable;
9+
import java.io.IOException;
810
import java.io.UnsupportedEncodingException;
911
import java.net.MalformedURLException;
1012
import java.net.URL;
1113
import java.net.URLEncoder;
1214
import java.nio.charset.StandardCharsets;
15+
import java.util.concurrent.Callable;
1316

1417
/**
1518
* Low-level utilities for generating Entrez URLs.
1619
* <p>
1720
* Read more about this in <a href="https://www.ncbi.nlm.nih.gov/books/NBK25500/">The E-utilities In-Depth: Parameters, Syntax and More</a>.
1821
* @author poirigui
1922
*/
23+
@CommonsLog
2024
public class EntrezUtils {
2125

2226
public static final String ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
@@ -27,46 +31,68 @@ public class EntrezUtils {
2731
private static final String TOOL = "gemma";
2832
private static final String EMAIL = Settings.getString( "gemma.support.email" );
2933

34+
private static final long TIMEOUT_AUTHENTICATED_MS = 100;
35+
private static final long TIMEOUT_ANONYMOUS_MS = 333;
36+
37+
private static long lastCall = 0L;
38+
39+
@FunctionalInterface
40+
public interface EntrezCall<T> extends Callable<T> {
41+
42+
@Override
43+
T call() throws IOException;
44+
}
45+
3046
/**
31-
* Perform a search on an Entrez database.
47+
* Coordinate calls to the Entrez API so that we always respect the recommended usage.
3248
* <p>
33-
* Results must be subsequently retrieved with {@link #fetch(String, String, String, int, int, String, String)} or
34-
* {@link #summary(String, String, String, int, int, String, String)}. The query key and WebEnv values must be
35-
* extracted from the payload.
49+
* Refer to <a href="https://www.ncbi.nlm.nih.gov/books/NBK25497/">A General Introduction to the E-utilities</a> for
50+
* more information about usage policies.
3651
*/
37-
public static URL search( String db, String term, @Nullable String apiKey ) {
38-
return createUrl( ESEARCH
39-
+ "?db=" + urlEncode( db )
40-
+ "&term=" + urlEncode( term )
41-
+ "&usehistory=y", apiKey );
52+
public synchronized static <T> T doNicely( EntrezCall<T> task, @Nullable String apiKey ) throws IOException {
53+
long timeoutMs = StringUtils.isNotBlank( apiKey ) ? TIMEOUT_AUTHENTICATED_MS : TIMEOUT_ANONYMOUS_MS;
54+
long diff = System.currentTimeMillis() - lastCall;
55+
if ( diff < timeoutMs ) {
56+
try {
57+
log.debug( "Last Entrez API call occurred " + diff + " ms ago, waiting " + ( timeoutMs - diff ) + " ms..." );
58+
Thread.sleep( timeoutMs - diff );
59+
} catch ( InterruptedException e ) {
60+
Thread.currentThread().interrupt();
61+
throw new RuntimeException( e );
62+
}
63+
}
64+
try {
65+
return task.call();
66+
} finally {
67+
lastCall = System.currentTimeMillis();
68+
}
4269
}
4370

4471
/**
45-
* Perform a search on an Entrez database and retrieve results immediately.
72+
* Wrap a {@link SimpleRetryCallable} such that it will respect the recommended usage of the Entrez API.
73+
* @see #doNicely(EntrezCall, String)
4674
*/
47-
public static URL searchAndRetrieve( String db, String term, String retmode, String rettype, @Nullable String apiKey ) {
48-
return createUrl( ESEARCH
49-
+ "?db=" + urlEncode( db )
50-
+ "&term=" + urlEncode( term )
51-
+ "&retmode=" + urlEncode( retmode )
52-
+ "&rettype=" + urlEncode( rettype ), apiKey );
75+
public static <T> SimpleRetryCallable<T, IOException> retryNicely( SimpleRetryCallable<T, IOException> callable, @Nullable String apiKey ) {
76+
return ( ctx ) -> EntrezUtils.doNicely( () -> callable.call( ctx ), apiKey );
5377
}
5478

5579
/**
56-
* Perform a search on an Entrez database and retrieve results immediately.
80+
* Perform a search on an Entrez database.
81+
* <p>
82+
* Results must be subsequently retrieved with {@link #fetch(String, String, String, int, int, String, String)} or
83+
* {@link #summary(String, String, String, int, int, String, String)}. The query key and WebEnv values must be
84+
* extracted from the payload.
5785
*/
58-
public static URL searchAndRetrieve( String db, String term, String retmode, String rettype, int retstart, int retmax, @Nullable String apiKey ) {
86+
public static URL search( String db, String term, String retmode, @Nullable String apiKey ) {
5987
return createUrl( ESEARCH
6088
+ "?db=" + urlEncode( db )
6189
+ "&term=" + urlEncode( term )
62-
+ "&retstart=" + retstart
63-
+ "&retmax=" + retmax
6490
+ "&retmode=" + urlEncode( retmode )
65-
+ "&rettype=" + urlEncode( rettype ), apiKey );
91+
+ "&usehistory=y", apiKey );
6692
}
6793

6894
/**
69-
* Summarize a previous {@link #search(String, String, String)} query.
95+
* Summarize a previous {@link #search(String, String, String, String)} query.
7096
*/
7197
public static URL summary( String db, String queryKey, String retmode, int retstart, int retmax, String webEnv, @Nullable String apiKey ) {
7298
return createUrl( ESUMMARY
@@ -90,7 +116,7 @@ public static URL fetchById( String db, String id, String retmode, String rettyp
90116
}
91117

92118
/**
93-
* Retrieve the results of a previous {@link #search(String, String, String)} query.
119+
* Retrieve the results of a previous {@link #search(String, String, String, String)} query.
94120
*/
95121
public static URL fetch( String db, String queryKey, String retmode, int retstart, int retmax, String webEnv, @Nullable String apiKey ) {
96122
return createUrl( EFETCH
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package ubic.gemma.core.loader.entrez;
2+
3+
import org.apache.commons.lang3.StringUtils;
4+
import org.w3c.dom.Document;
5+
import org.w3c.dom.Node;
6+
import org.w3c.dom.NodeList;
7+
import org.xml.sax.InputSource;
8+
import org.xml.sax.SAXException;
9+
import ubic.gemma.core.util.XMLUtils;
10+
11+
import javax.xml.parsers.DocumentBuilder;
12+
import javax.xml.parsers.ParserConfigurationException;
13+
import java.io.IOException;
14+
import java.io.InputStream;
15+
import java.util.ArrayList;
16+
import java.util.Collection;
17+
import java.util.HashSet;
18+
import java.util.List;
19+
20+
/**
21+
* Utilities for manipulating Entrez XML responses.
22+
* @author poirigui
23+
*/
24+
public class EntrezXmlUtils {
25+
26+
/**
27+
* Parse an XML reply from Entrez.
28+
* <p>
29+
* This will check if there are any {@code ERROR} tags.
30+
*/
31+
public static Document parse( InputStream is ) throws IOException {
32+
try {
33+
DocumentBuilder builder = createDocumentBuilder();
34+
Document doc = builder.parse( is );
35+
checkForErrors( doc );
36+
return doc;
37+
} catch ( ParserConfigurationException | SAXException e ) {
38+
throw new RuntimeException( e );
39+
}
40+
}
41+
42+
public static Document parse( InputSource is ) throws IOException {
43+
try {
44+
DocumentBuilder builder = createDocumentBuilder();
45+
Document doc = builder.parse( is );
46+
checkForErrors( doc );
47+
return doc;
48+
} catch ( ParserConfigurationException | SAXException e ) {
49+
throw new RuntimeException( e );
50+
}
51+
}
52+
53+
private static void checkForErrors( Document doc ) {
54+
NodeList error = doc.getDocumentElement().getElementsByTagName( "ERROR" );
55+
if ( error.item( 0 ) != null ) {
56+
List<String> errors = new ArrayList<>();
57+
for ( Node elem = error.item( 0 ); elem != null; elem = elem.getNextSibling() ) {
58+
errors.add( XMLUtils.getTextValue( elem ) );
59+
}
60+
throw new EntrezException( errors.get( 0 ), errors );
61+
}
62+
}
63+
64+
65+
/**
66+
* Create a document builder with {@link NcbiEntityResolver} as entity resolver.
67+
* <p>
68+
* This will work for most of not all XML files from NCBI Entrez and related services.
69+
*/
70+
private static DocumentBuilder createDocumentBuilder() throws ParserConfigurationException {
71+
DocumentBuilder builder = ubic.gemma.core.util.XMLUtils.createDocumentBuilder();
72+
builder.setEntityResolver( new NcbiEntityResolver() );
73+
return builder;
74+
}
75+
76+
public static Collection<String> extractIds( Document doc ) {
77+
NodeList idList = doc.getElementsByTagName( "Id" );
78+
Collection<String> result = new HashSet<>();
79+
for ( Node elem = idList.item( 0 ); elem != null; elem = elem.getNextSibling() ) {
80+
String val = XMLUtils.getTextValue( elem );
81+
if ( StringUtils.isBlank( val ) ) {
82+
continue;
83+
}
84+
result.add( val );
85+
}
86+
return result;
87+
}
88+
89+
public static int getCount( Document document ) {
90+
return Integer.parseInt( XMLUtils.getTextValue( document.getElementsByTagName( "Count" ).item( 0 ) ) );
91+
}
92+
93+
public static String getQueryId( Document document ) {
94+
return XMLUtils.getTextValue( XMLUtils.getUniqueItem( document.getElementsByTagName( "QueryKey" ) ) );
95+
}
96+
97+
public static String getCookie( Document document ) {
98+
return XMLUtils.getTextValue( XMLUtils.getUniqueItem( document.getElementsByTagName( "WebEnv" ) ) );
99+
}
100+
}

0 commit comments

Comments
 (0)