 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import TimeoutException
 from tqdm import tqdm
+from seleniumbase import SB
 
 from ace.utils import PubMedAPI
 from ace.config import USER_AGENTS
@@ -256,12 +257,13 @@ def search_pubmed(self, journal, search, retmax=10000, savelist=None,):
         return doc
 
 
-    def get_html(self, url, journal, mode='browser'):
+    def get_html(self, url, journal, mode='browser', headless=True):
 
         ''' Get HTML of full-text article. Uses either browser automation (if mode == 'browser')
         or just gets the URL directly. '''
 
         if mode == 'browser':
+<<<<<<< Updated upstream
             driver = Driver(
                 uc=True,
                 headless2=True,
@@ -352,6 +354,26 @@ def get_html(self, url, journal, mode='browser'):
                         By.CLASS_NAME, 'table-expand-inline')))
                     driver.execute_script("arguments[0].scrollIntoView();", link)
                     link.click()
+=======
+            with SB(
+                uc=True, headless2=headless,
+                agent=random.choice(USER_AGENTS),
+                incognito=True, disable_csp=True, block_images=True,
+            ) as sb:
+                sb.activate_cdp_mode(url)
+                html = sb.get_page_source()
+                url = sb.get_current_url()
+
+                new_url = self.check_for_substitute_url(url, html, journal)
+
+                if url != new_url:
+                    sb.activate_cdp_mode(new_url)
+                    url = sb.get_current_url()
+
+                if journal.lower() in ['human brain mapping',
+                                       'european journal of neuroscience',
+                                       'brain and behavior', 'epilepsia']:
+>>>>>>> Stashed changes
                 sleep(0.5 + random.random() * 1)
 
         # If title has ScienceDirect in in title
@@ -394,7 +416,7 @@ def get_html(self, url, journal, mode='browser'):
         return r.text
 
 
-    def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True):
+    def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True, headless=True):
         base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
         "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
 
@@ -419,15 +441,15 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
         else:
             query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
             logger.info(query)
-            return self.get_html(query, journal, mode=mode)
+            return self.get_html(query, journal, mode=mode, headless=headless)
 
         if prefer_pmc_source == "only":
             logger.info("\tNo PMC source found!! Skipping...")
             return
 
         # Fallback if no PMC link found
         query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
-        return self.get_html(query, journal, mode=mode)
+        return self.get_html(query, journal, mode=mode, headless=headless)
 
 
     def check_for_substitute_url(self, url, html, journal):
@@ -465,7 +487,7 @@ def is_pmc_open_acess(self, pmcid):
 
         return 'idIsNotOpenAccess' not in response
 
-    def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True):
+    def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True, headless=True):
 
         logger.info("Processing %s..." % id)
         journal_path = (self.store / 'html' / journal)
@@ -478,7 +500,7 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
             return None, None
 
         # Save the HTML
-        doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source)
+        doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source, headless=headless)
         valid = None
         if doc:
             valid = _validate_scrape(doc)
@@ -497,7 +519,8 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
 
     def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mode='browser', search=None,
                           limit=None, overwrite=False, min_pmid=None, max_pmid=None, shuffle=False,
-                          index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None, prefer_pmc_source=True):
+                          index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None,
+                          prefer_pmc_source=True, headless=True):
 
         ''' Try to retrieve all PubMed articles for a single journal that don't
         already exist in the storage directory.
@@ -535,6 +558,7 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
             (regardless of mode). This is useful for journals that have full-text articles available on PMC,
             but are not open-access. If set to "only", will only retrieve articles from PMC, and
             skip articles it cannot retrieve from PMC.
+            headless: When True, runs the browser in headless mode (only relevant if mode=='browser', and not PMC)
         '''
         articles_found = 0
         if journal is None and dois is None and pmids is None:
@@ -612,7 +636,7 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
                         f.write(f"{pmcid}\n")
                     continue
 
-            filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source)
+            filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source, headless)
 
             if not valid:
                 invalid_articles.append(filename)
0 commit comments