Skip to content

Commit 296e258

Browse files
committed
Merge resolve
1 parent d46fc20 commit 296e258

File tree

2 files changed

+33
-9
lines changed

2 files changed

+33
-9
lines changed

ace/export.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def export_database(db, foldername, skip_empty=True, table_html=False):
2828

2929
# New table.csv columns
3030
table_columns = [
31-
'pmcid', 'table_id', 'table_label', 'table_caption',
31+
'pmid', 'table_id', 'table_label', 'table_caption',
3232
'table_foot', 'n_header_rows', 'table_raw_file'
3333
]
3434
tables_data = []

ace/scrape.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from selenium.webdriver.common.by import By
1717
from selenium.common.exceptions import TimeoutException
1818
from tqdm import tqdm
19+
from seleniumbase import SB
1920

2021
from ace.utils import PubMedAPI
2122
from ace.config import USER_AGENTS
@@ -256,12 +257,13 @@ def search_pubmed(self, journal, search, retmax=10000, savelist=None,):
256257
return doc
257258

258259

259-
def get_html(self, url, journal, mode='browser'):
260+
def get_html(self, url, journal, mode='browser', headless=True):
260261

261262
''' Get HTML of full-text article. Uses either browser automation (if mode == 'browser')
262263
or just gets the URL directly. '''
263264

264265
if mode == 'browser':
266+
<<<<<<< Updated upstream
265267
driver = Driver(
266268
uc=True,
267269
headless2=True,
@@ -352,6 +354,26 @@ def get_html(self, url, journal, mode='browser'):
352354
By.CLASS_NAME, 'table-expand-inline')))
353355
driver.execute_script("arguments[0].scrollIntoView();", link)
354356
link.click()
357+
=======
358+
with SB(
359+
uc=True, headless2=headless,
360+
agent=random.choice(USER_AGENTS),
361+
incognito=True, disable_csp=True, block_images=True,
362+
) as sb:
363+
sb.activate_cdp_mode(url)
364+
html = sb.get_page_source()
365+
url = sb.get_current_url()
366+
367+
new_url = self.check_for_substitute_url(url, html, journal)
368+
369+
if url != new_url:
370+
sb.activate_cdp_mode(new_url)
371+
url = sb.get_current_url()
372+
373+
if journal.lower() in ['human brain mapping',
374+
'european journal of neuroscience',
375+
'brain and behavior', 'epilepsia']:
376+
>>>>>>> Stashed changes
355377
sleep(0.5 + random.random() * 1)
356378

357379
# If the page title contains ScienceDirect
@@ -394,7 +416,7 @@ def get_html(self, url, journal, mode='browser'):
394416
return r.text
395417

396418

397-
def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True):
419+
def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True, headless=True):
398420
base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
399421
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
400422

@@ -419,15 +441,15 @@ def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_
419441
else:
420442
query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
421443
logger.info(query)
422-
return self.get_html(query, journal, mode=mode)
444+
return self.get_html(query, journal, mode=mode, headless=headless)
423445

424446
if prefer_pmc_source == "only":
425447
logger.info("\tNo PMC source found!! Skipping...")
426448
return
427449

428450
# Fallback if no PMC link found
429451
query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
430-
return self.get_html(query, journal, mode=mode)
452+
return self.get_html(query, journal, mode=mode, headless=headless)
431453

432454

433455
def check_for_substitute_url(self, url, html, journal):
@@ -465,7 +487,7 @@ def is_pmc_open_acess(self, pmcid):
465487

466488
return 'idIsNotOpenAccess' not in response
467489

468-
def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True):
490+
def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True, headless=True):
469491

470492
logger.info("Processing %s..." % id)
471493
journal_path = (self.store / 'html' / journal)
@@ -478,7 +500,7 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
478500
return None, None
479501

480502
# Save the HTML
481-
doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source)
503+
doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source, headless=headless)
482504
valid = None
483505
if doc:
484506
valid = _validate_scrape(doc)
@@ -497,7 +519,8 @@ def process_article(self, id, journal, delay=None, mode='browser', overwrite=Fal
497519

498520
def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mode='browser', search=None,
499521
limit=None, overwrite=False, min_pmid=None, max_pmid=None, shuffle=False,
500-
index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None, prefer_pmc_source=True):
522+
index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None,
523+
prefer_pmc_source=True, headless=True):
501524

502525
''' Try to retrieve all PubMed articles for a single journal that don't
503526
already exist in the storage directory.
@@ -535,6 +558,7 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
535558
(regardless of mode). This is useful for journals that have full-text articles available on PMC,
536559
but are not open-access. If set to "only", will only retrieve articles from PMC, and
537560
skip articles it cannot retrieve from PMC.
561+
headless: When True, runs the browser in headless mode (only relevant if mode == 'browser' and the article is not retrieved from PMC).
538562
'''
539563
articles_found = 0
540564
if journal is None and dois is None and pmids is None:
@@ -612,7 +636,7 @@ def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mod
612636
f.write(f"{pmcid}\n")
613637
continue
614638

615-
filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source)
639+
filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source, headless)
616640

617641
if not valid:
618642
invalid_articles.append(filename)

0 commit comments

Comments
 (0)