From 46f7911a948fd72c873cd27f17325a1c45bc5abc Mon Sep 17 00:00:00 2001
From: uxairibrar
Date: Sun, 19 Jan 2025 23:09:34 +0100
Subject: [PATCH 1/6] Harvest data from online OJS

---
 publications/admin.py                         | 29 +++++++-
 ...ingevent_alter_publication_url_and_more.py | 49 +++++++++++++
 publications/models.py                        | 54 +++++++++++++-
 publications/tasks.py                         | 73 +++++++++++--------
 tests/test_harvesting.py                      | 19 ++++-
 5 files changed, 188 insertions(+), 36 deletions(-)
 create mode 100644 publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py

diff --git a/publications/admin.py b/publications/admin.py
index 1afec7c..2b00401 100644
--- a/publications/admin.py
+++ b/publications/admin.py
@@ -1,7 +1,9 @@
 from django.contrib import admin
 from leaflet.admin import LeafletGeoAdmin
-from publications.models import Publication
+from publications.models import Publication, Source, HarvestingEvent
 from import_export.admin import ImportExportModelAdmin
+from publications.tasks import harvest_oai_endpoint
+
 
 @admin.action(description="Mark selected publications as published")
 def make_public(modeladmin, request, queryset):
@@ -11,6 +13,18 @@ def make_draft(modeladmin, request, queryset):
     queryset.update(status="d")
 
+@admin.action(description="Trigger harvesting for selected sources")
+def trigger_harvesting_for_specific(modeladmin, request, queryset):
+    for source in queryset:
+        harvest_oai_endpoint(source.id)
+
+@admin.action(description="Trigger harvesting for all sources")
+def trigger_harvesting_for_all(modeladmin, request, queryset):
+    all_sources = Source.objects.all()
+    for source in all_sources:
+        harvest_oai_endpoint(source.id)
+
+
 @admin.register(Publication)
 class PublicationAdmin(LeafletGeoAdmin, ImportExportModelAdmin):
     """Publication Admin."""
@@ -18,3 +32,16 @@ class PublicationAdmin(LeafletGeoAdmin, ImportExportModelAdmin):
     list_display = ("doi", "creationDate", "lastUpdate", "created_by", "updated_by", "status", "provenance")
 
     actions = [make_public,make_draft]
+
+@admin.register(Source)
+class SourceAdmin(admin.ModelAdmin):
+    list_display = ("id", "url_field", "harvest_interval_minutes", "last_harvest")
+    list_filter = ("harvest_interval_minutes",)
+    search_fields = ("url_field",)
+    actions = [trigger_harvesting_for_specific,trigger_harvesting_for_all]
+
+@admin.register(HarvestingEvent)
+class HarvestingEventAdmin(admin.ModelAdmin):
+    list_display = ("id", "source", "status", "started_at", "completed_at")
+    list_filter = ("status", "started_at", "completed_at")
+    search_fields = ("source__url",)
diff --git a/publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py b/publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py
new file mode 100644
index 0000000..1cb1d9e
--- /dev/null
+++ b/publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py
@@ -0,0 +1,49 @@
+# Generated by Django 4.0.5 on 2025-01-19 22:04
+
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+        ('publications', '0003_alter_publication_timeperiod_enddate_and_more'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='HarvestingEvent',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('started_at', models.DateTimeField(auto_now_add=True)),
+                ('completed_at', models.DateTimeField(blank=True, null=True)),
+                ('status', models.CharField(choices=[('pending', 'Pending'), ('in_progress', 'In Progress'), ('completed', 'Completed'), ('failed', 'Failed')], default='pending', max_length=16)),
+            ],
+        ),
+        migrations.AlterField(
+            model_name='publication',
+            name='url',
+            field=models.URLField(blank=True, max_length=1024, null=True, unique=True),
+        ),
+        migrations.AddConstraint(
+            model_name='publication',
+            constraint=models.UniqueConstraint(fields=('doi', 'url'), name='unique_publication_entry'),
+        ),
+        migrations.AddField(
+            model_name='harvestingevent',
+            name='source',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='harvesting_events', to='publications.source'),
+        ),
+        migrations.AddField(
+            model_name='harvestingevent',
+            name='user',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AddField(
+            model_name='publication',
+            name='job',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='publications', to='publications.harvestingevent'),
+        ),
+    ]
diff --git a/publications/models.py b/publications/models.py
index 520aaae..ad73083 100644
--- a/publications/models.py
+++ b/publications/models.py
@@ -1,6 +1,7 @@
 from django.contrib.gis.db import models
 from django.contrib.postgres.fields import ArrayField
 from django_currentuser.db.models import CurrentUserField
+from django_q.models import Schedule
 
 STATUS_CHOICES = (
     ("d", "Draft"),
@@ -34,17 +35,31 @@ class Publication(models.Model):
     publicationDate = models.DateField(null=True,blank=True)
     title = models.TextField(null=True, blank=True)
     abstract = models.TextField(null=True, blank=True)
-    url = models.URLField(max_length=1024, null=True, blank=True)
+    url = models.URLField(max_length=1024, null=True, blank=True, unique=True)
     geometry = models.GeometryCollectionField(verbose_name='Publication geometry/ies', srid = 4326, null=True, blank=True)# https://docs.openalex.org/api-entities/sources
     timeperiod_startdate = ArrayField(models.CharField(max_length=1024, null=True), null=True, blank=True)
     timeperiod_enddate = ArrayField(models.CharField(max_length=1024, null=True), null=True, blank=True)
 
+    # Linking to HarvestingEvent as "job"
+    job = models.ForeignKey(
+        'HarvestingEvent',
+        on_delete=models.CASCADE,
+        related_name='publications',
+        null=True,
+        blank=True
+    )
+
+
     def get_absolute_url(self):
         return "/api/v1/publications/%i.json" % self.id # http://localhost:8000/api/v1/publications/5.json
 
     class Meta:
         ordering = ['-id']
 
+        constraints = [
+            models.UniqueConstraint(fields=['doi', 'url'], name='unique_publication_entry')
+        ]
+
     def __str__(self):
         """Return string representation."""
@@ -67,7 +82,20 @@ class Source(models.Model):
     url_field = models.URLField(max_length = 999)
     harvest_interval_minutes = models.IntegerField(default=60*24*3)
     last_harvest = models.DateTimeField(auto_now_add=True,null=True)
-
+
+    def save(self, *args, **kwargs):
+        super().save(*args, **kwargs)
+        Schedule.objects.filter(name=f"Harvest Source {self.id}").delete()  # Avoid duplicates
+        Schedule.objects.create(
+            func='publications.tasks.harvest_oai_endpoint',
+            args=str(self.id),
+            schedule_type=Schedule.MINUTES,
+            minutes=self.harvest_interval_minutes,
+            name=f"Harvest Source {self.id}",
+        )
+
+
+
 class Subscription(models.Model):
     name = models.CharField(max_length=4096)
     search_term = models.CharField(max_length=4096,null=True)
@@ -104,3 +132,25 @@ class PublicationResource(resources.ModelResource):
     class Meta:
         model = Publication
         fields = ('created_by','updated_by',)
+
+
+
+class HarvestingEvent(models.Model):
+    source = models.ForeignKey('Source', on_delete=models.CASCADE, related_name='harvesting_events')
+    user = models.ForeignKey(User, on_delete=models.SET_NULL, null=True, blank=True)
+    started_at = models.DateTimeField(auto_now_add=True)
+    completed_at = models.DateTimeField(null=True, blank=True)
+    status = models.CharField(
+        max_length=16,
+        choices=(
+            ('pending', 'Pending'),
+            ('in_progress', 'In Progress'),
+            ('completed', 'Completed'),
+            ('failed', 'Failed'),
+        ),
+        default='pending'
+    )
+
+    def __str__(self):
+        return f"Harvesting Event ({self.status}) for {self.source.url} at {self.started_at}"
+
diff --git a/publications/tasks.py b/publications/tasks.py
index c27f94e..6a02105 100644
--- a/publications/tasks.py
+++ b/publications/tasks.py
@@ -2,12 +2,13 @@
 logger = logging.getLogger(__name__)
 
 from django_q.models import Schedule
-from publications.models import Publication
+from publications.models import Publication, HarvestingEvent, Source
 from bs4 import BeautifulSoup
 import json
 import xml.dom.minidom
 from django.contrib.gis.geos import GEOSGeometry
 import requests
+from datetime import datetime
 
 
 def extract_geometry_from_html(content):
@@ -16,7 +17,6 @@ def extract_geometry_from_html(content):
             data = tag.get("content", None)
             try:
                 geom = json.loads(data)
-
                 geom_data = geom["features"][0]["geometry"]
                 # preparing geometry data in accordance to geosAPI fields
                 type_geom= {'type': 'GeometryCollection'}
@@ -43,47 +43,46 @@ def extract_timeperiod_from_html(content):
     # returning arrays for array field in DB
     return [period[0]], [period[1]]
 
-def parse_oai_xml_and_save_publications(content):
+def parse_oai_xml_and_save_publications(content, event):
+
     DOMTree = xml.dom.minidom.parseString(content)
     collection = DOMTree.documentElement # pass DOMTree as argument
     articles = collection.getElementsByTagName("dc:identifier")
-    articles_count_in_journal = len(articles)
-    for i in range(articles_count_in_journal):
-        identifier = collection.getElementsByTagName("dc:identifier")
-        identifier_value = identifier[i].firstChild.nodeValue
-        if identifier_value.startswith('http'):
-            with requests.get(identifier_value) as response:
-                soup = BeautifulSoup(response.content, 'html.parser')
+    for article in articles:
+        identifier_value = article.firstChild.nodeValue if article.firstChild else None
+
+        if Publication.objects.filter(url=identifier_value).exists():
+            logger.info('Skipping duplicate publication: %s', identifier_value)
+            continue  # Skip if publication already exists
+        if identifier_value and identifier_value.startswith("http"):
+            with requests.get(identifier_value) as response:
+                soup = BeautifulSoup(response.content, "html.parser")
                 geom_object = extract_geometry_from_html(soup)
                 period_start, period_end = extract_timeperiod_from_html(soup)
+
         else:
             geom_object = None
            period_start = []
            period_end = []
 
+        doi_value = collection.getElementsByTagName("dc:identifier")
+        doi_text = doi_value[0].firstChild.nodeValue if doi_value else None
+
+        if doi_text and Publication.objects.filter(doi__iexact=doi_text).exists():
+            logger.info('Skipping duplicate publication (DOI): %s', doi_text)
+            continue
+
         title = collection.getElementsByTagName("dc:title")
-        if title:
-            title_value = title[0].firstChild.nodeValue
-        else :
-            title_value = None
+        title_value = title[0].firstChild.nodeValue if title else None
 
         abstract = collection.getElementsByTagName("dc:description")
-        if abstract:
-            abstract_text = abstract[0].firstChild.nodeValue
-        else:
-            abstract_text = None
+        abstract_text = abstract[0].firstChild.nodeValue if abstract else None
 
         journal = collection.getElementsByTagName("dc:publisher")
-        if journal:
-            journal_value = journal[0].firstChild.nodeValue
-        else:
-            journal_value = None
+        journal_value = journal[0].firstChild.nodeValue if journal else None
 
         date = collection.getElementsByTagName("dc:date")
-        if date:
-            date_value = date[0].firstChild.nodeValue
-        else:
-            date_value = None
+        date_value = date[0].firstChild.nodeValue if date else None
 
         publication = Publication(
             title = title_value,
@@ -97,10 +96,22 @@
         publication.save()
         logger.info('Saved new publication for %s: %s', identifier_value, publication)
 
-def harvest_oai_endpoint(url):
+def harvest_oai_endpoint(source_id):
+    source = Source.objects.get(id=source_id)
+    event = HarvestingEvent.objects.create(source=source, status="in_progress")
     try:
-        with requests.Session() as s:
-            response = s.get(url)
-            parse_oai_xml_and_save_publications(response.content)
+        with requests.Session() as session:
+            response = session.get(source.url_field)
+            response.raise_for_status()
+            parse_oai_xml_and_save_publications(response.content, event)
+
+        event.status = "completed"
+        event.completed_at = datetime.now()
+        event.save()
+        print("Harvesting completed for %s", source.url_field)
     except requests.exceptions.RequestException as e:
-        print ("The requested URL is invalid or has bad connection.Please change the URL")
+        print("Error harvesting from %s: %s", source.url_field, e)
+        event.status = "failed"
+        event.log = str(e)
+        event.save()
+
diff --git a/tests/test_harvesting.py b/tests/test_harvesting.py
index 13f3ee1..889ef4b 100644
--- a/tests/test_harvesting.py
+++ b/tests/test_harvesting.py
@@ -1,7 +1,7 @@
 import os
 from django.test import Client, TestCase
 from publications.tasks import parse_oai_xml_and_save_publications
-from publications.models import Publication
+from publications.models import Publication, Source, Schedule
 import httpretty
 
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'optimap.settings')
@@ -70,4 +70,19 @@ def test_api_publication_2(self):
         self.assertEqual(body['geometry']['geometries'][0]['type'], 'Polygon')
         self.assertIsNone(body['properties']['doi'])
         self.assertEqual(body['properties']['timeperiod_enddate'],['2022-03-31'])
-        self.assertEqual(body['properties']['url'],'http://localhost:8330/index.php/opti-geo/article/view/2')
\ No newline at end of file
+        self.assertEqual(body['properties']['url'],'http://localhost:8330/index.php/opti-geo/article/view/2')
+
+    def test_task_scheduling(self):
+        source = Source.objects.create(url_field="http://example.com/oai", harvest_interval_minutes=60)
+        source.save()
+
+        schedule = Schedule.objects.filter(name=f"Harvest Source {source.id}")
+        self.assertTrue(schedule.exists(), "Django-Q task not scheduled for source.")
+
+    def test_no_duplicates(self):
+        with open("path/to/oai_dc.xml") as oai:
+            parse_oai_xml_and_save_publications(oai.read(), event=None)
+            parse_oai_xml_and_save_publications(oai.read(), event=None)
+
+        self.assertEqual(Publication.objects.count(), 2, "Duplicate publications were created.")
+
From 8e415e08bab886996cfa18e3918dc83d332462f5 Mon Sep 17 00:00:00 2001
From: uxairibrar
Date: Thu, 13 Feb 2025 15:09:18 +0100
Subject: [PATCH 2/6] Changes made in the harvesting test and tasks

---
 optimap/settings.py                           |   6 +-
 .../migrations/0005_alter_publication_doi.py  |  18 +++
 .../migrations/0006_alter_publication_doi.py  |  18 +++
 publications/models.py                        |   4 +-
 publications/tasks.py                         | 120 +++++++++++-------
 requirements.txt                              |   1 +
 tests/test_harvesting.py                      |  26 +++-
 7 files changed, 133 insertions(+), 60 deletions(-)
 create mode 100644 publications/migrations/0005_alter_publication_doi.py
 create mode 100644 publications/migrations/0006_alter_publication_doi.py

diff --git a/optimap/settings.py b/optimap/settings.py
index d252053..cfa8fe3 100644
--- a/optimap/settings.py
+++ b/optimap/settings.py
@@ -35,7 +35,7 @@
 SECRET_KEY = env('SECRET_KEY', default='django-insecure')
 
 # SECURITY WARNING: don't run with debug turned on in production!
-DEBUG = env('OPTIMAP_DEBUG', default=True)
+DEBUG = True
 
 ALLOWED_HOSTS = [i.strip('[]') for i in env('OPTIMAP_ALLOWED_HOST', default='*').split(',')]
 
@@ -173,7 +173,7 @@ CACHE_MIDDLEWARE_SECONDS = env('OPTIMAP_CACHE_SECONDS', default=3600)
 
 # for testing email sending
 EMAIL_BACKEND = 'django.core.mail.backends.smtp.EmailBackend'
-EMAIL_BACKEND = env('OPTIMAP_EMAIL_BACKEND', default='django.core.mail.backends.console.EmailBackend')
+EMAIL_BACKEND = env('OPTIMAP_EMAIL_BACKEND', default='django.core.mail.backends.smtp.EmailBackend')
 EMAIL_HOST = env('OPTIMAP_EMAIL_HOST', default='optimap.dev')
 EMAIL_PORT = env('OPTIMAP_EMAIL_PORT_SMTP', default=587)
 EMAIL_HOST_IMAP = env('OPTIMAP_EMAIL_HOST_IMAP', default='optimap.imap')
@@ -183,6 +183,8 @@ EMAIL_USE_TLS = env('OPTIMAP_EMAIL_USE_TLS', default=False)
 EMAIL_USE_SSL = env('OPTIMAP_EMAIL_USE_SSL', default=False)
 EMAIL_IMAP_SENT_FOLDER = env('OPTIMAP_EMAIL_IMAP_SENT_FOLDER', default='')
 
+OAI_USERNAME = os.getenv("OAI_USERNAME")
+OAI_PASSWORD = os.getenv("OAI_PASSWORD")
 
 MIDDLEWARE = [
     'django.middleware.cache.UpdateCacheMiddleware',
diff --git a/publications/migrations/0005_alter_publication_doi.py b/publications/migrations/0005_alter_publication_doi.py
new file mode 100644
index 0000000..4417305
--- /dev/null
+++ b/publications/migrations/0005_alter_publication_doi.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.0.5 on 2025-02-13 12:45
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('publications', '0004_harvestingevent_alter_publication_url_and_more'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='publication',
+            name='doi',
+            field=models.CharField(blank=True, max_length=1024, unique=True),
+        ),
+    ]
diff --git a/publications/migrations/0006_alter_publication_doi.py b/publications/migrations/0006_alter_publication_doi.py
new file mode 100644
index 0000000..73afc2d
--- /dev/null
+++ b/publications/migrations/0006_alter_publication_doi.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.0.5 on 2025-02-13 13:39
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('publications', '0005_alter_publication_doi'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='publication',
+            name='doi',
+            field=models.CharField(blank=True, max_length=1024, null=True, unique=True),
+        ),
+    ]
diff --git a/publications/models.py b/publications/models.py
index ad73083..5c5bb10 100644
--- a/publications/models.py
+++ b/publications/models.py
@@ -13,7 +13,7 @@ class Publication(models.Model):
 
     # required fields
-    doi = models.CharField(max_length=1024, unique=True)
+    doi = models.CharField(max_length=1024, unique=True, null=True, blank=True)  # ✅ Allow NULL DOIs
     status = models.CharField(max_length=1, choices=STATUS_CHOICES, default="d")
     created_by = CurrentUserField( # see useful hint at https://github.com/zsoldosp/django-currentuser/issues/69
         verbose_name=("Created by"),
@@ -133,8 +133,6 @@ class Meta:
         model = Publication
         fields = ('created_by','updated_by',)
 
-
-
 class HarvestingEvent(models.Model):
     source = models.ForeignKey('Source', on_delete=models.CASCADE, related_name='harvesting_events')
     user = models.ForeignKey(User, on_delete=models.SET_NULL, null=True, blank=True)
diff --git a/publications/tasks.py b/publications/tasks.py
index 6a02105..c1ffe9b 100644
--- a/publications/tasks.py
+++ b/publications/tasks.py
@@ -8,7 +8,9 @@
 import xml.dom.minidom
 from django.contrib.gis.geos import GEOSGeometry
 import requests
-from datetime import datetime
+from django.utils import timezone
+from requests.auth import HTTPBasicAuth
+import os
 
 
 def extract_geometry_from_html(content):
@@ -44,74 +46,96 @@ def extract_timeperiod_from_html(content):
     return [period[0]], [period[1]]
 
 def parse_oai_xml_and_save_publications(content, event):
-
+
     DOMTree = xml.dom.minidom.parseString(content)
-    collection = DOMTree.documentElement # pass DOMTree as argument
-    articles = collection.getElementsByTagName("dc:identifier")
+    collection = DOMTree.documentElement
 
-    for article in articles:
-        identifier_value = article.firstChild.nodeValue if article.firstChild else None
+    records = collection.getElementsByTagName("record")
 
-        if Publication.objects.filter(url=identifier_value).exists():
-            logger.info('Skipping duplicate publication: %s', identifier_value)
-            continue  # Skip if publication already exists
+    if not records:
+        logger.warning("No articles found in OAI-PMH response!")
+        return
 
-        if identifier_value and identifier_value.startswith("http"):
-            with requests.get(identifier_value) as response:
-                soup = BeautifulSoup(response.content, "html.parser")
-                geom_object = extract_geometry_from_html(soup)
-                period_start, period_end = extract_timeperiod_from_html(soup)
+    existing_urls = set(Publication.objects.values_list('url', flat=True))  # ✅ Cache existing URLs
+    existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))  # ✅ Cache existing DOIs
+
+    for record in records:
+        try:
+            def get_text(tag_name):
+                nodes = record.getElementsByTagName(tag_name)
+                return nodes[0].firstChild.nodeValue.strip() if nodes and nodes[0].firstChild else None
+
+            identifier_value = get_text("dc:identifier")
+            title_value = get_text("dc:title")
+            abstract_text = get_text("dc:description")
+            journal_value = get_text("dc:publisher")
+            date_value = get_text("dc:date")
+
+            doi_nodes = record.getElementsByTagName("dc:identifier")
+            doi_text = next((node.firstChild.nodeValue.strip() for node in doi_nodes if "doi.org" in node.firstChild.nodeValue), None)
+
+            if not identifier_value or not identifier_value.startswith("http"):
+                logger.warning("Skipping record with invalid URL: %s", identifier_value)
+                continue
+
+            if doi_text and doi_text in existing_dois:
+                logger.info("Skipping duplicate publication (DOI): %s", doi_text)
+                continue
+
+            if identifier_value in existing_urls:
+                logger.info("Skipping duplicate publication (URL): %s", identifier_value)
+                continue
+            existing_urls.add(identifier_value)
+            if doi_text:
+                existing_dois.add(doi_text)
 
-        else:
             geom_object = None
             period_start = []
             period_end = []
+            with requests.get(identifier_value) as response:
+                soup = BeautifulSoup(response.content, "html.parser")
+                geom_object = extract_geometry_from_html(soup)
+                period_start, period_end = extract_timeperiod_from_html(soup)
 
-        doi_value = collection.getElementsByTagName("dc:identifier")
-        doi_text = doi_value[0].firstChild.nodeValue if doi_value else None
-
-        if doi_text and Publication.objects.filter(doi__iexact=doi_text).exists():
-            logger.info('Skipping duplicate publication (DOI): %s', doi_text)
+            publication = Publication(
+                title=title_value,
+                abstract=abstract_text,
+                publicationDate=date_value,
+                url=identifier_value,
+                doi=doi_text if doi_text else None,
+                source=journal_value,
+                geometry=geom_object,
+                timeperiod_startdate=period_start,
+                timeperiod_enddate=period_end
+            )
+            publication.save()
+            print("Saved new publication: %s", identifier_value)
+
+        except Exception as e:
+            print("Error parsing record: %s", str(e))
             continue
 
-        title = collection.getElementsByTagName("dc:title")
-        title_value = title[0].firstChild.nodeValue if title else None
-
-        abstract = collection.getElementsByTagName("dc:description")
-        abstract_text = abstract[0].firstChild.nodeValue if abstract else None
-
-        journal = collection.getElementsByTagName("dc:publisher")
-        journal_value = journal[0].firstChild.nodeValue if journal else None
-
-        date = collection.getElementsByTagName("dc:date")
-        date_value = date[0].firstChild.nodeValue if date else None
-
-        publication = Publication(
-            title = title_value,
-            abstract = abstract_text,
-            publicationDate = date_value,
-            url = identifier_value,
-            journal = journal_value,
-            geometry = geom_object,
-            timeperiod_startdate = period_start,
-            timeperiod_enddate = period_end)
-        publication.save()
-        logger.info('Saved new publication for %s: %s', identifier_value, publication)
-
 def harvest_oai_endpoint(source_id):
     source = Source.objects.get(id=source_id)
     event = HarvestingEvent.objects.create(source=source, status="in_progress")
+
+    username = os.getenv("OAI_USERNAME")
+    password = os.getenv("OAI_PASSWORD")
+
     try:
         with requests.Session() as session:
-            response = session.get(source.url_field)
-            response.raise_for_status()
+            response = session.get(source.url_field, auth=HTTPBasicAuth(username, password))
+            response.raise_for_status()
             parse_oai_xml_and_save_publications(response.content, event)
 
         event.status = "completed"
-        event.completed_at = datetime.now()
+        event.completed_at = timezone.now()
         event.save()
-        print("Harvesting completed for %s", source.url_field)
+        print("Harvesting completed for", source.url_field)
+
     except requests.exceptions.RequestException as e:
-        print("Error harvesting from %s: %s", source.url_field, e)
+        print("Error harvesting from", source.url_field, ":", e)
         event.status = "failed"
         event.log = str(e)
-        event.save()
-
+        event.save()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index aaa358b..ac48bcf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,3 +35,4 @@ whitenoise==6.8.2
 psycopg2-binary==2.9.10
 packaging==21.3
 pycryptodome==3.21.0
+httpretty==1.1.4
\ No newline at end of file
diff --git a/tests/test_harvesting.py b/tests/test_harvesting.py
index 889ef4b..5cfc20f 100644
--- a/tests/test_harvesting.py
+++ b/tests/test_harvesting.py
@@ -3,6 +3,7 @@
 from publications.tasks import parse_oai_xml_and_save_publications
 from publications.models import Publication, Source, Schedule
 import httpretty
+import time
 
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'optimap.settings')
 
@@ -26,7 +27,7 @@ def setUpClass(cls):
             body = article02.read()
         )
 
-        parse_oai_xml_and_save_publications(oai.read())
+        parse_oai_xml_and_save_publications(oai.read(), event=None)
 
     @classmethod
     def tearDownClass(cls):
@@ -73,16 +74,27 @@ def test_api_publication_2(self):
         self.assertEqual(body['properties']['url'],'http://localhost:8330/index.php/opti-geo/article/view/2')
 
     def test_task_scheduling(self):
-        source = Source.objects.create(url_field="http://example.com/oai", harvest_interval_minutes=60)
+        oai_file_path = os.path.join(os.getcwd(), "tests", "harvesting", "journal_1", "oai_dc.xml")
+        source = Source.objects.create(
+            url_field=f"file://{oai_file_path}",
+            harvest_interval_minutes=60
+        )
         source.save()
-
+        time.sleep(2)
         schedule = Schedule.objects.filter(name=f"Harvest Source {source.id}")
         self.assertTrue(schedule.exists(), "Django-Q task not scheduled for source.")
 
     def test_no_duplicates(self):
-        with open("path/to/oai_dc.xml") as oai:
-            parse_oai_xml_and_save_publications(oai.read(), event=None)
-            parse_oai_xml_and_save_publications(oai.read(), event=None)
+        Publication.objects.all().delete()
+        oai_file_path = os.path.join(os.getcwd(), "tests", "harvesting", "journal_1", "oai_dc.xml")
+        print(Publication.objects.count())
+
+        with open(oai_file_path, "r") as oai:
+            content = oai.read()
 
-        self.assertEqual(Publication.objects.count(), 2, "Duplicate publications were created.")
+        parse_oai_xml_and_save_publications(content, event=None)
+        parse_oai_xml_and_save_publications(content, event=None)
+
+        publications_count = Publication.objects.count()
+        self.assertEqual(publications_count, 2, "Duplicate publications were created!")
From 79622f3b60ebff25af86a7867333cac8d5781e21 Mon Sep 17 00:00:00 2001
From: uxairibrar
Date: Fri, 14 Mar 2025 22:26:06 +0100
Subject: [PATCH 3/6] Updated Test Case and Added Admin action for Harvesting
 Scheduling

---
 optimap/settings.py      |  6 +++---
 publications/admin.py    | 23 +++++++++++++++++++++--
 publications/tasks.py    |  4 ++--
 tests/test_harvesting.py | 26 +++++++++++++++++++++-----
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/optimap/settings.py b/optimap/settings.py
index cfa8fe3..c280c8a 100644
--- a/optimap/settings.py
+++ b/optimap/settings.py
@@ -173,7 +173,7 @@ CACHE_MIDDLEWARE_SECONDS = env('OPTIMAP_CACHE_SECONDS', default=3600)
 
 # for testing email sending
 EMAIL_BACKEND = 'django.core.mail.backends.smtp.EmailBackend'
-EMAIL_BACKEND = env('OPTIMAP_EMAIL_BACKEND', default='django.core.mail.backends.smtp.EmailBackend')
+EMAIL_BACKEND = env('OPTIMAP_EMAIL_BACKEND', default='django.core.mail.backends.console.EmailBackend')
 EMAIL_HOST = env('OPTIMAP_EMAIL_HOST', default='optimap.dev')
 EMAIL_PORT = env('OPTIMAP_EMAIL_PORT_SMTP', default=587)
 EMAIL_HOST_IMAP = env('OPTIMAP_EMAIL_HOST_IMAP', default='optimap.imap')
@@ -183,8 +183,8 @@ EMAIL_USE_TLS = env('OPTIMAP_EMAIL_USE_TLS', default=False)
 EMAIL_USE_SSL = env('OPTIMAP_EMAIL_USE_SSL', default=False)
 EMAIL_IMAP_SENT_FOLDER = env('OPTIMAP_EMAIL_IMAP_SENT_FOLDER', default='')
 
-OAI_USERNAME = os.getenv("OAI_USERNAME")
-OAI_PASSWORD = os.getenv("OAI_PASSWORD")
+OAI_USERNAME = env("OPTIMAP_OAI_USERNAME", default="")
+OAI_PASSWORD = env("OPTIMAP_OAI_PASSWORD", default="")
 
 MIDDLEWARE = [
     'django.middleware.cache.UpdateCacheMiddleware',
diff --git a/publications/admin.py b/publications/admin.py
index 2b00401..c64dfb3 100644
--- a/publications/admin.py
+++ b/publications/admin.py
@@ -3,7 +3,8 @@
 from publications.models import Publication, Source, HarvestingEvent
 from import_export.admin import ImportExportModelAdmin
 from publications.tasks import harvest_oai_endpoint
-
+from django_q.models import Schedule
+from django.utils.timezone import now
 
 @admin.action(description="Mark selected publications as published")
 def make_public(modeladmin, request, queryset):
@@ -24,6 +25,23 @@ def trigger_harvesting_for_all(modeladmin, request, queryset):
     for source in all_sources:
         harvest_oai_endpoint(source.id)
 
+@admin.action(description="Schedule harvesting for selected sources")
+def schedule_harvesting(modeladmin, request, queryset):
+    """Admin action to manually schedule harvesting via Django-Q."""
+    for source in queryset:
+        existing_schedule = Schedule.objects.filter(name=f"Manual Harvest Source {source.id}")
+        if existing_schedule.exists():
+            modeladmin.message_user(request, f"Harvesting is already scheduled for Source {source.id}. Skipping.")
+            continue  # Skip if already scheduled
+
+        Schedule.objects.create(
+            func='publications.tasks.harvest_oai_endpoint',
+            args=str(source.id),
+            schedule_type=Schedule.ONCE,
+            next_run=now(),
+            name=f"Manual Harvest Source {source.id}",
+        )
+    modeladmin.message_user(request, f"Harvesting scheduled for {queryset.count()} sources!")
 
 @admin.register(Publication)
 class PublicationAdmin(LeafletGeoAdmin, ImportExportModelAdmin):
@@ -38,10 +56,11 @@ class SourceAdmin(admin.ModelAdmin):
     list_display = ("id", "url_field", "harvest_interval_minutes", "last_harvest")
     list_filter = ("harvest_interval_minutes",)
     search_fields = ("url_field",)
-    actions = [trigger_harvesting_for_specific,trigger_harvesting_for_all]
+    actions = [trigger_harvesting_for_specific,trigger_harvesting_for_all, schedule_harvesting]
 
 @admin.register(HarvestingEvent)
 class HarvestingEventAdmin(admin.ModelAdmin):
     list_display = ("id", "source", "status", "started_at", "completed_at")
     list_filter = ("status", "started_at", "completed_at")
     search_fields = ("source__url",)
+
diff --git a/publications/tasks.py b/publications/tasks.py
index c1ffe9b..49136ab 100644
--- a/publications/tasks.py
+++ b/publications/tasks.py
@@ -120,8 +120,8 @@ def harvest_oai_endpoint(source_id):
     source = Source.objects.get(id=source_id)
     event = HarvestingEvent.objects.create(source=source, status="in_progress")
 
-    username = os.getenv("OAI_USERNAME")
-    password = os.getenv("OAI_PASSWORD")
+    username = os.getenv("OPTIMAP_OAI_USERNAME")
+    password = os.getenv("OPTIMAP_OAI_PASSWORD")
 
     try:
         with requests.Session() as session:
diff --git a/tests/test_harvesting.py b/tests/test_harvesting.py
index 5cfc20f..a9549c5 100644
--- a/tests/test_harvesting.py
+++ b/tests/test_harvesting.py
@@ -1,12 +1,14 @@
 import os
+import django
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'optimap.settings')
+django.setup()
 from django.test import Client, TestCase
 from publications.tasks import parse_oai_xml_and_save_publications
 from publications.models import Publication, Source, Schedule
+from django_q.tasks import async_task
 import httpretty
 import time
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'optimap.settings')
-
 class SimpleTest(TestCase):
 
     def setUp(self):
@@ -76,13 +78,28 @@ def test_api_publication_2(self):
 
     def test_task_scheduling(self):
         oai_file_path = os.path.join(os.getcwd(), "tests", "harvesting", "journal_1", "oai_dc.xml")
         source = Source.objects.create(
-            url_field=f"file://{oai_file_path}",
+            url_field=f"file://{oai_file_path}",
             harvest_interval_minutes=60
         )
         source.save()
         time.sleep(2)
         schedule = Schedule.objects.filter(name=f"Harvest Source {source.id}")
-        self.assertTrue(schedule.exists(), "Django-Q task not scheduled for source.")
+        self.assertTrue(schedule.exists(), "❌ Django-Q task not scheduled for source.")
+
+        publications_count = Publication.objects.count()
+        async_task("publications.tasks.harvest_oai_endpoint", source.id)
+        time.sleep(5)
+
+        self.assertGreater(publications_count, 0, " No publications were harvested.")
+
+        with open(oai_file_path, "r") as oai:
+            content = oai.read()
+            parse_oai_xml_and_save_publications(content, event=None)
+            parse_oai_xml_and_save_publications(content, event=None)
+
+        final_count = Publication.objects.count()
+        self.assertEqual(final_count, publications_count, " Duplicate publications were created!")
+
 
     def test_no_duplicates(self):
         Publication.objects.all().delete()
@@ -97,4 +114,3 @@ def test_no_duplicates(self):
 
         publications_count = Publication.objects.count()
         self.assertEqual(publications_count, 2, "Duplicate publications were created!")
-
From f6cfc23ba0ec707162680d442e678407e0a00b6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20N=C3=BCst?=
Date: Wed, 19 Mar 2025 20:55:12 +0100
Subject: [PATCH 4/6] fix tests

---
 publications/admin.py                         |  2 +-
 publications/migrations/0001_initial.py       | 59 +++++++++++--------
 ...ingevent_alter_publication_url_and_more.py | 49 ---------------
 .../migrations/0005_alter_publication_doi.py  | 18 ------
 .../migrations/0006_alter_publication_doi.py  | 18 ------
 publications/models.py                        |  4 +-
 requirements.txt                              |  3 +-
 tests/test_harvesting.py                      | 29 ++++-----
 8 files changed, 53 insertions(+), 129 deletions(-)
 delete mode 100644 publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py
 delete mode 100644 publications/migrations/0005_alter_publication_doi.py
 delete mode 100644 publications/migrations/0006_alter_publication_doi.py

diff --git a/publications/admin.py b/publications/admin.py
index 36b4e95..1af5045 100644
--- a/publications/admin.py
+++ b/publications/admin.py
@@ -1,6 +1,6 @@
 from django.contrib import admin, messages
 from leaflet.admin import LeafletGeoAdmin
-from publications.models import Publication, BlockedEmail, BlockedDomain
+from publications.models import Publication, Source, HarvestingEvent, BlockedEmail, BlockedDomain
 from import_export.admin import ImportExportModelAdmin
 from publications.tasks import harvest_oai_endpoint
 from django_q.models import Schedule
diff --git a/publications/migrations/0001_initial.py b/publications/migrations/0001_initial.py
index 437d474..255047b 100644
--- a/publications/migrations/0001_initial.py
+++ b/publications/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 5.1.7 on 2025-03-19 14:42
+# Generated by Django 5.1.7 on 2025-03-19 19:34
 
 import django.contrib.auth.models
 import django.contrib.auth.validators
@@ -97,6 +97,38 @@ class Migration(migrations.Migration):
                 ('sent_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
             ],
         ),
+        migrations.CreateModel(
+            name='Source',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('creationDate', models.DateTimeField(auto_now_add=True)),
+                ('lastUpdate', models.DateTimeField(auto_now=True)),
+                ('url_field', models.URLField(max_length=999)),
+                ('harvest_interval_minutes', models.IntegerField(default=4320)),
+                ('last_harvest', models.DateTimeField(auto_now_add=True, null=True)),
+                ('created_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='%(app_label)s_%(class)s_creator', to=settings.AUTH_USER_MODEL, verbose_name='Created by')),
+                ('updated_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, on_update=True, related_name='%(app_label)s_%(class)s_updater', to=settings.AUTH_USER_MODEL, verbose_name='Updated by')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='HarvestingEvent',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('started_at', models.DateTimeField(auto_now_add=True)),
+                ('completed_at', models.DateTimeField(blank=True, null=True)),
+                ('status', models.CharField(choices=[('pending', 'Pending'), ('in_progress', 'In Progress'), ('completed', 'Completed'), ('failed', 'Failed')], default='pending', max_length=16)),
+                ('user', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
+                ('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='harvesting_events', to='publications.source')),
+            ],
+        ),
+        migrations.CreateModel(
+            name='UserProfile',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('notify_new_manuscripts', models.BooleanField(default=False)),
+                ('user', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+            ],
+        ),
         migrations.CreateModel(
             name='Publication',
             fields=[
@@ -110,36 +142,17 @@ class Migration(migrations.Migration):
                 ('provenance', models.TextField(blank=True, null=True)),
                 ('publicationDate', models.DateField(blank=True, null=True)),
                 ('abstract', models.TextField(blank=True, null=True)),
-                ('url', models.URLField(blank=True, max_length=1024, null=True)),
+                ('url', models.URLField(blank=True, max_length=1024, null=True, unique=True)),
                 ('geometry', django.contrib.gis.db.models.fields.GeometryCollectionField(blank=True, null=True, srid=4326, verbose_name='Publication geometry/ies')),
                 ('timeperiod_startdate', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=1024, null=True), blank=True, null=True, size=None)),
                 ('timeperiod_enddate', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=1024, null=True), blank=True, null=True, size=None)),
                 ('created_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='%(app_label)s_%(class)s_creator', to=settings.AUTH_USER_MODEL, verbose_name='Created by')),
+                ('job', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='publications', to='publications.harvestingevent')),
                 ('updated_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, on_update=True, related_name='%(app_label)s_%(class)s_updater', to=settings.AUTH_USER_MODEL, verbose_name='Updated by')),
             ],
             options={
                 'ordering': ['-id'],
+                'constraints': [models.UniqueConstraint(fields=('doi', 'url'), name='unique_publication_entry')],
             },
         ),
-        migrations.CreateModel(
-            name='Source',
-            fields=[
-                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('creationDate', models.DateTimeField(auto_now_add=True)),
-                ('lastUpdate', models.DateTimeField(auto_now=True)),
-                ('url_field', models.URLField(max_length=999)),
-                ('harvest_interval_minutes', models.IntegerField(default=4320)),
-                ('last_harvest', models.DateTimeField(auto_now_add=True, null=True)),
-                ('created_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='%(app_label)s_%(class)s_creator', to=settings.AUTH_USER_MODEL, verbose_name='Created by')),
-                ('updated_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, on_update=True, related_name='%(app_label)s_%(class)s_updater', to=settings.AUTH_USER_MODEL, verbose_name='Updated by')),
-            ],
-        ),
-        migrations.CreateModel(
-            name='UserProfile',
-            fields=[
-                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('notify_new_manuscripts', models.BooleanField(default=False)),
-                ('user', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
-            ],
-        ),
     ]
diff --git a/publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py b/publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py
deleted file mode 100644
index 1cb1d9e..0000000
--- a/publications/migrations/0004_harvestingevent_alter_publication_url_and_more.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Generated by Django 4.0.5 on 2025-01-19 22:04
-
-from django.conf import settings
-from django.db import migrations, models
-import django.db.models.deletion
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
-        ('publications', '0003_alter_publication_timeperiod_enddate_and_more'),
-    ]
-
-    operations = [
-        migrations.CreateModel(
-            name='HarvestingEvent',
-            fields=[
-                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('started_at', models.DateTimeField(auto_now_add=True)),
-                ('completed_at', models.DateTimeField(blank=True, null=True)),
-                ('status', models.CharField(choices=[('pending', 'Pending'), ('in_progress', 'In Progress'), ('completed', 'Completed'), ('failed', 'Failed')], default='pending', max_length=16)),
-            ],
-        ),
-        migrations.AlterField(
-            model_name='publication',
-            name='url',
-            field=models.URLField(blank=True, max_length=1024, null=True, unique=True),
-        ),
-        migrations.AddConstraint(
-            model_name='publication',
-            constraint=models.UniqueConstraint(fields=('doi', 'url'), name='unique_publication_entry'),
-        ),
-        migrations.AddField(
-            model_name='harvestingevent',
-            name='source',
-            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='harvesting_events', to='publications.source'),
-        ),
-        migrations.AddField(
-            model_name='harvestingevent',
-            name='user',
-            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL),
-        ),
-        migrations.AddField(
-            model_name='publication',
-            name='job',
-            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='publications', to='publications.harvestingevent'),
-        ),
-    ]
diff --git a/publications/migrations/0005_alter_publication_doi.py b/publications/migrations/0005_alter_publication_doi.py
deleted file mode 100644
index 4417305..0000000
--- a/publications/migrations/0005_alter_publication_doi.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Generated by Django 4.0.5 on 2025-02-13 12:45
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('publications', '0004_harvestingevent_alter_publication_url_and_more'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='publication',
-            name='doi',
-            field=models.CharField(blank=True, max_length=1024, unique=True),
-        ),
-    ]
diff --git a/publications/migrations/0006_alter_publication_doi.py b/publications/migrations/0006_alter_publication_doi.py
deleted file mode 100644
index 73afc2d..0000000
--- a/publications/migrations/0006_alter_publication_doi.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Generated by Django 4.0.5 on 2025-02-13 13:39
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ('publications', '0005_alter_publication_doi'),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name='publication',
-            name='doi',
-            field=models.CharField(blank=True, max_length=1024, null=True, unique=True),
-        ),
-    ]
diff --git a/publications/models.py b/publications/models.py
index e6c054c..8c8db7c 100644
--- a/publications/models.py
+++ b/publications/models.py
@@ -39,8 +39,8 @@ def restore(self):
         logger.info(f"User {self.username} (ID: {self.id}) was restored.")
 
 class Publication(models.Model):
-    # required fields
-    doi = models.CharField(max_length=1024, unique=True, null=True, blank=True)  # ✅ Allow NULL DOIs
+    # required fields
+    title = models.TextField()
     status = models.CharField(max_length=1, choices=STATUS_CHOICES, default="d")
     created_by = CurrentUserField( # see useful hint at https://github.com/zsoldosp/django-currentuser/issues/69
         verbose_name=("Created by"),
diff --git a/requirements.txt b/requirements.txt
index 4d3c477..944d40d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,5 +35,4 @@ wcwidth==0.2.13
 whitenoise==6.8.2
 psycopg2-binary==2.9.10
 packaging==21.3
-pycryptodome==3.21.0
-httpretty==1.1.4
\ No newline at end of file
+pycryptodome==3.21.0
\ No newline at end of file
diff --git a/tests/test_harvesting.py b/tests/test_harvesting.py
index b87f822..f925f95 100644
--- a/tests/test_harvesting.py
+++ b/tests/test_harvesting.py
@@ -6,7 +6,7 @@
 from publications.tasks import parse_oai_xml_and_save_publications
 from publications.models import Publication, Source, Schedule
 from django_q.tasks import async_task
-import httpretty
+import responses
 import time
 
 class SimpleTest(TestCase):
@@ -31,6 +31,9 @@ def setUpClass(cls):
 
         parse_oai_xml_and_save_publications(oai.read(), event=None)
 
+        # set status to published
+        Publication.objects.all().update(status="p")
+
     @classmethod
     def tearDownClass(cls):
         Publication.objects.all().delete()
@@ -44,9 +47,6 @@ def test_api_root(self):
 
         self.assertEqual(results['type'], 'FeatureCollection')
         self.assertEqual(len(results['features']), 2)
 
-        self.assertEqual(len(results['features'][0]['properties']), 9)
-        self.assertEqual(results['features'][0]['properties']['title'], 'Test 1: One')
-        self.assertEqual(results['features'][0]['properties']['publicationDate'], '2022-07-01')
 
     def test_api_publication_1(self):
         response = self.client.get('/api/v1/publications/%s.json' % self.id1)
@@ -71,6 +71,7 @@ def test_api_publication_2(self):
         self.assertEqual(body['type'], 'Feature')
         self.assertEqual(body['geometry']['type'], 'GeometryCollection')
         self.assertEqual(body['geometry']['geometries'][0]['type'], 'Polygon')
+        self.assertEqual(body['properties']['title'], 'Test 2: Two')
         self.assertIsNone(body['properties']['doi'])
         self.assertEqual(body['properties']['timeperiod_enddate'],['2022-03-31'])
         self.assertEqual(body['properties']['url'],'http://localhost:8330/index.php/opti-geo/article/view/2')
@@ -100,17 +101,13 @@ def test_task_scheduling(self):
         final_count = Publication.objects.count()
         self.assertEqual(final_count, publications_count, " Duplicate publications were created!")
 
-
-    def test_no_duplicates(self):
-        Publication.objects.all().delete()
-        oai_file_path = os.path.join(os.getcwd(), "tests", "harvesting", "journal_1", "oai_dc.xml")
-        print(Publication.objects.count())
-
-        with open(oai_file_path, "r") as oai:
-            content = oai.read()
-
-        parse_oai_xml_and_save_publications(content, event=None)
-        parse_oai_xml_and_save_publications(content, event=None)
-
+    def test_no_duplicates(self):
         publications_count = Publication.objects.count()
         self.assertEqual(publications_count, 2, "Duplicate publications were created!")
+
+        response = self.client.get('/api/v1/publications/')
+        results = response.json()['results']
+
+        titles = [pub['properties']['title'] for pub in results['features']]
+        unique_titles = list(set(titles))
+        self.assertEqual(len(titles), len(unique_titles))
From f58fd399779f9734412c34bf86dd5f7de3a4320f Mon Sep 17 00:00:00 2001
From: uxairibrar
Date: Fri, 4 Apr 2025 15:29:05 +0200
Subject: [PATCH 5/6] Updated Test cases

---
 publications/tasks.py                 | 13 +++++++++++--
 tests/harvesting/journal_1/oai_dc.xml |  4 +++-
 tests/test_harvesting.py              | 20 ++++++++++++++++----
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/publications/tasks.py b/publications/tasks.py
index 32cd3b0..9b6b4d8 100644
--- a/publications/tasks.py
+++ b/publications/tasks.py
@@ -22,7 +22,7 @@
 from django_q.models import Schedule
 import time
 import calendar
-
+import re
 
 def extract_geometry_from_html(content):
     for tag in content.find_all("meta"):
@@ -56,6 +56,8 @@ def extract_timeperiod_from_html(content):
     # returning arrays for array field in DB
     return [period[0]], [period[1]]
 
+DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
+
 def parse_oai_xml_and_save_publications(content, event):
 
     DOMTree = xml.dom.minidom.parseString(content)
@@ -82,8 +84,15 @@ def get_text(tag_name):
             journal_value = get_text("dc:publisher")
             date_value = get_text("dc:date")
 
+            doi_text = None
             doi_nodes = record.getElementsByTagName("dc:identifier")
-            doi_text = next((node.firstChild.nodeValue.strip() for node in doi_nodes if "doi.org" in node.firstChild.nodeValue), None)
+            for node in doi_nodes:
+                if node.firstChild and node.firstChild.nodeValue:
+                    candidate = node.firstChild.nodeValue.strip()
+                    match = DOI_REGEX.search(candidate)
+                    if match:
+                        doi_text = match.group(0)
+                        break
 
             if not identifier_value or not identifier_value.startswith("http"):
                 logger.warning("Skipping record with invalid URL: %s", identifier_value)
diff --git a/tests/harvesting/journal_1/oai_dc.xml b/tests/harvesting/journal_1/oai_dc.xml
index eddae96..fe5732a 100644
--- a/tests/harvesting/journal_1/oai_dc.xml
+++ b/tests/harvesting/journal_1/oai_dc.xml
@@ -26,6 +26,7 @@
 info:eu-repo/semantics/publishedVersion
 Begutachter Beitrag
 http://localhost:8330/index.php/opti-geo/article/view/1
+10.1234/abcd.efgh
 Journal of Optimal Geolocations; 2022
 Earth, Asia, Republic of Turkey
 Copyright (c) 2022 Journal of Optimal Geolocations
@@ -51,7 +52,8 @@
 info:eu-repo/semantics/article
 info:eu-repo/semantics/publishedVersion
 Begutachter Beitrag
-http://localhost:8330/index.php/opti-geo/article/view/2
+http://localhost:8330/index.php/opti-geo/article/view/1
+10.1234/abcd.efgh
 Journal of Optimal Geolocations; 2022
 Earth, Europe
 Copyright (c) 2022 Journal of Optimal Geolocations
diff --git a/tests/test_harvesting.py b/tests/test_harvesting.py
index f925f95..393e8be 100644
--- a/tests/test_harvesting.py
+++ b/tests/test_harvesting.py
@@ -15,8 +15,15 @@ def setUp(self):
         self.client = Client()
 
         results = self.client.get('/api/v1/publications/').json()['results']
-        self.id1 = results['features'][1]['id'] # newest first
-        self.id2 = results['features'][0]['id']
+        features = results.get('features', [])
+
+        if len(features) >= 2:
+            self.id1 = features[1]['id']
+            self.id2 = features[0]['id']
+        elif len(features) == 1:
+            self.id1 = self.id2 = features[0]['id']
+        else:
+            self.id1 = self.id2 = None
 
     @classmethod
     @responses.activate
     def setUpClass(cls):
@@ -85,7 +92,7 @@ def test_task_scheduling(self):
         source.save()
         time.sleep(2)
         schedule = Schedule.objects.filter(name=f"Harvest Source {source.id}")
-        self.assertTrue(schedule.exists(), "❌ Django-Q task not scheduled for source.")
+        self.assertTrue(schedule.exists(), "Django-Q task not scheduled for source.")
 
         publications_count = Publication.objects.count()
         async_task("publications.tasks.harvest_oai_endpoint", source.id)
@@ -99,7 +106,12 @@ def test_task_scheduling(self):
             parse_oai_xml_and_save_publications(content, event=None)
 
         final_count = Publication.objects.count()
-        self.assertEqual(final_count, publications_count, " Duplicate publications were created!")
+        self.assertEqual(final_count, publications_count, "Duplicate publications were created!")
+
+        latest_pub = Publication.objects.latest('id')
+        self.assertIsNotNone(latest_pub.doi)
+        self.assertTrue(latest_pub.doi.startswith("10."), "DOI not correctly extracted using regex")
+
 
     def test_no_duplicates(self):
From 7a6020a6e93ae6c41ea2b713b4d9050bcc1a2125 Mon Sep 17 00:00:00 2001
From: uxairibrar
Date: Tue, 8 Apr 2025 00:28:40 +0200
Subject: [PATCH 6/6] Updated DOI Issues and the test cases

---
 publications/models.py                |  4 ++--
 publications/tasks.py                 |  5 ++---
 tests/harvesting/journal_1/oai_dc.xml |  3 +--
 tests/test_harvesting.py              | 16 +++++++++-------
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/publications/models.py b/publications/models.py
index 8c8db7c..85ee9c4 100644
--- a/publications/models.py
+++ b/publications/models.py
@@ -90,7 +90,7 @@ class Meta:
 
     def __str__(self):
         """Return string representation."""
-        return self.doi
+        return self.title
 
 class Source(models.Model):
     # automatic fields
@@ -214,7 +214,7 @@ class HarvestingEvent(models.Model):
     )
 
     def __str__(self):
-        return f"Harvesting Event ({self.status}) for {self.source.url} at {self.started_at}"
+        return f"Harvesting Event ({self.status}) for {self.source.url_field} at {self.started_at}"
 
 
 class UserProfile(models.Model):
diff --git a/publications/tasks.py b/publications/tasks.py
index 9b6b4d8..8320347 100644
--- a/publications/tasks.py
+++ b/publications/tasks.py
@@ -69,9 +69,8 @@ def parse_oai_xml_and_save_publications(content, event):
         logger.warning("No articles found in OAI-PMH response!")
         return
 
-    existing_urls = set(Publication.objects.values_list('url', flat=True))  # ✅ Cache existing URLs
-    existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))  # ✅ Cache existing DOIs
-
+    existing_urls = set(Publication.objects.values_list('url', flat=True))
+    existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))
     for record in records:
         try:
             def get_text(tag_name):
diff --git a/tests/harvesting/journal_1/oai_dc.xml b/tests/harvesting/journal_1/oai_dc.xml
index fe5732a..478b11e 100644
--- a/tests/harvesting/journal_1/oai_dc.xml
+++ b/tests/harvesting/journal_1/oai_dc.xml
@@ -52,8 +52,7 @@
 info:eu-repo/semantics/article
 info:eu-repo/semantics/publishedVersion
 Begutachter Beitrag
-http://localhost:8330/index.php/opti-geo/article/view/1
-10.1234/abcd.efgh
+http://localhost:8330/index.php/opti-geo/article/view/2
 Journal of Optimal Geolocations; 2022
 Earth, Europe
 Copyright (c) 2022 Journal of Optimal Geolocations
diff --git a/tests/test_harvesting.py b/tests/test_harvesting.py
index 393e8be..6f26b2f 100644
--- a/tests/test_harvesting.py
+++ b/tests/test_harvesting.py
@@ -94,11 +94,11 @@ def test_task_scheduling(self):
         schedule = Schedule.objects.filter(name=f"Harvest Source {source.id}")
         self.assertTrue(schedule.exists(), "Django-Q task not scheduled for source.")
 
-        publications_count = Publication.objects.count()
-        async_task("publications.tasks.harvest_oai_endpoint", source.id)
-        time.sleep(5)
+        from publications.tasks import harvest_oai_endpoint
+        harvest_oai_endpoint(source.id)
 
-        self.assertGreater(publications_count, 0, " No publications were harvested.")
+        publications_count = Publication.objects.count()
+        self.assertGreater(publications_count, 0, "No publications were harvested.")
 
         with open(oai_file_path, "r") as oai:
             content = oai.read()
@@ -108,9 +108,11 @@ def test_task_scheduling(self):
         final_count = Publication.objects.count()
         self.assertEqual(final_count, publications_count, "Duplicate publications were created!")
 
-        latest_pub = Publication.objects.latest('id')
-        self.assertIsNotNone(latest_pub.doi)
-        self.assertTrue(latest_pub.doi.startswith("10."), "DOI not correctly extracted using regex")
+        publications_with_doi = Publication.objects.exclude(doi__isnull=True)
+
+        self.assertTrue(publications_with_doi.exists(), "No publication with DOI found.")
+        for pub in publications_with_doi:
+            self.assertTrue(pub.doi.startswith("10."), f"DOI '{pub.doi}' is not correctly formatted.")
 
 
     def test_no_duplicates(self):