@@ -28,6 +28,7 @@
 import time
 import calendar
 import re
+from django.contrib.gis.geos import GeometryCollection
 
 BASE_URL = settings.BASE_URL
 
@@ -66,105 +67,148 @@ def extract_timeperiod_from_html(content):
 DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
 
 def parse_oai_xml_and_save_publications(content, event):
+    try:
+        DOMTree = xml.dom.minidom.parseString(content)
+    except Exception as e:
+        logger.error("Error parsing XML: %s", e)
+        return
 
-    DOMTree = xml.dom.minidom.parseString(content)
     collection = DOMTree.documentElement
-
     records = collection.getElementsByTagName("record")
-
     if not records:
         logger.warning("No articles found in OAI-PMH response!")
         return
-
-    existing_urls = set(Publication.objects.values_list('url', flat=True))
-    existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))
     for record in records:
         try:
             def get_text(tag_name):
                 nodes = record.getElementsByTagName(tag_name)
                 return nodes[0].firstChild.nodeValue.strip() if nodes and nodes[0].firstChild else None
 
-            identifier_value = get_text("dc:identifier")
+            # collect all dc:identifier values
+            id_nodes = record.getElementsByTagName("dc:identifier")
+            identifiers = [
+                n.firstChild.nodeValue.strip()
+                for n in id_nodes
+                if n.firstChild and n.firstChild.nodeValue
+            ]
+            http_urls = [u for u in identifiers if u.lower().startswith("http")]
+            view_urls = [u for u in http_urls if "/view/" in u]
+            identifier_value = (view_urls or http_urls or [None])[0]
+
             title_value = get_text("dc:title")
             abstract_text = get_text("dc:description")
             journal_value = get_text("dc:publisher")
             date_value = get_text("dc:date")
 
             doi_text = None
-            doi_nodes = record.getElementsByTagName("dc:identifier")
-            for node in doi_nodes:
-                if node.firstChild and node.firstChild.nodeValue:
-                    candidate = node.firstChild.nodeValue.strip()
-                    match = DOI_REGEX.search(candidate)
-                    if match:
-                        doi_text = match.group(0)
-                        break
+            for ident in identifiers:
+                if match := DOI_REGEX.search(ident):
+                    doi_text = match.group(0)
+                    break
 
-            if not identifier_value or not identifier_value.startswith("http"):
-                logger.warning("Skipping record with invalid URL: %s", identifier_value)
-                continue
-
-            if doi_text and doi_text in existing_dois:
+            if doi_text and Publication.objects.filter(doi=doi_text).exists():
                 logger.info("Skipping duplicate publication (DOI): %s", doi_text)
                 continue
-
-            if identifier_value in existing_urls:
+            if identifier_value and Publication.objects.filter(url=identifier_value).exists():
                 logger.info("Skipping duplicate publication (URL): %s", identifier_value)
                 continue
+            # Skip records without a valid URL.
+            if not identifier_value or not identifier_value.startswith("http"):
+                logger.warning("Skipping record with invalid URL: %s", identifier_value)
+                continue
 
-            existing_urls.add(identifier_value)
-            if doi_text:
-                existing_dois.add(doi_text)
-
-            geom_object = None
-            period_start = []
-            period_end = []
-            with requests.get(identifier_value) as response:
-                soup = BeautifulSoup(response.content, "html.parser")
-                geom_object = extract_geometry_from_html(soup)
-                period_start, period_end = extract_timeperiod_from_html(soup)
+            geom_object = GeometryCollection()
+            period_start, period_end = [], []
+            try:
+                resp = requests.get(identifier_value, timeout=10)
+                resp.raise_for_status()
+                soup = BeautifulSoup(resp.content, "html.parser")
+
+                try:
+                    geom = extract_geometry_from_html(soup)
+                    geom_object = geom or GeometryCollection()
+                except Exception as geo_err:
+                    logger.error("Geometry extraction failed for URL %s: %s", identifier_value, geo_err)
+                    geom_object = GeometryCollection()
+
+                try:
+                    start_time, end_time = extract_timeperiod_from_html(soup)
+                    if isinstance(start_time, list):
+                        period_start = [d for d in start_time if d]
+                    if isinstance(end_time, list):
+                        period_end = [d for d in end_time if d]
+                except Exception as time_err:
+                    logger.error("Time period extraction failed for URL %s: %s", identifier_value, time_err)
+
+            except Exception as fetch_err:
+                logger.error("Error fetching HTML for %s: %s", identifier_value, fetch_err)
+                geom_object = GeometryCollection()
+                period_start, period_end = [], []
 
             publication = Publication(
                 title=title_value,
                 abstract=abstract_text,
                 publicationDate=date_value,
                 url=identifier_value,
-                doi=doi_text if doi_text else None,
+                doi=doi_text,
                 source=journal_value,
                 geometry=geom_object,
                 timeperiod_startdate=period_start,
-                timeperiod_enddate=period_end
+                timeperiod_enddate=period_end,
+                job=event
             )
             publication.save()
-            print("Saved new publication: %s", identifier_value)
 
         except Exception as e:
-            print("Error parsing record: %s", str(e))
+            logger.error("Error parsing record: %s", e)
             continue
 
-def harvest_oai_endpoint(source_id):
+def harvest_oai_endpoint(source_id, user=None):
     source = Source.objects.get(id=source_id)
     event = HarvestingEvent.objects.create(source=source, status="in_progress")
 
-    username = os.getenv("OPTIMAP_OAI_USERNAME")
-    password = os.getenv("OPTIMAP_OAI_PASSWORD")
-
     try:
-        with requests.Session() as session:
-            response = session.get(source.url_field, auth=HTTPBasicAuth(username, password))
-            response.raise_for_status()
-            parse_oai_xml_and_save_publications(response.content, event)
-
-        event.status = "completed"
-        event.completed_at = timezone.now()
-        event.save()
-        print("Harvesting completed for", source.url_field)
-
-    except requests.exceptions.RequestException as e:
-        print("Error harvesting from", source.url_field, ":", e)
+        response = requests.get(source.url_field)
+        response.raise_for_status()
+
+        parse_oai_xml_and_save_publications(response.content, event)
+
+        event.status = "completed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        new_count = Publication.objects.filter(job=event).count()
+        spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count()
+        temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count()
+
+        subject = f"Harvesting Completed for {source.collection_name}"
+        completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S') if event.completed_at else 'N/A'
+        message = (
+            f"Harvesting job details:\n\n"
+            f"Number of added articles: {new_count}\n"
+            f"Number of articles with spatial metadata: {spatial_count}\n"
+            f"Number of articles with temporal metadata: {temporal_count}\n"
+            f"Collection used: {source.collection_name or 'N/A'}\n"
+            f"Journal: {source.url_field}\n"
+            f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Job completed at: {completed_str}\n"
+        )
+
+        if user and user.email:
+            send_mail(
+                subject,
+                message,
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=False,
+            )
+
+    except Exception as e:
+        logger.error("Harvesting failed for source %s: %s", source.url_field, str(e))
         event.status = "failed"
-        event.log = str(e)
+        event.completed_at = timezone.now()
         event.save()
+
 def send_monthly_email(trigger_source='manual', sent_by=None):
     recipients = User.objects.filter(userprofile__notify_new_manuscripts=True).values_list('email', flat=True)
     last_month = now().replace(day=1) - timedelta(days=1)
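To try the new harvest_oai_endpoint(source_id, user=None) signature by hand, a small management command along these lines would work; the module path publications.tasks, the command name, and the argument names are assumptions for illustration, not part of this change:

# management/commands/harvest_source.py (hypothetical location)
from django.contrib.auth import get_user_model
from django.core.management.base import BaseCommand

from publications.tasks import harvest_oai_endpoint  # assumed module path


class Command(BaseCommand):
    help = "Run a one-off OAI-PMH harvest for a single source."

    def add_arguments(self, parser):
        parser.add_argument("source_id", type=int)
        parser.add_argument("--notify-user-id", type=int, default=None)

    def handle(self, *args, **options):
        user = None
        if options["notify_user_id"] is not None:
            # Passing a user makes harvest_oai_endpoint send the completion email.
            user = get_user_model().objects.get(id=options["notify_user_id"])
        harvest_oai_endpoint(options["source_id"], user=user)

Placed in an app's management/commands directory, this could be run as "python manage.py harvest_source 1 --notify-user-id 2".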
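A regression test sketch for the reworked identifier handling, feeding one record with a /view/ URL and a DOI among its dc:identifier values; model and field names come from the diff above, while the import paths, the sample XML, and the date format accepted by publicationDate are assumptions:

from unittest.mock import patch

from django.test import TestCase

from publications.models import HarvestingEvent, Publication, Source  # assumed module path
from publications.tasks import parse_oai_xml_and_save_publications  # assumed module path

# One OAI-PMH record with two dc:identifier values: a landing-page URL and a DOI.
OAI_SAMPLE = b"""<?xml version="1.0"?>
<OAI-PMH>
  <ListRecords>
    <record>
      <metadata>
        <dc:title>Example article</dc:title>
        <dc:identifier>https://journal.example.org/article/view/42</dc:identifier>
        <dc:identifier>10.1234/example.42</dc:identifier>
        <dc:date>2024-01-01</dc:date>
      </metadata>
    </record>
  </ListRecords>
</OAI-PMH>"""


class ParseOaiRecordTest(TestCase):
    def test_view_url_and_doi_are_extracted(self):
        source = Source.objects.create(url_field="https://journal.example.org/oai")
        event = HarvestingEvent.objects.create(source=source, status="in_progress")
        # Keep the landing-page fetch offline; the parser then falls back to an
        # empty GeometryCollection and empty time periods.
        with patch("publications.tasks.requests.get", side_effect=Exception("offline")):
            parse_oai_xml_and_save_publications(OAI_SAMPLE, event)
        pub = Publication.objects.get(job=event)
        self.assertEqual(pub.url, "https://journal.example.org/article/view/42")
        self.assertEqual(pub.doi, "10.1234/example.42")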