
Commit 2bc15e7

Merge pull request #135 from GeoinformationSystems/features/provide_regular_metadata_23
2 parents e7a250f + 25de603

File tree

7 files changed: +261 −65 lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions

@@ -62,6 +62,8 @@ jobs:
         python -Wa manage.py test tests

   uitests:
+    # disable until UI tests are fixed, see https://github.com/GeoinformationSystems/optimap/issues/142
+    if: false
     runs-on: ubuntu-24.04

     strategy:

publications/admin.py

Lines changed: 19 additions & 10 deletions

@@ -18,14 +18,16 @@ def make_draft(modeladmin, request, queryset):

 @admin.action(description="Trigger harvesting for selected sources")
 def trigger_harvesting_for_specific(modeladmin, request, queryset):
+    user = request.user
     for source in queryset:
-        harvest_oai_endpoint(source.id)
+        harvest_oai_endpoint(source.id, user)

 @admin.action(description="Trigger harvesting for all sources")
 def trigger_harvesting_for_all(modeladmin, request, queryset):
     all_sources = Source.objects.all()
+    user = request.user
     for source in all_sources:
-        harvest_oai_endpoint(source.id)
+        harvest_oai_endpoint(source.id, user)

 @admin.action(description="Schedule harvesting for selected sources")
 def schedule_harvesting(modeladmin, request, queryset):

@@ -113,17 +115,24 @@ def block_email_and_domain(modeladmin, request, queryset):
 @admin.register(Publication)
 class PublicationAdmin(LeafletGeoAdmin, ImportExportModelAdmin):
     """Publication Admin."""
-
-    list_display = ("doi", "creationDate", "lastUpdate", "created_by", "updated_by", "status", "provenance")
-
-    actions = [make_public,make_draft]
+    list_display = ("title", "doi", "creationDate", "lastUpdate", "created_by", "updated_by", "status", "provenance", "source")
+    search_fields = ("title", "doi", "abstract", "source")
+    list_filter = ("status", "creationDate")
+    actions = [make_public, make_draft]
+
+    fields = (
+        "title", "doi", "status", "source", "abstract",
+        "geometry", "timeperiod_startdate", "timeperiod_enddate",
+        "created_by", "updated_by", "provenance"
+    )
+    readonly_fields = ("created_by", "updated_by")

 @admin.register(Source)
 class SourceAdmin(admin.ModelAdmin):
-    list_display = ("id", "url_field", "harvest_interval_minutes", "last_harvest")
-    list_filter = ("harvest_interval_minutes",)
-    search_fields = ("url_field",)
-    actions = [trigger_harvesting_for_specific,trigger_harvesting_for_all, schedule_harvesting]
+    list_display = ("id", "url_field", "harvest_interval_minutes", "last_harvest", "collection_name", "tags")
+    list_filter = ("harvest_interval_minutes", "collection_name")
+    search_fields = ("url_field", "collection_name", "tags")
+    actions = [trigger_harvesting_for_specific, trigger_harvesting_for_all, schedule_harvesting]

 @admin.register(HarvestingEvent)
 class HarvestingEventAdmin(admin.ModelAdmin):
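
Both harvesting actions now forward request.user so the task can e-mail a completion report. A minimal sketch of exercising the new signature outside the admin UI, assuming Django's test utilities; admin_user is a hypothetical user object, not part of this commit:

from django.test import RequestFactory

from publications.admin import trigger_harvesting_for_specific
from publications.models import Source

# Build a fake admin request; the action only reads request.user.
request = RequestFactory().post("/admin/publications/source/")
request.user = admin_user  # hypothetical: any User with a valid .email
# The modeladmin argument is unused by the action body, so None suffices here.
trigger_harvesting_for_specific(None, request, Source.objects.all())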
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+# Generated by Django 5.1.7 on 2025-04-21 19:25
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("publications", "0001_initial"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="source",
+            name="collection_name",
+            field=models.CharField(
+                blank=True,
+                help_text="Identifier for a set or group of journals (e.g., 'Health Journals', 'TestBatch_Apr2025').",
+                max_length=255,
+                null=True,
+            ),
+        ),
+        migrations.AddField(
+            model_name="source",
+            name="tags",
+            field=models.CharField(
+                blank=True,
+                help_text="Comma-separated tags to provide additional context",
+                max_length=1024,
+                null=True,
+            ),
+        ),
+    ]
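
The migration mirrors the two CharField definitions added to Source below and can be applied programmatically as well as via manage.py. A sketch using Django's standard management API, assuming only a configured settings module:

from django.core.management import call_command

# Equivalent to `python manage.py migrate publications`.
call_command("migrate", "publications")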

publications/models.py

Lines changed: 13 additions & 0 deletions

@@ -114,6 +114,19 @@ class Source(models.Model):
     harvest_interval_minutes = models.IntegerField(default=60*24*3)
     last_harvest = models.DateTimeField(auto_now_add=True,null=True)

+    collection_name = models.CharField(
+        max_length=255,
+        blank=True,
+        null=True,
+        help_text="Identifier for a set or group of journals (e.g., 'Health Journals', 'TestBatch_Apr2025')."
+    )
+    tags = models.CharField(
+        max_length=1024,
+        blank=True,
+        null=True,
+        help_text="Comma-separated tags to provide additional context"
+    )
+
     def save(self, *args, **kwargs):
         super().save(*args, **kwargs)
         Schedule.objects.filter(name=f"Harvest Source {self.id}").delete()  # Avoid duplicates
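
tags is a flat comma-separated string rather than a relation, so consumers must split it themselves. A minimal sketch of a parsing helper, assuming this commit's storage format; the helper name is hypothetical:

def parse_tags(source):
    """Split Source.tags ("a, b, c") into a list of clean tag strings."""
    if not source.tags:
        return []
    return [tag.strip() for tag in source.tags.split(",") if tag.strip()]

# e.g. tags = "health, geospatial" -> ["health", "geospatial"]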

publications/tasks.py

Lines changed: 98 additions & 54 deletions

@@ -28,6 +28,7 @@
 import time
 import calendar
 import re
+from django.contrib.gis.geos import GeometryCollection

 BASE_URL = settings.BASE_URL

@@ -66,105 +67,148 @@ def extract_timeperiod_from_html(content):
 DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)

 def parse_oai_xml_and_save_publications(content, event):
+    try:
+        DOMTree = xml.dom.minidom.parseString(content)
+    except Exception as e:
+        logger.error("Error parsing XML: %s", e)
+        return

-    DOMTree = xml.dom.minidom.parseString(content)
     collection = DOMTree.documentElement
-
     records = collection.getElementsByTagName("record")
-
     if not records:
         logger.warning("No articles found in OAI-PMH response!")
         return
-
-    existing_urls = set(Publication.objects.values_list('url', flat=True))
-    existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))
     for record in records:
         try:
             def get_text(tag_name):
                 nodes = record.getElementsByTagName(tag_name)
                 return nodes[0].firstChild.nodeValue.strip() if nodes and nodes[0].firstChild else None

-            identifier_value = get_text("dc:identifier")
+            # collect all dc:identifier values
+            id_nodes = record.getElementsByTagName("dc:identifier")
+            identifiers = [
+                n.firstChild.nodeValue.strip()
+                for n in id_nodes
+                if n.firstChild and n.firstChild.nodeValue
+            ]
+            http_urls = [u for u in identifiers if u.lower().startswith("http")]
+            view_urls = [u for u in http_urls if "/view/" in u]
+            identifier_value = (view_urls or http_urls or [None])[0]
+
             title_value = get_text("dc:title")
             abstract_text = get_text("dc:description")
             journal_value = get_text("dc:publisher")
             date_value = get_text("dc:date")

             doi_text = None
-            doi_nodes = record.getElementsByTagName("dc:identifier")
-            for node in doi_nodes:
-                if node.firstChild and node.firstChild.nodeValue:
-                    candidate = node.firstChild.nodeValue.strip()
-                    match = DOI_REGEX.search(candidate)
-                    if match:
-                        doi_text = match.group(0)
-                        break
+            for ident in identifiers:
+                if match := DOI_REGEX.search(ident):
+                    doi_text = match.group(0)
+                    break

-            if not identifier_value or not identifier_value.startswith("http"):
-                logger.warning("Skipping record with invalid URL: %s", identifier_value)
-                continue
-
-            if doi_text and doi_text in existing_dois:
+            if doi_text and Publication.objects.filter(doi=doi_text).exists():
                 logger.info("Skipping duplicate publication (DOI): %s", doi_text)
                 continue
-
-            if identifier_value in existing_urls:
+            if identifier_value and Publication.objects.filter(url=identifier_value).exists():
                 logger.info("Skipping duplicate publication (URL): %s", identifier_value)
                 continue
+            # Skip records without a valid URL.
+            if not identifier_value or not identifier_value.startswith("http"):
+                logger.warning("Skipping record with invalid URL: %s", identifier_value)
+                continue

-            existing_urls.add(identifier_value)
-            if doi_text:
-                existing_dois.add(doi_text)
-
-            geom_object = None
-            period_start = []
-            period_end = []
-            with requests.get(identifier_value) as response:
-                soup = BeautifulSoup(response.content, "html.parser")
-                geom_object = extract_geometry_from_html(soup)
-                period_start, period_end = extract_timeperiod_from_html(soup)
+            geom_object = GeometryCollection()
+            period_start, period_end = [], []
+            try:
+                resp = requests.get(identifier_value, timeout=10)
+                resp.raise_for_status()
+                soup = BeautifulSoup(resp.content, "html.parser")
+
+                try:
+                    geom = extract_geometry_from_html(soup)
+                    geom_object = geom or GeometryCollection()
+                except Exception as geo_err:
+                    logger.error("Geometry extraction failed for URL %s: %s", identifier_value, geo_err)
+                    geom_object = GeometryCollection()
+
+                try:
+                    start_time, end_time = extract_timeperiod_from_html(soup)
+                    if isinstance(start_time, list):
+                        period_start = [d for d in start_time if d]
+                    if isinstance(end_time, list):
+                        period_end = [d for d in end_time if d]
+                except Exception as time_err:
+                    logger.error("Time period extraction failed for URL %s: %s", identifier_value, time_err)
+
+            except Exception as fetch_err:
+                logger.error("Error fetching HTML for %s: %s", identifier_value, fetch_err)
+                geom_object = GeometryCollection()
+                period_start, period_end = [], []

             publication = Publication(
                 title=title_value,
                 abstract=abstract_text,
                 publicationDate=date_value,
                 url=identifier_value,
-                doi=doi_text if doi_text else None,
+                doi=doi_text,
                 source=journal_value,
                 geometry=geom_object,
                 timeperiod_startdate=period_start,
-                timeperiod_enddate=period_end
+                timeperiod_enddate=period_end,
+                job=event
             )
             publication.save()
-            print("Saved new publication: %s", identifier_value)

         except Exception as e:
-            print("Error parsing record: %s", str(e))
+            logger.error("Error parsing record: %s", e)
             continue

-def harvest_oai_endpoint(source_id):
+def harvest_oai_endpoint(source_id, user=None):
     source = Source.objects.get(id=source_id)
     event = HarvestingEvent.objects.create(source=source, status="in_progress")

-    username = os.getenv("OPTIMAP_OAI_USERNAME")
-    password = os.getenv("OPTIMAP_OAI_PASSWORD")
-
     try:
-        with requests.Session() as session:
-            response = session.get(source.url_field, auth=HTTPBasicAuth(username, password))
-            response.raise_for_status()
-            parse_oai_xml_and_save_publications(response.content, event)
-
-            event.status = "completed"
-            event.completed_at = timezone.now()
-            event.save()
-            print("Harvesting completed for", source.url_field)
-
-    except requests.exceptions.RequestException as e:
-        print("Error harvesting from", source.url_field, ":", e)
+        response = requests.get(source.url_field)
+        response.raise_for_status()
+
+        parse_oai_xml_and_save_publications(response.content, event)
+
+        event.status = "completed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        new_count = Publication.objects.filter(job=event).count()
+        spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count()
+        temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count()
+
+        subject = f"Harvesting Completed for {source.collection_name}"
+        completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S') if event.completed_at else 'N/A'
+        message = (
+            f"Harvesting job details:\n\n"
+            f"Number of added articles: {new_count}\n"
+            f"Number of articles with spatial metadata: {spatial_count}\n"
+            f"Number of articles with temporal metadata: {temporal_count}\n"
+            f"Collection used: {source.collection_name or 'N/A'}\n"
+            f"Journal: {source.url_field}\n"
+            f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Job completed at: {completed_str}\n"
+        )
+
+        if user and user.email:
+            send_mail(
+                subject,
+                message,
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=False,
+            )
+
+    except Exception as e:
+        logger.error("Harvesting failed for source %s: %s", source.url_field, str(e))
         event.status = "failed"
-        event.log = str(e)
+        event.completed_at = timezone.now()
         event.save()
+
 def send_monthly_email(trigger_source='manual', sent_by=None):
     recipients = User.objects.filter(userprofile__notify_new_manuscripts=True).values_list('email', flat=True)
     last_month = now().replace(day=1) - timedelta(days=1)
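
The new identifier handling prefers landing-page URLs containing "/view/" over any other http(s) identifier, falling back to None when no URL-like identifier exists. The precedence rule can be checked in isolation; the identifier values below are made up:

# Made-up dc:identifier values illustrating the precedence rule.
identifiers = [
    "10.1234/example.doi",
    "https://journal.example.org/article/download/42",
    "https://journal.example.org/article/view/42",
]
http_urls = [u for u in identifiers if u.lower().startswith("http")]
view_urls = [u for u in http_urls if "/view/" in u]
identifier_value = (view_urls or http_urls or [None])[0]
assert identifier_value == "https://journal.example.org/article/view/42"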

tests/test_harvesting.py

Lines changed: 10 additions & 1 deletion

@@ -8,11 +8,20 @@
 from django_q.tasks import async_task
 import responses
 import time
+from django.contrib.auth import get_user_model
+
+User = get_user_model()
+

 class SimpleTest(TestCase):

     def setUp(self):
         self.client = Client()
+        self.user = User.objects.create_user(
+            username="testuser",
+
+            password="password123"
+        )

         results = self.client.get('/api/v1/publications/').json()['results']
         features = results.get('features', [])

@@ -95,7 +104,7 @@ def test_task_scheduling(self):
         self.assertTrue(schedule.exists(), "Django-Q task not scheduled for source.")

         from publications.tasks import harvest_oai_endpoint
-        harvest_oai_endpoint(source.id)
+        harvest_oai_endpoint(source.id, self.user)

         publications_count = Publication.objects.count()
         self.assertGreater(publications_count, 0, "No publications were harvested.")
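
Since harvest_oai_endpoint now issues a plain requests.get, the already-imported responses library can stub the endpoint so the test runs offline. A hedged sketch; the URL and XML payload are invented for illustration:

import responses

@responses.activate
def run_offline_harvest(source, user):
    # Stub the OAI-PMH endpoint; harvest_oai_endpoint() then needs no network.
    responses.add(
        responses.GET,
        source.url_field,  # e.g. "https://journal.example.org/oai" (invented)
        body="<OAI-PMH></OAI-PMH>",  # minimal payload: parser logs a warning, saves nothing
        content_type="text/xml",
    )
    from publications.tasks import harvest_oai_endpoint
    harvest_oai_endpoint(source.id, user)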
