@@ -28,6 +28,7 @@
 import time
 import calendar
 import re
+from django.contrib.gis.geos import GeometryCollection
 
 BASE_URL = settings.BASE_URL
 
@@ -66,105 +67,148 @@ def extract_timeperiod_from_html(content):
 DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
 
 def parse_oai_xml_and_save_publications(content, event):
+    try:
+        DOMTree = xml.dom.minidom.parseString(content)
+    except Exception as e:
+        logger.error("Error parsing XML: %s", e)
+        return
 
-    DOMTree = xml.dom.minidom.parseString(content)
     collection = DOMTree.documentElement
-
     records = collection.getElementsByTagName("record")
-
     if not records:
         logger.warning("No articles found in OAI-PMH response!")
         return
-
-    existing_urls = set(Publication.objects.values_list('url', flat=True))
-    existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))
     for record in records:
         try:
             def get_text(tag_name):
                 nodes = record.getElementsByTagName(tag_name)
                 return nodes[0].firstChild.nodeValue.strip() if nodes and nodes[0].firstChild else None
 
-            identifier_value = get_text("dc:identifier")
+            # collect all dc:identifier values
+            id_nodes = record.getElementsByTagName("dc:identifier")
+            identifiers = [
+                n.firstChild.nodeValue.strip()
+                for n in id_nodes
+                if n.firstChild and n.firstChild.nodeValue
+            ]
+            http_urls = [u for u in identifiers if u.lower().startswith("http")]
+            view_urls = [u for u in http_urls if "/view/" in u]
+            identifier_value = (view_urls or http_urls or [None])[0]
+
             title_value = get_text("dc:title")
             abstract_text = get_text("dc:description")
             journal_value = get_text("dc:publisher")
             date_value = get_text("dc:date")
 
             doi_text = None
-            doi_nodes = record.getElementsByTagName("dc:identifier")
-            for node in doi_nodes:
-                if node.firstChild and node.firstChild.nodeValue:
-                    candidate = node.firstChild.nodeValue.strip()
-                    match = DOI_REGEX.search(candidate)
-                    if match:
-                        doi_text = match.group(0)
-                        break
+            for ident in identifiers:
+                if match := DOI_REGEX.search(ident):
+                    doi_text = match.group(0)
+                    break
 
-            if not identifier_value or not identifier_value.startswith("http"):
-                logger.warning("Skipping record with invalid URL: %s", identifier_value)
-                continue
-
-            if doi_text and doi_text in existing_dois:
+            if doi_text and Publication.objects.filter(doi=doi_text).exists():
                 logger.info("Skipping duplicate publication (DOI): %s", doi_text)
                 continue
-
-            if identifier_value in existing_urls:
+            if identifier_value and Publication.objects.filter(url=identifier_value).exists():
                 logger.info("Skipping duplicate publication (URL): %s", identifier_value)
                 continue
+            # Skip records without a valid URL.
+            if not identifier_value or not identifier_value.startswith("http"):
+                logger.warning("Skipping record with invalid URL: %s", identifier_value)
+                continue
 
-            existing_urls.add(identifier_value)
-            if doi_text:
-                existing_dois.add(doi_text)
-
-            geom_object = None
-            period_start = []
-            period_end = []
-            with requests.get(identifier_value) as response:
-                soup = BeautifulSoup(response.content, "html.parser")
-                geom_object = extract_geometry_from_html(soup)
-                period_start, period_end = extract_timeperiod_from_html(soup)
+            geom_object = GeometryCollection()
+            period_start, period_end = [], []
+            try:
+                resp = requests.get(identifier_value, timeout=10)
+                resp.raise_for_status()
+                soup = BeautifulSoup(resp.content, "html.parser")
+
+                try:
+                    geom = extract_geometry_from_html(soup)
+                    geom_object = geom or GeometryCollection()
+                except Exception as geo_err:
+                    logger.error("Geometry extraction failed for URL %s: %s", identifier_value, geo_err)
+                    geom_object = GeometryCollection()
+
+                try:
+                    start_time, end_time = extract_timeperiod_from_html(soup)
+                    if isinstance(start_time, list):
+                        period_start = [d for d in start_time if d]
+                    if isinstance(end_time, list):
+                        period_end = [d for d in end_time if d]
+                except Exception as time_err:
+                    logger.error("Time period extraction failed for URL %s: %s", identifier_value, time_err)
+
+            except Exception as fetch_err:
+                logger.error("Error fetching HTML for %s: %s", identifier_value, fetch_err)
+                geom_object = GeometryCollection()
+                period_start, period_end = [], []
 
             publication = Publication(
                 title=title_value,
                 abstract=abstract_text,
                 publicationDate=date_value,
                 url=identifier_value,
-                doi=doi_text if doi_text else None,
+                doi=doi_text,
                 source=journal_value,
                 geometry=geom_object,
                 timeperiod_startdate=period_start,
-                timeperiod_enddate=period_end
+                timeperiod_enddate=period_end,
+                job=event
             )
             publication.save()
-            print("Saved new publication: %s", identifier_value)
 
         except Exception as e:
-            print("Error parsing record: %s", str(e))
+            logger.error("Error parsing record: %s", e)
             continue
 
-def harvest_oai_endpoint(source_id):
+def harvest_oai_endpoint(source_id, user=None):
     source = Source.objects.get(id=source_id)
     event = HarvestingEvent.objects.create(source=source, status="in_progress")
 
-    username = os.getenv("OPTIMAP_OAI_USERNAME")
-    password = os.getenv("OPTIMAP_OAI_PASSWORD")
-
     try:
-        with requests.Session() as session:
-            response = session.get(source.url_field, auth=HTTPBasicAuth(username, password))
-            response.raise_for_status()
-            parse_oai_xml_and_save_publications(response.content, event)
-
-        event.status = "completed"
-        event.completed_at = timezone.now()
-        event.save()
-        print("Harvesting completed for", source.url_field)
-
-    except requests.exceptions.RequestException as e:
-        print("Error harvesting from", source.url_field, ":", e)
+        response = requests.get(source.url_field)
+        response.raise_for_status()
+
+        parse_oai_xml_and_save_publications(response.content, event)
+
+        event.status = "completed"
+        event.completed_at = timezone.now()
+        event.save()
+
+        new_count = Publication.objects.filter(job=event).count()
+        spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count()
+        temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count()
+
+        subject = f"Harvesting Completed for {source.collection_name}"
+        completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S') if event.completed_at else 'N/A'
+        message = (
+            f"Harvesting job details:\n\n"
+            f"Number of added articles: {new_count}\n"
+            f"Number of articles with spatial metadata: {spatial_count}\n"
+            f"Number of articles with temporal metadata: {temporal_count}\n"
+            f"Collection used: {source.collection_name or 'N/A'}\n"
+            f"Journal: {source.url_field}\n"
+            f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"Job completed at: {completed_str}\n"
+        )
+
+        if user and user.email:
+            send_mail(
+                subject,
+                message,
+                settings.EMAIL_HOST_USER,
+                [user.email],
+                fail_silently=False,
+            )
+
+    except Exception as e:
+        logger.error("Harvesting failed for source %s: %s", source.url_field, str(e))
         event.status = "failed"
-        event.log = str(e)
+        event.completed_at = timezone.now()
         event.save()
+
 def send_monthly_email(trigger_source='manual', sent_by=None):
     recipients = User.objects.filter(userprofile__notify_new_manuscripts=True).values_list('email', flat=True)
     last_month = now().replace(day=1) - timedelta(days=1)
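To try the new harvest_oai_endpoint(source_id, user=None) signature by hand, a small management command along these lines would work; the module path publications.tasks, the command name, and the argument names are assumptions for illustration, not part of this change:

# management/commands/harvest_source.py (hypothetical location)
from django.contrib.auth import get_user_model
from django.core.management.base import BaseCommand

from publications.tasks import harvest_oai_endpoint  # assumed module path


class Command(BaseCommand):
    help = "Run a one-off OAI-PMH harvest for a single source."

    def add_arguments(self, parser):
        parser.add_argument("source_id", type=int)
        parser.add_argument("--notify-user-id", type=int, default=None)

    def handle(self, *args, **options):
        user = None
        if options["notify_user_id"] is not None:
            # Passing a user makes harvest_oai_endpoint send the completion email.
            user = get_user_model().objects.get(id=options["notify_user_id"])
        harvest_oai_endpoint(options["source_id"], user=user)

Placed in an app's management/commands directory, this could be run as "python manage.py harvest_source 1 --notify-user-id 2".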
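A regression test sketch for the reworked identifier handling, feeding one record with a /view/ URL and a DOI among its dc:identifier values; model and field names come from the diff above, while the import paths, the sample XML, and the date format accepted by publicationDate are assumptions:

from unittest.mock import patch

from django.test import TestCase

from publications.models import HarvestingEvent, Publication, Source  # assumed module path
from publications.tasks import parse_oai_xml_and_save_publications  # assumed module path

# One OAI-PMH record with two dc:identifier values: a landing-page URL and a DOI.
OAI_SAMPLE = b"""<?xml version="1.0"?>
<OAI-PMH>
  <ListRecords>
    <record>
      <metadata>
        <dc:title>Example article</dc:title>
        <dc:identifier>https://journal.example.org/article/view/42</dc:identifier>
        <dc:identifier>10.1234/example.42</dc:identifier>
        <dc:date>2024-01-01</dc:date>
      </metadata>
    </record>
  </ListRecords>
</OAI-PMH>"""


class ParseOaiRecordTest(TestCase):
    def test_view_url_and_doi_are_extracted(self):
        source = Source.objects.create(url_field="https://journal.example.org/oai")
        event = HarvestingEvent.objects.create(source=source, status="in_progress")
        # Keep the landing-page fetch offline; the parser then falls back to an
        # empty GeometryCollection and empty time periods.
        with patch("publications.tasks.requests.get", side_effect=Exception("offline")):
            parse_oai_xml_and_save_publications(OAI_SAMPLE, event)
        pub = Publication.objects.get(job=event)
        self.assertEqual(pub.url, "https://journal.example.org/article/view/42")
        self.assertEqual(pub.doi, "10.1234/example.42")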