Skip to content

Commit 645db4c

Browse files
authored
Merge pull request #147 from GeoinformationSystems/Task/Add_"optimap"_and_timestamp_136
2 parents 7ef2283 + 7f01809 commit 645db4c

File tree

7 files changed

+170
-107
lines changed

7 files changed

+170
-107
lines changed

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ ARG UBUNTU_VERSION=22.04
22

33
FROM ubuntu:${UBUNTU_VERSION}
44

5-
ENV PYTHONDONTWRITEBYTECODE 1
6-
ENV PYTHONUNBUFFERED 1
5+
ENV PYTHONDONTWRITEBYTECODE=1
6+
ENV PYTHONUNBUFFERED=1
77

88
ENV OPTIMAP_DEBUG=False
99
ENV OPTIMAP_ALLOWED_HOST=*

optimap/.env.example

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,5 @@ OPTIMAP_EMAIL_USE_TLS=True
2424
OPTIMAP_EMAIL_IMAP_SENT_FOLDER=""
2525

2626
OPTIMAP_LOGGING_CONSOLE_LEVEL=INFO
27+
28+
OPTIMAP_DATA_DUMP_RETENTION=3

optimap/settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545

4646
ROOT_URLCONF = 'optimap.urls'
4747

48+
DATA_DUMP_RETENTION = int(os.getenv("OPTIMAP_DATA_DUMP_RETENTION", 3))
49+
4850
AUTHENTICATION_BACKENDS = [
4951
'django.contrib.auth.backends.ModelBackend',
5052
"sesame.backends.ModelBackend",

publications/tasks.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,32 @@
2828
from urllib.parse import quote
2929
from django_q.tasks import schedule
3030
from django_q.models import Schedule
31+
import glob
32+
from pathlib import Path
33+
from datetime import datetime, timezone as dt_timezone
3134

3235
BASE_URL = settings.BASE_URL
3336
DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
37+
CACHE_DIR = Path(tempfile.gettempdir()) / 'optimap_cache'
38+
39+
def generate_data_dump_filename(extension: str) -> str:
40+
"""
41+
Returns: optimap_data_dump_YYYYMMDDThhmmss.<extension>
42+
"""
43+
ts = datetime.now(dt_timezone.utc).strftime("%Y%m%dT%H%M%S")
44+
return f"optimap_data_dump_{ts}.{extension}"
45+
46+
def cleanup_old_data_dumps(directory: Path, keep: int):
47+
"""
48+
Deletes all files matching optimap_data_dump_* beyond the newest `keep` ones.
49+
"""
50+
pattern = str(directory / "optimap_data_dump_*")
51+
files = sorted(glob.glob(pattern), reverse=True) # newest first
52+
for old in files[keep:]:
53+
try:
54+
os.remove(old)
55+
except OSError:
56+
logger.warning("Could not delete old dump %s", old)
3457

3558
def extract_geometry_from_html(content):
3659
for tag in content.find_all("meta"):
@@ -64,8 +87,6 @@ def extract_timeperiod_from_html(content):
6487
# returning arrays for array field in DB
6588
return [period[0]], [period[1]]
6689

67-
DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
68-
6990
def parse_oai_xml_and_save_publications(content, event):
7091
try:
7192
DOMTree = xml.dom.minidom.parseString(content)
@@ -324,7 +345,8 @@ def regenerate_geojson_cache():
324345
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
325346
os.makedirs(cache_dir, exist_ok=True)
326347

327-
json_path = os.path.join(cache_dir, 'geojson_cache.json')
348+
json_filename = generate_data_dump_filename("geojson")
349+
json_path = os.path.join(cache_dir, json_filename)
328350
with open(json_path, 'w') as f:
329351
serialize(
330352
'geojson',
@@ -334,31 +356,40 @@ def regenerate_geojson_cache():
334356
stream=f
335357
)
336358

337-
gzip_path = json_path + '.gz'
359+
gzip_filename = generate_data_dump_filename("geojson.gz")
360+
gzip_path = os.path.join(cache_dir, gzip_filename)
338361
with open(json_path, 'rb') as fin, gzip.open(gzip_path, 'wb') as fout:
339362
fout.writelines(fin)
340363

341364
size = os.path.getsize(json_path)
342365
logger.info("Cached GeoJSON at %s (%d bytes), gzipped at %s", json_path, size, gzip_path)
366+
# remove old dumps beyond retention
367+
cleanup_old_data_dumps(Path(cache_dir), settings.DATA_DUMP_RETENTION)
343368
return json_path
344369

345370
def convert_geojson_to_geopackage(geojson_path):
346371
cache_dir = os.path.dirname(geojson_path)
347-
gpkg_path = os.path.join(cache_dir, 'publications.gpkg')
372+
gpkg_filename = generate_data_dump_filename("gpkg")
373+
gpkg_path = os.path.join(cache_dir, gpkg_filename)
348374
try:
349375
output = subprocess.check_output(
350376
["ogr2ogr", "-f", "GPKG", gpkg_path, geojson_path],
351377
stderr=subprocess.STDOUT,
352378
text=True,
353379
)
354380
logger.info("ogr2ogr output:\n%s", output)
381+
# note: old dumps are not pruned here; regenerate_geopackage_cache() runs the retention cleanup after this returns
355382
return gpkg_path
356383
except subprocess.CalledProcessError as e:
357384
return None
358385
# on success, return the filename so callers can stream it or inspect it
386+
# note: no retention cleanup happens in this function; the caller regenerate_geopackage_cache() prunes old dumps
359387
return gpkg_path
360388

361389

362390
def regenerate_geopackage_cache():
363391
geojson_path = regenerate_geojson_cache()
364-
return convert_geojson_to_geopackage(geojson_path)
392+
cache_dir = Path(geojson_path).parent
393+
gpkg_path = convert_geojson_to_geopackage(geojson_path)
394+
cleanup_old_data_dumps(cache_dir, settings.DATA_DUMP_RETENTION)
395+
return gpkg_path

publications/templates/data.html

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ <h2 class="py-2">OpenAPI UI</h2>
3939

4040
<h2 class="py-2">Download Publication Data</h2>
4141
<ul class="list-unstyled mb-4">
42-
{% if geojson_size %}
42+
{% if last_geojson %}
4343
<li class="mb-3">
4444
<div class="d-flex align-items-center">
4545
<a class="btn btn-primary btn-sm" href="{% url 'optimap:download_geojson' %}">
@@ -48,28 +48,32 @@ <h2 class="py-2">Download Publication Data</h2>
4848
(<a href="https://geojson.org/" target="_blank" class="ms-2 small">GeoJSON spec</a>)
4949
</div>
5050
<div class="small text-muted mt-1">
51-
File size: {{ geojson_size }}
51+
File: {{ last_geojson }}{% if geojson_size %} &middot; Size: {{ geojson_size }}{% endif %}
5252
</div>
5353
</li>
5454
{% endif %}
5555

56-
{% if geopackage_size %}
56+
{% if last_gpkg %}
5757
<li>
5858
<div class="d-flex align-items-center">
5959
<a class="btn btn-primary btn-sm" href="{% url 'optimap:download_geopackage' %}">
6060
Download GeoPackage
61-
</a>
61+
</a>
6262
(<a href="https://www.geopackage.org/" target="_blank" class="ms-2 small">GeoPackage spec</a>)
6363
</div>
6464
<div class="small text-muted mt-1">
65-
File size: {{ geopackage_size }}
65+
File: {{ last_gpkg }}{% if geopackage_size %} &middot; Size: {{ geopackage_size }}{% endif %}
6666
</div>
6767
</li>
6868
{% endif %}
6969
</ul>
7070
<p class="small text-muted text-center mb-0">
7171
Data dumps run every {{ interval }} hour{{ interval|pluralize }}.<br>
72-
Last updated: {{ last_updated|naturaltime }}
72+
{% if last_updated %}
73+
Last updated: {{ last_updated|naturaltime }}
74+
{% else %}
75+
No dumps have been generated yet.
76+
{% endif %}
7377
</p>
7478

7579
</div>

publications/views.py

Lines changed: 64 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from django.contrib.auth import login, logout
55
from django.shortcuts import render, redirect, get_object_or_404
66
from django.core.cache import cache
7-
from django.http import HttpResponseRedirect, HttpResponse, FileResponse
7+
from django.http import HttpResponseRedirect, HttpResponse, FileResponse, Http404
88
from django.contrib.auth.decorators import login_required
99
from django.views.decorators.http import require_GET
1010
from django.core.mail import EmailMessage, send_mail, get_connection
@@ -26,7 +26,8 @@
2626
from publications.models import BlockedEmail, BlockedDomain, Subscription, UserProfile, Publication
2727
from django.contrib.auth import get_user_model
2828
User = get_user_model()
29-
import tempfile, os
29+
import tempfile, os, glob
30+
from pathlib import Path
3031
from publications.tasks import regenerate_geojson_cache, regenerate_geopackage_cache
3132
from osgeo import ogr, osr
3233
ogr.UseExceptions()
@@ -45,32 +46,34 @@
4546
@require_GET
4647
def download_geojson(request):
4748
"""
48-
Returns the cached GeoJSON file, gzipped if the client accepts it.
49+
Returns the latest GeoJSON dump file, gzipped if the client accepts it.
4950
"""
50-
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
51-
os.makedirs(cache_dir, exist_ok=True)
52-
json_path = os.path.join(cache_dir, 'geojson_cache.json')
53-
gzip_path = os.path.join(cache_dir, 'geojson_cache.json.gz')
54-
if not os.path.exists(json_path):
55-
json_path = regenerate_geojson_cache()
51+
cache_dir = Path(tempfile.gettempdir()) / "optimap_cache"
52+
cache_dir.mkdir(exist_ok=True)
53+
54+
# regenerate and find latest geojson dump
55+
path = regenerate_geojson_cache()
56+
gzip_path = Path(str(path) + ".gz")
5657
accept_enc = request.META.get('HTTP_ACCEPT_ENCODING', '')
57-
if 'gzip' in accept_enc and os.path.exists(gzip_path):
58+
59+
if 'gzip' in accept_enc and gzip_path.exists():
5860
response = FileResponse(
5961
open(gzip_path, 'rb'),
6062
content_type="application/json",
6163
as_attachment=True,
62-
filename="publications.geojson"
64+
filename=gzip_path.name
6365
)
6466
response['Content-Encoding'] = 'gzip'
6567
else:
6668
response = FileResponse(
67-
open(json_path, 'rb'),
69+
open(path, 'rb'),
6870
content_type="application/json",
6971
as_attachment=True,
70-
filename="publications.geojson"
72+
filename=Path(path).name
7173
)
7274
return response
7375

76+
7477
def generate_geopackage():
7578
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
7679
os.makedirs(cache_dir, exist_ok=True)
@@ -97,7 +100,7 @@ def generate_geopackage():
97100
feat.SetField("doi", pub.doi or "")
98101
feat.SetField("source", pub.source or "")
99102
if pub.geometry:
100-
wkb = pub.geometry.wkb # bytes
103+
wkb = pub.geometry.wkb
101104
geom = ogr.CreateGeometryFromWkb(wkb)
102105
geom.AssignSpatialReference(srs)
103106
feat.SetGeometry(geom)
@@ -110,16 +113,16 @@ def generate_geopackage():
110113
@require_GET
111114
def download_geopackage(request):
112115
"""
113-
Returns the generated GeoPackage file as a downloadable file.
116+
Returns the latest GeoPackage dump file.
114117
"""
115-
filename = generate_geopackage()
116-
if not os.path.exists(filename):
117-
return HttpResponse("Error generating GeoPackage.", status=500)
118+
path = regenerate_geopackage_cache()
119+
if not os.path.exists(path):
120+
raise Http404('GeoPackage dump not found')
118121
return FileResponse(
119-
open(filename, 'rb'),
122+
open(path, 'rb'),
120123
content_type="application/geopackage+sqlite3",
121124
as_attachment=True,
122-
filename="publications.gpkg"
125+
filename=Path(path).name
123126
)
124127

125128

@@ -185,37 +188,54 @@ def loginres(request):
185188
def privacy(request):
186189
return render(request, 'privacy.html')
187190

188-
189191
@never_cache
190192
def data(request):
191-
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
192-
json_path = os.path.join(cache_dir, "geojson_cache.json")
193-
gpkg_path = os.path.join(cache_dir, "publications.gpkg")
194-
195-
# If dumps don’t exist yet, trigger one synchronously
196-
if not os.path.exists(json_path) or not os.path.exists(gpkg_path):
197-
regenerate_geopackage_cache()
198-
199-
if os.path.exists(json_path):
200-
geojson_size = humanize.naturalsize(os.path.getsize(json_path), binary=True)
201-
else:
202-
geojson_size = None
203-
204-
if os.path.exists(gpkg_path):
205-
geopackage_size = humanize.naturalsize(os.path.getsize(gpkg_path), binary=True)
193+
"""
194+
Renders the data page showing links and sizes for the latest dumps.
195+
"""
196+
cache_dir = Path(tempfile.gettempdir()) / "optimap_cache"
197+
cache_dir.mkdir(exist_ok=True)
198+
199+
# scan for existing dumps
200+
geojson_files = sorted(cache_dir.glob('optimap_data_dump_*.geojson'), reverse=True)
201+
gpkg_files = sorted(cache_dir.glob('optimap_data_dump_*.gpkg'), reverse=True)
202+
203+
last_geo = geojson_files[0] if geojson_files else None
204+
last_gzip = Path(str(last_geo) + ".gz") if last_geo else None
205+
last_gpkg = gpkg_files[0] if gpkg_files else None
206+
207+
# Supervisor check: log a warning if the dump files' modification times differ by more than 1 hour
208+
mtimes = []
209+
for p in (last_geo, last_gzip, last_gpkg):
210+
if p and p.exists():
211+
mtimes.append(p.stat().st_mtime)
212+
if mtimes and (max(mtimes) - min(mtimes) > 3600):
213+
ts_map = {
214+
p.name: datetime.fromtimestamp(p.stat().st_mtime, get_default_timezone())
215+
for p in (last_geo, last_gzip, last_gpkg) if p and p.exists()
216+
}
217+
logger.warning("Data dump timestamps differ by >1h: %s", ts_map)
218+
219+
# humanized sizes
220+
geojson_size = humanize.naturalsize(last_geo.stat().st_size, binary=True) if last_geo else None
221+
geopackage_size = humanize.naturalsize(last_gpkg.stat().st_size, binary=True) if last_gpkg else None
222+
223+
# last updated timestamp (using JSON file)
224+
if last_geo:
225+
ts = last_geo.stat().st_mtime
226+
last_updated = datetime.fromtimestamp(ts, get_default_timezone())
206227
else:
207-
geopackage_size = None
208-
ts = os.path.getmtime(json_path)
209-
tz = get_default_timezone()
210-
last_updated = datetime.fromtimestamp(ts, tz)
228+
last_updated = None
211229

212230
return render(request, 'data.html', {
213-
'geojson_size': geojson_size,
231+
'geojson_size': geojson_size,
214232
'geopackage_size': geopackage_size,
215-
'interval': settings.DATA_DUMP_INTERVAL_HOURS,
216-
'last_updated': last_updated,
233+
'interval': settings.DATA_DUMP_INTERVAL_HOURS,
234+
'last_updated': last_updated,
235+
'last_geojson': last_geo.name if last_geo else None,
236+
'last_gpkg': last_gpkg.name if last_gpkg else None,
217237
})
218-
238+
219239
def Confirmationlogin(request):
220240
return render(request, 'confirmation_login.html')
221241

0 commit comments

Comments
 (0)