4 changes: 2 additions & 2 deletions Dockerfile
@@ -2,8 +2,8 @@ ARG UBUNTU_VERSION=22.04

FROM ubuntu:${UBUNTU_VERSION}

-ENV PYTHONDONTWRITEBYTECODE 1
-ENV PYTHONUNBUFFERED 1
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1

ENV OPTIMAP_DEBUG=False
ENV OPTIMAP_ALLOWED_HOST=*
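Note: the space-separated `ENV <key> <value>` form is legacy Dockerfile syntax; BuildKit flags it with a `LegacyKeyValueFormat` lint warning, so the `key=value` form used here silences the warning and matches the other `ENV` lines in this file.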
2 changes: 2 additions & 0 deletions optimap/.env.example
@@ -24,3 +24,5 @@ OPTIMAP_EMAIL_USE_TLS=True
OPTIMAP_EMAIL_IMAP_SENT_FOLDER=""

OPTIMAP_LOGGING_CONSOLE_LEVEL=INFO

+OPTIMAP_DATA_DUMP_RETENTION=3
2 changes: 2 additions & 0 deletions optimap/settings.py
@@ -45,6 +45,8 @@

ROOT_URLCONF = 'optimap.urls'

+DATA_DUMP_RETENTION = int(os.getenv("OPTIMAP_DATA_DUMP_RETENTION", 3))

AUTHENTICATION_BACKENDS = [
    'django.contrib.auth.backends.ModelBackend',
    "sesame.backends.ModelBackend",
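Note on the new setting: `int(os.getenv("OPTIMAP_DATA_DUMP_RETENTION", 3))` raises `ValueError` at import time if the variable is set to a non-numeric string. A minimal hardening sketch, not part of this PR (same names as in the diff, behavior assumed):

    # Hypothetical variant: fall back to the default instead of
    # failing at startup on a malformed environment value.
    try:
        DATA_DUMP_RETENTION = int(os.getenv("OPTIMAP_DATA_DUMP_RETENTION", "3"))
    except ValueError:
        DATA_DUMP_RETENTION = 3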
43 changes: 37 additions & 6 deletions publications/tasks.py
@@ -28,9 +28,32 @@
from urllib.parse import quote
from django_q.tasks import schedule
from django_q.models import Schedule
+import glob
+from pathlib import Path
+from datetime import datetime, timezone as dt_timezone

BASE_URL = settings.BASE_URL
+DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
CACHE_DIR = Path(tempfile.gettempdir()) / 'optimap_cache'

+def generate_data_dump_filename(extension: str) -> str:
+    """
+    Returns: optimap_data_dump_YYYYMMDDThhmmss.<extension>
+    """
+    ts = datetime.now(dt_timezone.utc).strftime("%Y%m%dT%H%M%S")
+    return f"optimap_data_dump_{ts}.{extension}"
+
+def cleanup_old_data_dumps(directory: Path, keep: int):
+    """
+    Deletes all files matching optimap_data_dump_* beyond the newest `keep` ones.
+    """
+    pattern = str(directory / "optimap_data_dump_*")
+    files = sorted(glob.glob(pattern), reverse=True)  # newest first
+    for old in files[keep:]:
+        try:
+            os.remove(old)
+        except OSError:
+            logger.warning("Could not delete old dump %s", old)
+
def extract_geometry_from_html(content):
    for tag in content.find_all("meta"):
Expand Down Expand Up @@ -64,8 +87,6 @@ def extract_timeperiod_from_html(content):
    # returning arrays for array field in DB
    return [period[0]], [period[1]]

-DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)

def parse_oai_xml_and_save_publications(content, event):
    try:
        DOMTree = xml.dom.minidom.parseString(content)
@@ -324,7 +345,8 @@ def regenerate_geojson_cache()
    cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
    os.makedirs(cache_dir, exist_ok=True)

-    json_path = os.path.join(cache_dir, 'geojson_cache.json')
+    json_filename = generate_data_dump_filename("geojson")
+    json_path = os.path.join(cache_dir, json_filename)
    with open(json_path, 'w') as f:
        serialize(
            'geojson',
@@ -334,31 +356,40 @@ def regenerate_geojson_cache()
            stream=f
        )

-    gzip_path = json_path + '.gz'
+    gzip_filename = generate_data_dump_filename("geojson.gz")
+    gzip_path = os.path.join(cache_dir, gzip_filename)
    with open(json_path, 'rb') as fin, gzip.open(gzip_path, 'wb') as fout:
        fout.writelines(fin)

    size = os.path.getsize(json_path)
    logger.info("Cached GeoJSON at %s (%d bytes), gzipped at %s", json_path, size, gzip_path)
+    # remove old dumps beyond retention
+    cleanup_old_data_dumps(Path(cache_dir), settings.DATA_DUMP_RETENTION)
    return json_path

def convert_geojson_to_geopackage(geojson_path):
    cache_dir = os.path.dirname(geojson_path)
-    gpkg_path = os.path.join(cache_dir, 'publications.gpkg')
+    gpkg_filename = generate_data_dump_filename("gpkg")
+    gpkg_path = os.path.join(cache_dir, gpkg_filename)
    try:
        output = subprocess.check_output(
            ["ogr2ogr", "-f", "GPKG", gpkg_path, geojson_path],
            stderr=subprocess.STDOUT,
            text=True,
        )
        logger.info("ogr2ogr output:\n%s", output)
+        # remove old dumps beyond retention
+        return gpkg_path
    except subprocess.CalledProcessError as e:
        return None
+    # on success, return the filename so callers can stream it or inspect it
+    # remove old dumps beyond retention
    return gpkg_path


def regenerate_geopackage_cache():
    geojson_path = regenerate_geojson_cache()
-    return convert_geojson_to_geopackage(geojson_path)
+    cache_dir = Path(geojson_path).parent
+    gpkg_path = convert_geojson_to_geopackage(geojson_path)
+    cleanup_old_data_dumps(cache_dir, settings.DATA_DUMP_RETENTION)
+    return gpkg_path
16 changes: 10 additions & 6 deletions publications/templates/data.html
@@ -39,7 +39,7 @@ <h2 class="py-2">OpenAPI UI</h2>

<h2 class="py-2">Download Publication Data</h2>
<ul class="list-unstyled mb-4">
-{% if geojson_size %}
+{% if last_geojson %}
<li class="mb-3">
<div class="d-flex align-items-center">
<a class="btn btn-primary btn-sm" href="{% url 'optimap:download_geojson' %}">
@@ -48,28 +48,32 @@ <h2 class="py-2">Download Publication Data</h2>
(<a href="https://geojson.org/" target="_blank" class="ms-2 small">GeoJSON spec</a>)
</div>
<div class="small text-muted mt-1">
-File size: {{ geojson_size }}
+File: {{ last_geojson }}{% if geojson_size %} &middot; Size: {{ geojson_size }}{% endif %}
</div>
</li>
{% endif %}

-{% if geopackage_size %}
+{% if last_gpkg %}
<li>
<div class="d-flex align-items-center">
<a class="btn btn-primary btn-sm" href="{% url 'optimap:download_geopackage' %}">
Download GeoPackage
-</a>
+</a>
(<a href="https://www.geopackage.org/" target="_blank" class="ms-2 small">GeoPackage spec</a>)
</div>
<div class="small text-muted mt-1">
-File size: {{ geopackage_size }}
+File: {{ last_gpkg }}{% if geopackage_size %} &middot; Size: {{ geopackage_size }}{% endif %}
</div>
</li>
{% endif %}
</ul>
<p class="small text-muted text-center mb-0">
Data dumps run every {{ interval }} hour{{ interval|pluralize }}.<br>
-Last updated: {{ last_updated|naturaltime }}
+{% if last_updated %}
+Last updated: {{ last_updated|naturaltime }}
+{% else %}
+No dumps have been generated yet.
+{% endif %}
</p>

</div>
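Note: `naturaltime` comes from `django.contrib.humanize`; this hunk assumes `{% load humanize %}` already appears above the shown lines and that the app is listed in `INSTALLED_APPS`.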
108 changes: 64 additions & 44 deletions publications/views.py
@@ -4,7 +4,7 @@
from django.contrib.auth import login, logout
from django.shortcuts import render, redirect, get_object_or_404
from django.core.cache import cache
-from django.http import HttpResponseRedirect, HttpResponse, FileResponse
+from django.http import HttpResponseRedirect, HttpResponse, FileResponse, Http404
from django.contrib.auth.decorators import login_required
from django.views.decorators.http import require_GET
from django.core.mail import EmailMessage, send_mail, get_connection
@@ -26,7 +26,8 @@
from publications.models import BlockedEmail, BlockedDomain, Subscription, UserProfile, Publication
from django.contrib.auth import get_user_model
User = get_user_model()
-import tempfile, os
+import tempfile, os, glob
+from pathlib import Path
from publications.tasks import regenerate_geojson_cache, regenerate_geopackage_cache
from osgeo import ogr, osr
ogr.UseExceptions()
@@ -45,32 +46,34 @@
@require_GET
def download_geojson(request):
"""
Returns the cached GeoJSON file, gzipped if the client accepts it.
Returns the latest GeoJSON dump file, gzipped if the client accepts it.
"""
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
os.makedirs(cache_dir, exist_ok=True)
json_path = os.path.join(cache_dir, 'geojson_cache.json')
gzip_path = os.path.join(cache_dir, 'geojson_cache.json.gz')
if not os.path.exists(json_path):
json_path = regenerate_geojson_cache()
cache_dir = Path(tempfile.gettempdir()) / "optimap_cache"
cache_dir.mkdir(exist_ok=True)

# regenerate and find latest geojson dump
path = regenerate_geojson_cache()
gzip_path = Path(str(path) + ".gz")
    accept_enc = request.META.get('HTTP_ACCEPT_ENCODING', '')
-    if 'gzip' in accept_enc and os.path.exists(gzip_path):
+
+    if 'gzip' in accept_enc and gzip_path.exists():
        response = FileResponse(
            open(gzip_path, 'rb'),
            content_type="application/json",
            as_attachment=True,
-            filename="publications.geojson"
+            filename=gzip_path.name
        )
        response['Content-Encoding'] = 'gzip'
    else:
        response = FileResponse(
-            open(json_path, 'rb'),
+            open(path, 'rb'),
            content_type="application/json",
            as_attachment=True,
-            filename="publications.geojson"
+            filename=Path(path).name
        )
    return response


def generate_geopackage():
    cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
    os.makedirs(cache_dir, exist_ok=True)
@@ -97,7 +100,7 @@ def generate_geopackage()
feat.SetField("doi", pub.doi or "")
feat.SetField("source", pub.source or "")
if pub.geometry:
wkb = pub.geometry.wkb # bytes
wkb = pub.geometry.wkb
geom = ogr.CreateGeometryFromWkb(wkb)
geom.AssignSpatialReference(srs)
feat.SetGeometry(geom)
@@ -110,16 +113,16 @@ def generate_geopackage()
@require_GET
def download_geopackage(request):
    """
-    Returns the generated GeoPackage file as a downloadable file.
+    Returns the latest GeoPackage dump file.
    """
-    filename = generate_geopackage()
-    if not os.path.exists(filename):
-        return HttpResponse("Error generating GeoPackage.", status=500)
+    path = regenerate_geopackage_cache()
+    if not os.path.exists(path):
+        raise Http404('GeoPackage dump not found')
    return FileResponse(
-        open(filename, 'rb'),
+        open(path, 'rb'),
        content_type="application/geopackage+sqlite3",
        as_attachment=True,
-        filename="publications.gpkg"
+        filename=Path(path).name
    )


@@ -185,37 +188,54 @@ def loginres(request)
def privacy(request):
    return render(request, 'privacy.html')


@never_cache
def data(request):
-    cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
-    json_path = os.path.join(cache_dir, "geojson_cache.json")
-    gpkg_path = os.path.join(cache_dir, "publications.gpkg")
-
-    # If dumps don’t exist yet, trigger one synchronously
-    if not os.path.exists(json_path) or not os.path.exists(gpkg_path):
-        regenerate_geopackage_cache()
-
-    if os.path.exists(json_path):
-        geojson_size = humanize.naturalsize(os.path.getsize(json_path), binary=True)
-    else:
-        geojson_size = None
-
-    if os.path.exists(gpkg_path):
-        geopackage_size = humanize.naturalsize(os.path.getsize(gpkg_path), binary=True)
"""
Renders the data page showing links and sizes for the latest dumps.
"""
cache_dir = Path(tempfile.gettempdir()) / "optimap_cache"
cache_dir.mkdir(exist_ok=True)

# scan for existing dumps
geojson_files = sorted(cache_dir.glob('optimap_data_dump_*.geojson'), reverse=True)
gpkg_files = sorted(cache_dir.glob('optimap_data_dump_*.gpkg'), reverse=True)

last_geo = geojson_files[0] if geojson_files else None
last_gzip = Path(str(last_geo) + ".gz") if last_geo else None
last_gpkg = gpkg_files[0] if gpkg_files else None

# — Supervisor check: ensure all dump file times are within 1 hour
mtimes = []
for p in (last_geo, last_gzip, last_gpkg):
if p and p.exists():
mtimes.append(p.stat().st_mtime)
if mtimes and (max(mtimes) - min(mtimes) > 3600):
ts_map = {
p.name: datetime.fromtimestamp(p.stat().st_mtime, get_default_timezone())
for p in (last_geo, last_gzip, last_gpkg) if p and p.exists()
}
logger.warning("Data dump timestamps differ by >1h: %s", ts_map)

# humanized sizes
geojson_size = humanize.naturalsize(last_geo.stat().st_size, binary=True) if last_geo else None
geopackage_size = humanize.naturalsize(last_gpkg.stat().st_size, binary=True) if last_gpkg else None

# last updated timestamp (using JSON file)
if last_geo:
ts = last_geo.stat().st_mtime
last_updated = datetime.fromtimestamp(ts, get_default_timezone())
    else:
-        geopackage_size = None
-    ts = os.path.getmtime(json_path)
-    tz = get_default_timezone()
-    last_updated = datetime.fromtimestamp(ts, tz)
+        last_updated = None

    return render(request, 'data.html', {
-        'geojson_size': geojson_size,
+        'geojson_size': geojson_size,
        'geopackage_size': geopackage_size,
-        'interval': settings.DATA_DUMP_INTERVAL_HOURS,
-        'last_updated': last_updated,
+        'interval': settings.DATA_DUMP_INTERVAL_HOURS,
+        'last_updated': last_updated,
+        'last_geojson': last_geo.name if last_geo else None,
+        'last_gpkg': last_gpkg.name if last_gpkg else None,
    })

def Confirmationlogin(request):
    return render(request, 'confirmation_login.html')

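Note on the gzip negotiation in `download_geojson`: the view streams the pre-compressed `.gz` dump and sets `Content-Encoding: gzip` itself, so HTTP clients that decompress transparently still end up with plain GeoJSON. A quick smoke test, assuming a local dev server and a guessed URL path (check `urls.py` for the real route behind `optimap:download_geojson`):

    import requests

    # URL is an assumption for illustration; adjust to the project's routing.
    r = requests.get(
        "http://localhost:8000/data/geojson/",
        headers={"Accept-Encoding": "gzip"},
    )
    print(r.headers.get("Content-Encoding"))  # 'gzip' when the .gz dump exists
    print(len(r.content), "bytes after transparent decompression")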