Skip to content

Commit a08856b

Browse files
authored
Merge pull request #124 from GeoinformationSystems/enhancement/Download_all_geometries_and_metadata_as_GeoJSON_61
2 parents 0733ee8 + 100dc2a commit a08856b

File tree

15 files changed

+562
-192
lines changed

15 files changed

+562
-192
lines changed

.github/workflows/ci.yml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,14 @@ jobs:
4141
run: |
4242
sudo apt-get update
4343
sudo add-apt-repository ppa:ubuntugis/ppa
44-
sudo apt-get install -y -qq gdal-bin libgdal-dev python3-gdal
45-
44+
sudo apt-get install -y -qq gdal-bin libgdal-dev
4645
- name: Install Python Dependencies
4746
run: |
4847
python -m pip install --upgrade pip
49-
pip install -r requirements.txt
50-
pip install -r requirements-dev.txt
51-
48+
python -m pip install gdal=="$(gdal-config --version).*"
49+
python -m pip install -r requirements.txt
50+
python -m pip install -r requirements-dev.txt
51+
5252
- name: Run Django migrations
5353
run: |
5454
python manage.py migrate
@@ -110,13 +110,14 @@ jobs:
110110
run: |
111111
sudo apt-get update
112112
sudo add-apt-repository ppa:ubuntugis/ppa
113-
sudo apt-get install -y -qq gdal-bin libgdal-dev python3-gdal
113+
sudo apt-get install -y -qq gdal-bin libgdal-dev
114114
115115
- name: Install Python Dependencies
116116
run: |
117117
python -m pip install --upgrade pip
118-
pip install -r requirements.txt
119-
pip install -r requirements-dev.txt
118+
python -m pip install gdal=="$(gdal-config --version).*"
119+
python -m pip install -r requirements.txt
120+
python -m pip install -r requirements-dev.txt
120121
121122
- name: Run Django migrations
122123
run: |

Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ RUN apt-get update && \
2121
RUN apt-get update && \
2222
apt-get install -y -qq software-properties-common && \
2323
add-apt-repository ppa:ubuntugis/ppa && \
24-
apt-get install -y -qq gdal-bin libgdal-dev python3-gdal
24+
25+
RUN pip install gdal=="$(gdal-config --version).*"
2526

2627
RUN mkdir -p /code
2728

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,13 @@ source .venv/bin/activate
111111
# Confirm Python path
112112
which python
113113

114-
# Instal GDAL and the Python GDAL bindings, see Dockerfile for example on Ubuntu
114+
# Install GDAL
115115
gdalinfo --version
116116

117-
# Install non-GDAL Python dependencies
117+
# Install the GDAL Python bindings matching your installed GDAL version
118+
pip install gdal=="$(gdal-config --version).*"
119+
120+
# Install Python dependencies
118121
pip install -r requirements.txt
119122

120123
# create local DB container (once)

optimap/settings.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
'django.contrib.staticfiles',
6060
'django.contrib.gis',
6161
'django.contrib.sitemaps',
62+
'django.contrib.humanize',
6263
'publications',
6364
'rest_framework',
6465
'rest_framework_gis',
@@ -191,6 +192,7 @@
191192
OAI_USERNAME = env("OPTIMAP_OAI_USERNAME", default="")
192193
OAI_PASSWORD = env("OPTIMAP_OAI_PASSWORD", default="")
193194
EMAIL_SEND_DELAY = 2
195+
DATA_DUMP_INTERVAL_HOURS = 6
194196

195197
MIDDLEWARE = [
196198
'django.middleware.cache.UpdateCacheMiddleware',
@@ -210,6 +212,8 @@
210212
"django.contrib.sites.middleware.CurrentSiteMiddleware",
211213
"sesame.middleware.AuthenticationMiddleware",
212214
"django_currentuser.middleware.ThreadLocalUserMiddleware",
215+
"django.middleware.gzip.GZipMiddleware",
216+
213217
]
214218

215219
ROOT_URLCONF = 'optimap.urls'

optimap/urls.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
}
2626

2727
urlpatterns = [
28-
path('admin/', admin.site.urls),
29-
path('', include('publications.urls')),
28+
path('admin/', admin.site.urls),
29+
path('', include(('publications.urls', 'optimap'), namespace='optimap')),
3030
path(
3131
"sitemap.xml",
3232
sitemaps_views.index,
@@ -40,16 +40,13 @@
4040
name="django.contrib.sitemaps.views.sitemap",
4141
),
4242
re_path(r'^robots.txt', RobotsView.as_view(), name="robots_file"),
43-
]
43+
]
4444

45-
# https://stackoverflow.com/a/18272203/261210
45+
# Context processor for the site
4646
from django.contrib.sites.shortcuts import get_current_site
4747
from django.utils.functional import SimpleLazyObject
4848

4949
def site(request):
5050
protocol = 'https' if request.is_secure() else 'http'
5151
site = SimpleLazyObject(lambda: "{0}://{1}".format(protocol, get_current_site(request)))
52-
53-
return {
54-
'site': site,
55-
}
52+
return {'site': site}

publications/apps.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,39 @@
1+
import logging
12
from django.apps import AppConfig
3+
from django.db.models.signals import post_migrate
4+
from django.conf import settings
5+
from django.utils import timezone
6+
7+
logger = logging.getLogger(__name__)
8+
9+
def schedule_data_dump(sender, **kwargs):
    """Create the recurring data-dump schedule if none exists yet.

    Intended as a signal receiver (``PublicationsConfig.ready`` connects it
    to ``post_migrate``). It looks for a Django-Q ``Schedule`` whose ``func``
    is the GeoPackage regeneration task and creates one only when no such
    entry is found, so repeated invocations do not pile up duplicates.

    Args:
        sender: The signal sender; not used.
        **kwargs: Additional signal arguments; not used.
    """
    # Function-scope imports, presumably so django_q models are only loaded
    # once the app registry is ready -- TODO confirm.
    from django_q.models import Schedule
    from django_q.tasks import schedule

    func_name = "publications.tasks.regenerate_geopackage_cache"

    # Guard clause: nothing to do when the task is already registered.
    if Schedule.objects.filter(func=func_name).exists():
        return

    schedule(
        func_name,
        schedule_type="I",  # interval schedule, period given in minutes
        minutes=settings.DATA_DUMP_INTERVAL_HOURS * 60,
        next_run=timezone.now(),
        repeats=-1,  # repeat indefinitely
    )
    logger.info(
        "Scheduled data‐dump task '%s' every %d hours",
        func_name,
        settings.DATA_DUMP_INTERVAL_HOURS,
    )
227

328
class PublicationsConfig(AppConfig):
4-
default_auto_field = 'django.db.models.BigAutoField'
5-
name = 'publications'
29+
name = "publications"
30+
default_auto_field = "django.db.models.BigAutoField"
631

732
def ready(self):
8-
# Implicitly connect signal handlers decorated with @receiver.
9-
from . import signals
33+
import publications.signals
34+
post_migrate.connect(
35+
schedule_data_dump,
36+
sender=self,
37+
weak=False,
38+
dispatch_uid="publications.schedule_data_dump",
39+
)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from django.core.management.base import BaseCommand
2+
from django_q.tasks import schedule
3+
from django_q.models import Schedule
4+
5+
class Command(BaseCommand):
    """Management command that registers the periodic GeoJSON cache regeneration.

    The interval is read from ``settings.DATA_DUMP_INTERVAL_HOURS`` so the
    schedule created here stays in step with the rest of the project; when
    the setting is absent it falls back to the previous fixed value of
    6 hours.
    """

    help = (
        "Schedule the GeoJSON regeneration task at the configured "
        "DATA_DUMP_INTERVAL_HOURS interval (default: every 6 hours)."
    )

    def handle(self, *args, **options):
        """Create the Django-Q schedule unless one already exists."""
        # Local import keeps this block self-contained; the module header
        # only imports BaseCommand and the django_q helpers.
        from django.conf import settings

        func_name = 'publications.tasks.regenerate_geojson_cache'
        # Fall back to the historical 6 hours when the setting is missing.
        interval_hours = getattr(settings, "DATA_DUMP_INTERVAL_HOURS", 6)

        if Schedule.objects.filter(func=func_name).exists():
            self.stdout.write("GeoJSON regeneration already scheduled.")
            return

        schedule(
            func_name,
            schedule_type='I',  # interval
            minutes=interval_hours * 60,
            repeats=-1,  # repeat indefinitely
        )
        self.stdout.write(
            self.style.SUCCESS(
                f"Scheduled GeoJSON regeneration every {interval_hours}h."
            )
        )

publications/tasks.py

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,36 @@
11
import logging
22
logger = logging.getLogger(__name__)
33

4-
from django_q.models import Schedule
5-
from publications.models import Publication, HarvestingEvent, Source
6-
from bs4 import BeautifulSoup
4+
import os
75
import json
6+
import subprocess
7+
import gzip
8+
import re
9+
import tempfile
10+
import time
11+
import calendar
12+
from datetime import datetime, timedelta
813
import xml.dom.minidom
9-
from django.contrib.gis.geos import GEOSGeometry
1014
import requests
11-
from django.core.mail import send_mail, EmailMessage
12-
from django.utils import timezone
15+
from bs4 import BeautifulSoup
1316
from requests.auth import HTTPBasicAuth
14-
import os
17+
from urllib.parse import quote
1518
from django.conf import settings
16-
from django.utils.timezone import now
19+
from django.core.mail import send_mail, EmailMessage
20+
from django.core.serializers import serialize
21+
from django.contrib.gis.geos import GEOSGeometry, GeometryCollection
22+
from django.utils import timezone
23+
from publications.models import Publication, HarvestingEvent, Source
24+
from .models import EmailLog, Subscription
1725
from django.contrib.auth import get_user_model
1826
User = get_user_model()
19-
from .models import EmailLog, Subscription
20-
from datetime import datetime, timedelta
2127
from django.urls import reverse
2228
from urllib.parse import quote
23-
from datetime import datetime
24-
from django_q.tasks import schedule
25-
from django.utils import timezone
2629
from django_q.tasks import schedule
2730
from django_q.models import Schedule
28-
import time
29-
import calendar
30-
import re
31-
from django.contrib.gis.geos import GeometryCollection
3231

3332
BASE_URL = settings.BASE_URL
33+
DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
3434

3535
def extract_geometry_from_html(content):
3636
for tag in content.find_all("meta"):
@@ -211,7 +211,7 @@ def harvest_oai_endpoint(source_id, user=None):
211211

212212
def send_monthly_email(trigger_source='manual', sent_by=None):
213213
recipients = User.objects.filter(userprofile__notify_new_manuscripts=True).values_list('email', flat=True)
214-
last_month = now().replace(day=1) - timedelta(days=1)
214+
last_month = timezone.now().replace(day=1) - timedelta(days=1)
215215
new_manuscripts = Publication.objects.filter(creationDate__month=last_month.month)
216216

217217
if not recipients.exists() or not new_manuscripts.exists():
@@ -319,4 +319,46 @@ def schedule_subscription_email_task(sent_by=None):
319319
kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
320320
)
321321
logger.info(f"Scheduled 'send_subscription_based_email' for {next_run_date}")
322+
323+
def regenerate_geojson_cache():
    """Write all publications with status ``"p"`` to a cached GeoJSON file.

    The file ``geojson_cache.json`` is (re)created inside an
    ``optimap_cache`` directory below the system temp directory, and a
    gzip-compressed copy is written next to it as ``geojson_cache.json.gz``.

    Returns:
        str: Path of the uncompressed GeoJSON file.
    """
    import shutil  # function-scope import: only needed for the gzip copy

    cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
    os.makedirs(cache_dir, exist_ok=True)

    json_path = os.path.join(cache_dir, 'geojson_cache.json')
    # Explicit UTF-8: without it the platform default encoding is used, which
    # can fail or produce a non-UTF-8 file when metadata contains non-ASCII
    # characters.
    with open(json_path, 'w', encoding='utf-8') as f:
        serialize(
            'geojson',
            Publication.objects.filter(status="p"),
            geometry_field='geometry',
            srid=4326,
            stream=f,
        )

    gzip_path = json_path + '.gz'
    with open(json_path, 'rb') as fin, gzip.open(gzip_path, 'wb') as fout:
        # Copy in fixed-size chunks; iterating by line would pull a
        # single-line JSON document into memory in one piece.
        shutil.copyfileobj(fin, fout)

    size = os.path.getsize(json_path)
    logger.info("Cached GeoJSON at %s (%d bytes), gzipped at %s", json_path, size, gzip_path)
    return json_path
344+
345+
def convert_geojson_to_geopackage(geojson_path):
    """Convert a GeoJSON file into a GeoPackage using the ``ogr2ogr`` CLI.

    The output is written as ``publications.gpkg`` in the same directory as
    the input file.

    Args:
        geojson_path: Path of the GeoJSON file to convert.

    Returns:
        The path of the GeoPackage on success (so callers can stream or
        inspect it), or ``None`` when ``ogr2ogr`` exits with a non-zero
        status.
    """
    cache_dir = os.path.dirname(geojson_path)
    gpkg_path = os.path.join(cache_dir, 'publications.gpkg')

    # ``-overwrite`` lets the conversion be re-run against an existing
    # GeoPackage; without it ogr2ogr refuses to recreate a layer that is
    # already present in the output file.
    command = ["ogr2ogr", "-f", "GPKG", "-overwrite", gpkg_path, geojson_path]
    try:
        output = subprocess.check_output(
            command,
            stderr=subprocess.STDOUT,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        # Report the failure instead of dropping it silently; stderr was
        # merged into stdout above, so ``e.output`` holds the tool's message.
        logger.error(
            "ogr2ogr failed with exit code %s:\n%s", e.returncode, e.output
        )
        return None

    logger.info("ogr2ogr output:\n%s", output)
    return gpkg_path
360+
322361

362+
def regenerate_geopackage_cache():
    """Rebuild the GeoJSON cache, then derive the GeoPackage from it.

    Returns:
        Whatever ``convert_geojson_to_geopackage`` returns for the freshly
        written GeoJSON file.
    """
    return convert_geojson_to_geopackage(regenerate_geojson_cache())

publications/templates/data.html

Lines changed: 64 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,77 @@
11
{% extends "main.html" %}
2+
{% load optimap_extras humanize %}
3+
{% block title %}Data & API | {% endblock %}
24

3-
{% load optimap_extras %}
5+
{% block content %}
6+
<div class="row justify-content-center">
7+
<div class="col-md-6 py-5">
48

5-
{% block title %}API | {% endblock %}
9+
<h1 class="mb-4">OPTIMAP Data &amp; API Access</h1>
10+
<p class="lead">
11+
All publication metadata published in OPTIMAP is licensed under a Creative Commons Zero
12+
(<a href="https://creativecommons.org/publicdomain/zero/1.0/" target="_blank">CC-0</a>) license.
13+
</p>
614

7-
{% block content %}
15+
<h2 class="py-2">API Endpoint</h2>
16+
<p>
17+
The API endpoint is <b>{{ site|addstr:"/api"|urlize }}</b>. Visit in your browser for
18+
an interactive interface.
19+
</p>
820

9-
<div class="row justify-content-center">
10-
<div class="col-4 py-5 text-wrap">
11-
<h1 class="py-2">OPTIMAP data access</h1>
21+
<p>
22+
Query all publications via:
23+
<pre class="bg-light p-2">
24+
curl -X GET {{ site|addstr:"/api" }}/api/optimap/ | jq
25+
</pre>
26+
</p>
1227

13-
<p class="lead">All publication metadata published in OPTIMAP is licensed under a Create Commons Zero (<a href='https://creativecommons.org/publicdomain/zero/1.0/'>CC-0</a>) license.</p>
28+
<h2 class="py-2">OpenAPI Schema</h2>
29+
<p>
30+
Download the OpenAPI spec at <b>{{ site|addstr:"/api/schema"|urlize }}</b>.
31+
</p>
1432

15-
<h2 class="py-2">API endpoint</h2>
16-
<p>The API endpoint is <b>{{ site|addstr:"/api"|urlize }}</b>. Visit the URL in your browser to get an interactive interface for exploring the API.</p>
33+
<h2 class="py-2">OpenAPI UI</h2>
34+
<p>
35+
Explore interactively at <b>{{ site|addstr:"/api/schema/ui"|urlize }}</b>.
36+
</p>
1737

18-
<p>You can query all publications with the following request (using <a href="https://stedolan.github.io/jq/" title="Link to jq project website"><code>jq</code></a> for formatting of the response):</p>
19-
<pre>
20-
curl -X GET {{ site|addstr:"/api" }}/api/publications/ | jq
21-
</pre>
38+
<hr>
2239

23-
<h2 class="py-2">OpenAPI schema</h2>
24-
<p>You can download an OpenAPI specification of the api at <b>{{ site|addstr:"/api/schema"|urlize }}</b>.</p>
40+
<h2 class="py-2">Download Publication Data</h2>
41+
<ul class="list-unstyled mb-4">
42+
{% if geojson_size %}
43+
<li class="mb-3">
44+
<div class="d-flex align-items-center">
45+
<a class="btn btn-primary btn-sm" href="{% url 'optimap:download_geojson' %}">
46+
Download GeoJSON
47+
</a>
48+
(<a href="https://geojson.org/" target="_blank" class="ms-2 small">GeoJSON spec</a>)
49+
</div>
50+
<div class="small text-muted mt-1">
51+
File size: {{ geojson_size }}
52+
</div>
53+
</li>
54+
{% endif %}
2555

26-
<h2 class="py-2">OpenAPI user interface</h2>
27-
<p>You can explore the API with an interactive user intreface built based on the OpenAPI schema at <b>{{ site|addstr:"/api/schema/ui"|urlize }}</b>.</p>
56+
{% if geopackage_size %}
57+
<li>
58+
<div class="d-flex align-items-center">
59+
<a class="btn btn-primary btn-sm" href="{% url 'optimap:download_geopackage' %}">
60+
Download GeoPackage
61+
</a>
62+
(<a href="https://www.geopackage.org/" target="_blank" class="ms-2 small">GeoPackage spec</a>)
63+
</div>
64+
<div class="small text-muted mt-1">
65+
File size: {{ geopackage_size }}
66+
</div>
67+
</li>
68+
{% endif %}
69+
</ul>
70+
<p class="small text-muted text-center mb-0">
71+
Data dumps run every {{ interval }} hour{{ interval|pluralize }}.<br>
72+
Last updated: {{ last_updated|naturaltime }}
73+
</p>
2874

2975
</div>
3076
</div>
31-
32-
{% endblock %}
77+
{% endblock %}

0 commit comments

Comments
 (0)