Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ace5ee0
Changes for testing warnings with js
BharatVe Mar 24, 2025
e987387
Update APi to API&Data
BharatVe Mar 26, 2025
7c145c0
Merge branch 'main' into enhancement/Download_all_geometries_and_meta…
nuest Mar 31, 2025
ba487d4
Addition of GeoPackage + Dynamic Size Calculation ( GeoPackage needs …
Apr 2, 2025
af50835
Updated implementation for Geopackage download
Apr 7, 2025
83cc473
Updated test file
Apr 9, 2025
699e78b
Merge remote-tracking branch 'origin/main' into enhancement/Download_…
Apr 9, 2025
1cc2193
Merge remote-tracking branch 'origin/main' into enhancement/Download_…
Apr 9, 2025
b21233d
update test( with pygdal)
Apr 9, 2025
823ad18
Update test_geo_data.py
BharatVe Apr 9, 2025
bcec9a1
Update requirements.txt
BharatVe Apr 9, 2025
aca2962
updated views.py, requirements.txt using fiona and shapely (vs osgeo)
Apr 10, 2025
7869a82
Changes for updated pull request. (Work in progress)
Apr 20, 2025
17965b0
Update tasks.py, minor updates
BharatVe Apr 20, 2025
988b5e1
Completed implementation with recommended changes (final check needed)
Apr 22, 2025
c4cc194
Minor corrections tasks.py
BharatVe Apr 23, 2025
f135ef3
updated test
Apr 23, 2025
052c42f
Update data.html
BharatVe Apr 23, 2025
acfe536
Updated data message
BharatVe Apr 23, 2025
30cb5f1
now to timezone (Fix unittest issue)
Apr 23, 2025
9d24d63
add logos and colours to README, closes #33
nuest Apr 9, 2025
fc02e9b
Updated scripts - changed time formats, modified test, added humanize time
Apr 28, 2025
daed800
Merge branch 'main' into enhancement/Download_all_geometries_and_meta…
BharatVe Apr 28, 2025
a9f7a8d
fixed tests, removed fiona and updated requirements.txt
Apr 28, 2025
939e0b8
install GDAL package from PyPI
nuest Apr 29, 2025
e7d9701
fix test
nuest Apr 29, 2025
a2f829f
Updated links, changed URLs, corrected footer, added automated cache …
May 5, 2025
479e10e
Updated apps and tests
May 6, 2025
0e4b16b
Update apps.py
BharatVe May 6, 2025
6c072ed
Use Humanize, added checks for link validity.
May 12, 2025
100dc2a
added humanize
May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 61 additions & 96 deletions publications/tasks.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,58 @@
import logging
logger = logging.getLogger(__name__)

from django_q.models import Schedule
from publications.models import Publication, HarvestingEvent, Source
from bs4 import BeautifulSoup
import os
import json
import subprocess
import gzip
import re
import tempfile
import time
import calendar
from datetime import datetime, timedelta
import xml.dom.minidom
from django.contrib.gis.geos import GEOSGeometry

import requests
from django.core.mail import send_mail, EmailMessage
from django.utils import timezone
from requests.auth import HTTPBasicAuth
import os
from django.conf import settings
from django.utils.timezone import now
from django.contrib.auth import get_user_model
User = get_user_model()
from .models import EmailLog, Subscription
from datetime import datetime, timedelta
from django.urls import reverse
from bs4 import BeautifulSoup
from urllib.parse import quote
from datetime import datetime
from django_q.tasks import schedule
from django.utils import timezone

from django.conf import settings
from django.core.mail import send_mail, EmailMessage
from django.core.serializers import serialize
from django.contrib.gis.geos import GEOSGeometry
from django.utils import timezone
from django_q.tasks import schedule
from django_q.models import Schedule
import time
import calendar
import re
import subprocess
import gzip
import os

from publications.models import Publication, HarvestingEvent, Source
from .models import EmailLog, Subscription
from django.contrib.auth import get_user_model
User = get_user_model()
from django.urls import reverse
from django.utils.timezone import now
BASE_URL = settings.BASE_URL

DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)


def extract_geometry_from_html(content):
    """Extract a geometry from a page's ``DC.SpatialCoverage`` meta tag.

    Scans every ``<meta>`` tag in *content* (a parsed BeautifulSoup document)
    for ``name="DC.SpatialCoverage"``. The tag's ``content`` attribute is
    expected to hold a GeoJSON FeatureCollection string; the first feature's
    geometry is wrapped in a GeometryCollection and returned as a
    ``GEOSGeometry``.

    Returns the geometry of the first tag that parses successfully, or
    ``None`` when no tag matches or every candidate payload is unusable.
    """
    for tag in content.find_all("meta"):
        if tag.get("name") != "DC.SpatialCoverage":
            continue
        data = tag.get("content")
        try:
            geom = json.loads(data)
            geom_data = geom["features"][0]["geometry"]
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass
            logger.error("Error loading JSON from %s: %s", tag.get("name"), e)
            continue
        except (KeyError, IndexError, TypeError) as e:
            # Valid JSON but not a FeatureCollection with at least one
            # feature — previously this escaped the function uncaught.
            logger.error("Unexpected GeoJSON structure in %s: %s", tag.get("name"), e)
            continue
        geom_data_string = json.dumps(
            {'type': 'GeometryCollection', "geometries": [geom_data]}
        )
        try:
            geom_object = GEOSGeometry(geom_data_string)
        except Exception as e:
            # Keep scanning remaining tags, matching the original behavior.
            logger.error("Cannot create geometry from string '%s': %s", geom_data_string, e)
            continue
        logger.debug('Found geometry: %s', geom_object)
        return geom_object
    return None

def extract_geometry_from_html(content):
for tag in content.find_all("meta"):
if tag.get("name", None) == "DC.SpatialCoverage":
Expand All @@ -48,10 +67,7 @@ def extract_geometry_from_html(content):
geom_data_string = json.dumps(type_geom)
try:
geom_object = GEOSGeometry(geom_data_string)
logger.debug('Found geometry: %s', geom_object)
return geom_object
except Exception as e:
logger.error("Cannot create geometry from string '%s': %s", geom_data_string, e)
@@ -55,31 +53,27 @@ def extract_geometry_from_html(content):
except ValueError as e:
logger.error("Error loading JSON from %s: %s", tag.get("name"), e)

Expand Down Expand Up @@ -83,38 +99,7 @@ def parse_oai_xml_and_save_publications(content, event):
for record in records:
try:
def get_text(tag_name):
nodes = record.getElementsByTagName(tag_name)
return nodes[0].firstChild.nodeValue.strip() if nodes and nodes[0].firstChild else None

identifier_value = get_text("dc:identifier")
title_value = get_text("dc:title")
abstract_text = get_text("dc:description")
journal_value = get_text("dc:publisher")
date_value = get_text("dc:date")

doi_text = None
doi_nodes = record.getElementsByTagName("dc:identifier")
for node in doi_nodes:
if node.firstChild and node.firstChild.nodeValue:
candidate = node.firstChild.nodeValue.strip()
match = DOI_REGEX.search(candidate)
if match:
doi_text = match.group(0)
break

if not identifier_value or not identifier_value.startswith("http"):
logger.warning("Skipping record with invalid URL: %s", identifier_value)
continue

if doi_text and doi_text in existing_dois:
logger.info("Skipping duplicate publication (DOI): %s", doi_text)
continue

if identifier_value in existing_urls:
logger.info("Skipping duplicate publication (URL): %s", identifier_value)
continue

existing_urls.add(identifier_value)
@@ -118,9 +112,6 @@ def get_text(tag_name):
if doi_text:
existing_dois.add(doi_text)

Expand All @@ -125,7 +110,6 @@ def get_text(tag_name):
soup = BeautifulSoup(response.content, "html.parser")
geom_object = extract_geometry_from_html(soup)
period_start, period_end = extract_timeperiod_from_html(soup)

publication = Publication(
title=title_value,
abstract=abstract_text,
Expand Down Expand Up @@ -181,9 +165,7 @@ def send_monthly_email(trigger_source='manual', sent_by=None):
for recipient in recipients:
try:
send_mail(
subject,
content,
settings.EMAIL_HOST_USER,
@@ -187,140 +173,108 @@ def send_monthly_email(trigger_source='manual', sent_by=None):
[recipient],
fail_silently=False,
)
Expand Down Expand Up @@ -221,14 +203,12 @@ def send_subscription_based_email(trigger_source='manual', sent_by=None, user_id
unsubscribe_all = f"{BASE_URL}{reverse('optimap:unsubscribe')}?all=true"

subject = f"📚 New Manuscripts Matching '{subscription.search_term}'"

bullet_list = "\n".join([f"- {pub.title}" for pub in new_publications])

content = f"""Dear {subscription.user.username},
Here are the latest manuscripts matching your subscription:

{bullet_list}

Manage your subscriptions:
Unsubscribe from '{subscription.search_term}': {unsubscribe_specific}
Unsubscribe from All: {unsubscribe_all}
Expand Down Expand Up @@ -276,51 +256,39 @@ def schedule_subscription_email_task(sent_by=None):
kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
)
logger.info(f"Scheduled 'send_subscription_based_email' for {next_run_date}")


# ------------------------------
# New GeoJSON/GeoPackage Cache Functions
# ------------------------------

def regenerate_geojson_cache():
Comment thread
BharatVe marked this conversation as resolved.
"""
Serializes all Publication objects into a GeoJSON FeatureCollection,
writes it to a file, and creates a gzipped version.
"""
from django.core.serializers import serialize
features = []
geojson_str = serialize('geojson', Publication.objects.all(), geometry_field='geometry')
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
os.makedirs(cache_dir, exist_ok=True)
geojson_str = serialize(
'geojson',
Publication.objects.filter(status='p'),
geometry_field='geometry'
)
try:
Comment thread
BharatVe marked this conversation as resolved.
Outdated
geojson_obj = json.loads(geojson_str)
features = geojson_obj.get("features", [])
except Exception as e:
logger.error("Error parsing GeoJSON: %s", e)

features = []
full_collection = {
"type": "FeatureCollection",
"crs": {"type": "name", "properties": {"name": "EPSG:4326"}},
"features": features
}

cache_dir = os.path.join(os.path.dirname(__file__), 'cache')
os.makedirs(cache_dir, exist_ok=True)

json_path = os.path.join(cache_dir, 'geojson_cache.json')
with open(json_path, 'w') as f:
Comment thread
BharatVe marked this conversation as resolved.
json.dump(full_collection, f)

gzip_path = os.path.join(cache_dir, 'geojson_cache.json.gz')
with gzip.open(gzip_path, 'wt') as f:
Comment thread
BharatVe marked this conversation as resolved.
Outdated
json.dump(full_collection, f)

logger.info("GeoJSON cache regenerated successfully.")
json_size = os.path.getsize(json_path)
logger.info("GeoJSON cache regenerated at %s (size: %d bytes); gzipped at %s", json_path, json_size, gzip_path)
return json_path


def convert_geojson_to_geopackage(geojson_path):
"""
Converts the GeoJSON file at geojson_path to a GeoPackage using ogr2ogr.
"""
cache_dir = os.path.join(os.path.dirname(__file__), 'cache')
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
os.makedirs(cache_dir, exist_ok=True)
geopackage_path = os.path.join(cache_dir, 'publications.gpkg')
cmd = ["ogr2ogr", "-f", "GPKG", geopackage_path, geojson_path]
try:
Expand All @@ -331,11 +299,8 @@ def convert_geojson_to_geopackage(geojson_path):
geopackage_path = None
return geopackage_path


def regenerate_geopackage_cache():
    """Rebuild the cached GeoJSON dump and derive a GeoPackage from it.

    Intended to run on a schedule via Django Q. Returns the path of the
    generated GeoPackage file (``None`` when the conversion fails).
    """
    geojson_file = regenerate_geojson_cache()
    return convert_geojson_to_geopackage(geojson_file)
Comment thread
BharatVe marked this conversation as resolved.
Outdated
10 changes: 5 additions & 5 deletions publications/templates/data.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ <h2 class="py-2">API Endpoint</h2>

<p>
You can query all publications with the following request (using
<a href="https://stedolan.github.io/jq/" title="Link to jq project website"><code>jq</code></a> for formatting):
<a href="https://stedolan.github.io/jq/" title="Link to jq project website" target="_blank"><code>jq</code></a> for formatting):
</p>
<pre>
curl -X GET {{ site|addstr:"/api" }}/api/publications/ | jq
curl -X GET {{ site|addstr:"/api" }}/api/publications/ | jq
</pre>

<h2 class="py-2">OpenAPI Schema</h2>
Expand All @@ -39,21 +39,21 @@ <h2 class="py-2">OpenAPI User Interface</h2>

<hr>

<!-- New Section: File Downloads -->
<h2 class="py-2">Download Publication Data</h2>
<p>
Choose your desired file format.
Comment thread
nuest marked this conversation as resolved.
Outdated
</p>
<ul>
<li>
<a class="btn btn-primary" href="{% url 'publications:download_geojson' %}">Download GeoJSON</a>
Comment thread
BharatVe marked this conversation as resolved.
Outdated
({{ geojson_size }})
(<a href="https://geojson.org/" target="_blank">GeoJSON spec</a>) ({{ geojson_size }})
</li>
<li>
<a class="btn btn-primary" href="{% url 'publications:download_geopackage' %}">Download GeoPackage</a>
Comment thread
nuest marked this conversation as resolved.
Outdated
({{ geopackage_size }})
(<a href="https://www.geopackage.org/" target="_blank">GeoPackage spec</a>) ({{ geopackage_size }})
</li>
</ul>
<p class="small text-muted">Data dumps are recreated each night. Last updated: {{ last_updated }}</p>
</div>
</div>
{% endblock %}
2 changes: 1 addition & 1 deletion publications/templates/footer.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<span class="px-3">&copy;&nbsp;2023&nbsp;<a class="text-white" title="OPTIMETA project website" href="https://projects.tib.eu/optimeta">OPTIMETA project</a> &amp; <a class="text-white" title="KOMET project website" href="https://projects.tib.eu/komet">KOMET project</a></span>
<a class="px-3 text-white" title="Link to source code project" href="https://github.com/GeoinformationSystems/optimap">Code</a>
<a class="px-3 text-white" title="Privace information / Imprint" href="{% url 'optimap:privacy' %}">Privacy / Imprint / Contact</a>
<a class="px-3 text-white" title="API browser" href="{% url 'optimap:data' %}">API & Data</a>
<a class="px-3 text-white" title="Data & API browser" href="{% url 'optimap:data_and_api' %}">API & Data</a>
<span class="px-3">Publication data license: <a class="text-white" title="Publication metadata license" href='https://creativecommons.org/publicdomain/zero/1.0/'>CC-0</a></span>
</p>
</div>
Expand Down
2 changes: 1 addition & 1 deletion publications/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
path("api/v1/", include("publications.api")),
path('api/schema/', SpectacularAPIView.as_view(), name='schema'),
path('api/schema/ui/sitemap', SpectacularRedocView.as_view(url_name='optimap:schema'), name='redoc'),
path("data/", views.data, name="data"),
path("data/", views.data, name="data_and_api"),
path('feed/georss/', GeoFeed(feed_type_variant="georss"), name='georss_feed'),
path('feed/geoatom/', GeoFeed(feed_type_variant="geoatom"), name='geoatom_feed'),
path('feed/w3cgeo/', GeoFeed(feed_type_variant="w3cgeo"), name='w3cgeo_feed'),
Expand Down
Loading
Loading