Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ace5ee0
Changes for testing warnings with js
BharatVe Mar 24, 2025
e987387
Update APi to API&Data
BharatVe Mar 26, 2025
7c145c0
Merge branch 'main' into enhancement/Download_all_geometries_and_meta…
nuest Mar 31, 2025
ba487d4
Addition of GeoPackage + Dynamic Size Calculation ( GeoPackage needs …
Apr 2, 2025
af50835
Updated implementation for Geopackage download
Apr 7, 2025
83cc473
Updated test file
Apr 9, 2025
699e78b
Merge remote-tracking branch 'origin/main' into enhancement/Download_…
Apr 9, 2025
1cc2193
Merge remote-tracking branch 'origin/main' into enhancement/Download_…
Apr 9, 2025
b21233d
update test( with pygdal)
Apr 9, 2025
823ad18
Update test_geo_data.py
BharatVe Apr 9, 2025
bcec9a1
Update requirements.txt
BharatVe Apr 9, 2025
aca2962
updated views.py, requirements.txt using fiona and shapely (vs osgeo)
Apr 10, 2025
7869a82
Changes for updated pull request. (Work in progress)
Apr 20, 2025
17965b0
Update tasks.py, minor updates
BharatVe Apr 20, 2025
988b5e1
Completed implementation with recommended changes (final check needed)
Apr 22, 2025
c4cc194
Minor corrections tasks.py
BharatVe Apr 23, 2025
f135ef3
updated test
Apr 23, 2025
052c42f
Update data.html
BharatVe Apr 23, 2025
acfe536
Updated data message
BharatVe Apr 23, 2025
30cb5f1
now to timezone (Fix unittest issue)
Apr 23, 2025
9d24d63
add logos and colours to README, closes #33
nuest Apr 9, 2025
fc02e9b
Updated scripts - changed time formats, modified test, added humanize time
Apr 28, 2025
daed800
Merge branch 'main' into enhancement/Download_all_geometries_and_meta…
BharatVe Apr 28, 2025
a9f7a8d
fixed tests, removed fiona and updated requirements.txt
Apr 28, 2025
939e0b8
install GDAL package from PyPI
nuest Apr 29, 2025
e7d9701
fix test
nuest Apr 29, 2025
a2f829f
Updated links, changed URLs, corrected footer, added automated cache …
May 5, 2025
479e10e
Updated apps and tests
May 6, 2025
0e4b16b
Update apps.py
BharatVe May 6, 2025
6c072ed
Use Humanize, added checks for link validity.
May 12, 2025
100dc2a
added humanize
May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 61 additions & 96 deletions publications/tasks.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,58 @@
import logging
logger = logging.getLogger(__name__)

from django_q.models import Schedule
from publications.models import Publication, HarvestingEvent, Source
from bs4 import BeautifulSoup
import os
import json
import subprocess
import gzip
import re
import tempfile
import time
import calendar
from datetime import datetime, timedelta
import xml.dom.minidom
from django.contrib.gis.geos import GEOSGeometry

import requests
from django.core.mail import send_mail, EmailMessage
from django.utils import timezone
from requests.auth import HTTPBasicAuth
import os
from django.conf import settings
from django.utils.timezone import now
from django.contrib.auth import get_user_model
User = get_user_model()
from .models import EmailLog, Subscription
from datetime import datetime, timedelta
from django.urls import reverse
from bs4 import BeautifulSoup
from urllib.parse import quote
from datetime import datetime
from django_q.tasks import schedule
from django.utils import timezone

from django.conf import settings
from django.core.mail import send_mail, EmailMessage
from django.core.serializers import serialize
from django.contrib.gis.geos import GEOSGeometry
from django.utils import timezone
from django_q.tasks import schedule
from django_q.models import Schedule
import time
import calendar
import re
import subprocess
import gzip
import os

from publications.models import Publication, HarvestingEvent, Source
from .models import EmailLog, Subscription
from django.contrib.auth import get_user_model
User = get_user_model()
from django.urls import reverse
from django.utils.timezone import now
BASE_URL = settings.BASE_URL

DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)


def extract_geometry_from_html(content):
    """Extract a geometry from a page's ``DC.SpatialCoverage`` meta tag.

    Scans every ``<meta>`` tag in *content* (a parsed BeautifulSoup document)
    for ``name="DC.SpatialCoverage"``. The tag's ``content`` attribute is
    expected to hold a GeoJSON FeatureCollection string; the first feature's
    geometry is wrapped in a GeometryCollection and returned as a
    ``GEOSGeometry``.

    Returns the geometry of the first tag that parses successfully, or
    ``None`` when no tag matches or every candidate payload is unusable.
    """
    for tag in content.find_all("meta"):
        if tag.get("name") != "DC.SpatialCoverage":
            continue
        data = tag.get("content")
        try:
            geom = json.loads(data)
            geom_data = geom["features"][0]["geometry"]
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass
            logger.error("Error loading JSON from %s: %s", tag.get("name"), e)
            continue
        except (KeyError, IndexError, TypeError) as e:
            # Valid JSON but not a FeatureCollection with at least one
            # feature — previously this escaped the function uncaught.
            logger.error("Unexpected GeoJSON structure in %s: %s", tag.get("name"), e)
            continue
        geom_data_string = json.dumps(
            {'type': 'GeometryCollection', "geometries": [geom_data]}
        )
        try:
            geom_object = GEOSGeometry(geom_data_string)
        except Exception as e:
            # Keep scanning remaining tags, matching the original behavior.
            logger.error("Cannot create geometry from string '%s': %s", geom_data_string, e)
            continue
        logger.debug('Found geometry: %s', geom_object)
        return geom_object
    return None

def extract_geometry_from_html(content):
for tag in content.find_all("meta"):
if tag.get("name", None) == "DC.SpatialCoverage":
Expand All @@ -48,10 +67,7 @@ def extract_geometry_from_html(content):
geom_data_string = json.dumps(type_geom)
try:
geom_object = GEOSGeometry(geom_data_string)
logger.debug('Found geometry: %s', geom_object)
return geom_object
except Exception as e:
logger.error("Cannot create geometry from string '%s': %s", geom_data_string, e)
@@ -55,31 +53,27 @@ def extract_geometry_from_html(content):
except ValueError as e:
logger.error("Error loading JSON from %s: %s", tag.get("name"), e)

Expand Down Expand Up @@ -83,38 +99,7 @@ def parse_oai_xml_and_save_publications(content, event):
for record in records:
try:
def get_text(tag_name):
nodes = record.getElementsByTagName(tag_name)
return nodes[0].firstChild.nodeValue.strip() if nodes and nodes[0].firstChild else None

identifier_value = get_text("dc:identifier")
title_value = get_text("dc:title")
abstract_text = get_text("dc:description")
journal_value = get_text("dc:publisher")
date_value = get_text("dc:date")

doi_text = None
doi_nodes = record.getElementsByTagName("dc:identifier")
for node in doi_nodes:
if node.firstChild and node.firstChild.nodeValue:
candidate = node.firstChild.nodeValue.strip()
match = DOI_REGEX.search(candidate)
if match:
doi_text = match.group(0)
break

if not identifier_value or not identifier_value.startswith("http"):
logger.warning("Skipping record with invalid URL: %s", identifier_value)
continue

if doi_text and doi_text in existing_dois:
logger.info("Skipping duplicate publication (DOI): %s", doi_text)
continue

if identifier_value in existing_urls:
logger.info("Skipping duplicate publication (URL): %s", identifier_value)
continue

existing_urls.add(identifier_value)
@@ -118,9 +112,6 @@ def get_text(tag_name):
if doi_text:
existing_dois.add(doi_text)

Expand All @@ -125,7 +110,6 @@ def get_text(tag_name):
soup = BeautifulSoup(response.content, "html.parser")
geom_object = extract_geometry_from_html(soup)
period_start, period_end = extract_timeperiod_from_html(soup)

publication = Publication(
title=title_value,
abstract=abstract_text,
Expand Down Expand Up @@ -181,9 +165,7 @@ def send_monthly_email(trigger_source='manual', sent_by=None):
for recipient in recipients:
try:
send_mail(
subject,
content,
settings.EMAIL_HOST_USER,
@@ -187,140 +173,108 @@ def send_monthly_email(trigger_source='manual', sent_by=None):
[recipient],
fail_silently=False,
)
Expand Down Expand Up @@ -221,14 +203,12 @@ def send_subscription_based_email(trigger_source='manual', sent_by=None, user_id
unsubscribe_all = f"{BASE_URL}{reverse('optimap:unsubscribe')}?all=true"

subject = f"📚 New Manuscripts Matching '{subscription.search_term}'"

bullet_list = "\n".join([f"- {pub.title}" for pub in new_publications])

content = f"""Dear {subscription.user.username},
Here are the latest manuscripts matching your subscription:

{bullet_list}

Manage your subscriptions:
Unsubscribe from '{subscription.search_term}': {unsubscribe_specific}
Unsubscribe from All: {unsubscribe_all}
Expand Down Expand Up @@ -276,51 +256,39 @@ def schedule_subscription_email_task(sent_by=None):
kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
)
logger.info(f"Scheduled 'send_subscription_based_email' for {next_run_date}")


# ------------------------------
# New GeoJSON/GeoPackage Cache Functions
# ------------------------------

def regenerate_geojson_cache():
Comment thread
BharatVe marked this conversation as resolved.
"""
Serializes all Publication objects into a GeoJSON FeatureCollection,
writes it to a file, and creates a gzipped version.
"""
from django.core.serializers import serialize
features = []
geojson_str = serialize('geojson', Publication.objects.all(), geometry_field='geometry')
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
os.makedirs(cache_dir, exist_ok=True)
geojson_str = serialize(
'geojson',
Publication.objects.filter(status='p'),
geometry_field='geometry'
)
try:
Comment thread
BharatVe marked this conversation as resolved.
Outdated
geojson_obj = json.loads(geojson_str)
features = geojson_obj.get("features", [])
except Exception as e:
logger.error("Error parsing GeoJSON: %s", e)

features = []
full_collection = {
"type": "FeatureCollection",
"crs": {"type": "name", "properties": {"name": "EPSG:4326"}},
"features": features
}

cache_dir = os.path.join(os.path.dirname(__file__), 'cache')
os.makedirs(cache_dir, exist_ok=True)

json_path = os.path.join(cache_dir, 'geojson_cache.json')
with open(json_path, 'w') as f:
Comment thread
BharatVe marked this conversation as resolved.
json.dump(full_collection, f)

gzip_path = os.path.join(cache_dir, 'geojson_cache.json.gz')
with gzip.open(gzip_path, 'wt') as f:
Comment thread
BharatVe marked this conversation as resolved.
Outdated
json.dump(full_collection, f)

logger.info("GeoJSON cache regenerated successfully.")
json_size = os.path.getsize(json_path)
logger.info("GeoJSON cache regenerated at %s (size: %d bytes); gzipped at %s", json_path, json_size, gzip_path)
return json_path


def convert_geojson_to_geopackage(geojson_path):
"""
Converts the GeoJSON file at geojson_path to a GeoPackage using ogr2ogr.
"""
cache_dir = os.path.join(os.path.dirname(__file__), 'cache')
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
os.makedirs(cache_dir, exist_ok=True)
geopackage_path = os.path.join(cache_dir, 'publications.gpkg')
cmd = ["ogr2ogr", "-f", "GPKG", geopackage_path, geojson_path]
try:
Expand All @@ -331,11 +299,8 @@ def convert_geojson_to_geopackage(geojson_path):
geopackage_path = None
return geopackage_path


def regenerate_geopackage_cache():
    """Rebuild the cached GeoJSON dump and derive a GeoPackage from it.

    Intended to run on a schedule via Django Q. Returns the path of the
    generated GeoPackage file (``None`` when the conversion fails).
    """
    geojson_file = regenerate_geojson_cache()
    return convert_geojson_to_geopackage(geojson_file)
Comment thread
BharatVe marked this conversation as resolved.
Outdated
10 changes: 5 additions & 5 deletions publications/templates/data.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ <h2 class="py-2">API Endpoint</h2>

<p>
You can query all publications with the following request (using
<a href="https://stedolan.github.io/jq/" title="Link to jq project website"><code>jq</code></a> for formatting):
<a href="https://stedolan.github.io/jq/" title="Link to jq project website" target="_blank"><code>jq</code></a> for formatting):
</p>
<pre>
curl -X GET {{ site|addstr:"/api" }}/api/publications/ | jq
curl -X GET {{ site|addstr:"/api" }}/api/publications/ | jq
</pre>

<h2 class="py-2">OpenAPI Schema</h2>
Expand All @@ -39,21 +39,21 @@ <h2 class="py-2">OpenAPI User Interface</h2>

<hr>

<!-- New Section: File Downloads -->
<h2 class="py-2">Download Publication Data</h2>
<p>
Choose your desired file format.
Comment thread
nuest marked this conversation as resolved.
Outdated
</p>
<ul>
<li>
<a class="btn btn-primary" href="{% url 'publications:download_geojson' %}">Download GeoJSON</a>
Comment thread
BharatVe marked this conversation as resolved.
Outdated
({{ geojson_size }})
(<a href="https://geojson.org/" target="_blank">GeoJSON spec</a>) ({{ geojson_size }})
</li>
<li>
<a class="btn btn-primary" href="{% url 'publications:download_geopackage' %}">Download GeoPackage</a>
Comment thread
nuest marked this conversation as resolved.
Outdated
({{ geopackage_size }})
(<a href="https://www.geopackage.org/" target="_blank">GeoPackage spec</a>) ({{ geopackage_size }})
</li>
</ul>
<p class="small text-muted">Data dumps are recreated each night. Last updated: {{ last_updated }}</p>
</div>
</div>
{% endblock %}
2 changes: 1 addition & 1 deletion publications/templates/footer.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<span class="px-3">&copy;&nbsp;2023&nbsp;<a class="text-white" title="OPTIMETA project website" href="https://projects.tib.eu/optimeta">OPTIMETA project</a> &amp; <a class="text-white" title="KOMET project website" href="https://projects.tib.eu/komet">KOMET project</a></span>
<a class="px-3 text-white" title="Link to source code project" href="https://github.com/GeoinformationSystems/optimap">Code</a>
<a class="px-3 text-white" title="Privace information / Imprint" href="{% url 'optimap:privacy' %}">Privacy / Imprint / Contact</a>
<a class="px-3 text-white" title="API browser" href="{% url 'optimap:data' %}">API & Data</a>
<a class="px-3 text-white" title="Data & API browser" href="{% url 'optimap:data_and_api' %}">API & Data</a>
<span class="px-3">Publication data license: <a class="text-white" title="Publication metadata license" href='https://creativecommons.org/publicdomain/zero/1.0/'>CC-0</a></span>
</p>
</div>
Expand Down
2 changes: 1 addition & 1 deletion publications/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
path("api/v1/", include("publications.api")),
path('api/schema/', SpectacularAPIView.as_view(), name='schema'),
path('api/schema/ui/sitemap', SpectacularRedocView.as_view(url_name='optimap:schema'), name='redoc'),
path("data/", views.data, name="data"),
path("data/", views.data, name="data_and_api"),
path('feed/georss/', GeoFeed(feed_type_variant="georss"), name='georss_feed'),
path('feed/geoatom/', GeoFeed(feed_type_variant="geoatom"), name='geoatom_feed'),
path('feed/w3cgeo/', GeoFeed(feed_type_variant="w3cgeo"), name='w3cgeo_feed'),
Expand Down
Loading
Loading