Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ace5ee0
Changes for testing warnings with js
BharatVe Mar 24, 2025
e987387
Update APi to API&Data
BharatVe Mar 26, 2025
7c145c0
Merge branch 'main' into enhancement/Download_all_geometries_and_meta…
nuest Mar 31, 2025
ba487d4
Addition of GeoPackage + Dynamic Size Calculation ( GeoPackage needs …
Apr 2, 2025
af50835
Updated implementation for GeoPackage download
Apr 7, 2025
83cc473
Updated test file
Apr 9, 2025
699e78b
Merge remote-tracking branch 'origin/main' into enhancement/Download_…
Apr 9, 2025
1cc2193
Merge remote-tracking branch 'origin/main' into enhancement/Download_…
Apr 9, 2025
b21233d
update test( with pygdal)
Apr 9, 2025
823ad18
Update test_geo_data.py
BharatVe Apr 9, 2025
bcec9a1
Update requirements.txt
BharatVe Apr 9, 2025
aca2962
updated views.py, requirements.txt using fiona and shapely (vs osgeo)
Apr 10, 2025
7869a82
Changes for updated pull request. (Work in progress)
Apr 20, 2025
17965b0
Update tasks.py, minor updates
BharatVe Apr 20, 2025
988b5e1
Completed implementation with recommended changes (final check needed)
Apr 22, 2025
c4cc194
Minor corrections tasks.py
BharatVe Apr 23, 2025
f135ef3
updated test
Apr 23, 2025
052c42f
Update data.html
BharatVe Apr 23, 2025
acfe536
Updated data message
BharatVe Apr 23, 2025
30cb5f1
now to timezone (Fix unittest issue)
Apr 23, 2025
9d24d63
add logos and colours to README, closes #33
nuest Apr 9, 2025
fc02e9b
Updated scripts - changed time formats, modified tests, added humanize time
Apr 28, 2025
daed800
Merge branch 'main' into enhancement/Download_all_geometries_and_meta…
BharatVe Apr 28, 2025
a9f7a8d
fixed tests, removed fiona and updated requirements.txt
Apr 28, 2025
939e0b8
install GDAL package from PyPI
nuest Apr 29, 2025
e7d9701
fix test
nuest Apr 29, 2025
a2f829f
Updated links, changed URLs, corrected footer, added automated cache …
May 5, 2025
479e10e
Updated apps and tests
May 6, 2025
0e4b16b
Update apps.py
BharatVe May 6, 2025
6c072ed
Use Humanize, added checks for link validity.
May 12, 2025
100dc2a
added humanize
May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions publications/management/commands/schedule_geojson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from django.core.management.base import BaseCommand
from django_q.tasks import schedule
from django_q.models import Schedule

class Command(BaseCommand):
    """Register a recurring django-q schedule for GeoJSON cache regeneration.

    Idempotent: if a schedule for the task function already exists, no new
    one is created and a notice is printed instead.
    """

    help = "Schedule the GeoJSON regeneration task every 6 hours."

    def handle(self, *args, **options):
        task_path = 'publications.tasks.regenerate_geojson_cache'

        # Guard clause: bail out early if the task is already registered,
        # so repeated invocations never create duplicate schedules.
        if Schedule.objects.filter(func=task_path).exists():
            self.stdout.write("GeoJSON regeneration already scheduled.")
            return

        # Interval schedule ('I'): run every 360 minutes (6 hours),
        # repeating indefinitely (repeats=-1).
        schedule(
            task_path,
            schedule_type='I',
            minutes=360,
            repeats=-1,
        )
        self.stdout.write(self.style.SUCCESS("Scheduled GeoJSON regeneration every 6h."))
206 changes: 117 additions & 89 deletions publications/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,69 +11,74 @@
import calendar
from datetime import datetime, timedelta
import xml.dom.minidom

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth
from urllib.parse import quote

from django.conf import settings
from django.core.mail import send_mail, EmailMessage
from django.core.serializers import serialize
from django.contrib.gis.geos import GEOSGeometry
from django.utils import timezone
from django_q.tasks import schedule
from django_q.models import Schedule

from publications.models import Publication, HarvestingEvent, Source
from .models import EmailLog, Subscription
from django.contrib.auth import get_user_model
User = get_user_model()
from django.urls import reverse
from django.utils.timezone import now

BASE_URL = settings.BASE_URL

DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)


def extract_geometry_from_html(content):
for tag in content.find_all("meta"):
if tag.get("name") == "DC.SpatialCoverage":
data = tag.get("content")
if tag.get("name", None) == "DC.SpatialCoverage":
data = tag.get("content", None)
try:
geom = json.loads(data)
geom_data = geom["features"][0]["geometry"]
type_geom = {'type': 'GeometryCollection', "geometries": [geom_data]}
geom_data_string = json.dumps(type_geom)
try:
geom_object = GEOSGeometry(geom_data_string)
logger.debug('Found geometry: %s', geom_object)
# preparing geometry data in accordance to geos API fields
type_geom= {'type': 'GeometryCollection'}
geom_content = {"geometries" : [geom_data]}
type_geom.update(geom_content)
geom_data_string= json.dumps(type_geom)
try :
geom_object = GEOSGeometry(geom_data_string) # GeometryCollection object
logging.debug('Found geometry: %s', geom_object)
return geom_object
except Exception as e:
logger.error("Cannot create geometry from string '%s': %s", geom_data_string, e)
except ValueError as e:
logger.error("Error loading JSON from %s: %s", tag.get("name"), e)


def extract_timeperiod_from_html(content):
period = [None, None]
for tag in content.find_all("meta"):
if tag.get("name") in ['DC.temporal', 'DC.PeriodOfTime']:
data = tag.get("content")
period = data.split("/")
logger.debug('Found time period: %s', period)
break
if tag.get("name", None) in ['DC.temporal', 'DC.PeriodOfTime']:
data = tag.get("content", None)
period = data.split("/")
logging.debug('Found time period: %s', period)
break;
# returning arrays for array field in DB
return [period[0]], [period[1]]

DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)

def parse_oai_xml_and_save_publications(content, event):

DOMTree = xml.dom.minidom.parseString(content)
collection = DOMTree.documentElement

records = collection.getElementsByTagName("record")

if not records:
logger.warning("No articles found in OAI-PMH response!")
return

existing_urls = set(Publication.objects.values_list('url', flat=True))
existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))
existing_dois = set(Publication.objects.exclude(doi__isnull=True).values_list('doi', flat=True))
for record in records:
try:
def get_text(tag_name):
Expand Down Expand Up @@ -112,6 +117,9 @@ def get_text(tag_name):
if doi_text:
existing_dois.add(doi_text)

geom_object = None
period_start = []
period_end = []
with requests.get(identifier_value) as response:
soup = BeautifulSoup(response.content, "html.parser")
geom_object = extract_geometry_from_html(soup)
Expand All @@ -129,41 +137,46 @@ def get_text(tag_name):
timeperiod_enddate=period_end
)
publication.save()
print("Saved new publication: %s" % identifier_value)
print("Saved new publication: %s", identifier_value)

except Exception as e:
print("Error parsing record: %s" % str(e))
print("Error parsing record: %s", str(e))
continue


def harvest_oai_endpoint(source_id):
source = Source.objects.get(id=source_id)
event = HarvestingEvent.objects.create(source=source, status="in_progress")

username = os.getenv("OPTIMAP_OAI_USERNAME")
password = os.getenv("OPTIMAP_OAI_PASSWORD")

try:
with requests.Session() as session:
response = session.get(source.url_field, auth=requests.auth.HTTPBasicAuth(username, password))
response.raise_for_status()
response = session.get(source.url_field, auth=HTTPBasicAuth(username, password))
response.raise_for_status()
parse_oai_xml_and_save_publications(response.content, event)

event.status = "completed"
event.completed_at = timezone.now()
event.save()
print("Harvesting completed for", source.url_field)

except requests.exceptions.RequestException as e:
print("Error harvesting from", source.url_field, ":", e)
event.status = "failed"
event.log = str(e)
event.save()


def send_monthly_email(trigger_source='manual', sent_by=None):
recipients = User.objects.filter(userprofile__notify_new_manuscripts=True).values_list('email', flat=True)
last_month = now().replace(day=1) - timedelta(days=1)
new_manuscripts = Publication.objects.filter(creationDate__month=last_month.month)

if not recipients.exists() or not new_manuscripts.exists():
return

subject = "📚 New Manuscripts This Month"
content = "Here are the new manuscripts:\n" + "\n".join([pub.title for pub in new_manuscripts])

for recipient in recipients:
try:
send_mail(
Expand All @@ -173,120 +186,135 @@ def send_monthly_email(trigger_source='manual', sent_by=None):
[recipient],
fail_silently=False,
)
EmailLog.log_email(recipient, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="success")
time.sleep(settings.EMAIL_SEND_DELAY)

EmailLog.log_email(
recipient, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="success"
)
time.sleep(settings.EMAIL_SEND_DELAY)

except Exception as e:
error_message = str(e)
logger.error("Failed to send monthly email to %s: %s", recipient, error_message)
EmailLog.log_email(recipient, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="failed", error_message=error_message)
logger.error(f"Failed to send monthly email to {recipient}: {error_message}")
EmailLog.log_email(
recipient, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="failed", error_message=error_message
)


def send_subscription_based_email(trigger_source='manual', sent_by=None, user_ids=None):
query = Subscription.objects.filter(subscribed=True, user__isnull=False)
query = Subscription.objects.filter(subscribed=True, user__isnull=False)
if user_ids:
query = query.filter(user__id__in=user_ids)
query = query.filter(user__id__in=user_ids)

for subscription in query:
user_email = subscription.user.email
new_publications = Publication.objects.filter(geometry__intersects=subscription.region)
user_email = subscription.user.email

new_publications = Publication.objects.filter(
geometry__intersects=subscription.region,
# publicationDate__gte=subscription.timeperiod_startdate,
# publicationDate__lte=subscription.timeperiod_enddate
)

if not new_publications.exists():
continue
continue

unsubscribe_specific = f"{BASE_URL}{reverse('optimap:unsubscribe')}?search={quote(subscription.search_term)}"
unsubscribe_all = f"{BASE_URL}{reverse('optimap:unsubscribe')}?all=true"

subject = f"📚 New Manuscripts Matching '{subscription.search_term}'"

bullet_list = "\n".join([f"- {pub.title}" for pub in new_publications])
content = (
f"Dear {subscription.user.username},\n\n"
f"Here are the latest manuscripts matching your subscription:\n\n{bullet_list}\n\n"
f"Manage your subscriptions:\n"
f"Unsubscribe from '{subscription.search_term}': {unsubscribe_specific}\n"
f"Unsubscribe from All: {unsubscribe_all}\n"
)

content = f"""Dear {subscription.user.username},
Here are the latest manuscripts matching your subscription:

{bullet_list}

Manage your subscriptions:
Unsubscribe from '{subscription.search_term}': {unsubscribe_specific}
Unsubscribe from All: {unsubscribe_all}
"""

try:
email = EmailMessage(subject, content, settings.EMAIL_HOST_USER, [user_email])
email.send()
EmailLog.log_email(user_email, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="success")
time.sleep(settings.EMAIL_SEND_DELAY)
EmailLog.log_email(
user_email, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="success"
)
time.sleep(settings.EMAIL_SEND_DELAY)

except Exception as e:
error_message = str(e)
logger.error("Failed to send subscription email to %s: %s", user_email, error_message)
EmailLog.log_email(user_email, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="failed", error_message=error_message)

logger.error(f"Failed to send subscription email to {user_email}: {error_message}")
EmailLog.log_email(
user_email, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="failed", error_message=error_message
)

def schedule_monthly_email_task(sent_by=None):
if not Schedule.objects.filter(func='publications.tasks.send_monthly_email').exists():
now_dt = datetime.now()
last_day = calendar.monthrange(now_dt.year, now_dt.month)[1]
next_run_date = now_dt.replace(day=last_day, hour=23, minute=59)
now = datetime.now()
last_day_of_month = calendar.monthrange(now.year, now.month)[1] # Get last day of the month
next_run_date = now.replace(day=last_day_of_month, hour=23, minute=59) # Run at the end of the last day
schedule(
'publications.tasks.send_monthly_email',
schedule_type='M',
repeats=-1,
next_run=next_run_date,
kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
)
logger.info("Scheduled 'send_monthly_email' for %s", next_run_date)

logger.info(f"Scheduled 'schedule_monthly_email_task' for {next_run_date}")

def schedule_subscription_email_task(sent_by=None):
if not Schedule.objects.filter(func='publications.tasks.send_subscription_based_email').exists():
now_dt = datetime.now()
last_day = calendar.monthrange(now_dt.year, now_dt.month)[1]
next_run_date = now_dt.replace(day=last_day, hour=23, minute=59)
now = datetime.now()
last_day_of_month = calendar.monthrange(now.year, now.month)[1] # Get last day of the month
next_run_date = now.replace(day=last_day_of_month, hour=23, minute=59) # Run at the end of the last day
schedule(
'publications.tasks.send_subscription_based_email',
schedule_type='M',
repeats=-1,
next_run=next_run_date,
kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
)
logger.info("Scheduled 'send_subscription_based_email' for %s", next_run_date)


logger.info(f"Scheduled 'send_subscription_based_email' for {next_run_date}")

def regenerate_geojson_cache():
Comment thread
BharatVe marked this conversation as resolved.
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
os.makedirs(cache_dir, exist_ok=True)
geojson_str = serialize(
Comment thread
nuest marked this conversation as resolved.
Outdated
'geojson',
Publication.objects.filter(status='p'),
geometry_field='geometry'
)
try:
geojson_obj = json.loads(geojson_str)
features = geojson_obj.get("features", [])
except Exception as e:
logger.error("Error parsing GeoJSON: %s", e)
features = []
full_collection = {
"type": "FeatureCollection",
"crs": {"type": "name", "properties": {"name": "EPSG:4326"}},
"features": features
}

json_path = os.path.join(cache_dir, 'geojson_cache.json')
with open(json_path, 'w') as f:
Comment thread
BharatVe marked this conversation as resolved.
json.dump(full_collection, f)
gzip_path = os.path.join(cache_dir, 'geojson_cache.json.gz')
with gzip.open(gzip_path, 'wt') as f:
json.dump(full_collection, f)
json_size = os.path.getsize(json_path)
logger.info("GeoJSON cache regenerated at %s (size: %d bytes); gzipped at %s", json_path, json_size, gzip_path)
serialize(
'geojson',
Publication.objects.filter(status='p'),
geometry_field='geometry',
srid=4326,
stream=f
)

gzip_path = json_path + '.gz'
with open(json_path, 'rb') as fin, gzip.open(gzip_path, 'wb') as fout:
fout.writelines(fin)

size = os.path.getsize(json_path)
logger.info("Cached GeoJSON at %s (%d bytes), gzipped at %s", json_path, size, gzip_path)
return json_path


def convert_geojson_to_geopackage(geojson_path):
cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
os.makedirs(cache_dir, exist_ok=True)
geopackage_path = os.path.join(cache_dir, 'publications.gpkg')
cmd = ["ogr2ogr", "-f", "GPKG", geopackage_path, geojson_path]
gpkg = os.path.join(cache_dir, 'publications.gpkg')
cmd = ["ogr2ogr", "-f", "GPKG", gpkg, geojson_path]
try:
subprocess.check_call(cmd)
Comment thread
BharatVe marked this conversation as resolved.
Outdated
logger.info("GeoPackage generated at: %s", geopackage_path)
logger.info("Generated GeoPackage at %s", gpkg)
except subprocess.CalledProcessError as e:
logger.error("Error converting GeoJSON to GeoPackage: %s", e)
geopackage_path = None
return geopackage_path
logger.error("ogr2ogr failed: %s", e)
return None
return gpkg


def regenerate_geopackage_cache():
json_path = regenerate_geojson_cache()
gpkg_path = convert_geojson_to_geopackage(json_path)
return gpkg_path
return json_path, gpkg_path
Loading
Loading