Changes from all commits (30 commits)
16e74dd
correcting subject name parse errors
iannesbitt Jun 2, 2025
f6d29c9
moving debug statements outside of if statement
iannesbitt Jun 2, 2025
c88a062
adding box calculation logic
iannesbitt Jun 6, 2025
cd3520d
adding a way for user to convert_geoshapes
iannesbitt Jun 9, 2025
2573960
adding debug statements and bound validation (#76)
iannesbitt Jun 9, 2025
e796690
adding dummy obis sitemap for testing (#76)
iannesbitt Jun 9, 2025
a984bd5
correcting minor version issue
iannesbitt Jun 11, 2025
6d2c990
Merge branch 'develop' into feature-76-calculate-geoshape-box
iannesbitt Jun 11, 2025
c7f0109
adding more compute logic and GeoShape type (#76)
iannesbitt Jun 17, 2025
460ef1b
updating geo node key removal (#76)
iannesbitt Jun 17, 2025
3536028
removing geo node when empty (#76)
iannesbitt Jun 17, 2025
c44dcd0
lowering msg log level (#76)
iannesbitt Jun 17, 2025
4e66ee3
eliminating empty geo node (#76)
iannesbitt Jun 17, 2025
a35ca82
adding OBIS-SEAMAP sitemap for testing
iannesbitt Aug 22, 2025
84b68d4
adding way to specify preferred series_id prefixes in order to handle…
iannesbitt Aug 26, 2025
aa62e15
updating lockfile
iannesbitt Aug 26, 2025
fa1d528
partial fix for geobox conversion issues
iannesbitt Aug 26, 2025
386e8e9
Merge branch 'develop' into feature-76-calculate-geoshape-box
iannesbitt Aug 26, 2025
3ceffcb
adding ownership values to xmnlite.ini to fix socket creation issues
iannesbitt Aug 27, 2025
7de29df
bumping version and adding authorship
iannesbitt Aug 27, 2025
faa26b0
adding alternate link functionality and test sitemap
iannesbitt Sep 5, 2025
f3d5c5e
refining alternate link functionality
iannesbitt Sep 5, 2025
3e44c14
bumping version
iannesbitt Sep 5, 2025
1e60244
adding logic to calculate an antimeridian crossing and split geo boxe…
iannesbitt Sep 15, 2025
6e04e83
Merge pull request #82 from DataONEorg/feature-76-calculate-geoshape-box
iannesbitt Sep 15, 2025
b3e165f
suppressing full jsonld dump on DropItem by defining Item __repr__ me…
iannesbitt Sep 16, 2025
422477a
suppressing full jsonld dump on DropItem by defining Item __repr__ me…
iannesbitt Sep 16, 2025
23686da
fixed KeyError stemming from Item self.__repr__ method
iannesbitt Sep 16, 2025
e414582
adding logic to clean spaces from license and additionalType fields
iannesbitt Oct 24, 2025
1d7bb57
clarifying logic to pick identifiers
iannesbitt Oct 24, 2025
2 changes: 2 additions & 0 deletions mnlite/xmnlite.ini
@@ -38,6 +38,8 @@ need-app = true
module = mnlite:create_app()
socket = /home/mnlite/WORK/mnlite/mnlite/tmp/mnlite.sock
chmod-socket = 664
uid = www-data
gid = www-data

#stats = /tmp/stats.socket
##stats = 127.0.0.1:9191
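The two new lines make uWSGI create mnlite.sock as www-data:www-data, which together with chmod-socket = 664 lets the front-end web server open the socket. A minimal sketch for checking the result after uWSGI starts, assuming the socket path from the config above; the expected owner and group are the ones set here.

```python
# Sketch: verify the socket ownership and mode that the uid/gid settings above
# are meant to produce. Socket path taken from xmnlite.ini.
import grp
import os
import pwd
import stat

SOCKET_PATH = "/home/mnlite/WORK/mnlite/mnlite/tmp/mnlite.sock"

st = os.stat(SOCKET_PATH)
owner = pwd.getpwuid(st.st_uid).pw_name
group = grp.getgrgid(st.st_gid).gr_name
mode = stat.filemode(st.st_mode)
print(f"{SOCKET_PATH}: {owner}:{group} {mode}")  # expect www-data:www-data srw-rw-r--
```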
10 changes: 7 additions & 3 deletions mnonboard/cn.py
@@ -100,7 +100,7 @@ def get_or_create_subj(loc: str, value: str, client: CoordinatingNodeClient_2_0,
set_role(loc=loc, title=title, value=value)
return name

def cn_subj_lookup(subj, cn_url='https://cn.dataone.org/cn', debug=False, client: CoordinatingNodeClient_2_0=None):
def cn_subj_lookup(subj, cn_url='https://cn.dataone.org/cn', debug=False, client: CoordinatingNodeClient_2_0=None, D1_AUTH_TOKEN=None):
"""
Use the DataONE API to look up whether a given ORCiD number already exists
in the system.
@@ -120,11 +120,15 @@ def cn_subj_lookup(subj, cn_url='https://cn.dataone.org/cn', debug=False, client
L.info('Starting record lookup for %s from %s' % (subj, cn_url))
subject = client.getSubjectInfo(subj)
client._session.close()
r = subject.content()
name = f'{r[0].content()} {r[1].content()}' # first last
L.debug('Subject content: %s' % subject.content())
L.debug('Subject content 0 content: %s' % subject.content()[0].content())
r = subject.content()[0].content() # first record, first content
name = f'{r[1]} {(r[2])}' # first last
L.info('Name associated with record %s found in %s: %s.' % (subj, cn_url, name))
rt = name if not debug else r
return rt
except IndexError as e:
L.warning(f'Caught IndexError while looking up {subj} at {cn_url}: {e}')
except exceptions.NotFound as e:
estrip = str(e).split('<description>')[1].split('</description>')[0]
e = e if debug else estrip
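The lookup now unwraps the first record of the SubjectInfo response and guards the indexing with an IndexError handler instead of letting a malformed record abort onboarding. A standalone sketch of that defensive parse, assuming the same field ordering the code above relies on (given name at index 1, family name at index 2); the sample values are invented.

```python
# Sketch of the defensive name parse: mirror the r[1]/r[2] indexing used in
# cn_subj_lookup() and fall back gracefully when fields are missing.
import logging

L = logging.getLogger(__name__)

def parse_subject_name(fields):
    """Return 'Given Family' from a subject record, or None if fields are missing."""
    try:
        return f"{fields[1]} {fields[2]}"  # assumed order: subject, givenName, familyName
    except IndexError as e:
        L.warning("Caught IndexError while parsing subject record: %s", e)
        return None

print(parse_subject_name(["http://orcid.org/0000-0000-0000-0000", "Jane", "Doe"]))  # Jane Doe
print(parse_subject_name(["http://orcid.org/0000-0000-0000-0000"]))                 # None
```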
552 changes: 265 additions & 287 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,8 +1,8 @@
[tool.poetry]
name = "mnlite"
version = "0.1.2"
version = "0.1.4"
description = "Light weight read-only DataONE member node in Python Flask"
authors = ["datadavev <[email protected]>"]
authors = ["datadavev <[email protected]>", "iannesbitt <[email protected]>"]
license = "Apache 2.0"

[tool.poetry.dependencies]
4 changes: 4 additions & 0 deletions soscan/items.py
@@ -73,3 +73,7 @@ class SoscanItem(scrapy.Item):
series_id = scrapy.Field() # Series ID to be used for the item
alt_identifiers = scrapy.Field() # alternative identifiers extracted from the item
format_id = scrapy.Field()

def __repr__(self):
"""Only print out url after exiting the Pipeline"""
return repr({"url": self["url"]})
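Because scrapy embeds repr(item) in DropItem messages, overriding __repr__ keeps pipeline-drop log lines to the item URL instead of dumping the whole JSON-LD document. A self-contained sketch of the same pattern; the field names follow SoscanItem, the example values are invented.

```python
# Sketch: an Item whose repr hides bulky fields so dropped-item log lines stay short.
import scrapy

class CompactItem(scrapy.Item):
    url = scrapy.Field()
    jsonld = scrapy.Field()  # potentially very large

    def __repr__(self):
        # Expose only the URL; the JSON-LD payload stays out of exception text.
        return repr({"url": self["url"]})

item = CompactItem(url="https://example.org/dataset/1", jsonld={"@type": "Dataset"})
print(repr(item))  # {'url': 'https://example.org/dataset/1'}
```

Note that this repr assumes "url" is always populated; commit 23686da in the list above addresses the KeyError raised when it is not.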
89 changes: 82 additions & 7 deletions soscan/sonormalizepipeline.py
@@ -4,6 +4,7 @@
import json
import opersist.rdfutils
from pathlib import Path
import soscan.utils as utils

def consolidate_list(l: list, sep: str=', '):
"""
@@ -41,11 +42,22 @@ class SoscanNormalizePipeline:
def __init__(self, **kwargs):
self.logger = logging.getLogger("SoscanNormalize")
self.use_at_id = False
self.convert_geoshapes = False
self.reorder_ids = False
self.fallback_to_url = True
if 'use_at_id' in kwargs:
self.use_at_id = kwargs['use_at_id']
self.logger.debug(f'Using @id as identifier: {self.use_at_id}')
if self.use_at_id:
self.logger.debug(f'Using @id as identifier: {self.use_at_id}')
self.fallback_to_url = False
if 'convert_geoshapes' in kwargs:
self.convert_geoshapes = kwargs['convert_geoshapes']
self.logger.debug(f'Converting geoshapes to boxes: {self.convert_geoshapes}')
if 'reorder_identifiers' in kwargs:
# if reorder_identifiers is set, the script will reorder to prioritize the set string value if found in the identifier
self.reorder_ids = kwargs['reorder_identifiers']
self.logger.debug(f'Reordering identifiers to prioritize: {self.reorder_ids}')


@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
node_path = crawler.settings.get("STORE_PATH", None)
@@ -56,10 +68,36 @@ def from_crawler(cls, crawler, *args, **kwargs):
for s in _cs:
if s == 'use_at_id':
kwargs['use_at_id'] = _cs[s]
if s == 'convert_geoshapes':
kwargs['convert_geoshapes'] = _cs[s]
if s == 'reorder_identifiers':
kwargs['reorder_identifiers'] = _cs[s]
return cls(**kwargs)
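from_crawler reads the new pipeline switches from the settings.json stored under the crawler's STORE_PATH. A sketch of writing such a file; only the key names come from the code above, the path and values are illustrative (reorder_identifiers takes the preferred identifier prefix as a string).

```python
# Sketch: node-level settings.json consumed by from_crawler() above.
# Key names match the code; the STORE_PATH and values are examples only.
import json
from pathlib import Path

store_path = Path("/path/to/node")  # hypothetical STORE_PATH
settings = {
    "use_at_id": False,             # don't treat @id as the identifier
    "convert_geoshapes": True,      # rewrite schema.org GeoShapes as boxes
    "reorder_identifiers": "doi:",  # prefer identifiers carrying this prefix
}
(store_path / "settings.json").write_text(json.dumps(settings, indent=2))
```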


def extract_identifier(self, ids:list, use_at_id:bool):
def _strip_spaces_for_keys(self, obj, keys=["license", "additionalType"]):
if isinstance(obj, dict):
for k, v in obj.items():
if k in keys:
if isinstance(v, str):
obj[k] = v.replace(" ", "")
self.logger.debug(f'Stripped spaces from key {k}: {obj[k]}')
elif isinstance(v, list):
obj[k] = [s.replace(" ", "") if isinstance(s, str) else s for s in v]
self.logger.debug(f'Stripped spaces from list at key {k}: {obj[k]}')
# recurse into all children
self._strip_spaces_for_keys(v, keys)
elif isinstance(obj, list):
for item in obj:
self._strip_spaces_for_keys(item, keys)
return obj
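The helper removes every space from license and additionalType values, which keeps stray whitespace out of URI-style fields. A usage sketch with an invented document; only the key names and the class come from the code above.

```python
# Sketch: effect of the space-stripping pass on license / additionalType values.
from soscan.sonormalizepipeline import SoscanNormalizePipeline

doc = {
    "@type": "Dataset",
    "license": "https://spdx.org/licenses/ CC-BY-4.0",      # stray interior space
    "additionalType": ["geolink: Dataset", " schema:Dataset"],
}
cleaned = SoscanNormalizePipeline()._strip_spaces_for_keys(doc)
print(cleaned["license"])         # https://spdx.org/licenses/CC-BY-4.0
print(cleaned["additionalType"])  # ['geolink:Dataset', 'schema:Dataset']
```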


def extract_identifier(self, ids:list,
use_at_id:bool,
preferred_prefix: str=False,
fallback_to_url: bool=True,
url: str=None):
"""
Extract the series identifier from a list of identifiers structured like the following.

@@ -71,6 +109,17 @@ def extract_identifier(self, ids:list, use_at_id:bool):
The first identifier is the one we should use as the series_id.
"""
if len(ids) > 0:
self.logger.debug(f'Looking up preferred prefix: {preferred_prefix}')
if preferred_prefix != False:
for id in ids:
self.logger.debug(f'Checking for {preferred_prefix} in identifier: {id["identifier"]}')
for idx in id["identifier"]:
if idx.startswith(preferred_prefix):
self.logger.debug(f'Found preferred identifier: {idx}')
return idx
if fallback_to_url:
self.logger.debug(f'No preferred identifier found, falling back to url {url}')
return url
if len(ids[0]["identifier"]) > 0:
return ids[0]["identifier"][0]
else:
@@ -129,8 +178,7 @@ def process_item(self, item, spider):
require_identifier = True

jsonld: dict = item["jsonld"]
version = jsonld.get('version', None)
version = jsonld.get('@version', '1.1') if not version else version
version = jsonld.get('@version', '1.1')
version = '1.0' if version == '1' else version
jldversion = f'json-ld-{version}'
self.logger.debug(f"process_item: version {jldversion}")
@@ -194,7 +242,23 @@
ids = []
try:
_framed = sonormal.normalize.frameSODataset(normalized, options=options)
ids = sonormal.normalize.getDatasetsIdentifiers(_framed)
ids = sonormal.normalize.getDatasetsIdentifiers(_framed, prefer_str=self.reorder_ids)
if self.reorder_ids != False:
self.logger.debug(f'Looking for {self.reorder_ids} in identifier strings {ids}')
if ids[0]['identifier'] is None or len(ids[0]['identifier']) == 0:
idx = None
else:
idx = ids[0]['identifier'][0]
for id in ids:
for idu in id['identifier']:
self.logger.debug(f'Checking for {self.reorder_ids} in identifier: {idu}')
if self.reorder_ids in idu:
# make this the first item in the list
idx = idu
self.logger.debug(f'Found preferred identifier: {idx}')
self.logger.debug(f'Removing existing identifier list: {ids[0]["identifier"]}')
if idx is not None:
ids[0]['identifier'].insert(0, idx)
except Exception as e:
raise scrapy.exceptions.DropItem(f"JSON-LD identifier extract failed: {e}")
if len(ids) < 1:
@@ -203,12 +267,22 @@
f"Framed dataset:\n{_framed}"
)


# convert alternate geoshapes to boxes
if self.convert_geoshapes:
# try:
self.logger.debug("Converting geoshapes")
item["jsonld"] = utils.convert_geoshapes_to_boxes(item["jsonld"])
# except Exception as e:
# self.logger.warning(f"Geoshape conversion failed: {e}")
item["jsonld"] = self._strip_spaces_for_keys(item["jsonld"])

# TODO: identifiers
# The process for handling of identifiers needs to be set in configuration

# Use the first identifier value provided for series_id
# PID will be computed from the object checksum
item["series_id"] = self.extract_identifier(ids, self.use_at_id)
item["series_id"] = self.extract_identifier(ids, self.use_at_id, preferred_prefix=self.reorder_ids, fallback_to_url=self.fallback_to_url, url=item["url"])
item["alt_identifiers"] = self.extract_alt_identifiers(ids, self.use_at_id)
# if there are no identifiers, we need to drop the item
if item["series_id"] is None:
@@ -225,4 +299,5 @@
# Obsoletes is not a property of the retrieved object but instead needs
# to be inferred from the history associated with the object lineage
# item["obsoletes"] = None

return item
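Taken together, reorder_identifiers now drives both the reordering of the framed identifier list and the preferred_prefix/fallback_to_url arguments to extract_identifier, so a node can pin series_id to, say, DOIs while still falling back to the landing-page URL. A condensed sketch of that selection order; the data layout (a list of {"identifier": [...]} dicts) follows extract_identifier's docstring, and the values are invented.

```python
# Sketch of the series_id selection order: preferred prefix first, then the
# item URL (when fallback is allowed), then the first framed identifier.
def pick_series_id(ids, preferred_prefix=None, fallback_to_url=True, url=None):
    if ids:
        if preferred_prefix:
            for rec in ids:
                for candidate in rec["identifier"]:
                    if candidate.startswith(preferred_prefix):
                        return candidate          # preferred prefix wins
            if fallback_to_url:
                return url                        # nothing matched the prefix
        if ids[0]["identifier"]:
            return ids[0]["identifier"][0]        # default: first identifier
    return url if fallback_to_url else None

ids = [{"identifier": ["urn:uuid:0a1b2c", "doi:10.5063/EXAMPLE"]}]
print(pick_series_id(ids, preferred_prefix="doi:"))                           # doi:10.5063/EXAMPLE
print(pick_series_id(ids))                                                    # urn:uuid:0a1b2c
print(pick_series_id(ids, preferred_prefix="ark:", url="https://x.example"))  # https://x.example
```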
16 changes: 16 additions & 0 deletions soscan/spiders/jsonldspider.py
@@ -153,6 +153,14 @@ def sitemap_filter(self, entries):
if self.lastmod_filter is not None and ts is not None:
if ts > self.lastmod_filter:
if self.url_match:
for url in entry.get("alternate", []):
if self.url_match in url:
entry['loc'] = url
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
yield entry
else:
self.logger.debug(f'url_match skipping record {i}: {self.url_match} not in {url}')
if self.url_match in entry['loc']:
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
@@ -167,6 +175,14 @@ def sitemap_filter(self, entries):
self.logger.debug(f'lastmod_filter skipping record {i}: (ts {ts}) {entry}')
else:
if self.url_match:
for url in entry.get("alternate", []):
if self.url_match in url:
entry['loc'] = url
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
yield entry
else:
self.logger.debug(f'url_match skipping record {i}: {self.url_match} not in {url}')
if self.url_match in entry['loc']:
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
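When follow_alternate_links is on, each sitemap entry may carry alternate URLs from its xhtml:link rel="alternate" tags, and the filter above swaps entry['loc'] for an alternate that matches url_match (for example, to crawl a JSON-LD representation instead of the HTML landing page). A reduced sketch of that substitution on one entry; the entry layout follows scrapy's sitemap parsing, the URLs are invented.

```python
# Sketch: prefer an alternate link that matches url_match over the canonical loc.
def resolve_loc(entry, url_match):
    for alt in entry.get("alternate", []):
        if url_match in alt:
            entry["loc"] = alt    # crawl the matching alternate instead
            return entry
    if url_match in entry["loc"]:
        return entry              # the canonical loc already matches
    return None                   # no match: skip this record

entry = {
    "loc": "https://repo.example.org/dataset/42",
    "alternate": ["https://repo.example.org/dataset/42.jsonld"],
}
print(resolve_loc(entry, ".jsonld")["loc"])  # https://repo.example.org/dataset/42.jsonld
```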
20 changes: 19 additions & 1 deletion soscan/spiders/ldsitemapspider.py
@@ -5,7 +5,8 @@
The sitemap loc lastmod property is provided in the request meta
"""

import os
from pathlib import Path
import json
import re
import logging
from scrapy.spiders import Spider
@@ -51,6 +52,23 @@ def __init__(self, *a, alt_rules=None, **kw):
# If set, then don't download the target
self._count_only = kw.get("count_only", False)

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
"""
Get the alternate rules from the crawler settings.
"""
node_path = crawler.settings.get("STORE_PATH", False)
mn_settings = Path(f'{node_path}/settings.json')
if mn_settings.exists():
with open(mn_settings) as cs:
_cs: dict = json.loads(cs.read())
for s in _cs:
if s == 'alt_rules':
kwargs['alt_rules'] = _cs[s]
if s == 'follow_alternate_links':
cls.sitemap_alternate_links = _cs[s]
kwargs['follow_alternate_links'] = _cs[s]
return cls(*args, alt_rules=kwargs.get('alt_rules'), **kwargs)

def start_requests(self):
for url in self.sitemap_urls: