Changes from all commits (30 commits)
16e74dd
correcting subject name parse errors
iannesbitt Jun 2, 2025
f6d29c9
moving debug statements outside of if statement
iannesbitt Jun 2, 2025
c88a062
adding box calculation logic
iannesbitt Jun 6, 2025
cd3520d
adding a way for user to convert_geoshapes
iannesbitt Jun 9, 2025
2573960
adding debug statements and bound validation (#76)
iannesbitt Jun 9, 2025
e796690
adding dummy obis sitemap for testing (#76)
iannesbitt Jun 9, 2025
a984bd5
correcting minor version issue
iannesbitt Jun 11, 2025
6d2c990
Merge branch 'develop' into feature-76-calculate-geoshape-box
iannesbitt Jun 11, 2025
c7f0109
adding more compute logic and GeoShape type (#76)
iannesbitt Jun 17, 2025
460ef1b
updating geo node key removal (#76)
iannesbitt Jun 17, 2025
3536028
removing geo node when empty (#76)
iannesbitt Jun 17, 2025
c44dcd0
lowering msg log level (#76)
iannesbitt Jun 17, 2025
4e66ee3
eliminating empty geo node (#76)
iannesbitt Jun 17, 2025
a35ca82
adding OBIS-SEAMAP sitemap for testing
iannesbitt Aug 22, 2025
84b68d4
adding way to specify preferred series_id prefixes in order to handle…
iannesbitt Aug 26, 2025
aa62e15
updating lockfile
iannesbitt Aug 26, 2025
fa1d528
partial fix for geobox conversion issues
iannesbitt Aug 26, 2025
386e8e9
Merge branch 'develop' into feature-76-calculate-geoshape-box
iannesbitt Aug 26, 2025
3ceffcb
adding ownership values to xmnlite.ini to fix socket creation issues
iannesbitt Aug 27, 2025
7de29df
bumping version and adding authorship
iannesbitt Aug 27, 2025
faa26b0
adding alternate link functionality and test sitemap
iannesbitt Sep 5, 2025
f3d5c5e
refining alternate link functionality
iannesbitt Sep 5, 2025
3e44c14
bumping version
iannesbitt Sep 5, 2025
1e60244
adding logic to calculate an antimeridian crossing and split geo boxe…
iannesbitt Sep 15, 2025
6e04e83
Merge pull request #82 from DataONEorg/feature-76-calculate-geoshape-box
iannesbitt Sep 15, 2025
b3e165f
suppressing full jsonld dump on DropItem by defining Item __repr__ me…
iannesbitt Sep 16, 2025
422477a
suppressing full jsonld dump on DropItem by defining Item __repr__ me…
iannesbitt Sep 16, 2025
23686da
fixed KeyError stemming from Item self.__repr__ method
iannesbitt Sep 16, 2025
e414582
adding logic to clean spaces from license and additionalType fields
iannesbitt Oct 24, 2025
1d7bb57
clarifying logic to pick identifiers
iannesbitt Oct 24, 2025
2 changes: 2 additions & 0 deletions mnlite/xmnlite.ini
@@ -38,6 +38,8 @@ need-app = true
module = mnlite:create_app()
socket = /home/mnlite/WORK/mnlite/mnlite/tmp/mnlite.sock
chmod-socket = 664
uid = www-data
gid = www-data

#stats = /tmp/stats.socket
##stats = 127.0.0.1:9191
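The two new lines make uWSGI create mnlite.sock as www-data:www-data, which together with chmod-socket = 664 lets the front-end web server open the socket. A minimal sketch for checking the result after uWSGI starts, assuming the socket path from the config above; the expected owner and group are the ones set here.

```python
# Sketch: verify the socket ownership and mode that the uid/gid settings above
# are meant to produce. Socket path taken from xmnlite.ini.
import grp
import os
import pwd
import stat

SOCKET_PATH = "/home/mnlite/WORK/mnlite/mnlite/tmp/mnlite.sock"

st = os.stat(SOCKET_PATH)
owner = pwd.getpwuid(st.st_uid).pw_name
group = grp.getgrgid(st.st_gid).gr_name
mode = stat.filemode(st.st_mode)
print(f"{SOCKET_PATH}: {owner}:{group} {mode}")  # expect www-data:www-data srw-rw-r--
```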
10 changes: 7 additions & 3 deletions mnonboard/cn.py
@@ -100,7 +100,7 @@ def get_or_create_subj(loc: str, value: str, client: CoordinatingNodeClient_2_0,
set_role(loc=loc, title=title, value=value)
return name

def cn_subj_lookup(subj, cn_url='https://cn.dataone.org/cn', debug=False, client: CoordinatingNodeClient_2_0=None):
def cn_subj_lookup(subj, cn_url='https://cn.dataone.org/cn', debug=False, client: CoordinatingNodeClient_2_0=None, D1_AUTH_TOKEN=None):
"""
Use the DataONE API to look up whether a given ORCiD number already exists
in the system.
@@ -120,11 +120,15 @@ def cn_subj_lookup(subj, cn_url='https://cn.dataone.org/cn', debug=False, client
L.info('Starting record lookup for %s from %s' % (subj, cn_url))
subject = client.getSubjectInfo(subj)
client._session.close()
r = subject.content()
name = f'{r[0].content()} {r[1].content()}' # first last
L.debug('Subject content: %s' % subject.content())
L.debug('Subject content 0 content: %s' % subject.content()[0].content())
r = subject.content()[0].content() # first record, first content
name = f'{r[1]} {(r[2])}' # first last
L.info('Name associated with record %s found in %s: %s.' % (subj, cn_url, name))
rt = name if not debug else r
return rt
except IndexError as e:
L.warning(f'Caught IndexError while looking up {subj} at {cn_url}: {e}')
except exceptions.NotFound as e:
estrip = str(e).split('<description>')[1].split('</description>')[0]
e = e if debug else estrip
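The lookup now unwraps the first record of the SubjectInfo response and guards the indexing with an IndexError handler instead of letting a malformed record abort onboarding. A standalone sketch of that defensive parse, assuming the same field ordering the code above relies on (given name at index 1, family name at index 2); the sample values are invented.

```python
# Sketch of the defensive name parse: mirror the r[1]/r[2] indexing used in
# cn_subj_lookup() and fall back gracefully when fields are missing.
import logging

L = logging.getLogger(__name__)

def parse_subject_name(fields):
    """Return 'Given Family' from a subject record, or None if fields are missing."""
    try:
        return f"{fields[1]} {fields[2]}"  # assumed order: subject, givenName, familyName
    except IndexError as e:
        L.warning("Caught IndexError while parsing subject record: %s", e)
        return None

print(parse_subject_name(["http://orcid.org/0000-0000-0000-0000", "Jane", "Doe"]))  # Jane Doe
print(parse_subject_name(["http://orcid.org/0000-0000-0000-0000"]))                 # None
```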
552 changes: 265 additions & 287 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,8 +1,8 @@
[tool.poetry]
name = "mnlite"
version = "0.1.2"
version = "0.1.4"
description = "Light weight read-only DataONE member node in Python Flask"
authors = ["datadavev <[email protected]>"]
authors = ["datadavev <[email protected]>", "iannesbitt <[email protected]>"]
license = "Apache 2.0"

[tool.poetry.dependencies]
4 changes: 4 additions & 0 deletions soscan/items.py
@@ -73,3 +73,7 @@ class SoscanItem(scrapy.Item):
series_id = scrapy.Field() # Series ID to be used for the item
alt_identifiers = scrapy.Field() # alternative identifiers extracted from the item
format_id = scrapy.Field()

def __repr__(self):
"""Only print out url after exiting the Pipeline"""
return repr({"url": self["url"]})
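Because scrapy embeds repr(item) in DropItem messages, overriding __repr__ keeps pipeline-drop log lines to the item URL instead of dumping the whole JSON-LD document. A self-contained sketch of the same pattern; the field names follow SoscanItem, the example values are invented.

```python
# Sketch: an Item whose repr hides bulky fields so dropped-item log lines stay short.
import scrapy

class CompactItem(scrapy.Item):
    url = scrapy.Field()
    jsonld = scrapy.Field()  # potentially very large

    def __repr__(self):
        # Expose only the URL; the JSON-LD payload stays out of exception text.
        return repr({"url": self["url"]})

item = CompactItem(url="https://example.org/dataset/1", jsonld={"@type": "Dataset"})
print(repr(item))  # {'url': 'https://example.org/dataset/1'}
```

Note that this repr assumes "url" is always populated; commit 23686da in the list above addresses the KeyError raised when it is not.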
89 changes: 82 additions & 7 deletions soscan/sonormalizepipeline.py
@@ -4,6 +4,7 @@
import json
import opersist.rdfutils
from pathlib import Path
import soscan.utils as utils

def consolidate_list(l: list, sep: str=', '):
"""
@@ -41,11 +42,22 @@ class SoscanNormalizePipeline:
def __init__(self, **kwargs):
self.logger = logging.getLogger("SoscanNormalize")
self.use_at_id = False
self.convert_geoshapes = False
self.reorder_ids = False
self.fallback_to_url = True
if 'use_at_id' in kwargs:
self.use_at_id = kwargs['use_at_id']
self.logger.debug(f'Using @id as identifier: {self.use_at_id}')
if self.use_at_id:
self.logger.debug(f'Using @id as identifier: {self.use_at_id}')
self.fallback_to_url = False
if 'convert_geoshapes' in kwargs:
self.convert_geoshapes = kwargs['convert_geoshapes']
self.logger.debug(f'Converting geoshapes to boxes: {self.convert_geoshapes}')
if 'reorder_identifiers' in kwargs:
# if reorder_identifiers is set, the script will reorder to prioritize the set string value if found in the identifier
self.reorder_ids = kwargs['reorder_identifiers']
self.logger.debug(f'Reordering identifiers to prioritize: {self.reorder_ids}')


@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
node_path = crawler.settings.get("STORE_PATH", None)
@@ -56,10 +68,36 @@ def from_crawler(cls, crawler, *args, **kwargs):
for s in _cs:
if s == 'use_at_id':
kwargs['use_at_id'] = _cs[s]
if s == 'convert_geoshapes':
kwargs['convert_geoshapes'] = _cs[s]
if s == 'reorder_identifiers':
kwargs['reorder_identifiers'] = _cs[s]
return cls(**kwargs)
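from_crawler reads the new pipeline switches from the settings.json stored under the crawler's STORE_PATH. A sketch of writing such a file; only the key names come from the code above, the path and values are illustrative (reorder_identifiers takes the preferred identifier prefix as a string).

```python
# Sketch: node-level settings.json consumed by from_crawler() above.
# Key names match the code; the STORE_PATH and values are examples only.
import json
from pathlib import Path

store_path = Path("/path/to/node")  # hypothetical STORE_PATH
settings = {
    "use_at_id": False,             # don't treat @id as the identifier
    "convert_geoshapes": True,      # rewrite schema.org GeoShapes as boxes
    "reorder_identifiers": "doi:",  # prefer identifiers carrying this prefix
}
(store_path / "settings.json").write_text(json.dumps(settings, indent=2))
```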


def extract_identifier(self, ids:list, use_at_id:bool):
def _strip_spaces_for_keys(self, obj, keys=["license", "additionalType"]):
if isinstance(obj, dict):
for k, v in obj.items():
if k in keys:
if isinstance(v, str):
obj[k] = v.replace(" ", "")
self.logger.debug(f'Stripped spaces from key {k}: {obj[k]}')
elif isinstance(v, list):
obj[k] = [s.replace(" ", "") if isinstance(s, str) else s for s in v]
self.logger.debug(f'Stripped spaces from list at key {k}: {obj[k]}')
# recurse into all children
self._strip_spaces_for_keys(v, keys)
elif isinstance(obj, list):
for item in obj:
self._strip_spaces_for_keys(item, keys)
return obj
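The helper removes every space from license and additionalType values, which keeps stray whitespace out of URI-style fields. A usage sketch with an invented document; only the key names and the class come from the code above.

```python
# Sketch: effect of the space-stripping pass on license / additionalType values.
from soscan.sonormalizepipeline import SoscanNormalizePipeline

doc = {
    "@type": "Dataset",
    "license": "https://spdx.org/licenses/ CC-BY-4.0",      # stray interior space
    "additionalType": ["geolink: Dataset", " schema:Dataset"],
}
cleaned = SoscanNormalizePipeline()._strip_spaces_for_keys(doc)
print(cleaned["license"])         # https://spdx.org/licenses/CC-BY-4.0
print(cleaned["additionalType"])  # ['geolink:Dataset', 'schema:Dataset']
```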


def extract_identifier(self, ids:list,
use_at_id:bool,
preferred_prefix: str=False,
fallback_to_url: bool=True,
url: str=None):
"""
Extract the series identifier from a list of identifiers structured like the following.

@@ -71,6 +109,17 @@ def extract_identifier(self, ids:list, use_at_id:bool):
The first identifier is the one we should use as the series_id.
"""
if len(ids) > 0:
self.logger.debug(f'Looking up preferred prefix: {preferred_prefix}')
if preferred_prefix != False:
for id in ids:
self.logger.debug(f'Checking for {preferred_prefix} in identifier: {id["identifier"]}')
for idx in id["identifier"]:
if idx.startswith(preferred_prefix):
self.logger.debug(f'Found preferred identifier: {idx}')
return idx
if fallback_to_url:
self.logger.debug(f'No preferred identifier found, falling back to url {url}')
return url
if len(ids[0]["identifier"]) > 0:
return ids[0]["identifier"][0]
else:
@@ -129,8 +178,7 @@ def process_item(self, item, spider):
require_identifier = True

jsonld: dict = item["jsonld"]
version = jsonld.get('version', None)
version = jsonld.get('@version', '1.1') if not version else version
version = jsonld.get('@version', '1.1')
version = '1.0' if version == '1' else version
jldversion = f'json-ld-{version}'
self.logger.debug(f"process_item: version {jldversion}")
@@ -194,7 +242,23 @@
ids = []
try:
_framed = sonormal.normalize.frameSODataset(normalized, options=options)
ids = sonormal.normalize.getDatasetsIdentifiers(_framed)
ids = sonormal.normalize.getDatasetsIdentifiers(_framed, prefer_str=self.reorder_ids)
if self.reorder_ids != False:
self.logger.debug(f'Looking for {self.reorder_ids} in identifier strings {ids}')
if ids[0]['identifier'] is None or len(ids[0]['identifier']) == 0:
idx = None
else:
idx = ids[0]['identifier'][0]
for id in ids:
for idu in id['identifier']:
self.logger.debug(f'Checking for {self.reorder_ids} in identifier: {idu}')
if self.reorder_ids in idu:
# make this the first item in the list
idx = idu
self.logger.debug(f'Found preferred identifier: {idx}')
self.logger.debug(f'Removing existing identifier list: {ids[0]["identifier"]}')
if idx is not None:
ids[0]['identifier'].insert(0, idx)
except Exception as e:
raise scrapy.exceptions.DropItem(f"JSON-LD identifier extract failed: {e}")
if len(ids) < 1:
@@ -203,12 +267,22 @@
f"Framed dataset:\n{_framed}"
)


# convert alternate geoshapes to boxes
if self.convert_geoshapes:
# try:
self.logger.debug("Converting geoshapes")
item["jsonld"] = utils.convert_geoshapes_to_boxes(item["jsonld"])
# except Exception as e:
# self.logger.warning(f"Geoshape conversion failed: {e}")
item["jsonld"] = self._strip_spaces_for_keys(item["jsonld"])

# TODO: identifiers
# The process for handling of identifiers needs to be set in configuration

# Use the first identifier value provided for series_id
# PID will be computed from the object checksum
item["series_id"] = self.extract_identifier(ids, self.use_at_id)
item["series_id"] = self.extract_identifier(ids, self.use_at_id, preferred_prefix=self.reorder_ids, fallback_to_url=self.fallback_to_url, url=item["url"])
item["alt_identifiers"] = self.extract_alt_identifiers(ids, self.use_at_id)
# if there are no identifiers, we need to drop the item
if item["series_id"] is None:
@@ -225,4 +299,5 @@
# Obsoletes is not a property of the retrieved object but instead needs
# to be inferred from the history associated with the object lineage
# item["obsoletes"] = None

return item
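Taken together, reorder_identifiers now drives both the reordering of the framed identifier list and the preferred_prefix/fallback_to_url arguments to extract_identifier, so a node can pin series_id to, say, DOIs while still falling back to the landing-page URL. A condensed sketch of that selection order; the data layout (a list of {"identifier": [...]} dicts) follows extract_identifier's docstring, and the values are invented.

```python
# Sketch of the series_id selection order: preferred prefix first, then the
# item URL (when fallback is allowed), then the first framed identifier.
def pick_series_id(ids, preferred_prefix=None, fallback_to_url=True, url=None):
    if ids:
        if preferred_prefix:
            for rec in ids:
                for candidate in rec["identifier"]:
                    if candidate.startswith(preferred_prefix):
                        return candidate          # preferred prefix wins
            if fallback_to_url:
                return url                        # nothing matched the prefix
        if ids[0]["identifier"]:
            return ids[0]["identifier"][0]        # default: first identifier
    return url if fallback_to_url else None

ids = [{"identifier": ["urn:uuid:0a1b2c", "doi:10.5063/EXAMPLE"]}]
print(pick_series_id(ids, preferred_prefix="doi:"))                           # doi:10.5063/EXAMPLE
print(pick_series_id(ids))                                                    # urn:uuid:0a1b2c
print(pick_series_id(ids, preferred_prefix="ark:", url="https://x.example"))  # https://x.example
```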
16 changes: 16 additions & 0 deletions soscan/spiders/jsonldspider.py
@@ -153,6 +153,14 @@ def sitemap_filter(self, entries):
if self.lastmod_filter is not None and ts is not None:
if ts > self.lastmod_filter:
if self.url_match:
for url in entry.get("alternate", []):
if self.url_match in url:
entry['loc'] = url
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
yield entry
else:
self.logger.debug(f'url_match skipping record {i}: {self.url_match} not in {url}')
if self.url_match in entry['loc']:
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
@@ -167,6 +175,14 @@ def sitemap_filter(self, entries):
self.logger.debug(f'lastmod_filter skipping record {i}: (ts {ts}) {entry}')
else:
if self.url_match:
for url in entry.get("alternate", []):
if self.url_match in url:
entry['loc'] = url
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
yield entry
else:
self.logger.debug(f'url_match skipping record {i}: {self.url_match} not in {url}')
if self.url_match in entry['loc']:
self.logger.debug(f'Yielding record {i}: {entry}')
y += 1
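When follow_alternate_links is on, each sitemap entry may carry alternate URLs from its xhtml:link rel="alternate" tags, and the filter above swaps entry['loc'] for an alternate that matches url_match (for example, to crawl a JSON-LD representation instead of the HTML landing page). A reduced sketch of that substitution on one entry; the entry layout follows scrapy's sitemap parsing, the URLs are invented.

```python
# Sketch: prefer an alternate link that matches url_match over the canonical loc.
def resolve_loc(entry, url_match):
    for alt in entry.get("alternate", []):
        if url_match in alt:
            entry["loc"] = alt    # crawl the matching alternate instead
            return entry
    if url_match in entry["loc"]:
        return entry              # the canonical loc already matches
    return None                   # no match: skip this record

entry = {
    "loc": "https://repo.example.org/dataset/42",
    "alternate": ["https://repo.example.org/dataset/42.jsonld"],
}
print(resolve_loc(entry, ".jsonld")["loc"])  # https://repo.example.org/dataset/42.jsonld
```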
20 changes: 19 additions & 1 deletion soscan/spiders/ldsitemapspider.py
@@ -5,7 +5,8 @@
The sitemap loc lastmod property is provided in the request meta
"""

import os
from pathlib import Path
import json
import re
import logging
from scrapy.spiders import Spider
@@ -51,6 +52,23 @@ def __init__(self, *a, alt_rules=None, **kw):
# If set, then don't download the target
self._count_only = kw.get("count_only", False)

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
"""
Get the alternate rules from the crawler settings.
"""
node_path = crawler.settings.get("STORE_PATH", False)
mn_settings = Path(f'{node_path}/settings.json')
if mn_settings.exists():
with open(mn_settings) as cs:
_cs: dict = json.loads(cs.read())
for s in _cs:
if s == 'alt_rules':
kwargs['alt_rules'] = _cs[s]
if s == 'follow_alternate_links':
cls.sitemap_alternate_links = _cs[s]
kwargs['follow_alternate_links'] = _cs[s]
return cls(*args, alt_rules=kwargs.get('alt_rules'), **kwargs)

def start_requests(self):
for url in self.sitemap_urls: