Merge pull request #61 from GSA/refactor

Refactor
GSA · May 6, 2024 · 915e4f6 · 915e4f6 · github-actions · May 6, 2024
2 parents 3eb5b9f + 57ecdcd
commit 915e4f6
Show file tree

Hide file tree

Showing 62 changed files with 879 additions and 3,385 deletions.
diff --git a/app/__init__.py b/app/__init__.py
@@ -5,7 +5,7 @@
 from flask_bootstrap import Bootstrap
 from flask_migrate import Migrate
 
-from .models import db
+from database.models import db
 
 load_dotenv()
 

diff --git a/app/routes.py b/app/routes.py
@@ -1,7 +1,7 @@
 from flask import Blueprint, flash, jsonify, redirect, render_template, request
 
 from .forms import HarvestSourceForm, OrganizationForm
-from .interface import HarvesterDBInterface
+from database.interface import HarvesterDBInterface
 
 mod = Blueprint("harvest", __name__)
 db = HarvesterDBInterface()

diff --git a/database/__init__.py b/database/__init__.py
@@ -0,0 +1,6 @@
+import os
+from dotenv import load_dotenv
+
+# Populate os.environ from a local .env file, if present, before reading settings.
+load_dotenv()
+
+# Database connection URI taken from the environment; None when the variable is unset.
+DATABASE_URI = os.getenv("DATABASE_URI")
diff --git a/app/interface.py → database/interface.py b/app/interface.py → database/interface.py
@@ -2,7 +2,7 @@
 from sqlalchemy.exc import NoResultFound
 from sqlalchemy.orm import scoped_session, sessionmaker
 
-from app.models import (
+from .models import (
     HarvestError,
     HarvestJob,
     HarvestRecord,

diff --git a/migrations/README → database/migrations/README b/migrations/README → database/migrations/README
diff --git a/migrations/alembic.ini → database/migrations/alembic.ini b/migrations/alembic.ini → database/migrations/alembic.ini
diff --git a/migrations/env.py → database/migrations/env.py b/migrations/env.py → database/migrations/env.py
diff --git a/migrations/script.py.mako → database/migrations/script.py.mako b/migrations/script.py.mako → database/migrations/script.py.mako
diff --git a/migrations/versions/112aacfec4f3_.py → ...base/migrations/versions/112aacfec4f3_.py b/migrations/versions/112aacfec4f3_.py → ...base/migrations/versions/112aacfec4f3_.py
diff --git a/app/models.py → database/models.py b/app/models.py → database/models.py
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,39 +1,11 @@
 services:
-  mdtranslator:
-    image: ghcr.io/gsa/mdtranslator:latest
-    ports:
-      - 3000:3000
-    healthcheck:
-      test: ["CMD", "curl", "-d", "{}", "-X", "POST", "http://localhost:3000/translates"]
-      interval: 10s
-      timeout: 10s
-      retries: 5
   nginx-harvest-source:
     image: nginx
     volumes:
-      - ./tests/harvest-sources:/usr/share/nginx/html
-      - ./tests/nginx.conf:/etc/nginx/conf.d/default.conf
+      - ./example_data:/usr/share/nginx/html
+      - ./nginx.conf:/etc/nginx/conf.d/default.conf
     ports:
       - 80:80  
-  localstack-container:
-    privileged: true
-    image: localstack/localstack:1.3.1
-    ports:
-      - "4566:4566"
-      - "8081:8081"
-    healthcheck:
-      test: ["CMD", "curl", "--fail", "localhost:4566"]
-      interval: 2s
-      timeout: 5s
-      retries: 5
-    environment:
-      - SERVICES=s3
-      - DEBUG=1
-      - DATA_DIR=/tmp/localstack/data
-      - DOCKER_HOST=unix:///var/run/docker.sock
-      - DEFAULT_REGION=us-east-1
-    volumes:
-      - "./tmp/localstack:/var/lib/localstack"
   db:
       image: postgres:latest
       restart: always
@@ -47,7 +19,6 @@ services:
         - "${DATABASE_PORT}:5432"
       volumes:
         - postgres_data:/var/lib/postgresql/data
-
   app:
     build: .
     depends_on:
@@ -62,6 +33,5 @@ services:
       - "8080:8080"
     command: flask run --host=0.0.0.0 --port=8080 --reload
 
-
 volumes:
   postgres_data:
diff --git a/tests/harvest-sources/dcatus/dcatus.json → example_data/dcatus/dcatus.json b/tests/harvest-sources/dcatus/dcatus.json → example_data/dcatus/dcatus.json
diff --git a/...harvest-sources/dcatus/missing_title.json → example_data/dcatus/missing_title.json b/...harvest-sources/dcatus/missing_title.json → example_data/dcatus/missing_title.json
diff --git a/...s/harvest-sources/waf/USGSHydroCached.xml → example_data/waf/USGSHydroCached.xml b/...s/harvest-sources/waf/USGSHydroCached.xml → example_data/waf/USGSHydroCached.xml
diff --git a/...s/harvest-sources/waf/USGSImageryOnly.xml → example_data/waf/USGSImageryOnly.xml b/...s/harvest-sources/waf/USGSImageryOnly.xml → example_data/waf/USGSImageryOnly.xml
diff --git a/...s/harvest-sources/waf/USGSImageryTopo.xml → example_data/waf/USGSImageryTopo.xml b/...s/harvest-sources/waf/USGSImageryTopo.xml → example_data/waf/USGSImageryTopo.xml
diff --git a/...vest-sources/waf/USGSShadedReliefOnly.xml → example_data/waf/USGSShadedReliefOnly.xml b/...vest-sources/waf/USGSShadedReliefOnly.xml → example_data/waf/USGSShadedReliefOnly.xml
diff --git a/...st-sources/waf/browse/USGSHydroCached.xml → example_data/waf/browse/USGSHydroCached.xml b/...st-sources/waf/browse/USGSHydroCached.xml → example_data/waf/browse/USGSHydroCached.xml
diff --git a/.../waf/browse/more/USGSShadedReliefOnly.xml → .../waf/browse/more/USGSShadedReliefOnly.xml b/.../waf/browse/more/USGSShadedReliefOnly.xml → .../waf/browse/more/USGSShadedReliefOnly.xml
diff --git a/...est-sources/waf/other/USGSImageryOnly.xml → example_data/waf/other/USGSImageryOnly.xml b/...est-sources/waf/other/USGSImageryOnly.xml → example_data/waf/other/USGSImageryOnly.xml
diff --git a/harvester/ckan_utils.py b/harvester/ckan_utils.py
@@ -148,3 +148,147 @@ def munge_tag(tag: str) -> str:
     tag = re.sub(r"[^a-zA-Z0-9\- ]", "", tag).replace(" ", "-")
     tag = _munge_to_length(tag, MIN_TAG_LENGTH, MAX_TAG_LENGTH)
     return tag
+
+
+def create_ckan_extras(metadata: dict) -> list[dict]:
+    """Build the CKAN ``extras`` list for a dataset record.
+
+    The result always starts with a fixed ``resource-type`` entry, followed by
+    one ``{"key": ..., "value": ...}`` entry for each supported field that is
+    present in ``metadata``. Every key is emitted at most once.
+
+    Args:
+        metadata: Dataset record as a dict. When present, ``publisher`` must
+            be a dict containing a ``name`` key.
+
+    Returns:
+        List of ``{"key", "value"}`` dicts.
+
+    Raises:
+        KeyError: If ``publisher`` is present but has no ``name``.
+    """
+    extras = [
+        "accessLevel",
+        "bureauCode",
+        "identifier",
+        "modified",
+        "programCode",
+        "publisher",
+    ]
+
+    output = [{"key": "resource-type", "value": "Dataset"}]
+
+    for extra in extras:
+        if extra not in metadata:
+            continue
+        val = metadata[extra]
+        if extra == "publisher":
+            name = val["name"]
+            # The hierarchy entry is emitted ahead of the plain publisher name.
+            output.append(
+                {
+                    "key": "publisher_hierarchy",
+                    "value": create_ckan_publisher_hierarchy(val, []),
+                }
+            )
+            val = name
+        elif isinstance(val, list):  # TODO: confirm this is what we want.
+            # Only the first element is kept; an empty list maps to None
+            # instead of raising IndexError.
+            val = val[0] if val else None
+        output.append({"key": extra, "value": val})
+
+    # TODO: add the "dcat_metadata" extra (stringified, sorted dataset) and the
+    # harvest-source name/title extra once those inputs are available here.
+
+    # "identifier" is already emitted by the loop above; appending it a second
+    # time would produce a duplicate extras key.
+    return output
+
+
+def create_ckan_tags(keywords: list[str]) -> list:
+    """Turn each keyword into a CKAN tag dict, ``{"name": <munged keyword>}``.
+
+    Keywords are passed through ``munge_tag`` and returned in input order.
+    """
+    return [{"name": munge_tag(word)} for word in keywords]
+
+
+def create_ckan_publisher_hierarchy(pub_dict: dict, data: list = None) -> str:
+    """Flatten a nested publisher dict into a ``" > "``-separated name string.
+
+    Walks ``pub_dict`` and every nested dict value, collecting each ``name``
+    value in traversal order, then joins the names in reverse collection
+    order, e.g. ``{"name": "A", "parent": {"name": "B"}}`` gives ``"B > A"``.
+
+    Args:
+        pub_dict: Publisher dict, optionally containing nested dicts.
+        data: Accumulator for collected names. A list passed by the caller is
+            appended to in place; when omitted, a fresh list is used per call.
+
+    Returns:
+        The collected names joined with ``" > "``, last collected first.
+    """
+    # A literal [] default would be shared between calls and leak names from
+    # one publisher into the next.
+    if data is None:
+        data = []
+
+    for k, v in pub_dict.items():
+        if k == "name":
+            data.append(v)
+        if isinstance(v, dict):
+            # Recurse for its side effect on ``data``; the joined string of
+            # the inner call is not needed.
+            create_ckan_publisher_hierarchy(v, data)
+
+    return " > ".join(data[::-1])
+
+
+def get_email_from_str(in_str: str) -> str:
+    """Return the first email-like substring found in ``in_str``.
+
+    Yields ``None`` when nothing in the string matches the pattern.
+    """
+    match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", in_str)
+    return None if match is None else match.group(0)
+
+
+def create_ckan_resources(metadata: dict) -> list[dict]:
+    """Build the CKAN ``resources`` list from a record's ``distribution`` list.
+
+    One resource is produced per distribution that carries a URL.
+    ``accessURL`` takes precedence over ``downloadURL`` when both are set.
+    Distributions with neither URL are skipped.
+
+    Args:
+        metadata: Dataset record as a dict; ``distribution`` may be missing
+            or ``None``.
+
+    Returns:
+        List of ``{"url": ...}`` dicts, each with an added ``"mimetype"``
+        entry when the distribution has a ``mediaType``.
+    """
+    output = []
+
+    if "distribution" not in metadata or metadata["distribution"] is None:
+        return output
+
+    for dist in metadata["distribution"]:
+        url = None
+        # Later keys override earlier ones, so accessURL wins over downloadURL.
+        for url_key in ("downloadURL", "accessURL"):
+            if dist.get(url_key) is not None:
+                url = dist[url_key]
+        if url is None:
+            # Nothing to point at; do not reuse a previous distribution's URL.
+            continue
+
+        resource = {"url": url}
+        # The source field is "mediaType"; CKAN's resource field is "mimetype".
+        if "mediaType" in dist:
+            resource["mimetype"] = dist["mediaType"]
+        output.append(resource)
+
+    return output
+
+
+def simple_transform(metadata: dict, owner_org: str) -> dict:
+    """Map the directly translatable fields of a record onto CKAN fields.
+
+    Args:
+        metadata: Dataset record as a dict; must contain ``title`` and
+            ``identifier``.
+        owner_org: Value stored under ``owner_org`` in the result.
+
+    Returns:
+        Dict with ``name``, ``owner_org``, ``identifier``, ``author`` and
+        ``author_email``, plus ``title``, ``notes``, ``maintainer`` and
+        ``maintainer_email`` when the corresponding source fields are present.
+
+    Raises:
+        KeyError: If ``title`` or ``identifier`` is missing from ``metadata``.
+    """
+    output = {
+        "name": munge_title_to_name(metadata["title"]),
+        "owner_org": owner_org,
+        "identifier": metadata["identifier"],
+        "author": None,  # TODO: CHANGE THIS!
+        "author_email": None,  # TODO: CHANGE THIS!
+    }
+
+    # Source key -> CKAN key. A dict value maps the sub-keys of a nested
+    # source object individually.
+    mapping = {
+        "contactPoint": {"fn": "maintainer", "hasEmail": "maintainer_email"},
+        "description": "notes",
+        "title": "title",
+    }
+
+    for k, v in metadata.items():
+        if k not in mapping:
+            continue
+        target = mapping[k]
+        if isinstance(target, dict):
+            for k2, v2 in v.items():
+                # Skip "@type" and any other sub-key without a CKAN
+                # counterpart instead of failing with KeyError.
+                if k2 not in target:
+                    continue
+                if k2 == "hasEmail":
+                    v2 = get_email_from_str(v2)
+                output[target[k2]] = v2
+        else:
+            output[target] = v
+
+    return output
+
+
+def ckanify_dcatus(metadata: dict, owner_org: str) -> dict:
+    """Assemble the full CKAN package dict for one record.
+
+    Starts from ``simple_transform`` and attaches ``resources``, ``tags``
+    (empty when the record has no ``keyword``) and ``extras``.
+    """
+    package = simple_transform(metadata, owner_org)
+
+    package["resources"] = create_ckan_resources(metadata)
+
+    if "keyword" in metadata:
+        package["tags"] = create_ckan_tags(metadata["keyword"])
+    else:
+        package["tags"] = []
+
+    package["extras"] = create_ckan_extras(metadata)
+
+    return package
diff --git a/harvester/data/dcatus/jsons/arm.data.json b/harvester/data/dcatus/jsons/arm.data.json
File	Stmts	Miss	Cover	Missing
harvester
__init__.py	5	0	100%
ckan_utils.py	113	6	6	95%
exceptions.py	42	0	100%
harvest.py	237	41	41	83%
logger_config.py	1	0	100%
utils.py	78	13	13	83%
TOTAL	476	60	87%