Skip to content

Commit

Permalink
Merge pull request #61 from GSA/refactor
Browse files Browse the repository at this point in the history
Refactor
  • Loading branch information
rshewitt authored May 6, 2024
2 parents 3eb5b9f + 57ecdcd commit 915e4f6
Show file tree
Hide file tree
Showing 62 changed files with 879 additions and 3,385 deletions.
2 changes: 1 addition & 1 deletion app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from flask_bootstrap import Bootstrap
from flask_migrate import Migrate

from .models import db
from database.models import db

load_dotenv()

Expand Down
2 changes: 1 addition & 1 deletion app/routes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from flask import Blueprint, flash, jsonify, redirect, render_template, request

from .forms import HarvestSourceForm, OrganizationForm
from .interface import HarvesterDBInterface
from database.interface import HarvesterDBInterface

mod = Blueprint("harvest", __name__)
db = HarvesterDBInterface()
Expand Down
6 changes: 6 additions & 0 deletions database/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import os
from dotenv import load_dotenv

# Run python-dotenv's loader at import time, before the environment lookup
# below, so values it provides are visible to os.getenv.
load_dotenv()

# Value of the DATABASE_URI environment variable; None when it is not set.
# NOTE(review): presumably the SQLAlchemy connection string -- confirm.
DATABASE_URI = os.getenv("DATABASE_URI")
2 changes: 1 addition & 1 deletion app/interface.py → database/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import scoped_session, sessionmaker

from app.models import (
from .models import (
HarvestError,
HarvestJob,
HarvestRecord,
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
34 changes: 2 additions & 32 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,39 +1,11 @@
services:
mdtranslator:
image: ghcr.io/gsa/mdtranslator:latest
ports:
- 3000:3000
healthcheck:
test: ["CMD", "curl", "-d", "{}", "-X", "POST", "http://localhost:3000/translates"]
interval: 10s
timeout: 10s
retries: 5
nginx-harvest-source:
image: nginx
volumes:
- ./tests/harvest-sources:/usr/share/nginx/html
- ./tests/nginx.conf:/etc/nginx/conf.d/default.conf
- ./example_data:/usr/share/nginx/html
- ./nginx.conf:/etc/nginx/conf.d/default.conf
ports:
- 80:80
localstack-container:
privileged: true
image: localstack/localstack:1.3.1
ports:
- "4566:4566"
- "8081:8081"
healthcheck:
test: ["CMD", "curl", "--fail", "localhost:4566"]
interval: 2s
timeout: 5s
retries: 5
environment:
- SERVICES=s3
- DEBUG=1
- DATA_DIR=/tmp/localstack/data
- DOCKER_HOST=unix:///var/run/docker.sock
- DEFAULT_REGION=us-east-1
volumes:
- "./tmp/localstack:/var/lib/localstack"
db:
image: postgres:latest
restart: always
Expand All @@ -47,7 +19,6 @@ services:
- "${DATABASE_PORT}:5432"
volumes:
- postgres_data:/var/lib/postgresql/data

app:
build: .
depends_on:
Expand All @@ -62,6 +33,5 @@ services:
- "8080:8080"
command: flask run --host=0.0.0.0 --port=8080 --reload


volumes:
postgres_data:
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
144 changes: 144 additions & 0 deletions harvester/ckan_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,147 @@ def munge_tag(tag: str) -> str:
tag = re.sub(r"[^a-zA-Z0-9\- ]", "", tag).replace(" ", "-")
tag = _munge_to_length(tag, MIN_TAG_LENGTH, MAX_TAG_LENGTH)
return tag


def create_ckan_extras(metadata: dict) -> list[dict]:
    """Build the list of CKAN "extras" entries for one dataset record.

    Each entry is a ``{"key": ..., "value": ...}`` dict. The list always
    starts with the ``resource-type`` entry and always ends with an
    ``identifier`` entry, so ``identifier`` appears twice when it is present
    in *metadata* (once from the loop, once from the trailing append).

    Args:
        metadata: Dataset metadata. Must contain ``"identifier"``; the other
            recognised keys are optional and skipped when absent.

    Returns:
        The extras entries, in a fixed key order.

    Raises:
        KeyError: If ``"identifier"`` is missing, or ``"publisher"`` is
            present without a ``"name"``.
        IndexError: If a list-valued field is empty.
    """
    extra_keys = (
        "accessLevel",
        "bureauCode",
        "identifier",
        "modified",
        "programCode",
        "publisher",
    )

    entries = [{"key": "resource-type", "value": "Dataset"}]

    for key in extra_keys:
        if key not in metadata:
            continue

        value = metadata[key]
        if key == "publisher":
            # Resolve the name first so a malformed publisher fails before
            # anything is appended; the hierarchy entry precedes the
            # publisher entry itself.
            entry = {"key": key, "value": value["name"]}
            entries.append(
                {
                    "key": "publisher_hierarchy",
                    "value": create_ckan_publisher_hierarchy(value, []),
                }
            )
        else:
            # TODO: confirm this is what we want.
            # Only the first element of a list-valued field is kept.
            if isinstance(value, list):
                value = value[0]
            entry = {"key": key, "value": value}

        entries.append(entry)

    # TODO: update this
    # entries.append(
    #     {
    #         "key": "dcat_metadata",
    #         "value": str(sort_dataset(self.metadata)),
    #     }
    # )

    # entries.append(
    #     {
    #         "key": self.harvest_source.extra_source_name,
    #         "value": self.harvest_source.title,
    #     }
    # )

    entries.append({"key": "identifier", "value": metadata["identifier"]})

    return entries


def create_ckan_tags(keywords: list[str]) -> list:
    """Convert keywords into CKAN tag dicts.

    Args:
        keywords: Keyword strings; each is passed through ``munge_tag``.

    Returns:
        One ``{"name": <munged keyword>}`` dict per keyword, in input order.
    """
    return [{"name": munge_tag(keyword)} for keyword in keywords]


def create_ckan_publisher_hierarchy(pub_dict: dict, data: "list | None" = None) -> str:
    """Flatten a nested publisher dict into a ``"Parent > Child"`` string.

    Walks *pub_dict* recursively: every ``"name"`` value is collected, and
    every dict-valued entry is descended into. Names are collected from the
    outermost dict inward and then reversed, so the most deeply nested
    publisher comes first in the result.

    Args:
        pub_dict: Publisher dict, optionally nesting parent publishers in
            dict-valued entries.
        data: Accumulator for collected names. When a list is supplied it is
            appended to in place (as before); when omitted a fresh list is
            created for this call.

    Returns:
        The collected names joined with ``" > "``, outermost ancestor first.
        An empty string when no names are found.
    """
    # A mutable default ([]) would be shared between calls and leak names
    # from earlier publishers into later results, so create it per call.
    if data is None:
        data = []

    for key, value in pub_dict.items():
        if key == "name":
            data.append(value)
        if isinstance(value, dict):
            create_ckan_publisher_hierarchy(value, data)

    return " > ".join(data[::-1])


def get_email_from_str(in_str: str) -> str:
    """Extract the first email-like substring from *in_str*.

    Args:
        in_str: Text to search, e.g. a ``mailto:`` URI.

    Returns:
        The first substring matching the email pattern, or ``None`` when
        nothing matches.
    """
    match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", in_str)
    return None if match is None else match.group(0)


def create_ckan_resources(metadata: dict) -> list[dict]:
    """Build CKAN resource dicts from a dataset's ``distribution`` entries.

    One resource is produced per URL found on a distribution, checking
    ``downloadURL`` first and then ``accessURL``; a distribution carrying
    both yields two resources. The distribution's ``mediaType`` is copied to
    the resource's ``mimetype`` when present.

    Args:
        metadata: Dataset metadata; ``"distribution"`` may be absent or
            ``None``.

    Returns:
        A list of ``{"url": ..., "mimetype": ...}`` dicts (``mimetype`` only
        when the distribution has ``mediaType``). Empty when there are no
        distributions or none has a URL.
    """
    distributions = metadata.get("distribution")
    if distributions is None:
        return []

    url_keys = ("downloadURL", "accessURL")
    resources = []

    for dist in distributions:
        for url_key in url_keys:
            url = dist.get(url_key)
            if url is None:
                continue

            resource = {"url": url}
            # The source field is "mediaType". The previous check looked for
            # a "mimetype" key, so the value was never copied for ordinary
            # input and raised KeyError when only "mimetype" was present.
            if "mediaType" in dist:
                resource["mimetype"] = dist["mediaType"]

            resources.append(resource)

    return resources


def simple_transform(metadata: dict, owner_org: str) -> dict:
    """Map the flat, directly translatable dataset fields to CKAN fields.

    The result always carries ``name`` (derived from the title via
    ``munge_title_to_name``), ``owner_org``, ``identifier`` and placeholder
    ``author`` / ``author_email`` values. ``title`` and ``description`` are
    copied to ``title`` and ``notes``. ``contactPoint`` is unpacked into
    ``maintainer`` / ``maintainer_email``, with the email address extracted
    from the ``hasEmail`` string.

    Args:
        metadata: Dataset metadata; must contain ``"title"`` and
            ``"identifier"``.
        owner_org: Value stored under ``owner_org``.

    Returns:
        The CKAN-style package dict.

    Raises:
        KeyError: If ``"title"`` or ``"identifier"`` is missing, or if
            ``contactPoint`` has a key other than ``@type`` that has no
            mapping.
    """
    field_map = {
        "contactPoint": {"fn": "maintainer", "hasEmail": "maintainer_email"},
        "description": "notes",
        "title": "title",
    }
    ignored_subfields = ("@type",)

    package = {
        "name": munge_title_to_name(metadata["title"]),
        "owner_org": owner_org,
        "identifier": metadata["identifier"],
        "author": None,  # TODO: CHANGE THIS!
        "author_email": None,  # TODO: CHANGE THIS!
    }

    for field, value in metadata.items():
        target = field_map.get(field)
        if target is None:
            continue

        # Plain one-to-one rename.
        if not isinstance(target, dict):
            package[target] = value
            continue

        # Nested object: rename each sub-field individually.
        for subfield, subvalue in value.items():
            if subfield in ignored_subfields:
                continue
            if subfield == "hasEmail":
                subvalue = get_email_from_str(subvalue)
            package[target[subfield]] = subvalue

    return package


def ckanify_dcatus(metadata: dict, owner_org: str) -> dict:
    """Convert one dataset metadata record into a CKAN package dict.

    Starts from ``simple_transform`` and then attaches ``resources``,
    ``tags`` and ``extras`` built by the dedicated helper functions.

    Args:
        metadata: Dataset metadata record.
        owner_org: Value stored under ``owner_org``.

    Returns:
        The package dict with ``resources``, ``tags`` and ``extras`` set.
        ``tags`` is an empty list when *metadata* has no ``"keyword"`` key.
    """
    package = simple_transform(metadata, owner_org)

    package["resources"] = create_ckan_resources(metadata)

    if "keyword" in metadata:
        package["tags"] = create_ckan_tags(metadata["keyword"])
    else:
        package["tags"] = []

    package["extras"] = create_ckan_extras(metadata)

    return package
155 changes: 0 additions & 155 deletions harvester/data/dcatus/jsons/arm.data.json

This file was deleted.

Loading

1 comment on commit 915e4f6

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
harvester
   __init__.py50100% 
   ckan_utils.py11366 95%
   exceptions.py420100% 
   harvest.py2374141 83%
   logger_config.py10100% 
   utils.py781313 83%
TOTAL4766087% 

Tests Skipped Failures Errors Time
43 0 💤 0 ❌ 0 🔥 1.068s ⏱️

Please sign in to comment.