Skip to content

Commit

Permalink
moved waf into extract.py, fixed tests, added make down/clean
Browse files Browse the repository at this point in the history
  • Loading branch information
robert-bryson committed Dec 19, 2023
1 parent eb4b3a1 commit 7986401
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 44 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ test: up ## Runs poetry tests, ignores ckan load
up: ## Sets up local docker environment
docker compose up -d

down: ## Shuts down local docker instance
	docker compose down

clean: ## Cleans docker images
docker compose down -v --remove-orphans

lint: ## Lints with ruff
ruff .

Expand Down
42 changes: 42 additions & 0 deletions harvester/extract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
import os

import requests
from bs4 import BeautifulSoup
from requests.exceptions import JSONDecodeError, RequestException

logger = logging.getLogger("harvester")
Expand All @@ -26,6 +28,46 @@ def download_dcatus_catalog(url):
return Exception(e)


def traverse_waf(url, files=None, file_ext=".xml", folder="/", filters=None):
    """Recursively collect file URLs from a web-accessible folder listing.

    Fetches ``url``, parses the response as HTML and inspects every
    ``<a href=...>`` anchor. Anchors whose href ends with ``file_ext`` are
    recorded as files; anchors whose href ends with ``folder`` are treated
    as sub-folders and traversed recursively.

    Args:
        url: URL of the folder listing to traverse.
        files: Optional list to append discovered file URLs to. It is
            mutated in place; a new list is created when omitted.
        file_ext: Href suffix identifying a file of interest.
        folder: Href suffix identifying a sub-folder link.
        filters: Optional collection of href values that must not be
            followed as sub-folders.

    Returns:
        The list of discovered file URLs (the same object as ``files``
        when one was supplied). Listings that do not answer with HTTP 200
        contribute nothing.
    """
    # TODO: add exception handling
    # Use None sentinels rather than mutable defaults, so results do not
    # leak from one top-level call into the next.
    files = [] if files is None else files
    filters = [] if filters is None else filters

    parent = os.path.dirname(url.rstrip("/"))

    res = requests.get(url)
    if res.status_code == 200:
        soup = BeautifulSoup(res.content, "html.parser")

        subfolders = []
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if (
                href.endswith(folder)
                # Presumably skips the listing's "parent directory" link so
                # the traversal never walks back up the tree.
                and not parent.endswith(href.rstrip("/"))
                and href not in filters
            ):
                subfolders.append(os.path.join(url, href))

            if href.endswith(file_ext):
                files.append(os.path.join(url, href))

        for subfolder in subfolders:
            # Forward file_ext and folder so custom values apply at every
            # depth, not only to the top-level listing.
            traverse_waf(
                subfolder,
                files=files,
                file_ext=file_ext,
                folder=folder,
                filters=filters,
            )

    return files


def download_waf(files):
    """Download every URL in ``files`` and return the successful fetches.

    Args:
        files: Iterable of URLs to request.

    Returns:
        A list of dicts, one per URL that answered with HTTP 200, each
        holding the source ``"url"`` and the raw response body under
        ``"content"``. URLs with any other status code are skipped.
    """
    downloaded = []
    for file_url in files:
        response = requests.get(file_url)
        if response.status_code != 200:
            continue
        downloaded.append({"url": file_url, "content": response.content})

    return downloaded


def extract(harvest_source) -> list:
"""Extracts all records from a harvest_source"""
logger.info("Hello from harvester.extract()")
Expand Down
43 changes: 0 additions & 43 deletions harvester/waf.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/extract/test_waf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from harvester.extract.waf import traverse_waf, download_waf
from harvester.extract import download_waf, traverse_waf


def test_traverse_waf(get_waf_url):
Expand Down

1 comment on commit 7986401

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
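| File | Stmts | Miss | Cover | Missing |
|---|---|---|---|---|
| harvester | | | | |
|    __init__.py | 10 | 0 | 100% | |
|    compare.py | 5 | 0 | 100% | |
|    extract.py | 48 | 7 | 85% | |
|    load.py | 85 | 2 | 98% | |
|    transform.py | 5 | 0 | 100% | |
| harvester/utils | | | | |
|    __init__.py | 2 | 0 | 100% | |
|    json.py | 4 | 0 | 100% | |
| harvester/validate | | | | |
|    __init__.py | 2 | 0 | 100% | |
|    dcat_us.py | 24 | 3 | 88% | |
| TOTAL | 185 | 12 | 94% | |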

Tests Skipped Failures Errors Time
24 0 💤 0 ❌ 0 🔥 33.616s ⏱️

Please sign in to comment.