Skip to content

Commit

Permalink
Merge pull request #95 from GSA/load-test-fixes
Browse files Browse the repository at this point in the history
Adds Harvest Source Size to Model
  • Loading branch information
btylerburton authored Aug 29, 2024
2 parents 940fa9e + f7db4e6 commit 7ca8ee7
Show file tree
Hide file tree
Showing 14 changed files with 53 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ When there are database DDL changes, use following to do the database update:
```bash
app_name: harvesting-logic
database_name: harvesting-logic-db
route-external: harvester-dev-datagov.app.cloud.gov
route_external: harvester-dev-datagov.app.cloud.gov
```

2. Deploy the application using Cloud Foundry's `cf push` command with the variable file:
Expand Down
5 changes: 5 additions & 0 deletions app/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ class HarvestSourceForm(FlaskForm):
choices=["manual", "daily", "weekly", "biweekly", "monthly"],
validators=[DataRequired()],
)
size = SelectField(
"Size",
choices=["small", "medium", "large"],
validators=[DataRequired()],
)
schema_type = SelectField(
"Schema Type", choices=["strict", "other"], validators=[DataRequired()]
)
Expand Down
8 changes: 3 additions & 5 deletions app/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,9 +431,7 @@ def add_harvest_source():
@mod.route("/harvest_source/<source_id>", methods=["GET"])
def view_harvest_source_data(source_id: str):
source = db.get_harvest_source(source_id)
jobs = db.get_all_harvest_jobs_by_filter(
{"harvest_source_id": source.id, "status": "complete"}
)
jobs = db.get_all_harvest_jobs_by_filter({"harvest_source_id": source.id})
next_job = "N/A"
future_jobs = db.get_new_harvest_jobs_by_source_in_future(source.id)
if len(future_jobs):
Expand Down Expand Up @@ -623,9 +621,9 @@ def get_harvest_errors_by_job(job_id, error_type):
try:
match error_type:
case "job":
return db.get_harvest_job_errors_by_job(job_id)
return db._to_dict(db.get_harvest_job_errors_by_job(job_id))
case "record":
return db.get_harvest_record_errors_by_job(job_id)
return db._to_dict(db.get_harvest_record_errors_by_job(job_id))
except Exception:
return "Please provide correct job_id"

Expand Down
16 changes: 13 additions & 3 deletions app/scripts/load_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@

logger = logging.getLogger("harvest_admin")

TASK_SIZE_ENUM = {
"medium": (2048, 768),
"large": (4096, 1536),
}


def create_cf_handler():
# check for correct env vars to init CFHandler
Expand All @@ -27,7 +32,7 @@ def create_cf_handler():
return CFHandler(CF_API_URL, CF_SERVICE_USER, CF_SERVICE_AUTH)


def create_task(job_id, cf_handler=None):
def create_task(job_id, size, cf_handler=None):
task_contract = {
"app_guuid": HARVEST_RUNNER_APP_GUID,
"command": f"python harvester/harvest.py {job_id}",
Expand All @@ -36,6 +41,10 @@ def create_task(job_id, cf_handler=None):
if cf_handler is None:
cf_handler = create_cf_handler()

if size != "small" and size in TASK_SIZE_ENUM:
task_contract["memory_in_mb"] = TASK_SIZE_ENUM[size][0]
task_contract["disk_in_mb"] = TASK_SIZE_ENUM[size][1]

cf_handler.start_task(**task_contract)
updated_job = interface.update_harvest_job(job_id, {"status": "in_progress"})
message = f"Updated job {updated_job.id} to in_progress"
Expand Down Expand Up @@ -63,7 +72,7 @@ def trigger_manual_job(source_id):
logger.info(
f"Created new manual harvest job: for {job_data.harvest_source_id}."
)
return create_task(job_data.id)
return create_task(job_data.id, source.size)


def schedule_first_job(source_id):
Expand Down Expand Up @@ -121,5 +130,6 @@ def load_manager():
# then mark that job(s) as running in the DB
logger.info("Load Manager :: Updated Harvest Jobs")
for job in jobs[:slots]:
create_task(job.id, cf_handler)
source = interface.get_harvest_source(job.harvest_source_id)
create_task(job.id, source.size, cf_handler)
schedule_next_job(job.harvest_source_id)
2 changes: 2 additions & 0 deletions app/templates/view_source_data.html
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ <h2>Harvest Jobs</h2>
<thead>
<tr>
<th data-sortable scope="col" role="columnheader">Id</th>
<th data-sortable scope="col" role="columnheader">Status</th>
<th data-sortable scope="col" role="columnheader">Date Created</th>
<th data-sortable scope="col" role="columnheader">Date Finished</th>
<th data-sortable scope="col" role="columnheader">Records Added</th>
Expand All @@ -85,6 +86,7 @@ <h2>Harvest Jobs</h2>
<tr>
<th scope="row"><a href="{{ url_for('harvest.get_harvest_job', job_id=job.id) }}">{{job.id}}</a>
</th>
<td data-sort-value={jobs.date_created}> {{job.status}}</td>
<td data-sort-value={jobs.date_created}> {{job.date_created}}</td>
<td data-sort-value={jobs.date_finished}>{{job.date_finished}} </td>
<td data-sort-value={jobs.records_added}>{{job.records_added}}</td>
Expand Down
1 change: 1 addition & 0 deletions database/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class HarvestSource(db.Model):
index=True,
)
url = db.Column(db.String, nullable=False, unique=True)
size = db.Column(Enum("small", "medium", "large", name="size"))
schema_type = db.Column(db.String, nullable=False)
source_type = db.Column(db.String, nullable=False)
status = db.Column(db.String)
Expand Down
9 changes: 8 additions & 1 deletion harvester/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def __init__(self, msg, harvest_job_id):

self.db_interface.add_harvest_job_error(error_data)
self.db_interface.update_harvest_job(harvest_job_id, job_status)
self.log_err()

def log_err(self):
self.logger.critical(self.msg, exc_info=True)


Expand Down Expand Up @@ -65,11 +68,15 @@ def __init__(self, msg, harvest_job_id, harvest_record_id):

self.db_interface.add_harvest_record_error(error_data)
self.db_interface.update_harvest_record(harvest_record_id, {"status": "error"})
self.log_err()

def log_err(self):
self.logger.error(self.msg, exc_info=True)


class ValidationException(HarvestNonCriticalException):
pass
def log_err(self):
pass


class TranformationException(HarvestNonCriticalException):
Expand Down
13 changes: 8 additions & 5 deletions harvester/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,8 @@ def compare(self) -> None:
def get_record_changes(self) -> None:
"""determine which records needs to be updated, deleted, or created"""
logger.info(f"getting records changes for {self.name} using {self.url}")
self.prepare_internal_data()
self.prepare_external_data()
self.prepare_internal_data()
self.compare()

def write_compare_to_db(self) -> dict:
Expand Down Expand Up @@ -319,26 +319,29 @@ def report(self) -> None:
None: 0,
}
actual_results_status = {"success": 0, "error": 0, None: 0}
validity = {"valid": 0, "invalid": 0}
validity = {"valid": 0, "invalid": 0, "ignored": 0}

for record_id, record in self.external_records.items():
# action
actual_results_action[record.action] += 1
if record.status != "error":
actual_results_action[record.action] += 1
# status
actual_results_status[record.status] += 1
# validity
if record.valid:
validity["valid"] += 1
else:
elif not record.valid:
validity["invalid"] += 1
else:
validity["not_validated"] += 1

# what actually happened?
logger.info("actual actions completed")
logger.info(actual_results_action)

# what actually happened?
logger.info("actual status completed")
logger.info(actual_results_action)
logger.info(actual_results_status)

# what's our record validity count?
logger.info("validity of the records")
Expand Down
6 changes: 4 additions & 2 deletions harvester/lib/cf_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ def setup(self):
self.client.init_with_user_credentials(self.user, self.password)
self.task_mgr = TaskManager(self.url, self.client)

def start_task(self, app_guuid, command, task_id):
return self.task_mgr.create(app_guuid, command, task_id)
def start_task(self, app_guuid, command, task_id, memory_in_mb=512, disk_in_mb=512):
return self.task_mgr.create(
app_guuid, command, task_id, memory_in_mb, disk_in_mb
)

def stop_task(self, task_id):
return self.task_mgr.cancel(task_id)
Expand Down
2 changes: 1 addition & 1 deletion manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ applications:
buildpacks:
- python_buildpack
routes:
- route: ((route-external))
- route: ((route_external))
services:
- ((app_name))-db
- ((app_name))-secrets
Expand Down
4 changes: 4 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def source_data_dcatus(organization_data: dict) -> dict:
"notification_emails": "[email protected]",
"organization_id": organization_data["id"],
"frequency": "daily",
"size": "small",
"url": f"{HARVEST_SOURCE_URL}/dcatus/dcatus.json",
"schema_type": "type1",
"source_type": "dcatus",
Expand All @@ -119,6 +120,7 @@ def source_data_dcatus_2(organization_data: dict) -> dict:
"notification_emails": "[email protected]",
"organization_id": organization_data["id"],
"frequency": "daily",
"size": "medium",
"url": f"{HARVEST_SOURCE_URL}/dcatus/dcatus_2.json",
"schema_type": "type1",
"source_type": "dcatus",
Expand All @@ -134,6 +136,7 @@ def source_data_dcatus_same_title(organization_data: dict) -> dict:
"notification_emails": "[email protected]",
"organization_id": organization_data["id"],
"frequency": "daily",
"size": "small",
"url": f"{HARVEST_SOURCE_URL}/dcatus/dcatus_same_title.json",
"schema_type": "type1",
"source_type": "dcatus",
Expand All @@ -154,6 +157,7 @@ def source_data_waf(organization_data: dict) -> dict:
"notification_emails": "[email protected]",
"organization_id": organization_data["id"],
"frequency": "daily",
"size": "small",
"url": f"{HARVEST_SOURCE_URL}/waf/",
"schema_type": "type1",
"source_type": "waf",
Expand Down
2 changes: 1 addition & 1 deletion vars.development.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
app_name: datagov-harvest
route-external: datagov-harvest-admin-dev.app.cloud.gov
route_external: datagov-harvest-admin-dev.app.cloud.gov
CF_API_URL: https://api.fr.cloud.gov
CKAN_API_URL: https://catalog-next-dev-admin-datagov.app.cloud.gov
HARVEST_RUNNER_APP_GUID: e6a8bba8-ed6d-4200-8280-67b46cebdc63
Expand Down
2 changes: 1 addition & 1 deletion vars.prod.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
app_name: datagov-harvest
route-external: datagov-harvest-admin-prod.app.cloud.gov
route_external: datagov-harvest-admin-prod.app.cloud.gov
CF_API_URL: https://api.fr.cloud.gov
CKAN_API_URL: https://catalog.data.gov
HARVEST_RUNNER_APP_GUID: null
Expand Down
2 changes: 1 addition & 1 deletion vars.staging.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
app_name: datagov-harvest
route-external: datagov-harvest-admin-stage.app.cloud.gov
route_external: datagov-harvest-admin-stage.app.cloud.gov
CF_API_URL: https://api.fr.cloud.gov
CKAN_API_URL: https://catalog-stage.data.gov
HARVEST_RUNNER_APP_GUID: null
Expand Down

2 comments on commit 7ca8ee7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests Skipped Failures Errors Time
2 0 💤 0 ❌ 0 🔥 6.510s ⏱️

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests Skipped Failures Errors Time
2 0 💤 0 ❌ 0 🔥 8.139s ⏱️

Please sign in to comment.