Skip to content

Commit

Permalink
Merge pull request #15 from GSA/define-db-sqlalchemy
Browse files Browse the repository at this point in the history
Define db sqlalchemy
  • Loading branch information
nickumia-reisys authored Aug 25, 2023
2 parents a5fe2c8 + b76dee4 commit 9aab59f
Show file tree
Hide file tree
Showing 10 changed files with 514 additions and 43 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
run: docker-compose up -d

- name: Run Pytest
run: poetry run pytest --junitxml=pytest.xml --cov=harvester | tee pytest-coverage.txt
run: set -o pipefail; poetry run pytest --junitxml=pytest.xml --cov=harvester | tee pytest-coverage.txt

- name: Report test coverage
uses: MishaKav/pytest-coverage-comment@main
Expand Down
Binary file added docs/diagrams/out/erd/harvester2.0 ERD.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 12 additions & 0 deletions docs/diagrams/src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
## How do we organize and display data on data.gov?

- Navigating to the [datasets](https://catalog.data.gov/dataset) page, we see the following "filters":
- topics
- topic categories
- dataset type
- tags
- formats
- organization types
- organizations
- publishers
- bureaus
102 changes: 102 additions & 0 deletions docs/diagrams/src/erd.plantuml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
@startuml harvester2.0 ERD

skinparam linetype ortho

' ERD
card "Entity Relationship Diagram" as entities {
  ' A configured harvest source; field names mirror the ORM model columns.
  entity "harvest_source" as source {
    uuid : uuid (PK)
    --
    name: text
    notification_emails: array( text )
    organization_name: text
    frequency: text
    config: json
    urls: array( text )
    schema_validation_type: text
  }

  ' A single record, tied to both its source and the job that produced it.
  entity "harvest_record" as record {
    uuid : uuid (PK)
    --
    source_id: uuid (FK)
    job_id: uuid (FK)
    status: text
    s3_path: text
  }

  ' One run of the pipeline for a source, with stage timestamps and counters.
  entity "harvest_job" as job {
    uuid : uuid (PK)
    --
    source_id: uuid (FK)
    status: text
    date_created: datetime
    date_finished: datetime
    extract_started: datetime
    extract_finished: datetime
    compare_started: datetime
    compare_finished: datetime
    records_added: smallint
    records_updated: smallint
    records_deleted: smallint
    records_errored: smallint
    records_ignored: smallint
  }

  ' An error raised while running a job, optionally pointing at a record.
  entity "harvest_error" as error {
    uuid : uuid (PK)
    --
    job_id: uuid (FK)
    record_id: uuid (FK)
    record_reported_id: text
    date_created: datetime
    error_type: text
    severity: enum
    message: text
  }
}

' lookup tables
card "Lookup Tables" as lookup {
  ' Maps a format name to its MIME type.
  entity "data_format" as data_format {
    name: text (PK)
    --
    mime_type: text
  }
}

' enumerators
card "Enumerators" as enumerators {
  ' Values for harvest_error.severity
  enum error_severity {
    ERROR
    CRITICAL
  }

  ' Values for harvest_job.status
  enum job_status {
    CREATE
    PROCESSING
    COMPLETE
  }

  ' Values for harvest_source.schema_validation_type
  enum schema_validation {
    DCATUS
    ISO19115
  }

  ' Values for harvest_record.status
  enum record_status {
    STALE
    ACTIVE
    INVALID
  }
}

' relationships
' Each link reads "exactly one" on the left, "one or many" on the right.
source ||--|{ job
source ||--|{ record
job ||--|{ record
job ||--|{ error

@enduml
9 changes: 9 additions & 0 deletions harvester/db/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import DeclarativeBase, mapped_column


class Base(DeclarativeBase):
    """Declarative base class for the harvester ORM models.

    Declares a UUID primary-key column named ``id``. Its default is the
    SQL expression ``gen_random_uuid()``, evaluated by the database server
    rather than generated in Python.
    """

    # NOTE(review): gen_random_uuid() must be available on the target
    # database (presumably PostgreSQL, given the dialect-specific UUID
    # type) -- confirm for the deployed server version.
    id = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=text("gen_random_uuid()"),
    )
71 changes: 71 additions & 0 deletions harvester/db/models/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from harvester.db.models import Base
from sqlalchemy import ForeignKey, SMALLINT
from sqlalchemy import String, DateTime, Enum
from sqlalchemy.dialects.postgresql import JSON, UUID, ARRAY
from sqlalchemy.orm import mapped_column
from sqlalchemy.sql import func
import enum


class SeverityEnum(enum.Enum):
    """Severity levels that can be attached to a harvest error.

    Member names are lowercase; member values are the uppercase strings.
    """

    error = "ERROR"
    critical = "CRITICAL"


class HarvestSource(Base):
    """ORM model for the ``harvest_source`` table."""

    __tablename__ = "harvest_source"
    __table_args__ = {
        "comment": "Contains information for each harvest source",
    }

    # Columns are declared in table order; keep the sequence stable.
    name = mapped_column(String, nullable=False)
    notification_emails = mapped_column(ARRAY(String), nullable=False)
    organization_name = mapped_column(String, nullable=False)
    # TODO: consider replacing the free-form string with an enum.
    frequency = mapped_column(String, nullable=False)
    # The only column here that is not declared nullable=False.
    config = mapped_column(JSON)
    urls = mapped_column(ARRAY(String), nullable=False)
    # TODO: consider replacing the free-form string with an enum.
    schema_validation_type = mapped_column(String, nullable=False)


class HarvestJob(Base):
    """ORM model for the ``harvest_job`` table.

    Holds a job's status, the source it belongs to, timestamps for the
    extract and compare stages, and per-outcome record counters.
    """

    __tablename__ = "harvest_job"
    __table_args__ = {"comment": "Contains job state information run through the pipeline"}

    # Owning harvest source.
    source_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_source.id"))
    # TODO: consider replacing the free-form string with an enum.
    status = mapped_column(String, nullable=False)

    # Timestamps are timezone-aware; date_created is filled in by the
    # database through the SQL now() function.
    date_created = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )
    date_finished = mapped_column(DateTime(timezone=True))
    extract_started = mapped_column(DateTime(timezone=True))
    extract_finished = mapped_column(DateTime(timezone=True))
    compare_started = mapped_column(DateTime(timezone=True))
    compare_finished = mapped_column(DateTime(timezone=True))

    # Per-outcome record counters.
    # NOTE(review): SMALLINT is a 2-byte integer on PostgreSQL (max 32767);
    # confirm no single job is expected to touch more records than that.
    records_added = mapped_column(SMALLINT)
    records_updated = mapped_column(SMALLINT)
    records_deleted = mapped_column(SMALLINT)
    records_errored = mapped_column(SMALLINT)
    records_ignored = mapped_column(SMALLINT)


class HarvestError(Base):
    """ORM model for the ``harvest_error`` table.

    Each row references the job it occurred in and carries a severity,
    an error type and a message.
    """

    __tablename__ = "harvest_error"
    __table_args__ = {"comment": "Table to contain all errors in the pipeline"}

    job_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_job.id"))
    # NOTE(review): declared without a ForeignKey, unlike job_id -- confirm
    # whether this is meant to reference harvest_record.id.
    record_id = mapped_column(UUID(as_uuid=True))
    record_reported_id = mapped_column(String)
    # Filled in by the database through the SQL now() function.
    date_created = mapped_column(DateTime(timezone=True), server_default=func.now())
    # TODO: consider replacing the free-form string with an enum.
    error_type = mapped_column(String)
    # values_callable hands SQLAlchemy the member *values* ("ERROR",
    # "CRITICAL") instead of the lowercase member names. The lambda
    # parameter is deliberately not called ``enum`` so it does not shadow
    # the imported ``enum`` module.
    severity = mapped_column(
        Enum(
            SeverityEnum,
            values_callable=lambda members: [member.value for member in members],
        )
    )
    message = mapped_column(String)


class HarvestRecord(Base):
    """ORM model for the ``harvest_record`` table."""

    __tablename__ = "harvest_record"
    __table_args__ = {
        "comment": "Table to contain records",
    }

    # References to the job and the source this record is linked to.
    job_id = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("harvest_job.id"),
    )
    source_id = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("harvest_source.id"),
    )
    # TODO: consider replacing the free-form string with an enum.
    status = mapped_column(String)
    s3_path = mapped_column(String)
Loading

1 comment on commit 9aab59f

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
harvester
   __init__.py30100% 
harvester/db/models
   __init__.py50100% 
   models.py530100% 
harvester/extract
   __init__.py1922 89%
   dcatus.py1122 82%
harvester/utils
   __init__.py00100% 
   json.py2266 73%
   pg.py3544 89%
   s3.py2466 75%
harvester/validate
   __init__.py00100% 
   dcat_us.py240100% 
TOTAL1962090% 

Tests Skipped Failures Errors Time
29 0 💤 0 ❌ 0 🔥 22.543s ⏱️

Please sign in to comment.