Merge pull request #15 from GSA/define-db-sqlalchemy

Define db sqlalchemy
GSA · Aug 25, 2023 · 9aab59f · 9aab59f · github-actions · Aug 25, 2023
2 parents a5fe2c8 + b76dee4
commit 9aab59f
Show file tree

Hide file tree

Showing 10 changed files with 514 additions and 43 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -42,7 +42,7 @@ jobs:
         run: docker-compose up -d
 
       - name: Run Pytest
-        run: poetry run pytest --junitxml=pytest.xml --cov=harvester | tee pytest-coverage.txt
+        run: set -o pipefail; poetry run pytest --junitxml=pytest.xml --cov=harvester | tee pytest-coverage.txt
 
       - name: Report test coverage
         uses: MishaKav/pytest-coverage-comment@main

diff --git a/docs/diagrams/out/erd/harvester2.0 ERD.png b/docs/diagrams/out/erd/harvester2.0 ERD.png
diff --git a/docs/diagrams/src/README.md b/docs/diagrams/src/README.md
@@ -0,0 +1,12 @@
+## How do we organize and display data on data.gov?
+
+- Navigating to the [datasets](https://catalog.data.gov/dataset) page, we see the following "filters":
+  - topics
+  - topic categories
+  - dataset type
+  - tags
+  - formats
+  - organization types
+  - organizations
+  - publishers
+  - bureaus
diff --git a/docs/diagrams/src/erd.plantuml b/docs/diagrams/src/erd.plantuml
@@ -0,0 +1,102 @@
+@startuml harvester2.0 ERD 
+
+skinparam linetype ortho
+
+' ERD
+card "Entity Relationship Diagram" as entities {
+  entity "harvest_source" as source {
+    uuid : uuid (PK)
+    --
+    name: text
+    notification_emails: array( text )
+    organization_name: text
+    frequency: text
+    config: json
+    urls: array( text )
+    schema_validation_type: text
+  }
+
+  entity "harvest_record" as record {
+    uuid : uuid (PK)
+    --
+    source_id: uuid (FK)
+    job_id: uuid (FK)
+    status: text
+    s3_path: text
+  }
+
+  entity "harvest_job" as job {
+    uuid : uuid (PK)
+    --
+    source_id: uuid (FK)
+    status: text
+    date_created: datetime
+    date_finished: datetime
+    extract_started: datetime
+    extract_finished: datetime
+    compare_started: datetime
+    compare_finished: datetime
+    records_added: smallint
+    records_updated: smallint
+    records_deleted: smallint
+    records_errored: smallint
+    records_ignored: smallint
+  }
+
+  entity "harvest_error" as error {
+    uuid : uuid (PK)
+    --
+    job_id: uuid (FK)
+    record_id: uuid (FK)
+    record_reported_id: text
+    date_created: datetime
+    error_type: text
+    severity: enum
+    message: text
+
+  }
+}
+
+' lookup tables
+card "Lookup Tables" as lookup {
+
+  entity "data_format" as data_format {
+    name: text (PK)
+    --
+    mime_type: text
+  }
+}
+
+' enumerators
+card "Enumerators" as enumerators {
+
+  enum error_severity {
+    ERROR
+    CRITICAL
+  }
+
+  enum job_status {
+    CREATE
+    PROCESSING
+    COMPLETE
+  }
+
+  enum schema_validation {
+    DCATUS
+    ISO19115
+  }
+
+  enum record_status {
+    STALE
+    ACTIVE
+    INVALID
+  }
+} 
+
+' relationships
+source ||--|{ job
+source ||--|{ record
+job ||--|{ record
+job ||-|{ error
+
+@enduml
diff --git a/harvester/db/models/__init__.py b/harvester/db/models/__init__.py
@@ -0,0 +1,9 @@
+from sqlalchemy import text
+from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import DeclarativeBase, mapped_column
+
+
+class Base(DeclarativeBase):
+    id = mapped_column(
+        UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()")
+    )
diff --git a/harvester/db/models/models.py b/harvester/db/models/models.py
@@ -0,0 +1,71 @@
+from harvester.db.models import Base
+from sqlalchemy import ForeignKey, SMALLINT
+from sqlalchemy import String, DateTime, Enum
+from sqlalchemy.dialects.postgresql import JSON, UUID, ARRAY
+from sqlalchemy.orm import mapped_column
+from sqlalchemy.sql import func
+import enum
+
+
+class SeverityEnum(enum.Enum):
+    error = "ERROR"
+    critical = "CRITICAL"
+
+
+class HarvestSource(Base):
+    __tablename__ = "harvest_source"
+    __table_args__ = {"comment": "Contains information for each harvest source"}
+
+    name = mapped_column(String, nullable=False)
+    notification_emails = mapped_column(ARRAY(String), nullable=False)
+    organization_name = mapped_column(String, nullable=False)
+    frequency = mapped_column(String, nullable=False)  # enum?
+    config = mapped_column(JSON)
+    urls = mapped_column(ARRAY(String), nullable=False)
+    schema_validation_type = mapped_column(String, nullable=False)  # enum?
+
+
+class HarvestJob(Base):
+    __tablename__ = "harvest_job"
+    __table_args__ = {
+        "comment": "Contains job state information run through the pipeline"
+    }
+
+    source_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_source.id"))
+    status = mapped_column(String, nullable=False)  # enum?
+    date_created = mapped_column(DateTime(timezone=True), server_default=func.now())
+    date_finished = mapped_column(DateTime(timezone=True))
+    extract_started = mapped_column(DateTime(timezone=True))
+    extract_finished = mapped_column(DateTime(timezone=True))
+    compare_started = mapped_column(DateTime(timezone=True))
+    compare_finished = mapped_column(DateTime(timezone=True))
+    records_added = mapped_column(SMALLINT)
+    records_updated = mapped_column(SMALLINT)
+    records_deleted = mapped_column(SMALLINT)
+    records_errored = mapped_column(SMALLINT)
+    records_ignored = mapped_column(SMALLINT)
+
+
+class HarvestError(Base):
+    __tablename__ = "harvest_error"
+    __table_args__ = {"comment": "Table to contain all errors in the pipeline"}
+
+    job_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_job.id"))
+    record_id = mapped_column(UUID(as_uuid=True))
+    record_reported_id = mapped_column(String)
+    date_created = mapped_column(DateTime(timezone=True), server_default=func.now())
+    error_type = mapped_column(String)  # enum?
+    severity = mapped_column(
+        Enum(SeverityEnum, values_callable=lambda enum: [e.value for e in enum])
+    )
+    message = mapped_column(String)
+
+
+class HarvestRecord(Base):
+    __tablename__ = "harvest_record"
+    __table_args__ = {"comment": "Table to contain records"}
+
+    job_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_job.id"))
+    source_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_source.id"))
+    status = mapped_column(String)  # enum?
+    s3_path = mapped_column(String)
File	Stmts	Miss	Cover	Missing
harvester
__init__.py	3	0	100%
harvester/db/models
__init__.py	5	0	100%
models.py	53	0	100%
harvester/extract
__init__.py	19	2	2	89%
dcatus.py	11	2	2	82%
harvester/utils
__init__.py	0	0	100%
json.py	22	6	6	73%
pg.py	35	4	4	89%
s3.py	24	6	6	75%
harvester/validate
__init__.py	0	0	100%
dcat_us.py	24	0	100%
TOTAL	196	20	90%