
Define db sqlalchemy #15

Merged
merged 25 commits on Aug 25, 2023
Changes from 24 commits
Commits
25 commits
277d8b6
add sqlalchemy
rshewitt Aug 3, 2023
b3bb3a6
create sqlalchemy utils
rshewitt Aug 3, 2023
f44632c
add declarative models.
rshewitt Aug 3, 2023
540d9c5
add plantuml ERD
rshewitt Aug 10, 2023
cee72da
information for potential data model
rshewitt Aug 11, 2023
2ba07cb
change owner entity name. add lookups.
rshewitt Aug 11, 2023
b21ba59
update to latest diagram.
rshewitt Aug 11, 2023
33d5edd
reorganize.
rshewitt Aug 11, 2023
ab9731c
update to latest diagram.
rshewitt Aug 11, 2023
53f2db3
move file
rshewitt Aug 11, 2023
ab2ee08
update ERD to reflect models and export as png.
rshewitt Aug 15, 2023
9a5b728
remove unneeded repr function.
rshewitt Aug 15, 2023
1e68719
update datetime columns and add harvest record model.
rshewitt Aug 15, 2023
c6b1ec6
add fixtures.
rshewitt Aug 15, 2023
c52bf48
add tests for adding records to associated models/tables.
rshewitt Aug 15, 2023
3c7426d
removed unneeded module.
rshewitt Aug 15, 2023
5cc3faf
update for ruff.
rshewitt Aug 15, 2023
2c1ec64
black/isort lint
robert-bryson Aug 16, 2023
7b081f3
remove unneeded imports. move functions to conftest.
rshewitt Aug 17, 2023
9c89fd0
add new fixtures. update existing fixtures.
rshewitt Aug 17, 2023
1be0015
add record update test. parametrize tests.
rshewitt Aug 17, 2023
c75d5ea
Merge remote-tracking branch 'refs/remotes/origin/define-db-sqlalchem…
rshewitt Aug 17, 2023
a1cfc7f
prevent error codes from being masked.
rshewitt Aug 17, 2023
7bb874c
add psycopg3 dialect to connection string.
rshewitt Aug 17, 2023
b76dee4
update to reflect enum in erd.plantuml.
rshewitt Aug 25, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -42,7 +42,7 @@ jobs:
        run: docker-compose up -d

      - name: Run Pytest
-       run: poetry run pytest --junitxml=pytest.xml --cov=harvester | tee pytest-coverage.txt
+       run: set -o pipefail; poetry run pytest --junitxml=pytest.xml --cov=harvester | tee pytest-coverage.txt

      - name: Report test coverage
        uses: MishaKav/pytest-coverage-comment@main
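The `set -o pipefail` addition matters because `tee` is the last command in the pipeline and would otherwise determine the step's exit status, hiding a failing pytest run from CI. A standalone illustration of the difference (not part of this PR) in Python:

```python
import subprocess

# Without pipefail, the pipeline's exit status is tee's (0), so the failure of
# the first command is masked; with pipefail the failure propagates.
masked = subprocess.run("false | tee /dev/null", shell=True, executable="/bin/bash")
propagated = subprocess.run(
    "set -o pipefail; false | tee /dev/null", shell=True, executable="/bin/bash"
)
print(masked.returncode, propagated.returncode)  # prints: 0 1
```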
Binary file added docs/diagrams/out/erd/harvester2.0 ERD.png
12 changes: 12 additions & 0 deletions docs/diagrams/src/README.md
@@ -0,0 +1,12 @@
## How do we organize and display data on data.gov?

- Navigating to the [datasets](https://catalog.data.gov/dataset) page, we see the following "filters":
  - topics
  - topic categories
  - dataset type
  - tags
  - formats
  - organization types
  - organizations
  - publishers
  - bureaus
102 changes: 102 additions & 0 deletions docs/diagrams/src/erd.plantuml
@@ -0,0 +1,102 @@
@startuml harvester2.0 ERD

skinparam linetype ortho

' ERD
card "Entity Relationship Diagram" as entities {
entity "harvest_source" as source {
uuid : uuid (PK)
--
name: text
notification_emails: array(text)
organization_name: text
frequency: text
config: json
urls: array(text)
schema_validation_type: text
}

entity "harvest_record" as record {
uuid : uuid (PK)
--
source_id: uuid (FK)
job_id: uuid (FK)
status: text
s3_path: text
}

entity "harvest_job" as job {
uuid : uuid (PK)
--
source_id: uuid (FK)
status: text
date_created: datetime
date_finished: datetime
extract_started: datetime
extract_finished: datetime
compare_started: datetime
compare_finished: datetime
records_added: smallint
records_updated: smallint
records_deleted: smallint
records_errored: smallint
records_ignored: smallint
}

entity "harvest_error" as error {
uuid : uuid (PK)
--
job_id: uuid (FK)
record_id: uuid (FK)
record_reported_id: text
date_created: datetime
error_type: text
severity: enum
message: text

}
}

' lookup tables
card "Lookup Tables" as lookup {

entity "data_format" as data_format {
name: text (PK)
--
mime_type: text
}
}

' enumerators
card "Enumerators" as enumerators {

enum error_severity {
ERROR
CRITICAL
}

enum job_status {
CREATE
PROCESSING
COMPLETE
}

enum schema_validation {
DCATUS
ISO1911
}

enum record_status {
STALE
ACTIVE
INVALID
}
}

' relationships
source ||--|{ job
source ||--|{ record
job ||--|{ record
job ||--|{ error

@enduml
9 changes: 9 additions & 0 deletions harvester/db/models/__init__.py
@@ -0,0 +1,9 @@
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import DeclarativeBase, mapped_column


class Base(DeclarativeBase):
    id = mapped_column(
        UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()")
    )
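Because `id` is declared once on this shared `Base`, every model in the PR inherits the same server-generated UUID primary key via `gen_random_uuid()`. A minimal usage sketch (not part of this PR — the connection URL and sample values are placeholders, `HarvestSource` is defined in `harvester/db/models/models.py` below, and the `postgresql+psycopg` dialect reflects the psycopg3 commit above):

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from harvester.db.models import Base
from harvester.db.models.models import HarvestSource

# placeholder URL; the real connection string comes from the app's configuration
engine = create_engine("postgresql+psycopg://user:password@localhost:5432/harvestdb")
Base.metadata.create_all(engine)

with Session(engine) as session:
    source = HarvestSource(
        name="example-source",
        notification_emails=["admin@example.com"],
        organization_name="Example Org",
        frequency="daily",
        urls=["https://example.com/data.json"],
        schema_validation_type="DCATUS",
    )
    session.add(source)
    session.commit()
    source_id = source.id  # UUID assigned server-side by gen_random_uuid()
```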
71 changes: 71 additions & 0 deletions harvester/db/models/models.py
@@ -0,0 +1,71 @@
from harvester.db.models import Base
from sqlalchemy import ForeignKey, SMALLINT
from sqlalchemy import String, DateTime, Enum
from sqlalchemy.dialects.postgresql import JSON, UUID, ARRAY
from sqlalchemy.orm import mapped_column
from sqlalchemy.sql import func
import enum


class SeverityEnum(enum.Enum):
    error = "ERROR"
    critical = "CRITICAL"


class HarvestSource(Base):
    __tablename__ = "harvest_source"
    __table_args__ = {"comment": "Contains information for each harvest source"}

    name = mapped_column(String, nullable=False)
    notification_emails = mapped_column(ARRAY(String), nullable=False)
    organization_name = mapped_column(String, nullable=False)
    frequency = mapped_column(String, nullable=False)  # enum?
    config = mapped_column(JSON)
    urls = mapped_column(ARRAY(String), nullable=False)
    schema_validation_type = mapped_column(String, nullable=False)  # enum?


class HarvestJob(Base):
    __tablename__ = "harvest_job"
    __table_args__ = {
        "comment": "Contains job state information run through the pipeline"
    }

    source_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_source.id"))
    status = mapped_column(String, nullable=False)  # enum?
    date_created = mapped_column(DateTime(timezone=True), server_default=func.now())
    date_finished = mapped_column(DateTime(timezone=True))
    extract_started = mapped_column(DateTime(timezone=True))
    extract_finished = mapped_column(DateTime(timezone=True))
    compare_started = mapped_column(DateTime(timezone=True))
    compare_finished = mapped_column(DateTime(timezone=True))
    records_added = mapped_column(SMALLINT)
    records_updated = mapped_column(SMALLINT)
    records_deleted = mapped_column(SMALLINT)
    records_errored = mapped_column(SMALLINT)
    records_ignored = mapped_column(SMALLINT)


class HarvestError(Base):
    __tablename__ = "harvest_error"
    __table_args__ = {"comment": "Table to contain all errors in the pipeline"}

    job_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_job.id"))
    record_id = mapped_column(UUID(as_uuid=True))
    record_reported_id = mapped_column(String)
    date_created = mapped_column(DateTime(timezone=True), server_default=func.now())
    error_type = mapped_column(String)  # enum?
    severity = mapped_column(
        Enum(SeverityEnum, values_callable=lambda enum: [e.value for e in enum])
    )
    message = mapped_column(String)


class HarvestRecord(Base):
    __tablename__ = "harvest_record"
    __table_args__ = {"comment": "Table to contain records"}

    job_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_job.id"))
    source_id = mapped_column(UUID(as_uuid=True), ForeignKey("harvest_source.id"))
    status = mapped_column(String)  # enum?
    s3_path = mapped_column(String)
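To show how the foreign keys chain together and what `values_callable` does, here is a hedged sketch (not part of this PR; it assumes the `engine` and the persisted `source_id` from the earlier sketch, and the error details are invented):

```python
from sqlalchemy.orm import Session

from harvester.db.models.models import (
    HarvestError,
    HarvestJob,
    HarvestRecord,
    SeverityEnum,
)

with Session(engine) as session:
    job = HarvestJob(source_id=source_id, status="CREATE")
    session.add(job)
    session.flush()  # INSERT ... RETURNING populates job.id for the rows below

    record = HarvestRecord(job_id=job.id, source_id=source_id, status="ACTIVE")
    error = HarvestError(
        job_id=job.id,
        error_type="ValidationException",
        # values_callable above means the database enum stores the values
        # ("ERROR"/"CRITICAL"), not the Python member names ("error"/"critical")
        severity=SeverityEnum.error,
        message="dataset failed schema validation",
    )
    session.add_all([record, error])
    session.commit()
```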