Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ test-results
node_modules/
.vscode/
/test-env
/ynr/apps/sopn_parsing/tests/data/sopn_baseline.json
/ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json
# PyCharm
.idea/

Expand Down
45 changes: 0 additions & 45 deletions Makefile

This file was deleted.

76 changes: 0 additions & 76 deletions ynr/apps/bulk_adding/tests/test_bulk_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self):
self.assertContains(resp, "Review candidates")
resp = form.submit()
self.assertContains(resp, "Bart Simpson")

def test_fall_back_to_camelot_if_no_textract(self):
data = {"name": "Bart", "party_id": "PP52"}

raw_people = RawPeople.objects.create(
ballot=self.dulwich_post_ballot,
data=[data],
source_type=RawPeople.SOURCE_PARSED_PDF,
)

self.assertEqual(
raw_people.as_form_kwargs(),
{
"initial": [
{
"name": "Bart",
"party": ["PP52", "PP52"],
"previous_party_affiliations": [],
"source": "",
}
]
},
)
raw_people.delete()

textract_data = {"name": "Lisa", "party_id": "PP53"}
raw_people = RawPeople.objects.create(
ballot=self.dulwich_post_ballot,
data=[data],
textract_data=[textract_data],
source_type=RawPeople.SOURCE_PARSED_PDF,
)

self.assertEqual(
raw_people.as_form_kwargs(),
{
"initial": [
{
"name": "Lisa",
"party": ["PP53", "PP53"],
"previous_party_affiliations": [],
"source": "",
}
]
},
)

def test_can_change_parser_in_frontend(self):
"""
Check that a query param can change the parser we use
"""
BallotSOPN.objects.create(
source_url="http://example.com",
ballot=self.dulwich_post_ballot,
uploaded_file="sopn.pdf",
)
RawPeople.objects.create(
ballot=self.dulwich_post_ballot,
data=[{"name": "Bart", "party_id": "PP52"}],
textract_data=[{"name": "Lisa", "party_id": "PP53"}],
source_type=RawPeople.SOURCE_PARSED_PDF,
)
response = self.app.get(
"/bulk_adding/sopn/parl.65808.2015-05-07/", user=self.user
)
form = response.forms["bulk_add_form"]
# This should be the Textract data
self.assertEqual(form.fields["form-0-name"][0].value, "Lisa")

response = self.app.get(
"/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1",
user=self.user,
)
form = response.forms["bulk_add_form"]
# This should be the Textract data
self.assertEqual(form.fields["form-0-name"][0].value, "Bart")
4 changes: 0 additions & 4 deletions ynr/apps/bulk_adding/views/sopns.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs):
return super().get(request, *args, **kwargs)

def get_active_parser(self) -> Optional[SOPNParsingBackends]:
if self.request.GET.get("v1_parser"):
return SOPNParsingBackends.CAMELOT
if self.ballot.rawpeople.textract_data:
return SOPNParsingBackends.TEXTRACT
if self.ballot.rawpeople.data:
return SOPNParsingBackends.CAMELOT
return None

def get_context_data(self, **kwargs):
Expand Down
16 changes: 0 additions & 16 deletions ynr/apps/elections/templates/elections/includes/_sopn_debug.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,12 @@ <h3>Parsing Status</h3>
<ul>
<li>Pages matched: {% if object.sopn.get_pages %}Yes (matched pages: {{ object.sopn.get_pages|join:", " }}
){% else %}No{% endif %}</li>
<li>Camelot tables extracted: {% if object.sopn.camelotparsedsopn %}Yes{% else %}No{% endif %}</li>
<li>Raw Person Data: {% if object.rawpeople %}Yes{% else %}No{% endif %}</li>
<li>AWS Textract Data: {% if textract_parsed.raw_data %}Yes{% else %}No{% endif %}</li>
<li>AWS Textract Parsed? {% if textract_parsed.parsed_data %}Yes{% else %}
No{% endif %}</li>
</ul>

<h3>Camelot raw Data</h3>
{% if object.sopn.camelotparsedsopn.raw_data %}
<pre>{{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}</pre>
{% else %}
N/A
{% endif %}

<h3>Camelot table Data</h3>
{% if object.sopn.camelotparsedsopn.data_as_html %}
{{ object.sopn.camelotparsedsopn.data_as_html|safe }}
{% else %}
N/A
{% endif %}
<br/>


{% if textract_parsed and textract_parsed.as_textractor_document %}
<h3>AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}</h3>
Expand Down
7 changes: 0 additions & 7 deletions ynr/apps/official_documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,6 @@ def parse(self):

"""

from sopn_parsing.helpers.extract_tables import extract_ballot_table
from sopn_parsing.helpers.textract_helpers import (
NotUsingAWSException,
TextractSOPNHelper,
Expand All @@ -276,12 +275,6 @@ def parse(self):
# There's a cron job that should pick up the result and carry on parsing later.
textract_helper.start_detection()

if getattr(
settings, "CAMELOT_ENABLED", False
) and self.uploaded_file.name.endswith(".pdf"):
# Camelot
extract_ballot_table(self.ballot)


class BallotSOPNHistory(BaseBallotSOPN):
ballot = models.ForeignKey(
Expand Down
25 changes: 1 addition & 24 deletions ynr/apps/official_documents/tests/test_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,20 +114,9 @@ def test_upload_authorized(self):
with open(self.example_image_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.jpg", f.read())

# TODO: Add back in
# with patch(
# "official_documents.views.extract_pages_for_ballot"
# ) as extract_pages, patch(
# "official_documents.views.extract_ballot_table"
# ) as extract_tables, patch(
# "official_documents.views.parse_raw_data_for_ballot"
# ) as parse_tables:

response = form.submit()
self.assertEqual(response.status_code, 302)
# TODO: Add back in
# extract_pages.assert_called_once()
# extract_tables.assert_called_once()
# parse_tables.assert_called_once()

ballot_sopns = BallotSOPN.objects.all()
self.assertEqual(ballot_sopns.count(), 1)
Expand Down Expand Up @@ -181,20 +170,8 @@ def test_docx_upload_form_validation(self):
with open(self.example_docx_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.docx", f.read())

# TODO: add back in
# with patch(
# "official_documents.views.extract_pages_for_ballot"
# ) as extract_pages, patch(
# "official_documents.views.extract_ballot_table"
# ) as extract_tables, patch(
# "official_documents.views.parse_raw_data_for_ballot"
# ) as parse_tables:
response = form.submit()
self.assertEqual(response.status_code, 302)
# TODO Add back in
# extract_pages.assert_called_once()
# extract_tables.assert_called_once()
# parse_tables.assert_called_once()
self.assertEqual(BallotSOPN.objects.count(), 1)
self.assertEqual(response.location, self.ballot.get_sopn_url())

Expand Down
63 changes: 0 additions & 63 deletions ynr/apps/sopn_parsing/helpers/extract_tables.py

This file was deleted.

21 changes: 3 additions & 18 deletions ynr/apps/sopn_parsing/helpers/parse_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,20 +476,12 @@ def parse_dataframe(ballot: Ballot, df: DataFrame):

def parse_raw_data(ballot: Ballot, reparse=False):
"""
Given a Ballot, go and get the Camelot and the AWS Textract dataframes
Given a Ballot, go and get the AWS Textract dataframes
and process them
"""

camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None)
camelot_data = {}
textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None)
textract_data = {}
if (
camelot_model
and camelot_model.raw_data_type == "pandas"
and (reparse or not camelot_model.parsed_data)
):
camelot_data = parse_dataframe(ballot, camelot_model.as_pandas)
if (
textract_model
and textract_model.raw_data
Expand All @@ -500,7 +492,7 @@ def parse_raw_data(ballot: Ballot, reparse=False):
textract_model.parse_raw_data()
textract_data = parse_dataframe(ballot, textract_model.as_pandas)

if camelot_data or textract_data:
if textract_data:
# Check there isn't a rawpeople object from another (better) source
rawpeople_qs = RawPeople.objects.filter(ballot=ballot).exclude(
source_type=RawPeople.SOURCE_PARSED_PDF
Expand All @@ -510,7 +502,7 @@ def parse_raw_data(ballot: Ballot, reparse=False):
RawPeople.objects.update_or_create(
ballot=ballot,
defaults={
"data": camelot_data or "",
"data": "",
"textract_data": textract_data or "",
"source": "Parsed from {}".format(
ballot.sopn.source_url
Expand All @@ -525,17 +517,10 @@ def parse_raw_data(ballot: Ballot, reparse=False):
return
# We've done the parsing, so let's still save the result
storage = DefaultStorage()
storage.save(
f"raw_people/camelot_{ballot.ballot_paper_id}.json",
ContentFile(json.dumps(camelot_data, indent=4).encode("utf8")),
)
storage.save(
f"raw_people/textract_{ballot.ballot_paper_id}.json",
ContentFile(json.dumps(textract_data, indent=4).encode("utf8")),
)
if camelot_model:
ballot.sopn.camelotparsedsopn.status = "parsed"
ballot.sopn.camelotparsedsopn.save()
if textract_model:
ballot.sopn.awstextractparsedsopn.status = "parsed"
ballot.sopn.awstextractparsedsopn.save()

This file was deleted.

Loading
Loading