diff --git a/.gitignore b/.gitignore
index d08c2cda36..38c331c5a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,8 +24,6 @@ test-results
node_modules/
.vscode/
/test-env
-/ynr/apps/sopn_parsing/tests/data/sopn_baseline.json
-/ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json
# PyCharm
.idea/
diff --git a/Makefile b/Makefile
deleted file mode 100644
index f8d2865666..0000000000
--- a/Makefile
+++ /dev/null
@@ -1,45 +0,0 @@
-export DJANGO_SETTINGS_MODULE?=ynr.settings.sopn_testing
-
-
-.PHONY: sopn-runserver
-sopn-runserver:
- python manage.py runserver
-
-.PHONY: sopn-shell
-sopn-shell:
- python manage.py shell_plus
-
-.PHONY: migrate-db
-migrate-db:
- python manage.py migrate
-
-.PHONY: test-sopns
-test-sopns: migrate-db
- python manage.py sopn_tooling_compare_raw_people --election-slugs= --ballot= --date 2021-05-06
-
-.PHONY: download-sopns
-download-sopns:
- python manage.py migrate --no-input
- python manage.py sopn_tooling_create_official_documents --election-slugs= --date 2021-05-06
-
-.PHONY: populate-sopn-testing-database
-populate-sopn-testing-database: migrate-db
- python manage.py candidates_import_from_live_site
-
-.PHONY: delete-test-sopns
-delete-test-sopns:
- python manage.py sopn_tooling_clear_existing_objects
- rm -rf ./ynr/media/sopn_testing/
-
-.PHONY: create-baseline-file
-create-baseline-file:
- python manage.py sopn_tooling_write_baseline
-
-.PHONY: copy-baseline-file
-copy-baseline-file:
- cp ynr/apps/sopn_parsing/tests/data/sopn_baseline.json ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json
-
-.PHONY: prod-import-sopns
-prod-import-sopns:
- cd deploy; \
- ansible-playbook import_sopns.yml
diff --git a/ynr/apps/bulk_adding/tests/test_bulk_add.py b/ynr/apps/bulk_adding/tests/test_bulk_add.py
index 5361cda441..e18589768a 100644
--- a/ynr/apps/bulk_adding/tests/test_bulk_add.py
+++ b/ynr/apps/bulk_adding/tests/test_bulk_add.py
@@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self):
self.assertContains(resp, "Review candidates")
resp = form.submit()
self.assertContains(resp, "Bart Simpson")
-
- def test_fall_back_to_camelot_if_no_textract(self):
- data = {"name": "Bart", "party_id": "PP52"}
-
- raw_people = RawPeople.objects.create(
- ballot=self.dulwich_post_ballot,
- data=[data],
- source_type=RawPeople.SOURCE_PARSED_PDF,
- )
-
- self.assertEqual(
- raw_people.as_form_kwargs(),
- {
- "initial": [
- {
- "name": "Bart",
- "party": ["PP52", "PP52"],
- "previous_party_affiliations": [],
- "source": "",
- }
- ]
- },
- )
- raw_people.delete()
-
- textract_data = {"name": "Lisa", "party_id": "PP53"}
- raw_people = RawPeople.objects.create(
- ballot=self.dulwich_post_ballot,
- data=[data],
- textract_data=[textract_data],
- source_type=RawPeople.SOURCE_PARSED_PDF,
- )
-
- self.assertEqual(
- raw_people.as_form_kwargs(),
- {
- "initial": [
- {
- "name": "Lisa",
- "party": ["PP53", "PP53"],
- "previous_party_affiliations": [],
- "source": "",
- }
- ]
- },
- )
-
- def test_can_change_parser_in_frontend(self):
- """
- Check that a query param can change the parser we use
- """
- BallotSOPN.objects.create(
- source_url="http://example.com",
- ballot=self.dulwich_post_ballot,
- uploaded_file="sopn.pdf",
- )
- RawPeople.objects.create(
- ballot=self.dulwich_post_ballot,
- data=[{"name": "Bart", "party_id": "PP52"}],
- textract_data=[{"name": "Lisa", "party_id": "PP53"}],
- source_type=RawPeople.SOURCE_PARSED_PDF,
- )
- response = self.app.get(
- "/bulk_adding/sopn/parl.65808.2015-05-07/", user=self.user
- )
- form = response.forms["bulk_add_form"]
- # This should be the Textract data
- self.assertEqual(form.fields["form-0-name"][0].value, "Lisa")
-
- response = self.app.get(
- "/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1",
- user=self.user,
- )
- form = response.forms["bulk_add_form"]
- # This should be the Textract data
- self.assertEqual(form.fields["form-0-name"][0].value, "Bart")
diff --git a/ynr/apps/bulk_adding/views/sopns.py b/ynr/apps/bulk_adding/views/sopns.py
index e0003e9c9e..6564db69a5 100644
--- a/ynr/apps/bulk_adding/views/sopns.py
+++ b/ynr/apps/bulk_adding/views/sopns.py
@@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs):
return super().get(request, *args, **kwargs)
def get_active_parser(self) -> Optional[SOPNParsingBackends]:
- if self.request.GET.get("v1_parser"):
- return SOPNParsingBackends.CAMELOT
if self.ballot.rawpeople.textract_data:
return SOPNParsingBackends.TEXTRACT
- if self.ballot.rawpeople.data:
- return SOPNParsingBackends.CAMELOT
return None
def get_context_data(self, **kwargs):
diff --git a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
index c61ab936ce..620857b058 100644
--- a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
+++ b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
@@ -7,28 +7,12 @@
Parsing Status
- Pages matched: {% if object.sopn.get_pages %}Yes (matched pages: {{ object.sopn.get_pages|join:", " }}
){% else %}No{% endif %}
- - Camelot tables extracted: {% if object.sopn.camelotparsedsopn %}Yes{% else %}No{% endif %}
- Raw Person Data: {% if object.rawpeople %}Yes{% else %}No{% endif %}
- AWS Textract Data: {% if textract_parsed.raw_data %}Yes{% else %}No{% endif %}
- AWS Textract Parsed? {% if textract_parsed.parsed_data %}Yes{% else %}
No{% endif %}
- Camelot raw Data
- {% if object.sopn.camelotparsedsopn.raw_data %}
- {{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}
- {% else %}
- N/A
- {% endif %}
-
- Camelot table Data
- {% if object.sopn.camelotparsedsopn.data_as_html %}
- {{ object.sopn.camelotparsedsopn.data_as_html|safe }}
- {% else %}
- N/A
- {% endif %}
-
-
{% if textract_parsed and textract_parsed.as_textractor_document %}
AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}
diff --git a/ynr/apps/official_documents/models.py b/ynr/apps/official_documents/models.py
index 15de317657..55ca3e7d1d 100644
--- a/ynr/apps/official_documents/models.py
+++ b/ynr/apps/official_documents/models.py
@@ -260,7 +260,6 @@ def parse(self):
"""
- from sopn_parsing.helpers.extract_tables import extract_ballot_table
from sopn_parsing.helpers.textract_helpers import (
NotUsingAWSException,
TextractSOPNHelper,
@@ -276,12 +275,6 @@ def parse(self):
# There's a cron job that should pick up the result and carry on parsing later.
textract_helper.start_detection()
- if getattr(
- settings, "CAMELOT_ENABLED", False
- ) and self.uploaded_file.name.endswith(".pdf"):
- # Camelot
- extract_ballot_table(self.ballot)
-
class BallotSOPNHistory(BaseBallotSOPN):
ballot = models.ForeignKey(
diff --git a/ynr/apps/official_documents/tests/test_upload.py b/ynr/apps/official_documents/tests/test_upload.py
index 03423f9ce7..443cc97fc6 100644
--- a/ynr/apps/official_documents/tests/test_upload.py
+++ b/ynr/apps/official_documents/tests/test_upload.py
@@ -114,20 +114,9 @@ def test_upload_authorized(self):
with open(self.example_image_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.jpg", f.read())
- # TODO: Add back in
- # with patch(
- # "official_documents.views.extract_pages_for_ballot"
- # ) as extract_pages, patch(
- # "official_documents.views.extract_ballot_table"
- # ) as extract_tables, patch(
- # "official_documents.views.parse_raw_data_for_ballot"
- # ) as parse_tables:
+
response = form.submit()
self.assertEqual(response.status_code, 302)
- # TODO: Add back in
- # extract_pages.assert_called_once()
- # extract_tables.assert_called_once()
- # parse_tables.assert_called_once()
ballot_sopns = BallotSOPN.objects.all()
self.assertEqual(ballot_sopns.count(), 1)
@@ -181,20 +170,8 @@ def test_docx_upload_form_validation(self):
with open(self.example_docx_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.docx", f.read())
- # TODO: add back in
- # with patch(
- # "official_documents.views.extract_pages_for_ballot"
- # ) as extract_pages, patch(
- # "official_documents.views.extract_ballot_table"
- # ) as extract_tables, patch(
- # "official_documents.views.parse_raw_data_for_ballot"
- # ) as parse_tables:
response = form.submit()
self.assertEqual(response.status_code, 302)
- # TODO Add back in
- # extract_pages.assert_called_once()
- # extract_tables.assert_called_once()
- # parse_tables.assert_called_once()
self.assertEqual(BallotSOPN.objects.count(), 1)
self.assertEqual(response.location, self.ballot.get_sopn_url())
diff --git a/ynr/apps/sopn_parsing/helpers/extract_tables.py b/ynr/apps/sopn_parsing/helpers/extract_tables.py
deleted file mode 100644
index 0b610c7847..0000000000
--- a/ynr/apps/sopn_parsing/helpers/extract_tables.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import json
-
-import pandas as pd
-from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text
-from sopn_parsing.models import CamelotParsedSOPN
-
-
-def extract_ballot_table(ballot, parse_flavor="lattice"):
- """
- Given a OfficialDocument model, update or create a CamelotParsedSOPN model with the
- contents of the table as a JSON string.
-
- :type ballot: candidates.models.Ballot
-
- """
- import camelot # import here to avoid import error running tests without pdf deps installed
-
- document = ballot.sopn
- try:
- tables = camelot.read_pdf(
- document.uploaded_file.path,
- pages="all",
- flavor=parse_flavor,
- )
- except (NotImplementedError, AttributeError):
- # * NotImplementedError is thrown if the PDF is an image or generally
- # unreadable.
- # * AttributeError is thrown on some PDFs saying they need a password.
- # Assume this is a bug in camelot, and ignore these PDFs
- raise NoTextInDocumentError()
-
- # Tables can span pages, camelot assumes they're different tables, so we
- # need to join them back together
- table_list = []
- for table in tables:
- table_list.append(table)
- table_list.sort(key=lambda t: (t.page, t.order))
-
- if not table_list:
- return None
-
- table_data = table_list.pop(0).df
-
- for table in table_list:
- # It's possible to have the "situation of poll" document on the SOPN
- # Ignore any table that contains "polling station" (SOPNs tables don't)
- table = table.df
- first_row = table.iloc[0].to_string()
-
- if "polling station" in clean_text(first_row):
- break
- # Append the continuation table to the first one in the document.
- # ignore_index is needed so the e.g table 2 row 1 doesn't replace
- # table 1 row 1
- table_data = pd.concat([table_data, table], ignore_index=True)
-
- if not table_data.empty:
- parsed, _ = CamelotParsedSOPN.objects.update_or_create(
- sopn=document,
- defaults={"raw_data": json.dumps(table_data.to_dict())},
- )
- return parsed
- return None
diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py
index 247d0e7e31..e8152b0816 100644
--- a/ynr/apps/sopn_parsing/helpers/parse_tables.py
+++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py
@@ -476,20 +476,12 @@ def parse_dataframe(ballot: Ballot, df: DataFrame):
def parse_raw_data(ballot: Ballot, reparse=False):
"""
- Given a Ballot, go and get the Camelot and the AWS Textract dataframes
+ Given a Ballot, go and get the AWS Textract dataframes
and process them
"""
- camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None)
- camelot_data = {}
textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None)
textract_data = {}
- if (
- camelot_model
- and camelot_model.raw_data_type == "pandas"
- and (reparse or not camelot_model.parsed_data)
- ):
- camelot_data = parse_dataframe(ballot, camelot_model.as_pandas)
if (
textract_model
and textract_model.raw_data
@@ -500,7 +492,7 @@ def parse_raw_data(ballot: Ballot, reparse=False):
textract_model.parse_raw_data()
textract_data = parse_dataframe(ballot, textract_model.as_pandas)
- if camelot_data or textract_data:
+ if textract_data:
# Check there isn't a rawpeople object from another (better) source
rawpeople_qs = RawPeople.objects.filter(ballot=ballot).exclude(
source_type=RawPeople.SOURCE_PARSED_PDF
@@ -510,7 +502,7 @@ def parse_raw_data(ballot: Ballot, reparse=False):
RawPeople.objects.update_or_create(
ballot=ballot,
defaults={
- "data": camelot_data or "",
+ "data": "",
"textract_data": textract_data or "",
"source": "Parsed from {}".format(
ballot.sopn.source_url
@@ -525,17 +517,10 @@ def parse_raw_data(ballot: Ballot, reparse=False):
return
# We've done the parsing, so let's still save the result
storage = DefaultStorage()
- storage.save(
- f"raw_people/camelot_{ballot.ballot_paper_id}.json",
- ContentFile(json.dumps(camelot_data, indent=4).encode("utf8")),
- )
storage.save(
f"raw_people/textract_{ballot.ballot_paper_id}.json",
ContentFile(json.dumps(textract_data, indent=4).encode("utf8")),
)
- if camelot_model:
- ballot.sopn.camelotparsedsopn.status = "parsed"
- ballot.sopn.camelotparsedsopn.save()
if textract_model:
ballot.sopn.awstextractparsedsopn.status = "parsed"
ballot.sopn.awstextractparsedsopn.save()
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py
deleted file mode 100644
index 3a4e091290..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand
-from sopn_parsing.helpers.extract_tables import extract_ballot_table
-from sopn_parsing.helpers.text_helpers import NoTextInDocumentError
-
-
-class Command(BaseSOPNParsingCommand):
- help = """
- Parse tables out of PDFs in to CamelotParsedSOPN models for later parsing.
- """
-
- def handle(self, *args, **options):
- qs = self.get_queryset(options)
- filter_kwargs = {}
- if not options["ballot"] and not options["testing"]:
- if not options["reparse"]:
- filter_kwargs["sopn__camelotparsedsopn"] = None
-
- qs = qs.filter(**filter_kwargs)
- for ballot in qs:
- try:
- extract_ballot_table(ballot)
- except NoTextInDocumentError:
- self.stdout.write(
- f"{ballot} raised a NoTextInDocumentError trying to extract tables"
- )
- except ValueError:
- self.stdout.write(
- f"{ballot} raised a ValueError trying extract tables"
- )
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py
deleted file mode 100644
index 26448b697f..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from bulk_adding.models import RawPeople
-from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand
-from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot
-
-
-class Command(BaseSOPNParsingCommand):
- help = """
- Convert the raw extracted tables on the CamelotParsedSOPN model to a parsed
- RawPeople model, and set the status as parsed.
-
- """
-
- def build_filter_kwargs(self, options):
- """
- Build kwargs used to filter the BallotQuerySet that is parsed
- - Always skip any ballots where we do not have a CamelotParsedSOPN to try to
- extract candidates from
- - When test flag is used, dont make any changes
- - When parsing a single ballot, dont make any changes
- - When reparsing, only use ballots where we have previously created a
- RawPeople object from a CamelotParsedSOPN
- - Otherwise filter by unparsed CamelotParsedSOPN objects
- """
- # Always skip any ballots where we do not have a CamelotParsedSOPN to try to
- # extract candidates from
- filter_kwargs = {}
- if options.get("testing"):
- return filter_kwargs
-
- if options.get("ballot"):
- return filter_kwargs
-
- if options.get("reparse"):
- filter_kwargs[
- "rawpeople__source_type"
- ] = RawPeople.SOURCE_PARSED_PDF
- return filter_kwargs
-
- return filter_kwargs
-
- def handle(self, *args, **options):
- # filters that we never change with args. These two would raise
- # ValueErrors in the parse_raw_data_for_ballot function
- base_qs = self.get_queryset(options)
- filter_kwargs = self.build_filter_kwargs(options)
-
- qs = base_qs.filter(**filter_kwargs)
- qs = qs.filter(
- candidates_locked=False, # Never parse a locked ballot
- suggestedpostlock=None, # Never parse a ballot with lock suggestions
- )
-
- if not qs.exists():
- msg = ["No ballots to parse found."]
-
- if options.get("ballot"):
- msg.append(
- "This ballot might be locked or have lock suggestions"
- )
-
- self.stderr.write("\n".join(msg))
-
- for ballot in qs:
- try:
- parse_raw_data_for_ballot(ballot, options["reparse"])
- except ValueError as e:
- print(str(e))
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py
index 41db0e1f57..7b38b54b6a 100644
--- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py
+++ b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py
@@ -1,4 +1,3 @@
-from django.conf import settings
from django.core.management.base import BaseCommand
from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot
from sopn_parsing.helpers.textract_helpers import (
@@ -8,7 +7,6 @@
from sopn_parsing.models import (
AWSTextractParsedSOPN,
AWSTextractParsedSOPNStatus,
- CamelotParsedSOPN,
)
@@ -21,22 +19,16 @@ class Command(BaseCommand):
This script picks up where `parse` left off. It manages two cases:
- # Camelot
-
- We expect to have made a `CamelotParsedSOPN` with `raw_data` populated. This will only have
- happened if the file is a PDF readable by Camelot.
-
- We need to parse the `raw_data` into `parsed_data` and then make a `RawData` object for bulk adding.
-
# AWS Textract
- We should have made a `AWSTextractParsedSOPN` with `job_id` populated. Textract is async,
- so the initial `parse` just submits the data to AWS and gets a job_id.
+ We should have made a `AWSTextractParsedSOPN` with `job_id` populated.
+ Textract is async, so the initial `parse` just submits the data to AWS and
+ gets a job_id.
We need to check if the job ID has finished and pull in the data to `raw_data`.
- We're then in the same state as the Camelot method above, we need to parse the `raw_data` into
- `parsed_data` and makr a `RawData` object for bulk adding.
+ We need to parse the `raw_data` into `parsed_data` and make a `RawData`
+ object for bulk adding.
"""
def handle(self, *args, **options):
@@ -45,15 +37,6 @@ def handle(self, *args, **options):
"sopn__ballot__candidates_locked": False,
}
- if getattr(settings, "CAMELOT_ENABLED", False):
- # Camelot first
- qs = (
- CamelotParsedSOPN.objects.filter(parsed_data=None)
- .exclude(raw_data="")
- .filter(**current_ballot_kwargs)
- )
- self.parse_tables_for_qs(qs)
-
# Textract
qs = AWSTextractParsedSOPN.objects.exclude(
status__in=[
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py
deleted file mode 100644
index cb68ffdf02..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from bulk_adding.models import RawPeople
-from django.conf import settings
-from django.core.management.base import BaseCommand
-from official_documents.models import OfficialDocument
-
-
-class Command(BaseCommand):
- """
- Used to quickly delete existing objects used when testing SOPN
- parsing so that you can start fresh for example, when you want
- to start testing a new set of SOPNs.
- """
-
- def print_deleted(self, deleted_dict):
- for object, count in deleted_dict.items():
- self.stdout.write(f"Deleted {count} {object}")
-
- def handle(self, *args, **options):
- if settings.SETTINGS_MODULE != "ynr.settings.sopn_testing":
- raise ValueError(
- "You are trying to run this command outside of SOPN testing environment"
- )
-
- deleted_dict = {}
- deleted_dict.update(OfficialDocument.objects.all().delete()[1])
- deleted_dict.update(RawPeople.objects.all().delete()[1])
- self.print_deleted(deleted_dict)
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py
deleted file mode 100644
index dbe5eb913a..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import json
-import os
-from collections import Counter
-
-from bulk_adding.models import RawPeople
-from candidates.models import Ballot
-from django.core.management import call_command
-from official_documents.models import OfficialDocument
-from popolo.models import Membership
-from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand
-from sopn_parsing.models import CamelotParsedSOPN
-
-
-class Command(BaseSOPNParsingCommand):
- CORRECT_EXACTLY = "correct_exactly"
- NUM_CORRECT_MISSING_PARTIES = "num_correct_some_parties_missing"
- NUM_INCORRECT = "num_incorrect"
- ZERO_CANDIDATES = "zero_candidates"
-
- def add_arguments(self, parser):
- super().add_arguments(parser)
- parser.add_argument("--loud", action="store_true", default=False)
-
- def handle(self, *args, **options):
- """
- - Check we have a baseline file to compare with
- - Prepare some OfficialDocuments
- - Re-parse the documents
- - Loop through the created RawPeople objects, comparing to our baseline
- to make sure that we are parsing at least as many people as before
- - If no asserts failed, use the data to write a new baseline file
- """
-
- self.loud = options.pop("loud")
-
- self.candidates_results = {
- "correct_exactly": [],
- "num_correct_some_parties_missing": [],
- "num_incorrect": [],
- "zero_candidates": [],
- }
-
- raw_people_file = "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json"
- if not os.path.isfile(raw_people_file):
- call_command("sopn_tooling_write_baseline")
- self.stdout.write("Baseline file didn't exist so one was created")
-
- options.update({"testing": True})
-
- OfficialDocument.objects.update(relevant_pages="")
- call_command("sopn_parsing_extract_page_numbers", *args, **options)
- CamelotParsedSOPN.objects.all().delete()
- call_command("sopn_parsing_extract_tables", *args, **options)
- RawPeople.objects.all().delete()
- call_command("sopn_parsing_parse_tables", *args, **options)
-
- with open(raw_people_file) as file:
- old_raw_people = json.loads(file.read())
-
- self.new_raw_people = {}
- for ballot in Ballot.objects.exclude(officialdocument__isnull=True):
- ballot_data = old_raw_people.get(ballot.ballot_paper_id, {})
-
- self.compare_relevant_pages(ballot=ballot, ballot_data=ballot_data)
-
- self.compare_raw_people(ballot=ballot, ballot_data=ballot_data)
-
- # display some overall totals
- self.stdout.write(
- "Old total 'people' parsed WAS {old}\n"
- "New total 'people' parsed IS {new}".format(
- old=self.count_people_parsed(old_raw_people),
- new=self.count_people_parsed(self.new_raw_people),
- )
- )
-
- old_raw_people_obj_count = len(
- {k: v for k, v in old_raw_people.items() if v["raw_people"]}
- )
- new_raw_people_obj_count = RawPeople.objects.count()
- style = self.style.SUCCESS
- if new_raw_people_obj_count < old_raw_people_obj_count:
- style = self.style.ERROR
- self.stdout.write(
- style(
- f"Old RawPeople count: {old_raw_people_obj_count}\n"
- f"New total RawPeople count: {new_raw_people_obj_count}"
- )
- )
-
- for result, ballots in self.candidates_results.items():
- total = len(ballots)
- self.stdout.write(f"{total} ballots parsed {result}")
- # Write a new baseline
- call_command("sopn_tooling_write_baseline")
-
- def compare_relevant_pages(self, ballot, ballot_data):
- old_relevant_pages = ballot_data.get("relevant_pages", "")
- new_relevant_pages = ballot.sopn.relevant_pages
-
- if old_relevant_pages != new_relevant_pages:
- self.stdout.write(
- self.style.WARNING(
- f"RELEVANT PAGES CHANGED FROM {old_relevant_pages} to {new_relevant_pages} for {ballot.ballot_paper_id}"
- )
- )
-
- def compare_raw_people(self, ballot, ballot_data):
- try:
- raw_people = ballot.rawpeople.data
- except RawPeople.DoesNotExist:
- raw_people = []
-
- old_raw_people_for_ballot = ballot_data.get("raw_people", [])
- old_count = len(old_raw_people_for_ballot)
- new_count = len(raw_people)
- if new_count < old_count:
- self.stderr.write(
- f"Uh oh, parsed people for {ballot.ballot_paper_id} decreased from {old_count} to {new_count}. Stopping."
- )
-
- if new_count > old_count:
- self.stdout.write(
- f"{ballot.ballot_paper_id} increased from {old_count} to {new_count} parsed people.\n"
- f"Check the SOPN at https://candidates.democracyclub.org.uk{ballot.get_sopn_url()}."
- )
- for person in raw_people:
- if person not in old_raw_people_for_ballot:
- self.stdout.write(self.style.SUCCESS(person))
-
- # when people parsed have changed e.g. different name/different party print it for further checking
- changed_people = [
- person
- for person in old_raw_people_for_ballot
- if person not in raw_people
- ]
- if changed_people:
- self.stdout.write(
- self.style.WARNING(
- f"Parsed data changed for {ballot.ballot_paper_id}\n"
- f"New raw people data:\n"
- f"{raw_people}\n"
- "Missing people:"
- )
- )
- for person in changed_people:
- self.stderr.write(str(person))
-
- self.new_raw_people[ballot.ballot_paper_id] = {"raw_people": raw_people}
-
- self.parties_correct(ballot, raw_people)
-
- def count_people_parsed(self, raw_people_data):
- """
- Returns the total number of "people" that were parsed.
- NB that just because something was parsed, it doesnt mean that it was
- accurately parsed. Therefore this total is best used to look for large
- changes that should then be checked in detail.
- """
- return sum(
- [len(data["raw_people"]) for data in raw_people_data.values()]
- )
-
- def parties_correct(self, ballot, raw_people_for_ballot):
- candidates = Membership.objects.filter(ballot=ballot)
- if not candidates:
- self.stdout.write(
- self.style.WARNING(
- f"We dont have candidates for {ballot.ballot_paper_id}. Try updating with the live site first?"
- )
- )
-
- if not raw_people_for_ballot:
- self.candidates_results[self.ZERO_CANDIDATES].append(
- ballot.ballot_paper_id
- )
- return None
-
- num_candidates_correct = candidates.count() == len(
- raw_people_for_ballot
- )
-
- if self.loud:
- if num_candidates_correct:
- self.stdout.write(
- self.style.SUCCESS(
- f"Correct number of people parsed as expected for {ballot.ballot_paper_id}"
- )
- )
- else:
- self.stdout.write(
- self.style.ERROR(
- f"Incorrect number of people parsed for {ballot.ballot_paper_id}"
- )
- )
-
- parsed = sorted(
- [person["party_id"] for person in raw_people_for_ballot]
- )
- expected = list(
- candidates.values_list("party__ec_id", flat=True).order_by(
- "party__ec_id"
- )
- )
-
- if parsed == expected:
- return self.candidates_results[self.CORRECT_EXACTLY].append(
- ballot.ballot_paper_id
- )
-
- # count number of each missing party ID as there could be more than one
- # missing candidate for a party e.g. 1 missing Green, 2 missing independents
- parsed = Counter(parsed)
- expected = Counter(expected)
- missing = expected - parsed
- if missing:
- total = sum(missing.values())
- self.stderr.write(
- f"{total} MISSING parties for {ballot.ballot_paper_id} (party_id:num_missing)\n{missing}"
- )
- else:
- # sometimes we incorrectly parse extra people - often independents
- # due to an empty row
- extras = parsed - expected
- total = sum(extras.values())
- self.stderr.write(
- f"{total} EXTRA parties for {ballot.ballot_paper_id}\n{extras}"
- )
-
- if num_candidates_correct:
- return self.candidates_results[
- self.NUM_CORRECT_MISSING_PARTIES
- ].append(ballot.ballot_paper_id)
-
- return self.candidates_results[self.NUM_INCORRECT].append(
- ballot.ballot_paper_id
- )
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py
deleted file mode 100644
index e7c3f3e1b2..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import requests
-from candidates.models import Ballot
-from django.conf import settings
-from django.core.files.base import ContentFile
-from django.core.management.base import BaseCommand
-from elections.models import Election
-from official_documents.models import OfficialDocument
-
-
-class Command(BaseCommand):
- """This command uses the ballots endpoint to loop over each
- ballot and store each sopn pdf (uploaded_file) locally"""
-
- def add_arguments(self, parser):
- parser.add_argument(
- "--date",
- "-d",
- action="store",
- help="Election date in ISO format, defaults to 2021-05-06",
- default="2021-05-06",
- type=str,
- )
- parser.add_argument(
- "--site_url",
- "-u",
- action="store",
- help="URL of site to download from",
- default="https://candidates.democracyclub.org.uk/",
- type=str,
- )
- parser.add_argument(
- "--election-count",
- "-c",
- action="store",
- help="URL of site to download from",
- default=50,
- type=int,
- )
- parser.add_argument(
- "--election-slugs", "-s", action="store", required=False
- )
-
- def handle(self, *args, **options):
- site_url = options.get("site_url")
- election_date = options.get("date")
- election_count = options.get("election_count")
-
- if options["election_slugs"]:
- election_slugs = options["election_slugs"].split(",")
- else:
- election_slugs = Election.objects.filter(
- election_date=election_date
- ).values_list("slug", flat=True)[:election_count]
-
- for slug in election_slugs:
- url = f"{site_url}api/next/ballots/?has_sopn=1&page_size=200&election_id={slug}&auth_token={settings.YNR_API_KEY}"
- self.create_official_documents(url=url)
-
- def create_official_documents(self, url):
- data = requests.get(url=url).json()
- try:
- next_page = data["next"]
- except KeyError:
- next_page = None
- if "results" in data:
- for ballot_data in data["results"]:
- ballot = Ballot.objects.get(
- ballot_paper_id=ballot_data["ballot_paper_id"]
- )
- sopn_data = ballot_data["sopn"]
-
- # if we already have the SOPN no need to recreate
- if ballot.officialdocument_set.filter(
- source_url=sopn_data["source_url"]
- ).exists():
- self.stdout.write(
- f"SOPN already exists for {ballot.ballot_paper_id}"
- )
- continue
-
- # check if we already have an OfficialDocument with this source
- # downloaded
- official_document = OfficialDocument.objects.filter(
- source_url=sopn_data["source_url"]
- ).first()
- if official_document:
- # if so we dont need to redownload the file, we can create a new
- # object for this ballot with the same file
- self.stdout.write(
- f"Found SOPN for source {sopn_data['source_url']}"
- )
- OfficialDocument.objects.create(
- ballot=ballot,
- source_url=sopn_data["source_url"],
- uploaded_file=official_document.uploaded_file,
- document_type=OfficialDocument.NOMINATION_PAPER,
- )
- continue
-
- # otherwise we dont have this file stored already, so download it as
- # part of creating the OfficialDocument
- self.stdout.write(
- f"Downloading SOPN from {sopn_data['uploaded_file']}"
- )
- file_response = requests.get(sopn_data["uploaded_file"])
- file_object = ContentFile(content=file_response.content)
- official_document = OfficialDocument(
- ballot=ballot,
- source_url=sopn_data["source_url"],
- document_type=OfficialDocument.NOMINATION_PAPER,
- )
- file_extension = sopn_data["uploaded_file"].split(".")[-1]
- filename = f"{ballot.ballot_paper_id}.{file_extension}"
- official_document.uploaded_file.save(
- name=filename, content=file_object
- )
- else:
- self.stdout.write("No results found")
-
- # this should only be the case where the election object has > 200
- # ballots e.g. parliamentary elections
- if next_page:
- return self.create_official_documents(url=next_page)
- return None
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py
deleted file mode 100644
index 07ae9309cd..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import json
-import os
-
-from bulk_adding.models import RawPeople
-from candidates.models import Ballot
-from django.core.management.base import BaseCommand
-from django.db.models import Q
-
-
-class Command(BaseCommand):
- """
- Creates a JSON file to represent ballots that have an Officialdocument.
- Only include ballots where:
- - The source of the RawPeople is from parsing a PDF
- - No RawPeople were created from the OfficialDocument. This is so that we
- will know if we make make improvements that mean more RawPeople are parsed
- from an OfficialDocument
- """
-
- def add_arguments(self, parser):
- parser.add_argument(
- "--data",
- action="store",
- help="Dictionary of raw people to write as a baseline",
- )
-
- def handle(self, *args, **options):
- json_data = options["data"] or {}
-
- if not json_data:
- qs = Ballot.objects.exclude(officialdocument__isnull=True).filter(
- Q(rawpeople__source_type=RawPeople.SOURCE_PARSED_PDF)
- | Q(rawpeople__isnull=True)
- )
- for ballot in qs:
- raw_people = getattr(ballot, "rawpeople", [])
- try:
- raw_people = ballot.rawpeople.data
- except RawPeople.DoesNotExist:
- raw_people = []
-
- json_data[ballot.ballot_paper_id] = {
- "raw_people": raw_people,
- "relevant_pages": ballot.sopn.relevant_pages,
- }
-
- file_path = os.path.join(
- os.getcwd(), "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json"
- )
- with open(file_path, "w") as f:
- f.write(json.dumps(json_data))
diff --git a/ynr/apps/sopn_parsing/tests/test_extract_tables.py b/ynr/apps/sopn_parsing/tests/test_extract_tables.py
index 21a03dfb63..bda0d99fd3 100644
--- a/ynr/apps/sopn_parsing/tests/test_extract_tables.py
+++ b/ynr/apps/sopn_parsing/tests/test_extract_tables.py
@@ -1,15 +1,10 @@
from os.path import abspath, dirname, join
-from unittest import skipIf
from candidates.tests.helpers import TmpMediaRootMixin
from candidates.tests.uk_examples import UK2015ExamplesMixin
from django.core.files.uploadedfile import SimpleUploadedFile
-from django.core.management import call_command
from django.test import TestCase
from official_documents.models import BallotSOPN
-from sopn_parsing.helpers.extract_tables import extract_ballot_table
-from sopn_parsing.models import CamelotParsedSOPN
-from sopn_parsing.tests import should_skip_pdf_tests
class TestSOPNHelpers(TmpMediaRootMixin, UK2015ExamplesMixin, TestCase):
@@ -27,152 +22,3 @@ def setUp(self):
uploaded_file=SimpleUploadedFile("sopn.pdf", sopn_file),
source_url="example.com",
)
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_extract_tables(self):
- extract_ballot_table(self.dulwich_post_ballot)
- self.assertEqual(
- CamelotParsedSOPN.objects.get().as_pandas.to_dict(),
- {
- "0": {
- "0": "Name of \nCandidate",
- "1": "ALAGARATNAM \nRathy",
- "2": "BARBER \nJames",
- "3": "HAYES \nHelen Elizabeth",
- "4": "KANUMANSA \nAmadu",
- "5": "KOTECHA \nResham",
- "6": "LAMBERT \nRobin Andrew \nDavid",
- "7": "NALLY \nSteve",
- "8": "NIX \nRashid",
- },
- "1": {
- "0": "Home \nAddress",
- "1": "(address in the \nMitcham and Morden \nConstituency)",
- "2": "33 Champion Hill, \nLondon, SE5 8BS",
- "3": "11 Woodsyre, \nSydenham Hill, \nLondon, SE26 6SS",
- "4": "11 Coleridge House, \nBrowning Street, \nLondon, SE17 1DG",
- "5": "(address in the \nRuislip, Northwood \nand Pinner \nConstituency)",
- "6": "(address in the \nDuwlich and West \nNorwood \nConstituency)",
- "7": "(address in the \nVauxhall \nConstituency)",
- "8": "66 Guinness Court, \nLondon, SW3 2PQ",
- },
- "2": {
- "0": "Description \n(if any)",
- "1": "UK Independence \nParty (UKIP)",
- "2": "Liberal Democrat",
- "3": "Labour Party",
- "4": "All People`s Party",
- "5": "The Conservative \nParty Candidate",
- "6": "Independent",
- "7": "Trade Unionist \nand Socialist \nCoalition",
- "8": "The Green Party",
- },
- "3": {
- "0": "Name of Assentors \nProposer(+), Seconder(++)",
- "1": "Coleman Alice M + \n"
- "Potter Keith S ++ \n"
- "Potter Stephanie \n"
- "Smith Bryan L \n"
- "Anderson Beth \n"
- "Lumba Avita \n"
- "Andersen Robert \n"
- "Patel Sajal \n"
- "Stanbury Linda \n"
- "Stanbury James",
- "2": "Fitchett Keith + \n"
- "Price Jonathan ++ \n"
- "Gardner Brigid \n"
- "Waddington Simon \n"
- "Morland Laura \n"
- "Lester Rachel \n"
- "Pidgeon Caroline \n"
- "Hare David \n"
- "Hanton Alastair \n"
- "Haylett Alexander",
- "3": "Samuel Gaynelle + \n"
- "Whaley Stephen P ++ \n"
- "Brazell Shadi M \n"
- "De Souza Johnny \n"
- "Alcock Heather \n"
- "Natzler Robert S \n"
- "Pearce Michelle E \n"
- "Pickering Robert \n"
- "Richardson Katherine G \n"
- "Pickard Jane",
- "4": "King James + \n"
- "King Rosemary ++ \n"
- "King David \n"
- "Davies Yadalieu \n"
- "Sesay Mary \n"
- "Rahman Layla K \n"
- "Rahman Syed A \n"
- "Ahmed Jalaluddin \n"
- "Rahman Tajwar S \n"
- "Rahman Taamid S",
- "5": "Davis James G + \n"
- "Bradbury David S ++ \n"
- "Badman Susan E \n"
- "Hill-Archer Roderick C \n"
- "Langley Anne C \n"
- "Mitchell Andrew M \n"
- "Virgo Marjorie J \n"
- "Virgo Philip A \n"
- "Chathli Lindsay \n"
- "Broomhead Robert A",
- "6": "Smith Caitlin + \n"
- "Parks Jesse ++ \n"
- "Connage Kyesha \n"
- "Hendry Perihan \n"
- "Mounty E J \n"
- "Sharif B \n"
- "Scott Wellesley \n"
- "Harriott S A \n"
- "Harriott Clive \n"
- "Ojumu Ibi",
- "7": "Tullis Andrew C + \n"
- "Mason Joshua H ++ \n"
- "Parkinson Francine M \n"
- "Gait Elizabeth \n"
- "Doolan Samantha \n"
- "Ubiaro Elizabeth \n"
- "Garner Stuart \n"
- "Akinjogbin Dolapo \n"
- "Walker Donna \n"
- "Lang Geoffrey P",
- "8": "Atwell E G + \n"
- "Rose Lloyd ++ \n"
- "O`Shea C \n"
- "Gomes Jacqueline \n"
- "Wood Thomas \n"
- "Rosenfeld David \n"
- "Conroy Martin \n"
- "Skiadopoulou I \n"
- "Rosenfeld Lawrence \n"
- "Rosenfeld Emily",
- },
- "4": {
- "0": "Reason why \nno longer \nnominated*",
- "1": "",
- "2": "",
- "3": "",
- "4": "",
- "5": "",
- "6": "",
- "7": "",
- "8": "",
- },
- },
- )
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_extract_command_current(self):
- self.assertEqual(CamelotParsedSOPN.objects.count(), 0)
- call_command("sopn_parsing_extract_tables", current=True)
- self.assertEqual(CamelotParsedSOPN.objects.count(), 1)
-
- def test_extract_command_current_no_current_elections(self):
- self.election.current = False
- self.election.save()
- self.assertEqual(CamelotParsedSOPN.objects.count(), 0)
- call_command("sopn_parsing_extract_tables", current=True)
- self.assertEqual(CamelotParsedSOPN.objects.count(), 0)
diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py
deleted file mode 100644
index 922c487dd4..0000000000
--- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py
+++ /dev/null
@@ -1,529 +0,0 @@
-import json
-from pathlib import Path
-from unittest import skipIf
-from unittest.mock import patch
-
-from bulk_adding.models import RawPeople
-from candidates.tests.uk_examples import UK2015ExamplesMixin
-from django.core.management import call_command
-from django.db import connection
-from django.test import TestCase
-from official_documents.models import BallotSOPN
-from pandas import Index, Series
-from parties.models import Party, PartyDescription
-from parties.tests.factories import PartyFactory
-from parties.tests.fixtures import DefaultPartyFixtures
-from sopn_parsing.helpers import parse_tables
-from sopn_parsing.models import CamelotParsedSOPN
-from sopn_parsing.tests import should_skip_pdf_tests
-from sopn_parsing.tests.data.welsh_sopn_data import welsh_sopn_data
-
-from ynr.apps.sopn_parsing.management.commands.sopn_parsing_parse_tables import (
- Command as ParseTablesCommand,
-)
-
-
-class TestSOPNHelpers(DefaultPartyFixtures, UK2015ExamplesMixin, TestCase):
- def setUp(self):
- PartyFactory(ec_id="PP85", name="UK Independence Party (UKIP)")
- with connection.cursor() as cursor:
- cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;")
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_basic_parsing(self):
- self.assertFalse(RawPeople.objects.exists())
- doc = BallotSOPN.objects.create(
- ballot=self.dulwich_post_ballot,
- source_url="example.com",
- )
- dataframe = json.dumps(
- {
- "0": {
- "0": "Name of \nCandidate",
- "1": "BRADBURY \nAndrew John",
- "2": "COLLINS \nDave",
- "3": "HARVEY \nPeter John",
- "4": "JENNER \nMelanie",
- },
- "1": {
- "0": "Home Address",
- "1": "10 Fowey Close, \nShoreham by Sea, \nWest Sussex, \nBN43 5HE",
- "2": "51 Old Fort Road, \nShoreham by Sea, \nBN43 5RL",
- "3": "76 Harbour Way, \nShoreham by Sea, \nSussex, \nBN43 5HH",
- "4": "9 Flag Square, \nShoreham by Sea, \nWest Sussex, \nBN43 5RZ",
- },
- "2": {
- "0": "Description (if \nany)",
- "1": "Green Party",
- "2": "Independent",
- "3": "UK Independence \nParty (UKIP)",
- "4": "Labour Party",
- },
- "3": {
- "0": "Name of \nProposer",
- "1": "Tiffin Susan J",
- "2": "Loader Jocelyn C",
- "3": "Hearne James H",
- "4": "O`Connor Lavinia",
- },
- "4": {
- "0": "Reason \nwhy no \nlonger \nnominated\n*",
- "1": "",
- "2": "",
- "3": "",
- "4": "",
- },
- }
- )
- CamelotParsedSOPN.objects.create(
- sopn=doc, raw_data=dataframe, status="unparsed"
- )
- call_command("sopn_parsing_parse_tables")
- self.assertEqual(RawPeople.objects.count(), 1)
- raw_people = RawPeople.objects.get()
- self.assertEqual(
- raw_people.data,
- [
- {"name": "Andrew John Bradbury", "party_id": "PP63"},
- {"name": "Dave Collins", "party_id": "ynmp-party:2"},
- {"name": "Peter John Harvey", "party_id": "PP85"},
- {"name": "Melanie Jenner", "party_id": "PP53"},
- ],
- )
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_welsh_run_sopn(self):
- """
- Test that if the ballot is welsh run and previous party affiliations
- are included they are parsed
- """
- self.assertFalse(RawPeople.objects.exists())
- doc = BallotSOPN.objects.create(
- ballot=self.senedd_ballot,
- source_url="example.com",
- )
-
- plaid_cymru, _ = Party.objects.update_or_create(
- ec_id="PP77",
- legacy_slug="party:77",
- defaults={
- "name": "Plaid Cymru - The Party of Wales",
- "date_registered": "1999-01-14",
- },
- )
-
- dataframe = json.dumps(welsh_sopn_data)
- CamelotParsedSOPN.objects.create(
- sopn=doc, raw_data=dataframe, status="unparsed"
- )
- call_command("sopn_parsing_parse_tables")
- self.assertEqual(RawPeople.objects.count(), 1)
- raw_people = RawPeople.objects.get()
- self.assertEqual(
- raw_people.data,
- [
- {
- "name": "John Smith",
- "party_id": self.conservative_party.ec_id,
- "previous_party_affiliations": [self.ld_party.ec_id],
- },
- {
- "name": "Joe Bloggs",
- "party_id": self.labour_party.ec_id,
- "previous_party_affiliations": ["ynmp-party:2"],
- },
- {"name": "Jon Doe", "party_id": self.ld_party.ec_id},
- {
- "name": "Jane Brown",
- "party_id": "ynmp-party:2",
- "previous_party_affiliations": [plaid_cymru.ec_id],
- },
- {
- "name": "Judy Johnson",
- "party_id": plaid_cymru.ec_id,
- "previous_party_affiliations": [self.labour_party.ec_id],
- },
- {"name": "Julie Williams", "party_id": "ynmp-party:2"},
- ],
- )
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_match_complex_descriptions(self):
- self.assertFalse(RawPeople.objects.exists())
- doc = BallotSOPN.objects.create(
- ballot=self.senedd_ballot,
- source_url="example.com",
- )
-
- plaid_cymru, _ = Party.objects.update_or_create(
- ec_id="PP77",
- legacy_slug="party:77",
- defaults={
- "name": "Plaid Cymru - The Party of Wales",
- "date_registered": "1999-01-14",
- },
- )
-
- dickens_heath, _ = Party.objects.update_or_create(
- ec_id="PP1",
- legacy_slug="PP!",
- defaults={
- "name": "Independent Dickens Heath Residents Action Group",
- "date_registered": "1999-01-14",
- },
- )
- PartyDescription.objects.create(
- party=dickens_heath,
- description="Independent Dickens Heath Residents Action Group",
- )
- lib_dem, _ = Party.objects.update_or_create(
- ec_id="PP100",
- legacy_slug="PP100",
- defaults={
- "name": "Liberal Democrats",
- "date_registered": "1999-01-14",
- },
- register="GB",
- )
-
- PartyDescription.objects.create(
- party=lib_dem,
- description="Liberal Democrat Focus Team | Tîm Ffocws y Democratiaid Rhyddfrydol",
- )
-
- data_path = (
- Path(__file__).parent / "data/edge_case_description_data.json"
- )
- with data_path.open() as f:
- CamelotParsedSOPN.objects.create(
- sopn=doc, raw_data=f.read(), status="unparsed"
- )
- call_command("sopn_parsing_parse_tables")
- self.assertEqual(RawPeople.objects.count(), 1)
- raw_people = RawPeople.objects.get()
- self.assertEqual(
- sorted(raw_people.data, key=lambda x: x["name"]),
- sorted(
- [
- {
- "name": "John Smith",
- "party_id": self.conservative_party.ec_id,
- },
- {
- "name": "Joe Bloggs",
- "party_id": self.labour_party.ec_id,
- },
- {
- "name": "Jon Doe",
- "party_id": self.ld_party.ec_id,
- },
- {
- "name": "Jane Brown",
- "party_id": "ynmp-party:2",
- },
- {
- "name": "Judy Johnson",
- "party_id": plaid_cymru.ec_id,
- },
- {"name": "Julie Williams", "party_id": "ynmp-party:2"},
- ],
- key=lambda x: x["name"],
- ),
- )
-
-
-class TestParseTablesUnitTests(UK2015ExamplesMixin, TestCase):
- def get_two_name_field_cases(self):
- # this could be updated with more combinations as we come across them
- return [
- {
- "name_fields": ["candidate surname", "candidate forename"],
- "row": {
- "candidate surname": "BAGSHAW",
- "candidate forename": "Elaine Sheila",
- "home address": "1 Foo Street \n London \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": [
- "candidate forename",
- "candidate surname",
- ],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- {
- "name_fields": ["surname", "other names"],
- "row": {
- "surname": "BAGSHAW",
- "other names": "Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": ["other names", "surname"],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- {
- "name_fields": ["last name", "other names"],
- "row": {
- "last name": "BAGSHAW",
- "other names": "Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": ["other names", "last name"],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- {
- "name_fields": ["candidate forename", "candidate surname"],
- "row": {
- "candidate forename": "Elaine Sheila",
- "candidate surname": "BAGSHAW",
- "home address": "1 Foo Street \n London \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": [
- "candidate forename",
- "candidate surname",
- ],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- ]
-
- def get_single_name_field_cases(self):
- return [
- {
- "name_fields": ["name of candidate"],
- "row": {
- "name of candidate": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \n London \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["names of candidate"],
- "row": {
- "names of candidate": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["candidate name"],
- "row": {
- "candidate name": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["surname"],
- "row": {
- "surname": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["candidates surname"],
- "row": {
- "candidates surname": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["other name"],
- "row": {
- "other name": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- ]
-
- def test_get_name_single_field(self):
- for case in self.get_single_name_field_cases():
- row = Series(case["row"])
- name_fields = case["name_fields"]
- with self.subTest(name_fields=name_fields):
- assert len(case["name_fields"]) == 1
- name = parse_tables.get_name(row=row, name_fields=name_fields)
- assert name == "Elaine Sheila Bagshaw"
-
- def test_get_name_two_fields(self):
- for case in self.get_two_name_field_cases():
- row = Series(case["row"])
- name_fields = case["name_fields"]
- with self.subTest(name_fields=name_fields):
- assert len(case["name_fields"]) == 2
- name = parse_tables.get_name(row=row, name_fields=name_fields)
- assert name == case["expected_name"]
-
- def test_get_name_fields_single(self):
- for case in self.get_single_name_field_cases():
- row = Index(case["row"])
- with self.subTest(row=row):
- name_fields = parse_tables.get_name_fields(row=row)
- assert len(name_fields) == 1
- assert name_fields == case["name_fields"]
-
- def test_get_name_fields_two(self):
- for case in self.get_two_name_field_cases():
- row = Index(case["row"])
- with self.subTest(row=row):
- name_fields = parse_tables.get_name_fields(row=row)
- assert len(name_fields) == 2
- assert name_fields == case["name_fields"]
-
- def test_get_name_fields_raises_error(self):
- row = Index({"foo": "Bar"})
- with self.assertRaises(ValueError):
- parse_tables.get_name_fields(row=row)
-
- def test_order_name_fields(self):
- for case in self.get_two_name_field_cases():
- name_fields = case["name_fields"]
- with self.subTest(name_fields=name_fields):
- result = parse_tables.order_name_fields(name_fields)
- assert result == case["ordered_name_fields"]
-
- def test_clean_name_replaces_backticks(self):
- name = parse_tables.clean_name("D`SOUZA")
- assert "`" not in name
- assert "'" in name
-
- def test_clean_name_replaces_newlines(self):
- name = parse_tables.clean_name(
- "A Very Long Name That Splits \nOver Lines"
- )
- assert "\n" not in name
-
- def test_clean_name_capitalized_last_and_titalized(self):
- name = parse_tables.clean_name("SMITH John")
- assert name == "John Smith"
-
- def test_clean_last_names(self):
- name = parse_tables.clean_last_names(["MACDONALD", "John"])
- assert name == "MacDonald"
-
- def test_clean_name_two_word_surnames(self):
- names = [
- ("EDE COOPER \nPalmer", "Palmer Ede Cooper"),
- ("VAN DULKEN \nRichard Michael", "Richard Michael Van Dulken"),
- ("ARMSTRONG LILLEY \nLynne", "Lynne Armstrong Lilley"),
- (
- " D`SOUZA Aaron Anthony Jose \nHasan",
- "Aaron Anthony Jose Hasan D'Souza",
- ),
- ("Michael James Collins", "Michael James Collins"),
- (" Michael James Collins ", "Michael James Collins"),
- ("DAVE Nitesh Pravin", "Nitesh Pravin Dave"),
- ("DAVE\nNitesh Pravin", "Nitesh Pravin Dave"),
- ("COOKE Anne-Marie", "Anne-Marie Cooke"),
- ("COOKE\nAnne-Marie", "Anne-Marie Cooke"),
- ("BROOKES-\nDUNCAN\nKaty", "Katy Brookes-Duncan"),
- ("HOUNSOME\nJohn", "John Hounsome"),
- ("O`CONNELL \nStephen John", "Stephen John O'Connell"),
- ("O`NEAL \nCarol Joy", "Carol Joy O'Neal"),
- ("O`REILLY \nTracey Linda \nDiane", "Tracey Linda Diane O'Reilly"),
- ("LIAM THOMAS O'ROURKE", "Liam Thomas O'Rourke"),
- ("O'CALLAGHAN \nClaire Louise", "Claire Louise O'Callaghan"),
- ]
- for name in names:
- with self.subTest(name=names[0]):
- assert parse_tables.clean_name(name[0]) == name[1]
-
- def test_clean_description_removes_newlines(self):
- cleaned_description = parse_tables.clean_description(
- "A Long Description That Splits \nOver \\nLines"
- )
- assert "\n" not in cleaned_description
- assert "\\n" not in cleaned_description
-
- def test_clean_description_replaces_backticks(self):
- cleaned_description = parse_tables.clean_description(
- "All People`s Party"
- )
- assert "`" not in cleaned_description
- assert "'" in cleaned_description
- assert cleaned_description == "All People's Party"
-
- def test_guess_previous_party_affiliations_field(self):
- sopn = CamelotParsedSOPN(raw_data=json.dumps(welsh_sopn_data))
- data = sopn.as_pandas
- data.columns = data.iloc[0]
-
- cases = [
- (self.dulwich_post_ballot, None),
- (self.senedd_ballot, "statement of party membership"),
- ]
- for case in cases:
- with self.subTest(msg=case[0]):
- sopn.sopn = BallotSOPN(ballot=case[0])
- result = parse_tables.guess_previous_party_affiliations_field(
- data=data, sopn=sopn
- )
- assert result == case[1]
-
- def test_add_previous_party_affiliations(self):
- cases = [
- {"party_str": "", "party": None, "expected": {}},
- {"party_str": "Unknown Party", "party": None, "expected": {}},
- {
- "party_str": "Labour Party",
- "party": self.labour_party,
- "expected": {
- "previous_party_affiliations": [self.labour_party.ec_id]
- },
- },
- ]
- for case in cases:
- with self.subTest(msg=case["party_str"]), patch.object(
- parse_tables, "get_party", return_value=case["party"]
- ):
- raw_data = {}
- sopn = CamelotParsedSOPN()
- result = parse_tables.add_previous_party_affiliations(
- party_str=case["party_str"],
- raw_data=raw_data,
- sopn=sopn,
- )
- assert result == case["expected"]
-
-
-class TestParseTablesFilterKwargs(TestCase):
- def setUp(self):
- self.command = ParseTablesCommand()
- self.default_filter_kwargs = {}
-
- def test_when_testing(self):
- options = {"testing": True}
- result = self.command.build_filter_kwargs(options)
- self.assertEqual(result, self.default_filter_kwargs)
-
- def test_when_using_ballot(self):
- options = {"ballot": "local.foo.bar.2021-05-06"}
- result = self.command.build_filter_kwargs(options)
- self.assertEqual(result, self.default_filter_kwargs)
-
- def test_when_using_reparse(self):
- options = {"reparse": True}
- result = self.command.build_filter_kwargs(options)
- expected = self.default_filter_kwargs.copy()
- expected["rawpeople__source_type"] = RawPeople.SOURCE_PARSED_PDF
- self.assertEqual(result, expected)
-
- def test_when_no_options(self):
- options = {}
- result = self.command.build_filter_kwargs(options)
- expected = self.default_filter_kwargs.copy()
- self.assertEqual(result, expected)