diff --git a/.gitignore b/.gitignore index d08c2cda36..38c331c5a1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,8 +24,6 @@ test-results node_modules/ .vscode/ /test-env -/ynr/apps/sopn_parsing/tests/data/sopn_baseline.json -/ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json # PyCharm .idea/ diff --git a/Makefile b/Makefile deleted file mode 100644 index f8d2865666..0000000000 --- a/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -export DJANGO_SETTINGS_MODULE?=ynr.settings.sopn_testing - - -.PHONY: sopn-runserver -sopn-runserver: - python manage.py runserver - -.PHONY: sopn-shell -sopn-shell: - python manage.py shell_plus - -.PHONY: migrate-db -migrate-db: - python manage.py migrate - -.PHONY: test-sopns -test-sopns: migrate-db - python manage.py sopn_tooling_compare_raw_people --election-slugs= --ballot= --date 2021-05-06 - -.PHONY: download-sopns -download-sopns: - python manage.py migrate --no-input - python manage.py sopn_tooling_create_official_documents --election-slugs= --date 2021-05-06 - -.PHONY: populate-sopn-testing-database -populate-sopn-testing-database: migrate-db - python manage.py candidates_import_from_live_site - -.PHONY: delete-test-sopns -delete-test-sopns: - python manage.py sopn_tooling_clear_existing_objects - rm -rf ./ynr/media/sopn_testing/ - -.PHONY: create-baseline-file -create-baseline-file: - python manage.py sopn_tooling_write_baseline - -.PHONY: copy-baseline-file -copy-baseline-file: - cp ynr/apps/sopn_parsing/tests/data/sopn_baseline.json ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json - -.PHONY: prod-import-sopns -prod-import-sopns: - cd deploy; \ - ansible-playbook import_sopns.yml diff --git a/ynr/apps/bulk_adding/tests/test_bulk_add.py b/ynr/apps/bulk_adding/tests/test_bulk_add.py index 5361cda441..e18589768a 100644 --- a/ynr/apps/bulk_adding/tests/test_bulk_add.py +++ b/ynr/apps/bulk_adding/tests/test_bulk_add.py @@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self): self.assertContains(resp, 
"Review candidates") resp = form.submit() self.assertContains(resp, "Bart Simpson") - - def test_fall_back_to_camelot_if_no_textract(self): - data = {"name": "Bart", "party_id": "PP52"} - - raw_people = RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[data], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - - self.assertEqual( - raw_people.as_form_kwargs(), - { - "initial": [ - { - "name": "Bart", - "party": ["PP52", "PP52"], - "previous_party_affiliations": [], - "source": "", - } - ] - }, - ) - raw_people.delete() - - textract_data = {"name": "Lisa", "party_id": "PP53"} - raw_people = RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[data], - textract_data=[textract_data], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - - self.assertEqual( - raw_people.as_form_kwargs(), - { - "initial": [ - { - "name": "Lisa", - "party": ["PP53", "PP53"], - "previous_party_affiliations": [], - "source": "", - } - ] - }, - ) - - def test_can_change_parser_in_frontend(self): - """ - Check that a query param can change the parser we use - """ - BallotSOPN.objects.create( - source_url="http://example.com", - ballot=self.dulwich_post_ballot, - uploaded_file="sopn.pdf", - ) - RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[{"name": "Bart", "party_id": "PP52"}], - textract_data=[{"name": "Lisa", "party_id": "PP53"}], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - response = self.app.get( - "/bulk_adding/sopn/parl.65808.2015-05-07/", user=self.user - ) - form = response.forms["bulk_add_form"] - # This should be the Textract data - self.assertEqual(form.fields["form-0-name"][0].value, "Lisa") - - response = self.app.get( - "/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1", - user=self.user, - ) - form = response.forms["bulk_add_form"] - # This should be the Textract data - self.assertEqual(form.fields["form-0-name"][0].value, "Bart") diff --git a/ynr/apps/bulk_adding/views/sopns.py b/ynr/apps/bulk_adding/views/sopns.py 
index e0003e9c9e..6564db69a5 100644 --- a/ynr/apps/bulk_adding/views/sopns.py +++ b/ynr/apps/bulk_adding/views/sopns.py @@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs): return super().get(request, *args, **kwargs) def get_active_parser(self) -> Optional[SOPNParsingBackends]: - if self.request.GET.get("v1_parser"): - return SOPNParsingBackends.CAMELOT if self.ballot.rawpeople.textract_data: return SOPNParsingBackends.TEXTRACT - if self.ballot.rawpeople.data: - return SOPNParsingBackends.CAMELOT return None def get_context_data(self, **kwargs): diff --git a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html index c61ab936ce..620857b058 100644 --- a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html +++ b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html @@ -7,28 +7,12 @@

Parsing Status

-

Camelot raw Data

- {% if object.sopn.camelotparsedsopn.raw_data %} -
{{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}
- {% else %} - N/A - {% endif %} - -

Camelot table Data

- {% if object.sopn.camelotparsedsopn.data_as_html %} - {{ object.sopn.camelotparsedsopn.data_as_html|safe }} - {% else %} - N/A - {% endif %} -
- {% if textract_parsed and textract_parsed.as_textractor_document %}

AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}

diff --git a/ynr/apps/official_documents/models.py b/ynr/apps/official_documents/models.py index 15de317657..55ca3e7d1d 100644 --- a/ynr/apps/official_documents/models.py +++ b/ynr/apps/official_documents/models.py @@ -260,7 +260,6 @@ def parse(self): """ - from sopn_parsing.helpers.extract_tables import extract_ballot_table from sopn_parsing.helpers.textract_helpers import ( NotUsingAWSException, TextractSOPNHelper, @@ -276,12 +275,6 @@ def parse(self): # There's a cron job that should pick up the result and carry on parsing later. textract_helper.start_detection() - if getattr( - settings, "CAMELOT_ENABLED", False - ) and self.uploaded_file.name.endswith(".pdf"): - # Camelot - extract_ballot_table(self.ballot) - class BallotSOPNHistory(BaseBallotSOPN): ballot = models.ForeignKey( diff --git a/ynr/apps/official_documents/tests/test_upload.py b/ynr/apps/official_documents/tests/test_upload.py index 03423f9ce7..443cc97fc6 100644 --- a/ynr/apps/official_documents/tests/test_upload.py +++ b/ynr/apps/official_documents/tests/test_upload.py @@ -114,20 +114,9 @@ def test_upload_authorized(self): with open(self.example_image_filename, "rb") as f: form["uploaded_file"] = Upload("pilot.jpg", f.read()) - # TODO: Add back in - # with patch( - # "official_documents.views.extract_pages_for_ballot" - # ) as extract_pages, patch( - # "official_documents.views.extract_ballot_table" - # ) as extract_tables, patch( - # "official_documents.views.parse_raw_data_for_ballot" - # ) as parse_tables: + response = form.submit() self.assertEqual(response.status_code, 302) - # TODO: Add back in - # extract_pages.assert_called_once() - # extract_tables.assert_called_once() - # parse_tables.assert_called_once() ballot_sopns = BallotSOPN.objects.all() self.assertEqual(ballot_sopns.count(), 1) @@ -181,20 +170,8 @@ def test_docx_upload_form_validation(self): with open(self.example_docx_filename, "rb") as f: form["uploaded_file"] = Upload("pilot.docx", f.read()) - # TODO: add back in - # with 
patch( - # "official_documents.views.extract_pages_for_ballot" - # ) as extract_pages, patch( - # "official_documents.views.extract_ballot_table" - # ) as extract_tables, patch( - # "official_documents.views.parse_raw_data_for_ballot" - # ) as parse_tables: response = form.submit() self.assertEqual(response.status_code, 302) - # TODO Add back in - # extract_pages.assert_called_once() - # extract_tables.assert_called_once() - # parse_tables.assert_called_once() self.assertEqual(BallotSOPN.objects.count(), 1) self.assertEqual(response.location, self.ballot.get_sopn_url()) diff --git a/ynr/apps/sopn_parsing/helpers/extract_tables.py b/ynr/apps/sopn_parsing/helpers/extract_tables.py deleted file mode 100644 index 0b610c7847..0000000000 --- a/ynr/apps/sopn_parsing/helpers/extract_tables.py +++ /dev/null @@ -1,63 +0,0 @@ -import json - -import pandas as pd -from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text -from sopn_parsing.models import CamelotParsedSOPN - - -def extract_ballot_table(ballot, parse_flavor="lattice"): - """ - Given a OfficialDocument model, update or create a CamelotParsedSOPN model with the - contents of the table as a JSON string. - - :type ballot: candidates.models.Ballot - - """ - import camelot # import here to avoid import error running tests without pdf deps installed - - document = ballot.sopn - try: - tables = camelot.read_pdf( - document.uploaded_file.path, - pages="all", - flavor=parse_flavor, - ) - except (NotImplementedError, AttributeError): - # * NotImplementedError is thrown if the PDF is an image or generally - # unreadable. - # * AttributeError is thrown on some PDFs saying they need a password. 
- # Assume this is a bug in camelot, and ignore these PDFs - raise NoTextInDocumentError() - - # Tables can span pages, camelot assumes they're different tables, so we - # need to join them back together - table_list = [] - for table in tables: - table_list.append(table) - table_list.sort(key=lambda t: (t.page, t.order)) - - if not table_list: - return None - - table_data = table_list.pop(0).df - - for table in table_list: - # It's possible to have the "situation of poll" document on the SOPN - # Ignore any table that contains "polling station" (SOPNs tables don't) - table = table.df - first_row = table.iloc[0].to_string() - - if "polling station" in clean_text(first_row): - break - # Append the continuation table to the first one in the document. - # ignore_index is needed so the e.g table 2 row 1 doesn't replace - # table 1 row 1 - table_data = pd.concat([table_data, table], ignore_index=True) - - if not table_data.empty: - parsed, _ = CamelotParsedSOPN.objects.update_or_create( - sopn=document, - defaults={"raw_data": json.dumps(table_data.to_dict())}, - ) - return parsed - return None diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py index 247d0e7e31..e8152b0816 100644 --- a/ynr/apps/sopn_parsing/helpers/parse_tables.py +++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py @@ -476,20 +476,12 @@ def parse_dataframe(ballot: Ballot, df: DataFrame): def parse_raw_data(ballot: Ballot, reparse=False): """ - Given a Ballot, go and get the Camelot and the AWS Textract dataframes + Given a Ballot, go and get the AWS Textract dataframes and process them """ - camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None) - camelot_data = {} textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None) textract_data = {} - if ( - camelot_model - and camelot_model.raw_data_type == "pandas" - and (reparse or not camelot_model.parsed_data) - ): - camelot_data = parse_dataframe(ballot, camelot_model.as_pandas) if 
( textract_model and textract_model.raw_data @@ -500,7 +492,7 @@ def parse_raw_data(ballot: Ballot, reparse=False): textract_model.parse_raw_data() textract_data = parse_dataframe(ballot, textract_model.as_pandas) - if camelot_data or textract_data: + if textract_data: # Check there isn't a rawpeople object from another (better) source rawpeople_qs = RawPeople.objects.filter(ballot=ballot).exclude( source_type=RawPeople.SOURCE_PARSED_PDF @@ -510,7 +502,7 @@ def parse_raw_data(ballot: Ballot, reparse=False): RawPeople.objects.update_or_create( ballot=ballot, defaults={ - "data": camelot_data or "", + "data": "", "textract_data": textract_data or "", "source": "Parsed from {}".format( ballot.sopn.source_url @@ -525,17 +517,10 @@ def parse_raw_data(ballot: Ballot, reparse=False): return # We've done the parsing, so let's still save the result storage = DefaultStorage() - storage.save( - f"raw_people/camelot_{ballot.ballot_paper_id}.json", - ContentFile(json.dumps(camelot_data, indent=4).encode("utf8")), - ) storage.save( f"raw_people/textract_{ballot.ballot_paper_id}.json", ContentFile(json.dumps(textract_data, indent=4).encode("utf8")), ) - if camelot_model: - ballot.sopn.camelotparsedsopn.status = "parsed" - ballot.sopn.camelotparsedsopn.save() if textract_model: ballot.sopn.awstextractparsedsopn.status = "parsed" ballot.sopn.awstextractparsedsopn.save() diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py deleted file mode 100644 index 3a4e091290..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py +++ /dev/null @@ -1,29 +0,0 @@ -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.helpers.extract_tables import extract_ballot_table -from sopn_parsing.helpers.text_helpers import NoTextInDocumentError - - -class Command(BaseSOPNParsingCommand): - help = """ - Parse tables out of PDFs in 
to CamelotParsedSOPN models for later parsing. - """ - - def handle(self, *args, **options): - qs = self.get_queryset(options) - filter_kwargs = {} - if not options["ballot"] and not options["testing"]: - if not options["reparse"]: - filter_kwargs["sopn__camelotparsedsopn"] = None - - qs = qs.filter(**filter_kwargs) - for ballot in qs: - try: - extract_ballot_table(ballot) - except NoTextInDocumentError: - self.stdout.write( - f"{ballot} raised a NoTextInDocumentError trying to extract tables" - ) - except ValueError: - self.stdout.write( - f"{ballot} raised a ValueError trying extract tables" - ) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py deleted file mode 100644 index 26448b697f..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py +++ /dev/null @@ -1,67 +0,0 @@ -from bulk_adding.models import RawPeople -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot - - -class Command(BaseSOPNParsingCommand): - help = """ - Convert the raw extracted tables on the CamelotParsedSOPN model to a parsed - RawPeople model, and set the status as parsed. 
- - """ - - def build_filter_kwargs(self, options): - """ - Build kwargs used to filter the BallotQuerySet that is parsed - - Always skip any ballots where we do not have a CamelotParsedSOPN to try to - extract candidates from - - When test flag is used, dont make any changes - - When parsing a single ballot, dont make any changes - - When reparsing, only use ballots where we have previously created a - RawPeople object from a CamelotParsedSOPN - - Otherwise filter by unparsed CamelotParsedSOPN objects - """ - # Always skip any ballots where we do not have a CamelotParsedSOPN to try to - # extract candidates from - filter_kwargs = {} - if options.get("testing"): - return filter_kwargs - - if options.get("ballot"): - return filter_kwargs - - if options.get("reparse"): - filter_kwargs[ - "rawpeople__source_type" - ] = RawPeople.SOURCE_PARSED_PDF - return filter_kwargs - - return filter_kwargs - - def handle(self, *args, **options): - # filters that we never change with args. These two would raise - # ValueErrors in the parse_raw_data_for_ballot function - base_qs = self.get_queryset(options) - filter_kwargs = self.build_filter_kwargs(options) - - qs = base_qs.filter(**filter_kwargs) - qs = qs.filter( - candidates_locked=False, # Never parse a locked ballot - suggestedpostlock=None, # Never parse a ballot with lock suggestions - ) - - if not qs.exists(): - msg = ["No ballots to parse found."] - - if options.get("ballot"): - msg.append( - "This ballot might be locked or have lock suggestions" - ) - - self.stderr.write("\n".join(msg)) - - for ballot in qs: - try: - parse_raw_data_for_ballot(ballot, options["reparse"]) - except ValueError as e: - print(str(e)) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py index 41db0e1f57..7b38b54b6a 100644 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py +++ 
b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py @@ -1,4 +1,3 @@ -from django.conf import settings from django.core.management.base import BaseCommand from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot from sopn_parsing.helpers.textract_helpers import ( @@ -8,7 +7,6 @@ from sopn_parsing.models import ( AWSTextractParsedSOPN, AWSTextractParsedSOPNStatus, - CamelotParsedSOPN, ) @@ -21,22 +19,16 @@ class Command(BaseCommand): This script picks up where `parse` left off. It manages two cases: - # Camelot - - We expect to have made a `CamelotParsedSOPN` with `raw_data` populated. This will only have - happened if the file is a PDF readable by Camelot. - - We need to parse the `raw_data` into `parsed_data` and then make a `RawData` object for bulk adding. - # AWS Textract - We should have made a `AWSTextractParsedSOPN` with `job_id` populated. Textract is async, - so the initial `parse` just submits the data to AWS and gets a job_id. + We should have made a `AWSTextractParsedSOPN` with `job_id` populated. + Textract is async, so the initial `parse` just submits the data to AWS and + gets a job_id. We need to check if the job ID has finished and pull in the data to `raw_data`. - We're then in the same state as the Camelot method above, we need to parse the `raw_data` into - `parsed_data` and makr a `RawData` object for bulk adding. + We need to parse the `raw_data` into `parsed_data` and make a `RawData` + object for bulk adding. 
""" def handle(self, *args, **options): @@ -45,15 +37,6 @@ def handle(self, *args, **options): "sopn__ballot__candidates_locked": False, } - if getattr(settings, "CAMELOT_ENABLED", False): - # Camelot first - qs = ( - CamelotParsedSOPN.objects.filter(parsed_data=None) - .exclude(raw_data="") - .filter(**current_ballot_kwargs) - ) - self.parse_tables_for_qs(qs) - # Textract qs = AWSTextractParsedSOPN.objects.exclude( status__in=[ diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py deleted file mode 100644 index cb68ffdf02..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py +++ /dev/null @@ -1,27 +0,0 @@ -from bulk_adding.models import RawPeople -from django.conf import settings -from django.core.management.base import BaseCommand -from official_documents.models import OfficialDocument - - -class Command(BaseCommand): - """ - Used to quickly delete existing objects used when testing SOPN - parsing so that you can start fresh for example, when you want - to start testing a new set of SOPNs. 
- """ - - def print_deleted(self, deleted_dict): - for object, count in deleted_dict.items(): - self.stdout.write(f"Deleted {count} {object}") - - def handle(self, *args, **options): - if settings.SETTINGS_MODULE != "ynr.settings.sopn_testing": - raise ValueError( - "You are trying to run this command outside of SOPN testing environment" - ) - - deleted_dict = {} - deleted_dict.update(OfficialDocument.objects.all().delete()[1]) - deleted_dict.update(RawPeople.objects.all().delete()[1]) - self.print_deleted(deleted_dict) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py deleted file mode 100644 index dbe5eb913a..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py +++ /dev/null @@ -1,237 +0,0 @@ -import json -import os -from collections import Counter - -from bulk_adding.models import RawPeople -from candidates.models import Ballot -from django.core.management import call_command -from official_documents.models import OfficialDocument -from popolo.models import Membership -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.models import CamelotParsedSOPN - - -class Command(BaseSOPNParsingCommand): - CORRECT_EXACTLY = "correct_exactly" - NUM_CORRECT_MISSING_PARTIES = "num_correct_some_parties_missing" - NUM_INCORRECT = "num_incorrect" - ZERO_CANDIDATES = "zero_candidates" - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument("--loud", action="store_true", default=False) - - def handle(self, *args, **options): - """ - - Check we have a baseline file to compare with - - Prepare some OfficialDocuments - - Re-parse the documents - - Loop through the created RawPeople objects, comparing to our baseline - to make sure that we are parsing at least as many people as before - - If no asserts failed, use the data to write a new baseline file - 
""" - - self.loud = options.pop("loud") - - self.candidates_results = { - "correct_exactly": [], - "num_correct_some_parties_missing": [], - "num_incorrect": [], - "zero_candidates": [], - } - - raw_people_file = "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json" - if not os.path.isfile(raw_people_file): - call_command("sopn_tooling_write_baseline") - self.stdout.write("Baseline file didn't exist so one was created") - - options.update({"testing": True}) - - OfficialDocument.objects.update(relevant_pages="") - call_command("sopn_parsing_extract_page_numbers", *args, **options) - CamelotParsedSOPN.objects.all().delete() - call_command("sopn_parsing_extract_tables", *args, **options) - RawPeople.objects.all().delete() - call_command("sopn_parsing_parse_tables", *args, **options) - - with open(raw_people_file) as file: - old_raw_people = json.loads(file.read()) - - self.new_raw_people = {} - for ballot in Ballot.objects.exclude(officialdocument__isnull=True): - ballot_data = old_raw_people.get(ballot.ballot_paper_id, {}) - - self.compare_relevant_pages(ballot=ballot, ballot_data=ballot_data) - - self.compare_raw_people(ballot=ballot, ballot_data=ballot_data) - - # display some overall totals - self.stdout.write( - "Old total 'people' parsed WAS {old}\n" - "New total 'people' parsed IS {new}".format( - old=self.count_people_parsed(old_raw_people), - new=self.count_people_parsed(self.new_raw_people), - ) - ) - - old_raw_people_obj_count = len( - {k: v for k, v in old_raw_people.items() if v["raw_people"]} - ) - new_raw_people_obj_count = RawPeople.objects.count() - style = self.style.SUCCESS - if new_raw_people_obj_count < old_raw_people_obj_count: - style = self.style.ERROR - self.stdout.write( - style( - f"Old RawPeople count: {old_raw_people_obj_count}\n" - f"New total RawPeople count: {new_raw_people_obj_count}" - ) - ) - - for result, ballots in self.candidates_results.items(): - total = len(ballots) - self.stdout.write(f"{total} ballots parsed {result}") - # 
Write a new baseline - call_command("sopn_tooling_write_baseline") - - def compare_relevant_pages(self, ballot, ballot_data): - old_relevant_pages = ballot_data.get("relevant_pages", "") - new_relevant_pages = ballot.sopn.relevant_pages - - if old_relevant_pages != new_relevant_pages: - self.stdout.write( - self.style.WARNING( - f"RELEVANT PAGES CHANGED FROM {old_relevant_pages} to {new_relevant_pages} for {ballot.ballot_paper_id}" - ) - ) - - def compare_raw_people(self, ballot, ballot_data): - try: - raw_people = ballot.rawpeople.data - except RawPeople.DoesNotExist: - raw_people = [] - - old_raw_people_for_ballot = ballot_data.get("raw_people", []) - old_count = len(old_raw_people_for_ballot) - new_count = len(raw_people) - if new_count < old_count: - self.stderr.write( - f"Uh oh, parsed people for {ballot.ballot_paper_id} decreased from {old_count} to {new_count}. Stopping." - ) - - if new_count > old_count: - self.stdout.write( - f"{ballot.ballot_paper_id} increased from {old_count} to {new_count} parsed people.\n" - f"Check the SOPN at https://candidates.democracyclub.org.uk{ballot.get_sopn_url()}." - ) - for person in raw_people: - if person not in old_raw_people_for_ballot: - self.stdout.write(self.style.SUCCESS(person)) - - # when people parsed have changed e.g. different name/different party print it for further checking - changed_people = [ - person - for person in old_raw_people_for_ballot - if person not in raw_people - ] - if changed_people: - self.stdout.write( - self.style.WARNING( - f"Parsed data changed for {ballot.ballot_paper_id}\n" - f"New raw people data:\n" - f"{raw_people}\n" - "Missing people:" - ) - ) - for person in changed_people: - self.stderr.write(str(person)) - - self.new_raw_people[ballot.ballot_paper_id] = {"raw_people": raw_people} - - self.parties_correct(ballot, raw_people) - - def count_people_parsed(self, raw_people_data): - """ - Returns the total number of "people" that were parsed. 
- NB that just because something was parsed, it doesnt mean that it was - accurately parsed. Therefore this total is best used to look for large - changes that should then be checked in detail. - """ - return sum( - [len(data["raw_people"]) for data in raw_people_data.values()] - ) - - def parties_correct(self, ballot, raw_people_for_ballot): - candidates = Membership.objects.filter(ballot=ballot) - if not candidates: - self.stdout.write( - self.style.WARNING( - f"We dont have candidates for {ballot.ballot_paper_id}. Try updating with the live site first?" - ) - ) - - if not raw_people_for_ballot: - self.candidates_results[self.ZERO_CANDIDATES].append( - ballot.ballot_paper_id - ) - return None - - num_candidates_correct = candidates.count() == len( - raw_people_for_ballot - ) - - if self.loud: - if num_candidates_correct: - self.stdout.write( - self.style.SUCCESS( - f"Correct number of people parsed as expected for {ballot.ballot_paper_id}" - ) - ) - else: - self.stdout.write( - self.style.ERROR( - f"Incorrect number of people parsed for {ballot.ballot_paper_id}" - ) - ) - - parsed = sorted( - [person["party_id"] for person in raw_people_for_ballot] - ) - expected = list( - candidates.values_list("party__ec_id", flat=True).order_by( - "party__ec_id" - ) - ) - - if parsed == expected: - return self.candidates_results[self.CORRECT_EXACTLY].append( - ballot.ballot_paper_id - ) - - # count number of each missing party ID as there could be more than one - # missing candidate for a party e.g. 
1 missing Green, 2 missing independents - parsed = Counter(parsed) - expected = Counter(expected) - missing = expected - parsed - if missing: - total = sum(missing.values()) - self.stderr.write( - f"{total} MISSING parties for {ballot.ballot_paper_id} (party_id:num_missing)\n{missing}" - ) - else: - # sometimes we incorrectly parse extra people - often independents - # due to an empty row - extras = parsed - expected - total = sum(extras.values()) - self.stderr.write( - f"{total} EXTRA parties for {ballot.ballot_paper_id}\n{extras}" - ) - - if num_candidates_correct: - return self.candidates_results[ - self.NUM_CORRECT_MISSING_PARTIES - ].append(ballot.ballot_paper_id) - - return self.candidates_results[self.NUM_INCORRECT].append( - ballot.ballot_paper_id - ) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py deleted file mode 100644 index e7c3f3e1b2..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py +++ /dev/null @@ -1,124 +0,0 @@ -import requests -from candidates.models import Ballot -from django.conf import settings -from django.core.files.base import ContentFile -from django.core.management.base import BaseCommand -from elections.models import Election -from official_documents.models import OfficialDocument - - -class Command(BaseCommand): - """This command uses the ballots endpoint to loop over each - ballot and store each sopn pdf (uploaded_file) locally""" - - def add_arguments(self, parser): - parser.add_argument( - "--date", - "-d", - action="store", - help="Election date in ISO format, defaults to 2021-05-06", - default="2021-05-06", - type=str, - ) - parser.add_argument( - "--site_url", - "-u", - action="store", - help="URL of site to download from", - default="https://candidates.democracyclub.org.uk/", - type=str, - ) - parser.add_argument( - "--election-count", - "-c", - 
action="store", - help="URL of site to download from", - default=50, - type=int, - ) - parser.add_argument( - "--election-slugs", "-s", action="store", required=False - ) - - def handle(self, *args, **options): - site_url = options.get("site_url") - election_date = options.get("date") - election_count = options.get("election_count") - - if options["election_slugs"]: - election_slugs = options["election_slugs"].split(",") - else: - election_slugs = Election.objects.filter( - election_date=election_date - ).values_list("slug", flat=True)[:election_count] - - for slug in election_slugs: - url = f"{site_url}api/next/ballots/?has_sopn=1&page_size=200&election_id={slug}&auth_token={settings.YNR_API_KEY}" - self.create_official_documents(url=url) - - def create_official_documents(self, url): - data = requests.get(url=url).json() - try: - next_page = data["next"] - except KeyError: - next_page = None - if "results" in data: - for ballot_data in data["results"]: - ballot = Ballot.objects.get( - ballot_paper_id=ballot_data["ballot_paper_id"] - ) - sopn_data = ballot_data["sopn"] - - # if we already have the SOPN no need to recreate - if ballot.officialdocument_set.filter( - source_url=sopn_data["source_url"] - ).exists(): - self.stdout.write( - f"SOPN already exists for {ballot.ballot_paper_id}" - ) - continue - - # check if we already have an OfficialDocument with this source - # downloaded - official_document = OfficialDocument.objects.filter( - source_url=sopn_data["source_url"] - ).first() - if official_document: - # if so we dont need to redownload the file, we can create a new - # object for this ballot with the same file - self.stdout.write( - f"Found SOPN for source {sopn_data['source_url']}" - ) - OfficialDocument.objects.create( - ballot=ballot, - source_url=sopn_data["source_url"], - uploaded_file=official_document.uploaded_file, - document_type=OfficialDocument.NOMINATION_PAPER, - ) - continue - - # otherwise we dont have this file stored already, so download it 
as - # part of creating the OfficialDocument - self.stdout.write( - f"Downloading SOPN from {sopn_data['uploaded_file']}" - ) - file_response = requests.get(sopn_data["uploaded_file"]) - file_object = ContentFile(content=file_response.content) - official_document = OfficialDocument( - ballot=ballot, - source_url=sopn_data["source_url"], - document_type=OfficialDocument.NOMINATION_PAPER, - ) - file_extension = sopn_data["uploaded_file"].split(".")[-1] - filename = f"{ballot.ballot_paper_id}.{file_extension}" - official_document.uploaded_file.save( - name=filename, content=file_object - ) - else: - self.stdout.write("No results found") - - # this should only be the case where the election object has > 200 - # ballots e.g. parliamentary elections - if next_page: - return self.create_official_documents(url=next_page) - return None diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py deleted file mode 100644 index 07ae9309cd..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py +++ /dev/null @@ -1,51 +0,0 @@ -import json -import os - -from bulk_adding.models import RawPeople -from candidates.models import Ballot -from django.core.management.base import BaseCommand -from django.db.models import Q - - -class Command(BaseCommand): - """ - Creates a JSON file to represent ballots that have an Officialdocument. - Only include ballots where: - - The source of the RawPeople is from parsing a PDF - - No RawPeople were created from the OfficialDocument. 
This is so that we - will know if we make make improvements that mean more RawPeople are parsed - from an OfficialDocument - """ - - def add_arguments(self, parser): - parser.add_argument( - "--data", - action="store", - help="Dictionary of raw people to write as a baseline", - ) - - def handle(self, *args, **options): - json_data = options["data"] or {} - - if not json_data: - qs = Ballot.objects.exclude(officialdocument__isnull=True).filter( - Q(rawpeople__source_type=RawPeople.SOURCE_PARSED_PDF) - | Q(rawpeople__isnull=True) - ) - for ballot in qs: - raw_people = getattr(ballot, "rawpeople", []) - try: - raw_people = ballot.rawpeople.data - except RawPeople.DoesNotExist: - raw_people = [] - - json_data[ballot.ballot_paper_id] = { - "raw_people": raw_people, - "relevant_pages": ballot.sopn.relevant_pages, - } - - file_path = os.path.join( - os.getcwd(), "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json" - ) - with open(file_path, "w") as f: - f.write(json.dumps(json_data)) diff --git a/ynr/apps/sopn_parsing/tests/test_extract_tables.py b/ynr/apps/sopn_parsing/tests/test_extract_tables.py index 21a03dfb63..bda0d99fd3 100644 --- a/ynr/apps/sopn_parsing/tests/test_extract_tables.py +++ b/ynr/apps/sopn_parsing/tests/test_extract_tables.py @@ -1,15 +1,10 @@ from os.path import abspath, dirname, join -from unittest import skipIf from candidates.tests.helpers import TmpMediaRootMixin from candidates.tests.uk_examples import UK2015ExamplesMixin from django.core.files.uploadedfile import SimpleUploadedFile -from django.core.management import call_command from django.test import TestCase from official_documents.models import BallotSOPN -from sopn_parsing.helpers.extract_tables import extract_ballot_table -from sopn_parsing.models import CamelotParsedSOPN -from sopn_parsing.tests import should_skip_pdf_tests class TestSOPNHelpers(TmpMediaRootMixin, UK2015ExamplesMixin, TestCase): @@ -27,152 +22,3 @@ def setUp(self): uploaded_file=SimpleUploadedFile("sopn.pdf", 
sopn_file), source_url="example.com", ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_extract_tables(self): - extract_ballot_table(self.dulwich_post_ballot) - self.assertEqual( - CamelotParsedSOPN.objects.get().as_pandas.to_dict(), - { - "0": { - "0": "Name of \nCandidate", - "1": "ALAGARATNAM \nRathy", - "2": "BARBER \nJames", - "3": "HAYES \nHelen Elizabeth", - "4": "KANUMANSA \nAmadu", - "5": "KOTECHA \nResham", - "6": "LAMBERT \nRobin Andrew \nDavid", - "7": "NALLY \nSteve", - "8": "NIX \nRashid", - }, - "1": { - "0": "Home \nAddress", - "1": "(address in the \nMitcham and Morden \nConstituency)", - "2": "33 Champion Hill, \nLondon, SE5 8BS", - "3": "11 Woodsyre, \nSydenham Hill, \nLondon, SE26 6SS", - "4": "11 Coleridge House, \nBrowning Street, \nLondon, SE17 1DG", - "5": "(address in the \nRuislip, Northwood \nand Pinner \nConstituency)", - "6": "(address in the \nDuwlich and West \nNorwood \nConstituency)", - "7": "(address in the \nVauxhall \nConstituency)", - "8": "66 Guinness Court, \nLondon, SW3 2PQ", - }, - "2": { - "0": "Description \n(if any)", - "1": "UK Independence \nParty (UKIP)", - "2": "Liberal Democrat", - "3": "Labour Party", - "4": "All People`s Party", - "5": "The Conservative \nParty Candidate", - "6": "Independent", - "7": "Trade Unionist \nand Socialist \nCoalition", - "8": "The Green Party", - }, - "3": { - "0": "Name of Assentors \nProposer(+), Seconder(++)", - "1": "Coleman Alice M + \n" - "Potter Keith S ++ \n" - "Potter Stephanie \n" - "Smith Bryan L \n" - "Anderson Beth \n" - "Lumba Avita \n" - "Andersen Robert \n" - "Patel Sajal \n" - "Stanbury Linda \n" - "Stanbury James", - "2": "Fitchett Keith + \n" - "Price Jonathan ++ \n" - "Gardner Brigid \n" - "Waddington Simon \n" - "Morland Laura \n" - "Lester Rachel \n" - "Pidgeon Caroline \n" - "Hare David \n" - "Hanton Alastair \n" - "Haylett Alexander", - "3": "Samuel Gaynelle + \n" - "Whaley Stephen P ++ \n" - "Brazell Shadi M \n" - "De Souza 
Johnny \n" - "Alcock Heather \n" - "Natzler Robert S \n" - "Pearce Michelle E \n" - "Pickering Robert \n" - "Richardson Katherine G \n" - "Pickard Jane", - "4": "King James + \n" - "King Rosemary ++ \n" - "King David \n" - "Davies Yadalieu \n" - "Sesay Mary \n" - "Rahman Layla K \n" - "Rahman Syed A \n" - "Ahmed Jalaluddin \n" - "Rahman Tajwar S \n" - "Rahman Taamid S", - "5": "Davis James G + \n" - "Bradbury David S ++ \n" - "Badman Susan E \n" - "Hill-Archer Roderick C \n" - "Langley Anne C \n" - "Mitchell Andrew M \n" - "Virgo Marjorie J \n" - "Virgo Philip A \n" - "Chathli Lindsay \n" - "Broomhead Robert A", - "6": "Smith Caitlin + \n" - "Parks Jesse ++ \n" - "Connage Kyesha \n" - "Hendry Perihan \n" - "Mounty E J \n" - "Sharif B \n" - "Scott Wellesley \n" - "Harriott S A \n" - "Harriott Clive \n" - "Ojumu Ibi", - "7": "Tullis Andrew C + \n" - "Mason Joshua H ++ \n" - "Parkinson Francine M \n" - "Gait Elizabeth \n" - "Doolan Samantha \n" - "Ubiaro Elizabeth \n" - "Garner Stuart \n" - "Akinjogbin Dolapo \n" - "Walker Donna \n" - "Lang Geoffrey P", - "8": "Atwell E G + \n" - "Rose Lloyd ++ \n" - "O`Shea C \n" - "Gomes Jacqueline \n" - "Wood Thomas \n" - "Rosenfeld David \n" - "Conroy Martin \n" - "Skiadopoulou I \n" - "Rosenfeld Lawrence \n" - "Rosenfeld Emily", - }, - "4": { - "0": "Reason why \nno longer \nnominated*", - "1": "", - "2": "", - "3": "", - "4": "", - "5": "", - "6": "", - "7": "", - "8": "", - }, - }, - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_extract_command_current(self): - self.assertEqual(CamelotParsedSOPN.objects.count(), 0) - call_command("sopn_parsing_extract_tables", current=True) - self.assertEqual(CamelotParsedSOPN.objects.count(), 1) - - def test_extract_command_current_no_current_elections(self): - self.election.current = False - self.election.save() - self.assertEqual(CamelotParsedSOPN.objects.count(), 0) - call_command("sopn_parsing_extract_tables", current=True) - 
self.assertEqual(CamelotParsedSOPN.objects.count(), 0) diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py deleted file mode 100644 index 922c487dd4..0000000000 --- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py +++ /dev/null @@ -1,529 +0,0 @@ -import json -from pathlib import Path -from unittest import skipIf -from unittest.mock import patch - -from bulk_adding.models import RawPeople -from candidates.tests.uk_examples import UK2015ExamplesMixin -from django.core.management import call_command -from django.db import connection -from django.test import TestCase -from official_documents.models import BallotSOPN -from pandas import Index, Series -from parties.models import Party, PartyDescription -from parties.tests.factories import PartyFactory -from parties.tests.fixtures import DefaultPartyFixtures -from sopn_parsing.helpers import parse_tables -from sopn_parsing.models import CamelotParsedSOPN -from sopn_parsing.tests import should_skip_pdf_tests -from sopn_parsing.tests.data.welsh_sopn_data import welsh_sopn_data - -from ynr.apps.sopn_parsing.management.commands.sopn_parsing_parse_tables import ( - Command as ParseTablesCommand, -) - - -class TestSOPNHelpers(DefaultPartyFixtures, UK2015ExamplesMixin, TestCase): - def setUp(self): - PartyFactory(ec_id="PP85", name="UK Independence Party (UKIP)") - with connection.cursor() as cursor: - cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;") - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_basic_parsing(self): - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.dulwich_post_ballot, - source_url="example.com", - ) - dataframe = json.dumps( - { - "0": { - "0": "Name of \nCandidate", - "1": "BRADBURY \nAndrew John", - "2": "COLLINS \nDave", - "3": "HARVEY \nPeter John", - "4": "JENNER \nMelanie", - }, - "1": { - "0": "Home Address", - "1": "10 Fowey Close, \nShoreham by 
Sea, \nWest Sussex, \nBN43 5HE", - "2": "51 Old Fort Road, \nShoreham by Sea, \nBN43 5RL", - "3": "76 Harbour Way, \nShoreham by Sea, \nSussex, \nBN43 5HH", - "4": "9 Flag Square, \nShoreham by Sea, \nWest Sussex, \nBN43 5RZ", - }, - "2": { - "0": "Description (if \nany)", - "1": "Green Party", - "2": "Independent", - "3": "UK Independence \nParty (UKIP)", - "4": "Labour Party", - }, - "3": { - "0": "Name of \nProposer", - "1": "Tiffin Susan J", - "2": "Loader Jocelyn C", - "3": "Hearne James H", - "4": "O`Connor Lavinia", - }, - "4": { - "0": "Reason \nwhy no \nlonger \nnominated\n*", - "1": "", - "2": "", - "3": "", - "4": "", - }, - } - ) - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=dataframe, status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - raw_people.data, - [ - {"name": "Andrew John Bradbury", "party_id": "PP63"}, - {"name": "Dave Collins", "party_id": "ynmp-party:2"}, - {"name": "Peter John Harvey", "party_id": "PP85"}, - {"name": "Melanie Jenner", "party_id": "PP53"}, - ], - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_welsh_run_sopn(self): - """ - Test that if the ballot is welsh run and previous party affiliations - are included they are parsed - """ - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.senedd_ballot, - source_url="example.com", - ) - - plaid_cymru, _ = Party.objects.update_or_create( - ec_id="PP77", - legacy_slug="party:77", - defaults={ - "name": "Plaid Cymru - The Party of Wales", - "date_registered": "1999-01-14", - }, - ) - - dataframe = json.dumps(welsh_sopn_data) - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=dataframe, status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - 
raw_people.data, - [ - { - "name": "John Smith", - "party_id": self.conservative_party.ec_id, - "previous_party_affiliations": [self.ld_party.ec_id], - }, - { - "name": "Joe Bloggs", - "party_id": self.labour_party.ec_id, - "previous_party_affiliations": ["ynmp-party:2"], - }, - {"name": "Jon Doe", "party_id": self.ld_party.ec_id}, - { - "name": "Jane Brown", - "party_id": "ynmp-party:2", - "previous_party_affiliations": [plaid_cymru.ec_id], - }, - { - "name": "Judy Johnson", - "party_id": plaid_cymru.ec_id, - "previous_party_affiliations": [self.labour_party.ec_id], - }, - {"name": "Julie Williams", "party_id": "ynmp-party:2"}, - ], - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_match_complex_descriptions(self): - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.senedd_ballot, - source_url="example.com", - ) - - plaid_cymru, _ = Party.objects.update_or_create( - ec_id="PP77", - legacy_slug="party:77", - defaults={ - "name": "Plaid Cymru - The Party of Wales", - "date_registered": "1999-01-14", - }, - ) - - dickens_heath, _ = Party.objects.update_or_create( - ec_id="PP1", - legacy_slug="PP!", - defaults={ - "name": "Independent Dickens Heath Residents Action Group", - "date_registered": "1999-01-14", - }, - ) - PartyDescription.objects.create( - party=dickens_heath, - description="Independent Dickens Heath Residents Action Group", - ) - lib_dem, _ = Party.objects.update_or_create( - ec_id="PP100", - legacy_slug="PP100", - defaults={ - "name": "Liberal Democrats", - "date_registered": "1999-01-14", - }, - register="GB", - ) - - PartyDescription.objects.create( - party=lib_dem, - description="Liberal Democrat Focus Team | Tîm Ffocws y Democratiaid Rhyddfrydol", - ) - - data_path = ( - Path(__file__).parent / "data/edge_case_description_data.json" - ) - with data_path.open() as f: - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=f.read(), status="unparsed" - ) - 
call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - sorted(raw_people.data, key=lambda x: x["name"]), - sorted( - [ - { - "name": "John Smith", - "party_id": self.conservative_party.ec_id, - }, - { - "name": "Joe Bloggs", - "party_id": self.labour_party.ec_id, - }, - { - "name": "Jon Doe", - "party_id": self.ld_party.ec_id, - }, - { - "name": "Jane Brown", - "party_id": "ynmp-party:2", - }, - { - "name": "Judy Johnson", - "party_id": plaid_cymru.ec_id, - }, - {"name": "Julie Williams", "party_id": "ynmp-party:2"}, - ], - key=lambda x: x["name"], - ), - ) - - -class TestParseTablesUnitTests(UK2015ExamplesMixin, TestCase): - def get_two_name_field_cases(self): - # this could be updated with more combinations as we come across them - return [ - { - "name_fields": ["candidate surname", "candidate forename"], - "row": { - "candidate surname": "BAGSHAW", - "candidate forename": "Elaine Sheila", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": [ - "candidate forename", - "candidate surname", - ], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["surname", "other names"], - "row": { - "surname": "BAGSHAW", - "other names": "Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": ["other names", "surname"], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["last name", "other names"], - "row": { - "last name": "BAGSHAW", - "other names": "Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": ["other names", "last name"], - "expected_name": "Elaine Sheila 
Bagshaw", - }, - { - "name_fields": ["candidate forename", "candidate surname"], - "row": { - "candidate forename": "Elaine Sheila", - "candidate surname": "BAGSHAW", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": [ - "candidate forename", - "candidate surname", - ], - "expected_name": "Elaine Sheila Bagshaw", - }, - ] - - def get_single_name_field_cases(self): - return [ - { - "name_fields": ["name of candidate"], - "row": { - "name of candidate": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["names of candidate"], - "row": { - "names of candidate": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["candidate name"], - "row": { - "candidate name": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["surname"], - "row": { - "surname": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["candidates surname"], - "row": { - "candidates surname": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["other name"], - "row": { - "other name": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - ] - - def 
test_get_name_single_field(self): - for case in self.get_single_name_field_cases(): - row = Series(case["row"]) - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - assert len(case["name_fields"]) == 1 - name = parse_tables.get_name(row=row, name_fields=name_fields) - assert name == "Elaine Sheila Bagshaw" - - def test_get_name_two_fields(self): - for case in self.get_two_name_field_cases(): - row = Series(case["row"]) - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - assert len(case["name_fields"]) == 2 - name = parse_tables.get_name(row=row, name_fields=name_fields) - assert name == case["expected_name"] - - def test_get_name_fields_single(self): - for case in self.get_single_name_field_cases(): - row = Index(case["row"]) - with self.subTest(row=row): - name_fields = parse_tables.get_name_fields(row=row) - assert len(name_fields) == 1 - assert name_fields == case["name_fields"] - - def test_get_name_fields_two(self): - for case in self.get_two_name_field_cases(): - row = Index(case["row"]) - with self.subTest(row=row): - name_fields = parse_tables.get_name_fields(row=row) - assert len(name_fields) == 2 - assert name_fields == case["name_fields"] - - def test_get_name_fields_raises_error(self): - row = Index({"foo": "Bar"}) - with self.assertRaises(ValueError): - parse_tables.get_name_fields(row=row) - - def test_order_name_fields(self): - for case in self.get_two_name_field_cases(): - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - result = parse_tables.order_name_fields(name_fields) - assert result == case["ordered_name_fields"] - - def test_clean_name_replaces_backticks(self): - name = parse_tables.clean_name("D`SOUZA") - assert "`" not in name - assert "'" in name - - def test_clean_name_replaces_newlines(self): - name = parse_tables.clean_name( - "A Very Long Name That Splits \nOver Lines" - ) - assert "\n" not in name - - def 
test_clean_name_capitalized_last_and_titalized(self): - name = parse_tables.clean_name("SMITH John") - assert name == "John Smith" - - def test_clean_last_names(self): - name = parse_tables.clean_last_names(["MACDONALD", "John"]) - assert name == "MacDonald" - - def test_clean_name_two_word_surnames(self): - names = [ - ("EDE COOPER \nPalmer", "Palmer Ede Cooper"), - ("VAN DULKEN \nRichard Michael", "Richard Michael Van Dulken"), - ("ARMSTRONG LILLEY \nLynne", "Lynne Armstrong Lilley"), - ( - " D`SOUZA Aaron Anthony Jose \nHasan", - "Aaron Anthony Jose Hasan D'Souza", - ), - ("Michael James Collins", "Michael James Collins"), - (" Michael James Collins ", "Michael James Collins"), - ("DAVE Nitesh Pravin", "Nitesh Pravin Dave"), - ("DAVE\nNitesh Pravin", "Nitesh Pravin Dave"), - ("COOKE Anne-Marie", "Anne-Marie Cooke"), - ("COOKE\nAnne-Marie", "Anne-Marie Cooke"), - ("BROOKES-\nDUNCAN\nKaty", "Katy Brookes-Duncan"), - ("HOUNSOME\nJohn", "John Hounsome"), - ("O`CONNELL \nStephen John", "Stephen John O'Connell"), - ("O`NEAL \nCarol Joy", "Carol Joy O'Neal"), - ("O`REILLY \nTracey Linda \nDiane", "Tracey Linda Diane O'Reilly"), - ("LIAM THOMAS O'ROURKE", "Liam Thomas O'Rourke"), - ("O'CALLAGHAN \nClaire Louise", "Claire Louise O'Callaghan"), - ] - for name in names: - with self.subTest(name=names[0]): - assert parse_tables.clean_name(name[0]) == name[1] - - def test_clean_description_removes_newlines(self): - cleaned_description = parse_tables.clean_description( - "A Long Description That Splits \nOver \\nLines" - ) - assert "\n" not in cleaned_description - assert "\\n" not in cleaned_description - - def test_clean_description_replaces_backticks(self): - cleaned_description = parse_tables.clean_description( - "All People`s Party" - ) - assert "`" not in cleaned_description - assert "'" in cleaned_description - assert cleaned_description == "All People's Party" - - def test_guess_previous_party_affiliations_field(self): - sopn = 
CamelotParsedSOPN(raw_data=json.dumps(welsh_sopn_data)) - data = sopn.as_pandas - data.columns = data.iloc[0] - - cases = [ - (self.dulwich_post_ballot, None), - (self.senedd_ballot, "statement of party membership"), - ] - for case in cases: - with self.subTest(msg=case[0]): - sopn.sopn = BallotSOPN(ballot=case[0]) - result = parse_tables.guess_previous_party_affiliations_field( - data=data, sopn=sopn - ) - assert result == case[1] - - def test_add_previous_party_affiliations(self): - cases = [ - {"party_str": "", "party": None, "expected": {}}, - {"party_str": "Unknown Party", "party": None, "expected": {}}, - { - "party_str": "Labour Party", - "party": self.labour_party, - "expected": { - "previous_party_affiliations": [self.labour_party.ec_id] - }, - }, - ] - for case in cases: - with self.subTest(msg=case["party_str"]), patch.object( - parse_tables, "get_party", return_value=case["party"] - ): - raw_data = {} - sopn = CamelotParsedSOPN() - result = parse_tables.add_previous_party_affiliations( - party_str=case["party_str"], - raw_data=raw_data, - sopn=sopn, - ) - assert result == case["expected"] - - -class TestParseTablesFilterKwargs(TestCase): - def setUp(self): - self.command = ParseTablesCommand() - self.default_filter_kwargs = {} - - def test_when_testing(self): - options = {"testing": True} - result = self.command.build_filter_kwargs(options) - self.assertEqual(result, self.default_filter_kwargs) - - def test_when_using_ballot(self): - options = {"ballot": "local.foo.bar.2021-05-06"} - result = self.command.build_filter_kwargs(options) - self.assertEqual(result, self.default_filter_kwargs) - - def test_when_using_reparse(self): - options = {"reparse": True} - result = self.command.build_filter_kwargs(options) - expected = self.default_filter_kwargs.copy() - expected["rawpeople__source_type"] = RawPeople.SOURCE_PARSED_PDF - self.assertEqual(result, expected) - - def test_when_no_options(self): - options = {} - result = 
self.command.build_filter_kwargs(options) - expected = self.default_filter_kwargs.copy() - self.assertEqual(result, expected)