From 0a5dac7a557afd265c8bdd77f8a41174f756fd4f Mon Sep 17 00:00:00 2001 From: symroe Date: Sat, 15 Nov 2025 10:17:56 +0000 Subject: [PATCH 1/2] Remove SOPN tooling This was used to capture baselines for Camelot. We no longer need this code --- .gitignore | 2 - Makefile | 45 ---- .../sopn_tooling_clear_existing_objects.py | 27 -- .../sopn_tooling_compare_raw_people.py | 237 ------------------ .../sopn_tooling_create_official_documents.py | 124 --------- .../commands/sopn_tooling_write_baseline.py | 51 ---- 6 files changed, 486 deletions(-) delete mode 100644 Makefile delete mode 100644 ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py delete mode 100644 ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py delete mode 100644 ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py delete mode 100644 ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py diff --git a/.gitignore b/.gitignore index d08c2cda36..38c331c5a1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,8 +24,6 @@ test-results node_modules/ .vscode/ /test-env -/ynr/apps/sopn_parsing/tests/data/sopn_baseline.json -/ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json # PyCharm .idea/ diff --git a/Makefile b/Makefile deleted file mode 100644 index f8d2865666..0000000000 --- a/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -export DJANGO_SETTINGS_MODULE?=ynr.settings.sopn_testing - - -.PHONY: sopn-runserver -sopn-runserver: - python manage.py runserver - -.PHONY: sopn-shell -sopn-shell: - python manage.py shell_plus - -.PHONY: migrate-db -migrate-db: - python manage.py migrate - -.PHONY: test-sopns -test-sopns: migrate-db - python manage.py sopn_tooling_compare_raw_people --election-slugs= --ballot= --date 2021-05-06 - -.PHONY: download-sopns -download-sopns: - python manage.py migrate --no-input - python manage.py sopn_tooling_create_official_documents --election-slugs= --date 2021-05-06 - -.PHONY: 
populate-sopn-testing-database -populate-sopn-testing-database: migrate-db - python manage.py candidates_import_from_live_site - -.PHONY: delete-test-sopns -delete-test-sopns: - python manage.py sopn_tooling_clear_existing_objects - rm -rf ./ynr/media/sopn_testing/ - -.PHONY: create-baseline-file -create-baseline-file: - python manage.py sopn_tooling_write_baseline - -.PHONY: copy-baseline-file -copy-baseline-file: - cp ynr/apps/sopn_parsing/tests/data/sopn_baseline.json ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json - -.PHONY: prod-import-sopns -prod-import-sopns: - cd deploy; \ - ansible-playbook import_sopns.yml diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py deleted file mode 100644 index cb68ffdf02..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py +++ /dev/null @@ -1,27 +0,0 @@ -from bulk_adding.models import RawPeople -from django.conf import settings -from django.core.management.base import BaseCommand -from official_documents.models import OfficialDocument - - -class Command(BaseCommand): - """ - Used to quickly delete existing objects used when testing SOPN - parsing so that you can start fresh for example, when you want - to start testing a new set of SOPNs. 
- """ - - def print_deleted(self, deleted_dict): - for object, count in deleted_dict.items(): - self.stdout.write(f"Deleted {count} {object}") - - def handle(self, *args, **options): - if settings.SETTINGS_MODULE != "ynr.settings.sopn_testing": - raise ValueError( - "You are trying to run this command outside of SOPN testing environment" - ) - - deleted_dict = {} - deleted_dict.update(OfficialDocument.objects.all().delete()[1]) - deleted_dict.update(RawPeople.objects.all().delete()[1]) - self.print_deleted(deleted_dict) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py deleted file mode 100644 index dbe5eb913a..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py +++ /dev/null @@ -1,237 +0,0 @@ -import json -import os -from collections import Counter - -from bulk_adding.models import RawPeople -from candidates.models import Ballot -from django.core.management import call_command -from official_documents.models import OfficialDocument -from popolo.models import Membership -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.models import CamelotParsedSOPN - - -class Command(BaseSOPNParsingCommand): - CORRECT_EXACTLY = "correct_exactly" - NUM_CORRECT_MISSING_PARTIES = "num_correct_some_parties_missing" - NUM_INCORRECT = "num_incorrect" - ZERO_CANDIDATES = "zero_candidates" - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument("--loud", action="store_true", default=False) - - def handle(self, *args, **options): - """ - - Check we have a baseline file to compare with - - Prepare some OfficialDocuments - - Re-parse the documents - - Loop through the created RawPeople objects, comparing to our baseline - to make sure that we are parsing at least as many people as before - - If no asserts failed, use the data to write a new baseline file - 
""" - - self.loud = options.pop("loud") - - self.candidates_results = { - "correct_exactly": [], - "num_correct_some_parties_missing": [], - "num_incorrect": [], - "zero_candidates": [], - } - - raw_people_file = "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json" - if not os.path.isfile(raw_people_file): - call_command("sopn_tooling_write_baseline") - self.stdout.write("Baseline file didn't exist so one was created") - - options.update({"testing": True}) - - OfficialDocument.objects.update(relevant_pages="") - call_command("sopn_parsing_extract_page_numbers", *args, **options) - CamelotParsedSOPN.objects.all().delete() - call_command("sopn_parsing_extract_tables", *args, **options) - RawPeople.objects.all().delete() - call_command("sopn_parsing_parse_tables", *args, **options) - - with open(raw_people_file) as file: - old_raw_people = json.loads(file.read()) - - self.new_raw_people = {} - for ballot in Ballot.objects.exclude(officialdocument__isnull=True): - ballot_data = old_raw_people.get(ballot.ballot_paper_id, {}) - - self.compare_relevant_pages(ballot=ballot, ballot_data=ballot_data) - - self.compare_raw_people(ballot=ballot, ballot_data=ballot_data) - - # display some overall totals - self.stdout.write( - "Old total 'people' parsed WAS {old}\n" - "New total 'people' parsed IS {new}".format( - old=self.count_people_parsed(old_raw_people), - new=self.count_people_parsed(self.new_raw_people), - ) - ) - - old_raw_people_obj_count = len( - {k: v for k, v in old_raw_people.items() if v["raw_people"]} - ) - new_raw_people_obj_count = RawPeople.objects.count() - style = self.style.SUCCESS - if new_raw_people_obj_count < old_raw_people_obj_count: - style = self.style.ERROR - self.stdout.write( - style( - f"Old RawPeople count: {old_raw_people_obj_count}\n" - f"New total RawPeople count: {new_raw_people_obj_count}" - ) - ) - - for result, ballots in self.candidates_results.items(): - total = len(ballots) - self.stdout.write(f"{total} ballots parsed {result}") - # 
Write a new baseline - call_command("sopn_tooling_write_baseline") - - def compare_relevant_pages(self, ballot, ballot_data): - old_relevant_pages = ballot_data.get("relevant_pages", "") - new_relevant_pages = ballot.sopn.relevant_pages - - if old_relevant_pages != new_relevant_pages: - self.stdout.write( - self.style.WARNING( - f"RELEVANT PAGES CHANGED FROM {old_relevant_pages} to {new_relevant_pages} for {ballot.ballot_paper_id}" - ) - ) - - def compare_raw_people(self, ballot, ballot_data): - try: - raw_people = ballot.rawpeople.data - except RawPeople.DoesNotExist: - raw_people = [] - - old_raw_people_for_ballot = ballot_data.get("raw_people", []) - old_count = len(old_raw_people_for_ballot) - new_count = len(raw_people) - if new_count < old_count: - self.stderr.write( - f"Uh oh, parsed people for {ballot.ballot_paper_id} decreased from {old_count} to {new_count}. Stopping." - ) - - if new_count > old_count: - self.stdout.write( - f"{ballot.ballot_paper_id} increased from {old_count} to {new_count} parsed people.\n" - f"Check the SOPN at https://candidates.democracyclub.org.uk{ballot.get_sopn_url()}." - ) - for person in raw_people: - if person not in old_raw_people_for_ballot: - self.stdout.write(self.style.SUCCESS(person)) - - # when people parsed have changed e.g. different name/different party print it for further checking - changed_people = [ - person - for person in old_raw_people_for_ballot - if person not in raw_people - ] - if changed_people: - self.stdout.write( - self.style.WARNING( - f"Parsed data changed for {ballot.ballot_paper_id}\n" - f"New raw people data:\n" - f"{raw_people}\n" - "Missing people:" - ) - ) - for person in changed_people: - self.stderr.write(str(person)) - - self.new_raw_people[ballot.ballot_paper_id] = {"raw_people": raw_people} - - self.parties_correct(ballot, raw_people) - - def count_people_parsed(self, raw_people_data): - """ - Returns the total number of "people" that were parsed. 
- NB that just because something was parsed, it doesnt mean that it was - accurately parsed. Therefore this total is best used to look for large - changes that should then be checked in detail. - """ - return sum( - [len(data["raw_people"]) for data in raw_people_data.values()] - ) - - def parties_correct(self, ballot, raw_people_for_ballot): - candidates = Membership.objects.filter(ballot=ballot) - if not candidates: - self.stdout.write( - self.style.WARNING( - f"We dont have candidates for {ballot.ballot_paper_id}. Try updating with the live site first?" - ) - ) - - if not raw_people_for_ballot: - self.candidates_results[self.ZERO_CANDIDATES].append( - ballot.ballot_paper_id - ) - return None - - num_candidates_correct = candidates.count() == len( - raw_people_for_ballot - ) - - if self.loud: - if num_candidates_correct: - self.stdout.write( - self.style.SUCCESS( - f"Correct number of people parsed as expected for {ballot.ballot_paper_id}" - ) - ) - else: - self.stdout.write( - self.style.ERROR( - f"Incorrect number of people parsed for {ballot.ballot_paper_id}" - ) - ) - - parsed = sorted( - [person["party_id"] for person in raw_people_for_ballot] - ) - expected = list( - candidates.values_list("party__ec_id", flat=True).order_by( - "party__ec_id" - ) - ) - - if parsed == expected: - return self.candidates_results[self.CORRECT_EXACTLY].append( - ballot.ballot_paper_id - ) - - # count number of each missing party ID as there could be more than one - # missing candidate for a party e.g. 
1 missing Green, 2 missing independents - parsed = Counter(parsed) - expected = Counter(expected) - missing = expected - parsed - if missing: - total = sum(missing.values()) - self.stderr.write( - f"{total} MISSING parties for {ballot.ballot_paper_id} (party_id:num_missing)\n{missing}" - ) - else: - # sometimes we incorrectly parse extra people - often independents - # due to an empty row - extras = parsed - expected - total = sum(extras.values()) - self.stderr.write( - f"{total} EXTRA parties for {ballot.ballot_paper_id}\n{extras}" - ) - - if num_candidates_correct: - return self.candidates_results[ - self.NUM_CORRECT_MISSING_PARTIES - ].append(ballot.ballot_paper_id) - - return self.candidates_results[self.NUM_INCORRECT].append( - ballot.ballot_paper_id - ) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py deleted file mode 100644 index e7c3f3e1b2..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py +++ /dev/null @@ -1,124 +0,0 @@ -import requests -from candidates.models import Ballot -from django.conf import settings -from django.core.files.base import ContentFile -from django.core.management.base import BaseCommand -from elections.models import Election -from official_documents.models import OfficialDocument - - -class Command(BaseCommand): - """This command uses the ballots endpoint to loop over each - ballot and store each sopn pdf (uploaded_file) locally""" - - def add_arguments(self, parser): - parser.add_argument( - "--date", - "-d", - action="store", - help="Election date in ISO format, defaults to 2021-05-06", - default="2021-05-06", - type=str, - ) - parser.add_argument( - "--site_url", - "-u", - action="store", - help="URL of site to download from", - default="https://candidates.democracyclub.org.uk/", - type=str, - ) - parser.add_argument( - "--election-count", - "-c", - 
action="store", - help="URL of site to download from", - default=50, - type=int, - ) - parser.add_argument( - "--election-slugs", "-s", action="store", required=False - ) - - def handle(self, *args, **options): - site_url = options.get("site_url") - election_date = options.get("date") - election_count = options.get("election_count") - - if options["election_slugs"]: - election_slugs = options["election_slugs"].split(",") - else: - election_slugs = Election.objects.filter( - election_date=election_date - ).values_list("slug", flat=True)[:election_count] - - for slug in election_slugs: - url = f"{site_url}api/next/ballots/?has_sopn=1&page_size=200&election_id={slug}&auth_token={settings.YNR_API_KEY}" - self.create_official_documents(url=url) - - def create_official_documents(self, url): - data = requests.get(url=url).json() - try: - next_page = data["next"] - except KeyError: - next_page = None - if "results" in data: - for ballot_data in data["results"]: - ballot = Ballot.objects.get( - ballot_paper_id=ballot_data["ballot_paper_id"] - ) - sopn_data = ballot_data["sopn"] - - # if we already have the SOPN no need to recreate - if ballot.officialdocument_set.filter( - source_url=sopn_data["source_url"] - ).exists(): - self.stdout.write( - f"SOPN already exists for {ballot.ballot_paper_id}" - ) - continue - - # check if we already have an OfficialDocument with this source - # downloaded - official_document = OfficialDocument.objects.filter( - source_url=sopn_data["source_url"] - ).first() - if official_document: - # if so we dont need to redownload the file, we can create a new - # object for this ballot with the same file - self.stdout.write( - f"Found SOPN for source {sopn_data['source_url']}" - ) - OfficialDocument.objects.create( - ballot=ballot, - source_url=sopn_data["source_url"], - uploaded_file=official_document.uploaded_file, - document_type=OfficialDocument.NOMINATION_PAPER, - ) - continue - - # otherwise we dont have this file stored already, so download it 
as - # part of creating the OfficialDocument - self.stdout.write( - f"Downloading SOPN from {sopn_data['uploaded_file']}" - ) - file_response = requests.get(sopn_data["uploaded_file"]) - file_object = ContentFile(content=file_response.content) - official_document = OfficialDocument( - ballot=ballot, - source_url=sopn_data["source_url"], - document_type=OfficialDocument.NOMINATION_PAPER, - ) - file_extension = sopn_data["uploaded_file"].split(".")[-1] - filename = f"{ballot.ballot_paper_id}.{file_extension}" - official_document.uploaded_file.save( - name=filename, content=file_object - ) - else: - self.stdout.write("No results found") - - # this should only be the case where the election object has > 200 - # ballots e.g. parliamentary elections - if next_page: - return self.create_official_documents(url=next_page) - return None diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py deleted file mode 100644 index 07ae9309cd..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py +++ /dev/null @@ -1,51 +0,0 @@ -import json -import os - -from bulk_adding.models import RawPeople -from candidates.models import Ballot -from django.core.management.base import BaseCommand -from django.db.models import Q - - -class Command(BaseCommand): - """ - Creates a JSON file to represent ballots that have an Officialdocument. - Only include ballots where: - - The source of the RawPeople is from parsing a PDF - - No RawPeople were created from the OfficialDocument. 
This is so that we - will know if we make make improvements that mean more RawPeople are parsed - from an OfficialDocument - """ - - def add_arguments(self, parser): - parser.add_argument( - "--data", - action="store", - help="Dictionary of raw people to write as a baseline", - ) - - def handle(self, *args, **options): - json_data = options["data"] or {} - - if not json_data: - qs = Ballot.objects.exclude(officialdocument__isnull=True).filter( - Q(rawpeople__source_type=RawPeople.SOURCE_PARSED_PDF) - | Q(rawpeople__isnull=True) - ) - for ballot in qs: - raw_people = getattr(ballot, "rawpeople", []) - try: - raw_people = ballot.rawpeople.data - except RawPeople.DoesNotExist: - raw_people = [] - - json_data[ballot.ballot_paper_id] = { - "raw_people": raw_people, - "relevant_pages": ballot.sopn.relevant_pages, - } - - file_path = os.path.join( - os.getcwd(), "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json" - ) - with open(file_path, "w") as f: - f.write(json.dumps(json_data)) From 02890ce0192c1a4410d6086403bd80e957af2d93 Mon Sep 17 00:00:00 2001 From: symroe Date: Sat, 15 Nov 2025 11:25:05 +0000 Subject: [PATCH 2/2] Remove a load of Camelot code This is a bit of a tangle to remove cleanly. I think I've removed too much, especially some of the tests that really should be converted to use AWS Textract rather than just removing them. The plan is to built these up again with a bit of a rethink / redesign of the whole system. Maintaining these tests while refactoring wouldn't be a good idea, so I suggest revisiting them later. 
--- ynr/apps/bulk_adding/tests/test_bulk_add.py | 76 --- ynr/apps/bulk_adding/views/sopns.py | 4 - .../elections/includes/_sopn_debug.html | 16 - ynr/apps/official_documents/models.py | 7 - .../official_documents/tests/test_upload.py | 25 +- .../sopn_parsing/helpers/extract_tables.py | 63 --- ynr/apps/sopn_parsing/helpers/parse_tables.py | 21 +- .../commands/sopn_parsing_extract_tables.py | 29 - .../commands/sopn_parsing_parse_tables.py | 67 --- .../commands/sopn_parsing_process_unparsed.py | 27 +- .../sopn_parsing/tests/test_extract_tables.py | 154 ----- .../sopn_parsing/tests/test_parse_tables.py | 529 ------------------ 12 files changed, 9 insertions(+), 1009 deletions(-) delete mode 100644 ynr/apps/sopn_parsing/helpers/extract_tables.py delete mode 100644 ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py delete mode 100644 ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py delete mode 100644 ynr/apps/sopn_parsing/tests/test_parse_tables.py diff --git a/ynr/apps/bulk_adding/tests/test_bulk_add.py b/ynr/apps/bulk_adding/tests/test_bulk_add.py index 5361cda441..e18589768a 100644 --- a/ynr/apps/bulk_adding/tests/test_bulk_add.py +++ b/ynr/apps/bulk_adding/tests/test_bulk_add.py @@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self): self.assertContains(resp, "Review candidates") resp = form.submit() self.assertContains(resp, "Bart Simpson") - - def test_fall_back_to_camelot_if_no_textract(self): - data = {"name": "Bart", "party_id": "PP52"} - - raw_people = RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[data], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - - self.assertEqual( - raw_people.as_form_kwargs(), - { - "initial": [ - { - "name": "Bart", - "party": ["PP52", "PP52"], - "previous_party_affiliations": [], - "source": "", - } - ] - }, - ) - raw_people.delete() - - textract_data = {"name": "Lisa", "party_id": "PP53"} - raw_people = RawPeople.objects.create( - 
ballot=self.dulwich_post_ballot, - data=[data], - textract_data=[textract_data], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - - self.assertEqual( - raw_people.as_form_kwargs(), - { - "initial": [ - { - "name": "Lisa", - "party": ["PP53", "PP53"], - "previous_party_affiliations": [], - "source": "", - } - ] - }, - ) - - def test_can_change_parser_in_frontend(self): - """ - Check that a query param can change the parser we use - """ - BallotSOPN.objects.create( - source_url="http://example.com", - ballot=self.dulwich_post_ballot, - uploaded_file="sopn.pdf", - ) - RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[{"name": "Bart", "party_id": "PP52"}], - textract_data=[{"name": "Lisa", "party_id": "PP53"}], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - response = self.app.get( - "/bulk_adding/sopn/parl.65808.2015-05-07/", user=self.user - ) - form = response.forms["bulk_add_form"] - # This should be the Textract data - self.assertEqual(form.fields["form-0-name"][0].value, "Lisa") - - response = self.app.get( - "/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1", - user=self.user, - ) - form = response.forms["bulk_add_form"] - # This should be the Textract data - self.assertEqual(form.fields["form-0-name"][0].value, "Bart") diff --git a/ynr/apps/bulk_adding/views/sopns.py b/ynr/apps/bulk_adding/views/sopns.py index e0003e9c9e..6564db69a5 100644 --- a/ynr/apps/bulk_adding/views/sopns.py +++ b/ynr/apps/bulk_adding/views/sopns.py @@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs): return super().get(request, *args, **kwargs) def get_active_parser(self) -> Optional[SOPNParsingBackends]: - if self.request.GET.get("v1_parser"): - return SOPNParsingBackends.CAMELOT if self.ballot.rawpeople.textract_data: return SOPNParsingBackends.TEXTRACT - if self.ballot.rawpeople.data: - return SOPNParsingBackends.CAMELOT return None def get_context_data(self, **kwargs): diff --git a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html 
b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html index c61ab936ce..620857b058 100644 --- a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html +++ b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html @@ -7,28 +7,12 @@

Parsing Status

-

Camelot raw Data

- {% if object.sopn.camelotparsedsopn.raw_data %} -
{{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}
- {% else %} - N/A - {% endif %} - -

Camelot table Data

- {% if object.sopn.camelotparsedsopn.data_as_html %} - {{ object.sopn.camelotparsedsopn.data_as_html|safe }} - {% else %} - N/A - {% endif %} -
- {% if textract_parsed and textract_parsed.as_textractor_document %}

AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}

diff --git a/ynr/apps/official_documents/models.py b/ynr/apps/official_documents/models.py index 15de317657..55ca3e7d1d 100644 --- a/ynr/apps/official_documents/models.py +++ b/ynr/apps/official_documents/models.py @@ -260,7 +260,6 @@ def parse(self): """ - from sopn_parsing.helpers.extract_tables import extract_ballot_table from sopn_parsing.helpers.textract_helpers import ( NotUsingAWSException, TextractSOPNHelper, @@ -276,12 +275,6 @@ def parse(self): # There's a cron job that should pick up the result and carry on parsing later. textract_helper.start_detection() - if getattr( - settings, "CAMELOT_ENABLED", False - ) and self.uploaded_file.name.endswith(".pdf"): - # Camelot - extract_ballot_table(self.ballot) - class BallotSOPNHistory(BaseBallotSOPN): ballot = models.ForeignKey( diff --git a/ynr/apps/official_documents/tests/test_upload.py b/ynr/apps/official_documents/tests/test_upload.py index 03423f9ce7..443cc97fc6 100644 --- a/ynr/apps/official_documents/tests/test_upload.py +++ b/ynr/apps/official_documents/tests/test_upload.py @@ -114,20 +114,9 @@ def test_upload_authorized(self): with open(self.example_image_filename, "rb") as f: form["uploaded_file"] = Upload("pilot.jpg", f.read()) - # TODO: Add back in - # with patch( - # "official_documents.views.extract_pages_for_ballot" - # ) as extract_pages, patch( - # "official_documents.views.extract_ballot_table" - # ) as extract_tables, patch( - # "official_documents.views.parse_raw_data_for_ballot" - # ) as parse_tables: + response = form.submit() self.assertEqual(response.status_code, 302) - # TODO: Add back in - # extract_pages.assert_called_once() - # extract_tables.assert_called_once() - # parse_tables.assert_called_once() ballot_sopns = BallotSOPN.objects.all() self.assertEqual(ballot_sopns.count(), 1) @@ -181,20 +170,8 @@ def test_docx_upload_form_validation(self): with open(self.example_docx_filename, "rb") as f: form["uploaded_file"] = Upload("pilot.docx", f.read()) - # TODO: add back in - # with 
patch( - # "official_documents.views.extract_pages_for_ballot" - # ) as extract_pages, patch( - # "official_documents.views.extract_ballot_table" - # ) as extract_tables, patch( - # "official_documents.views.parse_raw_data_for_ballot" - # ) as parse_tables: response = form.submit() self.assertEqual(response.status_code, 302) - # TODO Add back in - # extract_pages.assert_called_once() - # extract_tables.assert_called_once() - # parse_tables.assert_called_once() self.assertEqual(BallotSOPN.objects.count(), 1) self.assertEqual(response.location, self.ballot.get_sopn_url()) diff --git a/ynr/apps/sopn_parsing/helpers/extract_tables.py b/ynr/apps/sopn_parsing/helpers/extract_tables.py deleted file mode 100644 index 0b610c7847..0000000000 --- a/ynr/apps/sopn_parsing/helpers/extract_tables.py +++ /dev/null @@ -1,63 +0,0 @@ -import json - -import pandas as pd -from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text -from sopn_parsing.models import CamelotParsedSOPN - - -def extract_ballot_table(ballot, parse_flavor="lattice"): - """ - Given a OfficialDocument model, update or create a CamelotParsedSOPN model with the - contents of the table as a JSON string. - - :type ballot: candidates.models.Ballot - - """ - import camelot # import here to avoid import error running tests without pdf deps installed - - document = ballot.sopn - try: - tables = camelot.read_pdf( - document.uploaded_file.path, - pages="all", - flavor=parse_flavor, - ) - except (NotImplementedError, AttributeError): - # * NotImplementedError is thrown if the PDF is an image or generally - # unreadable. - # * AttributeError is thrown on some PDFs saying they need a password. 
- # Assume this is a bug in camelot, and ignore these PDFs - raise NoTextInDocumentError() - - # Tables can span pages, camelot assumes they're different tables, so we - # need to join them back together - table_list = [] - for table in tables: - table_list.append(table) - table_list.sort(key=lambda t: (t.page, t.order)) - - if not table_list: - return None - - table_data = table_list.pop(0).df - - for table in table_list: - # It's possible to have the "situation of poll" document on the SOPN - # Ignore any table that contains "polling station" (SOPNs tables don't) - table = table.df - first_row = table.iloc[0].to_string() - - if "polling station" in clean_text(first_row): - break - # Append the continuation table to the first one in the document. - # ignore_index is needed so the e.g table 2 row 1 doesn't replace - # table 1 row 1 - table_data = pd.concat([table_data, table], ignore_index=True) - - if not table_data.empty: - parsed, _ = CamelotParsedSOPN.objects.update_or_create( - sopn=document, - defaults={"raw_data": json.dumps(table_data.to_dict())}, - ) - return parsed - return None diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py index 247d0e7e31..e8152b0816 100644 --- a/ynr/apps/sopn_parsing/helpers/parse_tables.py +++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py @@ -476,20 +476,12 @@ def parse_dataframe(ballot: Ballot, df: DataFrame): def parse_raw_data(ballot: Ballot, reparse=False): """ - Given a Ballot, go and get the Camelot and the AWS Textract dataframes + Given a Ballot, go and get the AWS Textract dataframes and process them """ - camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None) - camelot_data = {} textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None) textract_data = {} - if ( - camelot_model - and camelot_model.raw_data_type == "pandas" - and (reparse or not camelot_model.parsed_data) - ): - camelot_data = parse_dataframe(ballot, camelot_model.as_pandas) if 
( textract_model and textract_model.raw_data @@ -500,7 +492,7 @@ def parse_raw_data(ballot: Ballot, reparse=False): textract_model.parse_raw_data() textract_data = parse_dataframe(ballot, textract_model.as_pandas) - if camelot_data or textract_data: + if textract_data: # Check there isn't a rawpeople object from another (better) source rawpeople_qs = RawPeople.objects.filter(ballot=ballot).exclude( source_type=RawPeople.SOURCE_PARSED_PDF @@ -510,7 +502,7 @@ def parse_raw_data(ballot: Ballot, reparse=False): RawPeople.objects.update_or_create( ballot=ballot, defaults={ - "data": camelot_data or "", + "data": "", "textract_data": textract_data or "", "source": "Parsed from {}".format( ballot.sopn.source_url @@ -525,17 +517,10 @@ def parse_raw_data(ballot: Ballot, reparse=False): return # We've done the parsing, so let's still save the result storage = DefaultStorage() - storage.save( - f"raw_people/camelot_{ballot.ballot_paper_id}.json", - ContentFile(json.dumps(camelot_data, indent=4).encode("utf8")), - ) storage.save( f"raw_people/textract_{ballot.ballot_paper_id}.json", ContentFile(json.dumps(textract_data, indent=4).encode("utf8")), ) - if camelot_model: - ballot.sopn.camelotparsedsopn.status = "parsed" - ballot.sopn.camelotparsedsopn.save() if textract_model: ballot.sopn.awstextractparsedsopn.status = "parsed" ballot.sopn.awstextractparsedsopn.save() diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py deleted file mode 100644 index 3a4e091290..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py +++ /dev/null @@ -1,29 +0,0 @@ -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.helpers.extract_tables import extract_ballot_table -from sopn_parsing.helpers.text_helpers import NoTextInDocumentError - - -class Command(BaseSOPNParsingCommand): - help = """ - Parse tables out of PDFs in 
to CamelotParsedSOPN models for later parsing. - """ - - def handle(self, *args, **options): - qs = self.get_queryset(options) - filter_kwargs = {} - if not options["ballot"] and not options["testing"]: - if not options["reparse"]: - filter_kwargs["sopn__camelotparsedsopn"] = None - - qs = qs.filter(**filter_kwargs) - for ballot in qs: - try: - extract_ballot_table(ballot) - except NoTextInDocumentError: - self.stdout.write( - f"{ballot} raised a NoTextInDocumentError trying to extract tables" - ) - except ValueError: - self.stdout.write( - f"{ballot} raised a ValueError trying extract tables" - ) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py deleted file mode 100644 index 26448b697f..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py +++ /dev/null @@ -1,67 +0,0 @@ -from bulk_adding.models import RawPeople -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot - - -class Command(BaseSOPNParsingCommand): - help = """ - Convert the raw extracted tables on the CamelotParsedSOPN model to a parsed - RawPeople model, and set the status as parsed. 
- - """ - - def build_filter_kwargs(self, options): - """ - Build kwargs used to filter the BallotQuerySet that is parsed - - Always skip any ballots where we do not have a CamelotParsedSOPN to try to - extract candidates from - - When test flag is used, dont make any changes - - When parsing a single ballot, dont make any changes - - When reparsing, only use ballots where we have previously created a - RawPeople object from a CamelotParsedSOPN - - Otherwise filter by unparsed CamelotParsedSOPN objects - """ - # Always skip any ballots where we do not have a CamelotParsedSOPN to try to - # extract candidates from - filter_kwargs = {} - if options.get("testing"): - return filter_kwargs - - if options.get("ballot"): - return filter_kwargs - - if options.get("reparse"): - filter_kwargs[ - "rawpeople__source_type" - ] = RawPeople.SOURCE_PARSED_PDF - return filter_kwargs - - return filter_kwargs - - def handle(self, *args, **options): - # filters that we never change with args. These two would raise - # ValueErrors in the parse_raw_data_for_ballot function - base_qs = self.get_queryset(options) - filter_kwargs = self.build_filter_kwargs(options) - - qs = base_qs.filter(**filter_kwargs) - qs = qs.filter( - candidates_locked=False, # Never parse a locked ballot - suggestedpostlock=None, # Never parse a ballot with lock suggestions - ) - - if not qs.exists(): - msg = ["No ballots to parse found."] - - if options.get("ballot"): - msg.append( - "This ballot might be locked or have lock suggestions" - ) - - self.stderr.write("\n".join(msg)) - - for ballot in qs: - try: - parse_raw_data_for_ballot(ballot, options["reparse"]) - except ValueError as e: - print(str(e)) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py index 41db0e1f57..7b38b54b6a 100644 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py +++ 
b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py @@ -1,4 +1,3 @@ -from django.conf import settings from django.core.management.base import BaseCommand from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot from sopn_parsing.helpers.textract_helpers import ( @@ -8,7 +7,6 @@ from sopn_parsing.models import ( AWSTextractParsedSOPN, AWSTextractParsedSOPNStatus, - CamelotParsedSOPN, ) @@ -21,22 +19,16 @@ class Command(BaseCommand): This script picks up where `parse` left off. It manages two cases: - # Camelot - - We expect to have made a `CamelotParsedSOPN` with `raw_data` populated. This will only have - happened if the file is a PDF readable by Camelot. - - We need to parse the `raw_data` into `parsed_data` and then make a `RawData` object for bulk adding. - # AWS Textract - We should have made a `AWSTextractParsedSOPN` with `job_id` populated. Textract is async, - so the initial `parse` just submits the data to AWS and gets a job_id. + We should have made an `AWSTextractParsedSOPN` with `job_id` populated. + Textract is async, so the initial `parse` just submits the data to AWS and + gets a job_id. We need to check if the job ID has finished and pull in the data to `raw_data`. - We're then in the same state as the Camelot method above, we need to parse the `raw_data` into - `parsed_data` and makr a `RawData` object for bulk adding. + We need to parse the `raw_data` into `parsed_data` and make a `RawData` + object for bulk adding. 
""" def handle(self, *args, **options): @@ -45,15 +37,6 @@ def handle(self, *args, **options): "sopn__ballot__candidates_locked": False, } - if getattr(settings, "CAMELOT_ENABLED", False): - # Camelot first - qs = ( - CamelotParsedSOPN.objects.filter(parsed_data=None) - .exclude(raw_data="") - .filter(**current_ballot_kwargs) - ) - self.parse_tables_for_qs(qs) - # Textract qs = AWSTextractParsedSOPN.objects.exclude( status__in=[ diff --git a/ynr/apps/sopn_parsing/tests/test_extract_tables.py b/ynr/apps/sopn_parsing/tests/test_extract_tables.py index 21a03dfb63..bda0d99fd3 100644 --- a/ynr/apps/sopn_parsing/tests/test_extract_tables.py +++ b/ynr/apps/sopn_parsing/tests/test_extract_tables.py @@ -1,15 +1,10 @@ from os.path import abspath, dirname, join -from unittest import skipIf from candidates.tests.helpers import TmpMediaRootMixin from candidates.tests.uk_examples import UK2015ExamplesMixin from django.core.files.uploadedfile import SimpleUploadedFile -from django.core.management import call_command from django.test import TestCase from official_documents.models import BallotSOPN -from sopn_parsing.helpers.extract_tables import extract_ballot_table -from sopn_parsing.models import CamelotParsedSOPN -from sopn_parsing.tests import should_skip_pdf_tests class TestSOPNHelpers(TmpMediaRootMixin, UK2015ExamplesMixin, TestCase): @@ -27,152 +22,3 @@ def setUp(self): uploaded_file=SimpleUploadedFile("sopn.pdf", sopn_file), source_url="example.com", ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_extract_tables(self): - extract_ballot_table(self.dulwich_post_ballot) - self.assertEqual( - CamelotParsedSOPN.objects.get().as_pandas.to_dict(), - { - "0": { - "0": "Name of \nCandidate", - "1": "ALAGARATNAM \nRathy", - "2": "BARBER \nJames", - "3": "HAYES \nHelen Elizabeth", - "4": "KANUMANSA \nAmadu", - "5": "KOTECHA \nResham", - "6": "LAMBERT \nRobin Andrew \nDavid", - "7": "NALLY \nSteve", - "8": "NIX \nRashid", - }, - "1": { - "0": "Home 
\nAddress", - "1": "(address in the \nMitcham and Morden \nConstituency)", - "2": "33 Champion Hill, \nLondon, SE5 8BS", - "3": "11 Woodsyre, \nSydenham Hill, \nLondon, SE26 6SS", - "4": "11 Coleridge House, \nBrowning Street, \nLondon, SE17 1DG", - "5": "(address in the \nRuislip, Northwood \nand Pinner \nConstituency)", - "6": "(address in the \nDuwlich and West \nNorwood \nConstituency)", - "7": "(address in the \nVauxhall \nConstituency)", - "8": "66 Guinness Court, \nLondon, SW3 2PQ", - }, - "2": { - "0": "Description \n(if any)", - "1": "UK Independence \nParty (UKIP)", - "2": "Liberal Democrat", - "3": "Labour Party", - "4": "All People`s Party", - "5": "The Conservative \nParty Candidate", - "6": "Independent", - "7": "Trade Unionist \nand Socialist \nCoalition", - "8": "The Green Party", - }, - "3": { - "0": "Name of Assentors \nProposer(+), Seconder(++)", - "1": "Coleman Alice M + \n" - "Potter Keith S ++ \n" - "Potter Stephanie \n" - "Smith Bryan L \n" - "Anderson Beth \n" - "Lumba Avita \n" - "Andersen Robert \n" - "Patel Sajal \n" - "Stanbury Linda \n" - "Stanbury James", - "2": "Fitchett Keith + \n" - "Price Jonathan ++ \n" - "Gardner Brigid \n" - "Waddington Simon \n" - "Morland Laura \n" - "Lester Rachel \n" - "Pidgeon Caroline \n" - "Hare David \n" - "Hanton Alastair \n" - "Haylett Alexander", - "3": "Samuel Gaynelle + \n" - "Whaley Stephen P ++ \n" - "Brazell Shadi M \n" - "De Souza Johnny \n" - "Alcock Heather \n" - "Natzler Robert S \n" - "Pearce Michelle E \n" - "Pickering Robert \n" - "Richardson Katherine G \n" - "Pickard Jane", - "4": "King James + \n" - "King Rosemary ++ \n" - "King David \n" - "Davies Yadalieu \n" - "Sesay Mary \n" - "Rahman Layla K \n" - "Rahman Syed A \n" - "Ahmed Jalaluddin \n" - "Rahman Tajwar S \n" - "Rahman Taamid S", - "5": "Davis James G + \n" - "Bradbury David S ++ \n" - "Badman Susan E \n" - "Hill-Archer Roderick C \n" - "Langley Anne C \n" - "Mitchell Andrew M \n" - "Virgo Marjorie J \n" - "Virgo Philip A \n" - 
"Chathli Lindsay \n" - "Broomhead Robert A", - "6": "Smith Caitlin + \n" - "Parks Jesse ++ \n" - "Connage Kyesha \n" - "Hendry Perihan \n" - "Mounty E J \n" - "Sharif B \n" - "Scott Wellesley \n" - "Harriott S A \n" - "Harriott Clive \n" - "Ojumu Ibi", - "7": "Tullis Andrew C + \n" - "Mason Joshua H ++ \n" - "Parkinson Francine M \n" - "Gait Elizabeth \n" - "Doolan Samantha \n" - "Ubiaro Elizabeth \n" - "Garner Stuart \n" - "Akinjogbin Dolapo \n" - "Walker Donna \n" - "Lang Geoffrey P", - "8": "Atwell E G + \n" - "Rose Lloyd ++ \n" - "O`Shea C \n" - "Gomes Jacqueline \n" - "Wood Thomas \n" - "Rosenfeld David \n" - "Conroy Martin \n" - "Skiadopoulou I \n" - "Rosenfeld Lawrence \n" - "Rosenfeld Emily", - }, - "4": { - "0": "Reason why \nno longer \nnominated*", - "1": "", - "2": "", - "3": "", - "4": "", - "5": "", - "6": "", - "7": "", - "8": "", - }, - }, - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_extract_command_current(self): - self.assertEqual(CamelotParsedSOPN.objects.count(), 0) - call_command("sopn_parsing_extract_tables", current=True) - self.assertEqual(CamelotParsedSOPN.objects.count(), 1) - - def test_extract_command_current_no_current_elections(self): - self.election.current = False - self.election.save() - self.assertEqual(CamelotParsedSOPN.objects.count(), 0) - call_command("sopn_parsing_extract_tables", current=True) - self.assertEqual(CamelotParsedSOPN.objects.count(), 0) diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py deleted file mode 100644 index 922c487dd4..0000000000 --- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py +++ /dev/null @@ -1,529 +0,0 @@ -import json -from pathlib import Path -from unittest import skipIf -from unittest.mock import patch - -from bulk_adding.models import RawPeople -from candidates.tests.uk_examples import UK2015ExamplesMixin -from django.core.management import call_command -from django.db import connection 
-from django.test import TestCase -from official_documents.models import BallotSOPN -from pandas import Index, Series -from parties.models import Party, PartyDescription -from parties.tests.factories import PartyFactory -from parties.tests.fixtures import DefaultPartyFixtures -from sopn_parsing.helpers import parse_tables -from sopn_parsing.models import CamelotParsedSOPN -from sopn_parsing.tests import should_skip_pdf_tests -from sopn_parsing.tests.data.welsh_sopn_data import welsh_sopn_data - -from ynr.apps.sopn_parsing.management.commands.sopn_parsing_parse_tables import ( - Command as ParseTablesCommand, -) - - -class TestSOPNHelpers(DefaultPartyFixtures, UK2015ExamplesMixin, TestCase): - def setUp(self): - PartyFactory(ec_id="PP85", name="UK Independence Party (UKIP)") - with connection.cursor() as cursor: - cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;") - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_basic_parsing(self): - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.dulwich_post_ballot, - source_url="example.com", - ) - dataframe = json.dumps( - { - "0": { - "0": "Name of \nCandidate", - "1": "BRADBURY \nAndrew John", - "2": "COLLINS \nDave", - "3": "HARVEY \nPeter John", - "4": "JENNER \nMelanie", - }, - "1": { - "0": "Home Address", - "1": "10 Fowey Close, \nShoreham by Sea, \nWest Sussex, \nBN43 5HE", - "2": "51 Old Fort Road, \nShoreham by Sea, \nBN43 5RL", - "3": "76 Harbour Way, \nShoreham by Sea, \nSussex, \nBN43 5HH", - "4": "9 Flag Square, \nShoreham by Sea, \nWest Sussex, \nBN43 5RZ", - }, - "2": { - "0": "Description (if \nany)", - "1": "Green Party", - "2": "Independent", - "3": "UK Independence \nParty (UKIP)", - "4": "Labour Party", - }, - "3": { - "0": "Name of \nProposer", - "1": "Tiffin Susan J", - "2": "Loader Jocelyn C", - "3": "Hearne James H", - "4": "O`Connor Lavinia", - }, - "4": { - "0": "Reason \nwhy no \nlonger \nnominated\n*", - 
"1": "", - "2": "", - "3": "", - "4": "", - }, - } - ) - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=dataframe, status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - raw_people.data, - [ - {"name": "Andrew John Bradbury", "party_id": "PP63"}, - {"name": "Dave Collins", "party_id": "ynmp-party:2"}, - {"name": "Peter John Harvey", "party_id": "PP85"}, - {"name": "Melanie Jenner", "party_id": "PP53"}, - ], - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_welsh_run_sopn(self): - """ - Test that if the ballot is welsh run and previous party affiliations - are included they are parsed - """ - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.senedd_ballot, - source_url="example.com", - ) - - plaid_cymru, _ = Party.objects.update_or_create( - ec_id="PP77", - legacy_slug="party:77", - defaults={ - "name": "Plaid Cymru - The Party of Wales", - "date_registered": "1999-01-14", - }, - ) - - dataframe = json.dumps(welsh_sopn_data) - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=dataframe, status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - raw_people.data, - [ - { - "name": "John Smith", - "party_id": self.conservative_party.ec_id, - "previous_party_affiliations": [self.ld_party.ec_id], - }, - { - "name": "Joe Bloggs", - "party_id": self.labour_party.ec_id, - "previous_party_affiliations": ["ynmp-party:2"], - }, - {"name": "Jon Doe", "party_id": self.ld_party.ec_id}, - { - "name": "Jane Brown", - "party_id": "ynmp-party:2", - "previous_party_affiliations": [plaid_cymru.ec_id], - }, - { - "name": "Judy Johnson", - "party_id": plaid_cymru.ec_id, - "previous_party_affiliations": [self.labour_party.ec_id], - }, - {"name": "Julie Williams", 
"party_id": "ynmp-party:2"}, - ], - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_match_complex_descriptions(self): - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.senedd_ballot, - source_url="example.com", - ) - - plaid_cymru, _ = Party.objects.update_or_create( - ec_id="PP77", - legacy_slug="party:77", - defaults={ - "name": "Plaid Cymru - The Party of Wales", - "date_registered": "1999-01-14", - }, - ) - - dickens_heath, _ = Party.objects.update_or_create( - ec_id="PP1", - legacy_slug="PP!", - defaults={ - "name": "Independent Dickens Heath Residents Action Group", - "date_registered": "1999-01-14", - }, - ) - PartyDescription.objects.create( - party=dickens_heath, - description="Independent Dickens Heath Residents Action Group", - ) - lib_dem, _ = Party.objects.update_or_create( - ec_id="PP100", - legacy_slug="PP100", - defaults={ - "name": "Liberal Democrats", - "date_registered": "1999-01-14", - }, - register="GB", - ) - - PartyDescription.objects.create( - party=lib_dem, - description="Liberal Democrat Focus Team | Tîm Ffocws y Democratiaid Rhyddfrydol", - ) - - data_path = ( - Path(__file__).parent / "data/edge_case_description_data.json" - ) - with data_path.open() as f: - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=f.read(), status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - sorted(raw_people.data, key=lambda x: x["name"]), - sorted( - [ - { - "name": "John Smith", - "party_id": self.conservative_party.ec_id, - }, - { - "name": "Joe Bloggs", - "party_id": self.labour_party.ec_id, - }, - { - "name": "Jon Doe", - "party_id": self.ld_party.ec_id, - }, - { - "name": "Jane Brown", - "party_id": "ynmp-party:2", - }, - { - "name": "Judy Johnson", - "party_id": plaid_cymru.ec_id, - }, - {"name": "Julie Williams", "party_id": "ynmp-party:2"}, - 
], - key=lambda x: x["name"], - ), - ) - - -class TestParseTablesUnitTests(UK2015ExamplesMixin, TestCase): - def get_two_name_field_cases(self): - # this could be updated with more combinations as we come across them - return [ - { - "name_fields": ["candidate surname", "candidate forename"], - "row": { - "candidate surname": "BAGSHAW", - "candidate forename": "Elaine Sheila", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": [ - "candidate forename", - "candidate surname", - ], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["surname", "other names"], - "row": { - "surname": "BAGSHAW", - "other names": "Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": ["other names", "surname"], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["last name", "other names"], - "row": { - "last name": "BAGSHAW", - "other names": "Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": ["other names", "last name"], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["candidate forename", "candidate surname"], - "row": { - "candidate forename": "Elaine Sheila", - "candidate surname": "BAGSHAW", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": [ - "candidate forename", - "candidate surname", - ], - "expected_name": "Elaine Sheila Bagshaw", - }, - ] - - def get_single_name_field_cases(self): - return [ - { - "name_fields": ["name of candidate"], - "row": { - "name of candidate": "BAGSHAW Elaine Sheila", - "home address": "1 
Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["names of candidate"], - "row": { - "names of candidate": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["candidate name"], - "row": { - "candidate name": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["surname"], - "row": { - "surname": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["candidates surname"], - "row": { - "candidates surname": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["other name"], - "row": { - "other name": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - ] - - def test_get_name_single_field(self): - for case in self.get_single_name_field_cases(): - row = Series(case["row"]) - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - assert len(case["name_fields"]) == 1 - name = parse_tables.get_name(row=row, name_fields=name_fields) - assert name == "Elaine Sheila Bagshaw" - - def test_get_name_two_fields(self): - for case in self.get_two_name_field_cases(): - row = Series(case["row"]) - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - assert len(case["name_fields"]) == 2 - name = parse_tables.get_name(row=row, 
name_fields=name_fields) - assert name == case["expected_name"] - - def test_get_name_fields_single(self): - for case in self.get_single_name_field_cases(): - row = Index(case["row"]) - with self.subTest(row=row): - name_fields = parse_tables.get_name_fields(row=row) - assert len(name_fields) == 1 - assert name_fields == case["name_fields"] - - def test_get_name_fields_two(self): - for case in self.get_two_name_field_cases(): - row = Index(case["row"]) - with self.subTest(row=row): - name_fields = parse_tables.get_name_fields(row=row) - assert len(name_fields) == 2 - assert name_fields == case["name_fields"] - - def test_get_name_fields_raises_error(self): - row = Index({"foo": "Bar"}) - with self.assertRaises(ValueError): - parse_tables.get_name_fields(row=row) - - def test_order_name_fields(self): - for case in self.get_two_name_field_cases(): - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - result = parse_tables.order_name_fields(name_fields) - assert result == case["ordered_name_fields"] - - def test_clean_name_replaces_backticks(self): - name = parse_tables.clean_name("D`SOUZA") - assert "`" not in name - assert "'" in name - - def test_clean_name_replaces_newlines(self): - name = parse_tables.clean_name( - "A Very Long Name That Splits \nOver Lines" - ) - assert "\n" not in name - - def test_clean_name_capitalized_last_and_titalized(self): - name = parse_tables.clean_name("SMITH John") - assert name == "John Smith" - - def test_clean_last_names(self): - name = parse_tables.clean_last_names(["MACDONALD", "John"]) - assert name == "MacDonald" - - def test_clean_name_two_word_surnames(self): - names = [ - ("EDE COOPER \nPalmer", "Palmer Ede Cooper"), - ("VAN DULKEN \nRichard Michael", "Richard Michael Van Dulken"), - ("ARMSTRONG LILLEY \nLynne", "Lynne Armstrong Lilley"), - ( - " D`SOUZA Aaron Anthony Jose \nHasan", - "Aaron Anthony Jose Hasan D'Souza", - ), - ("Michael James Collins", "Michael James Collins"), - (" Michael 
James Collins ", "Michael James Collins"), - ("DAVE Nitesh Pravin", "Nitesh Pravin Dave"), - ("DAVE\nNitesh Pravin", "Nitesh Pravin Dave"), - ("COOKE Anne-Marie", "Anne-Marie Cooke"), - ("COOKE\nAnne-Marie", "Anne-Marie Cooke"), - ("BROOKES-\nDUNCAN\nKaty", "Katy Brookes-Duncan"), - ("HOUNSOME\nJohn", "John Hounsome"), - ("O`CONNELL \nStephen John", "Stephen John O'Connell"), - ("O`NEAL \nCarol Joy", "Carol Joy O'Neal"), - ("O`REILLY \nTracey Linda \nDiane", "Tracey Linda Diane O'Reilly"), - ("LIAM THOMAS O'ROURKE", "Liam Thomas O'Rourke"), - ("O'CALLAGHAN \nClaire Louise", "Claire Louise O'Callaghan"), - ] - for name in names: - with self.subTest(name=names[0]): - assert parse_tables.clean_name(name[0]) == name[1] - - def test_clean_description_removes_newlines(self): - cleaned_description = parse_tables.clean_description( - "A Long Description That Splits \nOver \\nLines" - ) - assert "\n" not in cleaned_description - assert "\\n" not in cleaned_description - - def test_clean_description_replaces_backticks(self): - cleaned_description = parse_tables.clean_description( - "All People`s Party" - ) - assert "`" not in cleaned_description - assert "'" in cleaned_description - assert cleaned_description == "All People's Party" - - def test_guess_previous_party_affiliations_field(self): - sopn = CamelotParsedSOPN(raw_data=json.dumps(welsh_sopn_data)) - data = sopn.as_pandas - data.columns = data.iloc[0] - - cases = [ - (self.dulwich_post_ballot, None), - (self.senedd_ballot, "statement of party membership"), - ] - for case in cases: - with self.subTest(msg=case[0]): - sopn.sopn = BallotSOPN(ballot=case[0]) - result = parse_tables.guess_previous_party_affiliations_field( - data=data, sopn=sopn - ) - assert result == case[1] - - def test_add_previous_party_affiliations(self): - cases = [ - {"party_str": "", "party": None, "expected": {}}, - {"party_str": "Unknown Party", "party": None, "expected": {}}, - { - "party_str": "Labour Party", - "party": self.labour_party, - 
"expected": { - "previous_party_affiliations": [self.labour_party.ec_id] - }, - }, - ] - for case in cases: - with self.subTest(msg=case["party_str"]), patch.object( - parse_tables, "get_party", return_value=case["party"] - ): - raw_data = {} - sopn = CamelotParsedSOPN() - result = parse_tables.add_previous_party_affiliations( - party_str=case["party_str"], - raw_data=raw_data, - sopn=sopn, - ) - assert result == case["expected"] - - -class TestParseTablesFilterKwargs(TestCase): - def setUp(self): - self.command = ParseTablesCommand() - self.default_filter_kwargs = {} - - def test_when_testing(self): - options = {"testing": True} - result = self.command.build_filter_kwargs(options) - self.assertEqual(result, self.default_filter_kwargs) - - def test_when_using_ballot(self): - options = {"ballot": "local.foo.bar.2021-05-06"} - result = self.command.build_filter_kwargs(options) - self.assertEqual(result, self.default_filter_kwargs) - - def test_when_using_reparse(self): - options = {"reparse": True} - result = self.command.build_filter_kwargs(options) - expected = self.default_filter_kwargs.copy() - expected["rawpeople__source_type"] = RawPeople.SOURCE_PARSED_PDF - self.assertEqual(result, expected) - - def test_when_no_options(self): - options = {} - result = self.command.build_filter_kwargs(options) - expected = self.default_filter_kwargs.copy() - self.assertEqual(result, expected)