diff --git a/ynr/apps/bulk_adding/forms.py b/ynr/apps/bulk_adding/forms.py index da1f3ee5f5..56778fe70c 100644 --- a/ynr/apps/bulk_adding/forms.py +++ b/ynr/apps/bulk_adding/forms.py @@ -69,9 +69,9 @@ def __init__(self, *args, **kwargs): def get_form_kwargs(self, index): kwargs = super().get_form_kwargs(index) kwargs["party_choices"] = self.parties - kwargs["previous_party_affiliations_choices"] = ( - self.previous_party_affiliations_choices - ) + kwargs[ + "previous_party_affiliations_choices" + ] = self.previous_party_affiliations_choices return kwargs @property diff --git a/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py b/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py index 2f5c570260..da78290be4 100644 --- a/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py +++ b/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py @@ -198,9 +198,9 @@ def test_submit_name_and_social_media_links_for_area(self): form[f"{ballot.pk}-0-person_identifiers_0_1"] = "homepage_url" form[f"{ballot.pk}-0-person_identifiers_1_0"] = "pp@gmail.com" form[f"{ballot.pk}-0-person_identifiers_1_1"] = "email" - form[f"{ballot.pk}-0-person_identifiers_2_0"] = ( - "https://linkedin.com/in/pamphero" - ) + form[ + f"{ballot.pk}-0-person_identifiers_2_0" + ] = "https://linkedin.com/in/pamphero" form[f"{ballot.pk}-0-person_identifiers_2_1"] = "linkedin_url" response = form.submit().follow() @@ -304,22 +304,22 @@ def test_bulk_add_with_100_ballots(self): form["source"] = "https://example.com/candidates/" for ballot in ballots: form[f"{ballot.pk}-0-name"] = f"Candidate {ballot.pk}" - form[f"{ballot.pk}-0-biography"] = ( - f"Biography for Candidate {ballot.pk}" - ) + form[ + f"{ballot.pk}-0-biography" + ] = f"Biography for Candidate {ballot.pk}" form[f"{ballot.pk}-0-gender"] = "female" form[f"{ballot.pk}-0-birth_date"] = "1990" - form[f"{ballot.pk}-0-person_identifiers_0_0"] = ( - f"https://example.com/{ballot.pk}" - ) + form[ + f"{ballot.pk}-0-person_identifiers_0_0" + ] = f"https://example.com/{ballot.pk}" form[f"{ballot.pk}-0-person_identifiers_0_1"] = "homepage_url" - form[f"{ballot.pk}-0-person_identifiers_1_0"] = ( - f"candidate{ballot.pk}@example.com" - ) + form[ + f"{ballot.pk}-0-person_identifiers_1_0" + ] = f"candidate{ballot.pk}@example.com" form[f"{ballot.pk}-0-person_identifiers_1_1"] = "email" - form[f"{ballot.pk}-0-person_identifiers_2_0"] = ( - f"https://linkedin.com/in/candidate{ballot.pk}" - ) + form[ + f"{ballot.pk}-0-person_identifiers_2_0" + ] = f"https://linkedin.com/in/candidate{ballot.pk}" form[f"{ballot.pk}-0-person_identifiers_2_1"] = "linkedin_url" # Submit the form diff --git a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html index c61ab936ce..cef7f89616 100644 --- a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html +++ b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html @@ -12,6 +12,15 @@

Parsing Status

  • AWS Textract Data: {% if textract_parsed.raw_data %}Yes{% else %}No{% endif %}
  • AWS Textract Parsed? {% if textract_parsed.parsed_data %}Yes{% else %} No{% endif %}
  • +
  • + Withdrawal detected: + {% if textract_parsed.withdrawal_rows %} + Yes in row{{ textract_parsed.withdrawal_rows|pluralize }} + {{ textract_parsed.withdrawal_rows|join:", " }} + {% else %} + No + {% endif %} +
  • Camelot raw Data

    @@ -28,6 +37,7 @@

    Camelot table Data

    N/A {% endif %}
    + {{ textract_parsed.as_pandas.to_html|safe }} {% if textract_parsed and textract_parsed.as_textractor_document %} @@ -37,6 +47,7 @@
    {{ table.title.text }}
    {{ table.to_html|safe }} {% endfor %} {% endif %} + {% if textract_parsed.parsed_data %}

    AWS document markdown

    diff --git a/ynr/apps/elections/templates/elections/sopn_for_ballot.html b/ynr/apps/elections/templates/elections/sopn_for_ballot.html index d4a39d6d96..c7bf428065 100644 --- a/ynr/apps/elections/templates/elections/sopn_for_ballot.html +++ b/ynr/apps/elections/templates/elections/sopn_for_ballot.html @@ -67,6 +67,7 @@

    {% include "elections/includes/_sopn_debug.html" %} {% endif %} +
    {% else %} @@ -78,13 +79,14 @@

    {% url 'admin:official_documents_ballotsopn_change' object.sopn.id as url %} You can edit this in the admin interface (e.g. to delete it) {% endif %} - {% if object.sopn.uploaded_file.url|slice:"-3:" == "pdf" %} {% else %} diff --git a/ynr/apps/official_documents/migrations/0039_ballotsopn_withdrawal_detected_and_more.py b/ynr/apps/official_documents/migrations/0039_ballotsopn_withdrawal_detected_and_more.py new file mode 100644 index 0000000000..f5c70604dc --- /dev/null +++ b/ynr/apps/official_documents/migrations/0039_ballotsopn_withdrawal_detected_and_more.py @@ -0,0 +1,22 @@ +# Generated by Django 4.2.16 on 2025-04-02 08:01 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("official_documents", "0038_ballotsopn_replacement_reason_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="ballotsopn", + name="withdrawal_detected", + field=models.BooleanField(default=False), + ), + migrations.AddField( + model_name="ballotsopnhistory", + name="withdrawal_detected", + field=models.BooleanField(default=False), + ), + ] diff --git a/ynr/apps/official_documents/models.py b/ynr/apps/official_documents/models.py index 15de317657..37167e904d 100644 --- a/ynr/apps/official_documents/models.py +++ b/ynr/apps/official_documents/models.py @@ -220,6 +220,8 @@ class BaseBallotSOPN(TimeStampedModel): blank=True, ) + withdrawal_detected = models.BooleanField(default=False) + class Meta: get_latest_by = "modified" abstract = True diff --git a/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js b/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js index c5e7994e3b..12bed5f16d 100644 --- a/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js +++ b/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js @@ -9,7 +9,17 @@ var SOPN_VIEWER = (function () { var module = {}; - function load_page(pdf, container, page_num) { + function drawRectangles(context, rectangles) { + rectangles.forEach(rect => { + context.beginPath(); + context.rect(rect.x, rect.y, rect.width, rect.height); + context.lineWidth = rect.lineWidth || 1; + context.strokeStyle = rect.color || 'red'; + context.stroke(); + }); + } + + function load_page(pdf, container, page_num, rectanglesPerPage) { return pdf.getPage(page_num).then(function (page) { var scale = 1.2; @@ -25,6 +35,8 @@ var SOPN_VIEWER = (function () { var context = canvas.getContext("2d"); canvas.height = viewport.height; canvas.width = viewport.width; + console.log(canvas.height) + console.log(canvas.width) var renderContext = { canvasContext: context, viewport: viewport @@ -33,6 +45,12 @@ var SOPN_VIEWER = (function () { var renderTask = page.render(renderContext); return renderTask.promise.then(function () { container.append(page_container); + + if (rectanglesPerPage && rectanglesPerPage[page_num]) { + drawRectangles(context, rectanglesPerPage[page_num]); + } + + return page.getTextContent({normalizeWhitespace: true}); }).then(function (textContent) { var pdf_canvas = $(canvas), @@ -57,6 +75,8 @@ var SOPN_VIEWER = (function () { viewport: viewport, textDivs: [] }); + + }); } @@ -64,7 +84,7 @@ var SOPN_VIEWER = (function () { }); } - function ShowSOPNInline(sopn_url, ballot_paper_id, options) { + function ShowSOPNInline(sopn_url, ballot_paper_id, rectanglesPerPage) { // The container element var this_pdf_container = document.getElementById("sopn-" + ballot_paper_id); @@ -73,7 +93,7 @@ var SOPN_VIEWER = (function () { loadingTask.promise.then(function (pdf) { var promise = Promise.resolve(); for (let page = 1; page <= pdf.numPages; page++) { - promise = promise.then(() => load_page(pdf, this_pdf_container, page)); + promise = promise.then(() => load_page(pdf, this_pdf_container, page, rectanglesPerPage)); } return promise; }).then(null, function (error) { diff --git a/ynr/apps/parties/constants.py b/ynr/apps/parties/constants.py index 63fec2ea72..921a97853d 100644 --- a/ynr/apps/parties/constants.py +++ b/ynr/apps/parties/constants.py @@ -60,7 +60,7 @@ "PP504": 8345, # Independent Network "PP1951": 8015, - #Propel + # Propel "PP12731": 7769, # Chesterfield And North Derbyshire Independents (CANDI) "PP2883": 4670, diff --git a/ynr/apps/people/forms/forms.py b/ynr/apps/people/forms/forms.py index 5cf26f6b0b..b4d0433386 100644 --- a/ynr/apps/people/forms/forms.py +++ b/ynr/apps/people/forms/forms.py @@ -107,9 +107,9 @@ def clean(self): if self.cleaned_data.get("value_type") in self.HTTP_IDENTIFIERS: # Add https schema if missing if not self.cleaned_data.get("value").startswith("http"): - self.cleaned_data["value"] = ( - f"https://{self.cleaned_data['value']}" - ) + self.cleaned_data[ + "value" + ] = f"https://{self.cleaned_data['value']}" URLValidator()(value=self.cleaned_data["value"]) if ( "value_type" in self.cleaned_data @@ -216,9 +216,9 @@ def __init__(self, *args, **kwargs): ) if self.show_previous_party_affiliations: - self.fields["previous_party_affiliations"] = ( - PreviousPartyAffiliationsField(membership=self.instance) - ) + self.fields[ + "previous_party_affiliations" + ] = PreviousPartyAffiliationsField(membership=self.instance) @property def show_previous_party_affiliations(self): diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py index 247d0e7e31..6072e2c168 100644 --- a/ynr/apps/sopn_parsing/helpers/parse_tables.py +++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py @@ -3,6 +3,7 @@ from bulk_adding.models import RawPeople from candidates.models import Ballot +from django.conf import settings from django.contrib.postgres.search import TrigramSimilarity from django.core.files.base import ContentFile from django.core.files.storage import DefaultStorage @@ -13,6 +14,7 @@ from pandas import DataFrame from parties.models import Party, PartyDescription from sopn_parsing.helpers.text_helpers import clean_text +from sopn_parsing.models import AWSTextractParsedSOPN from utils.db import Levenshtein FIRST_NAME_FIELDS = [ @@ -479,17 +481,20 @@ def parse_raw_data(ballot: Ballot, reparse=False): Given a Ballot, go and get the Camelot and the AWS Textract dataframes and process them """ - - camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None) camelot_data = {} - textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None) - textract_data = {} - if ( + camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None) + if getattr(settings, "CAMELOT_ENABLED", False) and ( camelot_model and camelot_model.raw_data_type == "pandas" and (reparse or not camelot_model.parsed_data) ): camelot_data = parse_dataframe(ballot, camelot_model.as_pandas) + + textract_model: AWSTextractParsedSOPN = getattr( + ballot.sopn, "awstextractparsedsopn", None + ) + textract_data = {} + if ( textract_model and textract_model.raw_data @@ -498,6 +503,9 @@ def parse_raw_data(ballot: Ballot, reparse=False): ): if not textract_model.parsed_data: textract_model.parse_raw_data() + if textract_model.withdrawal_rows(): + ballot.sopn.withdrawal_detected = True + ballot.sopn.save() textract_data = parse_dataframe(ballot, textract_model.as_pandas) if camelot_data or textract_data: diff --git a/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py b/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py index c4824fb6ae..57f21612ac 100644 --- a/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py +++ b/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py @@ -10,10 +10,7 @@ class Migration(migrations.Migration): "sopn_parsing", "0006_rename_sopn_awstextractparsedsopn_official_document_and_more", ), - ( - "official_documents", - "0033_ballotsopnhistory_ballotsopn" - ), + ("official_documents", "0033_ballotsopnhistory_ballotsopn"), ] operations = [ diff --git a/ynr/apps/sopn_parsing/models.py b/ynr/apps/sopn_parsing/models.py index 1c10ae2cda..637692e900 100644 --- a/ynr/apps/sopn_parsing/models.py +++ b/ynr/apps/sopn_parsing/models.py @@ -1,10 +1,12 @@ import json +import re from io import BytesIO +import pandas from django.core.files.images import ImageFile from django.db import models +from django.utils.functional import cached_property from model_utils.models import TimeStampedModel -from pandas import concat from textractor.parsers import response_parser from textractor.parsers.response_parser import parse @@ -102,12 +104,15 @@ class AWSTextractParsedSOPN(TimeStampedModel): default=AWSTextractParsedSOPNStatus.NOT_STARTED, ) - @property + @cached_property def as_pandas(self): + if not self.parsed_data: + return None import pandas pandas.set_option("display.max_colwidth", None) - return pandas.DataFrame.from_dict(json.loads(self.parsed_data)) + # df = pandas.DataFrame.from_dict(json.loads(self.parsed_data)) + return self.parse_raw_data() def parse_raw_data(self): """ @@ -125,18 +130,208 @@ def parse_raw_data(self): # Store all data frames in a list frames = [] - # Table headers that we've seen - for table in parsed.tables: - # Get the pandas version of the table - df = table.to_pandas() - frames.append(df) - - # Merge all the dataframes - df = concat( - frames, - ignore_index=True, - ) + last_title = None + force_process_table = False + found_situation_of_poll = False + + for page in parsed.pages: + for layout in page.layouts[:5]: + if "polling station" in layout.text.lower(): + found_situation_of_poll = True + break + if found_situation_of_poll: + break + + for i, initial_table in enumerate(page.tables): + if initial_table.column_count < 3: + force_process_table = True + continue + try: + table_title = initial_table.title.text + except AttributeError: + table_title = "" + if "polling station" in table_title.lower(): + continue + + table = initial_table + if not force_process_table or page.page_num == 1: + df = self.remove_non_table_header_content( + initial_table.to_pandas() + ) + else: + df = initial_table.to_pandas() + # else: + # try: + # table = initial_table.strip_headers() + # df = table.to_pandas() + # except IndexError: + # df = self.remove_non_header_rows(initial_table.to_pandas()) + # + if i > 0 or page.page_num > 1: + if not force_process_table: + df = self.remove_header_rows(df) + force_process_table = False + + if df.empty: + continue + + frames.append(df) + + current_title = getattr(table.title, "text", None) + if last_title and current_title != last_title: + break + last_title = current_title + + all_rows = [] + max_len = 0 + for df in frames: + if df.empty: + continue + rows = df.values.tolist() + all_rows.extend(rows) + max_len = max(max_len, max(len(row) for row in rows)) + padded_rows = [row + [""] * (max_len - len(row)) for row in all_rows] + df = pandas.DataFrame(padded_rows) + # Don't parse situation of polling stations + df.reset_index(drop=True, inplace=True) + + polling_station_index = df[ + df.apply( + lambda row: row.astype(str) + .str.contains("polling station", case=False) + .any(), + axis=1, + ) + ].index + if not polling_station_index.empty: + polling_station_index = polling_station_index[0] + if isinstance(polling_station_index, str): + polling_station_index = int(polling_station_index) + new_df = df.loc[: polling_station_index - 1] + df = new_df + self.parsed_data = df.to_json() + return df + + def remove_non_table_header_content(self, df): + """ + Some tables include rows that aren't headers. Remove them + + """ + # How many rows to scan form the top of a df + max_search = 4 + + for i in range(min(len(df), max_search)): + if self.is_header_row(df.iloc[i]): + return df.iloc[i:].copy() + return df + + def remove_header_rows(self, df: pandas.DataFrame): + """ + Given a data frame, remove header rows + + """ + # How many rows to scan form the top of a df + max_search = 4 + header_start_index = 0 + header_row_found = False + + for i in range(min(len(df), max_search)): + if self.is_header_row(df.iloc[i]): + header_row_found = True + break + header_start_index += 1 + if header_row_found: + df = df.iloc[header_start_index + 1 :].copy() + return df def as_textractor_document(self): + if not self.raw_data: + return None return response_parser.parse(json.loads(self.raw_data)) + + def normalise_row(self, row): + """Convert a row to a cleaned, comparable list of strings.""" + return [ + str(re.sub("[^a-z\s]", "", cell.lower())).strip() + for cell in row + if cell + ] + + def is_header_row(self, row): + keywords = ["name", "first", "surname"] + cleaned = self.normalise_row(row) + if len(cleaned) <= 3: + return False + return any(any(kw in cell for kw in keywords) for cell in cleaned) + + def get_withdrawal_column(self): + column_names = [ + "no longer", + "withdrawal", + "invalid", + "decision", + ] + if self.as_pandas.empty: + return None + for i, heading in enumerate(self.as_pandas.iloc[0]): + if any(col in str(heading) for col in column_names): + return self.as_pandas[i] + return None + + def withdrawal_rows(self): + column_values = self.get_withdrawal_column() + if column_values is None: + return None + # column_values = self.as_pandas[column].tolist() + cells_with_value = [] + for i, row in enumerate(column_values): + # Skip the header, as that always contains a value + if i == 0: + continue + if row: + cells_with_value.append(i) + return cells_with_value + + def get_withdrawals_bboxes(self): + return "{}" + # headers = self.as_pandas.iloc[0].tolist() + # get colmun index from headers + column = "4" + column_values = self.as_pandas[column].tolist() + cells_with_value = [] + for i, row in enumerate(column_values): + if row: + cells_with_value.append(i) + cells_with_value.pop(0) + # Deal with more than one page + textract_cells = [] + for table in self.as_textractor_document().tables: + for cell in table.table_cells: + # if str(cell.col_index-1) != column: + # continue + if cell.row_index - 1 in cells_with_value: + textract_cells.append(cell) + print(textract_cells) + + doc_height = 1429 + doc_width = 1010 + + page = 1 + box_data = {page: []} + for cell in textract_cells: + absolute_x = cell.x * doc_width + absolute_y = cell.y * doc_height + absolute_width = cell.width * doc_width + absolute_height = cell.height * doc_height + box_data[page].append( + { + "x": absolute_x, + "y": absolute_y, + "width": absolute_width, + "height": absolute_height, + "color": "red", + "lineWidth": 2, + }, + ) + return json.dumps(box_data) diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py index 5e4db9c1b6..bffe1a8025 100644 --- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py +++ b/ynr/apps/sopn_parsing/tests/test_parse_tables.py @@ -7,7 +7,7 @@ from candidates.tests.uk_examples import UK2015ExamplesMixin from django.core.management import call_command from django.db import connection -from django.test import TestCase +from django.test import TestCase, override_settings from official_documents.models import BallotSOPN from pandas import Index, Series from parties.models import Party, PartyDescription @@ -30,6 +30,7 @@ def setUp(self): cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;") @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") + @override_settings(CAMELOT_ENABLED=True) def test_basic_parsing(self): self.assertFalse(RawPeople.objects.exists()) doc = BallotSOPN.objects.create( @@ -92,6 +93,7 @@ def test_basic_parsing(self): ) @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") + @override_settings(CAMELOT_ENABLED=True) def test_welsh_run_sopn(self): """ Test that if the ballot is welsh run and previous party affiliations @@ -148,6 +150,7 @@ def test_welsh_run_sopn(self): ) @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") + @override_settings(CAMELOT_ENABLED=True) def test_match_complex_descriptions(self): self.assertFalse(RawPeople.objects.exists()) doc = BallotSOPN.objects.create(