diff --git a/ynr/apps/bulk_adding/forms.py b/ynr/apps/bulk_adding/forms.py
index da1f3ee5f5..56778fe70c 100644
--- a/ynr/apps/bulk_adding/forms.py
+++ b/ynr/apps/bulk_adding/forms.py
@@ -69,9 +69,9 @@ def __init__(self, *args, **kwargs):
def get_form_kwargs(self, index):
kwargs = super().get_form_kwargs(index)
kwargs["party_choices"] = self.parties
- kwargs["previous_party_affiliations_choices"] = (
- self.previous_party_affiliations_choices
- )
+ kwargs[
+ "previous_party_affiliations_choices"
+ ] = self.previous_party_affiliations_choices
return kwargs
@property
diff --git a/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py b/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py
index 2f5c570260..da78290be4 100644
--- a/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py
+++ b/ynr/apps/bulk_adding/tests/test_bulk_add_by_party.py
@@ -198,9 +198,9 @@ def test_submit_name_and_social_media_links_for_area(self):
form[f"{ballot.pk}-0-person_identifiers_0_1"] = "homepage_url"
form[f"{ballot.pk}-0-person_identifiers_1_0"] = "pp@gmail.com"
form[f"{ballot.pk}-0-person_identifiers_1_1"] = "email"
- form[f"{ballot.pk}-0-person_identifiers_2_0"] = (
- "https://linkedin.com/in/pamphero"
- )
+ form[
+ f"{ballot.pk}-0-person_identifiers_2_0"
+ ] = "https://linkedin.com/in/pamphero"
form[f"{ballot.pk}-0-person_identifiers_2_1"] = "linkedin_url"
response = form.submit().follow()
@@ -304,22 +304,22 @@ def test_bulk_add_with_100_ballots(self):
form["source"] = "https://example.com/candidates/"
for ballot in ballots:
form[f"{ballot.pk}-0-name"] = f"Candidate {ballot.pk}"
- form[f"{ballot.pk}-0-biography"] = (
- f"Biography for Candidate {ballot.pk}"
- )
+ form[
+ f"{ballot.pk}-0-biography"
+ ] = f"Biography for Candidate {ballot.pk}"
form[f"{ballot.pk}-0-gender"] = "female"
form[f"{ballot.pk}-0-birth_date"] = "1990"
- form[f"{ballot.pk}-0-person_identifiers_0_0"] = (
- f"https://example.com/{ballot.pk}"
- )
+ form[
+ f"{ballot.pk}-0-person_identifiers_0_0"
+ ] = f"https://example.com/{ballot.pk}"
form[f"{ballot.pk}-0-person_identifiers_0_1"] = "homepage_url"
- form[f"{ballot.pk}-0-person_identifiers_1_0"] = (
- f"candidate{ballot.pk}@example.com"
- )
+ form[
+ f"{ballot.pk}-0-person_identifiers_1_0"
+ ] = f"candidate{ballot.pk}@example.com"
form[f"{ballot.pk}-0-person_identifiers_1_1"] = "email"
- form[f"{ballot.pk}-0-person_identifiers_2_0"] = (
- f"https://linkedin.com/in/candidate{ballot.pk}"
- )
+ form[
+ f"{ballot.pk}-0-person_identifiers_2_0"
+ ] = f"https://linkedin.com/in/candidate{ballot.pk}"
form[f"{ballot.pk}-0-person_identifiers_2_1"] = "linkedin_url"
# Submit the form
diff --git a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
index c61ab936ce..cef7f89616 100644
--- a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
+++ b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
@@ -12,6 +12,15 @@
Parsing Status
AWS Textract Data: {% if textract_parsed.raw_data %}Yes{% else %}No{% endif %}
AWS Textract Parsed? {% if textract_parsed.parsed_data %}Yes{% else %}
No{% endif %}
+
+ Withdrawal detected:
+ {% if textract_parsed.withdrawal_rows %}
+ Yes in row{{ textract_parsed.withdrawal_rows|pluralize }}
+ {{ textract_parsed.withdrawal_rows|join:", " }}
+ {% else %}
+ No
+ {% endif %}
+
Camelot raw Data
@@ -28,6 +37,7 @@ Camelot table Data
N/A
{% endif %}
+ {{ textract_parsed.as_pandas.to_html|safe }}
{% if textract_parsed and textract_parsed.as_textractor_document %}
@@ -37,6 +47,7 @@ {{ table.title.text }}
{{ table.to_html|safe }}
{% endfor %}
{% endif %}
+
{% if textract_parsed.parsed_data %}
AWS document markdown
diff --git a/ynr/apps/elections/templates/elections/sopn_for_ballot.html b/ynr/apps/elections/templates/elections/sopn_for_ballot.html
index d4a39d6d96..c7bf428065 100644
--- a/ynr/apps/elections/templates/elections/sopn_for_ballot.html
+++ b/ynr/apps/elections/templates/elections/sopn_for_ballot.html
@@ -67,6 +67,7 @@
{% include "elections/includes/_sopn_debug.html" %}
{% endif %}
+
{% else %}
@@ -78,13 +79,14 @@
{% url 'admin:official_documents_ballotsopn_change' object.sopn.id as url %}
You can edit this in the admin interface (e.g. to delete it)
{% endif %}
-
{% if object.sopn.uploaded_file.url|slice:"-3:" == "pdf" %}
{% else %}
diff --git a/ynr/apps/official_documents/migrations/0039_ballotsopn_withdrawal_detected_and_more.py b/ynr/apps/official_documents/migrations/0039_ballotsopn_withdrawal_detected_and_more.py
new file mode 100644
index 0000000000..f5c70604dc
--- /dev/null
+++ b/ynr/apps/official_documents/migrations/0039_ballotsopn_withdrawal_detected_and_more.py
@@ -0,0 +1,22 @@
+# Generated by Django 4.2.16 on 2025-04-02 08:01
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("official_documents", "0038_ballotsopn_replacement_reason_and_more"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="ballotsopn",
+ name="withdrawal_detected",
+ field=models.BooleanField(default=False),
+ ),
+ migrations.AddField(
+ model_name="ballotsopnhistory",
+ name="withdrawal_detected",
+ field=models.BooleanField(default=False),
+ ),
+ ]
diff --git a/ynr/apps/official_documents/models.py b/ynr/apps/official_documents/models.py
index 15de317657..37167e904d 100644
--- a/ynr/apps/official_documents/models.py
+++ b/ynr/apps/official_documents/models.py
@@ -220,6 +220,8 @@ class BaseBallotSOPN(TimeStampedModel):
blank=True,
)
+ withdrawal_detected = models.BooleanField(default=False)
+
class Meta:
get_latest_by = "modified"
abstract = True
diff --git a/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js b/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js
index c5e7994e3b..12bed5f16d 100644
--- a/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js
+++ b/ynr/apps/official_documents/templates/official_documents/sopn_viewer.js
@@ -9,7 +9,17 @@ var SOPN_VIEWER = (function () {
var module = {};
- function load_page(pdf, container, page_num) {
+ function drawRectangles(context, rectangles) {
+ rectangles.forEach(rect => {
+ context.beginPath();
+ context.rect(rect.x, rect.y, rect.width, rect.height);
+ context.lineWidth = rect.lineWidth || 1;
+ context.strokeStyle = rect.color || 'red';
+ context.stroke();
+ });
+ }
+
+ function load_page(pdf, container, page_num, rectanglesPerPage) {
return pdf.getPage(page_num).then(function (page) {
var scale = 1.2;
@@ -25,6 +35,8 @@ var SOPN_VIEWER = (function () {
var context = canvas.getContext("2d");
canvas.height = viewport.height;
canvas.width = viewport.width;
+ // NOTE(review): removed leftover console.log calls that dumped
+ // canvas.height / canvas.width on every rendered page
var renderContext = {
canvasContext: context,
viewport: viewport
@@ -33,6 +45,12 @@ var SOPN_VIEWER = (function () {
var renderTask = page.render(renderContext);
return renderTask.promise.then(function () {
container.append(page_container);
+
+ if (rectanglesPerPage && rectanglesPerPage[page_num]) {
+ drawRectangles(context, rectanglesPerPage[page_num]);
+ }
+
+
return page.getTextContent({normalizeWhitespace: true});
}).then(function (textContent) {
var pdf_canvas = $(canvas),
@@ -57,6 +75,8 @@ var SOPN_VIEWER = (function () {
viewport: viewport,
textDivs: []
});
+
+
});
}
@@ -64,7 +84,7 @@ var SOPN_VIEWER = (function () {
});
}
- function ShowSOPNInline(sopn_url, ballot_paper_id, options) {
+ function ShowSOPNInline(sopn_url, ballot_paper_id, rectanglesPerPage) {
// The container element
var this_pdf_container = document.getElementById("sopn-" + ballot_paper_id);
@@ -73,7 +93,7 @@ var SOPN_VIEWER = (function () {
loadingTask.promise.then(function (pdf) {
var promise = Promise.resolve();
for (let page = 1; page <= pdf.numPages; page++) {
- promise = promise.then(() => load_page(pdf, this_pdf_container, page));
+ promise = promise.then(() => load_page(pdf, this_pdf_container, page, rectanglesPerPage));
}
return promise;
}).then(null, function (error) {
diff --git a/ynr/apps/parties/constants.py b/ynr/apps/parties/constants.py
index 63fec2ea72..921a97853d 100644
--- a/ynr/apps/parties/constants.py
+++ b/ynr/apps/parties/constants.py
@@ -60,7 +60,7 @@
"PP504": 8345,
# Independent Network
"PP1951": 8015,
- #Propel
+ # Propel
"PP12731": 7769,
# Chesterfield And North Derbyshire Independents (CANDI)
"PP2883": 4670,
diff --git a/ynr/apps/people/forms/forms.py b/ynr/apps/people/forms/forms.py
index 5cf26f6b0b..b4d0433386 100644
--- a/ynr/apps/people/forms/forms.py
+++ b/ynr/apps/people/forms/forms.py
@@ -107,9 +107,9 @@ def clean(self):
if self.cleaned_data.get("value_type") in self.HTTP_IDENTIFIERS:
# Add https schema if missing
if not self.cleaned_data.get("value").startswith("http"):
- self.cleaned_data["value"] = (
- f"https://{self.cleaned_data['value']}"
- )
+ self.cleaned_data[
+ "value"
+ ] = f"https://{self.cleaned_data['value']}"
URLValidator()(value=self.cleaned_data["value"])
if (
"value_type" in self.cleaned_data
@@ -216,9 +216,9 @@ def __init__(self, *args, **kwargs):
)
if self.show_previous_party_affiliations:
- self.fields["previous_party_affiliations"] = (
- PreviousPartyAffiliationsField(membership=self.instance)
- )
+ self.fields[
+ "previous_party_affiliations"
+ ] = PreviousPartyAffiliationsField(membership=self.instance)
@property
def show_previous_party_affiliations(self):
diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py
index 247d0e7e31..6072e2c168 100644
--- a/ynr/apps/sopn_parsing/helpers/parse_tables.py
+++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py
@@ -3,6 +3,7 @@
from bulk_adding.models import RawPeople
from candidates.models import Ballot
+from django.conf import settings
from django.contrib.postgres.search import TrigramSimilarity
from django.core.files.base import ContentFile
from django.core.files.storage import DefaultStorage
@@ -13,6 +14,7 @@
from pandas import DataFrame
from parties.models import Party, PartyDescription
from sopn_parsing.helpers.text_helpers import clean_text
+from sopn_parsing.models import AWSTextractParsedSOPN
from utils.db import Levenshtein
FIRST_NAME_FIELDS = [
@@ -479,17 +481,20 @@ def parse_raw_data(ballot: Ballot, reparse=False):
Given a Ballot, go and get the Camelot and the AWS Textract dataframes
and process them
"""
-
- camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None)
camelot_data = {}
- textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None)
- textract_data = {}
- if (
+ camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None)
+ if getattr(settings, "CAMELOT_ENABLED", False) and (
camelot_model
and camelot_model.raw_data_type == "pandas"
and (reparse or not camelot_model.parsed_data)
):
camelot_data = parse_dataframe(ballot, camelot_model.as_pandas)
+
+ textract_model: AWSTextractParsedSOPN = getattr(
+ ballot.sopn, "awstextractparsedsopn", None
+ )
+ textract_data = {}
+
if (
textract_model
and textract_model.raw_data
@@ -498,6 +503,9 @@ def parse_raw_data(ballot: Ballot, reparse=False):
):
if not textract_model.parsed_data:
textract_model.parse_raw_data()
+ if textract_model.withdrawal_rows():
+ ballot.sopn.withdrawal_detected = True
+ ballot.sopn.save()
textract_data = parse_dataframe(ballot, textract_model.as_pandas)
if camelot_data or textract_data:
diff --git a/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py b/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py
index c4824fb6ae..57f21612ac 100644
--- a/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py
+++ b/ynr/apps/sopn_parsing/migrations/0007_awstextractparsedsopn_sopn_camelotparsedsopn_sopn.py
@@ -10,10 +10,7 @@ class Migration(migrations.Migration):
"sopn_parsing",
"0006_rename_sopn_awstextractparsedsopn_official_document_and_more",
),
- (
- "official_documents",
- "0033_ballotsopnhistory_ballotsopn"
- ),
+ ("official_documents", "0033_ballotsopnhistory_ballotsopn"),
]
operations = [
diff --git a/ynr/apps/sopn_parsing/models.py b/ynr/apps/sopn_parsing/models.py
index 1c10ae2cda..637692e900 100644
--- a/ynr/apps/sopn_parsing/models.py
+++ b/ynr/apps/sopn_parsing/models.py
@@ -1,10 +1,12 @@
import json
+import re
from io import BytesIO
+import pandas
from django.core.files.images import ImageFile
from django.db import models
+from django.utils.functional import cached_property
from model_utils.models import TimeStampedModel
-from pandas import concat
from textractor.parsers import response_parser
from textractor.parsers.response_parser import parse
@@ -102,12 +104,15 @@ class AWSTextractParsedSOPN(TimeStampedModel):
default=AWSTextractParsedSOPNStatus.NOT_STARTED,
)
- @property
+ @cached_property
def as_pandas(self):
+ if not self.parsed_data:
+ return None
import pandas
pandas.set_option("display.max_colwidth", None)
- return pandas.DataFrame.from_dict(json.loads(self.parsed_data))
+ # Rebuild the frame via parse_raw_data so header/withdrawal handling stays in sync
+ return self.parse_raw_data()
def parse_raw_data(self):
"""
@@ -125,18 +130,208 @@ def parse_raw_data(self):
# Store all data frames in a list
frames = []
- # Table headers that we've seen
- for table in parsed.tables:
- # Get the pandas version of the table
- df = table.to_pandas()
- frames.append(df)
-
- # Merge all the dataframes
- df = concat(
- frames,
- ignore_index=True,
- )
+ last_title = None
+ force_process_table = False
+ found_situation_of_poll = False
+
+ for page in parsed.pages:
+ for layout in page.layouts[:5]:
+ if "polling station" in layout.text.lower():
+ found_situation_of_poll = True
+ break
+ if found_situation_of_poll:
+ break
+
+ for i, initial_table in enumerate(page.tables):
+ if initial_table.column_count < 3:
+ force_process_table = True
+ continue
+ try:
+ table_title = initial_table.title.text
+ except AttributeError:
+ table_title = ""
+ if "polling station" in table_title.lower():
+ continue
+
+ table = initial_table
+ if not force_process_table or page.page_num == 1:
+ df = self.remove_non_table_header_content(
+ initial_table.to_pandas()
+ )
+ else:
+ df = initial_table.to_pandas()
+ # else:
+ # try:
+ # table = initial_table.strip_headers()
+ # df = table.to_pandas()
+ # except IndexError:
+ # df = self.remove_non_header_rows(initial_table.to_pandas())
+ #
+ if i > 0 or page.page_num > 1:
+ if not force_process_table:
+ df = self.remove_header_rows(df)
+ force_process_table = False
+
+ if df.empty:
+ continue
+
+ frames.append(df)
+
+ current_title = getattr(table.title, "text", None)
+ if last_title and current_title != last_title:
+ break
+ last_title = current_title
+
+ all_rows = []
+ max_len = 0
+ for df in frames:
+ if df.empty:
+ continue
+ rows = df.values.tolist()
+ all_rows.extend(rows)
+ max_len = max(max_len, max(len(row) for row in rows))
+ padded_rows = [row + [""] * (max_len - len(row)) for row in all_rows]
+ df = pandas.DataFrame(padded_rows)
+ # Don't parse situation of polling stations
+ df.reset_index(drop=True, inplace=True)
+
+ polling_station_index = df[
+ df.apply(
+ lambda row: row.astype(str)
+ .str.contains("polling station", case=False)
+ .any(),
+ axis=1,
+ )
+ ].index
+ if not polling_station_index.empty:
+ polling_station_index = polling_station_index[0]
+ if isinstance(polling_station_index, str):
+ polling_station_index = int(polling_station_index)
+ # iloc's stop is exclusive, so this keeps every row before the match
+ df = df.iloc[:polling_station_index]
+
self.parsed_data = df.to_json()
+ return df
+
+ def remove_non_table_header_content(self, df):
+ """
+ Some tables include rows that aren't headers. Remove them
+
+ """
+ # How many rows to scan from the top of a df
+ max_search = 4
+
+ for i in range(min(len(df), max_search)):
+ if self.is_header_row(df.iloc[i]):
+ return df.iloc[i:].copy()
+ return df
+
+ def remove_header_rows(self, df: pandas.DataFrame):
+ """
+ Given a data frame, remove header rows
+
+ """
+ # How many rows to scan from the top of a df
+ max_search = 4
+ header_start_index = 0
+ header_row_found = False
+
+ for i in range(min(len(df), max_search)):
+ if self.is_header_row(df.iloc[i]):
+ header_row_found = True
+ break
+ header_start_index += 1
+ if header_row_found:
+ df = df.iloc[header_start_index + 1 :].copy()
+ return df
def as_textractor_document(self):
+ if not self.raw_data:
+ return None
return response_parser.parse(json.loads(self.raw_data))
+
+ def normalise_row(self, row):
+ """Convert a row to a cleaned, comparable list of strings."""
+ return [
+ re.sub(r"[^a-z\s]", "", cell.lower()).strip()
+ for cell in row
+ if cell
+ ]
+
+ def is_header_row(self, row):
+ keywords = ["name", "first", "surname"]
+ cleaned = self.normalise_row(row)
+ if len(cleaned) <= 3:
+ return False
+ return any(any(kw in cell for kw in keywords) for cell in cleaned)
+
+ def get_withdrawal_column(self):
+ column_names = [
+ "no longer",
+ "withdrawal",
+ "invalid",
+ "decision",
+ ]
+ if self.as_pandas is None or self.as_pandas.empty:
+ return None
+ for i, heading in enumerate(self.as_pandas.iloc[0]):
+ if any(col in str(heading) for col in column_names):
+ return self.as_pandas[i]
+ return None
+
+ def withdrawal_rows(self):
+ column_values = self.get_withdrawal_column()
+ if column_values is None:
+ return None
+ # column_values = self.as_pandas[column].tolist()
+ cells_with_value = []
+ for i, row in enumerate(column_values):
+ # Skip the header, as that always contains a value
+ if i == 0:
+ continue
+ if row:
+ cells_with_value.append(i)
+ return cells_with_value
+
+ def get_withdrawals_bboxes(self):
+ return "{}"
+ # headers = self.as_pandas.iloc[0].tolist()
+ # get colmun index from headers
+ column = "4"
+ column_values = self.as_pandas[column].tolist()
+ cells_with_value = []
+ for i, row in enumerate(column_values):
+ if row:
+ cells_with_value.append(i)
+ cells_with_value.pop(0)
+ # Deal with more than one page
+ textract_cells = []
+ for table in self.as_textractor_document().tables:
+ for cell in table.table_cells:
+ # if str(cell.col_index-1) != column:
+ # continue
+ if cell.row_index - 1 in cells_with_value:
+ textract_cells.append(cell)
+ # (debug print removed — this code is unreachable after the early return above)
+
+ doc_height = 1429
+ doc_width = 1010
+
+ page = 1
+ box_data = {page: []}
+ for cell in textract_cells:
+ absolute_x = cell.x * doc_width
+ absolute_y = cell.y * doc_height
+ absolute_width = cell.width * doc_width
+ absolute_height = cell.height * doc_height
+ box_data[page].append(
+ {
+ "x": absolute_x,
+ "y": absolute_y,
+ "width": absolute_width,
+ "height": absolute_height,
+ "color": "red",
+ "lineWidth": 2,
+ },
+ )
+ return json.dumps(box_data)
diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py
index 5e4db9c1b6..bffe1a8025 100644
--- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py
+++ b/ynr/apps/sopn_parsing/tests/test_parse_tables.py
@@ -7,7 +7,7 @@
from candidates.tests.uk_examples import UK2015ExamplesMixin
from django.core.management import call_command
from django.db import connection
-from django.test import TestCase
+from django.test import TestCase, override_settings
from official_documents.models import BallotSOPN
from pandas import Index, Series
from parties.models import Party, PartyDescription
@@ -30,6 +30,7 @@ def setUp(self):
cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;")
@skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
+ @override_settings(CAMELOT_ENABLED=True)
def test_basic_parsing(self):
self.assertFalse(RawPeople.objects.exists())
doc = BallotSOPN.objects.create(
@@ -92,6 +93,7 @@ def test_basic_parsing(self):
)
@skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
+ @override_settings(CAMELOT_ENABLED=True)
def test_welsh_run_sopn(self):
"""
Test that if the ballot is welsh run and previous party affiliations
@@ -148,6 +150,7 @@ def test_welsh_run_sopn(self):
)
@skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
+ @override_settings(CAMELOT_ENABLED=True)
def test_match_complex_descriptions(self):
self.assertFalse(RawPeople.objects.exists())
doc = BallotSOPN.objects.create(