Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 22 additions & 31 deletions backend/problem/management/commands/import_fracas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@
from tqdm import tqdm

from langpro_annotator.logger import logger
from problem.services import get_fracas_problems
from problem.services import FracasData
from problem.models import Problem


class Command(BaseCommand):
help = "Import FraCaS problems from fracas.xml."

ENTAILMENT_LABELS = {
"yes": Problem.EntailmentLabel.ENTAILMENT,
"no": Problem.EntailmentLabel.CONTRADICTION,
"unknown": Problem.EntailmentLabel.NEUTRAL,
"undefined": Problem.EntailmentLabel.UNKNOWN,
}

def add_arguments(self, parser):
parser.add_argument(
"--file",
Expand All @@ -25,13 +32,6 @@ def handle(self, *args, **options):
fracas_path = options["fracas_path"]
self.import_fracas_problems(fracas_path)

@staticmethod
def _text_from_element(element: ET.Element) -> str:
"""
Extracts stripped text from an XML element, returning an empty string if the element is None or has no text.
"""
return element.text.strip() if element is not None and element.text else ""

@staticmethod
def _annotate_section_subsections(tree: ET.ElementTree) -> None:
"""
Expand Down Expand Up @@ -72,7 +72,9 @@ def import_fracas_problems(self, fracas_path: str) -> None:
created = 0
skipped = 0

existing_fracas_problems = get_fracas_problems()
existing_fracas_problems = Problem.objects.filter(
dataset=Problem.Dataset.FRACAS
)
existing_fracas_ids = {p.fracas_id for p in existing_fracas_problems}

for problem in tqdm(all_problems, desc="Importing FraCaS problems"):
Expand All @@ -88,33 +90,22 @@ def import_fracas_problems(self, fracas_path: str) -> None:
skipped += 1
continue

question = self._text_from_element(problem.find("q"))
hypothesis = self._text_from_element(problem.find("h"))
answer = self._text_from_element(problem.find("a"))
note = self._text_from_element(problem.find("note"))

section = problem.get("section")
subsection = problem.get("subsection")
hypothesis = FracasData._text_from_element(problem.find("h"))
fracas_answer = problem.get("fracas_answer")
fracas_nonstandard = problem.get("fracas_nonstandard", False) == "true"

premise_nodes = problem.findall("p")
premises = [node.text.strip() for node in premise_nodes if node.text]
entailment_label = self.ENTAILMENT_LABELS.get(
fracas_answer, Problem.EntailmentLabel.UNKNOWN
)

extra_data = FracasData.import_data(problem)

Problem.objects.create(
type=Problem.ProblemType.FRACAS,
content={
"fracas_id": int(problem_id),
"question": question,
"hypothesis": hypothesis,
"answer": answer,
"fracas_answer": fracas_answer,
"fracas_non_standard": fracas_nonstandard,
"note": note,
"section_name": section,
"subsection_name": subsection,
"premises": premises,
},
dataset=Problem.Dataset.FRACAS,
premises=premises,
hypothesis=hypothesis,
entailment_label=entailment_label,
extra_data=extra_data,
)
created += 1

Expand Down
25 changes: 20 additions & 5 deletions backend/problem/management/commands/import_sick.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@

from langpro_annotator.logger import logger
from problem.models import Problem
from problem.services import get_sick_problems
from problem.services import SickData


class Command(BaseCommand):
help = "Import SICK problems from SICK.txt (a TSV file)."

ENTAILMENT_LABELS = {
"NEUTRAL": Problem.EntailmentLabel.NEUTRAL,
"ENTAILMENT": Problem.EntailmentLabel.ENTAILMENT,
"CONTRADICTION": Problem.EntailmentLabel.CONTRADICTION,
}

def add_arguments(self, parser):
parser.add_argument(
"--file",
Expand All @@ -33,7 +39,7 @@ def import_sick_problems(self, sick_path: str) -> None:
skipped = 0
created = 0

existing_sick_problems = get_sick_problems()
existing_sick_problems = Problem.objects.filter(dataset=Problem.Dataset.SICK)
existing_pair_ids = {p.pair_id for p in existing_sick_problems}

with open(sick_path, "r", encoding="utf-8") as file:
Expand All @@ -45,11 +51,20 @@ def import_sick_problems(self, sick_path: str) -> None:
skipped += 1
continue

created += 1
entailment_label = self.ENTAILMENT_LABELS.get(
problem["entailment_label"], Problem.EntailmentLabel.UNKNOWN
)

extra_data = SickData.import_data(problem)

Problem.objects.create(
type=Problem.ProblemType.SICK,
content=problem,
dataset=Problem.Dataset.SICK,
premises=[problem["sentence_A"]],
hypothesis=problem["sentence_B"],
entailment_label=entailment_label,
extra_data=extra_data,
)
created += 1

logger.info(
f"SICK problems import complete! Created: {created} | Skipped: {skipped}"
Expand Down
47 changes: 34 additions & 13 deletions backend/problem/management/commands/import_snli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,19 @@

from langpro_annotator.logger import logger
from problem.models import Problem
from problem.services import SNLIData


class Command(BaseCommand):
help = "Import SNLI 1.0 problems and save them in the DB. Use the flags --dev, --train, --test to specify the paths to the SNLI files. The development set contains 10K problems, the training set contains 550K problems, and the test set contains 10K problems."

ENTAILMENT_LABELS = {
"entailment": Problem.EntailmentLabel.ENTAILMENT,
"contradiction": Problem.EntailmentLabel.CONTRADICTION,
"neutral": Problem.EntailmentLabel.NEUTRAL,
"none": Problem.EntailmentLabel.UNKNOWN, # For empty gold labels.
}

def add_arguments(self, parser):
parser.add_argument(
"--dev",
Expand Down Expand Up @@ -53,8 +61,10 @@ def import_snli_problems(self, snli_paths: list[tuple[str, str]]) -> None:
skipped = 0
created = 0

existing_snli_problems = Problem.objects.filter(type=Problem.ProblemType.SNLI)
existing_pair_ids = {p.content.get("pairID") for p in existing_snli_problems}
existing_snli_problems = Problem.objects.filter(dataset=Problem.Dataset.SNLI)
existing_pair_ids = {
p.additional_content.get("pairID") for p in existing_snli_problems
}

for subset, snli_path in snli_paths:
try:
Expand All @@ -70,20 +80,31 @@ def import_snli_problems(self, snli_paths: list[tuple[str, str]]) -> None:
skipped += 1
continue

problem["subset"] = subset

# Handle empty gold labels.
if problem["gold_label"] == "-":
problem["gold_label"] = "none"

# Handle empty labels.
for key in ["label1", "label2", "label3", "label4", "label5"]:
if problem[key] == "":
problem[key] = "none"
for key in [
"gold_label",
"label1",
"label2",
"label3",
"label4",
"label5",
]:
label_value = problem.get(key, "")
if label_value in ["-", ""]:
problem[key] = self.ENTAILMENT_LABELS["none"]
else:
problem[key] = self.ENTAILMENT_LABELS.get(
label_value, Problem.EntailmentLabel.UNKNOWN
)

extra_data = SNLIData.import_data(problem, subset)

Problem.objects.create(
type=Problem.ProblemType.SNLI,
content=problem,
dataset=Problem.Dataset.SNLI,
premises=[problem["sentence1"]],
hypothesis=problem["sentence2"],
entailment_label=problem["gold_label"],
extra_data=extra_data,
)
created += 1
existing_pair_ids.add(problem["pairID"])
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Generated by Django 4.2.20 on 2025-07-08 10:16

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):
    """
    Restructure Problem: rename ``content`` to ``extra_data``, replace the
    ``type`` field with ``dataset``, and add ``entailment_label``,
    ``hypothesis`` and ``premises``.

    A data step copies each row's old ``type`` value into the new ``dataset``
    field before ``type`` is dropped; without it, every existing problem
    would silently fall back to the ``"user"`` default and lose its dataset
    classification. The copy is safe because the old ProblemType choices used
    the same string values ("sick", "fracas", "snli") as the new Dataset
    choices — TODO(review): confirm against the pre-migration model.
    """

    dependencies = [
        ("problem", "0002_alter_problem_type"),
    ]

    def _copy_type_to_dataset(apps, schema_editor):
        """Carry the legacy ``type`` value over into ``dataset`` in bulk."""
        Problem = apps.get_model("problem", "Problem")
        # Single UPDATE at the database level; no per-row Python loop needed.
        Problem.objects.update(dataset=models.F("type"))

    operations = [
        migrations.RenameField(
            model_name="problem",
            old_name="content",
            new_name="extra_data",
        ),
        # Add the replacement field first so the data copy below has a target.
        migrations.AddField(
            model_name="problem",
            name="dataset",
            field=models.CharField(
                choices=[
                    ("sick", "Sick"),
                    ("fracas", "FraCaS"),
                    ("snli", "SNLI"),
                    ("user", "User"),
                ],
                default="user",
                max_length=255,
            ),
        ),
        # Preserve existing rows' classification before dropping ``type``.
        # Reverse is a no-op: un-migrating re-adds ``type`` with its default.
        migrations.RunPython(
            _copy_type_to_dataset, migrations.RunPython.noop, elidable=False
        ),
        migrations.RemoveField(
            model_name="problem",
            name="type",
        ),
        migrations.AddField(
            model_name="problem",
            name="entailment_label",
            field=models.CharField(
                choices=[
                    ("neutral", "Neutral"),
                    ("entailment", "Entailment"),
                    ("contradiction", "Contradiction"),
                    ("unknown", "Unknown"),
                ],
                default="unknown",
                max_length=255,
            ),
        ),
        migrations.AddField(
            model_name="problem",
            name="hypothesis",
            field=models.CharField(blank=True, max_length=512, null=True),
        ),
        migrations.AddField(
            model_name="problem",
            name="premises",
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(max_length=512), default=list, size=None
            ),
        ),
    ]
59 changes: 55 additions & 4 deletions backend/problem/models.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,47 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField

from problem.services import FracasData, SNLIData, SickData
from langpro_annotator.logger import logger


class Problem(models.Model):
class ProblemType(models.TextChoices):
class Dataset(models.TextChoices):
SICK = "sick", "Sick"
FRACAS = "fracas", "FraCaS"
SNLI = "snli", "SNLI"
USER = "user", "User"

type = models.CharField(
class EntailmentLabel(models.TextChoices):
NEUTRAL = "neutral", "Neutral"
ENTAILMENT = "entailment", "Entailment"
CONTRADICTION = "contradiction", "Contradiction"
UNKNOWN = "unknown", "Unknown"

dataset = models.CharField(
max_length=255,
choices=ProblemType.choices,
choices=Dataset.choices,
default=Dataset.USER,
)

premises = ArrayField(
models.CharField(max_length=512),
default=list,
)

content = models.JSONField()
hypothesis = models.CharField(
max_length=512,
blank=True,
null=True,
)

entailment_label = models.CharField(
max_length=255,
choices=EntailmentLabel.choices,
default=EntailmentLabel.UNKNOWN,
)

extra_data = models.JSONField()

def get_index(self) -> int | None:
"""
Expand All @@ -25,3 +52,27 @@ def get_index(self) -> int | None:
except Exception as e:
logger.error(f"Error getting index for problem {self.id}: {e}")
return None

def serialize(self) -> dict:
    """
    Serialize the Problem instance to a dictionary.

    The ``extraData`` entry is produced by the dataset-specific serializer
    when one exists; problems from an unrecognized dataset (e.g. user-created
    ones) serialize with an empty ``extraData``.
    """

    # Map each known dataset to its extra-data serializer; anything else
    # (notably user-created problems) gets no extra data.
    extra_data_serializers = {
        self.Dataset.SICK: SickData.serialize,
        self.Dataset.FRACAS: FracasData.serialize,
        self.Dataset.SNLI: SNLIData.serialize,
    }
    serializer = extra_data_serializers.get(self.dataset)
    serialized_extra_data = serializer(self.extra_data) if serializer else {}

    return {
        "id": self.id,
        "dataset": self.dataset,
        "premises": self.premises,
        "hypothesis": self.hypothesis,
        "entailmentLabel": self.entailment_label,
        "extraData": serialized_extra_data,
    }
Loading