Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 22 additions & 31 deletions backend/problem/management/commands/import_fracas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@
from tqdm import tqdm

from langpro_annotator.logger import logger
from problem.services import get_fracas_problems
from problem.services import FracasData
from problem.models import Problem


class Command(BaseCommand):
help = "Import FraCaS problems from fracas.xml."

ENTAILMENT_LABELS = {
"yes": Problem.EntailmentLabel.ENTAILMENT,
"no": Problem.EntailmentLabel.CONTRADICTION,
"unknown": Problem.EntailmentLabel.NEUTRAL,
"undefined": Problem.EntailmentLabel.UNKNOWN,
}

def add_arguments(self, parser):
parser.add_argument(
"--file",
Expand All @@ -25,13 +32,6 @@ def handle(self, *args, **options):
fracas_path = options["fracas_path"]
self.import_fracas_problems(fracas_path)

@staticmethod
def _text_from_element(element: ET.Element) -> str:
"""
Extracts stripped text from an XML element, returning an empty string if the element is None or has no text.
"""
return element.text.strip() if element is not None and element.text else ""

@staticmethod
def _annotate_section_subsections(tree: ET.ElementTree) -> None:
"""
Expand Down Expand Up @@ -72,7 +72,9 @@ def import_fracas_problems(self, fracas_path: str) -> None:
created = 0
skipped = 0

existing_fracas_problems = get_fracas_problems()
existing_fracas_problems = Problem.objects.filter(
dataset=Problem.Dataset.FRACAS
)
existing_fracas_ids = {p.fracas_id for p in existing_fracas_problems}

for problem in tqdm(all_problems, desc="Importing FraCaS problems"):
Expand All @@ -88,33 +90,22 @@ def import_fracas_problems(self, fracas_path: str) -> None:
skipped += 1
continue

question = self._text_from_element(problem.find("q"))
hypothesis = self._text_from_element(problem.find("h"))
answer = self._text_from_element(problem.find("a"))
note = self._text_from_element(problem.find("note"))

section = problem.get("section")
subsection = problem.get("subsection")
hypothesis = FracasData._text_from_element(problem.find("h"))
fracas_answer = problem.get("fracas_answer")
fracas_nonstandard = problem.get("fracas_nonstandard", False) == "true"

premise_nodes = problem.findall("p")
premises = [node.text.strip() for node in premise_nodes if node.text]
entailment_label = self.ENTAILMENT_LABELS.get(
fracas_answer, Problem.EntailmentLabel.UNKNOWN
)

extra_data = FracasData.import_data(problem)

Problem.objects.create(
type=Problem.ProblemType.FRACAS,
content={
"fracas_id": int(problem_id),
"question": question,
"hypothesis": hypothesis,
"answer": answer,
"fracas_answer": fracas_answer,
"fracas_non_standard": fracas_nonstandard,
"note": note,
"section_name": section,
"subsection_name": subsection,
"premises": premises,
},
dataset=Problem.Dataset.FRACAS,
premises=premises,
hypothesis=hypothesis,
entailment_label=entailment_label,
extra_data=extra_data,
)
created += 1

Expand Down
25 changes: 20 additions & 5 deletions backend/problem/management/commands/import_sick.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@

from langpro_annotator.logger import logger
from problem.models import Problem
from problem.services import get_sick_problems
from problem.services import SickData


class Command(BaseCommand):
help = "Import SICK problems from SICK.txt (a TSV file)."

ENTAILMENT_LABELS = {
"NEUTRAL": Problem.EntailmentLabel.NEUTRAL,
"ENTAILMENT": Problem.EntailmentLabel.ENTAILMENT,
"CONTRADICTION": Problem.EntailmentLabel.CONTRADICTION,
}

def add_arguments(self, parser):
parser.add_argument(
"--file",
Expand All @@ -33,7 +39,7 @@ def import_sick_problems(self, sick_path: str) -> None:
skipped = 0
created = 0

existing_sick_problems = get_sick_problems()
existing_sick_problems = Problem.objects.filter(dataset=Problem.Dataset.SICK)
existing_pair_ids = {p.pair_id for p in existing_sick_problems}

with open(sick_path, "r", encoding="utf-8") as file:
Expand All @@ -45,11 +51,20 @@ def import_sick_problems(self, sick_path: str) -> None:
skipped += 1
continue

created += 1
entailment_label = self.ENTAILMENT_LABELS.get(
problem["entailment_label"], Problem.EntailmentLabel.UNKNOWN
)

extra_data = SickData.import_data(problem)

Problem.objects.create(
type=Problem.ProblemType.SICK,
content=problem,
dataset=Problem.Dataset.SICK,
premises=[problem["sentence_A"]],
hypothesis=problem["sentence_B"],
entailment_label=entailment_label,
extra_data=extra_data,
)
created += 1

logger.info(
f"SICK problems import complete! Created: {created} | Skipped: {skipped}"
Expand Down
47 changes: 34 additions & 13 deletions backend/problem/management/commands/import_snli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,19 @@

from langpro_annotator.logger import logger
from problem.models import Problem
from problem.services import SNLIData


class Command(BaseCommand):
help = "Import SNLI 1.0 problems and save them in the DB. Use the flags --dev, --train, --test to specify the paths to the SNLI files. The development set contains 10K problems, the training set contains 550K problems, and the test set contains 10K problems."

ENTAILMENT_LABELS = {
"entailment": Problem.EntailmentLabel.ENTAILMENT,
"contradiction": Problem.EntailmentLabel.CONTRADICTION,
"neutral": Problem.EntailmentLabel.NEUTRAL,
"none": Problem.EntailmentLabel.UNKNOWN, # For empty gold labels.
}

def add_arguments(self, parser):
parser.add_argument(
"--dev",
Expand Down Expand Up @@ -53,8 +61,10 @@ def import_snli_problems(self, snli_paths: list[tuple[str, str]]) -> None:
skipped = 0
created = 0

existing_snli_problems = Problem.objects.filter(type=Problem.ProblemType.SNLI)
existing_pair_ids = {p.content.get("pairID") for p in existing_snli_problems}
existing_snli_problems = Problem.objects.filter(dataset=Problem.Dataset.SNLI)
existing_pair_ids = {
p.additional_content.get("pairID") for p in existing_snli_problems
}

for subset, snli_path in snli_paths:
try:
Expand All @@ -70,20 +80,31 @@ def import_snli_problems(self, snli_paths: list[tuple[str, str]]) -> None:
skipped += 1
continue

problem["subset"] = subset

# Handle empty gold labels.
if problem["gold_label"] == "-":
problem["gold_label"] = "none"

# Handle empty labels.
for key in ["label1", "label2", "label3", "label4", "label5"]:
if problem[key] == "":
problem[key] = "none"
for key in [
"gold_label",
"label1",
"label2",
"label3",
"label4",
"label5",
]:
label_value = problem.get(key, "")
if label_value in ["-", ""]:
problem[key] = self.ENTAILMENT_LABELS["none"]
else:
problem[key] = self.ENTAILMENT_LABELS.get(
label_value, Problem.EntailmentLabel.UNKNOWN
)

extra_data = SNLIData.import_data(problem, subset)

Problem.objects.create(
type=Problem.ProblemType.SNLI,
content=problem,
dataset=Problem.Dataset.SNLI,
premises=[problem["sentence1"]],
hypothesis=problem["sentence2"],
entailment_label=problem["gold_label"],
extra_data=extra_data,
)
created += 1
existing_pair_ids.add(problem["pairID"])
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Generated by Django 4.2.20 on 2025-07-08 10:16

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):
    """
    Restructure Problem: rename ``content`` to ``extra_data``, replace the
    ``type`` field with ``dataset``, and add ``entailment_label``,
    ``hypothesis`` and ``premises``.

    A data step copies each row's old ``type`` value into the new ``dataset``
    field before ``type`` is dropped; without it, every existing problem
    would silently fall back to the ``"user"`` default and lose its dataset
    classification. The copy is safe because the old ProblemType choices used
    the same string values ("sick", "fracas", "snli") as the new Dataset
    choices — TODO(review): confirm against the pre-migration model.
    """

    dependencies = [
        ("problem", "0002_alter_problem_type"),
    ]

    def _copy_type_to_dataset(apps, schema_editor):
        """Carry the legacy ``type`` value over into ``dataset`` in bulk."""
        Problem = apps.get_model("problem", "Problem")
        # Single UPDATE at the database level; no per-row Python loop needed.
        Problem.objects.update(dataset=models.F("type"))

    operations = [
        migrations.RenameField(
            model_name="problem",
            old_name="content",
            new_name="extra_data",
        ),
        # Add the replacement field first so the data copy below has a target.
        migrations.AddField(
            model_name="problem",
            name="dataset",
            field=models.CharField(
                choices=[
                    ("sick", "Sick"),
                    ("fracas", "FraCaS"),
                    ("snli", "SNLI"),
                    ("user", "User"),
                ],
                default="user",
                max_length=255,
            ),
        ),
        # Preserve existing rows' classification before dropping ``type``.
        # Reverse is a no-op: un-migrating re-adds ``type`` with its default.
        migrations.RunPython(
            _copy_type_to_dataset, migrations.RunPython.noop, elidable=False
        ),
        migrations.RemoveField(
            model_name="problem",
            name="type",
        ),
        migrations.AddField(
            model_name="problem",
            name="entailment_label",
            field=models.CharField(
                choices=[
                    ("neutral", "Neutral"),
                    ("entailment", "Entailment"),
                    ("contradiction", "Contradiction"),
                    ("unknown", "Unknown"),
                ],
                default="unknown",
                max_length=255,
            ),
        ),
        migrations.AddField(
            model_name="problem",
            name="hypothesis",
            field=models.CharField(blank=True, max_length=512, null=True),
        ),
        migrations.AddField(
            model_name="problem",
            name="premises",
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(max_length=512), default=list, size=None
            ),
        ),
    ]
59 changes: 55 additions & 4 deletions backend/problem/models.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,47 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField

from problem.services import FracasData, SNLIData, SickData
from langpro_annotator.logger import logger


class Problem(models.Model):
class ProblemType(models.TextChoices):
class Dataset(models.TextChoices):
SICK = "sick", "Sick"
FRACAS = "fracas", "FraCaS"
SNLI = "snli", "SNLI"
USER = "user", "User"

type = models.CharField(
class EntailmentLabel(models.TextChoices):
NEUTRAL = "neutral", "Neutral"
ENTAILMENT = "entailment", "Entailment"
CONTRADICTION = "contradiction", "Contradiction"
UNKNOWN = "unknown", "Unknown"

dataset = models.CharField(
max_length=255,
choices=ProblemType.choices,
choices=Dataset.choices,
default=Dataset.USER,
)

premises = ArrayField(
models.CharField(max_length=512),
default=list,
)

content = models.JSONField()
hypothesis = models.CharField(
max_length=512,
blank=True,
null=True,
)

entailment_label = models.CharField(
max_length=255,
choices=EntailmentLabel.choices,
default=EntailmentLabel.UNKNOWN,
)

extra_data = models.JSONField()

def get_index(self) -> int | None:
"""
Expand All @@ -25,3 +52,27 @@ def get_index(self) -> int | None:
except Exception as e:
logger.error(f"Error getting index for problem {self.id}: {e}")
return None

def serialize(self) -> dict:
    """
    Serialize the Problem instance to a dictionary.

    The ``extraData`` entry is produced by the dataset-specific serializer
    when one exists; problems from an unrecognized dataset (e.g. user-created
    ones) serialize with an empty ``extraData``.
    """

    # Map each known dataset to its extra-data serializer; anything else
    # (notably user-created problems) gets no extra data.
    extra_data_serializers = {
        self.Dataset.SICK: SickData.serialize,
        self.Dataset.FRACAS: FracasData.serialize,
        self.Dataset.SNLI: SNLIData.serialize,
    }
    serializer = extra_data_serializers.get(self.dataset)
    serialized_extra_data = serializer(self.extra_data) if serializer else {}

    return {
        "id": self.id,
        "dataset": self.dataset,
        "premises": self.premises,
        "hypothesis": self.hypothesis,
        "entailmentLabel": self.entailment_label,
        "extraData": serialized_extra_data,
    }
Loading