33 changes: 33 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,33 @@
name: Python package

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/[email protected]

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: "pip"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install -e .

      - name: Python Ruff Lint and Format
        uses: adityabhangle658/[email protected]

      - name: Run tests with pytest
        run: |
          pytest -v --tb=short .
7 changes: 3 additions & 4 deletions notebooks/dungeon-results.ipynb
@@ -592,15 +592,15 @@
" plt.text(\n",
" i - width / 2,\n",
" stats_df[\"train_acc_mean\"].iloc[i] + 0.02,\n",
" f'{stats_df[\"train_acc_mean\"].iloc[i]:.2f}',\n",
" f\"{stats_df['train_acc_mean'].iloc[i]:.2f}\",\n",
" ha=\"center\",\n",
" va=\"bottom\",\n",
" fontsize=\"x-small\",\n",
" )\n",
" plt.text(\n",
" i + width / 2,\n",
" stats_df[\"test_acc_mean\"].iloc[i] + 0.02,\n",
" f'{stats_df[\"test_acc_mean\"].iloc[i]:.2f}',\n",
" f\"{stats_df['test_acc_mean'].iloc[i]:.2f}\",\n",
" ha=\"center\",\n",
" va=\"bottom\",\n",
" fontsize=\"x-small\",\n",
@@ -1714,7 +1714,6 @@
"\n",
"from origami.utils.guild import plot_scalar_history\n",
"\n",
"\n",
"runs_gr = guild.runs(labels=[\"ablation-6-dungeons-easy\"], filter_expr=\"model.guardrails=STRUCTURE_AND_VALUES\")\n",
"runs_no_gr = guild.runs(labels=[\"ablation-6-dungeons-easy\"], filter_expr=\"model.guardrails=NONE\")\n",
"\n",
@@ -1924,7 +1923,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.10.14"
}
},
"nbformat": 4,
61 changes: 31 additions & 30 deletions notebooks/example_dungeons.ipynb
@@ -6,43 +6,44 @@
"source": [
"## Training an ORiGAMi model on the Dungeons dataset\n",
"\n",
"The Dungeons dataset is a (dungeons-themed) challenging synthetic dataset for supervised classification on \n",
"semi-structured data. \n",
"The Dungeons dataset is a (dungeons-themed) challenging synthetic dataset for supervised classification on\n",
"semi-structured data.\n",
"\n",
"Each instance constains a corridor array with several rooms. Each room has a door number and contains multiple \n",
"Each instance constains a corridor array with several rooms. Each room has a door number and contains multiple\n",
"treasure chests with different-colored keys. All but one of the treasures are fake though.\n",
"\n",
"The goal is to find the correct room number and key color in each dungeon based on some clues and return the \n",
"only non-fake treasure. \n",
"The goal is to find the correct room number and key color in each dungeon based on some clues and return the\n",
"only non-fake treasure.\n",
"\n",
"The clues are given at the top-level of the object with keys `door`, `key_color`. \n",
"The clues are given at the top-level of the object with keys `door`, `key_color`.\n",
"\n",
"To make it even harder, the `corridor` array may be shuffled, and room objects may have a number of monsters as \n",
"their first field, shifting the token positions of the serialized object by a variable amount. \n",
"To make it even harder, the `corridor` array may be shuffled, and room objects may have a number of monsters as\n",
"their first field, shifting the token positions of the serialized object by a variable amount.\n",
"\n",
"The following dictionary represents one example JSON instance:\n",
"\n",
"```json\n",
"{\n",
" \"door\": 1, // clue which door is the correct one\n",
" \"key_color\": \"blue\", // clue which key is the correct one\n",
" \"corridor\": [\n",
" {\n",
" \"monsters\": [\"troll\", \"wolf\"], // optional monsters in front of the door\n",
" \"door_no\": 1, // door number in the corridor\n",
" \"red_key\": \"gemstones\", // different keys return different treasures,\n",
" \"blue_key\": \"spellbooks\", // but only one is real, the others are fake\n",
" \"green_key\": \"artifacts\"\n",
" },\n",
" { // another room\n",
" \"door_no\": 0, // rooms can be shuffled, here room 0 comes after 1 \n",
" \"red_key\": \"diamonds\", \n",
" \"blue_key\": \"gold\", \n",
" \"green_key\": \"gemstones\"\n",
" },\n",
" // ... more doors ...\n",
" ],\n",
" \"treasure\": \"spellbooks\" // correct treasure (target label)\n",
" \"door\": 1, // clue which door is the correct one\n",
" \"key_color\": \"blue\", // clue which key is the correct one\n",
" \"corridor\": [\n",
" {\n",
" \"monsters\": [\"troll\", \"wolf\"], // optional monsters in front of the door\n",
" \"door_no\": 1, // door number in the corridor\n",
" \"red_key\": \"gemstones\", // different keys return different treasures,\n",
" \"blue_key\": \"spellbooks\", // but only one is real, the others are fake\n",
" \"green_key\": \"artifacts\"\n",
" },\n",
" {\n",
" // another room\n",
" \"door_no\": 0, // rooms can be shuffled, here room 0 comes after 1\n",
" \"red_key\": \"diamonds\",\n",
" \"blue_key\": \"gold\",\n",
" \"green_key\": \"gemstones\"\n",
" }\n",
" // ... more doors ...\n",
" ],\n",
" \"treasure\": \"spellbooks\" // correct treasure (target label)\n",
"}\n",
"```\n",
"\n",
@@ -133,8 +134,8 @@
" num_doors_range=(5, 10),\n",
" num_colors=3,\n",
" num_treasures=5,\n",
" with_monsters=True, # makes it harder as token positions get shifted by variable amount\n",
" shuffle_rooms=True # makes it harder because rooms are in random order\n",
" with_monsters=True, # makes it harder as token positions get shifted by variable amount\n",
" shuffle_rooms=True, # makes it harder because rooms are in random order\n",
")\n",
"\n",
"# print example dictionary\n",
@@ -463,7 +464,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.10.14"
}
},
"nbformat": 4,
102 changes: 51 additions & 51 deletions notebooks/example_origami_dungeons.ipynb
@@ -8,42 +8,44 @@
"\n",
"### The Dungeons Dataset\n",
"\n",
"The Dungeons dataset is a (dungeons-themed) challenging synthetic dataset for supervised classification on \n",
"semi-structured data. \n",
"The Dungeons dataset is a (dungeons-themed) challenging synthetic dataset for supervised classification on\n",
"semi-structured data.\n",
"\n",
"Each instance constains a corridor array with several rooms. Each room has a door number and contains multiple \n",
"Each instance constains a corridor array with several rooms. Each room has a door number and contains multiple\n",
"treasure chests with different-colored keys. All but one of the treasures are fake though.\n",
"\n",
"The goal is to find the correct room number and key color in each dungeon based on some clues and return the \n",
"only real treasure. The clues are given at the top-level of the object in the fields `door` and `key_color`. \n",
"The goal is to find the correct room number and key color in each dungeon based on some clues and return the\n",
"only real treasure. The clues are given at the top-level of the object in the fields `door` and `key_color`.\n",
"\n",
"To make it even harder, the `corridor` array may be shuffled (`shuffle_rooms=True`), and room objects may \n",
"have a number of monsters as their first field (`with_monsters=True`), shifting the token positions of the \n",
"serialized object by a variable amount. \n",
"To make it even harder, the `corridor` array may be shuffled (`shuffle_rooms=True`), and room objects may\n",
"have a number of monsters as their first field (`with_monsters=True`), shifting the token positions of the\n",
"serialized object by a variable amount.\n",
"\n",
"The following dictionary represents one example JSON instance:\n",
"\n",
"```json\n",
"{\n",
" \"door\": 1, // clue which door is the correct one\n",
" \"key_color\": \"blue\", // clue which key is the correct one\n",
" \"corridor\": [ // a corridor with many doors\n",
" {\n",
" \"monsters\": [\"troll\", \"wolf\"], // optional monsters in front of the door\n",
" \"door_no\": 1, // door number in the corridor\n",
" \"red_key\": \"gemstones\", // different keys return different treasures,\n",
" \"blue_key\": \"spellbooks\", // but only one is real, the others are fake\n",
" \"green_key\": \"artifacts\"\n",
" },\n",
" { // another room, here without monsters\n",
" \"door_no\": 0, // rooms can be shuffled, here room 0 comes after 1 \n",
" \"red_key\": \"diamonds\", \n",
" \"blue_key\": \"gold\", \n",
" \"green_key\": \"gemstones\"\n",
" },\n",
" // ... more rooms ...\n",
" ],\n",
" \"treasure\": \"spellbooks\" // correct treasure (target label)\n",
" \"door\": 1, // clue which door is the correct one\n",
" \"key_color\": \"blue\", // clue which key is the correct one\n",
" \"corridor\": [\n",
" // a corridor with many doors\n",
" {\n",
" \"monsters\": [\"troll\", \"wolf\"], // optional monsters in front of the door\n",
" \"door_no\": 1, // door number in the corridor\n",
" \"red_key\": \"gemstones\", // different keys return different treasures,\n",
" \"blue_key\": \"spellbooks\", // but only one is real, the others are fake\n",
" \"green_key\": \"artifacts\"\n",
" },\n",
" {\n",
" // another room, here without monsters\n",
" \"door_no\": 0, // rooms can be shuffled, here room 0 comes after 1\n",
" \"red_key\": \"diamonds\",\n",
" \"blue_key\": \"gold\",\n",
" \"green_key\": \"gemstones\"\n",
" }\n",
" // ... more rooms ...\n",
" ],\n",
" \"treasure\": \"spellbooks\" // correct treasure (target label)\n",
"}\n",
"```\n",
"\n",
Expand All @@ -56,8 +58,8 @@
"source": [
"### Preprocessing\n",
"\n",
"The JSON objects are tokenized by recursively walking through them depth-first and extracting key and value tokens. \n",
"Additionally, when encountering arrays or nested objects, special grammar tokens are included in the sequence. \n",
"The JSON objects are tokenized by recursively walking through them depth-first and extracting key and value tokens.\n",
"Additionally, when encountering arrays or nested objects, special grammar tokens are included in the sequence.\n",
"This diagram illustrates tokenization.\n",
"\n",
"<img src=\"../assets/preprocessing-diagram.png\" width=\"600px\" />\n"
@@ -163,25 +165,24 @@
"import json\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"from origami.utils.config import PipelineConfig\n",
"from origami.utils import set_seed\n",
"from origami.datasets.dungeons import generate_data\n",
"from origami.preprocessing import docs_to_df, build_prediction_pipelines\n",
"from origami.preprocessing import build_prediction_pipelines, docs_to_df\n",
"from origami.utils import set_seed\n",
"from origami.utils.config import PipelineConfig\n",
"\n",
"# for reproducibility\n",
"# set_seed(123)\n",
"set_seed(123)\n",
"\n",
"# generate Dungeons dataset (see origami/datasets/dungeons.py)\n",
"data = generate_data(\n",
" num_instances=10_000,\n",
" num_doors_range=(4, 8),\n",
" num_colors=3,\n",
" num_treasures=5,\n",
" with_monsters=True, # makes it harder as token positions get shifted by variable amount\n",
" shuffle_rooms=True, # makes it harder because rooms are in random order\n",
" shuffle_keys=True # makes it harder because keys are in random order\n",
" with_monsters=True, # makes it harder as token positions get shifted by variable amount\n",
" shuffle_rooms=True, # makes it harder because rooms are in random order\n",
" shuffle_keys=True, # makes it harder because keys are in random order\n",
")\n",
"\n",
"# print example dictionary\n",
@@ -195,18 +196,17 @@
"\n",
"# create train and test pipelines\n",
"pipelines = build_prediction_pipelines(\n",
" pipeline_config=PipelineConfig(sequence_order=\"ORDERED\", upscale=1),\n",
" target_field=TARGET_FIELD\n",
" pipeline_config=PipelineConfig(sequence_order=\"ORDERED\", upscale=1), target_field=TARGET_FIELD\n",
")\n",
"\n",
"# process train, eval and test data\n",
"train_df = pipelines['train'].fit_transform(train_docs_df)\n",
"test_df = pipelines['test'].transform(test_docs_df)\n",
"train_df = pipelines[\"train\"].fit_transform(train_docs_df)\n",
"test_df = pipelines[\"test\"].transform(test_docs_df)\n",
"\n",
"# get stateful objects\n",
"schema = pipelines['train'][\"schema\"].schema\n",
"encoder = pipelines['train'][\"encoder\"].encoder\n",
"block_size = pipelines['train'][\"padding\"].length\n",
"schema = pipelines[\"train\"][\"schema\"].schema\n",
"encoder = pipelines[\"train\"][\"encoder\"].encoder\n",
"block_size = pipelines[\"train\"][\"padding\"].length\n",
"\n",
"# print data stats\n",
"print(f\"len train: {len(train_df)}, len test: {len(test_df)}\")\n",
@@ -231,8 +231,8 @@
}
],
"source": [
"# save dungeon dataset to MongoDB \n",
"from pymongo import MongoClient\n",
"# save dungeon dataset to MongoDB\n",
"from pymongo import MongoClient\n",
"\n",
"client = MongoClient(\"mongodb://localhost:27017/\")\n",
"collection = client.dungeons.dungeon_10k_4_8_3_5_mkr\n",
@@ -247,7 +247,7 @@
"\n",
"Here we instantiate an ORiGAMi model, a modified transformer trained on the token sequences created above.\n",
"We use a standard \"medium\" configuration. ORiGAMi models are relatively robust to the choice of hyper-parameter\n",
"and default configurations often work well for mid-sized datasets. "
"and default configurations often work well for mid-sized datasets.\n"
]
},
{
@@ -270,7 +270,7 @@
"from origami.utils import ModelConfig, TrainConfig, count_parameters\n",
"\n",
"# model and train configs\n",
"model_config = ModelConfig.from_preset(\"medium\") # see origami/utils/config.py for different presets\n",
"model_config = ModelConfig.from_preset(\"medium\") # see origami/utils/config.py for different presets\n",
"model_config.position_encoding = \"SINE_COSINE\"\n",
"model_config.vocab_size = encoder.vocab_size\n",
"model_config.block_size = block_size\n",
@@ -284,12 +284,12 @@
"train_dataset = DFDataset(train_df)\n",
"test_dataset = DFDataset(test_df)\n",
"\n",
"# create PDA and pass it to the model \n",
"# create PDA and pass it to the model\n",
"vpda = ObjectVPDA(encoder, schema)\n",
"model = ORIGAMI(model_config, train_config, vpda=vpda)\n",
"\n",
"n_params = count_parameters(model)\n",
"print(f\"Number of parameters: {n_params/1e6:.2f}M\")"
"print(f\"Number of parameters: {n_params / 1e6:.2f}M\")"
]
},
{
@@ -878,7 +878,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.10.14"
}
},
"nbformat": 4,