Improve cli (#8)

rueckstiess · Thomas Rueckstiess · web-flow · commit 010a2ff2aa0d · 2025-02-06T12:26:01.000+11:00
* small copy change in notebook.

* mapping learning rate, making --set-parameter a global option and lowercase, documentation of train command.

* predict fixes, default STRUCTURE_AND_VALUES

* CLI documentation; fixed an issue where a missing target field was ignored. Instead, Symbol.UNKNOWN is now returned for it in the TargetFieldPipe.

---------

Co-authored-by: Thomas Rueckstiess <thomas@mongodb.com>
diff --git a/CLI.md b/CLI.md
diff --git a/README.md b/README.md
@@ -50,33 +50,15 @@ ORiGAMi comes with a command line interface (CLI) and a Python SDK.
 
 ### Usage from the Command Line
 
-The CLI allows to train a model and make predictions and generate synthetic data from a trained model. After installation, run `origami` from your shell to see an overview of available commands.
+The CLI lets you train a model and make predictions from a trained model. After installation, run `origami` from your shell to see an overview of available commands.
 
-Help for specific commands is available with `origami <command> --help`, where `<command>` is one of `train`, `predict`, `generate`.
+Help for specific commands is available with `origami <command> --help`, where `<command>` is currently one of `train` or `predict`.
 
-#### Model Training
-
-To train a model, use the `origami train` command. ORiGAMi works well with MongoDB. For example, to train a model on the `shop.orders` collection on a locally running MongoDB instance on standard port 27017, use the following command:
-
-```
-origami train "mongodb://localhost:27017" --source-db shop --source-coll orders
-```
-
-#### Making Predictions
-
-...TBD...
-
-#### Generating Synthetic Data
-
-...TBD...
+Detailed documentation for the CLI and available options can be found in [`CLI.md`](CLI.md).
 
 ### Usage with Python
 
-...TBD...
-
-```python
-from origami.model import ORIGAMI
-```
+For an example of how to use ORiGAMi from Python, take a look at the provided [notebooks](./notebooks/) folder, e.g. the [`example_origami_dungeons.ipynb`](./notebooks/example_origami_dungeons.ipynb) notebook.
 
 ## Experiment Reproduction
 
diff --git a/notebooks/example_rf_dungeons.ipynb b/notebooks/example_rf_dungeons.ipynb
@@ -325,7 +325,7 @@
     "\n",
     "We will attempt to learn the same Dungeons dataset as used in `example_origami_dungeons.ipynb` with a\n",
     "[RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)\n",
-    "from scikit-learn.\n",
+    "from scikit-learn. However, this will not generalize to the test set, as we discuss in the paper.\n",
     "\n",
     "We recursively flatten the dataset, creating a column for each field path (e.g. `corridor.2.blue_key`). Then we\n",
     "transform all features through one-hot encoding, including the numeric fields (`door` and `door_no`) as these are\n",
diff --git a/origami/cli/main.py b/origami/cli/main.py
@@ -1,8 +1,8 @@
 import click
 
-from .generate import generate
-from .predict import predict
-from .train import train
+from origami.cli.generate import generate
+from origami.cli.predict import predict
+from origami.cli.train import train
 
 CONTEXT_SETTINGS = dict(max_content_width=120)
 
diff --git a/origami/cli/predict.py b/origami/cli/predict.py
@@ -5,15 +5,14 @@
 from click_option_group import optgroup
 from omegaconf import OmegaConf
 
+from origami.cli.utils import create_projection, load_data
 from origami.inference import Predictor
 from origami.model import ORIGAMI
 from origami.model.vpda import ObjectVPDA
 from origami.preprocessing import DFDataset, TargetFieldPipe
 from origami.utils import Symbol, count_parameters, load_origami_model
 from origami.utils.config import GuardrailsMethod
 
-from .utils import create_projection, load_data
-
 
 @click.command()
 @click.argument("source", type=str)
@@ -35,7 +34,7 @@
 @optgroup.option("--limit", "-l", type=int, default=0, help="limit the number of documents to load")
 @optgroup.group("Output Options")
 @optgroup.option("--json", "-j", is_flag=True, default=False, help="output full JSON objects including target field")
-@click.option("--verbose", "-v", is_flag=True, default=True)
+@click.option("--verbose", "-v", is_flag=True, default=False)
 def predict(source, **kwargs):
     """Predict target fields with a trained ORIGAMI model."""
 
@@ -58,8 +57,6 @@ def predict(source, **kwargs):
         case GuardrailsMethod.NONE:
             vpda = None
 
-    click.echo(f"config:\n {OmegaConf.to_yaml(config)}")
-
     model = ORIGAMI(config.model, config.train, vpda=vpda)
     model.load_state_dict(state_dict)
 
@@ -77,7 +74,8 @@ def predict(source, **kwargs):
     # update or create new target pipe with new target_field
     test_pipeline = pipelines["test"]
 
-    if "target" in test_pipeline:
+    # update pipeline parameters and transform data
+    if "target" in test_pipeline.named_steps:
         test_pipeline["target"].target_field = config.data.target_field
     else:
         test_pipeline.steps.insert(0, ["target", TargetFieldPipe(config.data.target_field)])
@@ -89,9 +87,9 @@ def predict(source, **kwargs):
     if kwargs["verbose"]:
         # report number of parameters (note we don't count the decoder parameters in lm_head)
         n_params = count_parameters(model)
-        click.echo(f"running on device: {model.device}")
-        click.echo(f"number of parameters: {n_params / 1e6:.2f}M")
-        click.echo(f"config:\n {OmegaConf.to_yaml(config)}")
+        click.echo(f"running on device: {model.device}", err=True)
+        click.echo(f"number of parameters: {n_params / 1e6:.2f}M", err=True)
+        click.echo(f"config:\n {OmegaConf.to_yaml(config)}", err=True)
 
     # predict target field
     predictor = Predictor(model, encoder, config.data.target_field, max_batch_size=config.train.batch_size)
diff --git a/origami/cli/train.py b/origami/cli/train.py
@@ -29,6 +29,13 @@
     help="path to write trained model",
 )
 @click.option("--seed", type=int, default=1234, show_default=True, help="random seed")
+@click.option(
+    "--set-parameter",
+    "-p",
+    type=str,
+    multiple=True,
+    help="set additional config parameters, format: key.subkey=value. Multiple parameters can be set.",
+)
 @click.option("--verbose", "-v", is_flag=True, default=False)
 @optgroup.group("Source Options")
 @optgroup.option("--source-db", "-d", type=str, help="database name, only used when SOURCE is a MongoDB URI.")
@@ -38,14 +45,7 @@
 @optgroup.option("--skip", "-s", type=int, default=0, help="number of documents to skip")
 @optgroup.option("--limit", "-l", type=int, default=0, help="limit the number of documents to load")
 @optgroup.group("Config Options")
-@optgroup.option("--config-file", "-C", type=click.File("r"), help="path to config file")
-@optgroup.option(
-    "--set-parameter",
-    "-P",
-    type=str,
-    multiple=True,
-    help="set additional config parameters, format: key.subkey=value. Multiple parameters can be set.",
-)
+# @optgroup.option("--config-file", "-C", type=click.File("r"), help="path to config file")
 @optgroup.option(
     "--max-vocab-size",
     "-V",
@@ -56,7 +56,7 @@
 )
 @optgroup.option(
     "--num-layers",
-    "-L",
+    "-T",
     type=int,
     default=4,
     show_default=True,
@@ -78,6 +78,14 @@
     show_default=True,
     help="hidden dimensionality of transformer layers",
 )
+@optgroup.option(
+    "--learning-rate",
+    "-L",
+    type=float,
+    default=1e-3,
+    show_default=True,
+    help="max. learning rate of the model",
+)
 @optgroup.option(
     "--num-batches", "-N", type=int, default=10000, show_default=True, help="number of batches to train on"
 )
@@ -102,7 +110,7 @@
     "--guardrails",
     "-G",
     type=click.Choice(["NONE", "STRUCTURE_ONLY", "STRUCTURE_AND_VALUES"]),
-    default="STRUCTURE_ONLY",
+    default="STRUCTURE_AND_VALUES",
     help="guardrails settings",
 )
 @optgroup.option(
@@ -152,8 +160,7 @@ def train(source: str, **kwargs):
     # train configs
     config.train.n_batches = kwargs["num_batches"]
     config.train.batch_size = kwargs["batch_size"]
-    config.train.learning_rate = 1e-3
-    config.train.n_warmup_batches = 1000
+    config.train.learning_rate = kwargs["learning_rate"]
     config.train.print_every = 10
     config.train.eval_every = 100
     config.train.test_split = kwargs["val_split_ratio"]
diff --git a/origami/preprocessing/utils.py b/origami/preprocessing/utils.py
@@ -99,7 +99,7 @@ def consume_doc(gen):
 
 
 def target_collate_fn(target_token_id: int):
-    def collate_fn(tokens: torch.tensor) -> torch.tensor:
+    def collate_fn(tokens: list[torch.tensor]) -> torch.tensor:
         """collate function that only returns sequences up to a target token (incl.). Assumes
         the target token is at the same position in each sequence. (use with TargetTokenBatchSampler)"""
         tokens = default_collate(tokens)
diff --git a/origami/utils/common.py b/origami/utils/common.py
@@ -282,14 +282,11 @@ def get_value_at_path(d: dict, path: List[str]) -> Tuple[Any, bool]:
 def reorder_with_target_last(d: dict, target_path: str) -> Tuple[OrderedDict, Any]:
     """
     Reorder dictionary so target field appears last, maintaining nested structure.
-    If target field doesn't exist, returns (OrderedDict(d), Symbol.UNKNOWN).
+    Creates missing intermediate paths and sets target to Symbol.UNKNOWN if not found.
     """
     path_components = parse_path(target_path)
     target_value, found = get_value_at_path(d, path_components)
 
-    if not found:
-        return OrderedDict(d), Symbol.UNKNOWN
-
     def reorder_level(current_dict: dict, remaining_path: List[str]) -> OrderedDict:
         if not remaining_path:
             return OrderedDict(current_dict)
@@ -302,18 +299,82 @@ def reorder_level(current_dict: dict, remaining_path: List[str]) -> OrderedDict:
             if k != current_target:
                 result[k] = v if not isinstance(v, dict) else reorder_level(v, [])
 
-        # Add target field last
+        # Handle the target path
         if current_target in current_dict:
             target_dict = current_dict[current_target]
-            if len(remaining_path) > 1:
-                # If we have more path components, recurse with remaining path
-                result[current_target] = reorder_level(target_dict, remaining_path[1:])
-            else:
-                # If this is the final path component, add it last
-                result[current_target] = (
-                    target_dict if not isinstance(target_dict, dict) else reorder_level(target_dict, [])
-                )
+        else:
+            # Create empty dict for missing intermediate paths
+            target_dict = {} if len(remaining_path) > 1 else Symbol.UNKNOWN
+
+        if len(remaining_path) > 1:
+            # If we have more path components, recurse with remaining path
+            result[current_target] = reorder_level(target_dict, remaining_path[1:])
+        else:
+            # If this is the final path component, add it last
+            result[current_target] = (
+                target_dict if not isinstance(target_dict, dict) else reorder_level(target_dict, [])
+            )
 
         return result
 
-    return reorder_level(d, path_components), target_value
+    # Check if we're trying to traverse through a non-dict value
+    current = d
+    for i, component in enumerate(path_components[:-1]):
+        if component in current and not isinstance(current[component], dict):
+            # If we hit a non-dict value in the path, treat the entire remaining path
+            # as a top-level field
+            new_target = ".".join(path_components[i:])
+            result = OrderedDict()
+            for k, v in d.items():
+                if k != new_target:
+                    result[k] = v if not isinstance(v, dict) else reorder_level(v, [])
+            result[new_target] = Symbol.UNKNOWN
+            return result, Symbol.UNKNOWN
+
+        if component not in current:
+            break
+        current = current[component]
+
+    return reorder_level(d, path_components), target_value if found else Symbol.UNKNOWN
+
+
+# def reorder_with_target_last(d: dict, target_path: str) -> Tuple[OrderedDict, Any]:
+#     """
+#     Reorder dictionary so target field appears last, maintaining nested structure.
+#     If target field doesn't exist, returns (OrderedDict(d), Symbol.UNKNOWN).
+#     """
+#     path_components = parse_path(target_path)
+#     target_value, found = get_value_at_path(d, path_components)
+
+#     if not found:
+#         od = OrderedDict(d)
+#         od[target_path] = Symbol.UNKNOWN
+#         return od, Symbol.UNKNOWN
+
+#     def reorder_level(current_dict: dict, remaining_path: List[str]) -> OrderedDict:
+#         if not remaining_path:
+#             return OrderedDict(current_dict)
+
+#         current_target = remaining_path[0]
+#         result = OrderedDict()
+
+#         # Add all non-target fields first
+#         for k, v in current_dict.items():
+#             if k != current_target:
+#                 result[k] = v if not isinstance(v, dict) else reorder_level(v, [])
+
+#         # Add target field last
+#         if current_target in current_dict:
+#             target_dict = current_dict[current_target]
+#             if len(remaining_path) > 1:
+#                 # If we have more path components, recurse with remaining path
+#                 result[current_target] = reorder_level(target_dict, remaining_path[1:])
+#             else:
+#                 # If this is the final path component, add it last
+#                 result[current_target] = (
+#                     target_dict if not isinstance(target_dict, dict) else reorder_level(target_dict, [])
+#                 )
+
+#         return result
+
+#     return reorder_level(d, path_components), target_value
diff --git a/origami/utils/config.py b/origami/utils/config.py
@@ -66,7 +66,7 @@ class ModelConfig(BaseConfig):
     mask_field_token_losses: bool = False
 
     # whether or not to use guardrails (requires a ObjectVPDA to be passed into model)
-    guardrails: GuardrailsMethod = GuardrailsMethod.STRUCTURE_ONLY
+    guardrails: GuardrailsMethod = GuardrailsMethod.STRUCTURE_AND_VALUES
 
     @staticmethod
     def from_preset(size: str, **kwargs) -> "ModelConfig":
diff --git a/tests/preprocessing/test_pipes.py b/tests/preprocessing/test_pipes.py
@@ -153,12 +153,9 @@ def test_supervised_target_pipe(self):
 
         self.assertIn("target", df.columns)
 
-        for i, (doc, target) in enumerate(zip(df["docs"], df["target"])):
-            if target == Symbol.UNKNOWN:
-                self.assertNotIn("b", doc)
-            else:
-                self.assertIn("b", doc)
-                self.assertEqual(doc["b"], target)
+        for doc, target in zip(df["docs"], df["target"]):
+            self.assertIn("b", doc)
+            self.assertEqual(doc["b"], target)
 
 
 class TestDocTokenizerPipe(unittest.TestCase):
diff --git a/tests/utils/test_common.py b/tests/utils/test_common.py
@@ -172,7 +172,7 @@ def test_deep_nesting(self):
     def test_empty_dict(self):
         result, value = reorder_with_target_last({}, "any_key")
         self.assertEqual(value, Symbol.UNKNOWN)
-        self.assertEqual(dict(result), {})
+        self.assertEqual(dict(result), {"any_key": Symbol.UNKNOWN})
 
     def test_various_value_types(self):
         input_dict = {
@@ -222,19 +222,30 @@ def test_target_is_dict(self):
 
     def test_missing_field(self):
         input_dict = {"a": 1, "b": {"b1": True, "b2": False}, "c": "test"}
+
         # Test missing top-level field
         result, value = reorder_with_target_last(input_dict, "nonexistent")
-        self.assertEqual(dict(result), input_dict)  # Structure preserved
+        expected = {"a": 1, "b": {"b1": True, "b2": False}, "c": "test", "nonexistent": Symbol.UNKNOWN}
+        self.assertEqual(dict(result), expected)
         self.assertEqual(value, Symbol.UNKNOWN)
 
         # Test missing nested field
         result, value = reorder_with_target_last(input_dict, "b.nonexistent")
-        self.assertEqual(dict(result), input_dict)  # Structure preserved
+        expected = {"a": 1, "b": {"b1": True, "b2": False, "nonexistent": Symbol.UNKNOWN}, "c": "test"}
+        self.assertEqual(dict(result), expected)
         self.assertEqual(value, Symbol.UNKNOWN)
 
         # Test path through non-dict value
         result, value = reorder_with_target_last(input_dict, "a.something")
-        self.assertEqual(dict(result), input_dict)  # Structure preserved
+        # Should still treat 'a' as a terminal value since it's not a dict
+        expected = {"a": 1, "b": {"b1": True, "b2": False}, "c": "test", "a.something": Symbol.UNKNOWN}
+        self.assertEqual(dict(result), expected)
+        self.assertEqual(value, Symbol.UNKNOWN)
+
+        # Additional test for multiple levels of missing nested fields
+        result, value = reorder_with_target_last(input_dict, "b.new.deeper")
+        expected = {"a": 1, "b": {"b1": True, "b2": False, "new": {"deeper": Symbol.UNKNOWN}}, "c": "test"}
+        self.assertEqual(dict(result), expected)
         self.assertEqual(value, Symbol.UNKNOWN)