dottxt-ai · agourdel · Mar 8, 2025 · Mar 17, 2025 · Mar 12, 2025 · Mar 19, 2025
diff --git a/Cargo.toml b/Cargo.toml
@@ -20,10 +20,19 @@ bincode = "2.0.0-rc.3"
 hf-hub = "=0.3.2"
 tokenizers = { version = "=0.20.3", features = ["http"] }
 rustc-hash = "2.1.0"
-regex-automata = "0.4.9"
+regex-automata = { git = "https://github.com/agourdel/regex.git", branch = "custom_dfa",package = "regex-automata" } 
+smallvec = "1.14.0"
+regex-syntax = "0.8.5"
+rayon = "1.10.0"
+
+[dev-dependencies]
+rand = { version = "0.9.0" }
+
 
 [features]
 python-bindings = ["pyo3", "serde-pyobject"]
+run_benchmarks = []
+
 
 [lib]
 name = "outlines_core"

diff --git a/benchmarks/bench_indexes.py b/benchmarks/bench_indexes.py
@@ -0,0 +1,226 @@
+# flake8: noqa
+# mypy: ignore-errors
+import os
+import random
+import time
+
+import psutil
+from outlines_core import Guide, Index, Vocabulary, create_mask, mask_to_list
+from outlines_core.json_schema import build_regex_from_schema
+
+# NOTE(review): this assignment runs after `outlines_core` has been imported above —
+# confirm the Rust side still honours it, and that debug logging does not distort
+# the timings printed by this benchmark.
+os.environ["RUST_LOG"] = "debug"
+
+
+regexes = [
+    {
+        "name": "email",
+        "regex": r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]{1,63}(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]{1,63}){0,10})@(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.){1,3}[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?",
+    },
+    {"name": "simple_phone", "regex": r"\+?[1-9][0-9]{7,14}"},
+    {
+        "name": "complex_phone",
+        "regex": r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}",
+    },
+    {"name": "permissive_any", "regex": r".{255}$"},
+    {"name": "permissive_words", "regex": r"[a-zA-Z]{100}"},
+    {"name": "https", "regex" : r"(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?"},
+    {"name": "complexe", "regex" : r"""\{[ ]?"name"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?,[ ]?"age"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?,[ ]?"complexe_phone"[ ]?:[ ]?"(\+?\d{1,4}?[-. ]?\(\d{1,3}\)?[-. ]?\d{1,4}[-. ]?\d{1,4}[-. ]?\d{1,9})"[ ]?\}"""}
+]
+# JSON Schema documents used as benchmark inputs.
+# NOTE: despite its name, the "regex" key of each entry holds JSON Schema text, not a
+# regular expression; test_benchmark_v2index() converts it with build_regex_from_schema().
+# Entries written as raw strings keep `\\` so that the JSON text itself carries an
+# escaped backslash.
+schemas = [
+    {
+        "name": "schema_simple",
+        "regex": r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, "required": ["name", "age"]}',
+    },
+    {
+        "name": "schema_simple_phone",
+        "regex": r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}, "complexe_phone": {"type": "string", "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"}}, "required": ["name", "age", "complexe_phone"]}',
+    },
+    {
+        "name": "schema_complexe",
+        "regex": """{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "title": "Schema for a recording",
+  "type": "object",
+  "definitions": {
+    "artist": {
+      "type": "object",
+      "properties": {
+        "id": {"type": "number"},
+        "name": {"type": "string"},
+        "functions": {
+          "type": "array",
+          "items": {"type": "string"}
+        }
+      },
+      "required": ["id", "name", "functions"]
+    }
+  },
+  "properties": {
+    "id": {"type": "number"},
+    "work": {
+      "type": "object",
+      "properties": {
+        "id": {"type": "number"},
+        "name": {"type": "string"},
+        "composer": {"$ref": "#/definitions/artist"}
+      }
+    },
+    "recording_artists": {
+      "type": "array",
+      "items": {"$ref": "#/definitions/artist"}
+    }
+  },
+  "required": ["id", "work", "recording_artists"]
+}"""
+    },
+    {
+        "name" : "schema_curriculum",
+        "regex" : r'''{
+                "$schema": "http://json-schema.org/draft-04/schema#",
+                "title": "Schema for a Curriculum Vitae",
+                "type": "object",
+                "definitions": {
+                    "experienceEntry": {
+                    "type": "object",
+                    "properties": {
+                        "date": {
+                        "type": "string",
+                        "format": "date"
+                        },
+                        "position": {
+                        "type": "string"
+                        }
+                    },
+                    "required": ["date", "position"]
+                    }
+                },
+                "properties": {
+                    "name": {
+                    "type": "string"
+                    },
+                    "surname": {
+                    "type": "string"
+                    },
+                    "email": {
+                    "type": "string",
+                    "pattern": "[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
+                    },
+                    "phone": {
+                    "type": "string",
+                    "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"
+                    },
+                    "website": {
+                    "type": "string",
+                    "pattern": "(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?"
+                    },
+                    "resume": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/experienceEntry"
+                    }
+                    }
+                },
+                "required": ["name", "surname", "email", "phone", "resume"]
+                }'''
+    }
+]
+
+
+class V2IndexBenchmark:
+    def setup(self, regex):
+        self.vocab = Vocabulary.from_pretrained("unsloth/Llama-3.1-8B-Instruct")
+        self.v2_index = Index(regex, self.vocab)
+
+        self.v2_guide = Guide(self.v2_index)
+
+        self.mask = create_mask(len(self.vocab) + 1)
+
+        self.process = psutil.Process()
+
+        assert (
+            not self.v2_guide.is_finished()
+        ), f"Compressed Guide should not be finished for {regex}"
+
+    def run_benchmark(self):
+        iterations = 0
+        v2_total_time = 0
+
+        self.current_token_id = -1
+
+        if not self.v2_guide.is_finished():
+            iterations += 1
+
+            start_compressed = time.perf_counter()
+            self.v2_guide.get_tokens(self.mask)
+            end_compressed = time.perf_counter()
+
+            v2_time = end_compressed - start_compressed
+            v2_total_time += v2_time
+
+
+            mask_tokens_list = mask_to_list(self.mask)
+            random_idx = random.randrange(len(mask_tokens_list))
+            self.current_token_id = mask_tokens_list[random_idx]
+
+
+        while not self.v2_guide.is_finished():
+            iterations += 1
+
+            start_compressed = time.perf_counter()
+            self.v2_guide.advance(self.current_token_id, self.mask)
+            end_compressed = time.perf_counter()
+
+            v2_time = end_compressed - start_compressed
+            v2_total_time += v2_time
+
+
+            if not self.v2_guide.is_finished():
+                if iterations > 2000 :
+                    break
+                mask_tokens_list = mask_to_list(self.mask)
+                random_idx = random.randrange(len(mask_tokens_list))
+
+                self.current_token_id = mask_tokens_list[random_idx]
+
+
+
+        v2_total_time_us = v2_total_time * 1e6
+
+        print(f"  Total iterations (Number of tokens): {iterations}")
+        print(
+            f"  Guide with Compressed Index: {v2_total_time_us:.2f} µs ({v2_total_time_us / iterations:.2f} µs per iteration)"
+        )
+
+
+
+def test_benchmark_v2index():
+    for r in regexes:
+        name = r["name"]
+        regex = r["regex"]
+
+        print(f"> Regex : '{name}'")
+        bench = V2IndexBenchmark()
+        bench.setup(regex)
+        bench.run_benchmark()
+
+    for s in schemas:
+        name = s["name"]
+        schema = s["regex"]
+        regex = build_regex_from_schema(schema, None)
+        print(regex)
+        print(f"> Schema : '{name}'")
+        bench = V2IndexBenchmark()
+        bench.setup(regex)
+        bench.run_benchmark()
+
+
+if __name__ == "__main__":
+    print("Running main...")
+    #test_benchmark_v2index()
+    schema = schemas[3]['regex']
+    regex = build_regex_from_schema(schema, None)
+    print(regex)
+    print(f"> Schema : curriculum")
+    bench = V2IndexBenchmark()
+    bench.setup(regex)
+    bench.run_benchmark()
diff --git a/benchmarks/bench_regex_guide.py b/benchmarks/bench_regex_guide.py
@@ -14,6 +14,7 @@
     "url": r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
     "ssn": r"\d{3}-\d{2}-\d{4}",
     "complex_span_constrained_relation_extraction": "(['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?\\s\\|\\s([^|\\(\\)\n]{1,})\\s\\|\\s['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?(\\s\\|\\s\\(([^|\\(\\)\n]{1,})\\s\\|\\s([^|\\(\\)\n]{1,})\\))*\\n)*",
+    "complexe":  r"""\{[ ]?"name"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?,[ ]?"age"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?,[ ]?"complexe_phone"[ ]?:[ ]?"(\+?\d{1,4}?[-. ]?\(\d{1,3}\)?[-. ]?\d{1,4}[-. ]?\d{1,4}[-. ]?\d{1,9})"[ ]?\}"""
 }
 
 

diff --git a/benchmarks/bench_schema_guide.py b/benchmarks/bench_schema_guide.py
@@ -0,0 +1,130 @@
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+import psutil
+from outlines_core import Guide, Index, Vocabulary, json_schema
+
+# JSON Schema documents keyed by benchmark parameter name. SchemaIndexBenchmark uses
+# the keys as `params` and converts the selected schema to a regex in `setup`.
+# Entries written as raw strings keep `\\` so that the JSON text itself carries an
+# escaped backslash.
+schema_samples = {
+    "schema_simple":r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, "required": ["name", "age"]}',
+    "schema_simple_and_complex_phone" : r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}, "complexe_phone": {"type": "string", "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"}}, "required": ["name", "age", "complexe_phone"]}',
+    "schema_complexe": """{
+  "$schema": "http://json-schema.org/draft-04/schema#",
+  "title": "Schema for a recording",
+  "type": "object",
+  "definitions": {
+    "artist": {
+      "type": "object",
+      "properties": {
+        "id": {"type": "number"},
+        "name": {"type": "string"},
+        "functions": {
+          "type": "array",
+          "items": {"type": "string"}
+        }
+      },
+      "required": ["id", "name", "functions"]
+    }
+  },
+  "properties": {
+    "id": {"type": "number"},
+    "work": {
+      "type": "object",
+      "properties": {
+        "id": {"type": "number"},
+        "name": {"type": "string"},
+        "composer": {"$ref": "#/definitions/artist"}
+      }
+    },
+    "recording_artists": {
+      "type": "array",
+      "items": {"$ref": "#/definitions/artist"}
+    }
+  },
+  "required": ["id", "work", "recording_artists"]
+}""",
+ "schema_curriculum":r'''{
+                "$schema": "http://json-schema.org/draft-04/schema#",
+                "title": "Schema for a Curriculum Vitae",
+                "type": "object",
+                "definitions": {
+                    "experienceEntry": {
+                    "type": "object",
+                    "properties": {
+                        "date": {
+                        "type": "string",
+                        "format": "date"
+                        },
+                        "position": {
+                        "type": "string"
+                        }
+                    },
+                    "required": ["date", "position"]
+                    }
+                },
+                "properties": {
+                    "name": {
+                    "type": "string"
+                    },
+                    "surname": {
+                    "type": "string"
+                    },
+                    "email": {
+                    "type": "string",
+                    "pattern": "[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
+                    },
+                    "phone": {
+                    "type": "string",
+                    "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"
+                    },
+                    "website": {
+                    "type": "string",
+                    "pattern": "(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?"
+                    },
+                    "resume": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/experienceEntry"
+                    }
+                    }
+                },
+                "required": ["name", "surname", "email", "phone", "resume"]
+                }'''
+}
+
+
+class SchemaIndexBenchmark:
+    params = schema_samples.keys()
+
+    def setup(self, pattern_name):
+        self.vocabulary = Vocabulary.from_pretrained("unsloth/Llama-3.1-8B-Instruct")
+        self.pattern = json_schema.build_regex_from_schema(schema_samples[pattern_name])
+
+    def time_schema_to_guide(self, pattern_name):
+        Index(self.pattern, self.vocabulary)
+
+    def time_schema_to_guide_threads(self, pattern_name):
+        # Default GIL switch interval is 5ms (0.005), which isn't helpful for cpu heavy tasks,
+        # this parallel case should be relatively close in runtime to one thread, but it is not,
+        # because of the GIL.
+        core_count = psutil.cpu_count(logical=False)
+        with ThreadPoolExecutor(max_workers=core_count) as executor:
+            list(executor.map(self._from_schema, [pattern_name] * core_count))
+
+    def time_schema_to_guide_threads_with_custom_switch_interval(self, pattern_name):
+        # Note: after moving to full rust implementation for index and guide creation, this experiment
+        # is no longer shows the drastic difference as it once showed when python was heavily involved,
+        # due to average speedup ~10 times.
+
+        # This test is to show, that if GIL's switch interval is set to be longer, then the parallel
+        # test's runtime on physical cores will be much closer to the one-threaded case.
+        import sys
+
+        sys.setswitchinterval(5)
+
+        core_count = psutil.cpu_count(logical=False)
+        with ThreadPoolExecutor(max_workers=core_count) as executor:
+            list(executor.map(self._from_schema, [pattern_name] * core_count))
+
+    def _from_schema(self, pattern_name):
+        Index(self.pattern, self.vocabulary)
+