Skip to content

V2Index and TokensDFA extension : Compilation and Mask times improvement. #194

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,19 @@ bincode = "2.0.0-rc.3"
hf-hub = "=0.3.2"
tokenizers = { version = "=0.20.3", features = ["http"] }
rustc-hash = "2.1.0"
regex-automata = "0.4.9"
regex-automata = { git = "https://github.com/agourdel/regex.git", branch = "custom_dfa",package = "regex-automata" }
smallvec = "1.14.0"
regex-syntax = "0.8.5"
rayon = "1.10.0"

[dev-dependencies]
rand = { version = "0.9.0" }


[features]
python-bindings = ["pyo3", "serde-pyobject"]
run_benchmarks = []


[lib]
name = "outlines_core"
Expand Down
226 changes: 226 additions & 0 deletions benchmarks/bench_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# flake8: noqa
# mypy: ignore-errors
import os
import random
import time

import psutil
from outlines_core import Guide, Index, Vocabulary, create_mask, mask_to_list
from outlines_core.json_schema import build_regex_from_schema

# Request debug-level logging through the RUST_LOG environment variable.
# NOTE(review): this assignment runs after `outlines_core` has already been
# imported above; presumably the Rust side reads RUST_LOG lazily — confirm,
# otherwise this needs to move before the import to have any effect.
os.environ["RUST_LOG"] = "debug"


# Regular expressions benchmarked by `test_benchmark_v2index`. Each entry maps
# a short display name ("name") to the pattern handed to `Index` ("regex").
regexes = [
    {
        "name": "email",
        "regex": r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]{1,63}(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]{1,63}){0,10})@(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.){1,3}[a-z0-9](?:[a-z0-9-]{0,30}[a-z0-9])?",
    },
    {"name": "simple_phone", "regex": r"\+?[1-9][0-9]{7,14}"},
    {
        "name": "complex_phone",
        "regex": r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}",
    },
    {"name": "permissive_any", "regex": r".{255}$"},
    {"name": "permissive_words", "regex": r"[a-zA-Z]{100}"},
    {
        "name": "https",
        # Single backslashes on purpose: this is a raw string, so the
        # JSON-style doubled form (`\\/`, `\\d`, ...) would require literal
        # backslashes in the input and would no longer match a real URL.
        "regex": r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
    },
    {
        "name": "complexe",
        "regex": r"""\{[ ]?"name"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?,[ ]?"age"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?,[ ]?"complexe_phone"[ ]?:[ ]?"(\+?\d{1,4}?[-. ]?\(\d{1,3}\)?[-. ]?\d{1,4}[-. ]?\d{1,4}[-. ]?\d{1,9})"[ ]?\}""",
    },
]
# JSON Schema documents benchmarked by `test_benchmark_v2index`.
# Note that each entry stores the schema text under the "regex" key; callers
# in this file convert it to a regular expression with
# `build_regex_from_schema` before building an index from it.
schemas = [
{
"name": "schema_simple",
"regex": r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, "required": ["name", "age"]}',
},
{
"name": "schema_simple_phone",
"regex": r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}, "complexe_phone": {"type": "string", "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"}}, "required": ["name", "age", "complexe_phone"]}',
},
{
"name": "schema_complexe",
"regex": """{
"$schema": "http://json-schema.org/draft-04/schema#",
"title": "Schema for a recording",
"type": "object",
"definitions": {
"artist": {
"type": "object",
"properties": {
"id": {"type": "number"},
"name": {"type": "string"},
"functions": {
"type": "array",
"items": {"type": "string"}
}
},
"required": ["id", "name", "functions"]
}
},
"properties": {
"id": {"type": "number"},
"work": {
"type": "object",
"properties": {
"id": {"type": "number"},
"name": {"type": "string"},
"composer": {"$ref": "#/definitions/artist"}
}
},
"recording_artists": {
"type": "array",
"items": {"$ref": "#/definitions/artist"}
}
},
"required": ["id", "work", "recording_artists"]
}"""
},
{
"name" : "schema_curriculum",
"regex" : r'''{
"$schema": "http://json-schema.org/draft-04/schema#",
"title": "Schema for a Curriculum Vitae",
"type": "object",
"definitions": {
"experienceEntry": {
"type": "object",
"properties": {
"date": {
"type": "string",
"format": "date"
},
"position": {
"type": "string"
}
},
"required": ["date", "position"]
}
},
"properties": {
"name": {
"type": "string"
},
"surname": {
"type": "string"
},
"email": {
"type": "string",
"pattern": "[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
},
"phone": {
"type": "string",
"pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"
},
"website": {
"type": "string",
"pattern": "(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?"
},
"resume": {
"type": "array",
"items": {
"$ref": "#/definitions/experienceEntry"
}
}
},
"required": ["name", "surname", "email", "phone", "resume"]
}'''
}
]


class V2IndexBenchmark:
    """Time the guide calls made while walking an index with random tokens.

    Call `setup` once with a regular expression, then `run_benchmark` to
    drive the guide to completion (or to the iteration cap) and print the
    accumulated time spent inside the guide calls.
    """

    def setup(self, regex):
        """Build the vocabulary, index, guide and mask buffer for ``regex``.

        Args:
            regex: Pattern passed to ``Index`` together with the vocabulary.

        Raises:
            AssertionError: If the freshly built guide already reports
                finished (only checked when assertions are enabled).
        """
        self.vocab = Vocabulary.from_pretrained("unsloth/Llama-3.1-8B-Instruct")
        self.v2_index = Index(regex, self.vocab)
        self.v2_guide = Guide(self.v2_index)

        # The mask is sized one past the vocabulary length.
        self.mask = create_mask(len(self.vocab) + 1)

        self.process = psutil.Process()

        assert (
            not self.v2_guide.is_finished()
        ), f"Compressed Guide should not be finished for {regex}"

    def _pick_random_token(self):
        """Return a random token id among those listed by the current mask."""
        candidates = mask_to_list(self.mask)
        return candidates[random.randrange(len(candidates))]

    def run_benchmark(self, max_iterations=2000):
        """Drive the guide with random tokens and print timing totals.

        Only the ``get_tokens`` / ``advance`` calls are timed; picking the
        next random token is excluded from the measurement.

        Args:
            max_iterations: Stop once more than this many guide calls have
                been made without the guide finishing. Defaults to 2000,
                the value that used to be hard-coded.
        """
        iterations = 0
        v2_total_time = 0

        self.current_token_id = -1

        # First step: presumably fills the mask for the initial state.
        if not self.v2_guide.is_finished():
            iterations += 1

            started = time.perf_counter()
            self.v2_guide.get_tokens(self.mask)
            v2_total_time += time.perf_counter() - started

            self.current_token_id = self._pick_random_token()

        while not self.v2_guide.is_finished():
            iterations += 1

            started = time.perf_counter()
            self.v2_guide.advance(self.current_token_id, self.mask)
            v2_total_time += time.perf_counter() - started

            # Permissive patterns may never finish under random sampling, so
            # the walk is capped.
            if self.v2_guide.is_finished() or iterations > max_iterations:
                break
            self.current_token_id = self._pick_random_token()

        v2_total_time_us = v2_total_time * 1e6
        # Guard the average: with assertions disabled the guide may already
        # be finished on entry, leaving zero iterations.
        per_iteration_us = v2_total_time_us / iterations if iterations else 0.0

        print(f" Total iterations (Number of tokens): {iterations}")
        print(
            f" Guide with Compressed Index: {v2_total_time_us:.2f} µs ({per_iteration_us:.2f} µs per iteration)"
        )



def test_benchmark_v2index():
    """Benchmark every entry of `regexes`, then every entry of `schemas`.

    Schemas are first converted to a regular expression with
    `build_regex_from_schema`; the resulting pattern is printed before the
    benchmark for that schema runs.
    """

    def _bench(pattern):
        # One fresh benchmark object per pattern: set up, then run.
        runner = V2IndexBenchmark()
        runner.setup(pattern)
        runner.run_benchmark()

    for entry in regexes:
        label, pattern = entry["name"], entry["regex"]
        print(f"> Regex : '{label}'")
        _bench(pattern)

    for entry in schemas:
        # The schema text is stored under the "regex" key.
        label, schema_text = entry["name"], entry["regex"]
        pattern = build_regex_from_schema(schema_text, None)
        print(pattern)
        print(f"> Schema : '{label}'")
        _bench(pattern)


if __name__ == "__main__":
    print("Running main...")
    # test_benchmark_v2index() would run the full sweep; the script entry
    # point only benchmarks the fourth schema (the curriculum one).
    curriculum_regex = build_regex_from_schema(schemas[3]["regex"], None)
    print(curriculum_regex)
    print("> Schema : curriculum")
    runner = V2IndexBenchmark()
    runner.setup(curriculum_regex)
    runner.run_benchmark()
1 change: 1 addition & 0 deletions benchmarks/bench_regex_guide.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"url": r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)*\/?",
"ssn": r"\d{3}-\d{2}-\d{4}",
"complex_span_constrained_relation_extraction": "(['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?\\s\\|\\s([^|\\(\\)\n]{1,})\\s\\|\\s['\"\\ ,]?((?:of|resulting|case|which|cultures|a|core|extreme|selflessness|spiritual|various|However|both|vary|in|other|secular|the|religious|among|moral|and|It|object|worldviews|altruism|traditional|material|aspect|or|life|beings|virtue|is|however|opposite|concern|an|practice|it|for|s|quality|religions|In|Altruism|animals|happiness|many|become|principle|human|selfishness|may|synonym)['\"\\ ,]?)+['\"\\ ,]?(\\s\\|\\s\\(([^|\\(\\)\n]{1,})\\s\\|\\s([^|\\(\\)\n]{1,})\\))*\\n)*",
"complexe": r"""\{[ ]?"name"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?,[ ]?"age"[ ]?:[ ]?(-)?(0|[1-9][0-9]*)[ ]?,[ ]?"complexe_phone"[ ]?:[ ]?"(\+?\d{1,4}?[-. ]?\(\d{1,3}\)?[-. ]?\d{1,4}[-. ]?\d{1,4}[-. ]?\d{1,9})"[ ]?\}"""
}


Expand Down
130 changes: 130 additions & 0 deletions benchmarks/bench_schema_guide.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import os
from concurrent.futures import ThreadPoolExecutor

import psutil
from outlines_core import Guide, Index, Vocabulary, json_schema

# JSON Schema documents keyed by benchmark parameter name. The keys are exposed
# as `SchemaIndexBenchmark.params`, and the selected schema text is converted
# to a regular expression in `SchemaIndexBenchmark.setup`.
schema_samples = {
"schema_simple":r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, "required": ["name", "age"]}',
"schema_simple_and_complex_phone" : r'{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}, "complexe_phone": {"type": "string", "pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"}}, "required": ["name", "age", "complexe_phone"]}',
"schema_complexe": """{
"$schema": "http://json-schema.org/draft-04/schema#",
"title": "Schema for a recording",
"type": "object",
"definitions": {
"artist": {
"type": "object",
"properties": {
"id": {"type": "number"},
"name": {"type": "string"},
"functions": {
"type": "array",
"items": {"type": "string"}
}
},
"required": ["id", "name", "functions"]
}
},
"properties": {
"id": {"type": "number"},
"work": {
"type": "object",
"properties": {
"id": {"type": "number"},
"name": {"type": "string"},
"composer": {"$ref": "#/definitions/artist"}
}
},
"recording_artists": {
"type": "array",
"items": {"$ref": "#/definitions/artist"}
}
},
"required": ["id", "work", "recording_artists"]
}""",
"schema_curriculum":r'''{
"$schema": "http://json-schema.org/draft-04/schema#",
"title": "Schema for a Curriculum Vitae",
"type": "object",
"definitions": {
"experienceEntry": {
"type": "object",
"properties": {
"date": {
"type": "string",
"format": "date"
},
"position": {
"type": "string"
}
},
"required": ["date", "position"]
}
},
"properties": {
"name": {
"type": "string"
},
"surname": {
"type": "string"
},
"email": {
"type": "string",
"pattern": "[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
},
"phone": {
"type": "string",
"pattern": "\\+?\\d{1,4}?[-. ]?\\(\\d{1,3}\\)?[-. ]?\\d{1,4}[-. ]?\\d{1,4}[-. ]?\\d{1,9}"
},
"website": {
"type": "string",
"pattern": "(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?"
},
"resume": {
"type": "array",
"items": {
"$ref": "#/definitions/experienceEntry"
}
}
},
"required": ["name", "surname", "email", "phone", "resume"]
}'''
}


class SchemaIndexBenchmark:
    """Benchmarks for building an `Index` from a JSON-schema-derived regex.

    ``params`` lists the keys of `schema_samples`; each ``time_*`` method
    receives one of those keys as ``pattern_name`` after `setup` has run
    for it.
    """

    params = schema_samples.keys()

    def setup(self, pattern_name):
        """Load the vocabulary and convert the selected schema to a regex."""
        self.vocabulary = Vocabulary.from_pretrained("unsloth/Llama-3.1-8B-Instruct")
        self.pattern = json_schema.build_regex_from_schema(schema_samples[pattern_name])

    def time_schema_to_guide(self, pattern_name):
        """Build one index on the calling thread."""
        Index(self.pattern, self.vocabulary)

    def time_schema_to_guide_threads(self, pattern_name):
        """Build one index per physical core, in parallel threads."""
        # The default GIL switch interval is 5 ms (0.005), which isn't helpful
        # for CPU-heavy tasks: this parallel case should be relatively close
        # in runtime to the single-threaded one, but it is not, because of
        # the GIL.
        self._build_on_physical_cores(pattern_name)

    def time_schema_to_guide_threads_with_custom_switch_interval(self, pattern_name):
        """Same as the threaded case, but with a 5-second GIL switch interval."""
        # Note: after moving to a full Rust implementation for index and guide
        # creation, this experiment no longer shows the drastic difference it
        # once showed when Python was heavily involved, due to an average
        # speedup of ~10 times.

        # This test is to show that if the GIL's switch interval is set to be
        # longer, then the parallel test's runtime on physical cores will be
        # much closer to the one-threaded case.
        import sys

        previous_interval = sys.getswitchinterval()
        sys.setswitchinterval(5)
        try:
            self._build_on_physical_cores(pattern_name)
        finally:
            # The switch interval is process-wide; restore it so the change
            # does not leak into whatever runs next in this process.
            sys.setswitchinterval(previous_interval)

    def _build_on_physical_cores(self, pattern_name):
        """Run `_from_schema` once per physical core on a thread pool."""
        # psutil may return None when the physical core count cannot be
        # determined; fall back to the logical count, then to one worker.
        core_count = psutil.cpu_count(logical=False) or os.cpu_count() or 1
        with ThreadPoolExecutor(max_workers=core_count) as executor:
            list(executor.map(self._from_schema, [pattern_name] * core_count))

    def _from_schema(self, pattern_name):
        """Thread-pool work item: build one index from the prepared regex."""
        Index(self.pattern, self.vocabulary)

Loading