import importlib.util
import json
import logging
import os
import time

import requests
import yaml

from environment import TestEnvironment
from launch import SystemLaunched
from tests import Tester

logger = logging.getLogger("engine")
logging.basicConfig(level=logging.INFO, format='%(message)s')

class TestEngine:
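    """Drives an end-to-end test run: generates a deployment configuration
    for a target graph/vector store combination, waits for the launched
    services to report healthy via the metrics API, then runs the test suite."""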
def __init__(
self, trustgraph_dir, templates_dir, version,
target_run_dir="target-environment",
):
self.trustgraph_dir = trustgraph_dir
self.templates_dir = templates_dir
self.target_run_dir = target_run_dir
self.version = version
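        # Metrics endpoint polled for readiness checks (see track_up / track_running)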
self.pulsar = "http://localhost:9090"
self.test_dir = os.getcwd()
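        # Load the template generator module directly from the templates directory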
spec = importlib.util.spec_from_file_location(
'generate', f'{self.templates_dir}/generate.py'
)
self.generate = importlib.util.module_from_spec(spec)
spec.loader.exec_module(self.generate)
def generate_config(self, target):
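        """Return the component list for this target, plugging in the graph
        store and vector store named by the target."""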
return [
{
"name": target["graph"],
"parameters": {}
},
{
"name": "pulsar",
"parameters": {}
},
{
"name": target["vector"],
"parameters": {}
},
{
"name": "embeddings-hf",
"parameters": {}
},
{
"name": "graph-rag",
"parameters": {}
},
{
"name": "document-rag",
"parameters": {}
},
{
"name": "grafana",
"parameters": {}
},
{
"name": "trustgraph-base",
"parameters": {}
},
{
"name": "googleaistudio",
"parameters": {
"temperature": 0.1,
"max-output-tokens": 8000,
"model": "gemini-1.5-flash-002"
}
},
{
"name": "googleaistudio-rag",
"parameters": {
"temperature": 0.1,
"max-output-tokens": 8000,
"model": "gemini-1.5-flash-002"
}
},
{
"name": "prompt-template",
"parameters": {}
},
{
"name": "override-recursive-chunker",
"parameters": {
"chunk-size": 1000,
"chunk-overlap": 50
}
},
{
"name": "prompt-overrides",
"parameters": {
"system-template": "You are a helpful assistant that performs NLP, Natural Language Processing, tasks.\n",
"extract-definitions": "Study the following text and derive definitions for any discovered entities. Do not provide definitions for entities whose definitions are incomplete or unknown. Output relationships in JSON format as an array of objects with keys:\n- entity: the name of the entity\n- definition: English text which defines the entity\n\nHere is the text:\n{{text}}\n\nRequirements:\n- Do not provide explanations.\n- Do not use special characters in the response text.\n- The response will be written as plain text.\n- Do not include null or unknown definitions.\n- The response shall use the following JSON schema structure:\n\n```json\n[{\"entity\": string, \"definition\": string}]\n```",
"extract-relationships": "Study the following text and derive entity relationships. For each relationship, derive the subject, predicate and object of the relationship. Output relationships in JSON format as an array of objects with keys:\n- subject: the subject of the relationship\n- predicate: the predicate\n- object: the object of the relationship\n- object-entity: FALSE if the object is a simple data type and TRUE if the object is an entity\n\nHere is the text:\n{{text}}\n\nRequirements:\n- You will respond only with well formed JSON.\n- Do not provide explanations.\n- Respond only with plain text.\n- Do not respond with special characters.\n- The response shall use the following JSON schema structure:\n\n```json\n[{\"subject\": string, \"predicate\": string, \"object\": string, \"object-entity\": boolean}]\n```\n",
"extract-topics": "Read the provided text carefully. You will identify topics and their definitions found in the provided text. Topics are intangible concepts.\n\nReading Instructions:\n- Ignore document formatting in the provided text.\n- Study the provided text carefully for intangible concepts.\n\nHere is the text:\n{{text}}\n\nResponse Instructions: \n- Do not respond with special characters.\n- Return only topics that are concepts and unique to the provided text.\n- Respond only with well-formed JSON.\n- The JSON response shall be an array of objects with keys \"topic\" and \"definition\". \n- The response shall use the following JSON schema structure:\n\n```json\n[{\"topic\": string, \"definition\": string}]\n```\n\n- Do not write any additional text or explanations.",
"extract-rows": "<instructions>\nStudy the following text and derive objects which match the schema provided.\n\nYou must output an array of JSON objects for each object you discover\nwhich matches the schema. For each object, output a JSON object whose fields\ncarry the name field specified in the schema.\n</instructions>\n\n<schema>\n{{schema}}\n</schema>\n\n<text>\n{{text}}\n</text>\n\n<requirements>\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not add markdown formatting or headers or prefixes.\n</requirements>",
"kg-prompt": "Study the following set of knowledge statements. The statements are written in Cypher format that has been extracted from a knowledge graph. Use only the provided set of knowledge statements in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere's the knowledge statements:\n{% for edge in knowledge %}({{edge.s}})-[{{edge.p}}]->({{edge.o}})\n{%endfor%}\n\nUse only the provided knowledge statements to respond to the following:\n{{query}}\n",
"document-prompt": "Study the following context. Use only the information provided in the context in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere is the context:\n{{documents}}\n\nUse only the provided knowledge statements to respond to the following:\n{{query}}\n",
"agent-react": "Answer the following questions as best you can. You have\naccess to the following functions:\n\n{% for tool in tools %}{\n \"function\": \"{{ tool.name }}\",\n \"description\": \"{{ tool.description }}\",\n \"arguments\": [\n{% for arg in tool.arguments %} {\n \"name\": \"{{ arg.name }}\",\n \"type\": \"{{ arg.type }}\",\n \"description\": \"{{ arg.description }}\",\n }\n{% endfor %}\n ]\n}\n{% endfor %}\n\nYou can either choose to call a function to get more information, or\nreturn a final answer.\n \nTo call a function, respond with a JSON object of the following format:\n\n{\n \"thought\": \"your thought about what to do\",\n \"action\": \"the action to take, should be one of [{{tool_names}}]\",\n \"arguments\": {\n \"argument1\": \"argument_value\",\n \"argument2\": \"argument_value\"\n }\n}\n\nTo provide a final answer, response a JSON object of the following format:\n\n{\n \"thought\": \"I now know the final answer\",\n \"final-answer\": \"the final answer to the original input question\"\n}\n\nPrevious steps are included in the input. Each step has the following\nformat in your output:\n\n{\n \"thought\": \"your thought about what to do\",\n \"action\": \"the action taken\",\n \"arguments\": {\n \"argument1\": action argument,\n \"argument2\": action argument2\n },\n \"observation\": \"the result of the action\",\n}\n\nRespond by describing either one single thought/action/arguments or\nthe final-answer. Pause after providing one action or final-answer.\n\n{% if context %}Additional context has been provided:\n{{context}}{% endif %}\n\nQuestion: {{question}}\n\nInput:\n \n{% for h in history %}\n{\n \"action\": \"{{h.action}}\",\n \"arguments\": [\n{% for k, v in h.arguments.items() %} {\n \"{{k}}\": \"{{v}}\",\n{%endfor%} }\n ],\n \"observation\": \"{{h.observation}}\"\n}\n{% endfor %}"
}
},
{
"name": "agent-manager-react",
"parameters": {
"tools": [
{
"id": "sample-query",
"name": "Sample query",
"type": "knowledge-query",
"config": {},
"description": "Query a knowledge base that has already been extracted. The query should be a simple natural language question.",
"arguments": [
{
"name": "query",
"type": "string",
"description": "Describe the search query here."
}
]
},
{
"id": "sample-completion",
"name": "Sample text completion",
"type": "text-completion",
"config": {},
"description": "Describe the request to send to LLM. This request will be sent with no additional context.",
"arguments": [
{
"name": "response",
"type": "string",
"description": "The response expected from the LLM."
}
]
}
]
}
},
{
"name": "workbench-ui",
"parameters": {}
}
]
def generate_launch_config(self, target):
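        """Render the component list through the docker-compose Jsonnet
        wrapper and return the resulting launch configuration as YAML."""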
cfg = self.generate_config(target)
platform = "docker-compose"
with open(f"{self.templates_dir}/config-to-{platform}.jsonnet", "r") as f:
wrapper = f.read()
gen = self.generate.Generator(
json.dumps(cfg).encode("utf-8"),
version=self.version,
base=self.templates_dir,
)
processed = gen.process(wrapper)
y = yaml.dump(processed)
return y
    # Example readiness query:
    # /api/v1/query?query=processor_state{job="pdf-decoder",processor_state="running"}
def track_running(self, proc, timeout):
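        """Poll the metrics API until the processor_state metric reports the
        given processor as running, or raise RuntimeError on timeout."""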
until = time.time() + timeout
while time.time() < until:
try:
resp = requests.get(
f'{self.pulsar}/api/v1/query?query=processor_state{{job="{proc}",processor_state="running"}}'
)
resp = resp.json()
datum = resp["data"]["result"][0]["value"][1]
if datum == "1":
logger.debug(f"{proc} is running")
return
            except Exception:
                pass
time.sleep(2)
raise RuntimeError(f"Timeout waiting for {proc} to start.")
def track_up(self, proc, timeout):
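        """Poll the metrics API until the up metric reports the given job as
        up, or raise RuntimeError on timeout."""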
until = time.time() + timeout
while time.time() < until:
try:
resp = requests.get(
f'{self.pulsar}/api/v1/query?query=up{{job="{proc}"}}'
)
resp = resp.json()
datum = resp["data"]["result"][0]["value"][1]
if datum == "1":
logger.debug(f"{proc} is up")
return
            except Exception:
                pass
time.sleep(2)
raise RuntimeError(f"Timeout waiting for {proc} to start.")
def run_test(self, target):
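        """Bring up the test environment and system for one target
        configuration, wait for all services to become ready, then run
        the test suite."""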
label = ", ".join(map(lambda x: f"{x[0]}={x[1]}", target.items()))
logger.info(f"* Running tests for configuration ({label})")
with TestEnvironment(self, target) as environment:
with SystemLaunched(self.target_run_dir) as system:
logger.debug("Waiting for processes to start...")
self.track_up("zookeeper", 60)
self.track_up("bookie", 60)
self.track_up("pulsar", 60)
self.track_running("agent-manager", 60)
self.track_running("chunker", 10)
self.track_running("embeddings", 60)
self.track_running("graph-rag", 10)
self.track_running("kg-extract-definitions", 10)
self.track_running("kg-extract-relationships", 10)
self.track_running("kg-extract-topics", 10)
self.track_running("metering", 10)
self.track_running("metering-rag", 10)
self.track_running("pdf-decoder", 10)
self.track_running("prompt", 10)
self.track_running("prompt-rag", 10)
self.track_running("query-graph-embeddings", 10)
self.track_running("query-triples", 10)
self.track_running("store-graph-embeddings", 10)
self.track_running("store-triples", 10)
self.track_running("text-completion", 10)
self.track_running("text-completion-rag", 10)
logger.debug("System is up!")
tester = Tester(self)
tester.run()
logger.info("Tests passed!")