-
Notifications
You must be signed in to change notification settings - Fork 4.5k
/
Copy pathasync_engine.py
309 lines (249 loc) · 11.4 KB
/
async_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
import asyncio
import logging
from functools import partial
from typing import AsyncIterator, Dict, Iterable, List, Optional, Set, Tuple, Type
from colossalai.inference.core.engine import InferenceEngine
# CLI logger
# NOTE: logging.basicConfig runs at import time and configures the *root* logger
# (DEBUG level, timestamped format) unless it already has handlers. The module
# itself logs through the named "colossalai-inference" logger below.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("colossalai-inference")
def _raise_exception_on_finish(task: asyncio.Task, request_tracker: "Tracer") -> None:
    """
    Done-callback for the background engine task.

    The background task is expected to run forever, so any completion other
    than cancellation is treated as an error: a RuntimeError is built, handed
    to ``request_tracker.propagate_exception`` and then re-raised.

    Args:
        task: The finished task whose outcome is inspected via ``task.result()``.
        request_tracker: Object exposing ``propagate_exception(exc)``.

    Raises:
        RuntimeError: If the task returned normally or failed with an
            ``Exception`` (the original failure is attached as ``__cause__``).
    """
    prefix = "Task finished unexpectedly. This should never happen! "
    try:
        try:
            task.result()
        except asyncio.CancelledError:
            # Cancellation is the one legitimate way for the task to end.
            return
        except Exception as cause:
            raise RuntimeError(prefix + " See stack trace above for the actual cause.") from cause
        else:
            # The task returned a value instead of looping forever.
            raise RuntimeError(prefix)
    except Exception as failure:
        # Hand the failure to the tracker before surfacing it.
        request_tracker.propagate_exception(failure)
        raise
class RequstStream:
    """
    Holder for the eventual output of a single request.

    Attributes:
        request_id: The id of the request this stream belongs to.
        _future: An ``asyncio.Future`` that is resolved once with the result.

    The result can be set only once; later ``set_result`` calls are ignored.
    """

    def __init__(self, request_id: int) -> None:
        self.request_id = request_id
        self._future = asyncio.Future()

    def set_result(self, result) -> None:
        """Store the final result and signal that it is ready (first call wins)."""
        if self._future.done():
            return
        self._future.set_result(result)

    async def get_result(self):
        """Wait until a result has been set, then return it."""
        outcome = await self._future
        return outcome

    @property
    def finished(self) -> bool:
        """Whether the underlying future is done."""
        return self._future.done()
class Tracer:
    """
    Records new requests and finished requests for the background loop.

    Attributes:
        _request_streams: One stream per request handed to the engine, keyed by request id.
        _finished_requests: Queue of request ids that finished or were aborted.
        _new_requests: Queue of ``(stream, engine_kwargs)`` pairs waiting to be sent to the engine.
        new_requests_event: Event set when a new request is queued; ``None`` until
            ``init_event`` is called.

    NOTE(review): nothing in this class removes entries from ``_request_streams``,
    so the mapping grows with every request and a request id cannot be reused.
    """

    def __init__(self) -> None:
        self._request_streams: Dict[int, RequstStream] = {}
        self._finished_requests: asyncio.Queue[int] = asyncio.Queue()
        self._new_requests: asyncio.Queue[Tuple[RequstStream, dict]] = asyncio.Queue()
        self.new_requests_event: Optional[asyncio.Event] = None

    def __contains__(self, item):
        return item in self._request_streams

    def init_event(self):
        """Create the event used to wake the background loop on new requests."""
        self.new_requests_event = asyncio.Event()

    def propagate_exception(self, exc: Exception, request_id: Optional[int] = None) -> None:
        """
        Propagate an exception to request streams (all if request_id is None).

        The exception object is stored as the stream's *result*; it is not raised here.

        Raises:
            KeyError: If ``request_id`` is given but has no registered stream.
        """
        if request_id is not None:
            self._request_streams[request_id].set_result(exc)
        else:
            for stream in self._request_streams.values():
                stream.set_result(exc)

    def process_finished_request(self, finished_request) -> None:
        """
        Process a finished request from the engine.

        Sets ``finished_request`` as the result of its stream, then marks the
        request id as finished via ``abort_request``.

        Raises:
            RuntimeError: If no stream is registered for ``finished_request.request_id``.
        """
        request_id = finished_request.request_id
        # Only a missing stream means "not found"; any other error must surface as-is.
        try:
            stream = self._request_streams[request_id]
        except KeyError as exc:
            raise RuntimeError(f"The request_id {request_id} is not found in our stream, please check") from exc
        stream.set_result(finished_request)
        self.abort_request(request_id)

    def add_request(self, request_id: int, **engine_add_request_kwargs) -> RequstStream:
        """
        Add a request to be sent to the engine on the next background
        loop iteration.

        Requires ``init_event`` to have been called first.

        Raises:
            KeyError: If a stream for ``request_id`` is already registered.
        """
        if request_id in self._request_streams:
            raise KeyError(f"Request {request_id} already exists.")

        stream = RequstStream(request_id)
        logger.info(f"Added request {request_id}.")
        self._new_requests.put_nowait((stream, {"request_id": request_id, **engine_add_request_kwargs}))
        self.new_requests_event.set()

        return stream

    def abort_request(self, request_id: int, *, verbose: bool = False) -> None:
        """Abort a request during next background loop iteration."""
        if verbose:
            logger.info(f"Aborted request {request_id}.")

        self._finished_requests.put_nowait(request_id)

        if request_id not in self._request_streams or self._request_streams[request_id].finished:
            # The request has already finished or been aborted.
            # The requests in new_requests will be aborted when try to get them(if marked aborted)
            return

        self._request_streams[request_id].set_result(None)

    def get_new_requests(self):
        """
        Drain the pending queues and return the requests to hand to the engine.

        Queued requests whose id appears among the drained finished ids are
        resolved with ``None`` and skipped; the rest have their stream
        registered and their kwargs dict returned. The new-requests event is
        cleared afterwards.
        """
        new_requests: List[Dict] = []
        finished_requests: Set[int] = set()

        while not self._finished_requests.empty():
            request_id = self._finished_requests.get_nowait()
            finished_requests.add(request_id)

        while not self._new_requests.empty():
            stream, new_request = self._new_requests.get_nowait()
            if new_request["request_id"] in finished_requests:
                # The request has been aborted.
                stream.set_result(None)
                continue
            self._request_streams[stream.request_id] = stream
            new_requests.append(new_request)

        self.new_requests_event.clear()

        return new_requests

    async def wait_for_new_requests(self):
        """Block until ``add_request`` signals that a request is queued."""
        await self.new_requests_event.wait()
class _AsyncInferenceEngine(InferenceEngine):
    """
    Async methods for Inference Engine.

    This engine is an extension of InferenceEngine; the additional method is
    awaited by the async wrapper's ``step``.

    Methods:
        async_step: The async version of Engine.step()
    """

    async def async_step(self) -> Tuple[List, bool]:
        """
        The async version of Engine.step()

        Schedules a batch through the request handler, runs the model on it in
        the event loop's default executor, lets the request handler pick the
        next tokens from the logits, and decodes the output of every sequence
        the handler reports as finished.

        Returns:
            A pair ``(finished_sequences, has_running_requests)``: the
            sequences returned by ``request_handler.update()`` (each with
            ``output`` set to its decoded text) and whether the request
            handler still reports requests in its batch bucket.
        """
        batch = self.request_handler.schedule()
        loop = asyncio.get_running_loop()

        # Run the synchronous model call in the default executor so the
        # event loop stays responsive while it computes.
        logits = await loop.run_in_executor(
            None,
            self.model,
            batch,
            self.k_cache,
            self.v_cache,
        )

        if self.inference_config.pad_input:
            # presumably keeps only the last position of each padded sequence; TODO confirm
            logits = logits[:, -1, :]
        self.request_handler.search_tokens(self.generation_config, logits)

        finished_sequences = self.request_handler.update()
        for sequence in finished_sequences:
            sequence.output = self.tokenizer.decode(sequence.output_token_id)

        return finished_sequences, self.request_handler.total_requests_in_batch_bucket() > 0
class AsyncInferenceEngine:
    """Asynchronous front-end around the inference engine.

    The wrapper owns an engine instance, a request tracer and a background
    asyncio task. Callers queue work with `add_request` / `generate`; the
    tracer records it, and the background loop forwards queued requests to the
    engine and pushes finished sequences back to the tracer.
    """

    _engine_class: Type[_AsyncInferenceEngine] = _AsyncInferenceEngine

    def __init__(self, start_engine_loop: bool = True, **kwargs):
        """
        Args:
            start_engine_loop: If True, `add_request` starts the background
                loop on demand; otherwise it raises when the loop is not running.
            **kwargs: Forwarded unchanged to the engine class constructor.
        """
        self.engine = self._init_engine(**kwargs)
        self.start_engine_loop = start_engine_loop
        self._request_tracer = Tracer()
        # Shielded handle to the background task, and the raw task behind it.
        self.background_loop = None
        self._background_loop_unshielded = None

    @property
    def background_loop_status(self):
        """True when a background loop exists and has not completed."""
        shielded = self.background_loop
        if shielded is None:
            return False
        return not shielded.done()

    def start_background_loop(self):
        """
        Create the background engine task and its shielded handle.

        Raises:
            RuntimeError: If a background loop is already running.
        """
        if self.background_loop_status:
            raise RuntimeError("Existing loop is running")

        self._request_tracer.init_event()

        engine_task = asyncio.get_event_loop().create_task(self.run_engine_loop())
        self._background_loop_unshielded = engine_task
        # If the task ever finishes, report it to every tracked request.
        engine_task.add_done_callback(partial(_raise_exception_on_finish, request_tracker=self._request_tracer))
        self.background_loop = asyncio.shield(engine_task)

    def _init_engine(self, **kwargs):
        """Instantiate the configured engine class."""
        return self._engine_class(**kwargs)

    async def step(self):
        """
        Feed queued requests to the engine and run one engine iteration.

        Returns True if there are in-progress requests.
        """
        for request_kwargs in self._request_tracer.get_new_requests():
            self.engine.add_single_request(**request_kwargs)

        finished_seqs, still_running = await self.engine.async_step()

        for finished in finished_seqs:
            self._request_tracer.process_finished_request(finished)

        return still_running

    async def _engine_abort(self, request_ids: Iterable[int]):
        """Forward an abort for the given ids to the engine."""
        self.engine.abort_request(request_ids)

    async def abort(self, request_id: int):
        """
        Abort a single request.

        Raises:
            RuntimeError: If the background loop is not running.
        """
        if not self.background_loop_status:
            raise RuntimeError("Background loop is not running or launched correctly.")
        return self._abort(request_id)

    def _abort(self, request_id: int):
        """Mark the request as aborted in the tracer."""
        self._request_tracer.abort_request(request_id)

    async def run_engine_loop(self):
        """Run forever: sleep until there is work, then keep stepping the engine."""
        has_pending_work = False
        while True:
            if not has_pending_work:
                # Nothing in flight: block until a request is queued.
                await self._request_tracer.wait_for_new_requests()
            has_pending_work = await self.step()
            # Yield so other coroutines get a turn between steps.
            await asyncio.sleep(0)

    async def add_request(
        self,
        request_id: int,
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = None,
    ) -> RequstStream:
        """
        Queue a request with the tracer, starting the background loop if needed.

        Raises:
            RuntimeError: If the loop is not running and `start_engine_loop` is False.
        """
        if not self.background_loop_status:
            if not self.start_engine_loop:
                raise RuntimeError("Background loop is not running.")
            self.start_background_loop()

        return self._request_tracer.add_request(
            request_id,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
        )

    async def generate(
        self,
        request_id: int,
        prompt: Optional[str],
        prompt_token_ids: Optional[List[int]] = None,
    ) -> AsyncIterator[str]:
        """
        Queue a request and wait for its final result.

        On any exception or cancellation the request is aborted in the tracer
        and the error is re-raised.

        NOTE(review): the return annotation says AsyncIterator[str], but the
        body awaits and returns the single value stored on the request stream.
        """
        try:
            stream = await self.add_request(request_id, prompt, prompt_token_ids=prompt_token_ids)
            return await stream.get_result()
        except (Exception, asyncio.CancelledError):
            # If there is an exception or coroutine is cancelled, abort the request.
            self._abort(request_id)
            raise