Generic adapter support in the grpc server (#32)

joerunde · web-flow · commit 79b73643e47c · 2024-06-11T16:16:49.000-06:00
Adds support for multi-lora adapters. Passing tests added over in this PR: https://github.ibm.com/ai-foundation/tgis-deploy-tests/pull/25/files --------- Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
diff --git a/proto/generation.proto b/proto/generation.proto
@@ -27,15 +27,19 @@ enum DecodingMethod {
 
 message BatchedGenerationRequest {
   string model_id = 1;
+  // Deprecated in favor of adapter_id
   optional string prefix_id = 2;
+  optional string adapter_id = 4;
   repeated GenerationRequest requests = 3;
 
   Parameters params = 10;
 }
 
 message SingleGenerationRequest {
   string model_id = 1;
+  // Deprecated in favor of adapter_id
   optional string prefix_id = 2;
+  optional string adapter_id = 4;
   GenerationRequest request = 3;
 
   Parameters params = 10;
diff --git a/vllm/entrypoints/grpc/adapters.py b/vllm/entrypoints/grpc/adapters.py
@@ -0,0 +1,118 @@
+"""Contains code to map api requests for adapters (e.g. peft prefixes, LoRA)
+into valid LLM engine requests"""
+import asyncio
+import concurrent.futures
+import dataclasses
+import json
+import os
+import re
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+from vllm.entrypoints.grpc.pb.generation_pb2 import (BatchedGenerationRequest,
+                                                     SingleGenerationRequest)
+from vllm.entrypoints.grpc.validation import TGISValidationError
+from vllm.lora.request import LoRARequest
+
+global_thread_pool = None  # used for loading adapter files from disk
+
+VALID_ADAPTER_ID_PATTERN = re.compile("[/\\w\\-]+")
+
+
+@dataclasses.dataclass
+class AdapterMetadata:
+    unique_id: int  # Unique integer for vllm to identify the adapter
+    adapter_type: str  # The string name of the peft adapter type, e.g. LORA
+    full_path: str
+
+
+@dataclasses.dataclass
+class AdapterStore:
+    cache_path: str  # Path to local store of adapters to load from
+    adapters: Dict[str, AdapterMetadata]
+    next_unique_id: int = 1
+
+
+async def validate_adapters(
+        request: Union[SingleGenerationRequest, BatchedGenerationRequest],
+        adapter_store: Optional[AdapterStore]) -> Dict[str, LoRARequest]:
+    """Takes the adapter name from the request and constructs a valid
+        engine request if one is set. Raises if the requested adapter
+        does not exist or adapter type is unsupported
+
+        Returns the kwarg dictionary to add to an engine.generate() call.
+        """
+    global global_thread_pool
+    adapter_id = request.adapter_id
+
+    if adapter_id and not adapter_store:
+        TGISValidationError.AdaptersDisabled.error()
+
+    if not adapter_id or not adapter_store:
+        return {}
+
+    # If not already cached, we need to validate that files exist and
+    # grab the type out of the adapter_config.json file
+    if (adapter_metadata := adapter_store.adapters.get(adapter_id)) is None:
+        _reject_bad_adapter_id(adapter_id)
+        local_adapter_path = os.path.join(adapter_store.cache_path, adapter_id)
+
+        loop = asyncio.get_running_loop()
+        if global_thread_pool is None:
+            global_thread_pool = concurrent.futures.ThreadPoolExecutor(
+                max_workers=2)
+
+        adapter_type = await loop.run_in_executor(global_thread_pool,
+                                                  _get_adapter_type_from_file,
+                                                  adapter_id,
+                                                  local_adapter_path)
+
+        # Add to cache
+        adapter_metadata = AdapterMetadata(
+            unique_id=adapter_store.next_unique_id,
+            adapter_type=adapter_type,
+            full_path=local_adapter_path)
+        adapter_store.adapters[adapter_id] = adapter_metadata
+
+    # Build the proper vllm request object
+    if adapter_metadata.adapter_type == "LORA":
+        lora_request = LoRARequest(lora_name=adapter_id,
+                                   lora_int_id=adapter_metadata.unique_id,
+                                   lora_local_path=adapter_metadata.full_path)
+        return {"lora_request": lora_request}
+
+    # All other types unsupported
+    TGISValidationError.AdapterUnsupported.error(adapter_metadata.adapter_type)
+
+
+def _get_adapter_type_from_file(adapter_id: str, adapter_path: str) -> str:
+    """This function does all the filesystem access required to deduce the type
+     of the adapter. It's run in a separate thread pool executor so that file
+     access does not block the main event loop."""
+    if not os.path.exists(adapter_path):
+        TGISValidationError.AdapterNotFound.error(adapter_id,
+                                                  "directory does not exist")
+
+    adapter_config_path = os.path.join(adapter_path, "adapter_config.json")
+    if not os.path.exists(adapter_config_path):
+        TGISValidationError.AdapterNotFound.error(
+            adapter_id, "invalid adapter: no adapter_config.json found")
+
+    # NB: blocks event loop
+    with open(adapter_config_path) as adapter_config_file:
+        adapter_config = json.load(adapter_config_file)
+
+    return adapter_config.get("peft_type", None)
+
+
+def _reject_bad_adapter_id(adapter_id: str) -> None:
+    """Raise if the adapter id attempts path traversal or has invalid file path
+    characters"""
+    if not VALID_ADAPTER_ID_PATTERN.fullmatch(adapter_id):
+        TGISValidationError.InvalidAdapterID.error(adapter_id)
+
+    # Check for path traversal
+    root_path = Path("/some/file/root")
+    derived_path = root_path / adapter_id
+    if not os.path.normpath(derived_path).startswith(str(root_path) + "/"):
+        TGISValidationError.InvalidAdapterID.error(adapter_id)
diff --git a/vllm/entrypoints/grpc/grpc_server.py b/vllm/entrypoints/grpc/grpc_server.py
@@ -14,6 +14,7 @@
 from vllm import (AsyncLLMEngine, CompletionOutput, RequestOutput,
                   SamplingParams)
 from vllm.config import ModelConfig
+from vllm.entrypoints.grpc.adapters import AdapterStore, validate_adapters
 from vllm.entrypoints.grpc.pb import generation_pb2_grpc  # type: ignore
 # yapf: disable
 from vllm.entrypoints.grpc.pb.generation_pb2 import (BatchedGenerationRequest,
@@ -33,6 +34,7 @@
 from vllm.entrypoints.openai.serving_completion import merge_async_iterators
 from vllm.inputs import TextTokensPrompt
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
 from vllm.tgis_utils import logs
 from vllm.tgis_utils.guided_decoding import (
@@ -119,6 +121,13 @@ def __init__(self, engine: AsyncLLMEngine, args: argparse.Namespace):
         self.skip_special_tokens = not args.output_special_tokens
         self.default_include_stop_seqs = args.default_include_stop_seqs
 
+        self.adapter_store: Optional[AdapterStore] = None
+        if args.adapter_cache:
+            self.adapter_store = AdapterStore(
+                cache_path=args.adapter_cache,
+                adapters={}
+            )
+
     async def _post_init(self):
         self.config = await self.engine.get_model_config()
         # self.tokenizer_group = await self.engine.get_tokenizer_group()
@@ -148,6 +157,9 @@ async def Generate(self, request: BatchedGenerationRequest,
 
         generators = []
         max_is_token_limit = [False] * request_count
+
+        adapter_kwargs = await self._validate_adapters(request, context)
+
         for i, req in enumerate(request.requests):
             input_ids, max_is_token_limit[i]\
                 = await self._validate_prompt_and_tokenize(
@@ -161,7 +173,8 @@ async def Generate(self, request: BatchedGenerationRequest,
                 # re-tokenized when `prompt_token_ids` is supplied
                 self.engine.generate(inputs=inputs,
                                      sampling_params=sampling_params,
-                                     request_id=f"{request_id}-{i}"),
+                                     request_id=f"{request_id}-{i}",
+                                     **adapter_kwargs),
             )
 
         # TODO handle cancellation
@@ -218,6 +231,7 @@ async def GenerateStream(
             sampling_params, truncate_input_tokens, request.request.text,
             context)
 
+        adapter_kwargs = await self._validate_adapters(request, context)
         inputs = TextTokensPrompt(
             prompt=request.request.text,
             prompt_token_ids=input_ids
@@ -228,7 +242,8 @@ async def GenerateStream(
             # re-tokenized when `prompt_token_ids` is supplied
             inputs=inputs,
             sampling_params=sampling_params,
-            request_id=request_id
+            request_id=request_id,
+            **adapter_kwargs
         )
 
         resp_options = request.params.response
@@ -442,6 +457,19 @@ async def _validate_and_convert_params(
 
         return sampling_params, deadline
 
+    async def _validate_adapters(self,
+                                 request: Union[SingleGenerationRequest,
+                                                BatchedGenerationRequest],
+                                 context: ServicerContext) \
+            -> Dict[str, LoRARequest]:
+        try:
+            adapters = await validate_adapters(
+                request=request, adapter_store=self.adapter_store)
+        except ValueError as e:
+            service_metrics.count_request_failure(FailureReasonLabel.VALIDATION)
+            await context.abort(StatusCode.INVALID_ARGUMENT, str(e))
+        return adapters
+
     @staticmethod
     def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
                         time_limit_reached: bool
diff --git a/vllm/entrypoints/grpc/validation.py b/vllm/entrypoints/grpc/validation.py
@@ -1,3 +1,4 @@
+import typing
 from enum import Enum
 
 from vllm import SamplingParams
@@ -39,8 +40,13 @@ class TGISValidationError(str, Enum):
 
     # Additions that are _not_ in TGIS
     TopN = "top_n_tokens ({0}) must be <= {1}"
+    AdapterNotFound = "can't retrieve adapter with id '{0}': {1}"
+    AdaptersDisabled = "adapter_id supplied but no adapter store was configured"
+    AdapterUnsupported = "adapter type {0} is not currently supported"
+    InvalidAdapterID = ("Invalid adapter id '{0}', must contain only "
+                        "alphanumeric, _ and - and /")
 
-    def error(self, *args, **kwargs):
+    def error(self, *args, **kwargs) -> typing.NoReturn:
         """Raises a ValueError with a nicely formatted string"""
         raise ValueError(self.value.format(*args, **kwargs))
 
diff --git a/vllm/tgis_utils/args.py b/vllm/tgis_utils/args.py
@@ -82,6 +82,8 @@ def add_tgis_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
     parser.add_argument('--tls-key-path', type=str)
     # map to ssl_ca_certs
     parser.add_argument('--tls-client-ca-cert-path', type=str)
+    # add a path when peft adapters will be loaded from
+    parser.add_argument('--adapter-cache', type=str)
 
     # TODO check/add other args here
 
diff --git a/vllm/tgis_utils/logs.py b/vllm/tgis_utils/logs.py
@@ -41,6 +41,7 @@ def log_response(
         response=response,
         params=request.params,
         prefix_id=request.prefix_id,
+        adapter_id=request.adapter_id,
         engine_metrics=engine_metrics,
         start_time=start_time,
         kind_log=kind_log,
@@ -57,6 +58,7 @@ def log_error(request: Union[BatchedGenerationRequest,
     # of just logging the simple string representation of the error
     param_str = text_format.MessageToString(request.params, as_one_line=True)
     prefix_id = request.prefix_id
+    adapter_id = request.adapter_id
 
     if isinstance(request, BatchedGenerationRequest):
         method_str = "generate"
@@ -69,13 +71,14 @@ def log_error(request: Union[BatchedGenerationRequest,
     input_chars = sum(len(input_) for input_ in inputs)
 
     span_str = (f"{method_str}{{input={short_input} prefix_id={prefix_id} "
-                f"input_chars=[{input_chars}] params={param_str}")
+                f"adapter_id={adapter_id} input_chars=[{input_chars}] "
+                f"params={param_str}")
 
     logger.error("%s: %s", span_str, exception_str)
 
 
 def _log_response(inputs: List[str], params: Parameters, prefix_id: str,
-                  response: GenerationResponse,
+                  adapter_id: str, response: GenerationResponse,
                   engine_metrics: Optional[RequestMetrics], start_time: float,
                   kind_log: str, method_str: str, logger: logging.Logger):
     """Logs responses similar to how the TGIS server does"""
@@ -99,6 +102,7 @@ def _log_response(inputs: List[str], params: Parameters, prefix_id: str,
 
     paramstr = text_format.MessageToString(params, as_one_line=True)
     span_str = (f"{method_str}{{input={short_input} prefix_id={prefix_id} "
+                f"adapter_id={adapter_id} "
                 f"input_chars=[{input_chars}] params={paramstr} "
                 f"tokenization_time={tokenization_time * 1e3:.2f}ms "
                 f"queue_time={queue_time * 1e3:.2f}ms "