Merge pull request #17 from runpod-workers/fx-con

fix dynamic batching
runpod-workers · Jan 6, 2025 · e991704 · e991704
2 parents 95109c3 + 22919bd
commit e991704
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 1 deletion.
diff --git a/src/handler.py b/src/handler.py
@@ -3,13 +3,29 @@
 from engine import SGlangEngine
 from utils import process_response
 import runpod
+import os
 
 # Initialize the engine
 engine = SGlangEngine()
 engine.start_server()
 engine.wait_for_server()
 
 
+def get_max_concurrency(default=300):
+    """
+    Returns the maximum concurrency value.
+    Uses the 'MAX_CONCURRENCY' environment variable when it is set; otherwise falls back to `default` (300 unless the caller passes another value).
+
+    Args:
+        default (int): The default concurrency value if the environment variable is not set.
+
+    Returns:
+        int: The maximum concurrency value.
+    """
+    return int(os.getenv('MAX_CONCURRENCY', default))
+
+
+
 async def async_handler(job):
     """Handle the requests asynchronously."""
     job_input = job["input"]
@@ -41,4 +57,4 @@ async def async_handler(job):
         else:
             yield {"error": f"Generate request failed with status code {response.status_code}", "details": response.text}
 
-runpod.serverless.start({"handler": async_handler, "return_aggregate_stream": True})
+runpod.serverless.start({"handler": async_handler, "concurrency_modifier": get_max_concurrency, "return_aggregate_stream": True})
diff --git a/worker-config.json b/worker-config.json
@@ -23,6 +23,7 @@
             "PORT",
             "ADDITIONAL_PORTS",
             "API_KEY",
+            "MAX_CONCURRENCY",
             "LOG_LEVEL",
             "LOG_LEVEL_HTTP",
             "FILE_STORAGE_PTH"
@@ -90,6 +91,7 @@
             "PORT",
             "ADDITIONAL_PORTS",
             "API_KEY",
+            "MAX_CONCURRENCY",
             "LOG_LEVEL",
             "LOG_LEVEL_HTTP",
             "FILE_STORAGE_PTH"
@@ -371,6 +373,14 @@
       "required": false,
       "type": "text"
     },
+    "MAX_CONCURRENCY": {
+    "env_var_name": "MAX_CONCURRENCY",
+    "value": "300",
+    "title": "Max Concurrency",
+    "description": "Max concurrent requests per worker. SGLang has an internal queue, so you don't have to worry about limiting by VRAM; this setting is for improving scaling/load-balancing efficiency.",
+    "required": false,
+    "type": "number"
+    },
     "FILE_STORAGE_PTH": {
       "env_var_name": "FILE_STORAGE_PTH",
       "value": "",