35 changes: 35 additions & 0 deletions python/ray/llm/_internal/serve/core/ingress/builder.py
@@ -134,3 +134,38 @@ def build_openai_app(builder_config: dict) -> Application:
    return serve.deployment(ingress_cls, **ingress_options).bind(
        llm_deployments=llm_deployments, **ingress_cls_config.ingress_extra_kwargs
    )


def build_sglang_openai_app(builder_config: dict) -> Application:
    """Build an OpenAI-compatible app whose LLM deployments run on SGLang,
    from the given builder configuration.

    Args:
        builder_config: The configuration for the builder. It has to conform
            to the LLMServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application router.
    """

    builder_config = LLMServingArgs.model_validate(builder_config)
    llm_configs = builder_config.llm_configs

    llm_deployments = [build_sglang_deployment(c) for c in llm_configs]

    ingress_cls_config = builder_config.ingress_cls_config
    ingress_options = ingress_cls_config.ingress_cls.get_deployment_options(llm_configs)

    if builder_config.ingress_deployment_config:
        ingress_options = deep_merge_dicts(
            ingress_options, builder_config.ingress_deployment_config
        )

    ingress_cls = make_fastapi_ingress(ingress_cls_config.ingress_cls)

    logger.info("============== Ingress Options ==============")
    logger.info(pprint.pformat(ingress_options))

    return serve.deployment(ingress_cls, **ingress_options).bind(
        llm_deployments=llm_deployments, **ingress_cls_config.ingress_extra_kwargs
    )
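
For context, a minimal sketch of how this builder might be invoked directly. The llm_configs key comes from the LLMServingArgs model used above; the model_loading_config values are hypothetical placeholders, and the sketch assumes LLMServingArgs fills in its default ingress settings:

app = build_sglang_openai_app(
    {
        "llm_configs": [
            {
                # Hypothetical model id and source, for illustration only.
                "model_loading_config": {
                    "model_id": "llama-3.1-8b-instruct",
                    "model_source": "/path/to/local/model",
                },
            }
        ]
    }
)
serve.run(app)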
49 changes: 49 additions & 0 deletions python/ray/llm/_internal/serve/core/server/builder.py
@@ -12,6 +12,8 @@
    LLMConfig,
)
from ray.llm._internal.serve.core.server.llm_server import LLMServer
from ray.llm._internal.serve.core.server.engines.sglang.sglang_engine import SGLangServer

from ray.llm._internal.serve.observability.logging import get_logger
from ray.serve.deployment import Application

@@ -76,3 +78,50 @@ def build_llm_deployment(
    return serve.deployment(deployment_cls, **deployment_options).bind(
        llm_config=llm_config, **bind_kwargs
    )

def build_sglang_deployment(
    llm_config: LLMConfig,
    *,
    name_prefix: Optional[str] = None,
    bind_kwargs: Optional[dict] = None,
    override_serve_options: Optional[dict] = None,
    deployment_cls: Optional[Type[LLMServer]] = None,
) -> Application:
    """Build an SGLang-backed LLM server deployment.

    Args:
        llm_config: The LLMConfig to build the deployment.
        name_prefix: The prefix to add to the deployment name.
        bind_kwargs: The optional extra kwargs to pass to the deployment.
            Used for customizing the deployment.
        override_serve_options: The optional serve options to override the
            default options.
        deployment_cls: The deployment class to use. Defaults to LLMServer.

    Returns:
        The Ray Serve Application for the LLMServer deployment.
    """
    deployment_cls = SGLangServer  # deployment_cls or LLMServer
Bug: Misleading Deployment Class Behavior

The deployment_cls parameter is accepted but completely ignored, always being overridden to SGLangServer. This creates misleading API behavior where callers might pass a custom deployment class expecting it to be used, but it will be silently ignored.

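A minimal sketch of a fix, following the suggested-change convention used elsewhere in this review (keeps SGLangServer as the default while honoring a caller-supplied class):

Suggested change
    deployment_cls = SGLangServer  # deployment_cls or LLMServer
    deployment_cls = deployment_cls or SGLangServer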

    name_prefix = name_prefix or f"{deployment_cls.__name__}:"
    bind_kwargs = bind_kwargs or {}

    deployment_options = deployment_cls.get_deployment_options(llm_config)
    # Set the name of the deployment config to map to the model ID.
    deployment_name = deployment_options.get("name", _get_deployment_name(llm_config))

    if name_prefix:
        deployment_options["name"] = name_prefix + deployment_name

    if override_serve_options:
        deployment_options.update(override_serve_options)

    deployment_options = deep_merge_dicts(
        DEFAULT_DEPLOYMENT_OPTIONS, deployment_options
    )

    logger.info("============== Deployment Options ==============")
    logger.info(pprint.pformat(deployment_options))

    return serve.deployment(deployment_cls, **deployment_options).bind(
        llm_config=llm_config, **bind_kwargs
    )
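
A short sketch of calling this builder directly (my_llm_config stands in for a real LLMConfig instance and is hypothetical):

sglang_app = build_sglang_deployment(my_llm_config, name_prefix="SGLang:")
serve.run(sglang_app)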
50 changes: 50 additions & 0 deletions python/ray/llm/_internal/serve/engines/sglang/sglang_engine.py
@@ -0,0 +1,50 @@
import ray
import requests
Contributor

medium

The requests module is imported but never used. It should be removed to keep the code clean.

from ray import serve
from ray.serve.handle import DeploymentHandle

# @serve.deployment is intentionally not applied here; the builder wraps this class with serve.deployment().
class SGLangServer:
    def __init__(self, llm_config: LLMConfig):

        default_engine_kwargs = dict(
            model_path = "/scratch2/huggingface/hub/meta-llama/Llama-3.1-8B-Instruct/",
Bug: Hardcoded Dev Path Breaks Production Deployment

Hardcoded personal development path /scratch2/huggingface/hub/meta-llama/Llama-3.1-8B-Instruct/ that is specific to a development environment and will not work for other users or in production. This appears to be accidentally committed personal configuration.

Author

It is only for demo usage; the hardcoded personal development path is intentional here.
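
If this moves beyond a demo, the path could come from the config instead of being hardcoded. A rough sketch, assuming the model source is exposed on the LLMConfig via model_loading_config.model_source (that field name is an assumption, not something this diff shows):

Suggested change
            model_path = "/scratch2/huggingface/hub/meta-llama/Llama-3.1-8B-Instruct/",
            model_path = llm_config.model_loading_config.model_source,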

            mem_fraction_static = 0.8,
            tp_size = 8,
        )

        if llm_config.engine_kwargs:
            default_engine_kwargs.update(llm_config.engine_kwargs)
        self.engine_kwargs = default_engine_kwargs

        try:
            import sglang
        except ImportError as e:
            raise ImportError(
                "SGLang is not installed or failed to import. Please run "
                "`pip install sglang[all]` to install required dependencies."
            ) from e
        self.engine = sglang.Engine(**self.engine_kwargs)

    async def chat(self, message: str):
        print('In SGLangServer CHAT with message', message)
Contributor

medium

Using print for logging is generally discouraged in library code. It's better to use the logging module, which allows for configurable log levels, formatting, and output streams. You can add import logging and logger = logging.getLogger(__name__) at the top of the file.

Suggested change
print('In SGLangServer CHAT with message', message)
logger.info(f'In SGLangServer CHAT with message: {message}')

Author

This can be ignored.

        res = await self.engine.async_generate(
            prompt = message,
            stream = False
        )
Comment on lines +31 to +34
Contributor

medium

According to PEP 8, there should be no spaces around the equals sign for keyword arguments.

Suggested change
res = await self.engine.async_generate(
prompt = message,
stream = False
)
res = await self.engine.async_generate(
prompt=message,
stream=False
)

        return {"echo": res}

    @classmethod
    def get_deployment_options(cls, llm_config: "LLMConfig"):
        return {
            'autoscaling_config': {'min_replicas': 1, 'max_replicas': 1},
            'placement_group_bundles': [
                {'CPU': 1, 'GPU': 1, 'accelerator_type:H100': 0.001},
                {'GPU': 1, 'accelerator_type:H100': 0.001},
            ],
            'placement_group_strategy': 'PACK',
            'ray_actor_options': {
                'runtime_env': {
                    'worker_process_setup_hook': 'ray.llm._internal.serve._worker_process_setup_hook'
                }
            },
        }

#sglangServer = SGLangServer.bind()
#my_App = MyFastAPIDeployment.bind(sglangServer)
#handle: DeploymentHandle = serve.run(my_App, blocking = True)
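
For reference, a small sketch of exercising the chat method through a deployment handle, mirroring the commented-out lines above (serve.run returning a handle and handle.chat.remote(...) are standard Ray Serve usage; my_llm_config is a hypothetical LLMConfig instance):

app = serve.deployment(SGLangServer).bind(llm_config=my_llm_config)
handle: DeploymentHandle = serve.run(app)
print(handle.chat.remote("Hello!").result()["echo"])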