35 changes: 35 additions & 0 deletions python/ray/llm/_internal/serve/core/ingress/builder.py
@@ -134,3 +134,38 @@ def build_openai_app(builder_config: dict) -> Application:
    return serve.deployment(ingress_cls, **ingress_options).bind(
        llm_deployments=llm_deployments, **ingress_cls_config.ingress_extra_kwargs
    )


def build_sglang_openai_app(builder_config: dict) -> Application:
    """Build an OpenAI-compatible app whose LLM deployments run on SGLang,
    from the given builder configuration.

    Args:
        builder_config: The configuration for the builder. It has to conform
            to the LLMServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application router.
    """

    builder_config = LLMServingArgs.model_validate(builder_config)
    llm_configs = builder_config.llm_configs

    llm_deployments = [build_sglang_deployment(c) for c in llm_configs]

    ingress_cls_config = builder_config.ingress_cls_config
    ingress_options = ingress_cls_config.ingress_cls.get_deployment_options(llm_configs)

    if builder_config.ingress_deployment_config:
        ingress_options = deep_merge_dicts(
            ingress_options, builder_config.ingress_deployment_config
        )

    ingress_cls = make_fastapi_ingress(ingress_cls_config.ingress_cls)

    logger.info("============== Ingress Options ==============")
    logger.info(pprint.pformat(ingress_options))

    return serve.deployment(ingress_cls, **ingress_options).bind(
        llm_deployments=llm_deployments, **ingress_cls_config.ingress_extra_kwargs
    )
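
For context, a minimal sketch of how this builder might be invoked directly. The llm_configs key comes from the LLMServingArgs model used above; the model_loading_config values are hypothetical placeholders, and the sketch assumes LLMServingArgs fills in its default ingress settings:

app = build_sglang_openai_app(
    {
        "llm_configs": [
            {
                # Hypothetical model id and source, for illustration only.
                "model_loading_config": {
                    "model_id": "llama-3.1-8b-instruct",
                    "model_source": "/path/to/local/model",
                },
            }
        ]
    }
)
serve.run(app)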
49 changes: 49 additions & 0 deletions python/ray/llm/_internal/serve/core/server/builder.py
@@ -12,6 +12,8 @@
    LLMConfig,
)
from ray.llm._internal.serve.core.server.llm_server import LLMServer
from ray.llm._internal.serve.core.server.engines.sglang.sglang_engine import SGLangServer

from ray.llm._internal.serve.observability.logging import get_logger
from ray.serve.deployment import Application

@@ -76,3 +78,50 @@ def build_llm_deployment(
    return serve.deployment(deployment_cls, **deployment_options).bind(
        llm_config=llm_config, **bind_kwargs
    )

def build_sglang_deployment(
    llm_config: LLMConfig,
    *,
    name_prefix: Optional[str] = None,
    bind_kwargs: Optional[dict] = None,
    override_serve_options: Optional[dict] = None,
    deployment_cls: Optional[Type[LLMServer]] = None,
) -> Application:
    """Build an SGLang-backed LLM server deployment.

    Args:
        llm_config: The LLMConfig to build the deployment.
        name_prefix: The prefix to add to the deployment name.
        bind_kwargs: The optional extra kwargs to pass to the deployment.
            Used for customizing the deployment.
        override_serve_options: The optional serve options to override the
            default options.
        deployment_cls: The deployment class to use. Defaults to LLMServer.

    Returns:
        The Ray Serve Application for the LLMServer deployment.
    """
    deployment_cls = SGLangServer  # deployment_cls or LLMServer
Bug: Misleading Deployment Class Behavior

The deployment_cls parameter is accepted but completely ignored, always being overridden to SGLangServer. This creates misleading API behavior where callers might pass a custom deployment class expecting it to be used, but it will be silently ignored.

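A minimal sketch of a fix, following the suggested-change convention used elsewhere in this review (keeps SGLangServer as the default while honoring a caller-supplied class):

Suggested change
    deployment_cls = SGLangServer  # deployment_cls or LLMServer
    deployment_cls = deployment_cls or SGLangServer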

    name_prefix = name_prefix or f"{deployment_cls.__name__}:"
    bind_kwargs = bind_kwargs or {}

    deployment_options = deployment_cls.get_deployment_options(llm_config)
    # Set the name of the deployment config to map to the model ID.
    deployment_name = deployment_options.get("name", _get_deployment_name(llm_config))

    if name_prefix:
        deployment_options["name"] = name_prefix + deployment_name

    if override_serve_options:
        deployment_options.update(override_serve_options)

    deployment_options = deep_merge_dicts(
        DEFAULT_DEPLOYMENT_OPTIONS, deployment_options
    )

    logger.info("============== Deployment Options ==============")
    logger.info(pprint.pformat(deployment_options))

    return serve.deployment(deployment_cls, **deployment_options).bind(
        llm_config=llm_config, **bind_kwargs
    )
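
A short sketch of calling this builder directly (my_llm_config stands in for a real LLMConfig instance and is hypothetical):

sglang_app = build_sglang_deployment(my_llm_config, name_prefix="SGLang:")
serve.run(sglang_app)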
50 changes: 50 additions & 0 deletions python/ray/llm/_internal/serve/engines/sglang/sglang_engine.py
@@ -0,0 +1,50 @@
import ray
import requests
Contributor

medium

The requests module is imported but never used. It should be removed to keep the code clean.

from ray import serve
from ray.serve.handle import DeploymentHandle

# @serve.deployment is intentionally not applied here; the builder wraps this class with serve.deployment().
class SGLangServer:
    def __init__(self, llm_config: LLMConfig):

        default_engine_kwargs = dict(
            model_path = "/scratch2/huggingface/hub/meta-llama/Llama-3.1-8B-Instruct/",
Bug: Hardcoded Dev Path Breaks Production Deployment

Hardcoded personal development path /scratch2/huggingface/hub/meta-llama/Llama-3.1-8B-Instruct/ that is specific to a development environment and will not work for other users or in production. This appears to be accidentally committed personal configuration.

Author

It is only for demo usage; the hardcoded personal development path is intentional here.
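
If this moves beyond a demo, the path could come from the config instead of being hardcoded. A rough sketch, assuming the model source is exposed on the LLMConfig via model_loading_config.model_source (that field name is an assumption, not something this diff shows):

Suggested change
            model_path = "/scratch2/huggingface/hub/meta-llama/Llama-3.1-8B-Instruct/",
            model_path = llm_config.model_loading_config.model_source,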

            mem_fraction_static = 0.8,
            tp_size = 8,
        )

        if llm_config.engine_kwargs:
            default_engine_kwargs.update(llm_config.engine_kwargs)
        self.engine_kwargs = default_engine_kwargs

        try:
            import sglang
        except ImportError as e:
            raise ImportError(
                "SGLang is not installed or failed to import. Please run "
                "`pip install sglang[all]` to install required dependencies."
            ) from e
        self.engine = sglang.Engine(**self.engine_kwargs)

    async def chat(self, message: str):
        print('In SGLangServer CHAT with message', message)
Contributor

medium

Using print for logging is generally discouraged in library code. It's better to use the logging module, which allows for configurable log levels, formatting, and output streams. You can add import logging and logger = logging.getLogger(__name__) at the top of the file.

Suggested change
print('In SGLangServer CHAT with message', message)
logger.info(f'In SGLangServer CHAT with message: {message}')

Author

This can be ignored.

        res = await self.engine.async_generate(
            prompt = message,
            stream = False
        )
Comment on lines +31 to +34
Contributor

medium

According to PEP 8, there should be no spaces around the equals sign for keyword arguments.

Suggested change
res = await self.engine.async_generate(
prompt = message,
stream = False
)
res = await self.engine.async_generate(
prompt=message,
stream=False
)

        return {"echo": res}

    @classmethod
    def get_deployment_options(cls, llm_config: "LLMConfig"):
        return {
            'autoscaling_config': {'min_replicas': 1, 'max_replicas': 1},
            'placement_group_bundles': [
                {'CPU': 1, 'GPU': 1, 'accelerator_type:H100': 0.001},
                {'GPU': 1, 'accelerator_type:H100': 0.001},
            ],
            'placement_group_strategy': 'PACK',
            'ray_actor_options': {
                'runtime_env': {
                    'worker_process_setup_hook': 'ray.llm._internal.serve._worker_process_setup_hook'
                }
            },
        }

#sglangServer = SGLangServer.bind()
#my_App = MyFastAPIDeployment.bind(sglangServer)
#handle: DeploymentHandle = serve.run(my_App, blocking = True)
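
For reference, a small sketch of exercising the chat method through a deployment handle, mirroring the commented-out lines above (serve.run returning a handle and handle.chat.remote(...) are standard Ray Serve usage; my_llm_config is a hypothetical LLMConfig instance):

app = serve.deployment(SGLangServer).bind(llm_config=my_llm_config)
handle: DeploymentHandle = serve.run(app)
print(handle.chat.remote("Hello!").result()["echo"])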