diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh index 57489df64f51..4a22be1da33c 100644 --- a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh @@ -104,6 +104,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ --ec-transfer-config '{ "ec_connector": "ECSharedStorageConnector", "ec_role": "ec_producer", + "ec_buffer_device": "cuda", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" } @@ -128,6 +129,7 @@ vllm serve "$MODEL" \ --ec-transfer-config '{ "ec_connector": "ECSharedStorageConnector", "ec_role": "ec_consumer", + "ec_buffer_device": "cuda", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" } diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh index 6073e0580b11..344f4a2ffe58 100644 --- a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh @@ -98,6 +98,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ --ec-transfer-config '{ "ec_connector": "ECSharedStorageConnector", "ec_role": "ec_producer", + "ec_buffer_device": "cuda", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" } @@ -119,6 +120,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \ --ec-transfer-config '{ "ec_connector": "ECSharedStorageConnector", "ec_role": "ec_consumer", + "ec_buffer_device": "cuda", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" } diff --git a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py b/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py index c8388141dcc9..77211ae3b344 100644 --- a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +++ b/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py @@ -51,10 +51,13 @@ def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole): # req_id -> index self._mm_datas_need_loads: dict[str, int] = {} transfer_config = vllm_config.ec_transfer_config + self.device = "cuda" if transfer_config is not None: self._storage_path = transfer_config.get_from_extra_config( "shared_storage_path", "/tmp" ) + if transfer_config.ec_buffer_device is not None: + self.device = transfer_config.ec_buffer_device logger.debug(transfer_config) logger.debug("Shared storage path is %s", self._storage_path) else: @@ -91,7 +94,7 @@ def start_load_caches(self, encoder_cache, **kwargs) -> None: if mm_data.mm_hash in encoder_cache: continue filename = self._generate_filename_debug(mm_data.mm_hash) - ec_cache = safetensors.torch.load_file(filename)["ec_cache"].cuda() + ec_cache = safetensors.torch.load_file(filename)["ec_cache"].to(self.device) encoder_cache[mm_data.mm_hash] = ec_cache logger.debug("Success load encoder cache for hash %s", mm_data.mm_hash)