From 1d93476809cfa03d125f85a243bedaaa36bca6ca Mon Sep 17 00:00:00 2001 From: root Date: Mon, 4 Aug 2025 18:11:28 +0000 Subject: [PATCH] removing megablocks uninstall --- compose_rl/utils/ray_utils.py | 26 -------------------------- test_single_controller_ppo.py | 18 ++---------------- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/compose_rl/utils/ray_utils.py b/compose_rl/utils/ray_utils.py index 021696b4..629ad817 100644 --- a/compose_rl/utils/ray_utils.py +++ b/compose_rl/utils/ray_utils.py @@ -236,29 +236,3 @@ def is_cuda_visible_devices_set(): 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES', '0', ) == '0' - -# TODO: Since this uninstallation deals specifically with ray, -# added the function here instead of the regular utils.py file -# We need to investigate this further after the hackathon since -# this is a super hacky solution to support CPU workers -def uninstall_megablocks_if_exists(): - """ - Megablocks exists on the ray workers but is not supported on CPU. - We need to uninstall it to avoid errors. - - Note: Installing `llm-foundry[all-cpu]` (which doesn't have megablocks) - on the StreamingDatasetActor worker through ray runtime options - doesn't seem to actually resolve this issue even though it's supposed - to set up a new environment... - TODO: Figure out why that's the case and if there's a better way to - resolve this issue. - """ - import sys - import subprocess - - # First uninstall megablocks package (if it exists) - command = [sys.executable, "-m", "pip", "uninstall", "megablocks", "-y"] - subprocess.run(command, check=False, capture_output=True, text=True) - # Then remove from sys.modules if present - if 'megablocks' in sys.modules: - del sys.modules['megablocks'] diff --git a/test_single_controller_ppo.py b/test_single_controller_ppo.py index 608e0286..f91dca72 100644 --- a/test_single_controller_ppo.py +++ b/test_single_controller_ppo.py @@ -37,7 +37,7 @@ create_vllm_engines, _vllm_generate, ) -from compose_rl.utils.ray_utils import start_ray_server, uninstall_megablocks_if_exists +from compose_rl.utils.ray_utils import start_ray_server from compose_rl.controllers import BaseDistributedGPUActor, SPMDActorGroup from compose_rl.controllers.buffer import Buffer from compose_rl.algorithms.online.callback_utils import preprocess_batches @@ -687,21 +687,7 @@ def _run_single_controller_ppo( vllm_tensor_parallel_size=vllm_tensor_parallel_size, pretrain_model_name=pretrain_model_name, ) - - # We are using a CPU worker for the StreamingActor - # and this involves a super hacky workaround by - # uninstalling megablocks if it exists. Better solutions - # would include: - # 1) decouple StreamingActor from llm-foundry altogether - # 2) don't broadly import llm-foundry in compose-rl (only - # import it into codepaths/files that will only be used by - # GPUActors as opposed to CPUActors) - # 3) Setting up ray actors with correct environments (which - # would involve creating a BaseDistributedActor instead of a - # BaseDistributedGPUActor so that we can use CPUs) - # We uninstall megablocks after the Train Actors have been - # created so that those actors still have megablocks functionality. - uninstall_megablocks_if_exists() + # Creating a CPU worker for the StreamingDatasetActor streaming_dataset_actor = ray.remote(num_gpus=0)(StreamingDatasetActor).remote() rollout_agent = RolloutAgent(inference_server, streaming_dataset_actor)