Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions compose_rl/utils/ray_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,29 +236,3 @@ def is_cuda_visible_devices_set():
'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES',
'0',
) == '0'

# TODO: Since this uninstallation deals specifically with ray,
# added the function here instead of the regular utils.py file
# We need to investigate this further after the hackathon since
# this is a super hacky solution to support CPU workers
def uninstall_megablocks_if_exists() -> None:
    """Best-effort removal of megablocks from the current interpreter.

    Megablocks exists on the ray workers but is not supported on CPU.
    We need to uninstall it to avoid errors.

    The function runs ``pip uninstall megablocks -y`` with the current
    interpreter and then drops ``megablocks`` and every already-imported
    ``megablocks.*`` submodule from ``sys.modules``. The pip exit status is
    deliberately ignored (``check=False``): a missing package, or a failed
    uninstall, must not raise.

    Note: Installing `llm-foundry[all-cpu]` (which doesn't have megablocks)
    on the StreamingDatasetActor worker through ray runtime options
    doesn't seem to actually resolve this issue even though it's supposed
    to set up a new environment...
    TODO: Figure out why that's the case and if there's a better way to
    resolve this issue.
    """
    import importlib
    import subprocess
    import sys

    # First uninstall the megablocks package (if it exists). Output is
    # captured so pip's "not installed" warning does not pollute worker logs.
    command = [sys.executable, "-m", "pip", "uninstall", "megablocks", "-y"]
    subprocess.run(command, check=False, capture_output=True, text=True)

    # The set of files on disk may have changed underneath the running
    # interpreter; make the import finders forget their cached listings.
    importlib.invalidate_caches()

    # Then remove the package *and its submodules* from sys.modules. Deleting
    # only the top-level entry would leave e.g. ``megablocks.layers`` cached,
    # so ``from megablocks.layers import ...`` could still succeed against a
    # stale module. Collect names first: sys.modules must not be mutated
    # while it is being iterated.
    stale_names = [
        name for name in sys.modules
        if name == 'megablocks' or name.startswith('megablocks.')
    ]
    for name in stale_names:
        sys.modules.pop(name, None)
18 changes: 2 additions & 16 deletions test_single_controller_ppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
create_vllm_engines,
_vllm_generate,
)
from compose_rl.utils.ray_utils import start_ray_server, uninstall_megablocks_if_exists
from compose_rl.utils.ray_utils import start_ray_server
from compose_rl.controllers import BaseDistributedGPUActor, SPMDActorGroup
from compose_rl.controllers.buffer import Buffer
from compose_rl.algorithms.online.callback_utils import preprocess_batches
Expand Down Expand Up @@ -687,21 +687,7 @@ def _run_single_controller_ppo(
vllm_tensor_parallel_size=vllm_tensor_parallel_size,
pretrain_model_name=pretrain_model_name,
)

# We are using a CPU worker for the StreamingActor
# and this involves a super hacky workaround by
# uninstalling megablocks if it exists. Better solutions
# would include:
# 1) decouple StreamingActor from llm-foundry altogether
# 2) don't broadly import llm-foundry in compose-rl (only
# import it into codepaths/files that will only be used by
# GPUActors as opposed to CPUActors)
# 3) Setting up ray actors with correct environments (which
# would involve creating a BaseDistributedActor instead of a
# BaseDistributedGPUActor so that we can use CPUs)
# We uninstall megablocks after the Train Actors have been
# created so that those actors still have megablocks functionality.
uninstall_megablocks_if_exists()
# Creating a CPU worker for the StreamingDatasetActor
streaming_dataset_actor = ray.remote(num_gpus=0)(StreamingDatasetActor).remote()
rollout_agent = RolloutAgent(inference_server, streaming_dataset_actor)

Expand Down