
Commit ffeb18f

fix bug done
1 parent 41ebfec commit ffeb18f

File tree

3 files changed: 143 additions & 5 deletions

src/parallax/server/executor.py

Lines changed: 86 additions & 0 deletions
@@ -446,6 +446,57 @@ def recv_requests_from_peer(self) -> List[Request]:

         return recv_reqs

+    def _compute_expected_intermediate_tokens(self, scheduler_output: Any) -> Optional[int]:
+        """Estimate the padded token count expected by vLLM for this batch."""
+        if scheduler_output is None:
+            return None
+
+        total_tokens = getattr(scheduler_output, "total_num_scheduled_tokens", None)
+        if total_tokens is None:
+            return None
+
+        try:
+            total_tokens = int(total_tokens)
+        except (TypeError, ValueError):
+            return None
+
+        model_runner = getattr(self, "model_runner", None)
+        if model_runner is None:
+            return None
+
+        get_num_input_tokens = getattr(model_runner, "_get_num_input_tokens", None)
+        get_dp_padding = getattr(model_runner, "get_dp_padding", None)
+        if get_num_input_tokens is None or get_dp_padding is None:
+            return None
+
+        num_input_tokens = get_num_input_tokens(total_tokens)
+        num_pad, _ = get_dp_padding(num_input_tokens)
+        return num_input_tokens + num_pad
+
+    @staticmethod
+    def _pad_or_trim_tensor(tensor: torch.Tensor, target_len: int) -> torch.Tensor:
+        if target_len < 0:
+            return tensor
+        current_len = tensor.shape[0]
+        if current_len == target_len:
+            return tensor
+        if current_len > target_len:
+            return tensor[:target_len]
+        pad_shape = (target_len - current_len,) + tensor.shape[1:]
+        pad = tensor.new_zeros(pad_shape)
+        return torch.cat((tensor, pad), dim=0)
+
+    def _resize_intermediate_tensors(self, intermediate_tensors, target_len: Optional[int]):
+        if intermediate_tensors is None or target_len is None:
+            return intermediate_tensors
+        if target_len < 0:
+            return intermediate_tensors
+
+        # Create a list to avoid "dictionary changed size during iteration".
+        for key, tensor in list(intermediate_tensors.items()):
+            intermediate_tensors[key] = self._pad_or_trim_tensor(tensor, target_len)
+        return intermediate_tensors
+
     def _prepare_cuda_prefill_batch(self, batched_requests: List[Request]) -> Dict[str, Any]:
         """
         Prepares inputs for CUDA backends from a batch of prefill requests.
@@ -459,6 +510,7 @@ def _prepare_cuda_prefill_batch(self, batched_requests: List[Request]) -> Dict[s

         # Prepare PP proxy tensors (common for both backends when not first peer)
         pp_proxy_tensors = None
+        pp_proxy_initial_tokens = None
         if not self.is_first_peer:
             # Concatenate hidden states from all requests
             # For vLLM, we need to flatten to (total_tokens, hidden_size)
@@ -478,6 +530,7 @@ def _prepare_cuda_prefill_batch(self, batched_requests: List[Request]) -> Dict[s

             # Concatenate along sequence dimension to get (total_tokens, hidden_size)
             hidden_states = torch.cat(hidden_states_list, dim=0)
+            pp_proxy_initial_tokens = hidden_states.shape[0]

             # Create residual tensor with same shape
             residual = torch.zeros(
@@ -515,6 +568,29 @@ def _prepare_cuda_prefill_batch(self, batched_requests: List[Request]) -> Dict[s

         schedule_outputs_prefill = form_vllm_batch_prefill(batched_requests, self.model_runner)

+        if not self.is_first_peer and pp_proxy_tensors is not None:
+            target_tokens = self._compute_expected_intermediate_tokens(schedule_outputs_prefill)
+            if target_tokens is not None:
+                before = pp_proxy_tensors["hidden_states"].shape[0]
+                pp_proxy_tensors = self._resize_intermediate_tensors(
+                    pp_proxy_tensors, target_tokens
+                )
+                after = pp_proxy_tensors["hidden_states"].shape[0]
+                if after != before:
+                    logger.debug(
+                        "PP Proxy: resized hidden_states from %d to %d tokens (requested=%s, initial=%s)",
+                        before,
+                        after,
+                        target_tokens,
+                        pp_proxy_initial_tokens,
+                    )
+
+        if not self.is_first_peer and pp_proxy_tensors is not None:
+            logger.debug(
+                "PP Proxy: hidden_states shape after adjustment: %s",
+                tuple(pp_proxy_tensors["hidden_states"].shape),
+            )
+
         ret = {
             "scheduler_output": schedule_outputs_prefill,
             "pp_proxy_tensors": pp_proxy_tensors,
@@ -572,6 +648,7 @@ def _prepare_cuda_decode_batch(self, batched_requests: List[Request]) -> Dict[st

             # Concatenate along sequence dimension to get (total_tokens, hidden_size)
             hidden_states = torch.cat(hidden_states_list, dim=0)
+            pp_proxy_initial_tokens = hidden_states.shape[0]

             # Create residual tensor with same shape
             residual = torch.zeros(
@@ -918,6 +995,10 @@ def _handle_cuda_input_requests(self, requests: List[Request]):

             assert req.next_token_id is not None
             original_req.commit_new_token(req.next_token_id)
+            logger.debug(
+                f"[FirstPeer-CUDA] Committed token {req.next_token_id} for {req.request_id}, "
+                f"output_ids now has {len(original_req.output_ids)} tokens"
+            )
             if len(req.routing_table) > 0:
                 original_req.routing_table = req.routing_table

@@ -1102,6 +1183,8 @@ def _prepare_next_single_request(self, request: Request, hidden_states: Any) ->
             assert isinstance(
                 request, IntermediateRequest
             ), "Last peer must receive an IntermediateRequest."
+            logger.info(f"hidden_states shape: {hidden_states.shape}")
+            logger.info(f"hidden_states: {hidden_states}")
             if self.device == "cuda":
                 assert hidden_states.dtype in (
                     torch.int64,
@@ -1143,6 +1226,7 @@ def _prepare_next_batch_requests(
         for i, src_request in enumerate(requests):
             if self.is_last_peer:
                 # Last peer gets a 1D array of token IDs
+                logger.info(f"hidden_states: {hidden_states}")
                 hidden_state_for_req = hidden_states[i : i + 1]
             else:
                 # Other peers get a 3D array of hidden states
@@ -1217,6 +1301,7 @@ def _process_batch_cuda(
         import torch

         sampled_token_ids = output.sampled_token_ids
+        logger.info(f"sampled_token_ids: {sampled_token_ids}")
         if isinstance(sampled_token_ids, list) and len(sampled_token_ids) > 0:
             # Convert to tensor: pad sequences to same length
             max_len = max(len(seq) for seq in sampled_token_ids)
@@ -1498,6 +1583,7 @@ def run_loop(self):
                 output = self.process_batch(
                     prepared_inputs, return_decoded_tokens=self.is_last_peer
                 )
+                logger.info(f"output: {output}")
                 # Update metrics with per-layer latency sample (throttled by decode steps)
                 if batch_type == "decode_batch":
                     try:
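For readers who want to sanity-check the new padding/trimming helpers outside the executor, here is a small standalone sketch. It is not part of the commit; the function name pad_or_trim and the example shapes are made up, and it simply mirrors the first-dimension pad-or-trim behaviour of _pad_or_trim_tensor:

import torch

def pad_or_trim(tensor: torch.Tensor, target_len: int) -> torch.Tensor:
    """Zero-pad or trim along dim 0 so the tensor ends up with target_len rows."""
    if target_len < 0 or tensor.shape[0] == target_len:
        return tensor
    if tensor.shape[0] > target_len:
        return tensor[:target_len]
    pad = tensor.new_zeros((target_len - tensor.shape[0],) + tensor.shape[1:])
    return torch.cat((tensor, pad), dim=0)

hidden = torch.randn(5, 4096)        # 5 tokens of hidden states
print(pad_or_trim(hidden, 8).shape)  # torch.Size([8, 4096]) -- zero-padded
print(pad_or_trim(hidden, 3).shape)  # torch.Size([3, 4096]) -- trimmed

This is the mechanism the executor uses to reconcile the hidden-state buffer received from the previous pipeline stage with the padded token count it computes for the batch.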

src/parallax/vllm/batch_info.py

Lines changed: 38 additions & 2 deletions
@@ -159,6 +159,7 @@ def form_vllm_batch_prefill(
 def form_vllm_batch_decode(
     batched_requests: List[Request],
     model_runner: Any = None,
+    scheduler: Any = None,
 ) -> Optional[SchedulerOutput]:
     if not batched_requests:
         return None
@@ -183,7 +184,32 @@ def form_vllm_batch_decode(
     for req in batched_requests:
         req_ids.append(req.request_id)
         resumed_from_preemption.append(False)
+
+        # For GPU workers (non-first peer), IntermediateRequest doesn't have output_ids
+        # We need to get it from vLLM's CachedRequestState in model_runner
         output_ids = getattr(req, "output_ids", None) or []
+
+        # If this request doesn't have output_ids (IntermediateRequest case),
+        # try to get it from model_runner's cached request state (vLLM internal state)
+        if not output_ids and hasattr(model_runner, "requests"):
+            cached_req_state = model_runner.requests.get(req.request_id)
+            if cached_req_state is not None:
+                output_ids = getattr(cached_req_state, "output_token_ids", [])
+                logger.debug(
+                    f"[Decode] Retrieved output_token_ids from vLLM CachedRequestState for "
+                    f"{req.request_id}: len={len(output_ids)}"
+                )
+
+        # Fallback: try scheduler if available
+        if not output_ids and scheduler is not None:
+            running_req = scheduler.get_running_request(req.request_id)
+            if running_req is not None:
+                output_ids = getattr(running_req, "output_ids", None) or []
+                logger.debug(
+                    f"[Decode] Retrieved output_ids from scheduler for {req.request_id}: "
+                    f"len={len(output_ids)}"
+                )
+
         if output_ids:
             last_token = output_ids[-1]
             new_token_ids.append([last_token])
@@ -196,13 +222,23 @@ def form_vllm_batch_decode(
         vllm_req = _build_vllm_request(req, sampling_params, model_runner, include_outputs=True)

         prompt_ids = getattr(req, "input_ids", None) or []
-        output_ids = getattr(req, "output_ids", None) or []
+        # For decode stage, computed_token_count should be the total number of tokens
+        # that have been processed (including all output tokens).
+        # In pipeline parallelism, this must match what GPU worker expects.
         if output_ids:
-            computed_token_count = len(prompt_ids) + len(output_ids) - 1
+            # All tokens (prompt + all generated outputs) have been computed
+            computed_token_count = len(prompt_ids) + len(output_ids)
         else:
+            # First decode step: only prompt has been computed
             computed_token_count = len(prompt_ids)
         vllm_req.num_computed_tokens = computed_token_count

+        # Debug logging to track state synchronization
+        logger.debug(
+            f"[Decode] req_id={req.request_id}, prompt_len={len(prompt_ids)}, "
+            f"output_len={len(output_ids)}, computed_tokens={computed_token_count}"
+        )
+
         new_blocks = kv_cache_manager.allocate_slots(
             request=vllm_req,
             num_new_tokens=1,
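A quick way to see what the computed-token change above does (dropping the "- 1") is to isolate the accounting in a tiny helper. The sketch below is not part of the commit; the name computed_token_count is hypothetical and simply mirrors the branch added in the hunk:

from typing import List

def computed_token_count(prompt_ids: List[int], output_ids: List[int]) -> int:
    """Tokens already processed before this decode step."""
    if output_ids:
        # Prompt plus every token generated so far.
        return len(prompt_ids) + len(output_ids)
    # First decode step: only the prompt has been computed.
    return len(prompt_ids)

# Example: a 7-token prompt with 3 generated tokens.
assert computed_token_count(list(range(7)), [11, 12, 13]) == 10  # was 9 with the old "- 1"
assert computed_token_count(list(range(7)), []) == 7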

src/parallax/vllm/model_runner.py

Lines changed: 19 additions & 3 deletions
@@ -319,7 +319,21 @@ def custom_get_pp_indices(num_layers: int, rank: int, world_size: int):
         finally:
             vllm.distributed.utils.get_pp_indices = original_get_pp_indices

-        logger.debug("Model loaded successfully with partial layers")
+    def execute_model(self, scheduler_output, intermediate_tensors=None):
+        """
+        Execute the model with the given scheduler output and intermediate tensors.
+        If this is not the first peer, and the intermediate_tensors buffer is not initialized,
+        initialize it.
+        """
+        if not self.is_first_peer and self.intermediate_tensors is None:
+            self.intermediate_tensors = self.model.make_empty_intermediate_tensors(
+                batch_size=self.max_num_tokens,
+                dtype=self.model_config.dtype,
+                device=self.device,
+            )
+            logger.debug("Successfully initialized intermediate_tensors buffer")
+
+        return super().execute_model(scheduler_output, intermediate_tensors)


 def initialize_vllm_model_runner(
@@ -348,15 +362,17 @@ def initialize_vllm_model_runner(
     config = load_config(model_path)
     tokenizer = load_tokenizer(model_path, eos_token_ids=config.get("eos_token_id", None))
     dtype = config.get("torch_dtype", "bfloat16")
-
+
     num_hidden_layers = config.get("num_hidden_layers")
     is_first_peer = start_layer == 0
     is_last_peer = end_layer == num_hidden_layers

     # Apply Parallax vLLM monkey patches for pipeline parallelism
     try:
         apply_parallax_vllm_monkey_patch(is_first_stage=is_first_peer, is_last_stage=is_last_peer)
-        logger.debug(f"Applied Parallax vLLM monkey patches: is_first_stage={is_first_peer}, is_last_stage={is_last_peer}")
+        logger.debug(
+            f"Applied Parallax vLLM monkey patches: is_first_stage={is_first_peer}, is_last_stage={is_last_peer}"
+        )
     except Exception as e:
         logger.warning("Failed to apply Parallax vLLM monkey patches: %s", e)

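The execute_model override in the first hunk of this file is a lazy-initialization pattern: a non-first pipeline stage allocates its intermediate-tensor buffer on the first call and then defers to the base class. A minimal, commit-independent sketch of the same pattern (all class, attribute, and method names here are illustrative, not the real vLLM runner API) could look like:

import torch

class LazyIntermediateRunner:
    """Toy stand-in for a pipeline-stage model runner; names are illustrative only."""

    def __init__(self, is_first_peer: bool, max_num_tokens: int, hidden_size: int):
        self.is_first_peer = is_first_peer
        self.max_num_tokens = max_num_tokens
        self.hidden_size = hidden_size
        self.intermediate_tensors = None

    def execute_model(self, scheduler_output, intermediate_tensors=None):
        # Non-first stages need a buffer for incoming hidden states; allocate it
        # lazily on the first call instead of at construction time.
        if not self.is_first_peer and self.intermediate_tensors is None:
            self.intermediate_tensors = {
                "hidden_states": torch.zeros(self.max_num_tokens, self.hidden_size),
                "residual": torch.zeros(self.max_num_tokens, self.hidden_size),
            }
        return self._forward(scheduler_output, intermediate_tensors)

    def _forward(self, scheduler_output, intermediate_tensors):
        # Stand-in for super().execute_model(...) in the real model runner.
        return scheduler_output, intermediate_tensors

Allocating on first use keeps first-stage peers, which never consume intermediate tensors, from holding a buffer they do not need.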
