diff --git a/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py b/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py index 786c953b0fd..57494abc457 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py +++ b/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py @@ -491,7 +491,7 @@ def sample( # compute unnecessarily softmax also for situations allowing # flashinfer.sampling...._sampling_from_logits. # indices=group_logit_indices, - filter_apply_order="top_k_first", + filter_apply_order="joint", deterministic=True, check_nan=self._flashinfer_check_nans(logits), generator=generator,