 from tensorrt_llm.commands.bench import main
 from tensorrt_llm.functional import AllReduceStrategy
 
+# Needed since the LLM API uses an MPI executor pool internally for TP > 1, which leaks a thread on shutdown.
+pytestmark = pytest.mark.threadleak(enabled=False)
+
 
 class TimeoutError(Exception):
     """Exception raised when a test times out."""
@@ -55,6 +58,71 @@ def timeout_handler(signum, frame):
         signal.signal(signal.SIGALRM, old_handler)
 
 
+@pytest.fixture(scope="module", autouse=True)
+def prewarm_flashinfer_jit():
+    """Pre-warm FlashInfer JIT kernels before multi-GPU tests.
+
+    This prevents a race condition where multiple MPI ranks try to JIT-compile
+    FlashInfer kernels simultaneously in the same cache directory, causing Ninja
+    build failures such as "ninja: error: opening build log: No such file or directory".
+
+    Triggering the compilation in the main process first leaves the kernels
+    cached and available to all worker ranks.
+    """
+    try:
+        import flashinfer
+        import flashinfer.page
+        import flashinfer.sampling
+
+        if torch.cuda.is_available():
+            # Prevent concurrent JIT warmup across multiple pytest processes (e.g., xdist).
+            try:
+                import fcntl  # Linux-only
+            except ImportError:
+                fcntl = None
+
+            lock_f = None
+            if fcntl is not None:
+                import pathlib
+                import tempfile
+
+                lock_path = pathlib.Path(tempfile.gettempdir()) / "flashinfer_jit_prewarm.lock"
+                lock_f = open(lock_path, "w")
+                fcntl.flock(lock_f.fileno(), fcntl.LOCK_EX)
+            # Create dummy tensors to trigger kernel JIT compilation.
+            with torch.no_grad():
+                device = torch.device("cuda:0")
+
+                # Trigger page kernel compilation.
+                try:
+                    # Force module loading (this triggers JIT compilation).
+                    _ = flashinfer.page.gen_page_module()
+                except Exception as exc:  # noqa: BLE001
+                    import warnings
+
+                    warnings.warn(f"FlashInfer page-kernel prewarm failed: {exc!r}", RuntimeWarning)
+
+                # Trigger sampling kernel compilation.
+                try:
+                    dummy_probs = torch.softmax(torch.randn(1, 100, device=device), dim=-1)
+                    _ = flashinfer.sampling.sampling_from_probs(dummy_probs, deterministic=True)
+                except Exception as exc:  # noqa: BLE001
+                    import warnings
+
+                    warnings.warn(
+                        f"FlashInfer sampling-kernel prewarm failed: {exc!r}", RuntimeWarning
+                    )
+
+            torch.cuda.empty_cache()
+            if lock_f is not None:
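+                # Closing the lock file also releases the exclusive flock taken above.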
+                lock_f.close()
+
+    except ImportError:
+        pass  # FlashInfer not available
+
+    yield
+
+
 @pytest.fixture(scope="module")
 def shared_dataset(llm_root): # noqa: F811
     """Prepare dataset once for all tests in this module."""