_CALIBRATION_PORT = 11499
_KV_CACHE_MIN_STEP_MB = 1024.0  # binary search precision and safety margin
_KV_CACHE_VRAM_CAP_RATIO = 0.8  # fraction of total GPU VRAM used as KV search ceiling
# File name of the failed-command blacklist; calibrate_model() resolves it
# inside its log directory.
_FAILED_COMMANDS_FILE = "calibration_failed_commands.txt"
6364
6465# ---------------------------------------------------------------------------
6566# KV-cache size parsing
@@ -96,6 +97,47 @@ def _round_up_gb(mb: float) -> float:
9697 return math .ceil (mb / 1024.0 ) * 1024.0
9798
9899
100+ # ---------------------------------------------------------------------------
101+ # Failed-command blacklist
102+ # ---------------------------------------------------------------------------
103+
104+
def _cmd_fingerprint(cmd: list[str]) -> str:
    """Build a canonical one-line string from a vLLM command list.

    ``--host`` and ``--port`` are calibration infrastructure rather than
    model-specific settings, so they are removed in both the ``--flag value``
    and ``--flag=value`` spellings; a retry on a different port therefore
    yields the same fingerprint.

    Whitespace is normalised so the result is always a single stripped line.
    The blacklist file is line-oriented and its loader strips every line, so
    a token carrying a newline or padding would otherwise never match again
    after a round trip through the file.

    Args:
        cmd: Command tokens, one argv entry per element.

    Returns:
        The remaining tokens joined by single spaces.
    """
    infra_flags = ("--host", "--port")
    inline_prefixes = tuple(f"{flag}=" for flag in infra_flags)

    kept: list[str] = []
    tokens = iter(cmd)
    for tok in tokens:
        if tok in infra_flags:
            # Drop the flag's value as well; a trailing flag with no value
            # is tolerated.
            next(tokens, None)
            continue
        if tok.startswith(inline_prefixes):
            continue
        kept.append(tok)

    # split()/join collapses every whitespace run (including line breaks)
    # to one space and trims both ends.
    return " ".join(" ".join(kept).split())
122+
123+
def _load_failed_commands(failed_path: Path) -> set[str]:
    """Read the blacklist file and return its fingerprints.

    Each line is stripped of surrounding whitespace; blank lines and lines
    whose stripped text starts with ``#`` are ignored. A missing file yields
    an empty set.

    Args:
        failed_path: Location of the UTF-8 blacklist file.

    Returns:
        The distinct fingerprints found in the file.
    """
    entries: set[str] = set()
    if failed_path.exists():
        for raw_line in failed_path.read_text(encoding="utf-8").splitlines():
            entry = raw_line.strip()
            if entry and not entry.startswith("#"):
                entries.add(entry)
    return entries
132+
133+
def _record_failed_command(failed_path: Path, fingerprint: str) -> None:
    """Append *fingerprint* to the blacklist file at *failed_path*.

    Missing parent directories are created first. The file is opened in
    append mode, so earlier records are preserved.

    Args:
        failed_path: Location of the UTF-8 blacklist file.
        fingerprint: One-line command fingerprint to record.
    """
    failed_path.parent.mkdir(parents=True, exist_ok=True)
    with failed_path.open("a", encoding="utf-8") as f:
        # Terminate with a bare newline so every record starts at column 0.
        f.write(fingerprint + "\n")
    logger.info(" Blacklisted command → %s", failed_path)
139+
140+
99141# ---------------------------------------------------------------------------
100142# GPU VRAM helpers
101143# ---------------------------------------------------------------------------
@@ -198,30 +240,23 @@ def _post(
198240# ---------------------------------------------------------------------------
199241
200242
201- def spawn_vllm (
243+ def _build_vllm_cmd (
202244 plan : dict [str , Any ],
203245 vllm_binary : str ,
204246 host : str ,
205247 port : int ,
206- log_path : Path ,
207248 kv_cache_memory_bytes : str ,
208- ) -> subprocess .Popen [str ]:
249+ ) -> list [str ]:
250+ """Build the vLLM command list without spawning a process."""
209251 model = plan ["model" ]
210252 tp = int (plan .get ("tensor_parallel_size" , 1 ))
211253 dtype = str (plan .get ("dtype" , "auto" ))
212254 quant = str (plan .get ("quantization" ) or "" )
213255 max_model_len = plan .get ("max_model_len" )
214256 enforce_eager = bool (plan .get ("enforce_eager" , False ))
215257 disable_custom_all_reduce = bool (plan .get ("disable_custom_all_reduce" , False ))
216- disable_nccl_p2p = bool (plan .get ("disable_nccl_p2p" , False ))
217258 extra_args : list [str ] = list (plan .get ("extra_args" ) or [])
218259 kv_bytes = str (plan .get ("kv_cache_memory_bytes" ) or kv_cache_memory_bytes )
219-
220- # When kv_cache_memory_bytes is set, omit --gpu-memory-utilization and let
221- # vLLM default to 0.9. kv_cache_memory_bytes controls the KV pool size
222- # directly; adding gpu_memory_utilization=0.1 caps total VRAM to 10% which
223- # prevents the model weights from loading at all.
224- # An explicit per-model override takes precedence.
225260 explicit_gmu = plan .get ("gpu_memory_utilization" )
226261
227262 cmd = [
@@ -250,8 +285,23 @@ def spawn_vllm(
250285 cmd .append ("--enforce-eager" )
251286 if disable_custom_all_reduce :
252287 cmd .append ("--disable-custom-all-reduce" )
253- # disable_nccl_p2p is applied via NCCL_P2P_DISABLE env var below (not a vLLM CLI flag)
254288 cmd .extend (extra_args )
289+ return cmd
290+
291+
292+ def spawn_vllm (
293+ plan : dict [str , Any ],
294+ vllm_binary : str ,
295+ host : str ,
296+ port : int ,
297+ log_path : Path ,
298+ kv_cache_memory_bytes : str ,
299+ ) -> tuple [subprocess .Popen [str ], list [str ]]:
300+ """Spawn vLLM and return ``(process, cmd_list)``."""
301+ tp = int (plan .get ("tensor_parallel_size" , 1 ))
302+ disable_nccl_p2p = bool (plan .get ("disable_nccl_p2p" , False ))
303+
304+ cmd = _build_vllm_cmd (plan , vllm_binary , host , port , kv_cache_memory_bytes )
255305
256306 env = os .environ .copy ()
257307 env ["VLLM_SERVER_DEV_MODE" ] = "1"
@@ -284,7 +334,7 @@ def spawn_vllm(
284334
285335 logger .info (" Spawned PID=%d log=%s" , proc .pid , log_path )
286336 logger .info (" Command: %s" , " " .join (cmd ))
287- return proc
337+ return proc , cmd
288338
289339
290340def _kill_stale_vllm_workers () -> None :
@@ -554,13 +604,28 @@ def calibrate_model(
554604 kv_cache_sent_mb , _KV_CACHE_MIN_STEP_MB ,
555605 )
556606
607+ failed_path = log_dir / _FAILED_COMMANDS_FILE
608+ failed_commands = _load_failed_commands (failed_path )
609+
557610 def _try_start (kv_mb : float ) -> subprocess .Popen [str ] | None :
558611 """Try to start vLLM with the given KV cache. Returns the
559612 running process on success, ``None`` on failure (process is
560- cleaned up)."""
613+ cleaned up). Blacklisted commands are skipped immediately. """
561614 kv_str = _format_kv_mb (kv_mb )
562- proc = spawn_vllm (
563- {** plan , "kv_cache_memory_bytes" : kv_str },
615+ planned = {** plan , "kv_cache_memory_bytes" : kv_str }
616+ # Check the blacklist *before* spawning to avoid wasting time.
617+ fingerprint = _cmd_fingerprint (
618+ _build_vllm_cmd (planned , vllm_binary , host , port , kv_str )
619+ )
620+ if fingerprint in failed_commands :
621+ logger .warning (
622+ " SKIP kv_cache=%s — command previously failed "
623+ "(remove line from %s to retry)" ,
624+ kv_str , failed_path ,
625+ )
626+ return None
627+ proc , _ = spawn_vllm (
628+ planned ,
564629 vllm_binary , host , port , log_path ,
565630 kv_cache_memory_bytes = kv_str ,
566631 )
@@ -585,6 +650,8 @@ def _try_start(kv_mb: float) -> subprocess.Popen[str] | None:
585650 logger .warning (" -- vLLM log tail --\n %s" , log_tail )
586651 stop_vllm (proc )
587652 time .sleep (_VRAM_SETTLE_S )
653+ _record_failed_command (failed_path , fingerprint )
654+ failed_commands .add (fingerprint )
588655 return None
589656
590657 proc : subprocess .Popen [str ] | None = None
0 commit comments