diff --git a/backend/src/recommendation/capacity_planner.py b/backend/src/recommendation/capacity_planner.py index 8439114..7348afb 100644 --- a/backend/src/recommendation/capacity_planner.py +++ b/backend/src/recommendation/capacity_planner.py @@ -232,13 +232,20 @@ def plan_all_capacities( if slo_status == "exceeds" and not include_near_miss: continue - # Calculate accuracy score - # If model is in catalog and we have an evaluator, use score_model() - # Otherwise, accuracy = 0 - if model and model_evaluator: - accuracy_score = int(model_evaluator.score_model(model, intent)) - else: - accuracy_score = 0 + # Calculate accuracy score - USE RAW AA BENCHMARK SCORE + # This is the actual model accuracy from Artificial Analysis benchmarks + # NOT a composite score with latency/budget bonuses + from .usecase_quality_scorer import score_model_quality + + # Try to get raw AA score using the benchmark model name + model_name_for_scoring = model.name if model else bench.model_hf_repo + raw_accuracy = score_model_quality(model_name_for_scoring, intent.use_case) + + # If no score found, try with benchmark's model_hf_repo + if raw_accuracy == 0 and bench.model_hf_repo: + raw_accuracy = score_model_quality(bench.model_hf_repo, intent.use_case) + + accuracy_score = int(raw_accuracy) complexity_score = scorer.score_complexity(gpu_config.gpu_count) diff --git a/backend/src/recommendation/usecase_quality_scorer.py b/backend/src/recommendation/usecase_quality_scorer.py index c9fc7b8..381a636 100644 --- a/backend/src/recommendation/usecase_quality_scorer.py +++ b/backend/src/recommendation/usecase_quality_scorer.py @@ -91,6 +91,63 @@ def _load_csv_scores(self, filepath: str) -> Dict[str, float]: return scores + # BLIS model variant to AA model mapping (for models with valid AA data) + BLIS_TO_AA_MAP = { + # === OPTION A: 25 VALID VARIANTS WITH REAL BLIS DATA === + # GPT-OSS (61.62%, 55.23%) + "gpt-oss-120b": "gpt-oss-120b (high)", + "gpt-oss-20b": "gpt-oss-20b (high)", + # Llama 4 Maverick (46.86%) + "llama-4-maverick-17b-128e-instruct-fp8": "llama 4 maverick", + # Qwen 2.5 7B (44.71%) - maps to Qwen2.5 Max + "qwen2.5-7b-instruct": "qwen2.5 max", + "qwen2.5-7b-instruct-fp8-dynamic": "qwen2.5 max", + "qwen2.5-7b-instruct-quantized.w4a16": "qwen2.5 max", + "qwen2.5-7b-instruct-quantized.w8a8": "qwen2.5 max", + # Llama 3.3 70B (42.99%) + "llama-3.3-70b-instruct": "llama 3.3 instruct 70b", + "llama-3.3-70b-instruct-quantized.w4a16": "llama 3.3 instruct 70b", + "llama-3.3-70b-instruct-quantized.w8a8": "llama 3.3 instruct 70b", + # Llama 4 Scout (42.42%) + "llama-4-scout-17b-16e-instruct": "llama 4 scout", + "llama-4-scout-17b-16e-instruct-fp8-dynamic": "llama 4 scout", + "llama-4-scout-17b-16e-instruct-quantized.w4a16": "llama 4 scout", + # Mistral Small 3.1 (35.70%) + "mistral-small-3.1-24b-instruct-2503": "mistral small 3.1", + "mistral-small-3.1-24b-instruct-2503-fp8-dynamic": "mistral small 3.1", + "mistral-small-3.1-24b-instruct-2503-quantized.w4a16": "mistral small 3.1", + "mistral-small-3.1-24b-instruct-2503-quantized.w8a8": "mistral small 3.1", + # Phi-4 (35.57%) + "phi-4": "phi-4", + "phi-4-fp8-dynamic": "phi-4", + "phi-4-quantized.w4a16": "phi-4", + "phi-4-quantized.w8a8": "phi-4", + # Mistral Small 24B (33.79%) + "mistral-small-24b-instruct-2501": "mistral small 3", + # Mixtral 8x7B (20.51%) + "mixtral-8x7b-instruct-v0.1": "mixtral 8x7b instruct", + } + + def _normalize_model_name(self, model_name: str) -> str: + """Normalize model name by removing quantization suffixes and org prefixes.""" + name = 
model_name.lower() + + # Remove org prefixes + if '/' in name: + name = name.split('/')[-1] + + # Remove quantization suffixes + suffixes_to_remove = [ + '-fp8-dynamic', '-fp8', + '-quantized.w4a16', '-quantized.w8a8', + '-instruct-2501', '-instruct-2503', '-instruct-hf', + '-instruct-v0.1', '-instruct' + ] + for suffix in suffixes_to_remove: + name = name.replace(suffix, '') + + return name.strip('-').strip() + def get_quality_score(self, model_name: str, use_case: str) -> float: """Get quality score for a model on a specific use case. @@ -99,7 +156,7 @@ def get_quality_score(self, model_name: str, use_case: str) -> float: use_case: Use case identifier (e.g., "code_completion") Returns: - Quality score 0-100 (higher is better) + Quality score 0-100 (higher is better), or 0 if no valid AA data """ # Normalize use case use_case_normalized = use_case.lower().replace(" ", "_").replace("-", "_") @@ -110,27 +167,35 @@ def get_quality_score(self, model_name: str, use_case: str) -> float: scores = self._cache.get(use_case_normalized, {}) - # Try exact match first + # Normalize the model name model_lower = model_name.lower() + base_model = self._normalize_model_name(model_name) + + # Try exact match first if model_lower in scores: return scores[model_lower] + # Try BLIS to AA mapping (for known valid models) + for blis_pattern, aa_name in self.BLIS_TO_AA_MAP.items(): + if blis_pattern in base_model: + if aa_name in scores: + logger.debug(f"Matched {model_name} -> {aa_name} via BLIS mapping") + return scores[aa_name] + # Try partial matching (for HuggingFace repo names) for cached_name, score in scores.items(): - model_words = set(model_lower.replace("-", " ").replace("/", " ").replace("_", " ").split()) + model_words = set(base_model.replace("-", " ").replace("/", " ").replace("_", " ").split()) cached_words = set(cached_name.replace("-", " ").replace("/", " ").replace("_", " ").split()) common_words = model_words & cached_words - if len(common_words) >= 3: + if len(common_words) >= 2: # Reduced from 3 to 2 for better matching + logger.debug(f"Partial match {model_name} -> {cached_name} (common: {common_words})") return score - # Fallback: return median score for the use case - if scores: - median_score = sorted(scores.values())[len(scores) // 2] - logger.debug(f"No score found for {model_name}, using median: {median_score:.1f}") - return median_score - - return 50.0 # Default fallback + # No valid AA data found - return 0 to indicate missing data + # This allows filtering out models without quality scores + logger.debug(f"No AA score found for {model_name} (base: {base_model})") + return 0.0 # Return 0 so min_accuracy filter can exclude these def get_top_models_for_usecase(self, use_case: str, top_n: int = 10) -> List[Tuple[str, float]]: """Get top N models for a specific use case.""" diff --git a/data/research/slo_ranges.json b/data/research/slo_ranges.json index 389d924..95328e9 100644 --- a/data/research/slo_ranges.json +++ b/data/research/slo_ranges.json @@ -32,11 +32,11 @@ "chatbot_conversational": { "description": "Real-time conversational chatbots", "token_config": {"prompt": 512, "output": 256}, - "ttft_ms": {"min": 50, "max": 200, "blis_observed": {"min": 13.3, "max": 141.5, "mean": 44.3}}, - "itl_ms": {"min": 10, "max": 40, "blis_observed": {"min": 2.8, "max": 65.6, "mean": 13.0}}, - "e2e_ms": {"min": 1000, "max": 5000, "blis_observed": {"min": 769, "max": 16545, "mean": 3312}}, + "ttft_ms": {"min": 50, "max": 500, "default": 150, "blis_observed": {"min": 13.3, "max": 141.5, "mean": 
44.3}}, + "itl_ms": {"min": 10, "max": 80, "default": 30, "blis_observed": {"min": 2.8, "max": 65.6, "mean": 13.0}}, + "e2e_ms": {"min": 500, "max": 5000, "default": 1500, "blis_observed": {"min": 769, "max": 16545, "mean": 3312}}, "tokens_per_sec": {"target": 200, "blis_range": [238, 27878]}, - "research_note": "Nielsen's 1s guideline for conversational flow. BLIS: 345 samples show E2E mean of 3.3s." + "research_note": "Nielsen's 1s guideline for conversational flow. Research-based ranges for user experience." }, "code_generation_detailed": { diff --git a/ui/app.py b/ui/app.py index 56a36ac..a71791a 100644 --- a/ui/app.py +++ b/ui/app.py @@ -127,14 +127,14 @@ --shadow-glow: 0 0 40px rgba(99, 102, 241, 0.15); } - /* Hero Section - Enterprise Grade Design */ + /* Hero Section - Compact Design */ .hero-container { background: var(--gradient-hero); background-size: 200% 200%; animation: gradient-shift 15s ease infinite; - padding: 4.5rem 4rem; - border-radius: 1.5rem; - margin-bottom: 3rem; + padding: 1.5rem 2rem; + border-radius: 1rem; + margin-bottom: 1.5rem; box-shadow: var(--shadow-lg), var(--shadow-glow); border: 1px solid rgba(139, 92, 246, 0.2); position: relative; @@ -163,40 +163,41 @@ pointer-events: none; } .hero-emoji { - font-size: 5rem; - margin-bottom: 1.25rem; + font-size: 2.5rem; + margin-bottom: 0.5rem; animation: float 5s ease-in-out infinite; - filter: drop-shadow(0 10px 25px rgba(0,0,0,0.4)); + filter: drop-shadow(0 5px 15px rgba(0,0,0,0.4)); position: relative; z-index: 1; + display: inline-block; + margin-right: 1rem; + vertical-align: middle; } .hero-title { - font-size: 4rem; + font-size: 2.5rem; font-weight: 800; color: white; - margin-bottom: 1rem; - text-shadow: 0 4px 30px rgba(0,0,0,0.4); - letter-spacing: -2px; + margin-bottom: 0.5rem; + text-shadow: 0 2px 15px rgba(0,0,0,0.4); + letter-spacing: -1px; font-family: 'Space Grotesk', 'Inter', sans-serif; position: relative; z-index: 1; + display: inline-block; + vertical-align: middle; } .hero-subtitle { - font-size: 1.4rem; + font-size: 1rem; color: rgba(255,255,255,0.85); font-weight: 400; max-width: 700px; - line-height: 1.6; + line-height: 1.4; position: relative; z-index: 1; + margin-top: 0.5rem; } .hero-badges { - display: flex; - gap: 1rem; - margin-top: 2.5rem; - flex-wrap: wrap; - position: relative; - z-index: 1; + display: none; } .hero-badge { background: rgba(255,255,255,0.1); @@ -387,7 +388,7 @@ .leaderboard-table th:nth-child(2), .leaderboard-table td:nth-child(2) { width: 18%; text-align: left; } /* Model */ .leaderboard-table th:nth-child(3), - .leaderboard-table td:nth-child(3) { width: 10%; } /* Quality */ + .leaderboard-table td:nth-child(3) { width: 10%; } /* Accuracy */ .leaderboard-table th:nth-child(4), .leaderboard-table td:nth-child(4) { width: 10%; } /* Latency */ .leaderboard-table th:nth-child(5), @@ -457,84 +458,132 @@ box-shadow: 0 4px 12px rgba(99, 102, 241, 0.3); } - /* Score Bars - HuggingFace Inspired Progress Bars */ + /* Score Bars - Corporate Enhanced Style */ .score-mini-container { display: flex; flex-direction: column; align-items: center; justify-content: center; - gap: 5px; + gap: 6px; width: 100%; - max-width: 100%; + max-width: 120px; margin: 0 auto; + padding: 0.5rem 0; } .score-mini-bar { - height: 8px; - border-radius: 4px; - background: rgba(255,255,255,0.06); + height: 6px; + border-radius: 3px; + background: rgba(255,255,255,0.08); overflow: hidden; width: 100%; position: relative; } .score-mini-fill { height: 100%; - border-radius: 4px; - transition: width 0.5s 
cubic-bezier(0.4, 0, 0.2, 1); + border-radius: 3px; + transition: width 0.6s cubic-bezier(0.4, 0, 0.2, 1); } .score-mini-label { - font-size: 0.9rem; - font-weight: 600; - font-family: 'JetBrains Mono', 'Inter', monospace; + font-size: 1.4rem; + font-weight: 700; + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; + letter-spacing: -0.02em; } .score-num { display: none; } + .fill-accuracy { background: linear-gradient(90deg, #db2777, #ec4899); } .fill-quality { background: linear-gradient(90deg, #059669, #10b981); } .fill-latency { background: linear-gradient(90deg, #2563eb, #3b82f6); } .fill-cost { background: linear-gradient(90deg, #ea580c, #f97316); } .fill-capacity { background: linear-gradient(90deg, #7c3aed, #8b5cf6); } - /* Score label colors */ - .label-quality { color: #10b981; } - .label-latency { color: #3b82f6; } - .label-cost { color: #f97316; } - .label-capacity { color: #8b5cf6; } + /* Score label colors - Enhanced visibility */ + .label-accuracy { color: #f472b6; text-shadow: 0 0 12px rgba(244, 114, 182, 0.3); } + .label-quality { color: #34d399; text-shadow: 0 0 12px rgba(16, 185, 129, 0.3); } + .label-latency { color: #60a5fa; text-shadow: 0 0 12px rgba(59, 130, 246, 0.3); } + .label-cost { color: #fb923c; text-shadow: 0 0 12px rgba(249, 115, 22, 0.3); } + .label-capacity { color: #a78bfa; text-shadow: 0 0 12px rgba(139, 92, 246, 0.3); } - /* Model Card in Table - Clean Typography */ + /* Model Card in Table - Corporate Typography */ .model-cell { display: flex; align-items: center; - gap: 0.875rem; + gap: 1rem; } .model-info { display: flex; flex-direction: column; - gap: 3px; + gap: 4px; } .model-name { font-weight: 600; - font-size: 1rem; + font-size: 1.05rem; color: #f9fafb; - font-family: 'Inter', sans-serif; + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; line-height: 1.3; + letter-spacing: -0.01em; } .model-provider { - font-size: 0.8rem; - color: #6b7280; + font-size: 0.85rem; + color: #9ca3af; font-weight: 500; + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; + } + + /* Enhanced Select Button - Corporate Style */ + .select-btn { + background: linear-gradient(135deg, #6366f1, #8b5cf6); + color: white; + border: none; + padding: 0.6rem 1.25rem; + border-radius: 8px; + font-weight: 600; + font-size: 0.85rem; + cursor: pointer; + transition: all 0.2s ease; + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; + box-shadow: 0 4px 12px rgba(99, 102, 241, 0.25); + } + .select-btn:hover { + transform: translateY(-1px); + box-shadow: 0 6px 16px rgba(99, 102, 241, 0.35); } /* Final Score Display - BIG and prominent */ .final-score { - font-size: 1.75rem; + font-size: 2rem; font-weight: 800; - color: var(--accent-green) !important; - font-family: 'Inter', sans-serif; - text-shadow: 0 0 20px rgba(63, 185, 80, 0.4); + background: linear-gradient(135deg, #6366f1, #8b5cf6); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; + font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; + letter-spacing: -0.02em; display: block; text-align: center; } + /* Enhanced table row spacing */ + .leaderboard-table tbody tr { + border-bottom: 1px solid rgba(255,255,255,0.04); + } + .leaderboard-table tbody tr:hover { + background: rgba(99, 102, 241, 0.08); + } + .leaderboard-table td { + padding: 1rem 0.75rem !important; + vertical-align: middle; + } + .leaderboard-table th { + padding: 1rem 0.75rem !important; + font-size: 0.75rem; + text-transform: 
uppercase; + letter-spacing: 0.08em; + color: rgba(255,255,255,0.5); + font-weight: 600; + } + /* Enhanced Slider Styling */ .stSlider { padding: 0.5rem 0; @@ -604,30 +653,6 @@ justify-content: center; align-items: center; } - .select-btn { - background: linear-gradient(135deg, var(--accent-blue), var(--accent-purple)); - color: white; - padding: 10px 18px; - border-radius: 8px; - font-weight: 600; - font-size: 0.85rem; - border: none; - cursor: pointer; - transition: all 0.2s ease; - white-space: nowrap; - text-decoration: none; - display: inline-flex; - align-items: center; - justify-content: center; - gap: 6px; - font-family: 'Inter', sans-serif; - margin: 0 auto; - } - .select-btn:hover { - transform: translateY(-2px); - box-shadow: 0 8px 20px rgba(88, 166, 255, 0.35); - filter: brightness(1.1); - } /* Extraction Card - Clean, spacious design */ .extraction-card { @@ -872,7 +897,7 @@ } .priority-low_latency { background: linear-gradient(135deg, #059669, var(--accent-green)); } .priority-cost_saving { background: linear-gradient(135deg, var(--accent-blue), var(--accent-cyan)); } - .priority-high_quality { background: linear-gradient(135deg, var(--accent-purple), #7c3aed); } + .priority-high_accuracy { background: linear-gradient(135deg, var(--accent-purple), #7c3aed); } .priority-high_throughput { background: linear-gradient(135deg, var(--accent-orange), var(--accent-pink)); } .priority-balanced { background: linear-gradient(135deg, #6b7280, #4b5563); } @@ -1194,7 +1219,7 @@ .metric-badge:hover { transform: scale(1.03); } - .metric-badge-quality { + .metric-badge-accuracy { background: rgba(63, 185, 80, 0.12); color: var(--accent-green); border: 1px solid rgba(63, 185, 80, 0.25); @@ -1315,7 +1340,7 @@ height: 14px; border-radius: 4px; } - .legend-color-quality { background: var(--accent-green); } + .legend-color-accuracy { background: var(--accent-green); } .legend-color-latency { background: var(--accent-blue); } .legend-color-cost { background: var(--accent-orange); } .legend-color-capacity { background: var(--accent-purple); } @@ -1371,6 +1396,16 @@ if "expanded_categories" not in st.session_state: st.session_state.expanded_categories = set() +# Winner dialog state - must be explicitly initialized to False +if "show_winner_dialog" not in st.session_state: + st.session_state.show_winner_dialog = False +if "balanced_winner" not in st.session_state: + st.session_state.balanced_winner = None +if "winner_priority" not in st.session_state: + st.session_state.winner_priority = "balanced" +if "winner_extraction" not in st.session_state: + st.session_state.winner_extraction = {} + # ============================================================================= # DATA LOADING # ============================================================================= @@ -1389,17 +1424,30 @@ def load_206_models() -> pd.DataFrame: @st.cache_data def load_slo_templates(): - """Load SLO templates for all 9 use cases.""" + """Load SLO templates for all 9 use cases. + + DEFAULTS ARE SET TO MIDDLE OF RESEARCH-BASED RANGES + This ensures default values show GREEN (within range). 
+ """ return { - "chatbot_conversational": {"ttft": 150, "itl": 30, "e2e": 500, "qps": 100}, - "code_completion": {"ttft": 100, "itl": 20, "e2e": 300, "qps": 200}, - "code_generation_detailed": {"ttft": 200, "itl": 30, "e2e": 800, "qps": 50}, - "document_analysis_rag": {"ttft": 200, "itl": 40, "e2e": 1000, "qps": 50}, - "summarization_short": {"ttft": 300, "itl": 50, "e2e": 1500, "qps": 30}, - "long_document_summarization": {"ttft": 500, "itl": 60, "e2e": 5000, "qps": 10}, - "translation": {"ttft": 200, "itl": 40, "e2e": 1000, "qps": 80}, - "content_generation": {"ttft": 300, "itl": 50, "e2e": 2000, "qps": 40}, - "research_legal_analysis": {"ttft": 500, "itl": 60, "e2e": 5000, "qps": 10}, + # Research range: TTFT 50-500, ITL 10-80, E2E 500-5000 + "chatbot_conversational": {"ttft": 275, "itl": 45, "e2e": 2750, "qps": 100}, + # Research range: TTFT 15-100, ITL 5-30, E2E 300-2000 + "code_completion": {"ttft": 60, "itl": 18, "e2e": 1150, "qps": 200}, + # Research range: TTFT 50-300, ITL 5-30, E2E 2000-15000 + "code_generation_detailed": {"ttft": 175, "itl": 18, "e2e": 8500, "qps": 50}, + # Research range: TTFT 200-800, ITL 15-50, E2E 5000-25000 + "document_analysis_rag": {"ttft": 500, "itl": 33, "e2e": 15000, "qps": 50}, + # Research range: TTFT 100-500, ITL 10-45, E2E 2000-12000 + "summarization_short": {"ttft": 300, "itl": 28, "e2e": 7000, "qps": 30}, + # Research range: TTFT 500-2000, ITL 20-60, E2E 10000-60000 + "long_document_summarization": {"ttft": 1250, "itl": 40, "e2e": 35000, "qps": 10}, + # Research range: TTFT 100-400, ITL 15-50, E2E 2000-10000 + "translation": {"ttft": 250, "itl": 33, "e2e": 6000, "qps": 80}, + # Research range: TTFT 150-600, ITL 15-50, E2E 3000-15000 + "content_generation": {"ttft": 375, "itl": 33, "e2e": 9000, "qps": 40}, + # Research range: TTFT 1000-4000, ITL 25-70, E2E 30000-180000 + "research_legal_analysis": {"ttft": 2500, "itl": 48, "e2e": 105000, "qps": 10}, } @st.cache_data @@ -1503,44 +1551,86 @@ def get_slo_targets_for_use_case(use_case: str, priority: str = "balanced") -> d } +def calculate_slo_defaults_from_research(use_case: str, priority: str = "balanced") -> dict: + """Calculate SLO DEFAULT values as the MAX of the priority-adjusted research range. + + Using MAX as default ensures: + - User sees ALL models that meet acceptable performance (more options) + - User can then tighten SLOs to filter down if needed + - All shown models are still within research-backed acceptable ranges + + Models will be filtered to only those meeting these SLO targets from BLIS data. 
+ + Returns: + dict with ttft, itl, e2e, qps defaults (integers) + """ + slo_targets = get_slo_targets_for_use_case(use_case, priority) + + if not slo_targets: + # Fallback to static defaults if research data unavailable + templates = load_slo_templates() + return templates.get(use_case, {"ttft": 200, "itl": 30, "e2e": 3000, "qps": 50}) + + # Use MAX of the adjusted range for each SLO (shows more models by default) + # User can tighten these values to filter down to fewer/better options + ttft_default = slo_targets["ttft_target"]["max"] + itl_default = slo_targets["itl_target"]["max"] + e2e_default = slo_targets["e2e_target"]["max"] + + # QPS based on use case defaults + templates = load_slo_templates() + qps_default = templates.get(use_case, {}).get("qps", 50) + + return { + "ttft": ttft_default, + "itl": itl_default, + "e2e": e2e_default, + "qps": qps_default, + "ttft_range": slo_targets["ttft_target"], + "itl_range": slo_targets["itl_target"], + "e2e_range": slo_targets["e2e_target"], + "research_note": slo_targets.get("research_note", ""), + } + + def recommend_optimal_hardware(use_case: str, priority: str, user_hardware: str = None) -> dict: """Recommend optimal hardware from BLIS benchmarks based on SLO requirements. DEPRECATED: This function is kept for potential future use. The UI now uses the backend API via fetch_ranked_recommendations() instead. - + Logic: - cost_saving: Find CHEAPEST hardware that meets MAX SLO (slowest acceptable) - low_latency: Find hardware that meets MIN SLO (fastest required) - balanced: Find hardware that meets MEAN of SLO range - - high_quality: Relax latency, focus on larger models + - high_accuracy: Relax latency, focus on larger models - high_throughput: Focus on tokens/sec capacity - + Returns hardware recommendation with BLIS benchmark data. 
""" # Get SLO targets slo_targets = get_slo_targets_for_use_case(use_case, priority) if not slo_targets: return None - + # Get token config prompt_tokens = slo_targets['token_config']['prompt'] output_tokens = slo_targets['token_config']['output'] - + # Load BLIS benchmarks blis_data = load_blis_benchmarks() if not blis_data or 'benchmarks' not in blis_data: return None - + benchmarks = blis_data['benchmarks'] - + # Filter by token config - matching = [b for b in benchmarks + matching = [b for b in benchmarks if b['prompt_tokens'] == prompt_tokens and b['output_tokens'] == output_tokens] - + if not matching: return None - + # Define hardware costs (approximate monthly cost) # Both H100 and A100-80 are REAL BLIS benchmarks from Andre's data hardware_costs = { @@ -1552,7 +1642,7 @@ def recommend_optimal_hardware(use_case: str, priority: str, user_hardware: str ("A100-80", 2): {"cost": 3200, "tier": 2}, ("A100-80", 4): {"cost": 6400, "tier": 3}, } - + # Determine target SLO based on priority if priority == "cost_saving": # Target MAX SLO (slowest acceptable) to use cheapest hardware @@ -1569,12 +1659,12 @@ def recommend_optimal_hardware(use_case: str, priority: str, user_hardware: str target_ttft = slo_targets['ttft_target']['max'] # Relax latency target_e2e = slo_targets['e2e_target']['max'] sort_by = "throughput" # Sort by tokens/sec descending - else: # balanced, high_quality + else: # balanced, high_accuracy # Target MEAN of range target_ttft = (slo_targets['ttft_target']['min'] + slo_targets['ttft_target']['max']) // 2 target_e2e = (slo_targets['e2e_target']['min'] + slo_targets['e2e_target']['max']) // 2 sort_by = "balanced" - + # Group benchmarks by hardware config hw_benchmarks = {} for b in matching: @@ -1582,26 +1672,26 @@ def recommend_optimal_hardware(use_case: str, priority: str, user_hardware: str if hw_key not in hw_benchmarks: hw_benchmarks[hw_key] = [] hw_benchmarks[hw_key].append(b) - + # Evaluate each hardware option viable_options = [] for hw_key, benches in hw_benchmarks.items(): # Get best benchmark (lowest TTFT at reasonable RPS) best = min(benches, key=lambda x: x['ttft_mean']) - + hw_cost = hardware_costs.get(hw_key, {"cost": 99999, "tier": 99}) - + # Check if meets SLO requirements meets_ttft = best['ttft_p95'] <= target_ttft * 1.2 # 20% buffer meets_e2e = best['e2e_p95'] <= target_e2e * 1.2 - + # Don't recommend hardware that's WAY faster than needed (over-provisioning) too_fast = False if priority == "cost_saving": # If TTFT is less than 50% of max, it's over-provisioned if best['ttft_mean'] < slo_targets['ttft_target']['max'] * 0.3: too_fast = True - + viable_options.append({ "hardware": hw_key[0], "hardware_count": hw_key[1], @@ -1618,14 +1708,14 @@ def recommend_optimal_hardware(use_case: str, priority: str, user_hardware: str "benchmark_count": len(benches), "model_repo": best['model_hf_repo'], }) - + # Filter to only viable options (meets SLO) viable = [v for v in viable_options if v['meets_slo']] - + # If no viable options, return best available if not viable: viable = viable_options - + # Sort based on priority if sort_by == "cost": # For cost_saving: prefer cheapest that meets SLO, not over-provisioned @@ -1638,13 +1728,13 @@ def recommend_optimal_hardware(use_case: str, priority: str, user_hardware: str else: # balanced # Balance cost and latency viable.sort(key=lambda x: (x['tier'], x['ttft_mean'])) - + if not viable: return None - + best_option = viable[0] alternatives = viable[1:4] if len(viable) > 1 else [] - + return { "recommended": best_option, 
"alternatives": alternatives, @@ -1663,15 +1753,15 @@ def _get_hardware_selection_reason(priority: str, hw_option: dict, slo_targets: cost = hw_option['cost_monthly'] target_max = slo_targets['ttft_target']['max'] target_min = slo_targets['ttft_target']['min'] - + if priority == "cost_saving": return f"💰 {hw_name} is the cheapest option (${cost:,}/mo) that meets your SLO max ({target_max}ms TTFT). Actual TTFT: {ttft:.0f}ms - good value!" elif priority == "low_latency": return f"⚡ {hw_name} achieves {ttft:.0f}ms TTFT, meeting your aggressive target ({target_min}ms). Fastest option for your use case." elif priority == "high_throughput": return f"📈 {hw_name} offers {hw_option['tokens_per_sec']:.0f} tokens/sec - best throughput for high-volume workloads." - elif priority == "high_quality": - return f"⭐ {hw_name} provides headroom for larger, higher-quality models with {ttft:.0f}ms TTFT." + elif priority == "high_accuracy": + return f"⭐ {hw_name} provides headroom for larger, higher-accuracy models with {ttft:.0f}ms TTFT." else: # balanced return f"⚖️ {hw_name} balances cost (${cost:,}/mo) and latency ({ttft:.0f}ms TTFT) - optimal for balanced priority." @@ -1719,12 +1809,13 @@ def fetch_ranked_recommendations( "balanced": {"latency_requirement": "high", "budget_constraint": "moderate"}, "cost_saving": {"latency_requirement": "medium", "budget_constraint": "strict"}, "high_throughput": {"latency_requirement": "high", "budget_constraint": "moderate"}, - "high_quality": {"latency_requirement": "medium", "budget_constraint": "flexible"}, + "high_accuracy": {"latency_requirement": "medium", "budget_constraint": "flexible"}, } mapping = priority_mapping.get(priority, priority_mapping["balanced"]) # Build request payload + # min_accuracy=35 filters out models with 30% fallback (no AA data) payload = { "use_case": use_case, "user_count": user_count, @@ -1737,6 +1828,7 @@ def fetch_ranked_recommendations( "itl_p95_target_ms": itl_p95_target_ms, "e2e_p95_target_ms": e2e_p95_target_ms, "include_near_miss": include_near_miss, + "min_accuracy": 35, # Filter out models without AA accuracy data (30% fallback) } if weights: @@ -2246,10 +2338,13 @@ def get_blis_slo_for_model(model_name: str, use_case: str, hardware: str = "H100 } def validate_slo_against_research(use_case: str, ttft: int, itl: int, e2e: int, priority: str = "balanced") -> list: - """Validate SLO values against research-backed ranges and return warnings/info messages. + """Validate SLO values against RESEARCH-BASED ranges only. 
Returns list of tuples: (icon, color, message, severity) - Severity: 'error' (red), 'warning' (orange), 'info' (blue), 'success' (green) + - GREEN: within research range + - RED: outside research range (too low or too high) + + NOTE: BLIS data is NOT used here - only in Recommendation tab """ messages = [] research_data = load_research_slo_ranges() @@ -2271,7 +2366,7 @@ def validate_slo_against_research(use_case: str, ttft: int, itl: int, e2e: int, itl_factor = priority_factor.get('itl_factor', 1.0) e2e_factor = priority_factor.get('e2e_factor', 1.0) - # Adjust ranges based on priority + # Adjust ranges based on priority (research-based) ttft_min = int(use_case_ranges['ttft_ms']['min'] * ttft_factor) ttft_max = int(use_case_ranges['ttft_ms']['max'] * ttft_factor) itl_min = int(use_case_ranges['itl_ms']['min'] * itl_factor) @@ -2279,78 +2374,67 @@ def validate_slo_against_research(use_case: str, ttft: int, itl: int, e2e: int, e2e_min = int(use_case_ranges['e2e_ms']['min'] * e2e_factor) e2e_max = int(use_case_ranges['e2e_ms']['max'] * e2e_factor) - # Get BLIS observed values for context - blis_ttft = use_case_ranges.get('ttft_ms', {}).get('blis_observed', {}) - blis_itl = use_case_ranges.get('itl_ms', {}).get('blis_observed', {}) - blis_e2e = use_case_ranges.get('e2e_ms', {}).get('blis_observed', {}) - - # TTFT validation with BLIS context + # TTFT validation - RESEARCH BASED ONLY if ttft < ttft_min: - blis_min = blis_ttft.get('min', 'N/A') messages.append(( - "🔬", "#f5576c", - f"TTFT ({ttft}ms) is BELOW min ({ttft_min}ms). BLIS observed min: {blis_min}ms on H100x8!", + "🔴", "#ef4444", + f"TTFT ({ttft}ms) is BELOW research min ({ttft_min}ms) - may be unrealistic", "error" )) elif ttft > ttft_max: - blis_mean = blis_ttft.get('mean', 'N/A') messages.append(( - "💸", "#fbbf24", - f"TTFT ({ttft}ms) is ABOVE max ({ttft_max}ms). BLIS avg: {blis_mean}ms - you're over-provisioning!", - "warning" + "🔴", "#ef4444", + f"TTFT ({ttft}ms) is ABOVE research max ({ttft_max}ms) - poor user experience", + "error" )) else: messages.append(( "✅", "#10b981", - f"TTFT ({ttft}ms) ✓ within range ({ttft_min}-{ttft_max}ms)", + f"TTFT ({ttft}ms) ✓ within research range ({ttft_min}-{ttft_max}ms)", "success" )) - # ITL validation with BLIS context + # ITL validation - RESEARCH BASED ONLY if itl < itl_min: - blis_min = blis_itl.get('min', 'N/A') messages.append(( - "🔬", "#f5576c", - f"ITL ({itl}ms) is BELOW min ({itl_min}ms). BLIS observed min: {blis_min}ms - needs batch size 1!", + "🔴", "#ef4444", + f"ITL ({itl}ms) is BELOW research min ({itl_min}ms) - may be unrealistic", "error" )) elif itl > itl_max: - blis_mean = blis_itl.get('mean', 'N/A') messages.append(( - "💸", "#fbbf24", - f"ITL ({itl}ms) is ABOVE max ({itl_max}ms). BLIS avg: {blis_mean}ms - streaming may feel slow.", - "warning" + "🔴", "#ef4444", + f"ITL ({itl}ms) is ABOVE research max ({itl_max}ms) - streaming will feel slow", + "error" )) else: messages.append(( "✅", "#10b981", - f"ITL ({itl}ms) ✓ within range ({itl_min}-{itl_max}ms)", + f"ITL ({itl}ms) ✓ within research range ({itl_min}-{itl_max}ms)", "success" )) - # E2E validation with BLIS context + # E2E validation - RESEARCH BASED ONLY if e2e < e2e_min: - blis_min = blis_e2e.get('min', 'N/A') messages.append(( - "🔬", "#f5576c", - f"E2E ({e2e}ms) is BELOW min ({e2e_min}ms). 
BLIS best: {blis_min}ms - very aggressive!", + "🔴", "#ef4444", + f"E2E ({e2e}ms) is BELOW research min ({e2e_min}ms) - may be unrealistic", "error" )) elif e2e > e2e_max: - blis_mean = blis_e2e.get('mean', 'N/A') messages.append(( - "💸", "#fbbf24", - f"E2E ({e2e}ms) is ABOVE max ({e2e_max}ms). BLIS avg: {blis_mean}ms - over-provisioned!", - "warning" + "🔴", "#ef4444", + f"E2E ({e2e}ms) is ABOVE research max ({e2e_max}ms) - poor user experience", + "error" )) else: messages.append(( "✅", "#10b981", - f"E2E ({e2e}ms) ✓ within range ({e2e_min}-{e2e_max}ms)", + f"E2E ({e2e}ms) ✓ within research range ({e2e_min}-{e2e_max}ms)", "success" )) - # Add research note + # Add research note (no BLIS reference) if use_case_ranges.get('research_note'): messages.append(( "📚", "#a371f7", @@ -2506,34 +2590,14 @@ def get_workload_insights(use_case: str, qps: int, user_count: int) -> list: "info" )) - # Add BLIS E2E latency at optimal load - if blis_e2e_p95: - messages.append(( - "⏱️", "#06b6d4", - f"BLIS E2E p95 at {blis_optimal_rps} RPS: {blis_e2e_p95}ms", - "info" - )) + # Note: Peak multiplier info now shown inline in workload profile box if traffic: prompt_tokens = traffic.get('prompt_tokens', 512) output_tokens = traffic.get('output_tokens', 256) - blis_samples = traffic.get('blis_samples', 0) - sample_info = f" ({blis_samples} BLIS samples)" if blis_samples else "" - messages.append(( - "📝", "#3b82f6", - f"Traffic: {prompt_tokens} → {output_tokens} tokens{sample_info}", - "info" - )) + # Note: Token profile info now shown inline in workload profile box - # Add hardware recommendation from BLIS - if hardware_throughput and capacity_guidance: - h100_max = capacity_guidance.get('H100_x1_max_rps', 10) - if qps > h100_max: - messages.append(( - "🔧", "#f97316", - f"QPS {qps} > H100x1 max ({h100_max}). Recommend H100x2 or horizontal scaling.", - "info" - )) + # Hardware recommendations moved to Recommendation tab (uses BLIS data) return messages @@ -2562,6 +2626,97 @@ def load_weighted_scores(use_case: str) -> pd.DataFrame: except Exception: return pd.DataFrame() +# Model name mapping from BLIS/backend names to AA CSV names (exact mapping) +BLIS_TO_AA_NAME_MAP = { + # GPT-OSS - specific size mapping + "gpt-oss-120b": "gpt-oss-120b (high)", + "gpt-oss 120b": "gpt-oss-120b (high)", + "gpt-oss-20b": "gpt-oss-20b (high)", + "gpt-oss 20b": "gpt-oss-20b (high)", + # Llama models + "llama-4-maverick-17b-128e-instruct-fp8": "llama 4 maverick", + "llama-4-scout-17b-16e-instruct": "llama 4 scout", + "llama-4-scout-17b-16e-instruct-fp8-dynamic": "llama 4 scout", + "llama-3.3-70b-instruct": "llama 3.3 instruct 70b", + # Phi + "phi-4": "phi-4", + "phi-4-fp8-dynamic": "phi-4", + # Mistral + "mistral-small-24b-instruct-2501": "mistral small 3", + "mistral-small-3.1-24b-instruct-2503": "mistral small 3.1", + "mistral-small-3.1-24b-instruct-2503-fp8-dynamic": "mistral small 3.1", + "mixtral-8x7b-instruct-v0.1": "mixtral 8x7b instruct", + # Qwen + "qwen2.5-7b-instruct": "qwen2.5 7b instruct", + "qwen2.5-7b-instruct-fp8-dynamic": "qwen2.5 7b instruct", +} + +def get_raw_aa_accuracy(model_name: str, use_case: str) -> float: + """Get raw AA benchmark accuracy for a model from the weighted scores CSV. + + This returns the actual benchmark score, NOT the composite quality score. 
+ """ + df = load_weighted_scores(use_case) + if df.empty: + return 0.0 + + # Normalize model name - remove extra spaces, convert to lowercase + model_lower = model_name.lower().strip().replace(' ', ' ') + + # Extract size identifier (e.g., "120b", "20b", "70b") for differentiation + import re + size_match = re.search(r'(\d+)b', model_lower) + model_size = size_match.group(1) if size_match else None + + # Try direct mapping first + aa_name = BLIS_TO_AA_NAME_MAP.get(model_lower) + if not aa_name: + # Try with dashes converted to spaces + aa_name = BLIS_TO_AA_NAME_MAP.get(model_lower.replace('-', ' ')) + if not aa_name: + aa_name = model_lower + + # Look for EXACT model in CSV (case-insensitive) + for _, row in df.iterrows(): + csv_model = str(row.get('Model Name', row.get('model_name', ''))).lower().strip() + + # Exact match with mapped name + if csv_model == aa_name.lower(): + score_str = str(row.get('Use Case Score', row.get('Weighted Score', '0'))) + try: + return float(score_str.replace('%', '')) + except: + return 0.0 + + # Partial match - but must match SIZE to avoid 120B/20B confusion + for _, row in df.iterrows(): + csv_model = str(row.get('Model Name', row.get('model_name', ''))).lower().strip() + + # Check if base model name matches AND size matches + base_name = model_lower.replace('-', ' ').replace('_', ' ').split()[0] if model_lower else "" + + if base_name and base_name in csv_model: + # Verify size matches to avoid 120B vs 20B confusion + csv_size_match = re.search(r'(\d+)b', csv_model) + csv_size = csv_size_match.group(1) if csv_size_match else None + + if model_size and csv_size and model_size == csv_size: + # Size matches - this is the right model + score_str = str(row.get('Use Case Score', row.get('Weighted Score', '0'))) + try: + return float(score_str.replace('%', '')) + except: + return 0.0 + elif not model_size and not csv_size: + # No size in either - match on name + score_str = str(row.get('Use Case Score', row.get('Weighted Score', '0'))) + try: + return float(score_str.replace('%', '')) + except: + return 0.0 + + return 0.0 + @st.cache_data def load_model_pricing() -> pd.DataFrame: """Load model pricing and latency data from model_pricing.csv. 
@@ -2697,20 +2852,31 @@ def mock_extraction(user_input: str) -> dict: # Detect priority from user input priority = "balanced" # default - latency_keywords = ["latency", "fast", "speed", "quick", "responsive", "real-time", "instant", "low latency", "critical"] - cost_keywords = ["cost", "cheap", "budget", "efficient", "affordable", "save money", "cost-effective"] - quality_keywords = ["quality", "accurate", "best", "precision", "top quality", "high quality", "most important"] - throughput_keywords = ["throughput", "scale", "high volume", "capacity", "concurrent", "many users"] + # Quality keywords - check these FIRST (accuracy is more specific than generic "critical") + quality_keywords = ["accuracy", "accurate", "quality", "precision", "high quality", "top quality", + "accuracy is critical", "quality is critical", "quality is most important", + "accuracy is most important", "best quality", "highest accuracy"] + + # Latency keywords - "critical" removed (too generic) + latency_keywords = ["latency", "fast", "speed", "quick", "responsive", "real-time", "instant", + "low latency", "latency is critical", "under 200ms", "under 100ms", "millisecond"] + + cost_keywords = ["cost", "cheap", "budget", "efficient", "affordable", "save money", "cost-effective", + "budget is tight", "minimize cost"] + + throughput_keywords = ["throughput", "scale", "high volume", "capacity", "concurrent", "many users", + "high traffic", "peak load"] + + # Check for QUALITY priority FIRST (most specific signals) + if any(kw in text_lower for kw in quality_keywords): + priority = "high_accuracy" # Check for latency priority - if any(kw in text_lower for kw in latency_keywords): + elif any(kw in text_lower for kw in latency_keywords): priority = "low_latency" # Check for cost priority elif any(kw in text_lower for kw in cost_keywords): priority = "cost_saving" - # Check for quality priority - elif any(kw in text_lower for kw in quality_keywords): - priority = "high_quality" # Check for throughput priority elif any(kw in text_lower for kw in throughput_keywords): priority = "high_throughput" @@ -2745,20 +2911,74 @@ def get_enhanced_recommendation(business_context: dict) -> Optional[dict]: # ============================================================================= -# BLIS MODEL NAME MAPPING -# Maps BLIS repo names to our quality CSV model names +# VALID MODELS - Only models with BOTH AA Quality AND BLIS Performance data +# These 25 variants are the only ones we should recommend (have both AA quality + BLIS performance) # ============================================================================= +VALID_BLIS_MODELS = { + # GPT-OSS (highest accuracy for chatbot!) 
+ 'openai/gpt-oss-120b', + 'openai/gpt-oss-20b', + # Phi-4 variants + 'microsoft/phi-4', + 'microsoft/phi-4-fp8-dynamic', + 'microsoft/phi-4-quantized.w4a16', + 'microsoft/phi-4-quantized.w8a8', + # Mistral Small 3/3.1 variants + 'mistralai/mistral-small-24b-instruct-2501', + 'mistralai/mistral-small-3.1-24b-instruct-2503', + 'mistralai/mistral-small-3.1-24b-instruct-2503-fp8-dynamic', + 'mistralai/mistral-small-3.1-24b-instruct-2503-quantized.w4a16', + 'mistralai/mistral-small-3.1-24b-instruct-2503-quantized.w8a8', + # Mixtral 8x7B + 'mistralai/mixtral-8x7b-instruct-v0.1', + # Llama 4 Scout variants + 'meta-llama/llama-4-scout-17b-16e-instruct', + 'meta-llama/llama-4-scout-17b-16e-instruct-fp8-dynamic', + 'meta-llama/llama-4-scout-17b-16e-instruct-quantized.w4a16', + # Llama 4 Maverick + 'meta-llama/llama-4-maverick-17b-128e-instruct-fp8', + # Qwen 2.5 7B variants (note: quantized use redhatai/ prefix) + 'qwen/qwen2.5-7b-instruct', + 'redhatai/qwen2.5-7b-instruct-fp8-dynamic', + 'redhatai/qwen2.5-7b-instruct-quantized.w4a16', + 'redhatai/qwen2.5-7b-instruct-quantized.w8a8', + # Llama 3.3 70B variants (note: quantized use redhatai/ prefix) + 'meta-llama/llama-3.3-70b-instruct', + 'redhatai/llama-3.3-70b-instruct-quantized.w4a16', + 'redhatai/llama-3.3-70b-instruct-quantized.w8a8', +} + +# Maps BLIS repo names to AA quality CSV model names BLIS_TO_QUALITY_MODEL_MAP = { - 'ibm-granite/granite-3.1-8b-instruct': 'Granite 3.3 8B (Non-reasoning)', - 'meta-llama/llama-3.1-8b-instruct': 'Llama 3.1 8B Instruct', - 'meta-llama/llama-3.3-70b-instruct': 'Llama 3.3 70B Instruct', - 'microsoft/phi-4': 'Phi-4', - 'mistralai/mistral-small-24b-instruct-2501': 'Mistral Small 3.1', - 'mistralai/mistral-small-3.1-24b-instruct-2503': 'Mistral Small 3.2', - 'mistralai/mixtral-8x7b-instruct-v0.1': 'Mixtral 8x7B Instruct', + # GPT-OSS (highest accuracy) 'openai/gpt-oss-120b': 'gpt-oss-120B (high)', 'openai/gpt-oss-20b': 'gpt-oss-20B (high)', - 'qwen/qwen2.5-7b-instruct': 'Qwen 2.5 7B Instruct', + # Phi-4 + 'microsoft/phi-4': 'Phi-4', + 'microsoft/phi-4-fp8-dynamic': 'Phi-4', + 'microsoft/phi-4-quantized.w4a16': 'Phi-4', + 'microsoft/phi-4-quantized.w8a8': 'Phi-4', + # Mistral Small + 'mistralai/mistral-small-24b-instruct-2501': 'Mistral Small 3', + 'mistralai/mistral-small-3.1-24b-instruct-2503': 'Mistral Small 3.1', + 'mistralai/mistral-small-3.1-24b-instruct-2503-fp8-dynamic': 'Mistral Small 3.1', + 'mistralai/mistral-small-3.1-24b-instruct-2503-quantized.w4a16': 'Mistral Small 3.1', + 'mistralai/mistral-small-3.1-24b-instruct-2503-quantized.w8a8': 'Mistral Small 3.1', + 'mistralai/mixtral-8x7b-instruct-v0.1': 'Mixtral 8x7B Instruct', + # Llama 4 + 'meta-llama/llama-4-scout-17b-16e-instruct': 'Llama 4 Scout', + 'meta-llama/llama-4-scout-17b-16e-instruct-fp8-dynamic': 'Llama 4 Scout', + 'meta-llama/llama-4-scout-17b-16e-instruct-quantized.w4a16': 'Llama 4 Scout', + 'meta-llama/llama-4-maverick-17b-128e-instruct-fp8': 'Llama 4 Maverick', + # Qwen 2.5 7B (note: quantized use redhatai/ prefix) + 'qwen/qwen2.5-7b-instruct': 'Qwen2.5 Max', + 'redhatai/qwen2.5-7b-instruct-fp8-dynamic': 'Qwen2.5 Max', + 'redhatai/qwen2.5-7b-instruct-quantized.w4a16': 'Qwen2.5 Max', + 'redhatai/qwen2.5-7b-instruct-quantized.w8a8': 'Qwen2.5 Max', + # Llama 3.3 70B (note: quantized use redhatai/ prefix) + 'meta-llama/llama-3.3-70b-instruct': 'Llama 3.3 Instruct 70B', + 'redhatai/llama-3.3-70b-instruct-quantized.w4a16': 'Llama 3.3 Instruct 70B', + 'redhatai/llama-3.3-70b-instruct-quantized.w8a8': 'Llama 3.3 Instruct 70B', } # Hardware 
costs (monthly) - BOTH H100 and A100-80 are real BLIS data @@ -2784,7 +3004,7 @@ def blis_recommendation(context: dict) -> dict: Creates MODEL+HARDWARE combinations ranked by priority: - cost_saving: cheapest hardware that meets SLO for best models - low_latency: fastest hardware (lowest TTFT) for best models - - high_quality: best model quality with hardware that meets SLO + - high_accuracy: best model accuracy with hardware that meets SLO - balanced: weighted combination of all factors """ use_case = context.get("use_case", "chatbot_conversational") @@ -2818,17 +3038,23 @@ def blis_recommendation(context: dict) -> dict: # Priority weights for MCDM weights = { - "balanced": {"quality": 0.30, "latency": 0.30, "cost": 0.25, "throughput": 0.15}, - "low_latency": {"quality": 0.15, "latency": 0.50, "cost": 0.15, "throughput": 0.20}, - "cost_saving": {"quality": 0.20, "latency": 0.15, "cost": 0.50, "throughput": 0.15}, - "high_quality": {"quality": 0.50, "latency": 0.20, "cost": 0.15, "throughput": 0.15}, - "high_throughput": {"quality": 0.15, "latency": 0.15, "cost": 0.15, "throughput": 0.55}, + "balanced": {"accuracy": 0.30, "latency": 0.30, "cost": 0.25, "throughput": 0.15}, + "low_latency": {"accuracy": 0.15, "latency": 0.50, "cost": 0.15, "throughput": 0.20}, + "cost_saving": {"accuracy": 0.20, "latency": 0.15, "cost": 0.50, "throughput": 0.15}, + "high_accuracy": {"accuracy": 0.50, "latency": 0.20, "cost": 0.15, "throughput": 0.15}, + "high_throughput": {"accuracy": 0.15, "latency": 0.15, "cost": 0.15, "throughput": 0.55}, }[priority] # Aggregate BLIS data by model+hardware (use best config per combo) + # FILTER: Only include models that have BOTH AA quality AND BLIS performance data model_hw_combos = {} for b in benchmarks: model_repo = b['model_hf_repo'] + + # Skip models not in our valid list (must have both AA + BLIS data) + if model_repo not in VALID_BLIS_MODELS: + continue + hw = b['hardware'] hw_count = b['hardware_count'] key = (model_repo, hw, hw_count) @@ -2882,7 +3108,7 @@ def blis_recommendation(context: dict) -> dict: # Calculate weighted MCDM score final_score = ( - weights['quality'] * quality_score + + weights['accuracy'] * quality_score + weights['latency'] * latency_score + weights['cost'] * cost_score + weights['throughput'] * throughput_score @@ -2955,7 +3181,7 @@ def blis_recommendation(context: dict) -> dict: ], }, "score_breakdown": { - "quality": {"score": top['quality_score'], "weight": weights['quality']}, + "accuracy": {"score": top['quality_score'], "weight": weights['accuracy']}, "latency": {"score": top['latency_score'], "weight": weights['latency']}, "cost": {"score": top['cost_score'], "weight": weights['cost']}, "throughput": {"score": top['throughput_score'], "weight": weights['throughput']}, @@ -2984,15 +3210,31 @@ def blis_recommendation(context: dict) -> dict: "latency_score": c['latency_score'], "cost_score": c['cost_score'], "capacity_score": c['throughput_score'], - "quality_contribution": round(c['quality_score'] * weights['quality'] / 100 * c['final_score'], 1), + "accuracy_contribution": round(c['quality_score'] * weights['accuracy'] / 100 * c['final_score'], 1), "latency_contribution": round(c['latency_score'] * weights['latency'] / 100 * c['final_score'], 1), "cost_contribution": round(c['cost_score'] * weights['cost'] / 100 * c['final_score'], 1), "capacity_contribution": round(c['throughput_score'] * weights['throughput'] / 100 * c['final_score'], 1), }, - "blis_metrics": { - "ttft_p95_ms": c['ttft_p95'], - "e2e_p95_ms": c['e2e_p95'], - 
"tokens_per_second": c['tokens_per_second'], + "blis_slo": { + "slo_actual": { + "ttft_mean_ms": c['ttft_mean'], + "ttft_p95_ms": c['ttft_p95'], + "itl_mean_ms": c['itl_mean'], + "itl_p95_ms": c['itl_p95'], + "e2e_mean_ms": c['e2e_mean'], + "e2e_p95_ms": c['e2e_p95'], + }, + "throughput": { + "tokens_per_sec": c['tokens_per_second'], + }, + "token_config": { + "prompt": c['prompt_tokens'], + "output": c['output_tokens'], + }, + "hardware": c['hardware'], + "hardware_count": c['hardware_count'], + "model_repo": c['model_repo'], + "benchmark_samples": 1, }, "cost_monthly": c['hw_cost_monthly'], "meets_slo": c['meets_slo'], @@ -3016,8 +3258,8 @@ def get_selection_reason(top: dict, priority: str) -> str: return f"💰 {model} on {hw} is the most cost-effective option (${cost:,}/mo) that meets your SLO requirements with {ttft:.0f}ms TTFT." elif priority == "low_latency": return f"⚡ {model} on {hw} delivers the lowest latency ({ttft:.0f}ms TTFT P95) from actual BLIS benchmarks." - elif priority == "high_quality": - return f"⭐ {model} has the highest quality score for your use case, running on {hw} with {ttft:.0f}ms TTFT." + elif priority == "high_accuracy": + return f"⭐ {model} has the highest accuracy score for your use case, running on {hw} with {ttft:.0f}ms TTFT." elif priority == "high_throughput": return f"📈 {model} on {hw} achieves {tps:.0f} tokens/sec throughput from actual BLIS benchmarks." else: # balanced @@ -3046,7 +3288,7 @@ def get_model_pros(combo: dict, priority: str) -> list: pros.append(f"💰 Cost-efficient (${cost:,}/mo)") if quality > 50: - pros.append(f"⭐ High quality ({quality:.0f}%)") + pros.append(f"⭐ High accuracy ({quality:.0f}%)") if combo['meets_slo']: pros.append("✅ Meets SLO targets") @@ -3072,7 +3314,7 @@ def get_model_cons(combo: dict, priority: str) -> list: cons.append(f"💸 Premium cost (${cost:,}/mo)") if quality < 40: - cons.append(f"📊 Lower quality score ({quality:.0f}%)") + cons.append(f"📊 Lower accuracy score ({quality:.0f}%)") if not combo['meets_slo']: cons.append("⚠️ May not meet SLO") @@ -3089,7 +3331,7 @@ def mock_recommendation(context: dict) -> dict: """FALLBACK: Recommendation using CSV data when BLIS unavailable. 
Data sources: - - Quality: weighted_scores/{use_case}.csv (task-specific benchmark scores) + - Accuracy: weighted_scores/{use_case}.csv (task-specific benchmark scores) - Cost: model_pricing.csv (price_blended - $/1M tokens) - Latency: model_pricing.csv (median_output_tokens_per_sec, median_ttft_seconds) @@ -3112,7 +3354,7 @@ def mock_recommendation(context: dict) -> dict: use_case = "chatbot_conversational" # Validate priority is in allowed list - valid_priorities = ["balanced", "low_latency", "cost_saving", "high_quality", "high_throughput"] + valid_priorities = ["balanced", "low_latency", "cost_saving", "high_accuracy", "high_throughput"] if priority not in valid_priorities: priority = "balanced" @@ -3145,11 +3387,11 @@ def mock_recommendation(context: dict) -> dict: # Priority-based weights for MCDM scoring weights = { - "balanced": {"quality": 0.30, "latency": 0.25, "cost": 0.25, "capacity": 0.20}, - "low_latency": {"quality": 0.20, "latency": 0.45, "cost": 0.15, "capacity": 0.20}, - "cost_saving": {"quality": 0.20, "latency": 0.15, "cost": 0.50, "capacity": 0.15}, - "high_quality": {"quality": 0.50, "latency": 0.20, "cost": 0.15, "capacity": 0.15}, - "high_throughput": {"quality": 0.20, "latency": 0.15, "cost": 0.15, "capacity": 0.50}, + "balanced": {"accuracy": 0.30, "latency": 0.25, "cost": 0.25, "capacity": 0.20}, + "low_latency": {"accuracy": 0.20, "latency": 0.45, "cost": 0.15, "capacity": 0.20}, + "cost_saving": {"accuracy": 0.20, "latency": 0.15, "cost": 0.50, "capacity": 0.15}, + "high_accuracy": {"accuracy": 0.50, "latency": 0.20, "cost": 0.15, "capacity": 0.15}, + "high_throughput": {"accuracy": 0.20, "latency": 0.15, "cost": 0.15, "capacity": 0.50}, }[priority] # Parse use case score from weighted_scores CSV @@ -3296,7 +3538,7 @@ def calculate_latency_score(model_name: str) -> float: capacity = m["capacity"] if m["capacity"] and not math.isnan(m["capacity"]) else 50.0 m["final_score"] = ( - quality * weights["quality"] + + quality * weights["accuracy"] + latency * weights["latency"] + cost * weights["cost"] + capacity * weights["capacity"] @@ -3363,7 +3605,7 @@ def calculate_latency_score(model_name: str) -> float: "latency_score": m["latency"], "cost_score": m["cost"], "capacity_score": m["capacity"], - "quality_contribution": m["quality"] * weights["quality"], + "accuracy_contribution": m["quality"] * weights["accuracy"], "latency_contribution": m["latency"] * weights["latency"], "cost_contribution": m["cost"] * weights["cost"], "capacity_contribution": m["capacity"] * weights["capacity"], @@ -3393,33 +3635,11 @@ def calculate_latency_score(model_name: str) -> float: # ============================================================================= def render_hero(): - """Render the animated hero section with project description.""" + """Render compact hero section.""" st.markdown("""
- Compass uses Qwen 2.5 7B to extract your business requirements from natural language, - then scores 206 open-source models using Multi-Criteria Decision Making (MCDM) - across Quality, Latency, Cost, and Capacity to recommend the best model for your deployment. - All data powered by Artificial Analysis benchmarks. -
| Formula & Explanation | ||||||||
|---|---|---|---|---|---|---|---|---|
| 🎯 Quality | +🎯 Accuracy |
- Quality = UseCase_Score(model) × 100+ Accuracy = UseCase_Score(model) × 100Use-case specific score from weighted_scores CSVs. Each use case has pre-ranked models based on relevant benchmarks (e.g., LiveCodeBench for code, MMLU for chatbot). Score range: 0-100.
|
| Component | +Explanation | +
|---|---|
| 🎯 Accuracy | +Use-case specific score from weighted benchmark CSVs (MMLU-Pro, LiveCodeBench, etc.) | +
| ⚡ Latency | +Based on tokens/sec from model_pricing.csv + TTFT bonus for fast response | +
| 💰 Cost | +Inverted price score - cheaper models score higher (self-hosted = 95) | +
| 📈 Capacity | +Throughput potential based on model size and architecture (MoE bonus) | +
+ Complete benchmark data from Artificial Analysis covering + 206 open-source LLMs across + 15 benchmark datasets. +
+ """, unsafe_allow_html=True) - with col5: - show_count = st.selectbox("Show Top", [3, 5, 10], key="show_count") - - # Show "Best Model for Priority" when specific priority is selected (not All Priorities) - if priority_filter != "All Priorities" and recommendations: - # Calculate best model for selected priority - priority_weights_map = { - "⚖️ Balanced": {"quality": 0.30, "latency": 0.25, "cost": 0.25, "capacity": 0.20}, - "⚡ Low Latency": {"quality": 0.20, "latency": 0.45, "cost": 0.15, "capacity": 0.20}, - "💰 Cost Saving": {"quality": 0.20, "latency": 0.15, "cost": 0.50, "capacity": 0.15}, - "⭐ High Quality": {"quality": 0.50, "latency": 0.20, "cost": 0.15, "capacity": 0.15}, - "📈 High Throughput": {"quality": 0.20, "latency": 0.15, "cost": 0.15, "capacity": 0.50}, - } - pweights = priority_weights_map.get(priority_filter, priority_weights_map["⚖️ Balanced"]) + if models_df is not None and not models_df.empty: + # Search + search = st.text_input("🔍 Search models", placeholder="e.g., Llama, Qwen, DeepSeek...", key="about_catalog_search") - best_model = None - best_score = 0 - for rec in recommendations: - breakdown = rec.get("score_breakdown", {}) - score = ( - (breakdown.get("quality_score") or 0) * pweights["quality"] + - (breakdown.get("latency_score") or 0) * pweights["latency"] + - (breakdown.get("cost_score") or 0) * pweights["cost"] + - (breakdown.get("capacity_score") or 0) * pweights["capacity"] - ) - if score > best_score: - best_score = score - best_model = rec + filtered_df = models_df.copy() + if search: + filtered_df = filtered_df[filtered_df.apply(lambda row: search.lower() in str(row).lower(), axis=1)] - if best_model: - model_name = best_model.get("model_name", "Unknown") - provider = best_model.get("provider", "Unknown") - breakdown = best_model.get("score_breakdown", {}) - - st.markdown(f""" -| Rank | -Model | -🎯 Quality | -⚡ Latency | -💰 Cost | -📈 Capacity | -Final Score | -Pros & Cons | -Action | -
|---|---|---|---|---|---|---|---|---|
{i} |
-
-
-
- {rec.get('model_name', 'Unknown')}
- {rec.get('provider', 'Open Source')}
+ return {
+ "accuracy": raw_aa,
+ "latency": backend_scores.get("latency_score", ui_breakdown.get("latency_score", 0)),
+ "cost": backend_scores.get("price_score", ui_breakdown.get("cost_score", 0)),
+ "complexity": backend_scores.get("complexity_score", ui_breakdown.get("capacity_score", 0)),
+ "final": backend_scores.get("balanced_score", rec.get("final_score", 0)),
+ }
+
+ # Find best model for each category
+ best_overall = max(recommendations, key=lambda x: get_scores(x)["final"])
+ best_accuracy = max(recommendations, key=lambda x: get_scores(x)["accuracy"])
+ best_latency = max(recommendations, key=lambda x: get_scores(x)["latency"])
+ best_cost = max(recommendations, key=lambda x: get_scores(x)["cost"])
+
+ # Helper to render a "Best" card
+ def render_best_card(title, icon, color, rec, highlight_field):
+ scores = get_scores(rec)
+ model_name = rec.get('model_name', 'Unknown')
+ gpu_cfg = rec.get('gpu_config', {}) or {}
+ hw_type = gpu_cfg.get('gpu_type', rec.get('hardware', 'H100'))
+ hw_count = gpu_cfg.get('gpu_count', rec.get('hardware_count', 1))
+ hw_display = f"{hw_count}x {hw_type}"
+
+ highlight_value = scores.get(highlight_field, 0)
+ final_score = scores.get("final", 0)
+
+ return f'''
+
+
+ {icon}
+ {title}
+
+
+
- {model_name}
+ {hw_display}
|
-
-
- {breakdown.get('quality_score', 0):.0f}%
-
- |
-
-
- {breakdown.get('latency_score', 0):.0f}%
-
-
- |
-
-
- {breakdown.get('cost_score', 0):.0f}%
-
- |
-
-
- {breakdown.get('capacity_score', 0):.0f}%
-
-
- |
- {rec.get('final_score', 0):.1f} | -
-
- {tags_html}
-
- |
-
-
-
-
- |
-
+ 📊 Research-Based Defaults: Values are set to the maximum acceptable
+ for {use_case.replace('_', ' ').title()} with {priority.replace('_', ' ').title()} priority — showing you all viable options.
+
+ 🎯 How it works: Only models whose actual BLIS benchmark performance
+ meets these SLO targets will be shown. Lower the values to filter down to faster/better models.
+
+ Adjust SLO targets above to find models with different latency/performance trade-offs. + Stricter SLOs = fewer models, Relaxed SLOs = more options. +
+Based on {priority_text} optimization
+Model
+{model_name}
+Quality Score: {quality_score:.0f}%
+Hardware Configuration
+{hw_display}
+⚡ Expected SLO (BLIS p95)
+Based on {priority.replace('_', ' ').title()} priority weighting
""", unsafe_allow_html=True) - # Display BLIS SLO data if available (REAL benchmark data) - blis_slo = winner.get("blis_slo") - if blis_slo: + # Display BLIS SLO data - use backend fields or blis_slo + # Get predicted values from backend OR from blis_slo + blis_slo = winner.get("blis_slo", {}) or {} + gpu_config = winner.get("gpu_config", {}) or {} + + # Get SLO values - prioritize backend's predicted_* fields, fallback to blis_slo + ttft_p95_val = winner.get("predicted_ttft_p95_ms") or blis_slo.get("slo_actual", {}).get("ttft_p95_ms", 0) + itl_p95_val = winner.get("predicted_itl_p95_ms") or blis_slo.get("slo_actual", {}).get("itl_p95_ms", 0) + e2e_p95_val = winner.get("predicted_e2e_p95_ms") or blis_slo.get("slo_actual", {}).get("e2e_p95_ms", 0) + throughput_qps_val = winner.get("predicted_throughput_qps") or (blis_slo.get("throughput", {}).get("tokens_per_sec", 0) / 100 if blis_slo.get("throughput", {}).get("tokens_per_sec") else 0) + + # Get traffic profile from winner or result + traffic_profile = winner.get("traffic_profile", {}) or {} + prompt_tokens_val = traffic_profile.get("prompt_tokens", blis_slo.get("token_config", {}).get("prompt", 512)) + output_tokens_val = traffic_profile.get("output_tokens", blis_slo.get("token_config", {}).get("output", 256)) + + # Get hardware info + hw_type_val = gpu_config.get("gpu_type", blis_slo.get("hardware", "H100")) + hw_count_val = gpu_config.get("gpu_count", blis_slo.get("hardware_count", 1)) + tp_val = gpu_config.get("tensor_parallel", 1) + replicas_val = gpu_config.get("replicas", 1) + + # Show BLIS benchmark box if we have any SLO data + if ttft_p95_val or itl_p95_val or e2e_p95_val: st.markdown("---") st.markdown("""- 🔬 BLIS Benchmarks: These are real measured values from the BLIS simulator across 591 benchmark samples. - Unlike research-backed targets, these represent actual achievable SLOs for this model/hardware configuration. + 🔬 BLIS Benchmarks: Real measured values from IBM BLIS vLLM simulation. + Hardware: {hw_count_val}x {hw_type_val} | + Token Config: {prompt_tokens_val}→{output_tokens_val}
{int(ttft_p95_show) if ttft_p95_show else 'N/A'}ms
+p95 latency
{int(itl_p95_show) if itl_p95_show else 'N/A'}ms
+inter-token latency
{int(e2e_p95_show) if e2e_p95_show else 'N/A'}ms
+end-to-end
Tokens/sec
-{throughput.get('tokens_per_sec', 'N/A')}
+Throughput
+{int(tps_show) if tps_show else 'N/A'} tok/s
Hardware
-{hardware} x{hw_count}
+{hw_count_val}x {hw_type_val}
Token Config
-{token_config.get('prompt', '?')} → {token_config.get('output', '?')}
+{prompt_tokens_val} → {output_tokens_val}
- 📊 BLIS Samples: {blis_slo.get('benchmark_samples', 0)} benchmarks | - Model: {blis_slo.get('model_repo', 'N/A').split('/')[-1]} -
-- {model_name} is not in the BLIS benchmark dataset. - The quality, latency, and cost scores above are derived from Artificial Analysis benchmarks and model characteristics. -
-- 📊 BLIS models available: Qwen2.5-7B, Llama-3.1-8B, Llama-3.3-70B, Phi-4, Mistral-Small-24B, Mixtral-8x7B, Granite-3.1-8B + 📊 Data Source: IBM BLIS vLLM Simulation | + Model: {winner.get('model_name', 'Unknown')}
Priority-based weight adjustment:
| Priority | Quality | Latency | Cost | Capacity |
|---|---|---|---|---|
| Priority | Accuracy | Latency | Cost | Capacity |
| ⚖️ Balanced | 30% | 25% | 25% | 20% |
| ⚡ Low Latency | 20% | 45% | 15% | 20% |
| 💰 Cost Saving | 20% | 15% | 50% | 15% |
| ⭐ High Quality | 50% | 20% | 15% | 15% |
| ⭐ High Accuracy | 50% | 20% | 15% | 15% |
| 📈 High Throughput | 20% | 15% | 15% | 50% |
| Factor | Impact on Recommendation | Example |
|---|---|---|
| 🎯 Use Case | Models are ranked by use-case-specific benchmarks from our 206-model evaluation. Higher-ranked models for your use case get better Quality scores. | Code Completion → LiveCodeBench weighted heavily |
| 🎯 Use Case | Models are ranked by use-case-specific benchmarks from our 206-model evaluation. Higher-ranked models for your use case get better Accuracy scores. | Code Completion → LiveCodeBench weighted heavily |
| 👥 User Count | High user counts increase importance of Capacity & Latency. More users = need for faster, scalable models. | 10K users → Capacity weight +15% |
| 🖥️ Hardware | GPU type affects Cost & Throughput calculations. Premium GPUs enable larger models. | H100 → Can run 70B+ models efficiently |
| ⚡ Priority | Dynamically shifts MCDM weight distribution. Your priority becomes the dominant factor (45-50%). | "Cost Saving" → Cost weight = 50% |
Each use case has a dedicated Weighted Scores CSV (e.g., opensource_chatbot_conversational.csv)
that ranks all 206 models based on relevant benchmarks for that task:
@@ -5334,7 +5911,7 @@ def render_how_it_works_tab():
- 📈 The use-case quality score becomes the "Quality" component in the MCDM formula, ensuring models best suited for your task rank highest. + 📈 The use-case accuracy score becomes the "Accuracy" component in the MCDM formula, ensuring models best suited for your task rank highest.
""", unsafe_allow_html=True)