21 changes: 14 additions & 7 deletions backend/src/recommendation/capacity_planner.py
@@ -232,13 +232,20 @@ def plan_all_capacities(
         if slo_status == "exceeds" and not include_near_miss:
             continue
 
-        # Calculate accuracy score
-        # If model is in catalog and we have an evaluator, use score_model()
-        # Otherwise, accuracy = 0
-        if model and model_evaluator:
-            accuracy_score = int(model_evaluator.score_model(model, intent))
-        else:
-            accuracy_score = 0
+        # Calculate accuracy score - USE RAW AA BENCHMARK SCORE
+        # This is the actual model accuracy from Artificial Analysis benchmarks
+        # NOT a composite score with latency/budget bonuses
+        from .usecase_quality_scorer import score_model_quality
+
+        # Try to get raw AA score using the benchmark model name
+        model_name_for_scoring = model.name if model else bench.model_hf_repo
+        raw_accuracy = score_model_quality(model_name_for_scoring, intent.use_case)
+
+        # If no score found, try with benchmark's model_hf_repo
+        if raw_accuracy == 0 and bench.model_hf_repo:
+            raw_accuracy = score_model_quality(bench.model_hf_repo, intent.use_case)
+
+        accuracy_score = int(raw_accuracy)
 
         complexity_score = scorer.score_complexity(gpu_config.gpu_count)
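For context, here is a self-contained sketch of the new accuracy path, with score_model_quality stubbed out since its implementation lives in usecase_quality_scorer.py rather than in this hunk; the model name and score below are fabricated for the demo.

from typing import Optional

# Stub standing in for usecase_quality_scorer.score_model_quality;
# the scores dict is fabricated purely for this illustration.
def score_model_quality(model_name: str, use_case: str) -> float:
    fake_aa_scores = {"llama-3.3-70b-instruct": 42.99}
    return fake_aa_scores.get(model_name.lower(), 0.0)

def accuracy_for(catalog_name: Optional[str], hf_repo: str, use_case: str) -> int:
    # Prefer the catalog model name, then fall back to the benchmark's HF repo,
    # mirroring the fallback logic added to plan_all_capacities().
    raw = score_model_quality(catalog_name or hf_repo, use_case)
    if raw == 0 and hf_repo:
        raw = score_model_quality(hf_repo, use_case)
    return int(raw)

print(accuracy_for(None, "Llama-3.3-70B-Instruct", "chatbot_conversational"))  # 42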
87 changes: 76 additions & 11 deletions backend/src/recommendation/usecase_quality_scorer.py
@@ -91,6 +91,63 @@ def _load_csv_scores(self, filepath: str) -> Dict[str, float]:
 
         return scores
 
+    # BLIS model variant to AA model mapping (for models with valid AA data)
+    BLIS_TO_AA_MAP = {
+        # === OPTION A: 25 VALID VARIANTS WITH REAL BLIS DATA ===
+        # GPT-OSS (61.62%, 55.23%)
+        "gpt-oss-120b": "gpt-oss-120b (high)",
+        "gpt-oss-20b": "gpt-oss-20b (high)",
+        # Llama 4 Maverick (46.86%)
+        "llama-4-maverick-17b-128e-instruct-fp8": "llama 4 maverick",
+        # Qwen 2.5 7B (44.71%) - maps to Qwen2.5 Max
+        "qwen2.5-7b-instruct": "qwen2.5 max",
+        "qwen2.5-7b-instruct-fp8-dynamic": "qwen2.5 max",
+        "qwen2.5-7b-instruct-quantized.w4a16": "qwen2.5 max",
+        "qwen2.5-7b-instruct-quantized.w8a8": "qwen2.5 max",
+        # Llama 3.3 70B (42.99%)
+        "llama-3.3-70b-instruct": "llama 3.3 instruct 70b",
+        "llama-3.3-70b-instruct-quantized.w4a16": "llama 3.3 instruct 70b",
+        "llama-3.3-70b-instruct-quantized.w8a8": "llama 3.3 instruct 70b",
+        # Llama 4 Scout (42.42%)
+        "llama-4-scout-17b-16e-instruct": "llama 4 scout",
+        "llama-4-scout-17b-16e-instruct-fp8-dynamic": "llama 4 scout",
+        "llama-4-scout-17b-16e-instruct-quantized.w4a16": "llama 4 scout",
+        # Mistral Small 3.1 (35.70%)
+        "mistral-small-3.1-24b-instruct-2503": "mistral small 3.1",
+        "mistral-small-3.1-24b-instruct-2503-fp8-dynamic": "mistral small 3.1",
+        "mistral-small-3.1-24b-instruct-2503-quantized.w4a16": "mistral small 3.1",
+        "mistral-small-3.1-24b-instruct-2503-quantized.w8a8": "mistral small 3.1",
+        # Phi-4 (35.57%)
+        "phi-4": "phi-4",
+        "phi-4-fp8-dynamic": "phi-4",
+        "phi-4-quantized.w4a16": "phi-4",
+        "phi-4-quantized.w8a8": "phi-4",
+        # Mistral Small 24B (33.79%)
+        "mistral-small-24b-instruct-2501": "mistral small 3",
+        # Mixtral 8x7B (20.51%)
+        "mixtral-8x7b-instruct-v0.1": "mixtral 8x7b instruct",
+    }
+
+    def _normalize_model_name(self, model_name: str) -> str:
+        """Normalize model name by removing quantization suffixes and org prefixes."""
+        name = model_name.lower()
+
+        # Remove org prefixes
+        if '/' in name:
+            name = name.split('/')[-1]
+
+        # Remove quantization suffixes
+        suffixes_to_remove = [
+            '-fp8-dynamic', '-fp8',
+            '-quantized.w4a16', '-quantized.w8a8',
+            '-instruct-2501', '-instruct-2503', '-instruct-hf',
+            '-instruct-v0.1', '-instruct'
+        ]
+        for suffix in suffixes_to_remove:
+            name = name.replace(suffix, '')
+
+        return name.strip('-').strip()
+
     def get_quality_score(self, model_name: str, use_case: str) -> float:
         """Get quality score for a model on a specific use case.
 
@@ -99,7 +156,7 @@ def get_quality_score(self, model_name: str, use_case: str) -> float:
             use_case: Use case identifier (e.g., "code_completion")
 
         Returns:
-            Quality score 0-100 (higher is better)
+            Quality score 0-100 (higher is better), or 0 if no valid AA data
         """
         # Normalize use case
         use_case_normalized = use_case.lower().replace(" ", "_").replace("-", "_")
@@ -110,27 +167,35 @@ def get_quality_score(self, model_name: str, use_case: str) -> float:
 
         scores = self._cache.get(use_case_normalized, {})
 
-        # Try exact match first
+        # Normalize the model name
         model_lower = model_name.lower()
+        base_model = self._normalize_model_name(model_name)
+
+        # Try exact match first
         if model_lower in scores:
             return scores[model_lower]
 
+        # Try BLIS to AA mapping (for known valid models)
+        for blis_pattern, aa_name in self.BLIS_TO_AA_MAP.items():
+            if blis_pattern in base_model:
+                if aa_name in scores:
+                    logger.debug(f"Matched {model_name} -> {aa_name} via BLIS mapping")
+                    return scores[aa_name]
+
         # Try partial matching (for HuggingFace repo names)
         for cached_name, score in scores.items():
-            model_words = set(model_lower.replace("-", " ").replace("/", " ").replace("_", " ").split())
+            model_words = set(base_model.replace("-", " ").replace("/", " ").replace("_", " ").split())
             cached_words = set(cached_name.replace("-", " ").replace("/", " ").replace("_", " ").split())
 
             common_words = model_words & cached_words
-            if len(common_words) >= 3:
+            if len(common_words) >= 2:  # Reduced from 3 to 2 for better matching
+                logger.debug(f"Partial match {model_name} -> {cached_name} (common: {common_words})")
                 return score
 
-        # Fallback: return median score for the use case
-        if scores:
-            median_score = sorted(scores.values())[len(scores) // 2]
-            logger.debug(f"No score found for {model_name}, using median: {median_score:.1f}")
-            return median_score
-
-        return 50.0  # Default fallback
+        # No valid AA data found - return 0 to indicate missing data
+        # This allows filtering out models without quality scores
+        logger.debug(f"No AA score found for {model_name} (base: {base_model})")
+        return 0.0  # Return 0 so min_accuracy filter can exclude these
 
     def get_top_models_for_usecase(self, use_case: str, top_n: int = 10) -> List[Tuple[str, float]]:
         """Get top N models for a specific use case."""
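To make the new lookup order concrete, here is a standalone sketch of the cascade (exact match, then the BLIS-to-AA mapping, then no data); it omits the word-overlap partial match for brevity, and the map and scores are simplified, fabricated values rather than the real Artificial Analysis data.

# Fabricated, simplified data for the demo only.
BLIS_TO_AA_MAP = {"llama-3.3-70b": "llama 3.3 instruct 70b"}
SCORES = {"llama 3.3 instruct 70b": 42.99, "phi-4": 35.57}

def normalize(name: str) -> str:
    # Drop the org prefix and common quantization/instruct suffixes.
    name = name.lower().split("/")[-1]
    for suffix in ("-quantized.w8a8", "-quantized.w4a16", "-fp8-dynamic", "-instruct"):
        name = name.replace(suffix, "")
    return name.strip("-")

def lookup(model_name: str) -> float:
    base = normalize(model_name)
    if model_name.lower() in SCORES:                      # 1. exact match
        return SCORES[model_name.lower()]
    for pattern, aa_name in BLIS_TO_AA_MAP.items():       # 2. BLIS variant -> AA name
        if pattern in base and aa_name in SCORES:
            return SCORES[aa_name]
    return 0.0                                            # 3. no valid AA data

print(lookup("RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8"))  # 42.99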
8 changes: 4 additions & 4 deletions data/research/slo_ranges.json
@@ -32,11 +32,11 @@
   "chatbot_conversational": {
     "description": "Real-time conversational chatbots",
     "token_config": {"prompt": 512, "output": 256},
-    "ttft_ms": {"min": 50, "max": 200, "blis_observed": {"min": 13.3, "max": 141.5, "mean": 44.3}},
-    "itl_ms": {"min": 10, "max": 40, "blis_observed": {"min": 2.8, "max": 65.6, "mean": 13.0}},
-    "e2e_ms": {"min": 1000, "max": 5000, "blis_observed": {"min": 769, "max": 16545, "mean": 3312}},
+    "ttft_ms": {"min": 50, "max": 500, "default": 150, "blis_observed": {"min": 13.3, "max": 141.5, "mean": 44.3}},
+    "itl_ms": {"min": 10, "max": 80, "default": 30, "blis_observed": {"min": 2.8, "max": 65.6, "mean": 13.0}},
+    "e2e_ms": {"min": 500, "max": 5000, "default": 1500, "blis_observed": {"min": 769, "max": 16545, "mean": 3312}},
     "tokens_per_sec": {"target": 200, "blis_range": [238, 27878]},
-    "research_note": "Nielsen's 1s guideline for conversational flow. BLIS: 345 samples show E2E mean of 3.3s."
+    "research_note": "Nielsen's 1s guideline for conversational flow. Research-based ranges for user experience."
   },
 
   "code_generation_detailed": {
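The new "default" fields give callers a sensible pre-fill when no SLO target is supplied. A minimal sketch of how a consumer might read them is below; the resolve_slo helper is hypothetical, since the actual consumer of slo_ranges.json is not part of this diff.

import json
from pathlib import Path
from typing import Optional

# Hypothetical consumer: pick the default SLO target for a use case, and
# clamp any user-supplied value into the researched [min, max] range.
def resolve_slo(use_case: str, metric: str, user_value: Optional[float] = None,
                path: str = "data/research/slo_ranges.json") -> float:
    ranges = json.loads(Path(path).read_text())[use_case][metric]
    if user_value is None:
        return ranges["default"]
    return min(max(user_value, ranges["min"]), ranges["max"])

# Example: TTFT target for the conversational chatbot profile.
# resolve_slo("chatbot_conversational", "ttft_ms")        -> 150
# resolve_slo("chatbot_conversational", "ttft_ms", 9000)  -> 500 (clamped to max)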