[shortfin llm integration tests] Add tinystories llama2 25m model as a smoke test #936

Merged: 12 commits, Feb 14, 2025
5 changes: 5 additions & 0 deletions .github/workflows/pkgci_shark_ai.yml
@@ -88,6 +88,11 @@ jobs:
          source ${VENV_DIR}/bin/activate
          uv pip install -r requirements-iree-pinned.txt

      - name: Run LLM Smoke Test
        run: |
          source ${VENV_DIR}/bin/activate
          pytest -v -s --test_device=${{ matrix.test_device }} app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py --log-cli-level=INFO

      - name: Run LLM Integration Tests
        run: |
          source ${VENV_DIR}/bin/activate
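For reference, the new smoke-test step can be reproduced outside CI with an equivalent pytest invocation. The sketch below is not part of the diff; it assumes a virtual environment with the shark-ai test requirements installed, and the "cpu" device name is a placeholder for whatever ${{ matrix.test_device }} resolves to.

# Sketch only: local equivalent of the "Run LLM Smoke Test" workflow step above.
import sys

import pytest

exit_code = pytest.main(
    [
        "-v",
        "-s",
        "--test_device=cpu",  # placeholder; use your matrix.test_device value
        "app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py",
        "--log-cli-level=INFO",
    ]
)
sys.exit(int(exit_code))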
@@ -33,7 +33,7 @@
# we can replace this with an import after #890 merges
TEST_MODELS = {
    "llama3.1_8b": ModelConfig(
-        source=ModelSource.HUGGINGFACE,
+        source=ModelSource.HUGGINGFACE_FROM_GGUF,
        repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF",
        model_file="meta-llama-3.1-8b-instruct.f16.gguf",
        tokenizer_id="NousResearch/Meta-Llama-3.1-8B",
198 changes: 163 additions & 35 deletions app_tests/integration_tests/llm/model_management.py
@@ -1,9 +1,12 @@
"""Module for managing model artifacts through various processing stages."""
import logging
import tempfile
import zipfile
import urllib.request
from pathlib import Path
import subprocess
from dataclasses import dataclass
-from typing import Optional, Tuple, Dict
+from typing import Optional, Tuple
from enum import Enum, auto

from sharktank.utils.hf_datasets import Dataset, RemoteFile, get_dataset
@@ -13,6 +16,37 @@
logger = logging.getLogger(__name__)


def get_llama_cpp_path() -> Path:
    """Downloads and extracts llama.cpp if needed, returns path to installation."""
    # Use system temp directory as base
    temp_base = Path(tempfile.gettempdir()) / "sharktank_llamacpp"
    llama_cpp_dir = temp_base / "llama.cpp-b4696"

    # Only download and extract if not already present
    if not llama_cpp_dir.exists():
        temp_base.mkdir(parents=True, exist_ok=True)
        zip_path = temp_base / "llama.cpp.zip"

        # Download zip file
        logger.info("Downloading llama.cpp...")
        urllib.request.urlretrieve(
            "https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4696.zip",
            zip_path,
        )

        # Extract zip file
        logger.info("Extracting llama.cpp...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(temp_base)

        # Clean up zip file
        zip_path.unlink()

    logger.info(f"llama.cpp installed at {llama_cpp_dir}")

    return llama_cpp_dir


class AccuracyValidationException(RuntimeError):
    """Custom exception for accuracy validation failures."""

@@ -32,9 +66,10 @@ def __init__(


class ModelSource(Enum):
-    HUGGINGFACE = auto()
+    HUGGINGFACE_FROM_GGUF = auto()
    LOCAL = auto()
    AZURE = auto()
    HUGGINGFACE_FROM_SAFETENSORS = auto()


@dataclass
@@ -62,7 +97,7 @@ class ModelConfig:
    azure_config: Optional[AzureConfig] = None

    def __post_init__(self):
-        if self.source == ModelSource.HUGGINGFACE:
+        if self.source == ModelSource.HUGGINGFACE_FROM_GGUF:
            if not (self.dataset_name or self.repo_id):
                raise ValueError(
                    "Either dataset_name or repo_id required for HuggingFace models"
@@ -71,6 +106,11 @@ def __post_init__(self):
raise ValueError("local_path required for local models")
elif self.source == ModelSource.AZURE and not self.azure_config:
raise ValueError("azure_config required for Azure models")
elif self.source == ModelSource.HUGGINGFACE_FROM_SAFETENSORS:
if not self.dataset_name:
raise ValueError(
"dataset_name required for HUGGINGFACE_FROM_SAFETENSORS models"
)


@dataclass
@@ -96,7 +136,7 @@ def __init__(self, base_dir: Path, config: ModelConfig):

    def _get_model_dir(self) -> Path:
        """Creates and returns appropriate model directory based on source."""
-        if self.config.source == ModelSource.HUGGINGFACE:
+        if self.config.source == ModelSource.HUGGINGFACE_FROM_GGUF:
            if self.config.dataset_name:
                return self.base_dir / self.config.dataset_name.replace("/", "_")
            return self.base_dir / self.config.repo_id.replace("/", "_")
@@ -108,6 +148,8 @@ def _get_model_dir(self) -> Path:
/ "azure"
/ self.config.azure_config.blob_path.replace("/", "_")
)
elif self.config.source == ModelSource.HUGGINGFACE_FROM_SAFETENSORS:
return self.base_dir / self.config.dataset_name.replace("/", "_")
raise ValueError(f"Unsupported model source: {self.config.source}")

def _download_from_huggingface(self) -> Path:
@@ -143,12 +185,67 @@ def _download_from_huggingface(self) -> Path:

        return model_path

    def _download_and_convert_from_huggingface(self) -> Path:
        """Downloads model from HuggingFace and converts through GGUF to IRPA."""
        irpa_path = self.model_dir / "model.irpa"

        if not irpa_path.exists():
            logger.info(
                f"Processing model {self.config.repo_id} from HuggingFace through GGUF to IRPA"
            )

            # Step 1: Download from HuggingFace
            hf_model_path = self.model_dir / "model_hf_repo_clone"
            if not hf_model_path.exists():
                logger.info(
                    f"Downloading model from HuggingFace: {self.config.repo_id}"
                )
                dataset = get_dataset(self.config.dataset_name)
                downloaded_files = dataset.download(local_dir=self.model_dir)

            # Step 2: Convert to GGUF
            gguf_path = self.model_dir / "model.gguf"
            if not gguf_path.exists():
                logger.info("Converting model to GGUF format")
                subprocess.run(
                    [
                        "python",
                        get_llama_cpp_path() / "convert_hf_to_gguf.py",
                        self.model_dir,
                        "--outfile",
                        str(gguf_path),
                        "--outtype",
                        "f32",
                    ],
                    check=True,
                )

            # Step 3: Convert to IRPA
            logger.info("Converting GGUF to IRPA format")
            subprocess.run(
                [
                    "python",
                    "-m",
                    "sharktank.tools.dump_gguf",
                    f"--gguf-file={gguf_path}",
                    "--save",
                    str(irpa_path),
                ],
                check=True,
            )

            # Cleanup intermediate files if desired
            # shutil.rmtree(hf_model_path)
            # gguf_path.unlink()

        return irpa_path

    def _copy_from_local(self) -> Path:
        """Copies model from local filesystem."""
        import shutil

        model_path = self.model_dir / self.config.model_file
        if not model_path.exists():
            import shutil

            logger.info(f"Copying local model from {self.config.local_path}")
            shutil.copy2(self.config.local_path, model_path)
        return model_path
@@ -267,12 +364,14 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts:
        manager = ModelStageManager(self.base_dir, config)

        # Stage 1: Download weights and tokenizer (cached)
-        if config.source == ModelSource.HUGGINGFACE:
+        if config.source == ModelSource.HUGGINGFACE_FROM_GGUF:
            weights_path = manager._download_from_huggingface()
        elif config.source == ModelSource.LOCAL:
            weights_path = manager._copy_from_local()
        elif config.source == ModelSource.AZURE:
            weights_path = manager._download_from_azure()
        elif config.source == ModelSource.HUGGINGFACE_FROM_SAFETENSORS:
            weights_path = manager._download_and_convert_from_huggingface()
        else:
            raise ValueError(f"Unsupported model source: {config.source}")

@@ -294,33 +393,62 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts:
)


-TEST_MODELS = {
-    "open_llama_3b": ModelConfig(
-        source=ModelSource.HUGGINGFACE,
-        repo_id="SlyEcho/open_llama_3b_v2_gguf",
-        model_file="open-llama-3b-v2-f16.gguf",
-        tokenizer_id="openlm-research/open_llama_3b_v2",
-        batch_sizes=(1, 4),
-        device_settings=None,
-    ),
-    "llama3.1_8b": ModelConfig(
-        source=ModelSource.HUGGINGFACE,
-        repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF",
-        model_file="meta-llama-3.1-8b-instruct.f16.gguf",
-        tokenizer_id="NousResearch/Meta-Llama-3.1-8B",
-        batch_sizes=(1, 4),
-        device_settings=None,
-    ),
-    "azure_llama": ModelConfig(
-        source=ModelSource.AZURE,
-        azure_config=AzureConfig(
-            account_name="sharkblobs",
-            container_name="halo-models",
-            blob_path="llm-dev/llama3_8b/8b_f16.irpa",
-        ),
-        model_file="azure-llama.irpa",
-        tokenizer_id="openlm-research/open_llama_3b_v2",
-        batch_sizes=(1, 4),
-        device_settings=None,
-    ),
-}
+TEST_MODELS = {}
+
+TEST_MODELS["open_llama_3b"] = ModelConfig(
+    source=ModelSource.HUGGINGFACE_FROM_GGUF,
+    repo_id="SlyEcho/open_llama_3b_v2_gguf",
+    model_file="open-llama-3b-v2-f16.gguf",
+    tokenizer_id="openlm-research/open_llama_3b_v2",
+    batch_sizes=(1, 4),
+    device_settings=None,
+)
+
+TEST_MODELS["llama3.1_8b"] = ModelConfig(
+    source=ModelSource.HUGGINGFACE_FROM_GGUF,
+    repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF",
+    model_file="meta-llama-3.1-8b-instruct.f16.gguf",
+    tokenizer_id="NousResearch/Meta-Llama-3.1-8B",
+    batch_sizes=(1, 4),
+    device_settings=None,
+)
+
+TEST_MODELS[
+    "azure_llama"
+] = ModelConfig(  # This model is currently unused. When you use it, check to make sure the irpa indeed still exist and remove this comment.
+    source=ModelSource.AZURE,
+    azure_config=AzureConfig(
+        account_name="sharkblobs",
+        container_name="halo-models",
+        blob_path="llm-dev/llama3_8b/8b_f16.irpa",
+    ),
+    model_file="azure-llama.irpa",
+    tokenizer_id="openlm-research/open_llama_3b_v2",
+    batch_sizes=(1, 4),
+    device_settings=None,
+)
+
+# TODO: upstream this to sharktank
+Dataset(
+    "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA",
+    files=[
+        RemoteFile(
+            file_id="model.safetensors",
+            filename="model.safetensors",
+            repo_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA",
+            extra_filenames=(
+                "config.json",
+                "tokenizer.json",
+                "tokenizer_config.json",
+            ),
+        ),
+    ],
+)
+
+TEST_MODELS["tinystories_llama2_25m"] = ModelConfig(
+    source=ModelSource.HUGGINGFACE_FROM_SAFETENSORS,
+    dataset_name="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA",
+    model_file="model.irpa",  # This will be the final converted file name
+    tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA",
+    batch_sizes=(1, 4),
+    device_settings=None,
+)
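The new HUGGINGFACE_FROM_SAFETENSORS path above fetches safetensors weights from HuggingFace, converts them to GGUF with llama.cpp's convert_hf_to_gguf.py, and then dumps the GGUF to IRPA via sharktank.tools.dump_gguf. A minimal sketch of how a test might drive the new tinystories entry end to end (not part of the diff; the import path and the ModelProcessor(base_dir) constructor are assumptions inferred from the code above):

# Sketch only: exercising the new tinystories_llama2_25m config.
from pathlib import Path

from app_tests.integration_tests.llm.model_management import (  # assumed import path
    ModelProcessor,
    TEST_MODELS,
)

processor = ModelProcessor(Path("/tmp/shark_model_artifacts"))  # assumed constructor
# Runs the download/convert/compile stages and returns a ModelArtifacts record
# with the resulting weights, tokenizer, and compiled artifact paths.
artifacts = processor.process_model(TEST_MODELS["tinystories_llama2_25m"])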
2 changes: 1 addition & 1 deletion app_tests/integration_tests/llm/sglang/conftest.py
@@ -40,7 +40,7 @@ def model_artifacts(request, tmp_path_factory):
    tmp_dir = tmp_path_factory.mktemp("sglang_integration_tests")

    model_config = ModelConfig(
-        source=ModelSource.HUGGINGFACE,
+        source=ModelSource.HUGGINGFACE_FROM_GGUF,
        repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF",
        model_file="meta-llama-3.1-8b-instruct.f16.gguf",
        tokenizer_id="NousResearch/Meta-Llama-3.1-8B",
2 changes: 1 addition & 1 deletion app_tests/integration_tests/llm/shortfin/conftest.py
@@ -33,7 +33,7 @@ def test_device(request):
    return ret


-@pytest.fixture(scope="module")
+@pytest.fixture(scope="session")
def model_artifacts(tmp_path_factory, request, test_device):
    """Prepares model artifacts in a cached directory."""
    model_config = TEST_MODELS[request.param]
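The fixture scope change above means the prepared model artifacts are built once per pytest session and shared across all test modules, instead of being rebuilt for each module. A small illustration of the behavior, using hypothetical names that are not from the diff:

# Illustration only: a session-scoped fixture runs its body once per test session.
import pytest

@pytest.fixture(scope="session")
def expensive_artifacts(tmp_path_factory):
    # Built a single time; every test module in the session reuses this directory.
    return tmp_path_factory.mktemp("artifacts")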