feat(cloud_cache): normalize model_id and include precision

gadicc · gadicc · commit ad1b2efc6021 · 2022-12-12T13:08:20.000Z
diff --git a/app.py b/app.py
@@ -18,7 +18,7 @@
 from getPipeline import getPipelineForModel, listAvailablePipelines, clearPipelines
 import re
 import requests
-from download import download_model
+from download import download_model, normalize_model_id
 import traceback
 
 RUNTIME_DOWNLOADS = os.getenv("RUNTIME_DOWNLOADS") == "1"
@@ -130,11 +130,14 @@ def inference(all_inputs: dict) -> dict:
     if not model_id:
         model_id = MODEL_ID
         result["$meta"].update({"MODEL_ID": MODEL_ID})
+    normalized_model_id = model_id
 
     if RUNTIME_DOWNLOADS:
         global downloaded_models
-        if last_model_id != model_id:
-            if not downloaded_models.get(model_id, None):
+        model_precision = call_inputs.get("MODEL_PRECISION", None)
+        normalized_model_id = normalize_model_id(model_id, model_precision)
+        if last_model_id != normalized_model_id:
+            if not downloaded_models.get(normalized_model_id, None):
                 model_url = call_inputs.get("MODEL_URL", None)
                 if not model_url:
                     return {
@@ -143,18 +146,22 @@ def inference(all_inputs: dict) -> dict:
                             "message": "Currently RUNTIME_DOWNOADS requires a MODEL_URL callInput",
                         }
                     }
-                download_model(model_id=model_id, model_url=model_url)
-                downloaded_models.update({model_id: True})
-            model = loadModel(model_id)
+                download_model(
+                    model_id=model_id,
+                    model_url=model_url,
+                    model_revision=model_precision,
+                )
+                downloaded_models.update({normalized_model_id: True})
+            model = loadModel(normalized_model_id)
             if PIPELINE == "ALL":
                 clearPipelines()
-            last_model_id = model_id
+            last_model_id = normalized_model_id
 
     if MODEL_ID == "ALL":
-        if last_model_id != model_id:
-            model = loadModel(model_id)
+        if last_model_id != normalized_model_id:
+            model = loadModel(normalized_model_id)
             clearPipelines()
-            last_model_id = model_id
+            last_model_id = normalized_model_id
     else:
         if model_id != MODEL_ID and not RUNTIME_DOWNLOADS:
             return {
@@ -172,7 +179,7 @@ def inference(all_inputs: dict) -> dict:
             pipeline_name = "StableDiffusionPipeline"
             result["$meta"].update({"PIPELINE": pipeline_name})
 
-        pipeline = getPipelineForModel(pipeline_name, model, model_id)
+        pipeline = getPipelineForModel(pipeline_name, model, normalized_model_id)
         if not pipeline:
             return {
                 "$error": {
@@ -190,7 +197,7 @@ def inference(all_inputs: dict) -> dict:
         scheduler_name = "DPMSolverMultistepScheduler"
         result["$meta"].update({"SCHEDULER": scheduler_name})
 
-    pipeline.scheduler = getScheduler(model_id, scheduler_name)
+    pipeline.scheduler = getScheduler(normalized_model_id, scheduler_name)
     if pipeline.scheduler == None:
         return {
             "$error": {
@@ -289,7 +296,9 @@ def inference(all_inputs: dict) -> dict:
                 }
             }
         torch.set_grad_enabled(True)
-        result = result | TrainDreamBooth(model_id, pipeline, model_inputs, call_inputs)
+        result = result | TrainDreamBooth(
+            normalized_model_id, pipeline, model_inputs, call_inputs
+        )
         torch.set_grad_enabled(False)
         send("inference", "done", {"startRequestId": startRequestId})
         result.update({"$timings": getTimings()})
diff --git a/docs/internal_safetensor_cache_flow.md b/docs/internal_safetensor_cache_flow.md
@@ -17,3 +17,32 @@ e.g. stabilityai/stable-diffusion-2-1-base
     1. Run inference with HF model.
 
 FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/huggingface/diffusers/models--stabilityai--stable-diffusion-2-1-base/refs/main'
+
+
+NVIDIA Quadro RTX 5000
+
+NO SAFETENSORS
+Downloaded in 462557 ms
+Loading model: stabilityai/stable-diffusion-2-1 (fp32)
+Loaded from disk in 3113 ms, to gpu in 1644 ms
+
+SAFETENSORS_FAST_GPU=0
+Loaded from disk in 2741 ms, to gpu in 557 ms
+
+SAFETENSORS_FAST_GPU=1
+Loaded from disk in 1153 ms, to gpu in 1495 ms
+
+
+
+NVIDIA Quadro RTX 5000 (fp16)
+
+NO SAFETENSORS
+Downloaded in 462557 ms
+Loading model: stabilityai/stable-diffusion-2-1-base (fp16)
+Loaded from disk in 2043 ms, to gpu in 1539 ms
+
+SAFETENSORS_FAST_GPU=0
+
+
+SAFETENSORS_FAST_GPU=1
+Loaded from disk in 1134 ms, to gpu in 1184 ms
diff --git a/download.py b/download.py
@@ -5,7 +5,7 @@
 from loadModel import loadModel, MODEL_IDS
 from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler
 from transformers import CLIPTextModel, CLIPTokenizer
-from precision import revision
+from precision import PRECISION, revision_from_precision, torch_dtype_from_precision
 from utils import Storage
 import subprocess
 from pathlib import Path
@@ -24,12 +24,30 @@ def send(type: str, status: str, payload: dict = {}):
         _send(type, status, payload)
 
 
-def download_model(model_url=None, model_id=None):
+def normalize_model_id(model_id: str, model_revision):
+    """Prefix with "models--", turn "/" into "--", append "--<revision>" if set."""
+    parts = ["models", *model_id.split("/")]
+    parts += [model_revision] if model_revision else []
+    return "--".join(parts)
+
+
+def download_model(model_url=None, model_id=None, model_revision=None):
+    print(
+        "download_model",
+        {
+            "model_url": model_url,
+            "model_id": model_id,
+            "model_revision": model_revision,
+        },
+    )
     id = model_id or MODEL_ID
     url = model_url or MODEL_URL
+    revision = model_revision or revision_from_precision()
+    normalized_model_id = id
 
     if url != "":
-        normalized_model_id = "models--" + model_id.replace("/", "--")
+        normalized_model_id = normalize_model_id(model_id, model_revision)
+        print({"normalized_model_id": normalized_model_id})
         filename = url.split("/").pop()
         if not filename:
             filename = normalized_model_id + ".tar.zst"
@@ -38,17 +56,31 @@ def download_model(model_url=None, model_id=None):
         if exists:
             storage.download_file(filename)
             # os.mkdir(id)
-            Path(id).mkdir(parents=True, exist_ok=False)
+            # Path(id).mkdir(parents=True, exist_ok=False)
+            os.mkdir(normalized_model_id)
             subprocess.run(
-                ["tar", "--use-compress-program=unzstd", "-C", id, "-xvf", filename],
+                [
+                    "tar",
+                    "--use-compress-program=unzstd",
+                    "-C",
+                    normalized_model_id,
+                    "-xvf",
+                    filename,
+                ],
                 check=True,
             )
             subprocess.run(["ls", "-l"])
         else:
             print("Does not exist, let's try find it on huggingface")
-            download_model(model_id=model_id)
-            model = loadModel(model_id, True)
-            dir = "models--" + model_id.replace("/", "--") + "--dda"
+            print("precision = ", {"model_revision": model_revision})
+            # It would be quicker to just call model.to("cuda") afterwards, but
+            # this conveniently logs all the timings (and doesn't happen often)
+            print("download")
+            model = loadModel(model_id, False, precision=model_revision)  # download
+            print("load")
+            model = loadModel(model_id, True, precision=model_revision)  # load
+            # dir = "models--" + model_id.replace("/", "--") + "--dda"
+            dir = normalized_model_id
             model.save_pretrained(dir, safe_serialization=True)
 
             # This is all duped from train_dreambooth, need to refactor TODO XXX
@@ -67,7 +99,10 @@ def download_model(model_url=None, model_id=None):
             send("upload", "done")
             print(upload_result)
             os.remove(filename)
-            shutil.rmtree(dir)
+
+            # leave model dir for future loads... make configurable?
+            # shutil.rmtree(dir)
+
             # TODO, swap directories, inside HF's cache structure.
 
         return
@@ -76,9 +111,9 @@ def download_model(model_url=None, model_id=None):
     # For local dev & preview deploys, download all the models (terrible for serverless deploys)
     if MODEL_ID == "ALL":
         for MODEL_I in MODEL_IDS:
-            loadModel(MODEL_I, False)
+            loadModel(MODEL_I, False, precision=model_revision)
     else:
-        loadModel(MODEL_ID, False)
+        loadModel(normalized_model_id, False, precision=model_revision)
 
     # if USE_DREAMBOOTH:
     # Actually we can re-use these from the above loaded model
diff --git a/loadModel.py b/loadModel.py
@@ -2,7 +2,7 @@
 import os
 from diffusers import pipelines as _pipelines, StableDiffusionPipeline
 from getScheduler import getScheduler, DEFAULT_SCHEDULER
-from precision import revision, torch_dtype
+from precision import revision_from_precision, torch_dtype_from_precision
 import time
 
 HF_AUTH_TOKEN = os.getenv("HF_AUTH_TOKEN")
@@ -21,8 +21,16 @@
 ]
 
 
-def loadModel(model_id: str, load=True):
-    print(("Loading" if load else "Downloading") + " model: " + model_id)
+def loadModel(model_id: str, load=True, precision=None):
+    print("loadModel", {"model_id": model_id, "load": load, "precision": precision})
+    revision = revision_from_precision(precision)
+    torch_dtype = torch_dtype_from_precision(precision)
+    print(
+        ("Loading" if load else "Downloading")
+        + " model: "
+        + model_id
+        + (f" ({revision})" if revision else "")
+    )
 
     pipeline = (
         StableDiffusionPipeline if PIPELINE == "ALL" else getattr(_pipelines, PIPELINE)
@@ -51,4 +59,4 @@ def loadModel(model_id: str, load=True):
     else:
         print(f"Downloaded in {from_pretrained} ms")
 
-    return model.to("cuda") if load else None
+    return model if load else None
diff --git a/precision.py b/precision.py
@@ -5,3 +5,13 @@
 
 revision = None if PRECISION == "" else PRECISION
 torch_dtype = None if PRECISION == "" else torch.float16
+
+
+def revision_from_precision(precision=PRECISION):
+    return precision or None  # any falsy precision (None, "") becomes None
+
+
+def torch_dtype_from_precision(precision=PRECISION):
+    """Return torch.float16 for "fp16"; every other value gives None."""
+    # Only the exact string "fp16" is mapped to a dtype.
+    return torch.float16 if precision == "fp16" else None