feat: add LocalAI support
Signed-off-by: Adrian Cole <[email protected]>
codefromthecrypt committed Sep 23, 2024
1 parent 5b34bc5 commit c87dfb0
Showing 10 changed files with 247 additions and 13 deletions.
73 changes: 69 additions & 4 deletions .github/workflows/ci.yaml
@@ -35,15 +35,15 @@ jobs:
- name: Run tests
run: uv run pytest tests -m 'not integration'

# This runs integration tests of the OpenAI API, using Ollama to host models.
# This integration tests the OpenAI API, using Ollama to host models.
# This lets us test PRs from forks which can't access secrets like API keys.
ollama:
runs-on: ubuntu-latest

strategy:
matrix:
python-version:
# Only test the lastest python version.
# Only test the latest python version.
- "3.12"
ollama-model:
# For quicker CI, use a smaller, tool-capable model than the default.
@@ -70,7 +70,7 @@ jobs:
nohup ollama serve > ollama.log 2>&1 &
# Block using the ready endpoint
time curl --retry 5 --retry-connrefused --retry-delay 1 -sf http://localhost:11434
time curl --retry 5 --retry-connrefused --retry-delay 1 -sf http://localhost:11434 || cat ollama.log
# Tests use OpenAI which does not have a mechanism to pull models. Run a
# simple prompt to (pull and) test the model first.
@@ -80,6 +80,71 @@ jobs:
OLLAMA_MODEL: ${{ matrix.ollama-model }}

- name: Run Ollama tests
run: uv run pytest tests -m integration -k ollama
run: uv run pytest tests -m integration -k ollama || cat ollama.log
env:
OLLAMA_MODEL: ${{ matrix.ollama-model }}

# This integration tests the OpenAI API, using LocalAI to host models.
# This lets us test PRs from forks which can't access secrets like API keys.
localai:
runs-on: ubuntu-latest

strategy:
matrix:
python-version:
# Only test the latest python version.
- "3.12"
localai-model:
# TODO: This is the default model, as we haven't yet found a
# small model that passes tests when run with LocalAI. For example,
# "qwen2.5-0.5b-instruct" fails or hangs.
- "mistral-nemo-instruct-2407"

steps:
- uses: actions/checkout@v4

- name: Install UV
run: curl -LsSf https://astral.sh/uv/install.sh | sh

- name: Source Cargo Environment
run: source $HOME/.cargo/env

- name: Set up Python
run: uv python install ${{ matrix.python-version }}

- name: Download LocalAI
uses: robinraju/[email protected]
with:
repository: mudler/LocalAI
latest: true
# Note the LocalAI Linux binary is >1.2GB, so this step may take a while.
fileName: 'local-ai-Linux-x86_64'

- name: Install LocalAI
run: |
mv local-ai-Linux-x86_64 /usr/local/bin/local-ai
chmod +x /usr/local/bin/local-ai
- name: Start LocalAI
run: |
# Run in the background, in a way that survives into the next step
nohup local-ai run > localai.log 2>&1 &
# Note: we don't `local-ai run` with the `LOCALAI_MODELS` env var
# because it would introduce a race: the check below would pass
# before the model is downloaded.
# Block using the ready endpoint
time curl --retry 5 --retry-connrefused --retry-delay 1 -sf http://localhost:8080/readyz || cat localai.log
# Tests use OpenAI which does not have a mechanism to install models.
# This blocks until the model is installed to prevent failures.
- name: Install LocalAI model
run: local-ai models install $LOCALAI_MODEL || cat localai.log
env:
LOCALAI_MODEL: ${{ matrix.localai-model }}

- name: Run LocalAI tests
run: uv run pytest tests -m integration -k localai || cat localai.log
env:
LOCALAI_MODEL: ${{ matrix.localai-model }}
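
For local debugging, the steps above can be reproduced outside CI. A minimal sketch, assuming `local-ai` is on your `PATH` and serving on its default port 8080:

```bash
# Start LocalAI in the background and block until its ready endpoint responds
nohup local-ai run > localai.log 2>&1 &
curl --retry 5 --retry-connrefused --retry-delay 1 -sf http://localhost:8080/readyz || cat localai.log

# Install the model the tests default to, then run the LocalAI integration tests
local-ai models install mistral-nemo-instruct-2407
LOCALAI_MODEL=mistral-nemo-instruct-2407 uv run pytest tests -m integration -k localai
```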
3 changes: 3 additions & 0 deletions .gitignore
@@ -144,3 +144,6 @@ uv.lock

# PyCharm
.idea/

# LocalAI
models/
26 changes: 25 additions & 1 deletion CONTRIBUTING.md
@@ -34,11 +34,32 @@ uv run pytest tests -m integration
# or `just integration`
```

### Integration testing with LocalAI

To run integration tests against LocalAI, you need the model that the tests expect to be available locally.

First, run `local-ai` and install the models you want to test.
```bash
local-ai run
# Then in another terminal, install the model
LOCALAI_MODEL=$(uv run python -c "from src.exchange.providers.localai import LOCALAI_MODEL; print(LOCALAI_MODEL)")
local-ai models install $LOCALAI_MODEL
```

Finally, run LocalAI integration tests.
```bash
uv run pytest tests -m integration -k localai
# or `just integration -k localai`
```

Note: The `LOCALAI_MODEL` environment variable controls which model the tests use. To run with a
different model, set it before invoking the tests.
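
For example, a minimal sketch of overriding the model (the model name is a placeholder for one already installed in your LocalAI instance):
```bash
# Placeholder model name; substitute a model installed in your LocalAI instance
LOCALAI_MODEL=your-model-name uv run pytest tests -m integration -k localai
```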

### Integration testing with Ollama

To run integration tests against Ollama, you need the model that the tests expect to be available locally.

First, run ollama and pull the models you want to test.
First, run `ollama` and pull the models you want to test.
```bash
ollama serve
# Then in another terminal, pull the model
@@ -52,6 +73,9 @@ uv run pytest tests -m integration -k ollama
# or `just integration -k ollama`
```

Note: The `OLLAMA_MODEL` environment variable controls which model the tests use. To run with a
different model, set it before invoking the tests.
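
For example, a minimal sketch of overriding the model (the model name is a placeholder for one already pulled into your local Ollama):
```bash
# Placeholder model name; substitute a tool-capable model pulled into Ollama
OLLAMA_MODEL=your-model-name uv run pytest tests -m integration -k ollama
```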

## Pull Requests

When opening a pull request, please ensure that your PR title adheres to the [Conventional Commits specification](https://www.conventionalcommits.org/).
1 change: 1 addition & 0 deletions pyproject.toml
@@ -32,6 +32,7 @@ databricks = "exchange.providers.databricks:DatabricksProvider"
anthropic = "exchange.providers.anthropic:AnthropicProvider"
bedrock = "exchange.providers.bedrock:BedrockProvider"
ollama = "exchange.providers.ollama:OllamaProvider"
localai = "exchange.providers.localai:LocalAIProvider"

[project.entry-points."exchange.moderator"]
passive = "exchange.moderators.passive:PassiveModerator"
1 change: 1 addition & 0 deletions src/exchange/providers/__init__.py
@@ -7,6 +7,7 @@
from exchange.providers.openai import OpenAiProvider # noqa
from exchange.providers.ollama import OllamaProvider # noqa
from exchange.providers.azure import AzureProvider # noqa
from exchange.providers.localai import LocalAIProvider # noqa

from exchange.utils import load_plugins

43 changes: 43 additions & 0 deletions src/exchange/providers/localai.py
@@ -0,0 +1,43 @@
import os
from typing import Type

import httpx

from exchange.providers.openai import OpenAiProvider

LOCALAI_HOST = "http://localhost:8080/"
LOCALAI_MODEL = "mistral-nemo-instruct-2407"


class LocalAIProvider(OpenAiProvider):
"""Provides chat completions for models hosted by LocalAI"""

__doc__ += f"""
Here's an example profile configuration to try:
localai:
provider: localai
processor: {LOCALAI_HOST}
accelerator: {LOCALAI_MODEL}
moderator: passive
toolkits:
- name: developer
requires: {{}}
"""

def __init__(self, client: httpx.Client) -> None:
print("PLEASE NOTE: the localai provider is experimental, use with care")
super().__init__(client)

@classmethod
def from_env(cls: Type["LocalAIProvider"]) -> "LocalAIProvider":
url = os.environ.get("LOCALAI_HOST", LOCALAI_HOST)
client = httpx.Client(
base_url=url,
timeout=httpx.Timeout(60 * 10),
)
# from_env is expected to fail if provider is not available
# so we run a quick test that the endpoint is running
client.get("readyz")
return cls(client)
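
Because `from_env` reads `LOCALAI_HOST`, the provider can be pointed at a non-default endpoint when running the tests. A sketch (the hostname is a hypothetical example; the trailing slash matches the default `LOCALAI_HOST`):
```bash
# Hypothetical remote LocalAI endpoint
LOCALAI_HOST=http://my-localai-box:8080/ uv run pytest tests -m integration -k localai
```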
68 changes: 68 additions & 0 deletions tests/providers/openai/cassettes/test_localai_completion.yaml
@@ -0,0 +1,68 @@
interactions:
- request:
body: ''
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
host:
- localhost:8080
user-agent:
- python-httpx/0.27.2
method: GET
uri: http://localhost:8080/readyz
response:
body:
string: OK
headers:
Content-Length:
- '2'
Content-Type:
- text/plain; charset=utf-8
Date:
- Mon, 23 Sep 2024 03:40:21 GMT
Set-Cookie: test_set_cookie
openai-organization: test_openai_org_key
status:
code: 200
message: OK
- request:
body: '{"messages": [{"role": "system", "content": "You are a helpful assistant
who is succinct."}, {"role": "user", "content": "Hello"}], "model": "mistral-nemo-instruct-2407"}'
headers:
accept:
- '*/*'
accept-encoding:
- gzip, deflate
connection:
- keep-alive
content-length:
- '170'
content-type:
- application/json
host:
- localhost:8080
user-agent:
- python-httpx/0.27.2
method: POST
uri: http://localhost:8080/v1/chat/completions
response:
body:
string: '{"created":1727062822,"object":"chat.completion","id":"e3f2f8d6-ab2b-4a30-bcf0-b2c1cfd88912","model":"mistral-nemo-instruct-2407","choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"Hi!
How can I help you today?"}}],"usage":{"prompt_tokens":15,"completion_tokens":10,"total_tokens":25}}'
headers:
Content-Length:
- '320'
Content-Type:
- application/json
Date:
- Mon, 23 Sep 2024 03:40:22 GMT
Set-Cookie: test_set_cookie
openai-organization: test_openai_org_key
status:
code: 200
message: OK
version: 1
33 changes: 33 additions & 0 deletions tests/providers/openai/test_localai.py
@@ -0,0 +1,33 @@
from typing import Tuple

import os
import pytest

from exchange import Text
from exchange.message import Message
from exchange.providers.base import Usage
from exchange.providers.localai import LocalAIProvider, LOCALAI_MODEL


@pytest.mark.vcr()
def test_localai_completion(default_openai_api_key):
reply_message, reply_usage = localai_complete()

assert reply_message.content == [Text(text="Hi! How can I help you today?")]
assert reply_usage.total_tokens == 25


@pytest.mark.integration
def test_localai_completion_integration():
reply = localai_complete()

assert reply[0].content is not None
print("Completion content from OpenAI:", reply[0].content)


def localai_complete() -> Tuple[Message, Usage]:
provider = LocalAIProvider.from_env()
model = os.getenv("LOCALAI_MODEL", LOCALAI_MODEL)
system = "You are a helpful assistant who is succinct."
messages = [Message.user("Hello")]
return provider.complete(model=model, system=system, messages=messages, tools=None)
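
The `vcr`-marked test replays the recorded cassette above, so it runs without a LocalAI server, while the `integration`-marked test needs a live endpoint. A sketch of invoking each (paths and markers as used elsewhere in this change):
```bash
# Replay the recorded cassette; no LocalAI server required
uv run pytest tests/providers/openai/test_localai.py -m 'not integration'

# Exercise a live LocalAI server (default http://localhost:8080)
uv run pytest tests -m integration -k localai
```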
10 changes: 2 additions & 8 deletions tests/providers/openai/test_openai.py
@@ -1,22 +1,16 @@
from typing import Tuple

import os
import pytest

from exchange import Text
from exchange.message import Message
from exchange.providers.base import Usage
from exchange.providers.openai import OpenAiProvider
from .conftest import OPENAI_MODEL, OPENAI_API_KEY
from .conftest import OPENAI_MODEL


@pytest.mark.vcr()
def test_openai_completion(monkeypatch):
# When running VCR tests the first time, it needs OPENAI_API_KEY to call
# the real service. Afterward, it is not needed as VCR mocks the service.
if "OPENAI_API_KEY" not in os.environ:
monkeypatch.setenv("OPENAI_API_KEY", OPENAI_API_KEY)

def test_openai_completion(default_openai_api_key):
reply_message, reply_usage = openai_complete()

assert reply_message.content == [Text(text="Hello! How can I assist you today?")]
2 changes: 2 additions & 0 deletions tests/test_integration.py
@@ -4,12 +4,14 @@
from exchange.message import Message
from exchange.moderators import ContextTruncate
from exchange.providers import get_provider
from exchange.providers.localai import LOCALAI_MODEL
from exchange.providers.ollama import OLLAMA_MODEL
from exchange.tool import Tool

too_long_chars = "x" * (2**20 + 1)

cases = [
(get_provider("localai"), os.getenv("LOCALAI_MODEL", LOCALAI_MODEL)),
(get_provider("ollama"), os.getenv("OLLAMA_MODEL", OLLAMA_MODEL)),
(get_provider("openai"), "gpt-4o-mini"),
(get_provider("databricks"), "databricks-meta-llama-3-70b-instruct"),
