From a88b8827202206e3c376ec2b9908fb69f9dcd155 Mon Sep 17 00:00:00 2001 From: Siddharth Senthilkumar Date: Tue, 17 Mar 2026 20:41:48 +0530 Subject: [PATCH 1/2] update project --- app/.env | 16 ++ app/.gitignore | 21 ++ app/README.md | 36 +++ app/backend/main.py | 258 +++++++++++++++++++++ app/backend/main.py.bak | 218 +++++++++++++++++ app/requirements.txt | 14 ++ examples/rag_agentic_demo/README.md | 49 ++++ examples/rag_agentic_demo/app.py | 156 +++++++++++++ examples/rag_agentic_demo/requirements.txt | 1 + test.ipynb | 80 +++++++ 10 files changed, 849 insertions(+) create mode 100644 app/.env create mode 100644 app/.gitignore create mode 100644 app/README.md create mode 100644 app/backend/main.py create mode 100644 app/backend/main.py.bak create mode 100644 app/requirements.txt create mode 100644 examples/rag_agentic_demo/README.md create mode 100644 examples/rag_agentic_demo/app.py create mode 100644 examples/rag_agentic_demo/requirements.txt create mode 100644 test.ipynb diff --git a/app/.env b/app/.env new file mode 100644 index 000000000..4376506b1 --- /dev/null +++ b/app/.env @@ -0,0 +1,16 @@ +# API keys +GEMINI_API_KEY="REDACTED_ROTATE_THIS_KEY" +GEMINI_MODEL="gemini-2.5-flash" + +# Vector DB +ENDEE_BASE_URL=http://localhost:8080/api/v1 +ENDEE_AUTH_TOKEN= +ENDEE_INDEX_NAME=rag_app + +# RAG params +CHUNK_SIZE=800 +CHUNK_OVERLAP=120 +TOP_K=6 + +# Frontend +BACKEND_URL=http://localhost:8000 diff --git a/app/.gitignore b/app/.gitignore new file mode 100644 index 000000000..449021385 --- /dev/null +++ b/app/.gitignore @@ -0,0 +1,21 @@ +cat < .gitignore +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd + +# Virtual environment +.venv/ +venv/ + +# IDE +.vscode/ +.idea/ + +# OS +.DS_Store + +# Logs +*.log +EOL \ No newline at end of file diff --git a/app/README.md b/app/README.md new file mode 100644 index 000000000..ae0930b41 --- /dev/null +++ b/app/README.md @@ -0,0 +1,36 @@ +# Agentic RAG App (FastAPI + Streamlit + Endee) + +## What it does +- Upload 
PDF/DOCX/TXT → chunk with LangChain → embed via `sentence-transformers/all-MiniLM-L6-v2` → store in Endee. +- Chat endpoint: embed query, search Endee (top_k), rerank with `cross-encoder/ms-marco-MiniLM-L-6-v2`, build context, answer with Gemini model (default `gemini-2.5-flash`). +- Toggle RAG on/off in the UI; responses include source metadata. + +## Layout +- `app/backend/main.py` – FastAPI service (`/upload`, `/chat`, `/health`). +- `app/frontend/streamlit_app.py` – Streamlit UI client. +- `app/requirements.txt` – all deps for both frontend + backend. +- `app/.env` – fill with your keys; defaults point to local Endee + backend. + +## Setup +```bash +cd app +python -m venv .venv +. .venv/Scripts/activate # or source .venv/bin/activate +pip install -r requirements.txt +``` +Edit `.env` with your `GEMINI_API_KEY` (and `ENDEE_AUTH_TOKEN` if your server requires it). + +## Run backend +```bash +uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload +``` + +## Run frontend +```bash +streamlit run frontend/streamlit_app.py +``` + +## Notes +- Endee index name is `rag_app` by default; change via `ENDEE_INDEX_NAME`. +- Embedding dim is fixed at 384 to match `all-MiniLM-L6-v2` and the Endee index is created automatically if missing. +- Reranker uses a cross-encoder; if you want faster responses, you can disable reranking by returning `results` directly in `retrieve`. 
diff --git a/app/backend/main.py b/app/backend/main.py new file mode 100644 index 000000000..81b478143 --- /dev/null +++ b/app/backend/main.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python +import os +from typing import List, Optional + +from dotenv import load_dotenv +from fastapi import FastAPI, File, UploadFile, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_google_genai import ChatGoogleGenerativeAI +from sentence_transformers import SentenceTransformer, CrossEncoder +from starlette.responses import JSONResponse + +from endee import Endee, Precision +from endee.schema import VectorItem as EndeeVectorItem + +# Monkey-patch Endee VectorItem to behave like a dict for `.get()` calls inside the SDK +if not hasattr(EndeeVectorItem, "get"): + EndeeVectorItem.get = lambda self, key, default=None: getattr(self, key, default) + +load_dotenv() + +# Environment +ENDEE_BASE_URL = os.getenv("ENDEE_BASE_URL", "http://localhost:8080/api/v1") +ENDEE_AUTH_TOKEN = os.getenv("ENDEE_AUTH_TOKEN", "") +ENDEE_INDEX_NAME = os.getenv("ENDEE_INDEX_NAME", "rag_app") + +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") +GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") + +CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "800")) +CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "120")) +TOP_K = int(os.getenv("TOP_K", "6")) + +app = FastAPI(title="RAG Agentic Backend", version="0.1.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +def get_endee_client() -> Endee: + client = Endee(ENDEE_AUTH_TOKEN) if ENDEE_AUTH_TOKEN else Endee() + client.set_base_url(ENDEE_BASE_URL) + return client + + +def ensure_index(client: Endee, dim: int) -> None: + try: + client.get_index(name=ENDEE_INDEX_NAME) + return + except Exception: + pass + client.create_index( + name=ENDEE_INDEX_NAME, + dimension=dim, + 
space_type="cosine", + precision=Precision.INT8, + ) + + +embedder: Optional[SentenceTransformer] = None +reranker: Optional[CrossEncoder] = None +llm: Optional[ChatGoogleGenerativeAI] = None +endee_client: Optional[Endee] = None +endee_index = None + + +def bootstrap(): + global embedder, reranker, llm, endee_client, endee_index + if GEMINI_API_KEY is None: + raise RuntimeError("GEMINI_API_KEY is required") + + embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") + llm = ChatGoogleGenerativeAI( + model=GEMINI_MODEL, + api_key=GEMINI_API_KEY, + temperature=0.2, + ) + endee_client = get_endee_client() + ensure_index(endee_client, embedder.get_sentence_embedding_dimension()) + endee_index = endee_client.get_index(name=ENDEE_INDEX_NAME) + + +bootstrap() + + +def read_file(file: UploadFile) -> str: + suffix = file.filename.split(".")[-1].lower() + content = file.file.read() + if suffix == "pdf": + try: + from pypdf import PdfReader + except Exception as exc: + raise HTTPException(status_code=500, detail=f"pypdf missing: {exc}") + file.file.seek(0) + reader = PdfReader(file.file) + text = "\n".join([p.extract_text() or "" for p in reader.pages]) + elif suffix in {"txt", "md"}: + text = content.decode("utf-8", errors="ignore") + elif suffix in {"docx"}: + try: + import docx2txt + except Exception as exc: + raise HTTPException(status_code=500, detail=f"docx2txt missing: {exc}") + temp_path = f"/tmp/{file.filename}" + with open(temp_path, "wb") as f: + f.write(content) + text = docx2txt.process(temp_path) or "" + os.remove(temp_path) + else: + raise HTTPException(status_code=400, detail="Unsupported file type") + return text + + +def chunk_text(text: str) -> List[str]: + splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + separators=["\n\n", "\n", " ", ""], + ) + return splitter.split_text(text) + + +def upsert_chunks(chunks: List[str], 
source: str): + embeddings = embedder.encode(chunks, convert_to_numpy=False) + payload = [] + for idx, (chunk, vector) in enumerate(zip(chunks, embeddings)): + payload.append( + { + "id": f"{source}::chunk-{idx}", + "vector": vector.tolist(), + "meta": {"source": source, "text": chunk}, + } + ) + endee_index.upsert(payload) + + +def _normalize_result(item) -> dict: + if hasattr(item, "dict"): + base = item.dict() + similarity = getattr(item, "similarity", None) + distance = getattr(item, "distance", None) + else: + base = dict(item) + similarity = base.get("similarity") + distance = base.get("distance") + return { + "id": base.get("id"), + "meta": base.get("meta") or {}, + "similarity": similarity, + "distance": distance, + "raw": item, + } + + +def retrieve(query: str): + query_vec = embedder.encode([query])[0].tolist() + results = endee_index.query(vector=query_vec, top_k=TOP_K, include_vectors=False) + if not results: + return [] + docs = [_normalize_result(r) for r in results] + rerank_inputs = [(query, doc["meta"].get("text", "")) for doc in docs] + scores = reranker.predict(rerank_inputs) + reranked = sorted( + [ + {**doc, "rerank_score": float(score)} + for doc, score in zip(docs, scores) + ], + key=lambda x: x["rerank_score"], + reverse=True, + ) + return reranked + + +def build_context(docs: List[dict]) -> str: + parts = [] + for doc in docs: + meta = doc.get("meta", {}) + parts.append(f"[{meta.get('source')}] {meta.get('text', '')}") + return "\n\n".join(parts) + + +def build_history(history: List[dict]) -> str: + lines = [] + for turn in history[-5:]: + u = turn.get("user") or "" + a = turn.get("answer") or "" + lines.append(f"User: {u}\nAssistant: {a}") + return "\n\n".join(lines) + + +def answer(question: str, context_docs: List[dict], history: List[dict]) -> dict: + if context_docs: + context_text = build_context(context_docs) + prompt = ( + "Use ONLY the provided context (and brief history if present). 
" + "If something is missing, say you don't have it.\n" + f"Context:\n{context_text}\n\n" + ) + else: + prompt = "Answer the user. If you don't know, say so.\n" + + history_text = build_history(history) + if history_text: + prompt += f"Recent history (for coherence, not new facts):\n{history_text}\n\n" + + prompt += f"Question: {question}" + + resp = llm.invoke(prompt) + return { + "answer": resp.content, + "sources": [ + { + "id": doc.get("id"), + "source": doc.get("meta", {}).get("source"), + "score": doc.get("rerank_score", doc.get("similarity")), + "preview": doc.get("meta", {}).get("text", "")[:200], + } + for doc in context_docs + ], + "mode": "rag" if context_docs else "direct", + } + + +@app.get("/health") +def health(): + return {"status": "ok"} + + +@app.post("/upload") +async def upload(file: UploadFile = File(...)): + try: + text = read_file(file) + chunks = chunk_text(text) + upsert_chunks(chunks, source=file.filename) + return {"message": f"Indexed {len(chunks)} chunks from {file.filename}"} + except HTTPException as e: + raise e + except Exception as exc: + raise HTTPException(status_code=500, detail=str(exc)) + + +@app.post("/chat") +async def chat(payload: dict): + question = payload.get("message") or "" + use_rag = payload.get("use_rag", True) + history = payload.get("history") or [] + if not question: + raise HTTPException(status_code=400, detail="message is required") + + docs = retrieve(question) if use_rag else [] + result = answer(question, docs, history) + return JSONResponse(result) diff --git a/app/backend/main.py.bak b/app/backend/main.py.bak new file mode 100644 index 000000000..27f7df4fa --- /dev/null +++ b/app/backend/main.py.bak @@ -0,0 +1,218 @@ +#!/usr/bin/env python +import os +from typing import List, Optional + +from dotenv import load_dotenv +from fastapi import FastAPI, File, UploadFile, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from langchain_text_splitters import RecursiveCharacterTextSplitter +from 
langchain_google_genai import ChatGoogleGenerativeAI +from sentence_transformers import SentenceTransformer, CrossEncoder +from starlette.responses import JSONResponse +from langchain_text_splitters import RecursiveCharacterTextSplitter + +from endee import Endee, Precision + +load_dotenv() + +# Environment +ENDEE_BASE_URL = os.getenv("ENDEE_BASE_URL", "http://localhost:8080/api/v1") +ENDEE_AUTH_TOKEN = os.getenv("ENDEE_AUTH_TOKEN", "") +ENDEE_INDEX_NAME = os.getenv("ENDEE_INDEX_NAME", "rag_app") + +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") +GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") + +CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "800")) +CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "120")) +TOP_K = int(os.getenv("TOP_K", "6")) + +app = FastAPI(title="RAG Agentic Backend", version="0.1.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +def get_endee_client() -> Endee: + client = Endee(ENDEE_AUTH_TOKEN) if ENDEE_AUTH_TOKEN else Endee() + client.set_base_url(ENDEE_BASE_URL) + return client + + +def ensure_index(client: Endee, dim: int) -> None: + try: + client.get_index(name=ENDEE_INDEX_NAME) + return + except Exception: + pass + client.create_index( + name=ENDEE_INDEX_NAME, + dimension=dim, + space_type="cosine", + precision=Precision.INT8, + ) + + +embedder: Optional[SentenceTransformer] = None +reranker: Optional[CrossEncoder] = None +llm: Optional[ChatGoogleGenerativeAI] = None +endee_client: Optional[Endee] = None +endee_index = None + + +def bootstrap(): + global embedder, reranker, llm, endee_client, endee_index + if GEMINI_API_KEY is None: + raise RuntimeError("GEMINI_API_KEY is required") + + embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") + llm = ChatGoogleGenerativeAI( + model=GEMINI_MODEL, + api_key=GEMINI_API_KEY, + temperature=0.2, + ) + 
endee_client = get_endee_client() + ensure_index(endee_client, embedder.get_sentence_embedding_dimension()) + endee_index = endee_client.get_index(name=ENDEE_INDEX_NAME) + + +bootstrap() + + +def read_file(file: UploadFile) -> str: + suffix = file.filename.split(".")[-1].lower() + content = file.file.read() + if suffix == "pdf": + try: + from pypdf import PdfReader + except Exception as exc: + raise HTTPException(status_code=500, detail=f"pypdf missing: {exc}") + file.file.seek(0) + reader = PdfReader(file.file) + text = "\n".join([p.extract_text() or "" for p in reader.pages]) + elif suffix in {"txt", "md"}: + text = content.decode("utf-8", errors="ignore") + elif suffix in {"docx"}: + try: + import docx2txt + except Exception as exc: + raise HTTPException(status_code=500, detail=f"docx2txt missing: {exc}") + temp_path = f"/tmp/{file.filename}" + with open(temp_path, "wb") as f: + f.write(content) + text = docx2txt.process(temp_path) or "" + os.remove(temp_path) + else: + raise HTTPException(status_code=400, detail="Unsupported file type") + return text + + +def chunk_text(text: str) -> List[str]: + splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + separators=["\n\n", "\n", " ", ""], + ) + return splitter.split_text(text) + + +def upsert_chunks(chunks: List[str], source: str): + embeddings = embedder.encode(chunks, convert_to_numpy=False) + payload = [] + for idx, (chunk, vector) in enumerate(zip(chunks, embeddings)): + payload.append( + { + "id": f"{source}::chunk-{idx}", + "vector": vector.tolist(), + "meta": {"source": source, "text": chunk}, + } + ) + endee_index.upsert(payload) + + +def retrieve(query: str): + query_vec = embedder.encode([query])[0].tolist() + results = endee_index.query(vector=query_vec, top_k=TOP_K, include_vectors=False) + if not results: + return [] + rerank_inputs = [(query, item.get("meta", {}).get("text", "")) for item in results] + scores = reranker.predict(rerank_inputs) + reranked = 
sorted( + [ + {**item, "rerank_score": float(score)} + for item, score in zip(results, scores) + ], + key=lambda x: x["rerank_score"], + reverse=True, + ) + return reranked + + +def build_context(docs: List[dict]) -> str: + parts = [] + for doc in docs: + meta = doc.get("meta", {}) + parts.append(f"[{meta.get('source')}] {meta.get('text', '')}") + return "\n\n".join(parts) + + +def answer(question: str, context_docs: List[dict]) -> dict: + if context_docs: + context_text = build_context(context_docs) + prompt = ( + "Use the context to answer. If information is missing, say you don't have it.\n" + f"Context:\n{context_text}\n\nQuestion: {question}" + ) + else: + prompt = question + resp = llm.invoke(prompt) + return { + "answer": resp.content, + "sources": [ + { + "id": doc["id"], + "source": doc.get("meta", {}).get("source"), + "score": doc.get("rerank_score", doc.get("similarity")), + "preview": doc.get("meta", {}).get("text", "")[:200], + } + for doc in context_docs + ], + "mode": "rag" if context_docs else "direct", + } + + +@app.get("/health") +def health(): + return {"status": "ok"} + + +@app.post("/upload") +async def upload(file: UploadFile = File(...)): + try: + text = read_file(file) + chunks = chunk_text(text) + upsert_chunks(chunks, source=file.filename) + return {"message": f"Indexed {len(chunks)} chunks from {file.filename}"} + except HTTPException as e: + raise e + print(e) + except Exception as exc: + raise HTTPException(status_code=500, detail=str(exc)) + + +@app.post("/chat") +async def chat(payload: dict): + question = payload.get("message") or "" + use_rag = payload.get("use_rag", True) + if not question: + raise HTTPException(status_code=400, detail="message is required") + + docs = retrieve(question) if use_rag else [] + result = answer(question, docs) + return JSONResponse(result) diff --git a/app/requirements.txt b/app/requirements.txt new file mode 100644 index 000000000..8f5fcac58 --- /dev/null +++ b/app/requirements.txt @@ -0,0 +1,14 @@ 
+fastapi>=0.110.0 +uvicorn[standard]>=0.27.1 +python-dotenv>=1.0.1 +langchain>=0.1.16 +langchain-community>=0.0.34 +langchain-google-genai>=0.0.12 +langchain-text-splitters>=0.0.1 +sentence-transformers>=2.5.1 +pypdf>=4.2.0 +docx2txt>=0.8 +endee>=0.1.0 +requests>=2.31.0 +streamlit>=1.32.0 +python-multipart>=0.0.9 diff --git a/examples/rag_agentic_demo/README.md b/examples/rag_agentic_demo/README.md new file mode 100644 index 000000000..574b3d157 --- /dev/null +++ b/examples/rag_agentic_demo/README.md @@ -0,0 +1,49 @@ +# RAG + Agentic Demo with Endee + +This example shows a minimal retrieval-augmented generation loop that uses the Endee vector database for retrieval and OpenAI for embeddings + answers. It includes a tiny agentic step: each user query is first rewritten by the LLM for better retrieval, then the rewritten query is embedded and searched in Endee. + +## Files +- `app.py` – main script (ingest sample docs, interactive Q&A loop) +- `requirements.txt` – Python dependencies +- `data/sample_docs/*.txt` – tiny corpus to load by default + +## Prerequisites +- Python 3.10+ +- Running Endee server at `http://localhost:8080` (default from `run.sh`) +- OpenAI API key (set `OPENAI_API_KEY`) + +## Setup +```bash +cd examples/rag_agentic_demo +python -m venv .venv +. .venv/Scripts/activate # Windows +# or: source .venv/bin/activate +pip install -r requirements.txt +``` + +## Environment +Create `.env` in this folder: +``` +OPENAI_API_KEY=sk-... +ENDEE_BASE_URL=http://localhost:8080/api/v1 # optional +ENDEE_AUTH_TOKEN= # optional if you enabled auth +``` + +## Run +```bash +python app.py --index rag_demo +``` +- On first run, the script ingests the sample docs. +- Ask questions in the prompt; type `exit` to quit. +- Use `--skip-ingest` if the index already has your data. + +## Using your own data +Place text files under `data/sample_docs` (or modify `SAMPLE_DIR` in `app.py`). The script will embed and upsert them on startup. 
+ +## How it works (flow) +1) **Rewrite** the user question via `rewrite_query` (agentic query optimization). +2) **Embed** rewritten query with `text-embedding-3-small`. +3) **Search** Endee (`top_k=4`). +4) **Grounded answer** with `gpt-4o-mini`, constrained to retrieved context. + +You can swap models, add filters, or extend the agent step to call other tools. diff --git a/examples/rag_agentic_demo/app.py b/examples/rag_agentic_demo/app.py new file mode 100644 index 000000000..2c56030dd --- /dev/null +++ b/examples/rag_agentic_demo/app.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +import argparse +import glob +import os +from pathlib import Path +from typing import List, Tuple + +from dotenv import load_dotenv +from openai import OpenAI +from endee import Endee, Precision + +load_dotenv() + +DEFAULT_INDEX = "rag_demo" +EMBED_MODEL = "text-embedding-3-small" +EMBED_DIM = 1536 # dimension for text-embedding-3-small +SAMPLE_DIR = Path(__file__).parent / "data" / "sample_docs" + + +def get_openai_client() -> OpenAI: + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise RuntimeError("OPENAI_API_KEY is required for embeddings and generation") + return OpenAI(api_key=api_key) + + +def get_endee_client() -> Endee: + auth = os.getenv("ENDEE_AUTH_TOKEN", "") + base_url = os.getenv("ENDEE_BASE_URL", "http://localhost:8080/api/v1") + client = Endee(auth) if auth else Endee() + client.set_base_url(base_url) + return client + + +def ensure_index(client: Endee, name: str, dim: int) -> None: + try: + client.get_index(name=name) + return + except Exception: + pass + client.create_index(name=name, dimension=dim, space_type="cosine", precision=Precision.INT8) + + +def load_corpus() -> List[Tuple[str, str]]: + docs = [] + for path in glob.glob(str(SAMPLE_DIR / "*.txt")): + with open(path, "r", encoding="utf-8") as f: + text = f.read().strip() + docs.append((Path(path).stem, text)) + if not docs: + raise RuntimeError(f"No documents found in {SAMPLE_DIR}") + return docs + 
+ +def embed_texts(client: OpenAI, texts: List[str]) -> List[List[float]]: + resp = client.embeddings.create(model=EMBED_MODEL, input=texts) + return [item.embedding for item in resp.data] + + +def upsert_docs(index, ids: List[str], texts: List[str], embeddings: List[List[float]]): + payload = [] + for doc_id, text, emb in zip(ids, texts, embeddings): + payload.append({ + "id": doc_id, + "vector": emb, + "meta": {"source": doc_id, "preview": text[:200]} + }) + index.upsert(payload) + + +def rewrite_query(llm: OpenAI, query: str) -> str: + prompt = ( + "Rewrite the user query to be retrieval-friendly, short, and factual. " + "Preserve intent and key nouns; drop pronouns and chit-chat." + ) + res = llm.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": prompt}, + {"role": "user", "content": query}, + ], + max_tokens=50, + ) + return res.choices[0].message.content.strip() + + +def search(index, embedding: List[float], k: int): + return index.query(vector=embedding, top_k=k, include_vectors=False) + + +def build_context(results) -> str: + chunks = [] + for i, item in enumerate(results, 1): + meta = item.get("meta") or {} + preview = meta.get("preview") or "" + chunks.append(f"[{i}] id={item['id']} score={item.get('similarity', item.get('distance'))}\n{preview}\n") + return "\n".join(chunks) + + +def answer_question(llm: OpenAI, question: str, context: str) -> str: + system = ( + "You are a concise assistant. Answer using only the provided context. " + "If the answer is not in the context, say you cannot find it." 
+ ) + res = llm.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}, + ], + max_tokens=200, + temperature=0.2, + ) + return res.choices[0].message.content.strip() + + +def interactive_loop(index, llm: OpenAI): + print("Type your question (or 'exit' to quit)") + while True: + try: + q = input("ask> ").strip() + except (EOFError, KeyboardInterrupt): + break + if not q or q.lower() in {"exit", "quit"}: + break + rewritten = rewrite_query(llm, q) + embedding = embed_texts(llm, [rewritten])[0] + results = search(index, embedding, k=4) + context = build_context(results) + answer = answer_question(llm, q, context) + print("\n---\n", answer, "\n---\n", sep="") + + +def main(): + parser = argparse.ArgumentParser(description="RAG + agentic demo with Endee") + parser.add_argument("--index", default=DEFAULT_INDEX, help="Index name to use/create") + parser.add_argument("--skip-ingest", action="store_true", help="Skip loading sample corpus") + args = parser.parse_args() + + llm = get_openai_client() + endee_client = get_endee_client() + ensure_index(endee_client, args.index, EMBED_DIM) + index = endee_client.get_index(name=args.index) + + if not args.skip_ingest: + docs = load_corpus() + ids, texts = zip(*docs) + embeddings = embed_texts(llm, list(texts)) + upsert_docs(index, list(ids), list(texts), embeddings) + print(f"Ingested {len(ids)} docs into index '{args.index}'") + + interactive_loop(index, llm) + + +if __name__ == "__main__": + main() diff --git a/examples/rag_agentic_demo/requirements.txt b/examples/rag_agentic_demo/requirements.txt new file mode 100644 index 000000000..7b70ca6fd --- /dev/null +++ b/examples/rag_agentic_demo/requirements.txt @@ -0,0 +1 @@ +openai>=1.12.0\npython-dotenv>=1.0.1\nendee>=0.1.0\n diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 000000000..6b0ecb53d --- /dev/null +++ b/test.ipynb @@ -0,0 +1,80 @@ +{ 
+ "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "6a9d3cb5", + "metadata": {}, + "outputs": [], + "source": [ + "from endee import Endee, Precision\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "40aa2059", + "metadata": {}, + "outputs": [], + "source": [ + "client = Endee()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "01a51046", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Index created successfully'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.create_index(\n", + " name=\"my_index2\",\n", + " dimension=384,\n", + " space_type=\"cosine\",\n", + " precision=Precision.INT8\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225da938", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 7592dc25abf6b1d36560c2fe3baae30ee929bc7a Mon Sep 17 00:00:00 2001 From: CodexSandboxOffline Date: Tue, 17 Mar 2026 23:41:07 +0530 Subject: [PATCH 2/2] Add agentic app updates, sample env, and ignores --- app/{.env => .env.example} | 13 ++-- app/.gitignore | 23 ++---- app/README.md | 139 +++++++++++++++++++++++++++++++------ app/backend/main.py | 80 ++++++++++++++++++--- app/requirements.txt | 1 + 5 files changed, 195 insertions(+), 61 deletions(-) rename app/{.env => .env.example} (51%) diff --git a/app/.env b/app/.env.example similarity index 51% rename from app/.env rename to app/.env.example index 4376506b1..9a8970874 100644 --- a/app/.env +++ 
b/app/.env.example @@ -1,16 +1,11 @@ -# API keys -GEMINI_API_KEY="REDACTED_ROTATE_THIS_KEY" -GEMINI_MODEL="gemini-2.5-flash" - -# Vector DB +# Sample environment (fill with your keys before running) +GEMINI_API_KEY=your_gemini_key +GEMINI_MODEL=gemini-2.5-flash ENDEE_BASE_URL=http://localhost:8080/api/v1 ENDEE_AUTH_TOKEN= ENDEE_INDEX_NAME=rag_app - -# RAG params CHUNK_SIZE=800 CHUNK_OVERLAP=120 TOP_K=6 - -# Frontend +TAVILY_API_KEY=your_tavily_key BACKEND_URL=http://localhost:8000 diff --git a/app/.gitignore b/app/.gitignore index 449021385..abe52e10c 100644 --- a/app/.gitignore +++ b/app/.gitignore @@ -1,21 +1,8 @@ -cat < .gitignore -# Python +.env +.env.local +!.env.example +.venv/ __pycache__/ *.pyc -*.pyo -*.pyd - -# Virtual environment -.venv/ -venv/ - -# IDE -.vscode/ -.idea/ - -# OS -.DS_Store - -# Logs *.log -EOL \ No newline at end of file +.cache/ diff --git a/app/README.md b/app/README.md index ae0930b41..978f612ee 100644 --- a/app/README.md +++ b/app/README.md @@ -1,36 +1,129 @@ -# Agentic RAG App (FastAPI + Streamlit + Endee) +# Agentic RAG Starter (FastAPI + Streamlit + Endee + Gemini + Tavily) -## What it does -- Upload PDF/DOCX/TXT → chunk with LangChain → embed via `sentence-transformers/all-MiniLM-L6-v2` → store in Endee. -- Chat endpoint: embed query, search Endee (top_k), rerank with `cross-encoder/ms-marco-MiniLM-L-6-v2`, build context, answer with Gemini model (default `gemini-2.5-flash`). -- Toggle RAG on/off in the UI; responses include source metadata. +This app lets you chat with three routing paths: +- **Agent** (auto-route): LLM decides whether to use RAG (Endee), Web (Tavily), or Direct (LLM only). +- **RAG**: embed → Endee search → rerank → Gemini answer with sources. +- **Web**: Tavily search → Gemini answer with web citations. +- **Direct**: Gemini only (no retrieval). -## Layout -- `app/backend/main.py` – FastAPI service (`/upload`, `/chat`, `/health`). -- `app/frontend/streamlit_app.py` – Streamlit UI client. 
-- `app/requirements.txt` – all deps for both frontend + backend. -- `app/.env` – fill with your keys; defaults point to local Endee + backend. +## Tech Stack +- Backend: FastAPI (`app/backend/main.py`) +- Frontend: Streamlit (`app/frontend/streamlit_app.py`) +- Vector DB: Endee server at `http://localhost:8080` (cosine, INT8, dim 384) +- Embeddings: `sentence-transformers/all-MiniLM-L6-v2` +- Reranker: `cross-encoder/ms-marco-MiniLM-L-6-v2` +- LLM: Gemini (default `gemini-2.5-flash` via Google AI key) +- Web search: Tavily API + +## Prerequisites +- Python 3.10+ +- Endee server running locally on 8080 (from `run.sh` or docker-compose) +- API keys: + - `GEMINI_API_KEY` (required) + - `TAVILY_API_KEY` (for Web route) ## Setup -```bash -cd app +```powershell +cd C:\VH811\projects\endee\app python -m venv .venv -. .venv/Scripts/activate # or source .venv/bin/activate +. .venv\Scripts\activate pip install -r requirements.txt ``` -Edit `.env` with your `GEMINI_API_KEY` (and `ENDEE_AUTH_TOKEN` if your server requires it). -## Run backend -```bash -uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload +### Environment +Edit `app/.env` (already created) and set: +``` +GEMINI_API_KEY=your_gemini_key +GEMINI_MODEL=gemini-2.5-flash +ENDEE_BASE_URL=http://localhost:8080/api/v1 +ENDEE_AUTH_TOKEN= # blank if Endee auth disabled +ENDEE_INDEX_NAME=rag_app +CHUNK_SIZE=800 +CHUNK_OVERLAP=120 +TOP_K=6 +TAVILY_API_KEY=your_tavily_key +BACKEND_URL=http://localhost:8000 ``` -## Run frontend -```bash -streamlit run frontend/streamlit_app.py +## Run +Backend: +```powershell +cd C:\VH811\projects\endee\app +. .venv\Scripts\activate +python -m uvicorn backend.main:app --host 0.0.0.0 --port 8000 +``` +Frontend: +```powershell +cd C:\VH811\projects\endee\app +. .venv\Scripts\activate +streamlit run frontend\streamlit_app.py ``` +Open Streamlit at http://localhost:8501. 
+ +## How it Works (Pipeline) +1) **Uploads**: pdf/docx/txt/md → text extract (pypdf/docx2txt) → chunk (800/120) → embed (MiniLM) → upsert to Endee with metadata. +2) **Routing** (Agent): quick Gemini prompt decides `rag|web|direct` unless user forces mode via UI radio. +3) **RAG path**: embed query → Endee top_k=6 → rerank top 4 with cross-encoder → build context → Gemini answers; returns sources (chunk previews). +4) **Web path**: Tavily search max 5 results → context → Gemini answers; returns web URLs/titles. +5) **Direct path**: Gemini answers without retrieval. +6) **History**: last 5 turns sent to LLM for coherence (not for facts). + +## Routing Modes in UI +- Agent (auto): LLM router picks best path. +- RAG: force vector search. +- Web: force Tavily search. +- Direct: force plain LLM. + +## Tuning & Performance +- Lower `TOP_K` or rerank cutoff to reduce latency (currently rerank keeps top 4). +- Smaller reranker model (e.g., `cross-encoder/ms-marco-MiniLM-L-2-v2`) for speed. +- Use Direct mode for general chit-chat to avoid retrieval. +- Warm cache by one request after restart to load HF models. + +## Troubleshooting +- `API key not valid / 403 / 429`: check/replace `GEMINI_API_KEY`, quota or billing; switch model if needed. +- `TAVILY_API_KEY not configured`: set the key in `.env` for Web mode. +- Upload errors: ensure file type is pdf/docx/txt/md and Endee server is reachable at `ENDEE_BASE_URL`. + +## File Map +- `app/backend/main.py` — API routes, router, RAG/Web/Direct flows +- `app/frontend/streamlit_app.py` — UI and routing selector +- `app/requirements.txt` — dependencies +- `app/.env` — keys and settings ## Notes -- Endee index name is `rag_app` by default; change via `ENDEE_INDEX_NAME`. -- Embedding dim is fixed at 384 to match `all-MiniLM-L6-v2` and the Endee index is created automatically if missing. -- Reranker uses a cross-encoder; if you want faster responses, you can disable reranking by returning `results` directly in `retrieve`. 
+- Index auto-creates with dimension 384 to match MiniLM embeddings. +- Sources shown only for RAG/Web modes; direct replies have no citations. +- Last 5 message pairs are passed for conversational continuity. + +## Architecture Diagram (Text) +``` +User (Streamlit UI) + | + v +Routing Selector (Agent/RAG/Web/Direct) + | + +-- Agent -> Router Prompt (Gemini) -> choose path + | | + | +-- RAG Path: + | | embed query (MiniLM) + | | -> Endee search (cosine, INT8, top_k) + | | -> rerank (cross-encoder) + | | -> context -> Gemini answer + chunk sources + | | + | +-- Web Path: + | | Tavily search (max 5) + | | -> context -> Gemini answer + web sources + | | + | +-- Direct Path: + | Gemini answer (no retrieval) + | +Uploads + | + v +File ingest (pdf/docx/txt/md) + -> extract text + -> chunk (800/120) + -> embed (MiniLM) + -> Endee upsert (metadata) +``` diff --git a/app/backend/main.py b/app/backend/main.py index 81b478143..99d753ccc 100644 --- a/app/backend/main.py +++ b/app/backend/main.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python import os from typing import List, Optional @@ -9,6 +9,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI from sentence_transformers import SentenceTransformer, CrossEncoder from starlette.responses import JSONResponse +from tavily import TavilyClient from endee import Endee, Precision from endee.schema import VectorItem as EndeeVectorItem @@ -31,7 +32,9 @@ CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "120")) TOP_K = int(os.getenv("TOP_K", "6")) -app = FastAPI(title="RAG Agentic Backend", version="0.1.0") +TAVILY_API_KEY = os.getenv("TAVILY_API_KEY") + +app = FastAPI(title="RAG Agentic Backend", version="0.2.0") app.add_middleware( CORSMiddleware, @@ -67,10 +70,11 @@ def ensure_index(client: Endee, dim: int) -> None: llm: Optional[ChatGoogleGenerativeAI] = None endee_client: Optional[Endee] = None endee_index = None +tavily_client: Optional[TavilyClient] = None def bootstrap(): - global embedder, reranker, llm, 
endee_client, endee_index + global embedder, reranker, llm, endee_client, endee_index, tavily_client if GEMINI_API_KEY is None: raise RuntimeError("GEMINI_API_KEY is required") @@ -85,6 +89,9 @@ def bootstrap(): ensure_index(endee_client, embedder.get_sentence_embedding_dimension()) endee_index = endee_client.get_index(name=ENDEE_INDEX_NAME) + if TAVILY_API_KEY: + tavily_client = TavilyClient(api_key=TAVILY_API_KEY) + bootstrap() @@ -174,14 +181,35 @@ def retrieve(query: str): key=lambda x: x["rerank_score"], reverse=True, ) - return reranked + return reranked[:4] + + +def web_search(query: str): + if not tavily_client: + raise HTTPException(status_code=500, detail="TAVILY_API_KEY not configured") + res = tavily_client.search(query=query, max_results=5, include_images=False) + docs = [] + for item in res.get("results", []): + docs.append( + { + "id": item.get("url"), + "meta": { + "source": item.get("url"), + "text": item.get("content", ""), + "title": item.get("title", "") + }, + "score": item.get("score"), + } + ) + return docs def build_context(docs: List[dict]) -> str: parts = [] for doc in docs: meta = doc.get("meta", {}) - parts.append(f"[{meta.get('source')}] {meta.get('text', '')}") + src = meta.get("title") or meta.get("source") + parts.append(f"[{src}] {meta.get('text', '')}") return "\n\n".join(parts) @@ -194,7 +222,28 @@ def build_history(history: List[dict]) -> str: return "\n\n".join(lines) -def answer(question: str, context_docs: List[dict], history: List[dict]) -> dict: +def route_query(question: str, force_mode: str = "auto") -> str: + if force_mode in {"rag", "web", "direct"}: + return force_mode + # lightweight classification with the main LLM + prompt = ( + "Decide routing for the question. 
Output only one token: RAG, WEB, or DIRECT.\n" + "Use RAG if it likely needs internal docs; WEB if it's about current/general external info; otherwise DIRECT.\n" + f"Question: {question}\nAnswer:" + ) + try: + resp = llm.invoke(prompt) + text = resp.content.strip().upper() + if "WEB" in text: + return "web" + if "RAG" in text: + return "rag" + return "direct" + except Exception: + return "direct" + + +def answer(question: str, context_docs: List[dict], history: List[dict], mode: str) -> dict: if context_docs: context_text = build_context(context_docs) prompt = ( @@ -218,12 +267,13 @@ def answer(question: str, context_docs: List[dict], history: List[dict]) -> dict { "id": doc.get("id"), "source": doc.get("meta", {}).get("source"), - "score": doc.get("rerank_score", doc.get("similarity")), + "title": doc.get("meta", {}).get("title"), + "score": doc.get("rerank_score", doc.get("similarity", doc.get("score"))), "preview": doc.get("meta", {}).get("text", "")[:200], } for doc in context_docs ], - "mode": "rag" if context_docs else "direct", + "mode": mode, } @@ -248,11 +298,19 @@ async def upload(file: UploadFile = File(...)): @app.post("/chat") async def chat(payload: dict): question = payload.get("message") or "" - use_rag = payload.get("use_rag", True) + force_mode = (payload.get("mode") or "auto").lower() history = payload.get("history") or [] if not question: raise HTTPException(status_code=400, detail="message is required") - docs = retrieve(question) if use_rag else [] - result = answer(question, docs, history) + route = route_query(question, force_mode=force_mode) + + if route == "rag": + docs = retrieve(question) + elif route == "web": + docs = web_search(question) + else: + docs = [] + + result = answer(question, docs, history, mode=route) return JSONResponse(result) diff --git a/app/requirements.txt b/app/requirements.txt index 8f5fcac58..4b53dec82 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -12,3 +12,4 @@ endee>=0.1.0 requests>=2.31.0 
streamlit>=1.32.0 python-multipart>=0.0.9 +tavily-python>=0.3.8