From a88b8827202206e3c376ec2b9908fb69f9dcd155 Mon Sep 17 00:00:00 2001 From: Siddharth Senthilkumar Date: Tue, 17 Mar 2026 20:41:48 +0530 Subject: [PATCH 1/2] update project --- app/.env | 16 ++ app/.gitignore | 21 ++ app/README.md | 36 +++ app/backend/main.py | 258 +++++++++++++++++++++ app/backend/main.py.bak | 218 +++++++++++++++++ app/requirements.txt | 14 ++ examples/rag_agentic_demo/README.md | 49 ++++ examples/rag_agentic_demo/app.py | 156 +++++++++++++ examples/rag_agentic_demo/requirements.txt | 1 + test.ipynb | 80 +++++++ 10 files changed, 849 insertions(+) create mode 100644 app/.env create mode 100644 app/.gitignore create mode 100644 app/README.md create mode 100644 app/backend/main.py create mode 100644 app/backend/main.py.bak create mode 100644 app/requirements.txt create mode 100644 examples/rag_agentic_demo/README.md create mode 100644 examples/rag_agentic_demo/app.py create mode 100644 examples/rag_agentic_demo/requirements.txt create mode 100644 test.ipynb diff --git a/app/.env b/app/.env new file mode 100644 index 000000000..4376506b1 --- /dev/null +++ b/app/.env @@ -0,0 +1,16 @@ +# API keys +GEMINI_API_KEY="REDACTED_ROTATE_THIS_KEY" +GEMINI_MODEL="gemini-2.5-flash" + +# Vector DB +ENDEE_BASE_URL=http://localhost:8080/api/v1 +ENDEE_AUTH_TOKEN= +ENDEE_INDEX_NAME=rag_app + +# RAG params +CHUNK_SIZE=800 +CHUNK_OVERLAP=120 +TOP_K=6 + +# Frontend +BACKEND_URL=http://localhost:8000 diff --git a/app/.gitignore b/app/.gitignore new file mode 100644 index 000000000..449021385 --- /dev/null +++ b/app/.gitignore @@ -0,0 +1,21 @@ +cat < .gitignore +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd + +# Virtual environment +.venv/ +venv/ + +# IDE +.vscode/ +.idea/ + +# OS +.DS_Store + +# Logs +*.log +EOL \ No newline at end of file diff --git a/app/README.md b/app/README.md new file mode 100644 index 000000000..ae0930b41 --- /dev/null +++ b/app/README.md @@ -0,0 +1,36 @@ +# Agentic RAG App (FastAPI + Streamlit + Endee) + +## What it does +- Upload 
PDF/DOCX/TXT → chunk with LangChain → embed via `sentence-transformers/all-MiniLM-L6-v2` → store in Endee. +- Chat endpoint: embed query, search Endee (top_k), rerank with `cross-encoder/ms-marco-MiniLM-L-6-v2`, build context, answer with Gemini model (default `gemini-2.5-flash`). +- Toggle RAG on/off in the UI; responses include source metadata. + +## Layout +- `app/backend/main.py` – FastAPI service (`/upload`, `/chat`, `/health`). +- `app/frontend/streamlit_app.py` – Streamlit UI client. +- `app/requirements.txt` – all deps for both frontend + backend. +- `app/.env` – fill with your keys; defaults point to local Endee + backend. + +## Setup +```bash +cd app +python -m venv .venv +. .venv/Scripts/activate # or source .venv/bin/activate +pip install -r requirements.txt +``` +Edit `.env` with your `GEMINI_API_KEY` (and `ENDEE_AUTH_TOKEN` if your server requires it). + +## Run backend +```bash +uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload +``` + +## Run frontend +```bash +streamlit run frontend/streamlit_app.py +``` + +## Notes +- Endee index name is `rag_app` by default; change via `ENDEE_INDEX_NAME`. +- Embedding dim is fixed at 384 to match `all-MiniLM-L6-v2` and the Endee index is created automatically if missing. +- Reranker uses a cross-encoder; if you want faster responses, you can disable reranking by returning `results` directly in `retrieve`. 
diff --git a/app/backend/main.py b/app/backend/main.py new file mode 100644 index 000000000..81b478143 --- /dev/null +++ b/app/backend/main.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python +import os +from typing import List, Optional + +from dotenv import load_dotenv +from fastapi import FastAPI, File, UploadFile, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_google_genai import ChatGoogleGenerativeAI +from sentence_transformers import SentenceTransformer, CrossEncoder +from starlette.responses import JSONResponse + +from endee import Endee, Precision +from endee.schema import VectorItem as EndeeVectorItem + +# Monkey-patch Endee VectorItem to behave like a dict for `.get()` calls inside the SDK +if not hasattr(EndeeVectorItem, "get"): + EndeeVectorItem.get = lambda self, key, default=None: getattr(self, key, default) + +load_dotenv() + +# Environment +ENDEE_BASE_URL = os.getenv("ENDEE_BASE_URL", "http://localhost:8080/api/v1") +ENDEE_AUTH_TOKEN = os.getenv("ENDEE_AUTH_TOKEN", "") +ENDEE_INDEX_NAME = os.getenv("ENDEE_INDEX_NAME", "rag_app") + +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") +GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") + +CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "800")) +CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "120")) +TOP_K = int(os.getenv("TOP_K", "6")) + +app = FastAPI(title="RAG Agentic Backend", version="0.1.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +def get_endee_client() -> Endee: + client = Endee(ENDEE_AUTH_TOKEN) if ENDEE_AUTH_TOKEN else Endee() + client.set_base_url(ENDEE_BASE_URL) + return client + + +def ensure_index(client: Endee, dim: int) -> None: + try: + client.get_index(name=ENDEE_INDEX_NAME) + return + except Exception: + pass + client.create_index( + name=ENDEE_INDEX_NAME, + dimension=dim, + 
space_type="cosine", + precision=Precision.INT8, + ) + + +embedder: Optional[SentenceTransformer] = None +reranker: Optional[CrossEncoder] = None +llm: Optional[ChatGoogleGenerativeAI] = None +endee_client: Optional[Endee] = None +endee_index = None + + +def bootstrap(): + global embedder, reranker, llm, endee_client, endee_index + if GEMINI_API_KEY is None: + raise RuntimeError("GEMINI_API_KEY is required") + + embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") + llm = ChatGoogleGenerativeAI( + model=GEMINI_MODEL, + api_key=GEMINI_API_KEY, + temperature=0.2, + ) + endee_client = get_endee_client() + ensure_index(endee_client, embedder.get_sentence_embedding_dimension()) + endee_index = endee_client.get_index(name=ENDEE_INDEX_NAME) + + +bootstrap() + + +def read_file(file: UploadFile) -> str: + suffix = file.filename.split(".")[-1].lower() + content = file.file.read() + if suffix == "pdf": + try: + from pypdf import PdfReader + except Exception as exc: + raise HTTPException(status_code=500, detail=f"pypdf missing: {exc}") + file.file.seek(0) + reader = PdfReader(file.file) + text = "\n".join([p.extract_text() or "" for p in reader.pages]) + elif suffix in {"txt", "md"}: + text = content.decode("utf-8", errors="ignore") + elif suffix in {"docx"}: + try: + import docx2txt + except Exception as exc: + raise HTTPException(status_code=500, detail=f"docx2txt missing: {exc}") + temp_path = f"/tmp/{file.filename}" + with open(temp_path, "wb") as f: + f.write(content) + text = docx2txt.process(temp_path) or "" + os.remove(temp_path) + else: + raise HTTPException(status_code=400, detail="Unsupported file type") + return text + + +def chunk_text(text: str) -> List[str]: + splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + separators=["\n\n", "\n", " ", ""], + ) + return splitter.split_text(text) + + +def upsert_chunks(chunks: List[str], 
source: str): + embeddings = embedder.encode(chunks, convert_to_numpy=False) + payload = [] + for idx, (chunk, vector) in enumerate(zip(chunks, embeddings)): + payload.append( + { + "id": f"{source}::chunk-{idx}", + "vector": vector.tolist(), + "meta": {"source": source, "text": chunk}, + } + ) + endee_index.upsert(payload) + + +def _normalize_result(item) -> dict: + if hasattr(item, "dict"): + base = item.dict() + similarity = getattr(item, "similarity", None) + distance = getattr(item, "distance", None) + else: + base = dict(item) + similarity = base.get("similarity") + distance = base.get("distance") + return { + "id": base.get("id"), + "meta": base.get("meta") or {}, + "similarity": similarity, + "distance": distance, + "raw": item, + } + + +def retrieve(query: str): + query_vec = embedder.encode([query])[0].tolist() + results = endee_index.query(vector=query_vec, top_k=TOP_K, include_vectors=False) + if not results: + return [] + docs = [_normalize_result(r) for r in results] + rerank_inputs = [(query, doc["meta"].get("text", "")) for doc in docs] + scores = reranker.predict(rerank_inputs) + reranked = sorted( + [ + {**doc, "rerank_score": float(score)} + for doc, score in zip(docs, scores) + ], + key=lambda x: x["rerank_score"], + reverse=True, + ) + return reranked + + +def build_context(docs: List[dict]) -> str: + parts = [] + for doc in docs: + meta = doc.get("meta", {}) + parts.append(f"[{meta.get('source')}] {meta.get('text', '')}") + return "\n\n".join(parts) + + +def build_history(history: List[dict]) -> str: + lines = [] + for turn in history[-5:]: + u = turn.get("user") or "" + a = turn.get("answer") or "" + lines.append(f"User: {u}\nAssistant: {a}") + return "\n\n".join(lines) + + +def answer(question: str, context_docs: List[dict], history: List[dict]) -> dict: + if context_docs: + context_text = build_context(context_docs) + prompt = ( + "Use ONLY the provided context (and brief history if present). 
" + "If something is missing, say you don't have it.\n" + f"Context:\n{context_text}\n\n" + ) + else: + prompt = "Answer the user. If you don't know, say so.\n" + + history_text = build_history(history) + if history_text: + prompt += f"Recent history (for coherence, not new facts):\n{history_text}\n\n" + + prompt += f"Question: {question}" + + resp = llm.invoke(prompt) + return { + "answer": resp.content, + "sources": [ + { + "id": doc.get("id"), + "source": doc.get("meta", {}).get("source"), + "score": doc.get("rerank_score", doc.get("similarity")), + "preview": doc.get("meta", {}).get("text", "")[:200], + } + for doc in context_docs + ], + "mode": "rag" if context_docs else "direct", + } + + +@app.get("/health") +def health(): + return {"status": "ok"} + + +@app.post("/upload") +async def upload(file: UploadFile = File(...)): + try: + text = read_file(file) + chunks = chunk_text(text) + upsert_chunks(chunks, source=file.filename) + return {"message": f"Indexed {len(chunks)} chunks from {file.filename}"} + except HTTPException as e: + raise e + except Exception as exc: + raise HTTPException(status_code=500, detail=str(exc)) + + +@app.post("/chat") +async def chat(payload: dict): + question = payload.get("message") or "" + use_rag = payload.get("use_rag", True) + history = payload.get("history") or [] + if not question: + raise HTTPException(status_code=400, detail="message is required") + + docs = retrieve(question) if use_rag else [] + result = answer(question, docs, history) + return JSONResponse(result) diff --git a/app/backend/main.py.bak b/app/backend/main.py.bak new file mode 100644 index 000000000..27f7df4fa --- /dev/null +++ b/app/backend/main.py.bak @@ -0,0 +1,218 @@ +#!/usr/bin/env python +import os +from typing import List, Optional + +from dotenv import load_dotenv +from fastapi import FastAPI, File, UploadFile, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from langchain_text_splitters import RecursiveCharacterTextSplitter +from 
langchain_google_genai import ChatGoogleGenerativeAI +from sentence_transformers import SentenceTransformer, CrossEncoder +from starlette.responses import JSONResponse +from langchain_text_splitters import RecursiveCharacterTextSplitter + +from endee import Endee, Precision + +load_dotenv() + +# Environment +ENDEE_BASE_URL = os.getenv("ENDEE_BASE_URL", "http://localhost:8080/api/v1") +ENDEE_AUTH_TOKEN = os.getenv("ENDEE_AUTH_TOKEN", "") +ENDEE_INDEX_NAME = os.getenv("ENDEE_INDEX_NAME", "rag_app") + +GEMINI_API_KEY = os.getenv("GEMINI_API_KEY") +GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") + +CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "800")) +CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "120")) +TOP_K = int(os.getenv("TOP_K", "6")) + +app = FastAPI(title="RAG Agentic Backend", version="0.1.0") + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +def get_endee_client() -> Endee: + client = Endee(ENDEE_AUTH_TOKEN) if ENDEE_AUTH_TOKEN else Endee() + client.set_base_url(ENDEE_BASE_URL) + return client + + +def ensure_index(client: Endee, dim: int) -> None: + try: + client.get_index(name=ENDEE_INDEX_NAME) + return + except Exception: + pass + client.create_index( + name=ENDEE_INDEX_NAME, + dimension=dim, + space_type="cosine", + precision=Precision.INT8, + ) + + +embedder: Optional[SentenceTransformer] = None +reranker: Optional[CrossEncoder] = None +llm: Optional[ChatGoogleGenerativeAI] = None +endee_client: Optional[Endee] = None +endee_index = None + + +def bootstrap(): + global embedder, reranker, llm, endee_client, endee_index + if GEMINI_API_KEY is None: + raise RuntimeError("GEMINI_API_KEY is required") + + embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") + reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") + llm = ChatGoogleGenerativeAI( + model=GEMINI_MODEL, + api_key=GEMINI_API_KEY, + temperature=0.2, + ) + 
endee_client = get_endee_client() + ensure_index(endee_client, embedder.get_sentence_embedding_dimension()) + endee_index = endee_client.get_index(name=ENDEE_INDEX_NAME) + + +bootstrap() + + +def read_file(file: UploadFile) -> str: + suffix = file.filename.split(".")[-1].lower() + content = file.file.read() + if suffix == "pdf": + try: + from pypdf import PdfReader + except Exception as exc: + raise HTTPException(status_code=500, detail=f"pypdf missing: {exc}") + file.file.seek(0) + reader = PdfReader(file.file) + text = "\n".join([p.extract_text() or "" for p in reader.pages]) + elif suffix in {"txt", "md"}: + text = content.decode("utf-8", errors="ignore") + elif suffix in {"docx"}: + try: + import docx2txt + except Exception as exc: + raise HTTPException(status_code=500, detail=f"docx2txt missing: {exc}") + temp_path = f"/tmp/{file.filename}" + with open(temp_path, "wb") as f: + f.write(content) + text = docx2txt.process(temp_path) or "" + os.remove(temp_path) + else: + raise HTTPException(status_code=400, detail="Unsupported file type") + return text + + +def chunk_text(text: str) -> List[str]: + splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + separators=["\n\n", "\n", " ", ""], + ) + return splitter.split_text(text) + + +def upsert_chunks(chunks: List[str], source: str): + embeddings = embedder.encode(chunks, convert_to_numpy=False) + payload = [] + for idx, (chunk, vector) in enumerate(zip(chunks, embeddings)): + payload.append( + { + "id": f"{source}::chunk-{idx}", + "vector": vector.tolist(), + "meta": {"source": source, "text": chunk}, + } + ) + endee_index.upsert(payload) + + +def retrieve(query: str): + query_vec = embedder.encode([query])[0].tolist() + results = endee_index.query(vector=query_vec, top_k=TOP_K, include_vectors=False) + if not results: + return [] + rerank_inputs = [(query, item.get("meta", {}).get("text", "")) for item in results] + scores = reranker.predict(rerank_inputs) + reranked = 
sorted( + [ + {**item, "rerank_score": float(score)} + for item, score in zip(results, scores) + ], + key=lambda x: x["rerank_score"], + reverse=True, + ) + return reranked + + +def build_context(docs: List[dict]) -> str: + parts = [] + for doc in docs: + meta = doc.get("meta", {}) + parts.append(f"[{meta.get('source')}] {meta.get('text', '')}") + return "\n\n".join(parts) + + +def answer(question: str, context_docs: List[dict]) -> dict: + if context_docs: + context_text = build_context(context_docs) + prompt = ( + "Use the context to answer. If information is missing, say you don't have it.\n" + f"Context:\n{context_text}\n\nQuestion: {question}" + ) + else: + prompt = question + resp = llm.invoke(prompt) + return { + "answer": resp.content, + "sources": [ + { + "id": doc["id"], + "source": doc.get("meta", {}).get("source"), + "score": doc.get("rerank_score", doc.get("similarity")), + "preview": doc.get("meta", {}).get("text", "")[:200], + } + for doc in context_docs + ], + "mode": "rag" if context_docs else "direct", + } + + +@app.get("/health") +def health(): + return {"status": "ok"} + + +@app.post("/upload") +async def upload(file: UploadFile = File(...)): + try: + text = read_file(file) + chunks = chunk_text(text) + upsert_chunks(chunks, source=file.filename) + return {"message": f"Indexed {len(chunks)} chunks from {file.filename}"} + except HTTPException as e: + raise e + print(e) + except Exception as exc: + raise HTTPException(status_code=500, detail=str(exc)) + + +@app.post("/chat") +async def chat(payload: dict): + question = payload.get("message") or "" + use_rag = payload.get("use_rag", True) + if not question: + raise HTTPException(status_code=400, detail="message is required") + + docs = retrieve(question) if use_rag else [] + result = answer(question, docs) + return JSONResponse(result) diff --git a/app/requirements.txt b/app/requirements.txt new file mode 100644 index 000000000..8f5fcac58 --- /dev/null +++ b/app/requirements.txt @@ -0,0 +1,14 @@ 
+fastapi>=0.110.0 +uvicorn[standard]>=0.27.1 +python-dotenv>=1.0.1 +langchain>=0.1.16 +langchain-community>=0.0.34 +langchain-google-genai>=0.0.12 +langchain-text-splitters>=0.0.1 +sentence-transformers>=2.5.1 +pypdf>=4.2.0 +docx2txt>=0.8 +endee>=0.1.0 +requests>=2.31.0 +streamlit>=1.32.0 +python-multipart>=0.0.9 diff --git a/examples/rag_agentic_demo/README.md b/examples/rag_agentic_demo/README.md new file mode 100644 index 000000000..574b3d157 --- /dev/null +++ b/examples/rag_agentic_demo/README.md @@ -0,0 +1,49 @@ +# RAG + Agentic Demo with Endee + +This example shows a minimal retrieval-augmented generation loop that uses the Endee vector database for retrieval and OpenAI for embeddings + answers. It includes a tiny agentic step: each user query is first rewritten by the LLM for better retrieval, then the rewritten query is embedded and searched in Endee. + +## Files +- `app.py` – main script (ingest sample docs, interactive Q&A loop) +- `requirements.txt` – Python dependencies +- `data/sample_docs/*.txt` – tiny corpus to load by default + +## Prerequisites +- Python 3.10+ +- Running Endee server at `http://localhost:8080` (default from `run.sh`) +- OpenAI API key (set `OPENAI_API_KEY`) + +## Setup +```bash +cd examples/rag_agentic_demo +python -m venv .venv +. .venv/Scripts/activate # Windows +# or: source .venv/bin/activate +pip install -r requirements.txt +``` + +## Environment +Create `.env` in this folder: +``` +OPENAI_API_KEY=sk-... +ENDEE_BASE_URL=http://localhost:8080/api/v1 # optional +ENDEE_AUTH_TOKEN= # optional if you enabled auth +``` + +## Run +```bash +python app.py --index rag_demo +``` +- On first run, the script ingests the sample docs. +- Ask questions in the prompt; type `exit` to quit. +- Use `--skip-ingest` if the index already has your data. + +## Using your own data +Place text files under `data/sample_docs` (or modify `SAMPLE_DIR` in `app.py`). The script will embed and upsert them on startup. 
+ +## How it works (flow) +1) **Rewrite** the user question via `rewrite_query` (agentic query optimization). +2) **Embed** rewritten query with `text-embedding-3-small`. +3) **Search** Endee (`top_k=4`). +4) **Grounded answer** with `gpt-4o-mini`, constrained to retrieved context. + +You can swap models, add filters, or extend the agent step to call other tools. diff --git a/examples/rag_agentic_demo/app.py b/examples/rag_agentic_demo/app.py new file mode 100644 index 000000000..2c56030dd --- /dev/null +++ b/examples/rag_agentic_demo/app.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +import argparse +import glob +import os +from pathlib import Path +from typing import List, Tuple + +from dotenv import load_dotenv +from openai import OpenAI +from endee import Endee, Precision + +load_dotenv() + +DEFAULT_INDEX = "rag_demo" +EMBED_MODEL = "text-embedding-3-small" +EMBED_DIM = 1536 # dimension for text-embedding-3-small +SAMPLE_DIR = Path(__file__).parent / "data" / "sample_docs" + + +def get_openai_client() -> OpenAI: + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + raise RuntimeError("OPENAI_API_KEY is required for embeddings and generation") + return OpenAI(api_key=api_key) + + +def get_endee_client() -> Endee: + auth = os.getenv("ENDEE_AUTH_TOKEN", "") + base_url = os.getenv("ENDEE_BASE_URL", "http://localhost:8080/api/v1") + client = Endee(auth) if auth else Endee() + client.set_base_url(base_url) + return client + + +def ensure_index(client: Endee, name: str, dim: int) -> None: + try: + client.get_index(name=name) + return + except Exception: + pass + client.create_index(name=name, dimension=dim, space_type="cosine", precision=Precision.INT8) + + +def load_corpus() -> List[Tuple[str, str]]: + docs = [] + for path in glob.glob(str(SAMPLE_DIR / "*.txt")): + with open(path, "r", encoding="utf-8") as f: + text = f.read().strip() + docs.append((Path(path).stem, text)) + if not docs: + raise RuntimeError(f"No documents found in {SAMPLE_DIR}") + return docs + 
+ +def embed_texts(client: OpenAI, texts: List[str]) -> List[List[float]]: + resp = client.embeddings.create(model=EMBED_MODEL, input=texts) + return [item.embedding for item in resp.data] + + +def upsert_docs(index, ids: List[str], texts: List[str], embeddings: List[List[float]]): + payload = [] + for doc_id, text, emb in zip(ids, texts, embeddings): + payload.append({ + "id": doc_id, + "vector": emb, + "meta": {"source": doc_id, "preview": text[:200]} + }) + index.upsert(payload) + + +def rewrite_query(llm: OpenAI, query: str) -> str: + prompt = ( + "Rewrite the user query to be retrieval-friendly, short, and factual. " + "Preserve intent and key nouns; drop pronouns and chit-chat." + ) + res = llm.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": prompt}, + {"role": "user", "content": query}, + ], + max_tokens=50, + ) + return res.choices[0].message.content.strip() + + +def search(index, embedding: List[float], k: int): + return index.query(vector=embedding, top_k=k, include_vectors=False) + + +def build_context(results) -> str: + chunks = [] + for i, item in enumerate(results, 1): + meta = item.get("meta") or {} + preview = meta.get("preview") or "" + chunks.append(f"[{i}] id={item['id']} score={item.get('similarity', item.get('distance'))}\n{preview}\n") + return "\n".join(chunks) + + +def answer_question(llm: OpenAI, question: str, context: str) -> str: + system = ( + "You are a concise assistant. Answer using only the provided context. " + "If the answer is not in the context, say you cannot find it." 
+ ) + res = llm.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}, + ], + max_tokens=200, + temperature=0.2, + ) + return res.choices[0].message.content.strip() + + +def interactive_loop(index, llm: OpenAI): + print("Type your question (or 'exit' to quit)") + while True: + try: + q = input("ask> ").strip() + except (EOFError, KeyboardInterrupt): + break + if not q or q.lower() in {"exit", "quit"}: + break + rewritten = rewrite_query(llm, q) + embedding = embed_texts(llm, [rewritten])[0] + results = search(index, embedding, k=4) + context = build_context(results) + answer = answer_question(llm, q, context) + print("\n---\n", answer, "\n---\n", sep="") + + +def main(): + parser = argparse.ArgumentParser(description="RAG + agentic demo with Endee") + parser.add_argument("--index", default=DEFAULT_INDEX, help="Index name to use/create") + parser.add_argument("--skip-ingest", action="store_true", help="Skip loading sample corpus") + args = parser.parse_args() + + llm = get_openai_client() + endee_client = get_endee_client() + ensure_index(endee_client, args.index, EMBED_DIM) + index = endee_client.get_index(name=args.index) + + if not args.skip_ingest: + docs = load_corpus() + ids, texts = zip(*docs) + embeddings = embed_texts(llm, list(texts)) + upsert_docs(index, list(ids), list(texts), embeddings) + print(f"Ingested {len(ids)} docs into index '{args.index}'") + + interactive_loop(index, llm) + + +if __name__ == "__main__": + main() diff --git a/examples/rag_agentic_demo/requirements.txt b/examples/rag_agentic_demo/requirements.txt new file mode 100644 index 000000000..7b70ca6fd --- /dev/null +++ b/examples/rag_agentic_demo/requirements.txt @@ -0,0 +1 @@ +openai>=1.12.0\npython-dotenv>=1.0.1\nendee>=0.1.0\n diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 000000000..6b0ecb53d --- /dev/null +++ b/test.ipynb @@ -0,0 +1,80 @@ +{ 
+ "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "6a9d3cb5", + "metadata": {}, + "outputs": [], + "source": [ + "from endee import Endee, Precision\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "40aa2059", + "metadata": {}, + "outputs": [], + "source": [ + "client = Endee()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "01a51046", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Index created successfully'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.create_index(\n", + " name=\"my_index2\",\n", + " dimension=384,\n", + " space_type=\"cosine\",\n", + " precision=Precision.INT8\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "225da938", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 7592dc25abf6b1d36560c2fe3baae30ee929bc7a Mon Sep 17 00:00:00 2001 From: CodexSandboxOffline Date: Tue, 17 Mar 2026 23:41:07 +0530 Subject: [PATCH 2/2] Add agentic app updates, sample env, and ignores --- app/{.env => .env.example} | 13 ++-- app/.gitignore | 23 ++---- app/README.md | 139 +++++++++++++++++++++++++++++++------ app/backend/main.py | 80 ++++++++++++++++++--- app/requirements.txt | 1 + 5 files changed, 195 insertions(+), 61 deletions(-) rename app/{.env => .env.example} (51%) diff --git a/app/.env b/app/.env.example similarity index 51% rename from app/.env rename to app/.env.example index 4376506b1..9a8970874 100644 --- a/app/.env +++ 
b/app/.env.example @@ -1,16 +1,11 @@ -# API keys -GEMINI_API_KEY="REDACTED_ROTATE_THIS_KEY" -GEMINI_MODEL="gemini-2.5-flash" - -# Vector DB +# Sample environment (fill with your keys before running) +GEMINI_API_KEY=your_gemini_key +GEMINI_MODEL=gemini-2.5-flash ENDEE_BASE_URL=http://localhost:8080/api/v1 ENDEE_AUTH_TOKEN= ENDEE_INDEX_NAME=rag_app - -# RAG params CHUNK_SIZE=800 CHUNK_OVERLAP=120 TOP_K=6 - -# Frontend +TAVILY_API_KEY=your_tavily_key BACKEND_URL=http://localhost:8000 diff --git a/app/.gitignore b/app/.gitignore index 449021385..abe52e10c 100644 --- a/app/.gitignore +++ b/app/.gitignore @@ -1,21 +1,8 @@ -cat < .gitignore -# Python +.env +.env.local +!.env.example +.venv/ __pycache__/ *.pyc -*.pyo -*.pyd - -# Virtual environment -.venv/ -venv/ - -# IDE -.vscode/ -.idea/ - -# OS -.DS_Store - -# Logs *.log -EOL \ No newline at end of file +.cache/ diff --git a/app/README.md b/app/README.md index ae0930b41..978f612ee 100644 --- a/app/README.md +++ b/app/README.md @@ -1,36 +1,129 @@ -# Agentic RAG App (FastAPI + Streamlit + Endee) +# Agentic RAG Starter (FastAPI + Streamlit + Endee + Gemini + Tavily) -## What it does -- Upload PDF/DOCX/TXT → chunk with LangChain → embed via `sentence-transformers/all-MiniLM-L6-v2` → store in Endee. -- Chat endpoint: embed query, search Endee (top_k), rerank with `cross-encoder/ms-marco-MiniLM-L-6-v2`, build context, answer with Gemini model (default `gemini-2.5-flash`). -- Toggle RAG on/off in the UI; responses include source metadata. +This app lets you chat with three routing paths: +- **Agent** (auto-route): LLM decides whether to use RAG (Endee), Web (Tavily), or Direct (LLM only). +- **RAG**: embed → Endee search → rerank → Gemini answer with sources. +- **Web**: Tavily search → Gemini answer with web citations. +- **Direct**: Gemini only (no retrieval). -## Layout -- `app/backend/main.py` – FastAPI service (`/upload`, `/chat`, `/health`). -- `app/frontend/streamlit_app.py` – Streamlit UI client. 
-- `app/requirements.txt` – all deps for both frontend + backend. -- `app/.env` – fill with your keys; defaults point to local Endee + backend. +## Tech Stack +- Backend: FastAPI (`app/backend/main.py`) +- Frontend: Streamlit (`app/frontend/streamlit_app.py`) +- Vector DB: Endee server at `http://localhost:8080` (cosine, INT8, dim 384) +- Embeddings: `sentence-transformers/all-MiniLM-L6-v2` +- Reranker: `cross-encoder/ms-marco-MiniLM-L-6-v2` +- LLM: Gemini (default `gemini-2.5-flash` via Google AI key) +- Web search: Tavily API + +## Prerequisites +- Python 3.10+ +- Endee server running locally on 8080 (from `run.sh` or docker-compose) +- API keys: + - `GEMINI_API_KEY` (required) + - `TAVILY_API_KEY` (for Web route) ## Setup -```bash -cd app +```powershell +cd C:\VH811\projects\endee\app python -m venv .venv -. .venv/Scripts/activate # or source .venv/bin/activate +. .venv\Scripts\activate pip install -r requirements.txt ``` -Edit `.env` with your `GEMINI_API_KEY` (and `ENDEE_AUTH_TOKEN` if your server requires it). -## Run backend -```bash -uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload +### Environment +Edit `app/.env` (already created) and set: +``` +GEMINI_API_KEY=your_gemini_key +GEMINI_MODEL=gemini-2.5-flash +ENDEE_BASE_URL=http://localhost:8080/api/v1 +ENDEE_AUTH_TOKEN= # blank if Endee auth disabled +ENDEE_INDEX_NAME=rag_app +CHUNK_SIZE=800 +CHUNK_OVERLAP=120 +TOP_K=6 +TAVILY_API_KEY=your_tavily_key +BACKEND_URL=http://localhost:8000 ``` -## Run frontend -```bash -streamlit run frontend/streamlit_app.py +## Run +Backend: +```powershell +cd C:\VH811\projects\endee\app +. .venv\Scripts\activate +python -m uvicorn backend.main:app --host 0.0.0.0 --port 8000 +``` +Frontend: +```powershell +cd C:\VH811\projects\endee\app +. .venv\Scripts\activate +streamlit run frontend\streamlit_app.py ``` +Open Streamlit at http://localhost:8501. 
+ +## How it Works (Pipeline) +1) **Uploads**: pdf/docx/txt/md → text extract (pypdf/docx2txt) → chunk (800/120) → embed (MiniLM) → upsert to Endee with metadata. +2) **Routing** (Agent): quick Gemini prompt decides `rag|web|direct` unless user forces mode via UI radio. +3) **RAG path**: embed query → Endee top_k=6 → rerank top 4 with cross-encoder → build context → Gemini answers; returns sources (chunk previews). +4) **Web path**: Tavily search max 5 results → context → Gemini answers; returns web URLs/titles. +5) **Direct path**: Gemini answers without retrieval. +6) **History**: last 5 turns sent to LLM for coherence (not for facts). + +## Routing Modes in UI +- Agent (auto): LLM router picks best path. +- RAG: force vector search. +- Web: force Tavily search. +- Direct: force plain LLM. + +## Tuning & Performance +- Lower `TOP_K` or rerank cutoff to reduce latency (currently rerank keeps top 4). +- Smaller reranker model (e.g., `cross-encoder/ms-marco-MiniLM-L-2-v2`) for speed. +- Use Direct mode for general chit-chat to avoid retrieval. +- Warm cache by one request after restart to load HF models. + +## Troubleshooting +- `API key not valid / 403 / 429`: check/replace `GEMINI_API_KEY`, quota or billing; switch model if needed. +- `TAVILY_API_KEY not configured`: set the key in `.env` for Web mode. +- Upload errors: ensure file type is pdf/docx/txt/md and Endee server is reachable at `ENDEE_BASE_URL`. + +## File Map +- `app/backend/main.py` — API routes, router, RAG/Web/Direct flows +- `app/frontend/streamlit_app.py` — UI and routing selector +- `app/requirements.txt` — dependencies +- `app/.env` — keys and settings ## Notes -- Endee index name is `rag_app` by default; change via `ENDEE_INDEX_NAME`. -- Embedding dim is fixed at 384 to match `all-MiniLM-L6-v2` and the Endee index is created automatically if missing. -- Reranker uses a cross-encoder; if you want faster responses, you can disable reranking by returning `results` directly in `retrieve`. 
+- Index auto-creates with dimension 384 to match MiniLM embeddings. +- Sources shown only for RAG/Web modes; direct replies have no citations. +- Last 5 message pairs are passed for conversational continuity. + +## Architecture Diagram (Text) +``` +User (Streamlit UI) + | + v +Routing Selector (Agent/RAG/Web/Direct) + | + +-- Agent -> Router Prompt (Gemini) -> choose path + | | + | +-- RAG Path: + | | embed query (MiniLM) + | | -> Endee search (cosine, INT8, top_k) + | | -> rerank (cross-encoder) + | | -> context -> Gemini answer + chunk sources + | | + | +-- Web Path: + | | Tavily search (max 5) + | | -> context -> Gemini answer + web sources + | | + | +-- Direct Path: + | Gemini answer (no retrieval) + | +Uploads + | + v +File ingest (pdf/docx/txt/md) + -> extract text + -> chunk (800/120) + -> embed (MiniLM) + -> Endee upsert (metadata) +``` diff --git a/app/backend/main.py b/app/backend/main.py index 81b478143..99d753ccc 100644 --- a/app/backend/main.py +++ b/app/backend/main.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python import os from typing import List, Optional @@ -9,6 +9,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI from sentence_transformers import SentenceTransformer, CrossEncoder from starlette.responses import JSONResponse +from tavily import TavilyClient from endee import Endee, Precision from endee.schema import VectorItem as EndeeVectorItem @@ -31,7 +32,9 @@ CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "120")) TOP_K = int(os.getenv("TOP_K", "6")) -app = FastAPI(title="RAG Agentic Backend", version="0.1.0") +TAVILY_API_KEY = os.getenv("TAVILY_API_KEY") + +app = FastAPI(title="RAG Agentic Backend", version="0.2.0") app.add_middleware( CORSMiddleware, @@ -67,10 +70,11 @@ def ensure_index(client: Endee, dim: int) -> None: llm: Optional[ChatGoogleGenerativeAI] = None endee_client: Optional[Endee] = None endee_index = None +tavily_client: Optional[TavilyClient] = None def bootstrap(): - global embedder, reranker, llm, 
endee_client, endee_index + global embedder, reranker, llm, endee_client, endee_index, tavily_client if GEMINI_API_KEY is None: raise RuntimeError("GEMINI_API_KEY is required") @@ -85,6 +89,9 @@ def bootstrap(): ensure_index(endee_client, embedder.get_sentence_embedding_dimension()) endee_index = endee_client.get_index(name=ENDEE_INDEX_NAME) + if TAVILY_API_KEY: + tavily_client = TavilyClient(api_key=TAVILY_API_KEY) + bootstrap() @@ -174,14 +181,35 @@ def retrieve(query: str): key=lambda x: x["rerank_score"], reverse=True, ) - return reranked + return reranked[:4] + + +def web_search(query: str): + if not tavily_client: + raise HTTPException(status_code=500, detail="TAVILY_API_KEY not configured") + res = tavily_client.search(query=query, max_results=5, include_images=False) + docs = [] + for item in res.get("results", []): + docs.append( + { + "id": item.get("url"), + "meta": { + "source": item.get("url"), + "text": item.get("content", ""), + "title": item.get("title", "") + }, + "score": item.get("score"), + } + ) + return docs def build_context(docs: List[dict]) -> str: parts = [] for doc in docs: meta = doc.get("meta", {}) - parts.append(f"[{meta.get('source')}] {meta.get('text', '')}") + src = meta.get("title") or meta.get("source") + parts.append(f"[{src}] {meta.get('text', '')}") return "\n\n".join(parts) @@ -194,7 +222,28 @@ def build_history(history: List[dict]) -> str: return "\n\n".join(lines) -def answer(question: str, context_docs: List[dict], history: List[dict]) -> dict: +def route_query(question: str, force_mode: str = "auto") -> str: + if force_mode in {"rag", "web", "direct"}: + return force_mode + # lightweight classification with the main LLM + prompt = ( + "Decide routing for the question. 
Output only one token: RAG, WEB, or DIRECT.\n" + "Use RAG if it likely needs internal docs; WEB if it's about current/general external info; otherwise DIRECT.\n" + f"Question: {question}\nAnswer:" + ) + try: + resp = llm.invoke(prompt) + text = resp.content.strip().upper() + if "WEB" in text: + return "web" + if "RAG" in text: + return "rag" + return "direct" + except Exception: + return "direct" + + +def answer(question: str, context_docs: List[dict], history: List[dict], mode: str) -> dict: if context_docs: context_text = build_context(context_docs) prompt = ( @@ -218,12 +267,13 @@ def answer(question: str, context_docs: List[dict], history: List[dict]) -> dict { "id": doc.get("id"), "source": doc.get("meta", {}).get("source"), - "score": doc.get("rerank_score", doc.get("similarity")), + "title": doc.get("meta", {}).get("title"), + "score": doc.get("rerank_score", doc.get("similarity", doc.get("score"))), "preview": doc.get("meta", {}).get("text", "")[:200], } for doc in context_docs ], - "mode": "rag" if context_docs else "direct", + "mode": mode, } @@ -248,11 +298,19 @@ async def upload(file: UploadFile = File(...)): @app.post("/chat") async def chat(payload: dict): question = payload.get("message") or "" - use_rag = payload.get("use_rag", True) + force_mode = (payload.get("mode") or "auto").lower() history = payload.get("history") or [] if not question: raise HTTPException(status_code=400, detail="message is required") - docs = retrieve(question) if use_rag else [] - result = answer(question, docs, history) + route = route_query(question, force_mode=force_mode) + + if route == "rag": + docs = retrieve(question) + elif route == "web": + docs = web_search(question) + else: + docs = [] + + result = answer(question, docs, history, mode=route) return JSONResponse(result) diff --git a/app/requirements.txt b/app/requirements.txt index 8f5fcac58..4b53dec82 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -12,3 +12,4 @@ endee>=0.1.0 requests>=2.31.0 
streamlit>=1.32.0 python-multipart>=0.0.9 +tavily-python>=0.3.8