|
1 |
| -from fastapi import APIRouter, Query |
| 1 | +from fastapi import APIRouter, Query, Body |
2 | 2 | from fastapi.responses import JSONResponse
|
| 3 | +from pydantic import BaseModel |
| 4 | +from typing import Optional |
3 | 5 |
|
4 |
| -from .rag import MemoryAlphaRAG, ThinkingMode |
| 6 | +from .rag import MemoryAlphaRAG |
5 | 7 |
|
# Shared router that the application mounts; all RAG endpoints attach here.
router = APIRouter()

# Singleton or global instance for demo; in production, manage lifecycle properly
# (e.g. FastAPI lifespan/dependency injection) so the model isn't loaded at import time.
rag_instance = MemoryAlphaRAG()
|
10 |
| -ThinkingMode = ThinkingMode |
| 12 | + |
class AskRequest(BaseModel):
    """Request payload for the POST /memoryalpha/rag/ask endpoint.

    Defaults mirror the query-parameter defaults of the GET variant so both
    endpoints behave consistently when optional fields are omitted.
    """

    # The user question to answer via the RAG pipeline (required).
    question: str
    # Maximum number of tokens the model may generate.
    max_tokens: Optional[int] = 2048
    # Number of documents to retrieve for context.
    top_k: Optional[int] = 10
    # Nucleus-sampling parameter.
    top_p: Optional[float] = 0.8
    # Randomness/creativity of the output.
    temperature: Optional[float] = 0.3
| 19 | + |
@router.post("/memoryalpha/rag/ask")
def ask_endpoint_post(request: AskRequest):
    """
    Answer a question via the RAG pipeline (POST variant).

    Accepts POST requests with JSON payload for cleaner API usage.
    Returns {"response": <answer>} on success; if the pipeline raises,
    responds with HTTP 500 and {"error": <message>}.
    """
    try:
        # Forward the validated payload fields to the shared RAG instance.
        result = rag_instance.ask(
            request.question,
            max_tokens=request.max_tokens,
            top_k=request.top_k,
            top_p=request.top_p,
            temperature=request.temperature,
        )
    except Exception as exc:  # API boundary: surface the failure as a 500, don't crash
        return JSONResponse(status_code=500, content={"error": str(exc)})
    return JSONResponse(content={"response": result})
11 | 37 |
|
12 | 38 | @router.get("/memoryalpha/rag/ask")
|
13 | 39 | def ask_endpoint(
|
14 | 40 | question: str = Query(..., description="The user question"),
|
15 |
| - thinkingmode: str = Query("DISABLED", description="Thinking mode: DISABLED, QUIET, or VERBOSE"), |
16 | 41 | max_tokens: int = Query(2048, description="Maximum tokens to generate"),
|
17 | 42 | top_k: int = Query(10, description="Number of documents to retrieve"),
|
18 | 43 | top_p: float = Query(0.8, description="Sampling parameter"),
|
19 | 44 | temperature: float = Query(0.3, description="Randomness/creativity of output")
|
20 | 45 | ):
|
21 | 46 | """
|
22 |
| - Query the RAG pipeline and return the full response (including thinking if enabled). |
| 47 | + Query the RAG pipeline and return the full response. |
| 48 | + Now uses advanced tool-enabled RAG by default for better results. |
23 | 49 | """
|
24 | 50 | try:
|
25 |
| - # Set the thinking mode for this request |
26 |
| - rag_instance.thinking_mode = ThinkingMode[thinkingmode.upper()] |
27 | 51 | answer = rag_instance.ask(
|
28 | 52 | question,
|
29 | 53 | max_tokens=max_tokens,
|
|
0 commit comments