diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7785ae3..9c28ff7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,5 +1,5 @@
 name: MLOps Pipeline (Langfuse v3)
-
+run-name: "CI Check - wandb-integration"
 on:
   push:
     branches: [ "main" ]
diff --git a/.gitignore b/.gitignore
index 0fc8cb4..5eb7bb5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,4 +29,6 @@ downloads/
 qdrant_db
 minikube-linux-amd64
 opt/
-opt
\ No newline at end of file
+opt
+wandb
+wandb/
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 623e0a9..dab8243 100644
--- a/Makefile
+++ b/Makefile
@@ -86,17 +86,20 @@ clean-all: down ## Clean everything: stop, remove images, volumes, prune
 
 # --- Kubernetes (K8s) ---
 
-k8s-deploy: ## Deploy to Kubernetes
+k8s-deploy: ## Deploy Qdrant and the RAG API to Kubernetes
+	@echo "🚀 Deploying Qdrant storage & database..."
+	kubectl apply -f k8s/qdrant-pvc.yaml
 	kubectl apply -f k8s/qdrant-statefulset.yaml
-	kubectl apply -f k8s/qdrant-service.yaml
+
+	@echo "🚀 Deploying RAG API..."
 	kubectl apply -f k8s/deployment.yaml
 	kubectl apply -f k8s/service.yaml
 
 k8s-delete: ## Delete Kubernetes resources
-	kubectl delete -f k8s/deployment.yaml || true
 	kubectl delete -f k8s/service.yaml || true
-	kubectl delete -f k8s/qdrant-service.yaml || true
+	kubectl delete -f k8s/deployment.yaml || true
 	kubectl delete -f k8s/qdrant-statefulset.yaml || true
+	kubectl delete -f k8s/qdrant-pvc.yaml || true
 
 k8s-forward: ## Port forward Kubernetes service
 	kubectl port-forward service/rag-service 8000:8000
@@ -106,6 +109,9 @@ k8s-logs: ## Tail logs for Kubernetes deployment
 
 # --- Utils ---
 
+track: ## Run the Ragas evaluation and log it to W&B
+	PYTHONPATH=. $(VENV_BIN)/python evaluation/track_experiment.py
+
 clean: ## Clean Python caches and virtual env
 	rm -rf __pycache__ .pytest_cache venv .venv
 	find . -type d -name "__pycache__" -exec rm -rf {} +
diff --git a/README.md b/README.md
index 83c259d..7599d95 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@
 ![Grafana](https://img.shields.io/badge/Grafana-F46800?style=flat&logo=grafana&logoColor=white)
 ![LangChain](https://img.shields.io/badge/Framework-LangChain-121212?style=flat&logo=chainlink)
 ![Make](https://img.shields.io/badge/Automation-Makefile-008080?style=flat&logo=gnu-make&logoColor=white)
+![WandB](https://img.shields.io/badge/Experiment_Tracking-Weights_&_Biases-gold?logo=weightsandbiases)
 
 ---
 ## 💡 TL;DR — What this is
@@ -362,15 +363,29 @@ Access Grafana at `http://localhost:3000` (admin/admin)
 
 ## 🧪 Evaluation
 
-### Ragas Metrics
+### 📊 Evaluation & Tracking
+We use **Ragas** for checking quality and **Weights & Biases** for experiment tracking.
+
+![RAG Evaluation Results using W&B](images/rag-eval-metrics-wandb.png)
+
+### Running Experiments
 
 Run evaluation pipeline:
 ```bash
 make eval
-# Or: python evaluation/evaluate.py
+# Or:
+# 1) python evaluation/track_experiment.py
+# 2) python evaluation/evaluate.py
 ```
+**Tracked Experiment (with W&B)**
+
+| Metric | Score | Description |
+|--------------------|:-----:|-------------|
+| Faithfulness | 1.00 | Zero hallucinations |
+| Context Precision | 1.00 | Perfect retrieval |
+| Answer Relevancy | N/A | Rate limited on the free tier; 0.83 when not rate limited |
 
-**Latest Results:**
+**Latest Results (evaluate.py):**
 
 | Metric | Score | Description |
 |--------------------|:-----:|-------------|
diff --git a/evaluation/track_experiment.py b/evaluation/track_experiment.py
new file mode 100644
index 0000000..4aef3a5
--- /dev/null
+++ b/evaluation/track_experiment.py
@@ -0,0 +1,240 @@
+"""Evaluate the RAG pipeline with Ragas and log the results to Weights & Biases."""
+import os
+import sys
+import time
+import logging
+import wandb
+from datasets import Dataset, Features, Sequence, Value
+from ragas import evaluate, RunConfig
+from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
+
+# 1. Silence noisy third-party libraries
+for logger_name in ["httpx", "httpcore", "groq", "openai", "qdrant_client", "sentence_transformers"]:
+    logging.getLogger(logger_name).setLevel(logging.WARNING)
+
+# Add the project root to sys.path so that `src` can be imported
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from src.rag import engine as rag_engine
+from src.config import CHUNK_SIZE, LLM_MODEL
+
+# Logger configuration
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+logger = logging.getLogger("WANDB-EVAL")
+
+class RAGWandbEvaluator:
+    """Run a small Ragas evaluation against the RAG engine and track it in W&B."""
+
+    def __init__(self):
+        # Test data that exactly matches the generated PDF
+        self.eval_data = [
+            {
+                "question": "What is the main topic of this document?",
+                "ground_truth": "The document discusses RAG architecture and MLOps integration."
+            },
+            {
+                "question": "Which embedding model is used?",
+                "ground_truth": "The system uses huggingface/all-MiniLM-L6-v2."
+            }
+        ]
+        self.metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
+        self.test_file = "test_data_autogen.pdf"
+
+    @staticmethod
+    def _write_minimal_pdf(path, lines):
+        """Write a valid single-page PDF containing `lines` using only the stdlib."""
+        def escape(text):
+            # Backslashes and parentheses are special inside PDF string literals
+            return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")
+
+        ops = ["BT", "/F1 12 Tf", "14 TL", "50 750 Td"]
+        ops += [f"({escape(line)}) Tj T*" for line in lines]
+        ops.append("ET")
+        stream = "\n".join(ops).encode("latin-1", "replace")
+
+        objects = [
+            b"<< /Type /Catalog /Pages 2 0 R >>",
+            b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
+            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+            b"/Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>",
+            b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
+            b"<< /Length %d >>\nstream\n" % len(stream) + stream + b"\nendstream",
+        ]
+
+        pdf = bytearray(b"%PDF-1.4\n")
+        offsets = []
+        for number, body in enumerate(objects, start=1):
+            # The xref table needs the byte offset of every object
+            offsets.append(len(pdf))
+            pdf += b"%d 0 obj\n" % number + body + b"\nendobj\n"
+
+        xref_pos = len(pdf)
+        pdf += b"xref\n0 %d\n" % (len(objects) + 1)
+        pdf += b"0000000000 65535 f \n"
+        for offset in offsets:
+            pdf += b"%010d 00000 n \n" % offset
+        pdf += b"trailer\n<< /Size %d /Root 1 0 R >>\n" % (len(objects) + 1)
+        pdf += b"startxref\n%d\n%%%%EOF\n" % xref_pos
+
+        with open(path, "wb") as f:
+            f.write(pdf)
+
+    def _create_dummy_pdf(self):
+        """
+        Create the test PDF programmatically.
+        This removes the need to craft an example.pdf by hand.
+        """
+        text_content = """
+        Talk to Your Docs System.
+        This document discusses RAG architecture and MLOps integration.
+        The system uses huggingface/all-MiniLM-L6-v2 embedding model for semantic search.
+        """
+        title = "RAG MLOps Test Document"
+        lines = [line.strip() for line in text_content.strip().splitlines()]
+
+        try:
+            # Prefer reportlab when it is installed
+            from reportlab.pdfgen import canvas
+        except ImportError:
+            # Fallback: hand-write a minimal but valid PDF, because plain text
+            # saved with a .pdf extension cannot be parsed by PDF loaders.
+            logger.warning("⚠️ ReportLab not found. Creating simple text-based PDF fallback.")
+            self._write_minimal_pdf(self.test_file, [title, *lines])
+            return
+
+        c = canvas.Canvas(self.test_file)
+        c.drawString(100, 750, title)
+        y = 700
+        for line in lines:
+            c.drawString(50, y, line)
+            y -= 20
+        c.save()
+        logger.info(f"✅ Generated synthetic PDF using ReportLab: {self.test_file}")
+
+    def _ensure_data_exists(self):
+        """
+        Check the vector store. If empty -> generate file -> ingest -> delete file.
+        """
+        try:
+            # Check whether the collection already holds documents
+            info = rag_engine.client.get_collection(rag_engine.vector_store.collection_name)
+        except Exception as e:
+            logger.warning(f"⚠️ Could not check DB status (might be connection error): {e}")
+            return
+
+        if info.points_count != 0:
+            logger.info("✅ Database already has data. Skipping ingestion.")
+            return
+
+        logger.warning("⚠️ Database is empty! Starting auto-ingestion...")
+        try:
+            # 1. Create the file
+            self._create_dummy_pdf()
+            if not os.path.exists(self.test_file):
+                logger.error("❌ Failed to create test file.")
+                return
+
+            # 2. Ingest it into the RAG engine
+            rag_engine.ingest_file(self.test_file)
+            logger.info(f"✅ Ingested {self.test_file} into Qdrant")
+        except Exception as e:
+            # Best effort: the evaluation can still run against whatever is stored
+            logger.error(f"❌ Auto-ingestion failed: {e}")
+        finally:
+            # 3. Clean up, even when ingestion raised
+            if os.path.exists(self.test_file):
+                os.remove(self.test_file)
+                logger.info(f"🧹 Cleaned up temporary file: {self.test_file}")
+
+    def _generate_dataset(self):
+        """Query the RAG engine for every sample and build the Ragas dataset."""
+        questions, answers, contexts, ground_truths = [], [], [], []
+
+        logger.info(f"🚀 Starting RAG inference on {len(self.eval_data)} samples...")
+
+        for i, item in enumerate(self.eval_data):
+            q = item["question"]
+            logger.info(f"[{i+1}/{len(self.eval_data)}] Processing: {q}...")
+
+            try:
+                ans, sources, _ = rag_engine.get_answer_with_sources(query=q)
+                # Extract the text of every source
+                ctx = [s['text'] for s in sources]
+            except Exception as e:
+                logger.error(f"❌ Error at sample {i+1}: {e}")
+                continue
+
+            # Append only once the whole sample is known to be good, so the
+            # four columns can never end up with different lengths.
+            questions.append(q)
+            answers.append(ans)
+            contexts.append(ctx)
+            ground_truths.append(item["ground_truth"])
+
+            # Pause for the Groq free tier
+            time.sleep(1.5)
+
+        # Strict data schema for Arrow/Ragas
+        features = Features({
+            'question': Value('string'),
+            'answer': Value('string'),
+            'contexts': Sequence(Value('string')),
+            'ground_truth': Value('string')
+        })
+
+        return Dataset.from_dict({
+            "question": questions, "answer": answers,
+            "contexts": contexts, "ground_truth": ground_truths
+        }, features=features)
+
+    def run(self):
+        """Prepare data, run inference and Ragas scoring, and log everything to W&B."""
+        # 1. Data preparation (cold start)
+        self._ensure_data_exists()
+
+        # 2. W&B initialisation
+        wandb.init(
+            project="talk-to-your-docs-rag",
+            name=f"eval-{LLM_MODEL.replace('/', '-')}-v3",
+            config={"chunk_size": CHUNK_SIZE, "llm": LLM_MODEL}
+        )
+
+        exit_code = 1
+        try:
+            # 3. Dataset generation
+            dataset = self._generate_dataset()
+            if len(dataset) == 0:
+                logger.error("❌ No samples were generated. Skipping Ragas evaluation.")
+                return
+
+            logger.info("\n📊 Calculating Ragas Metrics (LLM-as-a-Judge)...")
+            # max_workers=1 matters to stay below the API rate limits
+            results = evaluate(
+                dataset=dataset,
+                metrics=self.metrics,
+                llm=rag_engine.llm,
+                embeddings=rag_engine.embeddings,
+                run_config=RunConfig(max_workers=1, timeout=60, max_retries=2)
+            )
+
+            # 4. Log the results. wandb.log needs a plain dict, so aggregate the
+            # per-sample scores instead of passing the Ragas result object.
+            eval_df = results.to_pandas()
+            summary = {
+                metric.name: float(eval_df[metric.name].mean())
+                for metric in self.metrics
+                if metric.name in eval_df.columns
+            }
+            wandb.log(summary)
+
+            # Log the table for manual analysis
+            wandb.log({"detailed_results": wandb.Table(dataframe=eval_df)})
+
+            logger.info(f"\n✅ Evaluation Complete. Results:\n{results}")
+            exit_code = 0
+        finally:
+            # Always close the run; a non-zero code marks it as failed in W&B
+            wandb.finish(exit_code=exit_code)
+
+if __name__ == "__main__":
+    evaluator = RAGWandbEvaluator()
+    evaluator.run()
diff --git a/images/rag-eval-metrics-wandb.png b/images/rag-eval-metrics-wandb.png
new file mode 100644
index 0000000..15f092b
Binary files /dev/null and b/images/rag-eval-metrics-wandb.png differ
diff --git a/k8s/qdrant-pvc.yaml b/k8s/qdrant-pvc.yaml
index ab41137..b06a289 100644
--- a/k8s/qdrant-pvc.yaml
+++ b/k8s/qdrant-pvc.yaml
@@ -7,4 +7,4 @@ spec:
     - ReadWriteOnce
   resources:
     requests:
-      storage: 500mi
+      storage: 10Gi
diff --git a/requirements.txt b/requirements.txt
index 68594a2..947d77d 100644
Binary files a/requirements.txt and b/requirements.txt differ