Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,8 @@ __pycache__/
.env.*
.idea/
.vscode/
*.db
*.db
.mypy_cache/
.cache/
.DS_Store
output*
18 changes: 18 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
version: v1
ingest_threads: 8
collections:
- name: "Source Collection"
id: "source_collection"
mode: "overwrite"
chunk_size: 500
chunk_overlap: 250
embedding_model: "all-MiniLM-L6-v2"
metadata:
key: "value"
sources:
- type: "source"
url_fragment: "/"
recursive: true
attachments: true
metadata:
key: "value"
1 change: 1 addition & 0 deletions logs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.csv
9 changes: 9 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,12 @@ Jinja2==3.1.6
MarkupSafe==3.0.2
slack_sdk==3.35.0
Werkzeug==3.1.3
pyigloo @ git+https://github.com/xkahn/pyigloo.git
langchain_huggingface
langchain_postgres
langchain_community
types-beautifulsoup4 # can be removed after testing with igloo API
hf_xet
tf-keras
selenium # can be removed after testing with igloo API
pdfminer.six
13 changes: 13 additions & 0 deletions sample.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Igloo
IGLOO_API_KEY=
IGLOO_ACCESS_KEY=
IGLOO_USER=
IGLOO_PASS=

# PGVector
PGVECTOR_DRIVER="psycopg2"
PGVECTOR_USER=
PGVECTOR_PASS=
PGVECTOR_DATABASE_NAME=
PGVECTOR_URI="localhost"
PGVECTOR_PORT="5432"
Empty file removed scripts/ingest_data.py
Empty file.
26 changes: 26 additions & 0 deletions vector_store/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import pathlib

import torch
from dotenv import load_dotenv

# Load PGVECTOR_* / IGLOO_* settings from a local .env file into os.environ.
load_dotenv()

# PATHS
# Resolved relative to the process working directory, so scripts must be
# launched from the repository root for these paths to be correct.
DIRECTORY_PATH = pathlib.Path.cwd()
KNOWLEDGE_REPOSITORY_PATH = DIRECTORY_PATH / "knowledge"
# NOTE(review): "RESPOSITORY" is a typo, but this name is imported by other
# modules (e.g. knowledge_source.py) -- rename in a coordinated change only.
SOURCE_RESPOSITORY_PATH = KNOWLEDGE_REPOSITORY_PATH / "source"

# INGEST
# Device used for embedding: prefer CUDA, then Apple MPS, else CPU.
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else ("mps" if torch.backends.mps.is_available() else "cpu")
)

# PGVECTOR
# Connection settings for the Postgres/pgvector store.
# Unset variables come back as None; host and port have defaults.
PGVECTOR_USER = os.environ.get("PGVECTOR_USER")
PGVECTOR_PASS = os.environ.get("PGVECTOR_PASS")
PGVECTOR_DATABASE_NAME = os.environ.get("PGVECTOR_DATABASE_NAME")
PGVECTOR_HOST = os.environ.get("PGVECTOR_URI", "localhost")
PGVECTOR_PORT = int(os.environ.get("PGVECTOR_PORT", 5432))
13 changes: 13 additions & 0 deletions vector_store/delete_knowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import logging
import shutil

from constants import KNOWLEDGE_REPOSITORY_PATH

logger = logging.getLogger(__name__)


def delete_knowledge():
    """Delete everything in the knowledge folder.

    A missing folder is a no-op, so repeated calls are safe (idempotent).
    """
    if KNOWLEDGE_REPOSITORY_PATH.exists():
        # Lazy %-style args: the message is only formatted if the record
        # is actually emitted (standard logging best practice).
        logger.info("Deleting %s", KNOWLEDGE_REPOSITORY_PATH)
        shutil.rmtree(KNOWLEDGE_REPOSITORY_PATH)
122 changes: 122 additions & 0 deletions vector_store/ingest_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Data Ingestion"""

import logging
import pathlib
from datetime import datetime

import pandas as pd
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_postgres import PGVector

from constants import (
DEVICE,
DIRECTORY_PATH,
KNOWLEDGE_REPOSITORY_PATH,
PGVECTOR_DATABASE_NAME,
PGVECTOR_HOST,
PGVECTOR_PASS,
PGVECTOR_PORT,
PGVECTOR_USER,
)
from split import load_documents, split_document

logger = logging.getLogger(__name__)


def get_embedder(embedding_model_name: str) -> HuggingFaceEmbeddings:
    """Build a HuggingFace embedder that encodes text on the detected device."""
    # DEVICE is chosen once at import time in constants.py (cuda/mps/cpu).
    model_kwargs = {"device": DEVICE}
    embedder = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs=model_kwargs,
        show_progress=True,
    )
    return embedder


def ingest(
    meta_lookup: dict[pathlib.Path, dict],
    collection_name: str,
    chunk_size: int,
    chunk_overlap: int,
    ingest_threads: int = 8,
    embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    mode: str = "overwrite",
    collection_metadata: dict | None = None,
):
    """Split documents, embed them, and load them into a PGVector collection.

    Args:
        meta_lookup: Maps a document's original path to metadata that is
            attached to every chunk produced from that document.
        collection_name: Target PGVector collection.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.
        ingest_threads: Worker threads used while loading documents.
        embedding_model_name: HuggingFace sentence-embedding model name.
        mode: "overwrite" drops and recreates the collection before loading.
        collection_metadata: Optional metadata stored on the collection.

    Side effects: writes a per-run CSV report of chunk counts to logs/.
    """
    # None default instead of {} avoids the shared-mutable-default pitfall.
    if collection_metadata is None:
        collection_metadata = {}

    # Get documents
    all_documents = []
    origin_urls = {}
    documents = load_documents(KNOWLEDGE_REPOSITORY_PATH, ingest_threads=ingest_threads)
    for extension, document in documents:
        # Each loader yields a one-element list; unwrap it.
        document = document[0]
        # Rename "source" to "_source" and save filename to "source"
        source = pathlib.Path(document.metadata["source"])
        file_name = source.stem
        document.metadata["_source"] = document.metadata["source"]
        document.metadata["source"] = file_name
        # Split each document into chunks
        chunks = split_document(
            document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        if chunks:
            # These values are per-document, so compute them once instead
            # of once per chunk (they were loop-invariant in the inner loop).
            path_metadata = meta_lookup.get(source, {})
            # First path component under the knowledge repo names the origin.
            rel_path = source.relative_to(KNOWLEDGE_REPOSITORY_PATH)
            origin = rel_path.parts[0]
            # Attach metadata to each chunk
            for chunk in chunks:
                chunk.metadata = chunk.metadata | path_metadata
                # Record how many chunks were made per (origin, url) pair
                origin_url = (origin, chunk.metadata.get("url"))
                origin_urls[origin_url] = len(chunks)
            all_documents.extend(chunks)

    # Create embeddings
    embedder = get_embedder(embedding_model_name)

    # Build the Postgres connection string
    # NOTE(review): the driver is hard-coded to "psycopg" while sample.env
    # declares PGVECTOR_DRIVER="psycopg2" -- confirm which driver is installed.
    connection_string = PGVector.connection_string_from_db_params(
        driver="psycopg",
        host=PGVECTOR_HOST,
        port=int(PGVECTOR_PORT),
        database=PGVECTOR_DATABASE_NAME,
        user=PGVECTOR_USER,
        password=PGVECTOR_PASS,
    )

    # Connect to the db
    db = PGVector(
        connection=connection_string,
        embeddings=embedder,
        collection_name=collection_name,
        collection_metadata=collection_metadata,
        use_jsonb=True,
    )

    # Overwrite the collection (if requested)
    if mode == "overwrite":
        db.delete_collection()
        logger.info(f"Collection {collection_name} deleted")
        db.create_collection()
        logger.info(f"Collection {collection_name} created")

    # Load the documents
    logger.info(
        f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}"
    )

    # Add documents to DB in batches to accommodate the large number of parameters
    batch_size = 150
    for i in range(0, len(all_documents), batch_size):
        batch = all_documents[i:i + batch_size]
        logger.info(f"Ingesting batch {i // batch_size + 1} of {len(batch)} documents")
        db.add_documents(documents=batch)

    logger.info(f"Successfully loaded {len(all_documents)} embeddings")

    # Write a per-run CSV report: one row per (origin, url) with its chunk count.
    directory_source_url_chunks = [
        list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()
    ]
    df = pd.DataFrame(directory_source_url_chunks, columns=["origin", "url", "chunks"])
    filename = f"{PGVECTOR_HOST} - {collection_name} - {datetime.now()}.csv"
    outpath = DIRECTORY_PATH / "logs" / filename
    outpath.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(outpath, index=False)
122 changes: 122 additions & 0 deletions vector_store/knowledge_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# TODO (@abhikdps): Remove this file once the Igloo API keys
# are acquired, and rename knowledge_source_igloo.py to knowledge_source.py
import pathlib
import time
import logging
from typing import Any
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from constants import SOURCE_RESPOSITORY_PATH

logger = logging.getLogger(__name__)


class SourceScraper:
    """Selenium-driven scraper that saves Source pages and attachments locally.

    Login is interactive: a visible Chrome window opens and the user must
    authenticate manually before any scraping starts.
    """

    def __init__(self, base_url: str = "https://source.redhat.com/"):
        """Open a Chrome session on *base_url* and block until manual login."""
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.base_url = base_url

        self.driver.get(self.base_url)
        print("\n Please log in manually and press ENTER here once done...")
        input()
        print(" Login confirmed. Proceeding with scraping.")

    def fetch_all_pages(self, url_fragment: str, recursive: bool = False):
        """Return parsed BeautifulSoup pages for *url_fragment*.

        With recursive=True, also visits each same-site link ("/...") found
        on the first page -- one level deep only, each URL at most once.
        """
        url = self.base_url.rstrip("/") + url_fragment
        self.driver.get(url)
        time.sleep(3)  # crude wait for client-side rendering to settle

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        pages = [soup]

        if recursive:
            children_links = soup.select("a[href^='/']")
            visited = set()

            for link in children_links:
                href = link.get("href")
                # Validate href BEFORE building the URL: link.get() can return
                # None, and concatenating None to a str raises TypeError.
                if not href or not href.startswith("/"):
                    continue
                full_url = self.base_url.rstrip("/") + href
                if full_url in visited:
                    continue
                visited.add(full_url)
                try:
                    self.driver.get(full_url)
                    time.sleep(2)
                    sub_soup = BeautifulSoup(self.driver.page_source, "html.parser")
                    pages.append(sub_soup)
                except Exception as e:
                    logger.warning(f"Failed to visit {full_url}: {e}")

        return pages

    def extract_attachments(self, soup: BeautifulSoup):
        """Return hrefs on the page that look like .pdf/.docx/.xlsx files."""
        attachments = []
        links = soup.select("a")
        for link in links:
            href = link.get("href")
            if href and any(ext in href for ext in [".pdf", ".docx", ".xlsx"]):
                attachments.append(href)
        return attachments

    def save_page(self, soup: BeautifulSoup, path: pathlib.Path):
        """Write the page's serialized HTML to *path* as UTF-8."""
        with open(path, "w", encoding="utf-8") as f:
            f.write(str(soup))

    def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
        """Save each attachment link under *base_path*, named by its last path part.

        NOTE(review): this writes driver.page_source, i.e. the HTML the browser
        renders for the URL -- for binary files (PDF/DOCX/XLSX) that is unlikely
        to be the real file content; confirm before relying on the output.
        """
        for link in attachments:
            file_name = link.split("/")[-1]
            full_path = base_path / file_name
            try:
                self.driver.get(
                    link
                    if link.startswith("http")
                    else self.base_url.rstrip("/") + link
                )
                with open(full_path, "wb") as f:
                    f.write(self.driver.page_source.encode("utf-8"))
            except Exception as e:
                logger.warning(f"Failed to download attachment {link}: {e}")

    def scrape(
        self,
        url_fragment: str,
        recursive: bool,
        attachments: bool,
        metadata: dict[str, Any],
    ):
        """Fetch, save, and annotate pages; return {saved page path: metadata}."""
        meta_lookup = {}
        pages = self.fetch_all_pages(url_fragment, recursive)

        for i, soup in enumerate(pages):
            # Derive a filesystem-safe name from the page title. Guard both
            # missing <title> AND an empty <title> whose .string is None
            # (the latter would crash the .replace() calls below).
            title = soup.title.string if soup.title and soup.title.string else f"page_{i}"
            safe_title = title.replace("/", "_").replace(" ", "_")[:50]
            page_path = (
                SOURCE_RESPOSITORY_PATH / url_fragment.strip("/") / f"{safe_title}.html"
            )
            page_path.parent.mkdir(parents=True, exist_ok=True)

            self.save_page(soup, page_path)
            # Copy so each saved page gets its own independent metadata dict.
            file_metadata = metadata.copy()
            file_metadata["url"] = self.base_url.rstrip("/") + url_fragment

            if attachments:
                attachment_links = self.extract_attachments(soup)
                self.download_attachments(attachment_links, page_path.parent)

            meta_lookup[page_path] = file_metadata

        return meta_lookup


def fetchall(
    url_fragment: str,
    recursive: bool = False,
    attachments: bool = True,
    metadata: dict | None = None,
    **kwargs,
):
    """Scrape one Source section and return {saved page path: metadata}.

    Args:
        url_fragment: Path fragment appended to the base URL (e.g. "/").
        recursive: Also visit same-site links found on the first page.
        attachments: Download linked .pdf/.docx/.xlsx files next to the page.
        metadata: Base metadata copied onto every saved page's record.
        **kwargs: Ignored; lets config entries carry extra keys harmlessly.
    """
    # None default instead of {} avoids the shared-mutable-default pitfall.
    scraper = SourceScraper()
    return scraper.scrape(
        url_fragment, recursive, attachments, metadata if metadata is not None else {}
    )
Loading