agentic-chatbot/vector_store/knowledge_source.py at 411f21519a6164a7d9f256e85148b16c4e0c8a0f · Ruchip16/agentic-chatbot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# TODO (@abhikdps): Remove this file once the Igloo API keys
# are aquired and rename the knowledge_source_igloo.py file to knowledge_source.py
import pathlib
import time
import logging
from typing import Any
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from constants import SOURCE_RESPOSITORY_PATH

logger = logging.getLogger(__name__)


class SourceScraper:
    def __init__(self, base_url: str = "https://source.redhat.com/"):
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.base_url = base_url

        self.driver.get(self.base_url)
        print("\n Please log in manually and press ENTER here once done...")
        input()
        print(" Login confirmed. Proceeding with scraping.")

    def fetch_all_pages(self, url_fragment: str, recursive: bool = False):
        url = self.base_url.rstrip("/") + url_fragment
        self.driver.get(url)
        time.sleep(3)

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        pages = [soup]

        if recursive:
            children_links = soup.select("a[href^='/']")
            visited = set()

            for link in children_links:
                href = link.get("href")
                full_url = self.base_url.rstrip("/") + href
                if href and href.startswith("/") and full_url not in visited:
                    visited.add(full_url)
                    try:
                        self.driver.get(full_url)
                        time.sleep(2)
                        sub_soup = BeautifulSoup(self.driver.page_source, "html.parser")
                        pages.append(sub_soup)
                    except Exception as e:
                        logger.warning(f"Failed to visit {full_url}: {e}")

        return pages

    def extract_attachments(self, soup: BeautifulSoup):
        attachments = []
        links = soup.select("a")
        for link in links:
            href = link.get("href")
            if href and any(ext in href for ext in [".pdf", ".docx", ".xlsx"]):
                attachments.append(href)
        return attachments

    def save_page(self, soup: BeautifulSoup, path: pathlib.Path):
        with open(path, "w", encoding="utf-8") as f:
            f.write(str(soup))

    def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
        for link in attachments:
            file_name = link.split("/")[-1]
            full_path = base_path / file_name
            try:
                self.driver.get(
                    link
                    if link.startswith("http")
                    else self.base_url.rstrip("/") + link
                )
                with open(full_path, "wb") as f:
                    f.write(self.driver.page_source.encode("utf-8"))
            except Exception as e:
                logger.warning(f"Failed to download attachment {link}: {e}")

    def scrape(
        self,
        url_fragment: str,
        recursive: bool,
        attachments: bool,
        metadata: dict[str, Any],
    ):
        meta_lookup = {}
        pages = self.fetch_all_pages(url_fragment, recursive)

        for i, soup in enumerate(pages):
            title = soup.title.string if soup.title else f"page_{i}"
            safe_title = title.replace("/", "_").replace(" ", "_")[:50]
            page_path = (
                SOURCE_RESPOSITORY_PATH / url_fragment.strip("/") / f"{safe_title}.html"
            )
            page_path.parent.mkdir(parents=True, exist_ok=True)

            self.save_page(soup, page_path)
            file_metadata = metadata.copy()
            file_metadata["url"] = self.base_url.rstrip("/") + url_fragment

            if attachments:
                attachment_links = self.extract_attachments(soup)
                self.download_attachments(attachment_links, page_path.parent)

            meta_lookup[page_path] = file_metadata

        return meta_lookup


def fetchall(
    url_fragment: str,
    recursive: bool = False,
    attachments: bool = True,
    metadata: dict = {},
    **kwargs,
):
    scraper = SourceScraper()
    return scraper.scrape(url_fragment, recursive, attachments, metadata)