diff --git a/.gitignore b/.gitignore
index 6e9b265..e646a9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,8 @@ __pycache__/
 .env.*
 .idea/
 .vscode/
-*.db
\ No newline at end of file
+*.db
+.mypy_cache/
+.cache/
+.DS_Store
+output*
diff --git a/config/config.yaml b/config/config.yaml
index e69de29..8c09ba3 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -0,0 +1,18 @@
+version: v1
+ingest_threads: 8
+collections:
+  - name: "Source Collection"
+    id: "source_collection"
+    mode: "overwrite"
+    chunk_size: 500
+    chunk_overlap: 250
+    embedding_model: "all-MiniLM-L6-v2"
+    metadata:
+      key: "value"
+    sources:
+      - type: "source"
+        url_fragment: "/"
+        recursive: true
+        attachments: true
+        metadata:
+          key: "value"
diff --git a/logs/.gitignore b/logs/.gitignore
new file mode 100644
index 0000000..16f2dc5
--- /dev/null
+++ b/logs/.gitignore
@@ -0,0 +1 @@
+*.csv
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 041375b..d35ab7e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,12 @@ Jinja2==3.1.6
 MarkupSafe==3.0.2
 slack_sdk==3.35.0
 Werkzeug==3.1.3
+pyigloo @ git+https://github.com/xkahn/pyigloo.git
+langchain_huggingface
+langchain_postgres
+langchain_community
+types-beautifulsoup4 # can be removed after testing with igloo API
+hf_xet
+tf-keras
+selenium # can be removed after testing with igloo API
+pdfminer.six
diff --git a/sample.env b/sample.env
new file mode 100644
index 0000000..f6dbdcc
--- /dev/null
+++ b/sample.env
@@ -0,0 +1,13 @@
+# Igloo
+IGLOO_API_KEY=
+IGLOO_ACCESS_KEY=
+IGLOO_USER=
+IGLOO_PASS=
+
+# PGVector
+PGVECTOR_DRIVER="psycopg2"
+PGVECTOR_USER=
+PGVECTOR_PASS=
+PGVECTOR_DATABASE_NAME=
+PGVECTOR_URI="localhost"
+PGVECTOR_PORT="5432"
diff --git a/scripts/ingest_data.py b/scripts/ingest_data.py
deleted file mode 100644
index e69de29..0000000
diff --git a/vector_store/constants.py b/vector_store/constants.py
new file mode 100644
index 0000000..20f3cd5
--- /dev/null
+++ b/vector_store/constants.py
@@ -0,0 +1,26 @@
+import os
+import pathlib
+
+import torch
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# PATHS
+DIRECTORY_PATH = pathlib.Path.cwd()
+KNOWLEDGE_REPOSITORY_PATH = DIRECTORY_PATH / "knowledge"
+SOURCE_RESPOSITORY_PATH = KNOWLEDGE_REPOSITORY_PATH / "source"
+
+# INGEST
+DEVICE = (
+    "cuda"
+    if torch.cuda.is_available()
+    else ("mps" if torch.backends.mps.is_available() else "cpu")
+)
+
+# PGVECTOR
+PGVECTOR_USER = os.environ.get("PGVECTOR_USER")
+PGVECTOR_PASS = os.environ.get("PGVECTOR_PASS")
+PGVECTOR_DATABASE_NAME = os.environ.get("PGVECTOR_DATABASE_NAME")
+PGVECTOR_HOST = os.environ.get("PGVECTOR_URI", "localhost")
+PGVECTOR_PORT = int(os.environ.get("PGVECTOR_PORT", 5432))
diff --git a/vector_store/delete_knowledge.py b/vector_store/delete_knowledge.py
new file mode 100644
index 0000000..39958fa
--- /dev/null
+++ b/vector_store/delete_knowledge.py
@@ -0,0 +1,13 @@
+import logging
+import shutil
+
+from constants import KNOWLEDGE_REPOSITORY_PATH
+
+logger = logging.getLogger(__name__)
+
+
+def delete_knowledge():
+    """Delete everything in the knowledge folder."""
+    if KNOWLEDGE_REPOSITORY_PATH.exists():
+        logger.info(f"Deleting {KNOWLEDGE_REPOSITORY_PATH}")
+        shutil.rmtree(KNOWLEDGE_REPOSITORY_PATH)
diff --git a/vector_store/ingest_data.py b/vector_store/ingest_data.py
new file mode 100644
index 0000000..ce7de0c
--- /dev/null
+++ b/vector_store/ingest_data.py
@@ -0,0 +1,122 @@
+"""Data Ingestion"""
+
+import logging
+import pathlib
+from datetime import datetime
+
+import pandas as pd
+from langchain_huggingface.embeddings import HuggingFaceEmbeddings
+from langchain_postgres import PGVector
+
+from constants import (
+    DEVICE,
+    DIRECTORY_PATH,
+    KNOWLEDGE_REPOSITORY_PATH,
+    PGVECTOR_DATABASE_NAME,
+    PGVECTOR_HOST,
+    PGVECTOR_PASS,
+    PGVECTOR_PORT,
+    PGVECTOR_USER,
+)
+from split import load_documents, split_document
+
+logger = logging.getLogger(__name__)
+
+
+def get_embedder(embedding_model_name: str) -> HuggingFaceEmbeddings:
+    """Initialize an embedder to convert text into vectors."""
+    return HuggingFaceEmbeddings(
+        model_name=embedding_model_name,
+        model_kwargs={"device": DEVICE},
+        show_progress=True,
+    )
+
+
+def ingest(
+    meta_lookup: dict[pathlib.Path, dict],
+    collection_name: str,
+    chunk_size: int,
+    chunk_overlap: int,
+    ingest_threads: int = 8,
+    embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+    mode: str = "overwrite",
+    collection_metadata: dict = {},
+):
+    """Load documents into a vectorstore."""
+    # Get documents
+    all_documents = []
+    origin_urls = {}
+    documents = load_documents(KNOWLEDGE_REPOSITORY_PATH, ingest_threads=ingest_threads)
+    for extension, document in documents:
+        # Split each document into chunks
+        document = document[0]
+        # Rename "source" to "_source" and save filename to "source"
+        source = pathlib.Path(document.metadata["source"])
+        file_name = source.stem
+        document.metadata["_source"] = document.metadata["source"]
+        document.metadata["source"] = file_name
+        chunks = split_document(
+            document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+        # Attach metadata to each chunk
+        for chunk in chunks:
+            path_metadata = meta_lookup.get(source, {})
+            chunk.metadata = chunk.metadata | path_metadata
+        # Record how many chunks were made
+        rel_path = source.relative_to(KNOWLEDGE_REPOSITORY_PATH)
+        origin = rel_path.parts[0]
+        origin_url = (origin, chunk.metadata.get("url"))
+        origin_urls[origin_url] = len(chunks)
+        all_documents.extend(chunks)
+
+    # Create embeddings
+    embedder = get_embedder(embedding_model_name)
+
+    # Build the Postgres connection string
+    connection_string = PGVector.connection_string_from_db_params(
+        driver="psycopg",
+        host=PGVECTOR_HOST,
+        port=int(PGVECTOR_PORT),
+        database=PGVECTOR_DATABASE_NAME,
+        user=PGVECTOR_USER,
+        password=PGVECTOR_PASS,
+    )
+
+    # Connect to the db
+    db = PGVector(
+        connection=connection_string,
+        embeddings=embedder,
+        collection_name=collection_name,
+        collection_metadata=collection_metadata,
+        use_jsonb=True,
+    )
+
+    # Overwrite the collection (if requested)
+    if mode == "overwrite":
+        db.delete_collection()
+        logger.info(f"Collection {collection_name} deleted")
+        db.create_collection()
+        logger.info(f"Collection {collection_name} created")
+
+    # Load the documents
+    logger.info(
+        f"Loading {len(all_documents)} embeddings to {PGVECTOR_HOST} - {PGVECTOR_DATABASE_NAME} - {collection_name}"
+    )
+
+    # Add documents to DB in batches to accomodate the large numbers of parameters
+    batch_size = 150
+    for i in range(0, len(all_documents), batch_size):
+        batch = all_documents[i:i + batch_size]
+        logger.info(f"Ingesting batch {i // batch_size + 1} of {len(batch)} documents")
+        db.add_documents(documents=batch)
+
+    logger.info(f"Successfully loaded {len(all_documents)} embeddings")
+
+    directory_source_url_chunks = [
+        list(origin_url) + [chunks] for origin_url, chunks in origin_urls.items()
+    ]
+    df = pd.DataFrame(directory_source_url_chunks, columns=["origin", "url", "chunks"])
+    filename = f"{PGVECTOR_HOST} - {collection_name} - {datetime.now()}.csv"
+    outpath = DIRECTORY_PATH / "logs" / filename
+    outpath.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(outpath, index=False)
diff --git a/vector_store/knowledge_source.py b/vector_store/knowledge_source.py
new file mode 100644
index 0000000..3753cdb
--- /dev/null
+++ b/vector_store/knowledge_source.py
@@ -0,0 +1,122 @@
+# TODO (@abhikdps): Remove this file once the Igloo API keys are acquired,
+# then rename knowlegde_source_igloo.py (sic) to knowledge_source.py.
+import pathlib
+import time
+import logging
+from typing import Any
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+from constants import SOURCE_RESPOSITORY_PATH
+
+logger = logging.getLogger(__name__)
+
+
+class SourceScraper:
+    def __init__(self, base_url: str = "https://source.redhat.com/"):
+        chrome_options = Options()
+        chrome_options.add_argument("--start-maximized")
+        self.driver = webdriver.Chrome(options=chrome_options)
+        self.base_url = base_url
+
+        self.driver.get(self.base_url)
+        print("\n Please log in manually and press ENTER here once done...")
+        input()
+        print(" Login confirmed. Proceeding with scraping.")
+
+    def fetch_all_pages(self, url_fragment: str, recursive: bool = False):
+        url = self.base_url.rstrip("/") + url_fragment
+        self.driver.get(url)
+        time.sleep(3)
+
+        soup = BeautifulSoup(self.driver.page_source, "html.parser")
+        pages = [soup]
+
+        if recursive:
+            children_links = soup.select("a[href^='/']")
+            visited = set()
+
+            for link in children_links:
+                href = link.get("href")
+                full_url = self.base_url.rstrip("/") + href
+                if href and href.startswith("/") and full_url not in visited:
+                    visited.add(full_url)
+                    try:
+                        self.driver.get(full_url)
+                        time.sleep(2)
+                        sub_soup = BeautifulSoup(self.driver.page_source, "html.parser")
+                        pages.append(sub_soup)
+                    except Exception as e:
+                        logger.warning(f"Failed to visit {full_url}: {e}")
+
+        return pages
+
+    def extract_attachments(self, soup: BeautifulSoup):
+        attachments = []
+        links = soup.select("a")
+        for link in links:
+            href = link.get("href")
+            if href and any(ext in href for ext in [".pdf", ".docx", ".xlsx"]):
+                attachments.append(href)
+        return attachments
+
+    def save_page(self, soup: BeautifulSoup, path: pathlib.Path):
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(str(soup))
+
+    def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
+        for link in attachments:
+            file_name = link.split("/")[-1]
+            full_path = base_path / file_name
+            try:
+                self.driver.get(
+                    link
+                    if link.startswith("http")
+                    else self.base_url.rstrip("/") + link
+                )
+                with open(full_path, "wb") as f:
+                    f.write(self.driver.page_source.encode("utf-8"))
+            except Exception as e:
+                logger.warning(f"Failed to download attachment {link}: {e}")
+
+    def scrape(
+        self,
+        url_fragment: str,
+        recursive: bool,
+        attachments: bool,
+        metadata: dict[str, Any],
+    ):
+        meta_lookup = {}
+        pages = self.fetch_all_pages(url_fragment, recursive)
+
+        for i, soup in enumerate(pages):
+            title = soup.title.string if soup.title else f"page_{i}"
+            safe_title = title.replace("/", "_").replace(" ", "_")[:50]
+            page_path = (
+                SOURCE_RESPOSITORY_PATH / url_fragment.strip("/") / f"{safe_title}.html"
+            )
+            page_path.parent.mkdir(parents=True, exist_ok=True)
+
+            self.save_page(soup, page_path)
+            file_metadata = metadata.copy()
+            file_metadata["url"] = self.base_url.rstrip("/") + url_fragment
+
+            if attachments:
+                attachment_links = self.extract_attachments(soup)
+                self.download_attachments(attachment_links, page_path.parent)
+
+            meta_lookup[page_path] = file_metadata
+
+        return meta_lookup
+
+
+def fetchall(
+    url_fragment: str,
+    recursive: bool = False,
+    attachments: bool = True,
+    metadata: dict = {},
+    **kwargs,
+):
+    scraper = SourceScraper()
+    return scraper.scrape(url_fragment, recursive, attachments, metadata)
diff --git a/vector_store/knowlegde_source_igloo.py b/vector_store/knowlegde_source_igloo.py
new file mode 100644
index 0000000..d16a29c
--- /dev/null
+++ b/vector_store/knowlegde_source_igloo.py
@@ -0,0 +1,174 @@
+import logging
+import os
+
+import pyigloo
+
+from constants import SOURCE_RESPOSITORY_PATH
+
+logger = logging.getLogger(__name__)
+
+
+class Igloo:
+    """Class for connecting to igloo."""
+
+    def __init__(self, endpoint: str):
+        """Initialize."""
+        self.endpoint: str = endpoint
+        # TODO: Raise an error if any of these are None
+        self.api_user: str = os.environ.get("IGLOO_USER", None)
+        self.api_pass: str = os.environ.get("IGLOO_PASS", None)
+        self.api_key: str = os.environ.get("IGLOO_API_KEY", None)
+        self.access_key: str = os.environ.get("IGLOO_ACCESS_KEY", None)
+
+        info = {
+            "ACCESS_KEY": self.access_key,
+            "API_KEY": self.api_key,
+            "API_USER": self.api_user,
+            "API_PASSWORD": self.api_pass,
+            "API_ENDPOINT": self.endpoint,
+        }
+        self.session = pyigloo.igloo(info=info)
+
+    def get_object(self, object_id: str):
+        """Get a single object."""
+        result = self.session.objects_view(objectid=object_id)
+        return result
+
+    def get_children_from_parent(
+        self,
+        parent_path: str | None = None,
+        parent_object_id: str | None = None,
+        recursive: bool = False,
+    ):
+        """Get all children from a parent url path."""
+        # Get the parent object id
+        if parent_path is None and parent_object_id is None:
+            raise ValueError("Must set one of 'parent_path' or 'parent_object_id'")
+        if parent_path is not None:
+            logger.info(f"Fetching objects under path {parent_path}")
+            response = self.session.objects_bypath(path=parent_path)
+            if response is None:
+                raise ValueError(
+                    f"Parent path {parent_path} does not exist. Please check the path and try again."
+                )
+            parent_object_id = response["id"]
+
+        # Get all the children
+        all_children = []
+        for child in self.session.get_all_children_from_object(
+            parent_object_id, pagesize=100
+        ):
+            children = [child]
+            if recursive:
+                try:
+                    child_object_id = child["id"]
+                    childs_children = self.get_children_from_parent(
+                        parent_object_id=child_object_id, recursive=True
+                    )
+                except TypeError:
+                    continue
+                children.extend(childs_children)
+            all_children.extend(children)
+
+        return all_children
+
+    def get_document_binary(self, document_id: str) -> bytes:
+        """Get the contents of a document."""
+        # Send a request to the /documents/document_id/view_binary endpoint to get file contents
+        endpoint = self.session.endpoint
+        api_root = self.session.IGLOO_API_ROOT_V1
+        url = "{0}{1}/documents/{2}/view_binary".format(endpoint, api_root, document_id)
+        headers = {b"Accept": "application/json"}
+        response = self.session.igloo.get(url=url, headers=headers)
+        return response.content
+
+    def get_attachments(self, object_id: str):
+        """Get all attachments on an object."""
+        # Get page metadata
+        page = self.get_object(object_id=object_id)
+        # List the attachments
+        page_attachments = self.session.attachments_view(objectid=object_id)
+        items = page_attachments.get("items", [])
+        # Get information about each attachment
+        attachments = []
+        for item in items:
+            document_id = item["ToId"]
+            document_metadata = self.session.objects_view(document_id)
+            document_binary = self.get_document_binary(document_id=document_id)
+            attachment = document_metadata | {
+                "contentBinary": document_binary,
+                "attachedToHref": page["href"],
+            }
+            attachments.append(attachment)
+        return attachments
+
+
+def fetchall(
+    url_fragment: str,
+    recursive: bool = False,
+    attachments: bool = True,
+    metadata: dict = {},
+    **kwargs,
+):
+    """
+    Fetch pages from the Source.
+
+    Args:
+    ----
+        url_fragment (str): URL fragment to pull all children from.
+            For example, to pull all pages under https://source.redhat.com/departments/operations/travel,
+            set url_fragment="/departments/operations/travel"
+        recursive (bool): Whether or not to recurse into child pages. Defaults to False.
+        attachments (bool): Whether or not to fetch page attachments. Defaults to True.
+        metadata (dict): Metadata to attach to each page chunk. Defaults to {}.
+        **kwargs: Additional arguments not used.
+
+    """
+    endpoint = "https://source.redhat.com/"
+
+    # Connect to Igloo
+    igloo = Igloo(endpoint=endpoint)
+
+    # Get all documents under parent path
+    fragment_documents = igloo.get_children_from_parent(
+        parent_path=url_fragment, recursive=recursive
+    )
+
+    # Fetch all attachments
+    if attachments:
+        for document in fragment_documents:
+            object_id = document["id"]
+            object_attachments = igloo.get_attachments(object_id=object_id)
+            fragment_documents += object_attachments
+
+    # Convert to files and save locally
+    meta_lookup = {}
+    for document in fragment_documents:
+        if document["isPublished"] and not document["IsArchived"]:
+            # Write the document in it's URL path locally
+            doc_href: str = document.get("attachedToHref", document["href"])
+            extension = document.get("fileExtension", ".html")
+            doc_title: str = document["title"].replace(extension, "")
+            doc_path = doc_href.lstrip("/") + "/" + doc_title + extension
+            path = SOURCE_RESPOSITORY_PATH / doc_path
+            folder_path = path.parent
+            if document["content"].strip() != "" or "contentBinary" in document:
+                if not os.path.exists(folder_path):
+                    os.makedirs(folder_path)
+                if "contentBinary" in document:
+                    with open(path, "wb") as f:
+                        f.write(document["contentBinary"])
+                else:
+                    with open(path, "w") as f:
+                        f.write(document["content"])
+
+            # Save metadata
+            used_columns = ["content", "contentBinary"]
+            file_metadata = {
+                key: value for key, value in document.items() if key not in used_columns
+            }
+            file_metadata["url"] = endpoint + doc_href.lstrip("/")
+            file_metadata = file_metadata | metadata
+            meta_lookup[path] = file_metadata
+
+    return meta_lookup
diff --git a/vector_store/split.py b/vector_store/split.py
new file mode 100644
index 0000000..3f7fc2a
--- /dev/null
+++ b/vector_store/split.py
@@ -0,0 +1,144 @@
+import logging
+import os
+import pathlib
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+
+from langchain.docstore.document import Document
+from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import (
+    CSVLoader,
+    Docx2txtLoader,
+    PDFMinerLoader,
+    TextLoader,
+    UnstructuredExcelLoader,
+    UnstructuredPowerPointLoader,
+)
+
+# Maps a file extension (with leading dot, matched case-sensitively) to the
+# way that file type is loaded and split:
+#   "loader"   - document loader class, instantiated with the file path.
+#   "language" - read by split_document():
+#                  key absent  -> the document is kept whole (not chunked),
+#                  None        -> generic RecursiveCharacterTextSplitter,
+#                  a Language  -> language-aware splitter for that syntax.
+# Files whose extension is not listed here are skipped by load_documents().
+DOCUMENT_MAP = {
+    ".txt": {
+        "loader": TextLoader,
+        "language": None,
+    },
+    ".html": {
+        "loader": TextLoader,
+        "language": Language.HTML,
+    },
+    ".md": {
+        "loader": TextLoader,
+        "language": Language.MARKDOWN,
+    },
+    ".py": {
+        "loader": TextLoader,
+        "language": Language.PYTHON,
+    },
+    ".pdf": {
+        "loader": PDFMinerLoader,
+        "language": None,
+    },
+    # No "language" key: tabular and slide formats are not chunked.
+    ".csv": {
+        "loader": CSVLoader,
+    },
+    ".xls": {
+        "loader": UnstructuredExcelLoader,
+    },
+    ".xlsx": {
+        "loader": UnstructuredExcelLoader,
+    },
+    ".docx": {
+        "loader": Docx2txtLoader,
+        "language": None,
+    },
+    ".doc": {
+        "loader": Docx2txtLoader,
+        "language": None,
+    },
+    ".pptx": {
+        "loader": UnstructuredPowerPointLoader,
+    },
+    ".ppt": {
+        "loader": UnstructuredPowerPointLoader,
+    },
+}
+
+
+def load_single_document(file_path: str) -> tuple[str, list[Document]]:
+    """Load a single document from a file path."""
+    logging.info(f"Loading {file_path}")
+    file_extension = os.path.splitext(file_path)[1]
+    ext_metadata = DOCUMENT_MAP.get(file_extension)
+    if ext_metadata:
+        loader_class = ext_metadata.get("loader")
+        loader = loader_class(file_path)
+    else:
+        raise ValueError("Document type is undefined")
+    return file_extension, loader.load()
+
+
+def load_document_batch(filepaths: list[str]):
+    """Load multiple documents in parallel."""
+    logging.info("Loading document batch")
+    # create a thread pool
+    with ThreadPoolExecutor(len(filepaths)) as exe:
+        # load files
+        futures = [exe.submit(load_single_document, name) for name in filepaths]
+        # collect data
+        data_list = [future.result() for future in futures]
+        # return data and file paths
+        return (data_list, filepaths)
+
+
+def load_documents(source_dir: pathlib.Path, ingest_threads: int) -> list[Document]:
+    """Load all documents from the source documents directory."""
+    all_files = source_dir.rglob("*")
+    paths = []
+    for file_path in all_files:
+        file_extension = os.path.splitext(file_path)[1]
+        source_file_path = os.path.join(source_dir, file_path)
+        if file_extension in DOCUMENT_MAP.keys():
+            paths.append(source_file_path)
+
+    # Have at least one worker and at most INGEST_THREADS workers
+    n_workers = min(ingest_threads, max(len(paths), 1))
+    chunksize = round(len(paths) / n_workers)
+    docs = []
+    with ProcessPoolExecutor(n_workers) as executor:
+        futures = []
+        # split the load operations into chunks
+        for i in range(0, len(paths), chunksize):
+            # select a chunk of filenames
+            filepaths = paths[i : (i + chunksize)]
+            # submit the task
+            future = executor.submit(load_document_batch, filepaths)
+            futures.append(future)
+        # process all results
+        for future in as_completed(futures):
+            # open the file and load the data
+            contents, _ = future.result()
+            docs.extend(contents)
+
+    return docs
+
+
+def split_document(
+    document: Document, file_extension: str, chunk_size: int, chunk_overlap: int
+):
+    """Split a document into chunks."""
+    ext_metadata = DOCUMENT_MAP.get(file_extension)
+    # If there is no language defined, don't chunk the text
+    if "language" not in ext_metadata:
+        chunks = [document]
+    # If there is a language defined, chunk the text according to the language
+    else:
+        language = ext_metadata["language"]
+        # If the language is None, use the basic splitter
+        if language is None:
+            splitter = RecursiveCharacterTextSplitter(
+                chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
+        # Otherwise use the specific language
+        else:
+            splitter = RecursiveCharacterTextSplitter.from_language(
+                language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
+        chunks = splitter.split_documents(documents=[document])
+    return chunks
diff --git a/vector_store/vector_store.py b/vector_store/vector_store.py
new file mode 100644
index 0000000..81d43c0
--- /dev/null
+++ b/vector_store/vector_store.py
@@ -0,0 +1,101 @@
+"""The Vector store service."""
+
+import argparse
+import logging
+import pathlib
+from typing import Any, List
+import yaml
+
+from ingest_data import ingest
+from knowledge_source import fetchall as fetch_source
+from delete_knowledge import delete_knowledge
+
+logger = logging.getLogger(__name__)
+
+
+def parse_config(path: pathlib.Path) -> dict[str, Any]:
+    """Parse the YAML configuration file."""
+    if not path.exists():
+        raise FileNotFoundError(f"Config file {path} does not exist")
+    if path.is_dir():
+        raise ValueError(f"Expected a file but got a directory: {path}")
+
+    with path.open("r", encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+
+    if not isinstance(config, dict):
+        raise ValueError(f"Invalid configuration format in {path}")
+
+    return config
+
+
+def main(args: argparse.Namespace) -> None:
+    """
+    Ingests multiple document collections into a vector store
+    using the configuration file specified in `args.config`.
+
+    Args:
+        args: Parsed arguments containing the config file path.
+    """
+    config_path = args.config
+    config: dict = parse_config(config_path)
+    ingest_threads = config.get("ingest_threads", 8)
+    collections = config.get("collections", [])
+    errors: List[Exception] = []
+    for collection in collections:
+        try:
+            name = collection.get("id")
+            mode = collection.get("mode")
+            chunk_size = collection.get("chunk_size")
+            chunk_overlap = collection.get("chunk_overlap")
+            required_values = [name, mode, chunk_size, chunk_overlap]
+            if any(value is None for value in required_values):
+                required_keys = ["name", "mode", "chunk_size", "chunk_overlap"]
+                raise ValueError(f"Missing required keys in collection {required_keys}")
+            embedding_model_name = collection.get(
+                "embedding_model", "sentence-transformers/all-MiniLM-L6-v2"
+            )
+            metadata = collection.get("metadata", {})
+            sources = collection.get("sources", [])
+            meta_lookup: dict[pathlib.Path, dict[Any, Any]] = {}
+            for source in sources:
+                source_meta_lookup = fetch_source(**source)
+            meta_lookup = meta_lookup | source_meta_lookup
+            ingest(
+                meta_lookup=meta_lookup,
+                collection_name=name,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                ingest_threads=ingest_threads,
+                embedding_model_name=embedding_model_name,
+                mode=mode,
+                collection_metadata=metadata,
+            )
+        except Exception as e:
+            logger.error(
+                "Failed to ingest collection %s: %s", collection.get("id", "unknown"), e
+            )
+            errors.append(e)
+        finally:
+            delete_knowledge()
+
+    if errors:
+        error_messages = "\n".join(str(e) for e in errors)
+        raise RuntimeError(
+            f"Ingest failed for {len(errors)} collection(s):\n{error_messages}"
+        )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config",
+        type=pathlib.Path,
+        help="Path to config file.",
+        default=pathlib.Path("./config/config.yaml"),
+    )
+    main(parser.parse_args())