diff --git a/.gitignore b/.gitignore index 6e9b265..e646a9a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,8 @@ __pycache__/ .env.* .idea/ .vscode/ -*.db \ No newline at end of file +*.db +.mypy_cache/ +.cache/ +.DS_Store +output* diff --git a/config/config.yaml b/config/config.yaml index e69de29..8c09ba3 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -0,0 +1,18 @@ +version: v1 +ingest_threads: 8 +collections: + - name: "Source Collection" + id: "source_collection" + mode: "overwrite" + chunk_size: 500 + chunk_overlap: 250 + embedding_model: "all-MiniLM-L6-v2" + metadata: + key: "value" + sources: + - type: "source" + url_fragment: "/" + recursive: true + attachments: true + metadata: + key: "value" diff --git a/logs/.gitignore b/logs/.gitignore new file mode 100644 index 0000000..16f2dc5 --- /dev/null +++ b/logs/.gitignore @@ -0,0 +1 @@ +*.csv \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 041375b..d35ab7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,12 @@ Jinja2==3.1.6 MarkupSafe==3.0.2 slack_sdk==3.35.0 Werkzeug==3.1.3 +pyigloo @ git+https://github.com/xkahn/pyigloo.git +langchain_huggingface +langchain_postgres +langchain_community +types-beautifulsoup4 # can be removed after testing with igloo API +hf_xet +tf-keras +selenium # can be removed after testing with igloo API +pdfminer.six diff --git a/sample.env b/sample.env new file mode 100644 index 0000000..f6dbdcc --- /dev/null +++ b/sample.env @@ -0,0 +1,13 @@ +# Igloo +IGLOO_API_KEY= +IGLOO_ACCESS_KEY= +IGLOO_USER= +IGLOO_PASS= + +# PGVector +PGVECTOR_DRIVER="psycopg2" +PGVECTOR_USER= +PGVECTOR_PASS= +PGVECTOR_DATABASE_NAME= +PGVECTOR_URI="localhost" +PGVECTOR_PORT="5432" diff --git a/scripts/ingest_data.py b/scripts/ingest_data.py deleted file mode 100644 index e69de29..0000000 diff --git a/vector_store/constants.py b/vector_store/constants.py new file mode 100644 index 0000000..20f3cd5 --- /dev/null +++ b/vector_store/constants.py @@
# ---- vector_store/constants.py ----
# Pull variables from a local .env file into the process environment before
# any of the settings below are read.
load_dotenv()

# PATHS
# Everything is resolved relative to the directory the process was started in.
DIRECTORY_PATH = pathlib.Path.cwd()
KNOWLEDGE_REPOSITORY_PATH = DIRECTORY_PATH.joinpath("knowledge")
SOURCE_RESPOSITORY_PATH = KNOWLEDGE_REPOSITORY_PATH.joinpath("source")

# INGEST
# Torch device for the embedding model: CUDA first, then MPS, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

# PGVECTOR
# Connection settings; host and port fall back to a local default.
PGVECTOR_USER = os.getenv("PGVECTOR_USER")
PGVECTOR_PASS = os.getenv("PGVECTOR_PASS")
PGVECTOR_DATABASE_NAME = os.getenv("PGVECTOR_DATABASE_NAME")
PGVECTOR_HOST = os.getenv("PGVECTOR_URI", "localhost")
PGVECTOR_PORT = int(os.getenv("PGVECTOR_PORT", 5432))


# ---- vector_store/delete_knowledge.py ----
logger = logging.getLogger(__name__)


def delete_knowledge():
    """Remove the knowledge folder and all of its contents, if it exists."""
    if not KNOWLEDGE_REPOSITORY_PATH.exists():
        # Nothing was downloaded (or it was already cleaned up).
        return
    logger.info(f"Deleting {KNOWLEDGE_REPOSITORY_PATH}")
    shutil.rmtree(KNOWLEDGE_REPOSITORY_PATH)


# ---- vector_store/ingest_data.py ----
def get_embedder(embedding_model_name: str) -> HuggingFaceEmbeddings:
    """Build the HuggingFace embedder that turns text into vectors.

    Args:
    ----
        embedding_model_name (str): Name of the embedding model to load.

    Returns:
    -------
        HuggingFaceEmbeddings: Embedder placed on ``DEVICE`` with progress
        reporting switched on.

    """
    embedder_options = {
        "model_name": embedding_model_name,
        "model_kwargs": {"device": DEVICE},
        "show_progress": True,
    }
    return HuggingFaceEmbeddings(**embedder_options)
"""Initialize an embedder to convert text into vectors.""" + return HuggingFaceEmbeddings( + model_name=embedding_model_name, + model_kwargs={"device": DEVICE}, + show_progress=True, + ) + + +def ingest( + meta_lookup: dict[pathlib.Path, dict], + collection_name: str, + chunk_size: int, + chunk_overlap: int, + ingest_threads: int = 8, + embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2", + mode: str = "overwrite", + collection_metadata: dict = {}, +): + """Load documents into a vectorstore.""" + # Get documents + all_documents = [] + origin_urls = {} + documents = load_documents(KNOWLEDGE_REPOSITORY_PATH, ingest_threads=ingest_threads) + for extension, document in documents: + # Split each document into chunks + document = document[0] + # Rename "source" to "_source" and save filename to "source" + source = pathlib.Path(document.metadata["source"]) + file_name = source.stem + document.metadata["_source"] = document.metadata["source"] + document.metadata["source"] = file_name + chunks = split_document( + document, extension, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + # Attach metadata to each chunk + for chunk in chunks: + path_metadata = meta_lookup.get(source, {}) + chunk.metadata = chunk.metadata | path_metadata + # Record how many chunks were made + rel_path = source.relative_to(KNOWLEDGE_REPOSITORY_PATH) + origin = rel_path.parts[0] + origin_url = (origin, chunk.metadata.get("url")) + origin_urls[origin_url] = len(chunks) + all_documents.extend(chunks) + + # Create embeddings + embedder = get_embedder(embedding_model_name) + + # Build the Postgres connection string + connection_string = PGVector.connection_string_from_db_params( + driver="psycopg", + host=PGVECTOR_HOST, + port=int(PGVECTOR_PORT), + database=PGVECTOR_DATABASE_NAME, + user=PGVECTOR_USER, + password=PGVECTOR_PASS, + ) + + # Connect to the db + db = PGVector( + connection=connection_string, + embeddings=embedder, + collection_name=collection_name, + 
# TODO (@abhikdps): Remove this file once the Igloo API keys
# are acquired and rename the knowledge_source_igloo.py file to knowledge_source.py
logger = logging.getLogger(__name__)


class SourceScraper:
    """Scrape pages through a Selenium-driven Chrome session.

    Constructing the scraper opens Chrome on ``base_url`` and blocks on
    ``input()`` until the operator confirms a manual login.
    """

    # File extensions that mark a link as a downloadable attachment.
    ATTACHMENT_EXTENSIONS = (".pdf", ".docx", ".xlsx")

    def __init__(self, base_url: str = "https://source.redhat.com/"):
        """Start Chrome, open ``base_url`` and wait for a manual login."""
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.base_url = base_url

        self.driver.get(self.base_url)
        print("\n Please log in manually and press ENTER here once done...")
        input()
        print(" Login confirmed. Proceeding with scraping.")

    @staticmethod
    def _url_path(href: str) -> str:
        """Return ``href`` without its query string and fragment."""
        return href.split("#", 1)[0].split("?", 1)[0]

    def _fetch_pages_with_urls(self, url_fragment: str, recursive: bool = False):
        """Fetch the start page and, if requested, its links as (url, soup) pairs."""
        root = self.base_url.rstrip("/")
        start_url = root + url_fragment
        self.driver.get(start_url)
        # Fixed pause after navigation (presumably to let the page render).
        time.sleep(3)

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        pages = [(start_url, soup)]
        if not recursive:
            return pages

        # Seed with the start URL so a self-link is not fetched a second time.
        visited = {start_url}
        for link in soup.select("a[href^='/']"):
            href = link.get("href")
            # Validate href before building a URL from it.
            if not href or not href.startswith("/"):
                continue
            full_url = root + href
            if full_url in visited:
                continue
            visited.add(full_url)
            try:
                self.driver.get(full_url)
                time.sleep(2)
                sub_soup = BeautifulSoup(self.driver.page_source, "html.parser")
                pages.append((full_url, sub_soup))
            except Exception as e:
                # Best effort: one unreachable link must not abort the crawl.
                logger.warning(f"Failed to visit {full_url}: {e}")
        return pages

    def fetch_all_pages(self, url_fragment: str, recursive: bool = False):
        """Return the parsed start page, plus every site-relative link on it.

        Args:
            url_fragment: Path appended to ``base_url``.
            recursive: When True, also fetch each ``a[href^='/']`` link found
                on the start page (one level deep).

        Returns:
            List of parsed pages, start page first.
        """
        return [soup for _, soup in self._fetch_pages_with_urls(url_fragment, recursive)]

    def extract_attachments(self, soup: "BeautifulSoup"):
        """Return the hrefs on ``soup`` whose path ends in an attachment extension.

        The extension is matched case-insensitively against the URL path only
        (query string and fragment ignored), so ``/notes.pdf.html`` is not
        mistaken for a PDF.
        """
        attachments = []
        for link in soup.select("a"):
            href = link.get("href")
            if not href:
                continue
            if self._url_path(href).lower().endswith(self.ATTACHMENT_EXTENSIONS):
                attachments.append(href)
        return attachments

    def save_page(self, soup: "BeautifulSoup", path: pathlib.Path):
        """Write the page markup to ``path`` as UTF-8 text."""
        with open(path, "w", encoding="utf-8") as f:
            f.write(str(soup))

    def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
        """Visit each attachment link and store the result under ``base_path``.

        Failures are logged and skipped so one bad link does not stop the rest.
        """
        root = self.base_url.rstrip("/")
        for link in attachments:
            # Name the file after the URL path so query strings do not leak
            # into the filename.
            file_name = self._url_path(link).split("/")[-1]
            full_path = base_path / file_name
            try:
                self.driver.get(link if link.startswith("http") else root + link)
                # NOTE(review): this stores the driver's page source, not the
                # raw response bytes, so binary attachments are presumably not
                # saved byte-for-byte -- confirm once the Igloo API replaces
                # this scraper.
                with open(full_path, "wb") as f:
                    f.write(self.driver.page_source.encode("utf-8"))
            except Exception as e:
                logger.warning(f"Failed to download attachment {link}: {e}")

    def scrape(
        self,
        url_fragment: str,
        recursive: bool,
        attachments: bool,
        metadata: dict[str, Any],
    ):
        """Save the pages under ``url_fragment`` locally and describe them.

        Args:
            url_fragment: Path appended to ``base_url``.
            recursive: Whether linked pages are fetched as well.
            attachments: Whether attachments linked from each page are saved.
            metadata: Metadata copied into every page's entry.

        Returns:
            Dict mapping each saved page path to its metadata; ``"url"`` holds
            the URL the page was fetched from.
        """
        meta_lookup = {}
        folder = SOURCE_RESPOSITORY_PATH / url_fragment.strip("/")
        pages = self._fetch_pages_with_urls(url_fragment, recursive)

        for i, (page_url, soup) in enumerate(pages):
            # soup.title.string is None for an empty or nested <title>.
            raw_title = soup.title.string if soup.title else None
            title = raw_title or f"page_{i}"
            safe_title = title.replace("/", "_").replace(" ", "_")[:50]
            page_path = folder / f"{safe_title}.html"
            if page_path in meta_lookup:
                # Two pages share a title: keep both instead of overwriting.
                page_path = folder / f"{safe_title}_{i}.html"
            page_path.parent.mkdir(parents=True, exist_ok=True)

            self.save_page(soup, page_path)
            file_metadata = metadata.copy()
            file_metadata["url"] = page_url

            if attachments:
                attachment_links = self.extract_attachments(soup)
                self.download_attachments(attachment_links, page_path.parent)

            meta_lookup[page_path] = file_metadata

        return meta_lookup
def fetchall(
    url_fragment: str,
    recursive: bool = False,
    attachments: bool = True,
    metadata: dict | None = None,
    **kwargs,
):
    """Scrape the pages under ``url_fragment`` with a browser session.

    Args:
    ----
        url_fragment (str): URL fragment to scrape, e.g. "/departments".
        recursive (bool): Whether to follow links on the page. Defaults to False.
        attachments (bool): Whether to download attachments. Defaults to True.
        metadata (dict | None): Metadata attached to every page. Defaults to
            an empty dict.
        **kwargs: Additional arguments not used.

    Returns:
    -------
        The mapping returned by ``SourceScraper.scrape`` (saved path -> metadata).

    """
    # Avoid a shared mutable default argument.
    page_metadata = {} if metadata is None else metadata
    scraper = SourceScraper()
    try:
        return scraper.scrape(url_fragment, recursive, attachments, page_metadata)
    finally:
        # Always shut the browser down, even if scraping fails, so that
        # repeated calls do not leak Chrome processes.
        scraper.driver.quit()
pyigloo.igloo(info=info) + + def get_object(self, object_id: str): + """Get a single object.""" + result = self.session.objects_view(objectid=object_id) + return result + + def get_children_from_parent( + self, + parent_path: str | None = None, + parent_object_id: str | None = None, + recursive: bool = False, + ): + """Get all children from a parent url path.""" + # Get the parent object id + if parent_path is None and parent_object_id is None: + raise ValueError("Must set one of 'parent_path' or 'parent_object_id'") + if parent_path is not None: + logger.info(f"Fetching objects under path {parent_path}") + response = self.session.objects_bypath(path=parent_path) + if response is None: + raise ValueError( + f"Parent path {parent_path} does not exist. Please check the path and try again." + ) + parent_object_id = response["id"] + + # Get all the children + all_children = [] + for child in self.session.get_all_children_from_object( + parent_object_id, pagesize=100 + ): + children = [child] + if recursive: + try: + child_object_id = child["id"] + childs_children = self.get_children_from_parent( + parent_object_id=child_object_id, recursive=True + ) + except TypeError: + continue + children.extend(childs_children) + all_children.extend(children) + + return all_children + + def get_document_binary(self, document_id: str) -> bytes: + """Get the contents of a document.""" + # Send a request to the /documents/document_id/view_binary endpoint to get file contents + endpoint = self.session.endpoint + api_root = self.session.IGLOO_API_ROOT_V1 + url = "{0}{1}/documents/{2}/view_binary".format(endpoint, api_root, document_id) + headers = {b"Accept": "application/json"} + response = self.session.igloo.get(url=url, headers=headers) + return response.content + + def get_attachments(self, object_id: str): + """Get all attachments on an object.""" + # Get page metadata + page = self.get_object(object_id=object_id) + # List the attachments + page_attachments = 
def fetchall(
    url_fragment: str,
    recursive: bool = False,
    attachments: bool = True,
    metadata: dict | None = None,
    **kwargs,
):
    """
    Fetch pages from the Source.

    Args:
    ----
        url_fragment (str): URL fragment to pull all children from.
            For example, to pull all pages under https://source.redhat.com/departments/operations/travel,
            set url_fragment="/departments/operations/travel"
        recursive (bool): Whether or not to recurse into child pages. Defaults to False.
        attachments (bool): Whether or not to fetch page attachments. Defaults to True.
        metadata (dict | None): Metadata to attach to each page chunk. Defaults to an empty dict.
        **kwargs: Additional arguments not used.

    Returns:
    -------
        dict: Maps the local path of every published, non-archived document
        to its metadata (document fields except the content, plus "url" and
        the caller-supplied metadata).

    """
    endpoint = "https://source.redhat.com/"
    # Avoid a shared mutable default argument.
    extra_metadata = {} if metadata is None else metadata

    # Connect to Igloo
    igloo = Igloo(endpoint=endpoint)

    # Get all documents under parent path
    fragment_documents = igloo.get_children_from_parent(
        parent_path=url_fragment, recursive=recursive
    )

    # Fetch all attachments
    if attachments:
        # Collect into a separate list: extending the list being iterated
        # would make the loop also visit the attachments it just appended.
        attachment_documents = []
        for document in fragment_documents:
            attachment_documents.extend(
                igloo.get_attachments(object_id=document["id"])
            )
        fragment_documents = fragment_documents + attachment_documents

    # Convert to files and save locally
    meta_lookup = {}
    for document in fragment_documents:
        if not document["isPublished"] or document["IsArchived"]:
            continue

        # Write the document in its URL path locally
        doc_href: str = document.get("attachedToHref", document["href"])
        extension = document.get("fileExtension", ".html")
        # Strip the extension only from the end of the title, not wherever
        # the same characters happen to occur.
        doc_title: str = document["title"].removesuffix(extension)
        doc_path = doc_href.lstrip("/") + "/" + doc_title + extension
        path = SOURCE_RESPOSITORY_PATH / doc_path

        has_binary = "contentBinary" in document
        # "content" may be absent or None; treat both as empty text.
        content = document.get("content") or ""
        if has_binary or content.strip() != "":
            path.parent.mkdir(parents=True, exist_ok=True)
            if has_binary:
                with open(path, "wb") as f:
                    f.write(document["contentBinary"])
            else:
                # Explicit encoding so the result does not depend on the locale.
                with open(path, "w", encoding="utf-8") as f:
                    f.write(content)

        # Save metadata
        used_columns = ["content", "contentBinary"]
        file_metadata = {
            key: value for key, value in document.items() if key not in used_columns
        }
        file_metadata["url"] = endpoint + doc_href.lstrip("/")
        meta_lookup[path] = file_metadata | extra_metadata

    return meta_lookup
logger = logging.getLogger(__name__)

# Upper bound on loader threads per batch, so a large batch does not spawn
# one thread per file.
_MAX_LOADER_THREADS = 32

# Maps a file extension to the loader used to read it and, optionally, the
# splitter "language". A missing "language" key means the document is not
# chunked; ``None`` means the generic text splitter is used.
DOCUMENT_MAP = {
    ".txt": {
        "loader": TextLoader,
        "language": None,
    },
    ".html": {
        "loader": TextLoader,
        "language": Language.HTML,
    },
    ".md": {
        "loader": TextLoader,
        "language": Language.MARKDOWN,
    },
    ".py": {
        "loader": TextLoader,
        "language": Language.PYTHON,
    },
    ".pdf": {
        "loader": PDFMinerLoader,
        "language": None,
    },
    ".csv": {
        "loader": CSVLoader,
    },
    ".xls": {
        "loader": UnstructuredExcelLoader,
    },
    ".xlsx": {
        "loader": UnstructuredExcelLoader,
    },
    ".docx": {
        "loader": Docx2txtLoader,
        "language": None,
    },
    ".doc": {
        "loader": Docx2txtLoader,
        "language": None,
    },
    ".pptx": {
        "loader": UnstructuredPowerPointLoader,
    },
    ".ppt": {
        "loader": UnstructuredPowerPointLoader,
    },
}


def load_single_document(file_path: str) -> "tuple[str, list[Document]]":
    """Load a single document from a file path.

    Returns:
        The lower-cased file extension and the documents the loader produced.

    Raises:
        ValueError: If no loader is registered for the file's extension.
    """
    logger.info(f"Loading {file_path}")
    # Look the extension up case-insensitively so "REPORT.PDF" is recognised.
    file_extension = os.path.splitext(file_path)[1].lower()
    ext_metadata = DOCUMENT_MAP.get(file_extension)
    if not ext_metadata:
        raise ValueError(
            f"Document type is undefined: {file_extension!r} ({file_path})"
        )
    loader = ext_metadata["loader"](file_path)
    return file_extension, loader.load()


def load_document_batch(filepaths: list[str]):
    """Load multiple documents in parallel.

    Returns:
        ``(data_list, filepaths)`` where ``data_list`` holds one
        ``load_single_document`` result per path, in input order.
    """
    logger.info("Loading document batch")
    # ThreadPoolExecutor rejects max_workers=0, so short-circuit empty batches.
    if not filepaths:
        return ([], filepaths)
    # create a thread pool
    with ThreadPoolExecutor(min(len(filepaths), _MAX_LOADER_THREADS)) as exe:
        # load files
        futures = [exe.submit(load_single_document, name) for name in filepaths]
        # collect data
        data_list = [future.result() for future in futures]
        # return data and file paths
        return (data_list, filepaths)
"""Load all documents from the source documents directory.""" + all_files = source_dir.rglob("*") + paths = [] + for file_path in all_files: + file_extension = os.path.splitext(file_path)[1] + source_file_path = os.path.join(source_dir, file_path) + if file_extension in DOCUMENT_MAP.keys(): + paths.append(source_file_path) + + # Have at least one worker and at most INGEST_THREADS workers + n_workers = min(ingest_threads, max(len(paths), 1)) + chunksize = round(len(paths) / n_workers) + docs = [] + with ProcessPoolExecutor(n_workers) as executor: + futures = [] + # split the load operations into chunks + for i in range(0, len(paths), chunksize): + # select a chunk of filenames + filepaths = paths[i : (i + chunksize)] + # submit the task + future = executor.submit(load_document_batch, filepaths) + futures.append(future) + # process all results + for future in as_completed(futures): + # open the file and load the data + contents, _ = future.result() + docs.extend(contents) + + return docs + + +def split_document( + document: Document, file_extension: str, chunk_size: int, chunk_overlap: int +): + """Split a document into chunks.""" + ext_metadata = DOCUMENT_MAP.get(file_extension) + # If there is no language defined, don't chunk the text + if "language" not in ext_metadata: + chunks = [document] + # If there is a language defined, chunk the text according to the language + else: + language = ext_metadata["language"] + # If the language is None, use the basic splitter + if language is None: + splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + # Otherwise use the specific language + else: + splitter = RecursiveCharacterTextSplitter.from_language( + language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + chunks = splitter.split_documents(documents=[document]) + return chunks diff --git a/vector_store/vector_store.py b/vector_store/vector_store.py new file mode 100644 index 0000000..81d43c0 --- 
logger = logging.getLogger(__name__)

# Keys every collection entry in the config must define.
_REQUIRED_COLLECTION_KEYS = ("id", "mode", "chunk_size", "chunk_overlap")


def parse_config(path: pathlib.Path) -> dict[str, Any]:
    """Parse the YAML configuration file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If ``path`` is a directory or the YAML root is not a mapping.
    """
    if not path.exists():
        raise FileNotFoundError(f"Config file {path} does not exist")
    if path.is_dir():
        raise ValueError(f"Expected a file but got a directory: {path}")

    with path.open("r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    if not isinstance(config, dict):
        raise ValueError(f"Invalid configuration format in {path}")

    return config


def _ingest_collection(collection: dict[str, Any], ingest_threads: int) -> None:
    """Fetch every source of one collection and ingest the result."""
    missing = [
        key for key in _REQUIRED_COLLECTION_KEYS if collection.get(key) is None
    ]
    if missing:
        # Name the keys that are actually absent (the identifier key is "id").
        raise ValueError(f"Missing required keys in collection: {missing}")

    meta_lookup: dict[pathlib.Path, dict[Any, Any]] = {}
    for source in collection.get("sources") or []:
        meta_lookup.update(fetch_source(**source))

    ingest(
        meta_lookup=meta_lookup,
        collection_name=collection["id"],
        chunk_size=collection["chunk_size"],
        chunk_overlap=collection["chunk_overlap"],
        ingest_threads=ingest_threads,
        embedding_model_name=collection.get(
            "embedding_model", "sentence-transformers/all-MiniLM-L6-v2"
        ),
        mode=collection["mode"],
        collection_metadata=collection.get("metadata") or {},
    )


def main(args: argparse.Namespace) -> None:
    """
    Ingests multiple document collections into a vector store
    using the configuration file specified in `args.config`.

    Every collection is attempted; failures are collected and reported
    together once all collections have been processed. The downloaded
    knowledge folder is removed after each collection.

    Args:
        args: Parsed arguments containing the config file path.

    Raises:
        RuntimeError: If ingestion failed for at least one collection.

    """
    config: dict = parse_config(args.config)
    ingest_threads = config.get("ingest_threads", 8)
    # "collections:" with no value parses to None; treat it like an empty list.
    collections = config.get("collections") or []
    errors: list[Exception] = []
    for collection in collections:
        try:
            _ingest_collection(collection, ingest_threads)
        except Exception as e:
            # A malformed (non-mapping) entry must not crash the handler itself.
            collection_id = (
                collection.get("id", "unknown")
                if isinstance(collection, dict)
                else "unknown"
            )
            # logger.exception keeps the traceback in the log.
            logger.exception("Failed to ingest collection %s: %s", collection_id, e)
            errors.append(e)
        finally:
            delete_knowledge()

    if errors:
        error_messages = "\n".join(str(e) for e in errors)
        raise RuntimeError(
            f"Ingest failed for {len(errors)} collection(s):\n{error_messages}"
        )


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--config",
        type=pathlib.Path,
        help="Path to config file.",
        default=pathlib.Path("./config/config.yaml"),
    )
    main(parser.parse_args())