diff --git a/docker-compose.yml b/docker-compose.yml
index 2ac2f7b5..d8b11a49 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -70,6 +70,7 @@ services:
           path: .
           ignore:
             - bot.py
+            - pdf_bot.py
 
 
   bot:
@@ -102,9 +103,44 @@ services:
           path: .
           ignore:
             - loader.py
+            - pdf_bot.py
     ports:
       - 8501:8501
 
+
+  pdf_bot:
+    build:
+      dockerfile: pdf_bot.Dockerfile
+    environment:
+      - NEO4J_URI=${NEO4J_URI-neo4j://database:7687}
+      - NEO4J_PASSWORD=${NEO4J_PASSWORD-password}
+      - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j}
+      - OPENAI_API_KEY=${OPENAI_API_KEY}
+      - OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
+      - LLM=${LLM-llama2}
+      - EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer}
+      - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
+      - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
+      - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
+      - LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
+    networks:
+      - net
+    depends_on:
+      database:
+        condition: service_healthy
+      pull-model:
+        condition: service_completed_successfully
+    x-develop:
+      watch:
+        - action: rebuild
+          path: .
+          ignore:
+            - loader.py
+            - bot.py
+
+    ports:
+      - 8503:8503
+
 
 networks:
   net:
diff --git a/pdf_bot.Dockerfile b/pdf_bot.Dockerfile
new file mode 100644
index 00000000..5efb078f
--- /dev/null
+++ b/pdf_bot.Dockerfile
@@ -0,0 +1,23 @@
+FROM langchain/langchain
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+
+RUN pip install --upgrade -r requirements.txt
+
+COPY pdf_bot.py .
+COPY utils.py .
+COPY chains.py .
+
+EXPOSE 8503
+
+HEALTHCHECK CMD curl --fail http://localhost:8503/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "pdf_bot.py", "--server.port=8503", "--server.address=0.0.0.0"]
diff --git a/pdf_bot.py b/pdf_bot.py
new file mode 100644
index 00000000..9731c24f
--- /dev/null
+++ b/pdf_bot.py
@@ -0,0 +1,98 @@
+import os
+
+import streamlit as st
+from langchain.chains import RetrievalQA
+from PyPDF2 import PdfReader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.vectorstores.neo4j_vector import Neo4jVector
+from streamlit.logger import get_logger
+from chains import (
+    load_embedding_model,
+    load_llm,
+)
+
+# load api key lib
+from dotenv import load_dotenv
+
+load_dotenv(".env")
+
+
+url = os.getenv("NEO4J_URI")
+username = os.getenv("NEO4J_USERNAME")
+password = os.getenv("NEO4J_PASSWORD")
+ollama_base_url = os.getenv("OLLAMA_BASE_URL")
+embedding_model_name = os.getenv("EMBEDDING_MODEL")
+llm_name = os.getenv("LLM")
+# Remapping for Langchain Neo4j integration
+os.environ["NEO4J_URL"] = url
+
+logger = get_logger(__name__)
+
+
+# The config key is the literal string "ollama_base_url", matching the
+# load_llm call below; it must not be the URL value itself.
+embeddings, dimension = load_embedding_model(
+    embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger
+)
+
+
+class StreamHandler(BaseCallbackHandler):
+    """Stream LLM tokens into a Streamlit container as they arrive."""
+
+    def __init__(self, container, initial_text=""):
+        self.container = container
+        self.text = initial_text
+
+    def on_llm_new_token(self, token: str, **kwargs) -> None:
+        # Re-render the accumulated text so the answer grows in place.
+        self.text += token
+        self.container.markdown(self.text)
+
+
+llm = load_llm(llm_name, logger=logger, config={"ollama_base_url": ollama_base_url})
+
+
+def main():
+    """Upload a PDF, index its text in Neo4j, and answer questions about it."""
+    st.header("📄Chat with your pdf file")
+
+    # Upload your PDF file
+    pdf = st.file_uploader("Upload your PDF", type="pdf")
+
+    if pdf is not None:
+        pdf_reader = PdfReader(pdf)
+
+        # Pages without extractable text contribute an empty string.
+        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
+
+        # Split the text into overlapping chunks for embedding
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000, chunk_overlap=200, length_function=len
+        )
+
+        chunks = text_splitter.split_text(text=text)
+
+        # Store the chunks part in db (vector)
+        vectorstore = Neo4jVector.from_texts(
+            chunks,
+            url=url,
+            username=username,
+            password=password,
+            embedding=embeddings,
+            pre_delete_collection=True,  # Delete existing PDF data
+        )
+        qa = RetrievalQA.from_chain_type(
+            llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever()
+        )
+
+        # Accept user questions/query
+        query = st.text_input("Ask questions about related your upload pdf file")
+
+        if query:
+            stream_handler = StreamHandler(st.empty())
+            qa.run(query, callbacks=[stream_handler])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index 458553ef..7cc85b48 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ neo4j
 streamlit
 sentence_transformers
 Pillow
+PyPDF2