Skip to content

Commit 75bfb89

Browse files
committed
Add pdf bot
1 parent f057411 commit 75bfb89

File tree

4 files changed

+153
-0
lines changed

4 files changed

+153
-0
lines changed

docker-compose.yml

+36
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ services:
7070
path: .
7171
ignore:
7272
- bot.py
73+
- pdf_bot.py
7374

7475

7576
bot:
@@ -102,9 +103,44 @@ services:
102103
path: .
103104
ignore:
104105
- loader.py
106+
- pdf_bot.py
105107

106108
ports:
107109
- 8501:8501
108110

111+
112+
pdf_bot:
  # Streamlit "chat with your PDF" service, built from its own Dockerfile.
  build:
    dockerfile: pdf_bot.Dockerfile
  # Host:container port; 8503 is the port the Streamlit server is started on.
  ports:
    - 8503:8503
  networks:
    - net
  # Each value falls back to the default after the "-" when the variable is
  # not set in the environment that runs compose.
  environment:
    # Neo4j connection; the default URI targets the `database` service.
    - NEO4J_URI=${NEO4J_URI-neo4j://database:7687}
    - NEO4J_PASSWORD=${NEO4J_PASSWORD-password}
    - NEO4J_USERNAME=${NEO4J_USERNAME-neo4j}
    # Model selection and endpoints.
    - OPENAI_API_KEY=${OPENAI_API_KEY}
    - OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
    - LLM=${LLM-llama2}
    - EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer}
    # LangChain tracing settings (tracing is off by default).
    - LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
    - LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
    - LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
    - LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
  # Start only once the database is healthy and the model pull has finished.
  depends_on:
    database:
      condition: service_healthy
    pull-model:
      condition: service_completed_successfully
  # Rebuild on source changes, except for files that belong to other services.
  x-develop:
    watch:
      - action: rebuild
        path: .
        ignore:
          - loader.py
          - bot.py
144+
109145
networks:
110146
net:

pdf_bot.Dockerfile

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Image for the PDF chat bot: a Streamlit app served on port 8503.
FROM langchain/langchain

WORKDIR /app

# System packages; curl is also what the HEALTHCHECK below invokes.
# The apt package lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y \
        build-essential \
        curl \
        software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application sources.
COPY requirements.txt .
RUN pip install --upgrade -r requirements.txt

# Application sources: the Streamlit entry point and the modules copied with it.
COPY pdf_bot.py utils.py chains.py ./

EXPOSE 8503

# Probe the Streamlit health endpoint on the served port.
HEALTHCHECK CMD curl --fail http://localhost:8503/_stcore/health

ENTRYPOINT ["streamlit", "run", "pdf_bot.py", "--server.port=8503", "--server.address=0.0.0.0"]

pdf_bot.py

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import os
2+
3+
import streamlit as st
4+
from langchain.chains import RetrievalQA
5+
from PyPDF2 import PdfReader
6+
from langchain.text_splitter import RecursiveCharacterTextSplitter
7+
from langchain.callbacks.base import BaseCallbackHandler
8+
from langchain.vectorstores.neo4j_vector import Neo4jVector
9+
from streamlit.logger import get_logger
10+
from chains import (
11+
load_embedding_model,
12+
load_llm,
13+
)
14+
15+
# load api key lib
16+
from dotenv import load_dotenv
17+
18+
load_dotenv(".env")
19+
20+
21+
# Connection and model settings, read from the process environment.
# Any of these is None when the corresponding variable is not set.
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")
ollama_base_url = os.getenv("OLLAMA_BASE_URL")
embedding_model_name = os.getenv("EMBEDDING_MODEL")
llm_name = os.getenv("LLM")

# Remapping for Langchain Neo4j integration.
# os.environ only accepts string values, so assigning None would raise
# TypeError at import time; skip the remap when NEO4J_URI is not set.
if url is not None:
    os.environ["NEO4J_URL"] = url
29+
30+
logger = get_logger(__name__)


# Load the embedding model named by EMBEDDING_MODEL; the result is unpacked
# as (embeddings, dimension).
embeddings, dimension = load_embedding_model(
    embedding_model_name,
    # The key is the literal string "ollama_base_url", the same form the
    # load_llm call in this module uses. Using the variable itself as the
    # key would produce {<url value>: <url value>} instead.
    config={"ollama_base_url": ollama_base_url},
    logger=logger,
)
36+
37+
38+
class StreamHandler(BaseCallbackHandler):
    """Callback handler that renders LLM output into a container as it streams.

    Every new token is appended to the accumulated text and the whole text
    is re-rendered as markdown in ``container``.
    """

    def __init__(self, container, initial_text=""):
        """Remember the target container and the text to start from."""
        self.text = initial_text
        self.container = container

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the running text and redraw the container."""
        self.text = self.text + token
        self.container.markdown(self.text)
46+
47+
48+
# Chat model selected by the LLM environment variable.
llm = load_llm(
    llm_name,
    logger=logger,
    config={"ollama_base_url": ollama_base_url},
)
49+
50+
51+
def _extract_pdf_text(pdf):
    """Return the text of every page of ``pdf``, concatenated in page order."""
    # ``or ""`` keeps the join safe should a page yield no text at all.
    return "".join(page.extract_text() or "" for page in PdfReader(pdf).pages)


def main():
    """Render the PDF chat page.

    Lets the user upload a PDF, splits its text into overlapping chunks,
    stores the chunks in a Neo4j vector index (replacing previously stored
    PDF data), and answers the user's question with a retrieval QA chain
    whose output is streamed into the page.
    """
    st.header("📄Chat with your pdf file")

    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type="pdf")
    if pdf is None:
        # Nothing uploaded yet: there is nothing to index or query.
        return

    text = _extract_pdf_text(pdf)

    # Split into overlapping chunks, measured in characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, length_function=len
    )
    chunks = text_splitter.split_text(text=text)
    if not chunks:
        # No text (e.g. an image-only PDF): avoid wiping the stored
        # collection just to index nothing.
        st.warning("No extractable text was found in this PDF.")
        return

    # Store the chunks in the vector index
    vectorstore = Neo4jVector.from_texts(
        chunks,
        url=url,
        username=username,
        password=password,
        embedding=embeddings,
        pre_delete_collection=True,  # Delete existing PDF data
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever()
    )

    # Accept user questions/query
    query = st.text_input("Ask questions about related your upload pdf file")

    if query:
        # Stream the answer token by token into an empty placeholder.
        stream_handler = StreamHandler(st.empty())
        qa.run(query, callbacks=[stream_handler])
90+
91+
92+
if __name__ == "__main__":
93+
main()

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ neo4j
66
streamlit
77
sentence_transformers
88
Pillow
9+
PyPDF2

0 commit comments

Comments
 (0)