Skip to content

Commit a24b4bf

Browse files
starter code
1 parent fbe59ca commit a24b4bf

2 files changed

Lines changed: 198 additions & 0 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.ipynb_checkpoints/
2+
__pycache__/
3+
minsearch.py

starter.ipynb

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "d658f909-e679-41e9-9c4e-e0241c719049",
6+
"metadata": {},
7+
"source": [
8+
"If you're not running in Saturn Cloud, you need to install the libraries listed below.\n",
9+
"\n",
10+
"Make sure you use the latest versions (the `-U` flag upgrades packages that are already installed):\n",
11+
"\n",
12+
"```\n",
13+
"pip install -U transformers accelerate bitsandbytes\n",
14+
"```"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 1,
20+
"id": "506fab2a-a50c-42bd-a106-c83a9d2828ea",
21+
"metadata": {},
22+
"outputs": [
23+
{
24+
"name": "stderr",
25+
"output_type": "stream",
26+
"text": [
27+
"--2024-06-13 12:33:48-- https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py\n",
28+
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
29+
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
30+
"HTTP request sent, awaiting response... 200 OK\n",
31+
"Length: 3832 (3.7K) [text/plain]\n",
32+
"Saving to: 'minsearch.py'\n",
33+
"\n",
34+
" 0K ... 100% 969K=0.004s\n",
35+
"\n",
36+
"2024-06-13 12:33:49 (969 KB/s) - 'minsearch.py' saved [3832/3832]\n",
37+
"\n"
38+
]
39+
}
40+
],
41+
"source": [
42+
"!rm -f minsearch.py\n",
43+
"!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": 2,
49+
"id": "3ac947de-effd-4b61-8792-a6d7a133f347",
50+
"metadata": {},
51+
"outputs": [
52+
{
53+
"data": {
54+
"text/plain": [
55+
"<minsearch.Index at 0x28d98e5ab10>"
56+
]
57+
},
58+
"execution_count": 2,
59+
"metadata": {},
60+
"output_type": "execute_result"
61+
}
62+
],
63+
"source": [
64+
"import requests \n",
65+
"import minsearch\n",
66+
"\n",
67+
"docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'\n",
68+
"docs_response = requests.get(docs_url)\n",
69+
"documents_raw = docs_response.json()\n",
70+
"\n",
71+
"documents = []\n",
72+
"\n",
73+
"for course in documents_raw:\n",
74+
" course_name = course['course']\n",
75+
"\n",
76+
" for doc in course['documents']:\n",
77+
" doc['course'] = course_name\n",
78+
" documents.append(doc)\n",
79+
"\n",
80+
"index = minsearch.Index(\n",
81+
" text_fields=[\"question\", \"text\", \"section\"],\n",
82+
" keyword_fields=[\"course\"]\n",
83+
")\n",
84+
"\n",
85+
"index.fit(documents)"
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": 3,
91+
"id": "8f087272-b44d-4738-9ea2-175ec63a058b",
92+
"metadata": {},
93+
"outputs": [],
94+
"source": [
95+
"def search(query):\n",
96+
" boost = {'question': 3.0, 'section': 0.5}\n",
97+
"\n",
98+
" results = index.search(\n",
99+
" query=query,\n",
100+
" filter_dict={'course': 'data-engineering-zoomcamp'},\n",
101+
" boost_dict=boost,\n",
102+
" num_results=5\n",
103+
" )\n",
104+
"\n",
105+
" return results"
106+
]
107+
},
108+
{
109+
"cell_type": "code",
110+
"execution_count": 4,
111+
"id": "742ab881-499a-4675-83c4-2013ea1377b9",
112+
"metadata": {},
113+
"outputs": [],
114+
"source": [
115+
"def build_prompt(query, search_results):\n",
116+
" prompt_template = \"\"\"\n",
117+
"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n",
118+
"Use only the facts from the CONTEXT when answering the QUESTION.\n",
119+
"\n",
120+
"QUESTION: {question}\n",
121+
"\n",
122+
"CONTEXT: \n",
123+
"{context}\n",
124+
"\"\"\".strip()\n",
125+
"\n",
126+
" context = \"\"\n",
127+
" \n",
128+
" for doc in search_results:\n",
129+
" context = context + f\"section: {doc['section']}\\nquestion: {doc['question']}\\nanswer: {doc['text']}\\n\\n\"\n",
130+
" \n",
131+
" prompt = prompt_template.format(question=query, context=context).strip()\n",
132+
" return prompt\n",
133+
"\n",
134+
"def llm(prompt):\n",
135+
" response = client.chat.completions.create(\n",
136+
" model='gpt-4o',\n",
137+
" messages=[{\"role\": \"user\", \"content\": prompt}]\n",
138+
" )\n",
139+
" \n",
140+
" return response.choices[0].message.content"
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": 5,
146+
"id": "fe8bff3e-b672-42be-866b-f2d9bb217106",
147+
"metadata": {},
148+
"outputs": [],
149+
"source": [
150+
"def rag(query):\n",
151+
" search_results = search(query)\n",
152+
" prompt = build_prompt(query, search_results)\n",
153+
" answer = llm(prompt)\n",
154+
" return answer"
155+
]
156+
},
157+
{
158+
"cell_type": "code",
159+
"execution_count": null,
160+
"id": "091a77e6-936b-448e-a04b-bad1001f5bb0",
161+
"metadata": {},
162+
"outputs": [],
163+
"source": []
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": null,
168+
"id": "21aa255e-c971-44ca-9826-a721df3ad063",
169+
"metadata": {},
170+
"outputs": [],
171+
"source": []
172+
}
173+
],
174+
"metadata": {
175+
"kernelspec": {
176+
"display_name": "Python 3 (ipykernel)",
177+
"language": "python",
178+
"name": "python3"
179+
},
180+
"language_info": {
181+
"codemirror_mode": {
182+
"name": "ipython",
183+
"version": 3
184+
},
185+
"file_extension": ".py",
186+
"mimetype": "text/x-python",
187+
"name": "python",
188+
"nbconvert_exporter": "python",
189+
"pygments_lexer": "ipython3",
190+
"version": "3.11.9"
191+
}
192+
},
193+
"nbformat": 4,
194+
"nbformat_minor": 5
195+
}

0 commit comments

Comments
 (0)