Skip to content

Commit 24f1040

Browse files
committed
Naive bayes
1 parent 66c8408 commit 24f1040

File tree

4 files changed

+646
-94
lines changed

4 files changed

+646
-94
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 3,
6+
"metadata": {
7+
"collapsed": true
8+
},
9+
"outputs": [],
10+
"source": [
11+
"import numpy as np\n",
12+
"from matplotlib import pyplot as plt\n",
13+
"%matplotlib inline\n",
14+
"import random"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 13,
20+
"metadata": {
21+
"collapsed": false
22+
},
23+
"outputs": [
24+
{
25+
"name": "stdout",
26+
"output_type": "stream",
27+
"text": [
28+
"[1 3 3 2 1 2 2 2 3 3 2 2 2 1 3 2 2 3 2 2]\n",
29+
"[2 1 3 1 2 3 3 1 1 1 1 3 2 1 2 2 1 2 2 1]\n",
30+
"[1 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 1 0 0 0]\n"
31+
]
32+
}
33+
],
34+
"source": [
# Toy dataset for the hand-written naive Bayes below: two categorical
# features drawn from set1 and a binary label, all sampled independently.
set1 = [1, 2, 3]
lab_set = [0, 1]

n_exmp = 20

# Seed the generator so Restart & Run All regenerates the same arrays
# (and therefore the same probabilities) every time.
random.seed(0)

var1 = np.asarray([random.choice(set1) for ix in range(n_exmp)])
var2 = np.asarray([random.choice(set1) for ix in range(n_exmp)])
lab = np.asarray([random.choice(lab_set) for ix in range(n_exmp)])

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(var1)
print(var2)
print(lab)
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": 20,
52+
"metadata": {
53+
"collapsed": false
54+
},
55+
"outputs": [
56+
{
57+
"name": "stdout",
58+
"output_type": "stream",
59+
"text": [
60+
"0.45\n"
61+
]
62+
}
63+
],
64+
"source": [
def probability(val, n_sample):
    """Return the fraction of entries of ``n_sample`` that are equal to ``val``.

    ``n_sample`` must expose ``.shape`` and support integer indexing along its
    first axis (the notebook passes 1-D NumPy arrays). An empty sample divides
    by zero and raises ``ZeroDivisionError``.
    """
    total = n_sample.shape[0]
    # Walk the sample by position and count the positions holding `val`.
    matches = sum(1 for pos in range(total) if n_sample[pos] == val)
    return float(matches) / float(total)
72+
"\n",
def conditional_probability(c1, data, c2, labels=None):
    """Estimate P(data == c1 | label == c2) by counting.

    Parameters
    ----------
    c1 : value looked for in ``data``.
    data : sequence of feature values, aligned position-by-position with the
        labels. Entries beyond the number of labels are ignored.
    c2 : label value that is conditioned on.
    labels : optional sequence of labels. When omitted, the module-level
        ``lab`` array is used, which keeps existing three-argument calls
        working.

    Returns
    -------
    float
        (# positions with label c2 and value c1) / (# positions with label c2).

    Raises
    ------
    ZeroDivisionError
        If no position carries the label ``c2``; the conditional probability
        is undefined in that case.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook-level label array.
        labels = lab
    labels = np.asarray(labels)

    in_class = labels == c2
    count_c2 = int(np.count_nonzero(in_class))
    if count_c2 == 0:
        raise ZeroDivisionError(
            "no samples with label %r; conditional probability is undefined" % (c2,)
        )

    # Only the first len(labels) feature values take part in the estimate.
    values = np.asarray(data)[:labels.shape[0]]
    count_c1_c2 = int(np.count_nonzero(values[in_class] == c1))
    return float(count_c1_c2) / float(count_c2)
83+
" \n",
84+
"\n",
# Sanity check: marginal frequency of the value 1 in the second feature.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(probability(1, var2))
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": 32,
91+
"metadata": {
92+
"collapsed": true
93+
},
94+
"outputs": [],
95+
"source": [
def classify(inp):
    """Return the naive Bayes posteriors ``(p0, p1)`` for a two-feature input.

    ``inp[0]`` is matched against the module-level ``var1`` array and
    ``inp[1]`` against ``var2``; class labels come from ``lab``.

    For each class c in (0, 1) the unnormalised score is

        P(c) * P(inp[0] | c) * P(inp[1] | c)

    and the two scores are divided by their sum, so the returned values lie in
    [0, 1] and add up to 1. The product of marginals P(inp[0]) * P(inp[1]) is
    deliberately not used as the denominator: it is not the evidence term of
    the model and yields "probabilities" above 1. The class ranking is the
    same under either denominator.

    Returns ``(0.0, 0.0)`` when neither class has any support for ``inp``
    (for example, a feature value that never occurs in the data).
    """
    features = (var1, var2)

    scores = []
    for label in (0, 1):
        prior = probability(label, lab)
        if prior == 0:
            # Class absent from the data: its posterior is zero, and the
            # conditional estimates would be undefined, so skip them.
            scores.append(0.0)
            continue

        likelihood = 1.0
        for position, column in enumerate(features):
            likelihood *= conditional_probability(inp[position], column, label)
        scores.append(prior * likelihood)

    evidence = scores[0] + scores[1]
    if evidence == 0:
        return 0.0, 0.0
    return scores[0] / evidence, scores[1] / evidence
130+
]
131+
},
132+
{
133+
"cell_type": "code",
134+
"execution_count": 34,
135+
"metadata": {
136+
"collapsed": false
137+
},
138+
"outputs": [
139+
{
140+
"name": "stdout",
141+
"output_type": "stream",
142+
"text": [
143+
"[1 3 3 2 1 2 2 2 3 3 2 2 2 1 3 2 2 3 2 2]\n",
144+
"[2 1 3 1 2 3 3 1 1 1 1 3 2 1 2 2 1 2 2 1]\n",
145+
"[1 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 1 0 0 0]\n",
146+
"0.7\n",
147+
"0.4\n",
148+
"0.55\n",
149+
"0.2\n"
150+
]
151+
},
152+
{
153+
"data": {
154+
"text/plain": [
155+
"(0.0, 1.2727272727272725)"
156+
]
157+
},
158+
"execution_count": 34,
159+
"metadata": {},
160+
"output_type": "execute_result"
161+
}
162+
],
163+
"source": [
# Show the toy data again next to the prediction so the result can be checked
# by hand. Single-argument print(...) behaves the same on Python 2 and 3.
print(var1)
print(var2)
print(lab)

classify([2, 3])
169+
]
170+
},
171+
{
172+
"cell_type": "code",
173+
"execution_count": 38,
174+
"metadata": {
175+
"collapsed": false
176+
},
177+
"outputs": [],
178+
"source": [
179+
"import sklearn\n",
180+
"import pandas as pd\n",
181+
"from sklearn.naive_bayes import GaussianNB"
182+
]
183+
},
184+
{
185+
"cell_type": "code",
186+
"execution_count": 43,
187+
"metadata": {
188+
"collapsed": false
189+
},
190+
"outputs": [
191+
{
192+
"name": "stdout",
193+
"output_type": "stream",
194+
"text": [
195+
"(10000, 784) (5000, 784)\n",
196+
"(10000,) (5000,)\n"
197+
]
198+
}
199+
],
200+
"source": [
# Split sizes: the first N_TRAIN rows are used for fitting and the following
# N_TEST rows are held out for scoring.
N_TRAIN = 10000
N_TEST = 5000

ds = pd.read_csv('./train.csv')

# Materialise the underlying array once instead of once per slice.
# Column 0 is used as the target; the remaining columns are the features.
ds_values = ds.values

data_X = ds_values[:N_TRAIN, 1:]
data_y = ds_values[:N_TRAIN, 0]

test_X = ds_values[N_TRAIN:N_TRAIN + N_TEST, 1:]
test_y = ds_values[N_TRAIN:N_TRAIN + N_TEST, 0]

# Formatting into one string keeps the output identical on Python 2 and 3.
print("%s %s" % (data_X.shape, test_X.shape))
print("%s %s" % (data_y.shape, test_y.shape))
210+
]
211+
},
212+
{
213+
"cell_type": "code",
214+
"execution_count": 45,
215+
"metadata": {
216+
"collapsed": false
217+
},
218+
"outputs": [
219+
{
220+
"data": {
221+
"text/plain": [
222+
"0.55820000000000003"
223+
]
224+
},
225+
"execution_count": 45,
226+
"metadata": {},
227+
"output_type": "execute_result"
228+
}
229+
],
230+
"source": [
# Gaussian naive Bayes on the untransformed features: fit on the training
# rows, then report mean accuracy on the held-out rows.
gnb = GaussianNB().fit(data_X, data_y)

gnb.score(test_X, test_y)
236+
]
237+
},
238+
{
239+
"cell_type": "code",
240+
"execution_count": 60,
241+
"metadata": {
242+
"collapsed": false
243+
},
244+
"outputs": [],
245+
"source": [
from sklearn.decomposition import PCA

pca = PCA(n_components=319)

# Learn the projection from the training rows only. Fitting on all 15000 rows
# would let the held-out rows influence the components (data leakage) and
# make the test score optimistic.
pca.fit(ds.values[:10000, 1:])

# Project training and held-out rows with the same fitted components; `dt`
# keeps its 15000-row layout so it can still be sliced at row 10000.
dt = pca.transform(ds.values[:15000, 1:])
251+
]
252+
},
253+
{
254+
"cell_type": "code",
255+
"execution_count": 61,
256+
"metadata": {
257+
"collapsed": false
258+
},
259+
"outputs": [
260+
{
261+
"name": "stdout",
262+
"output_type": "stream",
263+
"text": [
264+
"(10000, 442)\n"
265+
]
266+
}
267+
],
268+
"source": [
# Split the projected rows at the same boundary as the raw features:
# first 10000 rows for fitting, the remainder for scoring.
trans_X = dt[:10000]
test_trans_X = dt[10000:]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(trans_X.shape)
273+
]
274+
},
275+
{
276+
"cell_type": "code",
277+
"execution_count": 62,
278+
"metadata": {
279+
"collapsed": false
280+
},
281+
"outputs": [
282+
{
283+
"data": {
284+
"text/plain": [
285+
"0.67859999999999998"
286+
]
287+
},
288+
"execution_count": 62,
289+
"metadata": {},
290+
"output_type": "execute_result"
291+
}
292+
],
293+
"source": [
# Same classifier as before, now trained and scored on the PCA-projected
# features. `gnb` is rebound to the newly fitted model.
gnb = GaussianNB().fit(trans_X, data_y)

gnb.score(test_trans_X, test_y)
299+
]
300+
}
301+
],
302+
"metadata": {
303+
"kernelspec": {
304+
"display_name": "Python 2",
305+
"language": "python",
306+
"name": "python2"
307+
},
308+
"language_info": {
309+
"codemirror_mode": {
310+
"name": "ipython",
311+
"version": 2
312+
},
313+
"file_extension": ".py",
314+
"mimetype": "text/x-python",
315+
"name": "python",
316+
"nbconvert_exporter": "python",
317+
"pygments_lexer": "ipython2",
318+
"version": "2.7.12"
319+
}
320+
},
321+
"nbformat": 4,
322+
"nbformat_minor": 2
323+
}

class_05/.ipynb_checkpoints/Untitled-checkpoint.ipynb

-47
This file was deleted.

0 commit comments

Comments
 (0)