Skip to content

Commit 4a8c495

Browse files
committed
word2vec with gensim. bad accuracy. less data
1 parent 6b2b743 commit 4a8c495

File tree

6 files changed

+1134
-40
lines changed

6 files changed

+1134
-40
lines changed

class_15/.ipynb_checkpoints/Untitled-checkpoint.ipynb

-6
This file was deleted.

class_15/.ipynb_checkpoints/w2v proto-checkpoint.ipynb

+385
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {
7+
"collapsed": true
8+
},
9+
"outputs": [],
10+
"source": [
11+
"import numpy as np\n",
12+
"from matplotlib import pyplot as plt\n",
13+
"import pickle as pk\n",
14+
"%matplotlib inline\n",
15+
"import gensim"
16+
]
17+
},
18+
{
19+
"cell_type": "code",
20+
"execution_count": 21,
21+
"metadata": {
22+
"collapsed": true
23+
},
24+
"outputs": [],
25+
"source": [
26+
"f = open('./data.txt')\n",
27+
"d = f.read()\n",
28+
"f.close()\n",
29+
"\n",
30+
"data = d[1260:]\n",
31+
"data = data.lower().decode('utf-8')\n",
32+
"import re\n",
33+
"\n",
34+
"p = re.sub('[^A-Za-z.]+', ' ', data)\n",
35+
"ds = p.split('.')"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 24,
41+
"metadata": {
42+
"collapsed": false
43+
},
44+
"outputs": [
45+
{
46+
"data": {
47+
"text/plain": [
48+
"u'it is often enough and always with great surprise intimated to me that there is something both ordinary and unusual in all my writings from the birth of tragedy to the recently published prelude to a philosophy of the future they all contain i have been told snares and nets for short sighted birds and something that is almost a constant subtle incitement to an overturning of habitual opinions and of approved customs'"
49+
]
50+
},
51+
"execution_count": 24,
52+
"metadata": {},
53+
"output_type": "execute_result"
54+
}
55+
],
56+
"source": [
57+
"ds[0]"
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": 26,
63+
"metadata": {
64+
"collapsed": false,
65+
"scrolled": false
66+
},
67+
"outputs": [],
68+
"source": [
69+
"for ix in range(len(ds)):\n",
70+
" ds[ix] = ds[ix].split()"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 29,
76+
"metadata": {
77+
"collapsed": false
78+
},
79+
"outputs": [
80+
{
81+
"name": "stdout",
82+
"output_type": "stream",
83+
"text": [
84+
"1462\n"
85+
]
86+
}
87+
],
88+
"source": [
89+
"print len(ds)"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": 30,
95+
"metadata": {
96+
"collapsed": false
97+
},
98+
"outputs": [],
99+
"source": [
100+
"model = gensim.models.Word2Vec(ds, min_count=2)"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": 42,
106+
"metadata": {
107+
"collapsed": false
108+
},
109+
"outputs": [
110+
{
111+
"data": {
112+
"text/plain": [
113+
"[(u'an', 0.999976396560669),\n",
114+
" (u'a', 0.9999758005142212),\n",
115+
" (u'from', 0.9999721050262451),\n",
116+
" (u'that', 0.9999714493751526),\n",
117+
" (u'and', 0.9999709725379944),\n",
118+
" (u'of', 0.9999709725379944),\n",
119+
" (u'this', 0.9999706745147705),\n",
120+
" (u'not', 0.9999699592590332),\n",
121+
" (u'are', 0.9999694228172302),\n",
122+
" (u'which', 0.9999691247940063)]"
123+
]
124+
},
125+
"execution_count": 42,
126+
"metadata": {},
127+
"output_type": "execute_result"
128+
}
129+
],
130+
"source": [
131+
"model.similar_by_word('to')"
132+
]
133+
}
134+
],
135+
"metadata": {
136+
"kernelspec": {
137+
"display_name": "Python 2",
138+
"language": "python",
139+
"name": "python2"
140+
},
141+
"language_info": {
142+
"codemirror_mode": {
143+
"name": "ipython",
144+
"version": 2
145+
},
146+
"file_extension": ".py",
147+
"mimetype": "text/x-python",
148+
"name": "python",
149+
"nbconvert_exporter": "python",
150+
"pygments_lexer": "ipython2",
151+
"version": "2.7.12"
152+
}
153+
},
154+
"nbformat": 4,
155+
"nbformat_minor": 2
156+
}

class_15/Untitled.ipynb

-34
This file was deleted.

0 commit comments

Comments
 (0)