|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "metadata": { |
| 7 | + "collapsed": true |
| 8 | + }, |
| 9 | + "outputs": [], |
| 10 | + "source": [ |
| 11 | + "import numpy as np\n", |
| 12 | + "from matplotlib import pyplot as plt\n", |
| 13 | + "import pickle as pk\n", |
| 14 | + "%matplotlib inline\n", |
| 15 | + "import gensim" |
| 16 | + ] |
| 17 | + }, |
| 18 | + { |
| 19 | + "cell_type": "code", |
| 20 | + "execution_count": 21, |
| 21 | + "metadata": { |
| 22 | + "collapsed": true |
| 23 | + }, |
| 24 | + "outputs": [], |
| 25 | + "source": [ |
| 26 | + "f = open('./data.txt')\n", |
| 27 | + "d = f.read()\n", |
| 28 | + "f.close()\n", |
| 29 | + "\n", |
| 30 | + "data = d[1260:]\n", |
| 31 | + "data = data.lower().decode('utf-8')\n", |
| 32 | + "import re\n", |
| 33 | + "\n", |
| 34 | + "p = re.sub('[^A-Za-z.]+', ' ', data)\n", |
| 35 | + "ds = p.split('.')" |
| 36 | + ] |
| 37 | + }, |
| 38 | + { |
| 39 | + "cell_type": "code", |
| 40 | + "execution_count": 24, |
| 41 | + "metadata": { |
| 42 | + "collapsed": false |
| 43 | + }, |
| 44 | + "outputs": [ |
| 45 | + { |
| 46 | + "data": { |
| 47 | + "text/plain": [ |
| 48 | + "u'it is often enough and always with great surprise intimated to me that there is something both ordinary and unusual in all my writings from the birth of tragedy to the recently published prelude to a philosophy of the future they all contain i have been told snares and nets for short sighted birds and something that is almost a constant subtle incitement to an overturning of habitual opinions and of approved customs'" |
| 49 | + ] |
| 50 | + }, |
| 51 | + "execution_count": 24, |
| 52 | + "metadata": {}, |
| 53 | + "output_type": "execute_result" |
| 54 | + } |
| 55 | + ], |
| 56 | + "source": [ |
| 57 | + "ds[0]  # first period-delimited chunk of the cleaned text" |
| 58 | + ] |
| 59 | + }, |
| 60 | + { |
| 61 | + "cell_type": "code", |
| 62 | + "execution_count": 26, |
| 63 | + "metadata": { |
| 64 | + "collapsed": false, |
| 65 | + "scrolled": false |
| 66 | + }, |
| 67 | + "outputs": [], |
| 68 | + "source": [ |
| 69 | + "for ix in range(len(ds)):\n", |
| 70 | + " ds[ix] = ds[ix].split()" |
| 71 | + ] |
| 72 | + }, |
| 73 | + { |
| 74 | + "cell_type": "code", |
| 75 | + "execution_count": 29, |
| 76 | + "metadata": { |
| 77 | + "collapsed": false |
| 78 | + }, |
| 79 | + "outputs": [ |
| 80 | + { |
| 81 | + "name": "stdout", |
| 82 | + "output_type": "stream", |
| 83 | + "text": [ |
| 84 | + "1462\n" |
| 85 | + ] |
| 86 | + } |
| 87 | + ], |
| 88 | + "source": [ |
| 89 | + "print len(ds)" |
| 90 | + ] |
| 91 | + }, |
| 92 | + { |
| 93 | + "cell_type": "code", |
| 94 | + "execution_count": 30, |
| 95 | + "metadata": { |
| 96 | + "collapsed": false |
| 97 | + }, |
| 98 | + "outputs": [], |
| 99 | + "source": [ |
| 100 | + "model = gensim.models.Word2Vec(ds, min_count=2)" |
| 101 | + ] |
| 102 | + }, |
| 103 | + { |
| 104 | + "cell_type": "code", |
| 105 | + "execution_count": 42, |
| 106 | + "metadata": { |
| 107 | + "collapsed": false |
| 108 | + }, |
| 109 | + "outputs": [ |
| 110 | + { |
| 111 | + "data": { |
| 112 | + "text/plain": [ |
| 113 | + "[(u'an', 0.999976396560669),\n", |
| 114 | + " (u'a', 0.9999758005142212),\n", |
| 115 | + " (u'from', 0.9999721050262451),\n", |
| 116 | + " (u'that', 0.9999714493751526),\n", |
| 117 | + " (u'and', 0.9999709725379944),\n", |
| 118 | + " (u'of', 0.9999709725379944),\n", |
| 119 | + " (u'this', 0.9999706745147705),\n", |
| 120 | + " (u'not', 0.9999699592590332),\n", |
| 121 | + " (u'are', 0.9999694228172302),\n", |
| 122 | + " (u'which', 0.9999691247940063)]" |
| 123 | + ] |
| 124 | + }, |
| 125 | + "execution_count": 42, |
| 126 | + "metadata": {}, |
| 127 | + "output_type": "execute_result" |
| 128 | + } |
| 129 | + ], |
| 130 | + "source": [ |
| 131 | + "model.similar_by_word('to')  # NOTE(review): stored scores are all ~0.9999 -- vectors look under-trained; verify before relying on them" |
| 132 | + ] |
| 133 | + } |
| 134 | + ], |
| 135 | + "metadata": { |
| 136 | + "kernelspec": { |
| 137 | + "display_name": "Python 2", |
| 138 | + "language": "python", |
| 139 | + "name": "python2" |
| 140 | + }, |
| 141 | + "language_info": { |
| 142 | + "codemirror_mode": { |
| 143 | + "name": "ipython", |
| 144 | + "version": 2 |
| 145 | + }, |
| 146 | + "file_extension": ".py", |
| 147 | + "mimetype": "text/x-python", |
| 148 | + "name": "python", |
| 149 | + "nbconvert_exporter": "python", |
| 150 | + "pygments_lexer": "ipython2", |
| 151 | + "version": "2.7.12" |
| 152 | + } |
| 153 | + }, |
| 154 | + "nbformat": 4, |
| 155 | + "nbformat_minor": 2 |
| 156 | +} |
0 commit comments