-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathvocab_builder.py
40 lines (33 loc) · 1.39 KB
/
vocab_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from data_helpers import *
import itertools
from collections import Counter
import csv
"""
Reads the dataset and creates two csv files: one with the vocabulary used in the dataset, and one with the vocabulary's integer mapping (sorted from most to least used).
"""
def build_vocab(sentences):
"""
Builds a vocabulary mapping from word to index based on the sentences.
Returns vocabulary mapping and inverse vocabulary mapping.
"""
# Build vocabulary
word_counts = Counter(itertools.chain(*sentences))
# Mapping from index to word
vocabulary_inv = [x[0] for x in word_counts.most_common()]
# Mapping from word to index
vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
return [vocabulary, vocabulary_inv]
# Load and preprocess data
print 'vocab_builder: loading...'
sentences, labels = load_data_and_labels(1) # 1 is passed so that load_data_and_labels() will parse the whole dataset
print 'vocab_builder: padding...'
sentences_padded = pad_sentences(sentences)
print 'vocab_builder: building vocabularies...'
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
print 'vocab_builder: writing csv...'
voc = csv.writer(open('twitter-sentiment-dataset/vocab.csv', 'w'))
voc_inv = csv.writer(open('twitter-sentiment-dataset/vocab_inv.csv', 'w'))
for key, val in vocabulary.items():
voc.writerow([key, val])
for val in vocabulary_inv:
voc_inv.writerow([val])