-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathcsv_parser.py
60 lines (52 loc) · 1.99 KB
/
csv_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import re
from tqdm import tqdm
"""
Reads the csv dataset available at
http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip
and splits it into two files (.pos and .neg) containing the positive and
negative tweets.
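Each tweet is preprocessed word by word during parsing: words containing
'@' or 'http://' are replaced by placeholder tags, '#' is expanded into a
hashtag tag, and double quotes, '&', '<' and '>' are padded with spaces.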
"""
# TODO Maybe, eventually, if the gods allow it, I'll do the migration to pandas.
# import pandas as pd
# df = pd.read_csv('twitter-sentiment-dataset/sentiment-dataset.csv',
# error_bad_lines=False)
#
# df.SentimentText = df.SentimentText.str.strip()
# df.SentimentText = df.SentimentText.str.replace(r'http://.*', '<link/>')
# df.SentimentText = df.SentimentText.str.replace('#', '<HASHTAG/> ')
# df.SentimentText = df.SentimentText.str.replace('"', ' \" ')
# df.SentimentText = df.SentimentText.str.replace('&', ' & ')
# df.SentimentText = df.SentimentText.str.replace('>', ' > ')
# df.SentimentText = df.SentimentText.str.replace('<', ' < ')
def _preprocess_word(word):
    """Return a single whitespace-free token with the tweet rewrites applied.

    A token containing '@' becomes the '<NAME/>' placeholder; otherwise a
    token containing 'http://' becomes '<LINK/>'. Afterwards '#' is expanded
    to '<HASHTAG/> ' and the characters '"', '&', '>' and '<' are padded
    with one space on each side.
    """
    # The mention check wins over the link check: once a token has been
    # replaced by '<NAME/>' it can no longer contain 'http://'.
    if '@' in word:
        word = '<NAME/>'
    elif 'http://' in word:
        word = '<LINK/>'
    word = word.replace('#', '<HASHTAG/> ')
    word = word.replace('"', ' " ')
    word = word.replace('&', ' & ')
    # NOTE(review): the padding below also hits the '<' and '>' of the
    # placeholders inserted above, so '<NAME/>' is emitted as ' < NAME/ > '.
    # Kept as-is so the generated files stay identical; confirm whether
    # downstream code relies on this before changing the order.
    word = word.replace('>', ' > ')
    word = word.replace('<', ' < ')
    return word


def preprocess_tweet(tweet):
    """Return the tweet text with every word preprocessed.

    The text is split on whitespace, each word is rewritten by
    ``_preprocess_word`` and the results are joined with single spaces.
    Leading and trailing whitespace is removed; padding inserted inside the
    tweet is kept.
    """
    return ' '.join(_preprocess_word(word) for word in tweet.split()).strip()


def split_dataset(lines, pos_out, neg_out):
    """Route every CSV row in *lines* to the positive or negative output.

    Each row is split on its first three commas, so the fourth field (the
    tweet text) may itself contain commas. Rows whose second field is '1'
    are written to *pos_out*, rows with any other numeric label to
    *neg_out*, one preprocessed tweet per line.

    Rows with fewer than four fields or with a non-numeric label (such as
    the CSV header row) are skipped instead of crashing the run or being
    written out as a negative tweet.

    Returns the number of skipped rows.
    """
    skipped = 0
    for line in lines:
        fields = line.split(",", 3)
        label = fields[1].strip() if len(fields) == 4 else ''
        if not label.isdigit():
            skipped += 1
            continue
        tweet = preprocess_tweet(fields[3]) + '\n'
        if label == '1':
            pos_out.write(tweet)
        else:
            neg_out.write(tweet)
    return skipped


def main():
    """Split the sentiment CSV into the .pos and .neg tweet files."""
    try:
        # The input is opened first so that a missing dataset does not
        # truncate output files left over from a previous run.
        full_dataset = open(
            "twitter-sentiment-dataset/sentiment-dataset.csv", "r")
        pos_dataset = open("twitter-sentiment-dataset/tw-data.pos", "w")
        neg_dataset = open("twitter-sentiment-dataset/tw-data.neg", "w")
    except IOError:
        # Call form prints the same text on Python 2 and Python 3.
        print("Failed to open file")
        raise SystemExit(1)
    # Close (and therefore flush) all three files even if processing fails.
    with full_dataset, pos_dataset, neg_dataset:
        # readlines() gives tqdm a length, so the bar can show a total.
        split_dataset(tqdm(full_dataset.readlines()), pos_dataset, neg_dataset)


if __name__ == "__main__":
    main()