-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze.py
70 lines (61 loc) · 2.97 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import sys
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
def word_in_text(word, text):
    """Return True if the string ``word`` occurs in ``text``, ignoring case.

    The comparison is a literal substring test: ``word`` is NOT interpreted
    as a regular expression, so characters such as ``.``, ``+`` or ``(`` only
    match themselves.

    Args:
        word: The substring to look for.
        text: The string to search in.

    Returns:
        bool: True when ``word`` appears anywhere in ``text`` (case-insensitive),
        False otherwise.
    """
    # Lower-case both sides so that e.g. 'Trump' matches 'TRUMP' and 'trump'.
    return word.lower() in text.lower()
if __name__ == '__main__':
    # Script entry point: read a file of line-delimited JSON tweets (path given
    # as the first command-line argument), count how many tweets mention each
    # candidate name, and display the counts as a bar chart.
    if len(sys.argv) < 2:
        # Passing a string to sys.exit() prints it to stderr and exits with
        # status 1, so a missing argument is reported as a failure.
        sys.exit("Please supply path to data in argument")
    data_path = sys.argv[1]

    # Read each line and keep only JSON objects that carry a 'text' field.
    tweets_data = []
    with open(data_path, 'r') as tweets_file:
        for line in tweets_file:
            try:
                tweet = json.loads(line)
            except ValueError:
                # Blank or malformed line (JSON decode errors are ValueErrors):
                # skip it and keep going, this is a best-effort load.
                continue
            # Guard against valid JSON that is not an object (numbers, lists,
            # strings), which cannot be indexed by field name below.
            if isinstance(tweet, dict) and 'text' in tweet:
                tweets_data.append(tweet)

    # Create a data frame for our data. Lists (not lazy map objects) are used
    # so the column assignment also works on Python 3.
    tweets = pd.DataFrame()
    tweets['text'] = [tweet['text'] for tweet in tweets_data]
    tweets['created_at'] = [tweet['created_at'] for tweet in tweets_data]
    tweets['user_id'] = [tweet['user']['id'] for tweet in tweets_data]
    tweets['user_name'] = [tweet['user']['name'] for tweet in tweets_data]
    tweets['retweets'] = [tweet['retweet_count'] for tweet in tweets_data]
    tweets['favorites'] = [tweet['favorite_count'] for tweet in tweets_data]

    # Find keywords: one boolean column per candidate, True where the tweet
    # text mentions that candidate's name.
    candidates = ['Clinton', 'Sanders', 'Trump', 'Carson', 'Rubio', 'Cruz', 'Bush']
    for candidate in candidates:
        tweets[candidate] = [word_in_text(candidate, text) for text in tweets['text']]

    # Aggregate count of tweets containing each candidate (sum of the True
    # values in the candidate's boolean column).
    tweets_by_candidate = [int(tweets[candidate].sum()) for candidate in candidates]

    # Plot
    x_pos = list(range(len(candidates)))
    width = 0.8
    fig, ax = plt.subplots()
    # align='center' is passed explicitly so that the bars are centred on
    # x_pos regardless of the matplotlib version's default alignment.
    ax.bar(x_pos, tweets_by_candidate, width, alpha=1, color='g', align='center')

    # Setup axis labels
    ax.set_ylabel('Number of tweets', fontsize=15)
    ax.set_title('Number of tweets containing names of popular 2016 election candidates (sample data)', fontsize=10, fontweight='bold')
    # Bars are centred on x_pos, so the tick labels go at x_pos as well.
    ax.set_xticks(x_pos)
    ax.set_xticklabels(candidates)
    ax.grid(True)
    plt.show()