-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateInfoText.py
executable file
·96 lines (86 loc) · 2.43 KB
/
createInfoText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from dbHelper import *
import re
import Stemmer
from nltk.corpus import stopwords
# English stop words held in a set so the per-word membership test in the
# user loop is a hash lookup.
stops = set(stopwords.words('english'))
# One stemmer instance shared by the whole script; stemWord() is called on
# every word that survives the stop-word / digit filter.
stemmer = Stemmer.Stemmer("english")
# Output file: the user loop writes one "|"-separated line per user and the
# script closes it afterwards. "w+" truncates any existing file.
# NOTE(review): the name `file` shadows the Python 2 builtin; it is kept
# because the rest of the script refers to it by this name.
file = open("graphAttributesCharlotte.txt", "w+")
# Build friendsMap: user1 id -> set of user2 ids taken from the Friends table.
# Only the user1 -> user2 direction of each row is recorded.
# NOTE(review): presumably Friends stores both directions of a friendship;
# confirm against the schema if reverse lookups are ever needed.
print("Creating friends map.")
friendsMap = {}
friend_rows = selectFromWhere("user1, user2", "Friends")
for row in friend_rows:
    source_id, target_id = row["user1"], row["user2"]
    # setdefault creates the empty set the first time a user is seen.
    friendsMap.setdefault(source_id, set()).add(target_id)
print("Finished creating friends map")
# Build reviewsMap: user id -> list of (review_id, business_id, stars)
# tuples, in the order the rows come back from the Reviews query.
print("Creating review map.")
reviewsMap = {}
review_rows = selectFromWhere("review_id, user_id, business_id, stars", "Reviews")
for row in review_rows:
    rid, author_id, bid, rating = (
        row["review_id"],
        row["user_id"],
        row["business_id"],
        row["stars"],
    )
    # setdefault creates the empty list the first time an author is seen.
    reviewsMap.setdefault(author_id, []).append((rid, bid, rating))
print("Finished creating review map")
# Write one line per user to the output file, in the form
#
#     user_id|bus,stars,bus,stars,...|friend,friend,...|stem,count,stem,count,...
#
# Any of the three lists may be empty, in which case its field is empty but
# the "|" separators are still written.
users = selectUsers("*")

# Compiled once up front: a token is a run of word characters / apostrophes.
word_pattern = re.compile(r"[\w']+")

# allWords = {}
try:
    for user_count, user in enumerate(users, 1):
        # Progress marker every 10,000 users.
        if user_count % 10000 == 0:
            print("User %s" % user_count)

        user_id = user["user_id"]

        review_fields = []
        wordMap = {}  # stem -> occurrences across all of this user's reviews
        for review_id, business_id, stars in reviewsMap.get(user_id, []):
            review_fields.append("%s,%s" % (business_id, stars))

            # NOTE(review): this issues one query per review, and the WHERE
            # clause is assembled by string formatting. review_id originates
            # from the Reviews table itself, but if selectFromWhere offers a
            # parameterised form it should be used here instead.
            review_text = selectFromWhere(
                "review", "Reviews", "review_id=\"%s\"" % review_id
            )[0]["review"]

            for word in word_pattern.findall(review_text):
                word = word.lower()
                # Stop words and purely numeric tokens are not counted.
                if word in stops or word.isdigit():
                    continue
                word = stemmer.stemWord(word)
                wordMap[word] = wordMap.get(word, 0) + 1
                # if word not in allWords:
                #     allWords[word] = 0
                # allWords[word] += 1

        # Users without an entry in friendsMap get an empty friends field;
        # None entries (rows whose user2 was NULL) are skipped.
        friend_fields = [
            friend for friend in friendsMap.get(user_id, ()) if friend is not None
        ]
        word_fields = ["%s,%s" % (word, count) for word, count in wordMap.items()]

        line = "|".join([
            user_id,
            ",".join(review_fields),
            ",".join(friend_fields),
            ",".join(word_fields),
        ])
        file.write(line + "\n")
finally:
    # Close the output even if a query or the stemmer raises part-way
    # through, so the lines already written are flushed to disk.
    file.close()
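# Disabled diagnostic: prints the ten most frequent stems over all users.
# It needs the commented-out allWords bookkeeping in the user loop to be
# re-enabled as well.
# print "Top 10 words:"
# sortedWords = sorted(allWords.items(), key=lambda x: x[1], reverse=True)
# for i in xrange(10):
#     word = sortedWords[i]
#     print word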