-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreateTables.py
93 lines (82 loc) · 2.82 KB
/
createTables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import sqlite3
import json
conn = sqlite3.connect('Yelp.db')
sqliteScript = conn.executescript(open("schema.sql", "r").read())
setOfCities = {}
print "Opening businesses"
file = open("yelp_academic_dataset_business.json", "r")
print "Parsing businesses"
business_count = 0
for line in file:
business_count += 1
if business_count % 20000 == 0:
print "#" + str(business_count)
c = conn.cursor()
parsed = json.loads(line)
business_id = parsed["business_id"]
review_count = int(parsed["review_count"])
name = parsed["name"]
longitude = float(parsed["longitude"])
latitude = float(parsed["latitude"])
city = parsed["city"]
if city not in setOfCities:
setOfCities[city] = 0
setOfCities[city] += 1
state = parsed["state"]
stars = float(parsed["stars"])
attributeMap = parsed["attributes"]
price_range = 0 if "Price Range" not in attributeMap else attributeMap["Price Range"]
newTuple = (business_id, name, stars, city, state, longitude, latitude, review_count, price_range)
c.execute("INSERT INTO Businesses VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", newTuple)
categories = parsed["categories"]
for category in categories:
c.execute("INSERT INTO Categories VALUES (?, ?)", (business_id, category))
print "Committing businesses"
conn.commit()
print "Finished %s businesses \n" % business_count
sortedCities = sorted(setOfCities.items(), key = lambda x: x[1], reverse = True)
for city, freq in sortedCities:
print "%s: %s" % (city, freq)
print "Opening users"
file = open("yelp_academic_dataset_user.json", "r")
print "Parsing users"
user_count = 0
for line in file:
user_count += 1
if user_count % 200000 == 0:
print "#" + str(user_count)
c = conn.cursor()
parsed = json.loads(line)
user_id = parsed["user_id"]
average_stars = float(parsed["average_stars"])
review_count = int(parsed["review_count"])
fans = int(parsed["fans"])
yelping_since = parsed["yelping_since"] + "-01"
c.execute("INSERT INTO Users VALUES (?, ?, ?, ?, ?)", (user_id, average_stars, review_count, fans, yelping_since))
friends = parsed["friends"]
for friend in friends:
c.execute("INSERT INTO Friends VALUES (?, ?)", (user_id, friend))
print "Committing users"
conn.commit()
print "Finished %s users \n" % user_count
print "Opening reviews"
file = open("yelp_academic_dataset_review.json", "r")
print "Parsing reviews"
review_count = 0
for line in file:
review_count += 1
if review_count % 500000 == 0:
print "#" + str(review_count)
c = conn.cursor()
parsed = json.loads(line)
review_id = parsed["review_id"]
user_id = parsed["user_id"]
business_id = parsed["business_id"]
stars = int(parsed["stars"])
review = parsed["text"]
date = parsed["date"]
c.execute("INSERT INTO Reviews VALUES (?, ?, ?, ?, ?, ?)", (review_id, user_id, business_id, stars, review, date))
print "Committing reviews"
conn.commit()
conn.close()
print "Finished %s reviews" % review_count