Skip to content

Commit 4e928b0

Browse files
committed
Initial Commit
0 parents  commit 4e928b0

25 files changed

+1491
-0
lines changed

.gitattributes

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Auto detect text files and perform LF normalization
2+
* text=auto

Analysis/business.ipynb

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {
7+
"collapsed": true,
8+
"pycharm": {
9+
"is_executing": false
10+
}
11+
},
12+
"outputs": [],
13+
"source": [
14+
"from pyspark.sql import SparkSession\n",
15+
"import pyspark.sql.functions as func\n",
16+
"from pyspark.sql.types import StructType, StructField, StringType, DoubleType"
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": 2,
22+
"outputs": [],
23+
"source": [
24+
"spark = (SparkSession.builder  # parenthesised chain: one group of builder options per line\n",
25+
" .appName(\"YelpHelp\").master(\"local\")\n",
26+
" .config(\"spark.executor.memory\", \"16g\")\n",
27+
" .config(\"spark.driver.memory\", \"16g\")\n",
28+
" .getOrCreate())"
29+
],
30+
"metadata": {
31+
"collapsed": false,
32+
"pycharm": {
33+
"name": "#%%\n",
34+
"is_executing": false
35+
}
36+
}
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 3,
41+
"outputs": [],
42+
"source": [
43+
"hourSchema = StructType([  # a list, not a set: keeps the day fields in the order written here\n",
44+
" StructField(\"Monday\",StringType(),True),\n",
45+
" StructField(\"Tuesday\",StringType(),True),\n",
46+
" StructField(\"Wednesday\",StringType(),True),\n",
47+
" StructField(\"Thursday\",StringType(),True),\n",
48+
" StructField(\"Friday\",StringType(),True),\n",
49+
" StructField(\"Sunday\",StringType(),True),\n",
50+
" StructField(\"Saturday\",StringType(),True)])\n",
51+
"b_schema = StructType([  # columns read from each business record; hours is the nested hourSchema struct\n",
52+
" StructField(\"business_id\", StringType(), False),\n",
53+
" StructField(\"name\", StringType(), True),\n",
54+
" StructField(\"address\", StringType(), True),\n",
55+
" StructField(\"city\", StringType(), True),\n",
56+
" StructField(\"state\", StringType(), True),\n",
57+
" StructField(\"postal_code\", StringType(), True),\n",
58+
" StructField(\"latitude\", DoubleType(), True),\n",
59+
" StructField(\"longitude\", DoubleType(), True),\n",
60+
" StructField(\"categories\", StringType(), True),\n",
61+
" StructField(\"hours\", hourSchema, True),\n",
62+
" \n",
63+
"])\n",
64+
"businessDataset = spark.read.schema(b_schema).json(\"../yelp_dataset/yelp_academic_dataset_business.json\")"
65+
],
66+
"metadata": {
67+
"collapsed": false,
68+
"pycharm": {
69+
"name": "#%%\n",
70+
"is_executing": false
71+
}
72+
}
73+
},
74+
{
75+
"cell_type": "code",
76+
"execution_count": 4,
77+
"outputs": [],
78+
"source": [
79+
"businessDataset.write.mode(\"overwrite\").json(\"../YelpDatasetYearly/Business/yelp_academic_dataset_business\")  # overwrite: re-running the cell replaces the earlier export instead of failing\n",
80+
"\n"
81+
],
82+
"metadata": {
83+
"collapsed": false,
84+
"pycharm": {
85+
"name": "#%%\n",
86+
"is_executing": false
87+
}
88+
}
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": null,
93+
"outputs": [],
94+
"source": [],
95+
"metadata": {
96+
"collapsed": false,
97+
"pycharm": {
98+
"name": "#%%\n"
99+
}
100+
}
101+
}
102+
],
103+
"metadata": {
104+
"kernelspec": {
105+
"display_name": "Python 3",
106+
"language": "python",
107+
"name": "python3"
108+
},
109+
"language_info": {
110+
"codemirror_mode": {
111+
"name": "ipython",
112+
"version": 2
113+
},
114+
"file_extension": ".py",
115+
"mimetype": "text/x-python",
116+
"name": "python",
117+
"nbconvert_exporter": "python",
118+
"pygments_lexer": "ipython2",
119+
"version": "2.7.6"
120+
},
121+
"pycharm": {
122+
"stem_cell": {
123+
"cell_type": "raw",
124+
"source": [],
125+
"metadata": {
126+
"collapsed": false
127+
}
128+
}
129+
}
130+
},
131+
"nbformat": 4,
132+
"nbformat_minor": 0
133+
}

Analysis/checkin.ipynb

+214
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {
7+
"collapsed": true,
8+
"pycharm": {
9+
"is_executing": false
10+
}
11+
},
12+
"outputs": [],
13+
"source": [
14+
"from pyspark.sql import SparkSession\n",
15+
"import pyspark.sql.functions as func\n",
16+
"from pyspark.sql.types import StructType, StructField, StringType, LongType, DateType, DoubleType, BooleanType, \\\n",
17+
" ArrayType"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 40,
23+
"outputs": [],
24+
"source": [
25+
"spark = (SparkSession.builder  # parenthesised chain: one group of builder options per line\n",
26+
" .appName(\"YelpHelp\").master(\"local\")\n",
27+
" .config(\"spark.executor.memory\", \"16g\")\n",
28+
" .config(\"spark.driver.memory\", \"16g\")\n",
29+
" .getOrCreate())\n",
30+
"\n",
31+
"schema2 = (StructType()  # both check-in columns are read as nullable strings\n",
32+
" .add(\"business_id\", StringType(), True)\n",
33+
" .add(\"date\", StringType(), True)\n",
34+
")\n",
35+
"\n",
36+
"\n",
37+
"dataset2 = spark.read.schema(schema2).json(\"../yelp_dataset/yelp_academic_dataset_checkin.json\")"
38+
],
39+
"metadata": {
40+
"collapsed": false,
41+
"pycharm": {
42+
"name": "#%%\n",
43+
"is_executing": false
44+
}
45+
}
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": 41,
50+
"outputs": [],
51+
"source": [
52+
"import datetime\n",
53+
"def convert(x):  # split a ', '-separated string into a list; null or empty input gives []\n",
54+
" x = x.split(', ') if x else []\n",
55+
" return x\n",
56+
"\n",
57+
"def convert2(x):  # reduce each 'YYYY-MM-DD HH:MM:SS' string to its 'YYYY-MM-DD' date part\n",
58+
" parse = datetime.datetime.strptime\n",
59+
" return [str(parse(stamp, '%Y-%m-%d %H:%M:%S').date()) for stamp in x]\n",
60+
"\n",
61+
"dataset2 = (dataset2.rdd  # each row becomes (first column, list of date strings); toDF() names the columns _1 and _2\n",
62+
" .map(lambda row: (row[0], convert2(convert(row[1]))))\n",
63+
" .toDF())"
64+
],
65+
"metadata": {
66+
"collapsed": false,
67+
"pycharm": {
68+
"name": "#%%\n",
69+
"is_executing": false
70+
}
71+
}
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 42,
76+
"outputs": [
77+
{
78+
"name": "stdout",
79+
"text": [
80+
"+--------------------+--------------------+----------+\n",
81+
"| _1| _2| Checkin|\n",
82+
"+--------------------+--------------------+----------+\n",
83+
"|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-04-26|\n",
84+
"|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-08-30|\n",
85+
"|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-10-15|\n",
86+
"|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-11-18|\n",
87+
"|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2017-04-20|\n",
88+
"+--------------------+--------------------+----------+\n",
89+
"only showing top 5 rows\n",
90+
"\n"
91+
],
92+
"output_type": "stream"
93+
}
94+
],
95+
"source": [
96+
"df_exploded = dataset2.withColumn('Checkin', func.explode(func.col('_2')))  # one output row per element of the _2 array\n",
97+
"df_exploded.show(n=5)"
98+
],
99+
"metadata": {
100+
"collapsed": false,
101+
"pycharm": {
102+
"name": "#%%\n",
103+
"is_executing": false
104+
}
105+
}
106+
},
107+
{
108+
"cell_type": "code",
109+
"execution_count": 43,
110+
"outputs": [],
111+
"source": [
112+
"df_exploded = df_exploded.select(func.col(\"_1\").alias(\"business_id\"), func.col(\"Checkin\").cast(DateType()).alias(\"checkin\"))  # rename _1 and Checkin, casting the date strings to DateType"
113+
],
114+
"metadata": {
115+
"collapsed": false,
116+
"pycharm": {
117+
"name": "#%%\n",
118+
"is_executing": false
119+
}
120+
}
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": 44,
125+
"outputs": [
126+
{
127+
"name": "stdout",
128+
"text": [
129+
"+--------------------+----------+\n",
130+
"| business_id| checkin|\n",
131+
"+--------------------+----------+\n",
132+
"|--1UhMGODdWsrMast...|2016-04-26|\n",
133+
"|--1UhMGODdWsrMast...|2016-08-30|\n",
134+
"|--1UhMGODdWsrMast...|2016-10-15|\n",
135+
"|--1UhMGODdWsrMast...|2016-11-18|\n",
136+
"|--1UhMGODdWsrMast...|2017-04-20|\n",
137+
"+--------------------+----------+\n",
138+
"only showing top 5 rows\n",
139+
"\n"
140+
],
141+
"output_type": "stream"
142+
}
143+
],
144+
"source": [
145+
"df_exploded.show(5)"
146+
],
147+
"metadata": {
148+
"collapsed": false,
149+
"pycharm": {
150+
"name": "#%%\n",
151+
"is_executing": false
152+
}
153+
}
154+
},
155+
{
156+
"cell_type": "code",
157+
"execution_count": 46,
158+
"outputs": [],
159+
"source": [
160+
"df_exploded = df_exploded.withColumn('year', func.year(\"checkin\")).repartition(10, \"year\")  # year is used for both the repartition and the output partitioning\n",
161+
"df_exploded.write.mode(\"overwrite\").partitionBy(\"year\").json(\"../YelpDatasetYearly/CheckIns/yelp_academic_dataset_checkin\")  # overwrite: re-running the cell replaces the earlier export instead of failing"
162+
],
163+
"metadata": {
164+
"collapsed": false,
165+
"pycharm": {
166+
"name": "#%%\n",
167+
"is_executing": false
168+
}
169+
}
170+
},
171+
{
172+
"cell_type": "code",
173+
"execution_count": null,
174+
"outputs": [],
175+
"source": [],
176+
"metadata": {
177+
"collapsed": false,
178+
"pycharm": {
179+
"name": "#%%\n"
180+
}
181+
}
182+
}
183+
],
184+
"metadata": {
185+
"kernelspec": {
186+
"display_name": "Python 3",
187+
"language": "python",
188+
"name": "python3"
189+
},
190+
"language_info": {
191+
"codemirror_mode": {
192+
"name": "ipython",
193+
"version": 2
194+
},
195+
"file_extension": ".py",
196+
"mimetype": "text/x-python",
197+
"name": "python",
198+
"nbconvert_exporter": "python",
199+
"pygments_lexer": "ipython2",
200+
"version": "2.7.6"
201+
},
202+
"pycharm": {
203+
"stem_cell": {
204+
"cell_type": "raw",
205+
"source": [],
206+
"metadata": {
207+
"collapsed": false
208+
}
209+
}
210+
}
211+
},
212+
"nbformat": 4,
213+
"nbformat_minor": 0
214+
}

0 commit comments

Comments
 (0)