1
+ {
2
+ "cells" : [
3
+ {
4
+ "cell_type" : "code" ,
5
+ "execution_count" : 1 ,
6
+ "metadata" : {
7
+ "collapsed" : true ,
8
+ "pycharm" : {
9
+ "is_executing" : false
10
+ }
11
+ },
12
+ "outputs" : [],
13
+ "source" : [
14
+ "from pyspark.sql import SparkSession\n" ,
15
+ "import pyspark.sql.functions as func\n" ,
16
+ "from pyspark.sql.types import StructType, StructField, StringType, LongType, DateType, DoubleType, BooleanType, \\\n" ,
17
+ "    ArrayType"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type" : "code" ,
22
+ "execution_count" : 40 ,
23
+ "outputs" : [],
24
+ "source" : [
25
+ "spark = SparkSession.builder.appName(\"YelpHelp\")\\\n" ,
26
+ "    .master(\"local\")\\\n" ,
27
+ "    .config(\"spark.executor.memory\", \"16g\")\\\n" ,
28
+ "    .config(\"spark.driver.memory\", \"16g\")\\\n" ,
29
+ "    .getOrCreate()\n" ,
30
+ "\n" ,
31
+ "schema2 = StructType([\n" ,
32
+ "    StructField(\"business_id\", StringType(), True),\n" ,
33
+ "    StructField(\"date\", StringType(), True) \n" ,
34
+ "])\n" ,
35
+ "\n" ,
36
+ "\n" ,
37
+ "dataset2 = spark.read.json(\"../yelp_dataset/yelp_academic_dataset_checkin.json\", schema=schema2)"
38
+ ],
39
+ "metadata" : {
40
+ "collapsed" : false ,
41
+ "pycharm" : {
42
+ "name" : "#%%\n" ,
43
+ "is_executing" : false
44
+ }
45
+ }
46
+ },
47
+ {
48
+ "cell_type" : "code" ,
49
+ "execution_count" : 41 ,
50
+ "outputs" : [],
51
+ "source" : [
52
+ "import datetime\n" ,
53
+ "def convert(x):\n" ,
54
+ "    x = x.split(', ')\n" ,
55
+ "    return x\n" ,
56
+ "\n" ,
57
+ "def convert2(x):\n" ,
58
+ "    x = [str(datetime.datetime.strptime(i, '%Y-%m-%d %H:%M:%S').date()) for i in x]\n" ,
59
+ "    return x\n" ,
60
+ "\n" ,
61
+ "dataset2 = dataset2.rdd.map(lambda x: (x[0], convert(x[1])))\n" ,
62
+ "dataset2 = dataset2.map(lambda x: (x[0], convert2(x[1])))\n" ,
63
+ "dataset2 = dataset2.toDF()"
64
+ ],
65
+ "metadata" : {
66
+ "collapsed" : false ,
67
+ "pycharm" : {
68
+ "name" : "#%%\n" ,
69
+ "is_executing" : false
70
+ }
71
+ }
72
+ },
73
+ {
74
+ "cell_type" : "code" ,
75
+ "execution_count" : 42 ,
76
+ "outputs" : [
77
+ {
78
+ "name" : "stdout" ,
79
+ "text" : [
80
+ "+--------------------+--------------------+----------+\n" ,
81
+ "|                  _1|                  _2|   Checkin|\n" ,
82
+ "+--------------------+--------------------+----------+\n" ,
83
+ "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-04-26|\n" ,
84
+ "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-08-30|\n" ,
85
+ "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-10-15|\n" ,
86
+ "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-11-18|\n" ,
87
+ "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2017-04-20|\n" ,
88
+ "+--------------------+--------------------+----------+\n" ,
89
+ "only showing top 5 rows\n" ,
90
+ "\n"
91
+ ],
92
+ "output_type" : "stream"
93
+ }
94
+ ],
95
+ "source" : [
96
+ "df_exploded = dataset2.withColumn('Checkin', func.explode('_2'))\n" ,
97
+ "df_exploded.show(5)"
98
+ ],
99
+ "metadata" : {
100
+ "collapsed" : false ,
101
+ "pycharm" : {
102
+ "name" : "#%%\n" ,
103
+ "is_executing" : false
104
+ }
105
+ }
106
+ },
107
+ {
108
+ "cell_type" : "code" ,
109
+ "execution_count" : 43 ,
110
+ "outputs" : [],
111
+ "source" : [
112
+ "df_exploded = df_exploded.select(func.col(\"_1\").alias(\"business_id\"), func.col(\"Checkin\").alias(\"checkin\").cast(DateType()))"
113
+ ],
114
+ "metadata" : {
115
+ "collapsed" : false ,
116
+ "pycharm" : {
117
+ "name" : "#%%\n" ,
118
+ "is_executing" : false
119
+ }
120
+ }
121
+ },
122
+ {
123
+ "cell_type" : "code" ,
124
+ "execution_count" : 44 ,
125
+ "outputs" : [
126
+ {
127
+ "name" : "stdout" ,
128
+ "text" : [
129
+ "+--------------------+----------+\n" ,
130
+ "|         business_id|   checkin|\n" ,
131
+ "+--------------------+----------+\n" ,
132
+ "|--1UhMGODdWsrMast...|2016-04-26|\n" ,
133
+ "|--1UhMGODdWsrMast...|2016-08-30|\n" ,
134
+ "|--1UhMGODdWsrMast...|2016-10-15|\n" ,
135
+ "|--1UhMGODdWsrMast...|2016-11-18|\n" ,
136
+ "|--1UhMGODdWsrMast...|2017-04-20|\n" ,
137
+ "+--------------------+----------+\n" ,
138
+ "only showing top 5 rows\n" ,
139
+ "\n"
140
+ ],
141
+ "output_type" : "stream"
142
+ }
143
+ ],
144
+ "source" : [
145
+ "df_exploded.show(5)"
146
+ ],
147
+ "metadata" : {
148
+ "collapsed" : false ,
149
+ "pycharm" : {
150
+ "name" : "#%%\n" ,
151
+ "is_executing" : false
152
+ }
153
+ }
154
+ },
155
+ {
156
+ "cell_type" : "code" ,
157
+ "execution_count" : 46 ,
158
+ "outputs" : [],
159
+ "source" : [
160
+ "df_exploded = df_exploded.withColumn('year', func.year(\"checkin\")).repartition(10, \"year\")\n" ,
161
+ "df_exploded.write.partitionBy(\"year\").json(\"../YelpDatasetYearly/CheckIns/yelp_academic_dataset_checkin\")"
162
+ ],
163
+ "metadata" : {
164
+ "collapsed" : false ,
165
+ "pycharm" : {
166
+ "name" : "#%%\n" ,
167
+ "is_executing" : false
168
+ }
169
+ }
170
+ },
171
+ {
172
+ "cell_type" : "code" ,
173
+ "execution_count" : null ,
174
+ "outputs" : [],
175
+ "source" : [],
176
+ "metadata" : {
177
+ "collapsed" : false ,
178
+ "pycharm" : {
179
+ "name" : "#%%\n"
180
+ }
181
+ }
182
+ }
183
+ ],
184
+ "metadata" : {
185
+ "kernelspec" : {
186
+ "display_name" : "Python 3" ,
187
+ "language" : "python" ,
188
+ "name" : "python3"
189
+ },
190
+ "language_info" : {
191
+ "codemirror_mode" : {
192
+ "name" : "ipython" ,
193
+ "version" : 2
194
+ },
195
+ "file_extension" : ".py" ,
196
+ "mimetype" : "text/x-python" ,
197
+ "name" : "python" ,
198
+ "nbconvert_exporter" : "python" ,
199
+ "pygments_lexer" : "ipython2" ,
200
+ "version" : "2.7.6"
201
+ },
202
+ "pycharm" : {
203
+ "stem_cell" : {
204
+ "cell_type" : "raw" ,
205
+ "source" : [],
206
+ "metadata" : {
207
+ "collapsed" : false
208
+ }
209
+ }
210
+ }
211
+ },
212
+ "nbformat" : 4 ,
213
+ "nbformat_minor" : 0
214
+ }
0 commit comments