Balaviknesh
diff --git a/‎.gitattributes
+2 b/‎.gitattributes
+2
diff --git a/‎Analysis/business.ipynb
+133 b/‎Analysis/business.ipynb
+133
diff --git a/‎Analysis/checkin.ipynb
+214 b/‎Analysis/checkin.ipynb
+214
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
@@ -0,0 +1,133 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true,
+    "pycharm": {
+     "is_executing": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "import pyspark.sql.functions as func\n",
+    "from pyspark.sql.types import StructType, StructField, StringType, DoubleType"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "outputs": [],
+   "source": [
+    "spark = SparkSession.builder.appName(\"YelpHelp\")\\\n",
+    "    .master(\"local\")\\\n",
+    "    .config(\"spark.executor.memory\", \"16g\")\\\n",
+    "    .config(\"spark.driver.memory\", \"16g\")\\\n",
+    "    .getOrCreate()"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "outputs": [],
+   "source": [
+    "hourSchema = StructType({\n",
+    "    StructField(\"Monday\",StringType(),True),\n",
+    "         StructField(\"Tuesday\",StringType(),True),\n",
+    "         StructField(\"Wednesday\",StringType(),True),\n",
+    "         StructField(\"Thursday\",StringType(),True),\n",
+    "        StructField(\"Friday\",StringType(),True),\n",
+    "         StructField(\"Sunday\",StringType(),True),\n",
+    "         StructField(\"Saturday\",StringType(),True)})\n",
+    "b_schema = StructType([\n",
+    "    StructField(\"business_id\", StringType(), False),\n",
+    "    StructField(\"name\",StringType(),True),\n",
+    "    StructField(\"address\",StringType(),True),\n",
+    "    StructField(\"city\",StringType(),True),\n",
+    "    StructField(\"state\",StringType(),True),\n",
+    "    StructField(\"postal_code\",StringType(),True),\n",
+    "    StructField(\"latitude\",DoubleType(),True),\n",
+    "    StructField(\"longitude\",DoubleType(),True),\n",
+    "    StructField(\"categories\", StringType(),True),\n",
+    "    StructField(\"hours\", hourSchema, True)\n",
+    "    \n",
+    "])\n",
+    "businessDataset = spark.read.json(\"../yelp_dataset/yelp_academic_dataset_business.json\", schema=b_schema)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "outputs": [],
+   "source": [
+    "businessDataset.write.json(\"../YelpDatasetYearly/Business/yelp_academic_dataset_business\")\n",
+    "\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
@@ -0,0 +1,214 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true,
+    "pycharm": {
+     "is_executing": false
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql import SparkSession\n",
+    "import pyspark.sql.functions as func\n",
+    "from pyspark.sql.types import StructType, StructField, StringType, LongType, DateType, DoubleType, BooleanType, \\\n",
+    "    ArrayType"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "outputs": [],
+   "source": [
+    "spark = SparkSession.builder.appName(\"YelpHelp\")\\\n",
+    "    .master(\"local\")\\\n",
+    "    .config(\"spark.executor.memory\", \"16g\")\\\n",
+    "    .config(\"spark.driver.memory\", \"16g\")\\\n",
+    "    .getOrCreate()\n",
+    "\n",
+    "schema2 = StructType([\n",
+    "    StructField(\"business_id\", StringType(), True),\n",
+    "    StructField(\"date\", StringType(), True)  \n",
+    "])\n",
+    "\n",
+    "\n",
+    "dataset2 = spark.read.json(\"../yelp_dataset/yelp_academic_dataset_checkin.json\", schema=schema2)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "def convert(x):\n",
+    "    x = x.split(', ')\n",
+    "    return  x\n",
+    "\n",
+    "def convert2(x):\n",
+    "    x = [str(datetime.datetime.strptime(i, '%Y-%m-%d %H:%M:%S').date())  for i in x]\n",
+    "    return x\n",
+    "\n",
+    "dataset2 = dataset2.rdd.map(lambda x: (x[0], convert(x[1])))\n",
+    "dataset2 = dataset2.map(lambda x: (x[0], convert2(x[1])))\n",
+    "dataset2 = dataset2.toDF()"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": [
+      "+--------------------+--------------------+----------+\n",
+      "|                  _1|                  _2|   Checkin|\n",
+      "+--------------------+--------------------+----------+\n",
+      "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-04-26|\n",
+      "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-08-30|\n",
+      "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-10-15|\n",
+      "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2016-11-18|\n",
+      "|--1UhMGODdWsrMast...|[2016-04-26, 2016...|2017-04-20|\n",
+      "+--------------------+--------------------+----------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ],
+     "output_type": "stream"
+    }
+   ],
+   "source": [
+    "df_exploded = dataset2.withColumn('Checkin', func.explode('_2'))\n",
+    "df_exploded.show(5)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "outputs": [],
+   "source": [
+    "df_exploded = df_exploded.select(func.col(\"_1\").alias(\"business_id\"), func.col(\"Checkin\").alias(\"checkin\").cast(DateType()))"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "outputs": [
+    {
+     "name": "stdout",
+     "text": [
+      "+--------------------+----------+\n",
+      "|         business_id|   checkin|\n",
+      "+--------------------+----------+\n",
+      "|--1UhMGODdWsrMast...|2016-04-26|\n",
+      "|--1UhMGODdWsrMast...|2016-08-30|\n",
+      "|--1UhMGODdWsrMast...|2016-10-15|\n",
+      "|--1UhMGODdWsrMast...|2016-11-18|\n",
+      "|--1UhMGODdWsrMast...|2017-04-20|\n",
+      "+--------------------+----------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ],
+     "output_type": "stream"
+    }
+   ],
+   "source": [
+    "df_exploded.show(5)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "outputs": [],
+   "source": [
+    "df_exploded = df_exploded.withColumn('year', func.year(\"checkin\")).repartition(10, \"year\")\n",
+    "df_exploded.write.partitionBy(\"year\").json(\"../YelpDatasetYearly/CheckIns/yelp_academic_dataset_checkin\")"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": false
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Auto detect text files and perform LF normalization`
	`2`	`+* text=auto`