|
103 | 103 | "cell_type": "markdown",
|
104 | 104 | "metadata": {},
|
105 | 105 | "source": [
|
| 106 | + "From the following [reference](https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html):\n", |
| 107 | + "\n", |
106 | 108 | "A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. "
|
107 | 109 | ]
|
108 | 110 | },
|
| 111 | + { |
| 112 | + "cell_type": "markdown", |
| 113 | + "metadata": {}, |
| 114 | + "source": [ |
| 115 | + "Create a DataFrame from JSON files on S3:" |
| 116 | + ] |
| 117 | + }, |
| 118 | + { |
| 119 | + "cell_type": "code", |
| 120 | + "execution_count": null, |
| 121 | + "metadata": { |
| 122 | + "collapsed": false |
| 123 | + }, |
| 124 | + "outputs": [], |
| 125 | + "source": [ |
| 126 | + "users = context.load(\"s3n://path/to/users.json\", \"json\")" |
| 127 | + ] |
| 128 | + }, |
| 129 | + { |
| 130 | + "cell_type": "markdown", |
| 131 | + "metadata": {}, |
| 132 | + "source": [ |
| 133 | + "Create a new DataFrame that contains “young users” only:" |
| 134 | + ] |
| 135 | + }, |
| 136 | + { |
| 137 | + "cell_type": "code", |
| 138 | + "execution_count": null, |
| 139 | + "metadata": { |
| 140 | + "collapsed": true |
| 141 | + }, |
| 142 | + "outputs": [], |
| 143 | + "source": [ |
| 144 | + "young = users.filter(users.age < 21)" |
| 145 | + ] |
| 146 | + }, |
| 147 | + { |
| 148 | + "cell_type": "markdown", |
| 149 | + "metadata": {}, |
| 150 | + "source": [ |
| 151 | + "Alternatively, using Pandas-like syntax:" |
| 152 | + ] |
| 153 | + }, |
| 154 | + { |
| 155 | + "cell_type": "code", |
| 156 | + "execution_count": null, |
| 157 | + "metadata": { |
| 158 | + "collapsed": true |
| 159 | + }, |
| 160 | + "outputs": [], |
| 161 | + "source": [ |
| 162 | + "young = users[users.age < 21]" |
| 163 | + ] |
| 164 | + }, |
| 165 | + { |
| 166 | + "cell_type": "markdown", |
| 167 | + "metadata": {}, |
| 168 | + "source": [ |
| 169 | + "Increment everybody’s age by 1:" |
| 170 | + ] |
| 171 | + }, |
| 172 | + { |
| 173 | + "cell_type": "code", |
| 174 | + "execution_count": null, |
| 175 | + "metadata": { |
| 176 | + "collapsed": true |
| 177 | + }, |
| 178 | + "outputs": [], |
| 179 | + "source": [ |
| 180 | + "young.select(young.name, young.age+1)" |
| 181 | + ] |
| 182 | + }, |
| 183 | + { |
| 184 | + "cell_type": "markdown", |
| 185 | + "metadata": {}, |
| 186 | + "source": [ |
| 187 | + "Count the number of young users by gender:" |
| 188 | + ] |
| 189 | + }, |
| 190 | + { |
| 191 | + "cell_type": "code", |
| 192 | + "execution_count": null, |
| 193 | + "metadata": { |
| 194 | + "collapsed": true |
| 195 | + }, |
| 196 | + "outputs": [], |
| 197 | + "source": [ |
| 198 | + "young.groupBy(\"gender\").count()" |
| 199 | + ] |
| 200 | + }, |
| 201 | + { |
| 202 | + "cell_type": "markdown", |
| 203 | + "metadata": {}, |
| 204 | + "source": [ |
| 205 | + "Join young users with another DataFrame called logs:" |
| 206 | + ] |
| 207 | + }, |
| 208 | + { |
| 209 | + "cell_type": "code", |
| 210 | + "execution_count": null, |
| 211 | + "metadata": { |
| 212 | + "collapsed": false |
| 213 | + }, |
| 214 | + "outputs": [], |
| 215 | + "source": [ |
| 216 | + "young.join(logs, logs.userId == young.userId, \"left_outer\")" |
| 217 | + ] |
| 218 | + }, |
| 219 | + { |
| 220 | + "cell_type": "markdown", |
| 221 | + "metadata": {}, |
| 222 | + "source": [ |
| 223 | + "Count the number of users in the young DataFrame:" |
| 224 | + ] |
| 225 | + }, |
| 226 | + { |
| 227 | + "cell_type": "code", |
| 228 | + "execution_count": null, |
| 229 | + "metadata": { |
| 230 | + "collapsed": true |
| 231 | + }, |
| 232 | + "outputs": [], |
| 233 | + "source": [ |
| 234 | + "young.registerTempTable(\"young\")\n", |
| 235 | + "context.sql(\"SELECT count(*) FROM young\")" |
| 236 | + ] |
| 237 | + }, |
| 238 | + { |
| 239 | + "cell_type": "markdown", |
| 240 | + "metadata": {}, |
| 241 | + "source": [ |
| 242 | + "Convert Spark DataFrame to Pandas:" |
| 243 | + ] |
| 244 | + }, |
| 245 | + { |
| 246 | + "cell_type": "code", |
| 247 | + "execution_count": null, |
| 248 | + "metadata": { |
| 249 | + "collapsed": true |
| 250 | + }, |
| 251 | + "outputs": [], |
| 252 | + "source": [ |
| 253 | + "pandas_df = young.toPandas()" |
| 254 | + ] |
| 255 | + }, |
| 256 | + { |
| 257 | + "cell_type": "markdown", |
| 258 | + "metadata": {}, |
| 259 | + "source": [ |
| 260 | + "Create a Spark DataFrame from Pandas:" |
| 261 | + ] |
| 262 | + }, |
| 263 | + { |
| 264 | + "cell_type": "code", |
| 265 | + "execution_count": null, |
| 266 | + "metadata": { |
| 267 | + "collapsed": true |
| 268 | + }, |
| 269 | + "outputs": [], |
| 270 | + "source": [ |
| 271 | + "spark_df = context.createDataFrame(pandas_df)" |
| 272 | + ] |
| 273 | + }, |
109 | 274 | {
|
110 | 275 | "cell_type": "markdown",
|
111 | 276 | "metadata": {},
|
|
129 | 294 | "cell_type": "markdown",
|
130 | 295 | "metadata": {},
|
131 | 296 | "source": [
|
132 |
| - "Create a dataframe based on the content of a file:" |
| 297 | + "Create a DataFrame based on the content of a file:" |
133 | 298 | ]
|
134 | 299 | },
|
135 | 300 | {
|
|
212 | 377 | },
|
213 | 378 | "outputs": [],
|
214 | 379 | "source": [
|
215 |
| - "df.filter(df.column_name > 10)" |
| 380 | + "df.filter(df.column_name > 10)" |
216 | 381 | ]
|
217 | 382 | },
|
218 | 383 | {
|
|
284 | 449 | },
|
285 | 450 | "outputs": [],
|
286 | 451 | "source": [
|
287 |
| - "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\") #the result is a RDD" |
| 452 | + "rdd_from_df = sqlContext.sql(\"SELECT * FROM dataframe_name\")" |
288 | 453 | ]
|
289 | 454 | },
|
290 | 455 | {
|
|
0 commit comments