diff --git a/Getting_Started/Part_2_Reading_Spatial_Files.ipynb b/Getting_Started/Part_2_Reading_Spatial_Files.ipynb index 5f182af..d9fd91a 100644 --- a/Getting_Started/Part_2_Reading_Spatial_Files.ipynb +++ b/Getting_Started/Part_2_Reading_Spatial_Files.ipynb @@ -1159,7 +1159,7 @@ "id": "11615753-6b49-4168-8247-ad09ac06500f", "metadata": {}, "source": [ - "Below is the **point location** we are querying against the **raster dataframe**:\n", + "Below is the **point location** we are querying against the **raster dataframe** (Image: Google Maps):\n", "\n", "![Query Area](https://i.ibb.co/W4K0Lg90/Clean-Shot-2025-02-05-at-12-46-59-2x.png) " ] diff --git a/Getting_Started/Part_3_Accelerating_Geospatial_Datasets.ipynb b/Getting_Started/Part_3_Accelerating_Geospatial_Datasets.ipynb index fa2d143..d58177f 100644 --- a/Getting_Started/Part_3_Accelerating_Geospatial_Datasets.ipynb +++ b/Getting_Started/Part_3_Accelerating_Geospatial_Datasets.ipynb @@ -335,12 +335,12 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "id": "be203bd6-60e3-4a05-b0d3-d366eb7e1467", "metadata": {}, "outputs": [], "source": [ - "name = 'matt'" + "name = 'your_name'" ] }, { @@ -530,52 +530,10 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "id": "39e7cc5d-23b5-4359-84d1-bcba410da18e", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "25/02/06 02:46:15 ERROR TaskSchedulerImpl: Lost executor 66 on 10.1.50.156: 308]\n", - "The executor with id 66 exited with exit code -1(unexpected).\n", - "\n", - "\n", - "\n", - "The API gave the following container statuses:\n", - "\n", - "\n", - "\t container name: spark-kubernetes-executor\n", - "\t container image: 329898491045.dkr.ecr.us-west-2.amazonaws.com/wherobots-spark:v1.5.0-db-12565648598\n", - "\t container state: running\n", - "\t container started at: 2025-02-05T22:36:12Z\n", - " \n", - "25/02/06 02:46:56 ERROR TaskSchedulerImpl: Lost executor 89 on 10.1.48.146: 30]\n", - "The executor with id 89 exited with exit code -1(unexpected).\n", - "\n", - "\n", - "\n", - "The API gave the following container statuses:\n", - "\n", - "\n", - "\t container name: spark-kubernetes-executor\n", - "\t container image: 329898491045.dkr.ecr.us-west-2.amazonaws.com/wherobots-spark:v1.5.0-db-12565648598\n", - "\t container state: running\n", - "\t container started at: 2025-02-06T01:50:45Z\n", - " \n", - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 79.2 ms, sys: 28.7 ms, total: 108 ms\n", - "Wall time: 3min 52s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "\n", diff --git a/Getting_Started/assets/conf/config.json b/Getting_Started/assets/conf/config.json index 83596b0..0c2ecc6 100644 --- a/Getting_Started/assets/conf/config.json +++ b/Getting_Started/assets/conf/config.json @@ -130,7 +130,7 @@ "pitch": 54.90395491103014, "zoom": 12.456022235128147, "isSplit": false}, - "mapStyle": {"styleType": "dark", + "mapStyle": {"styleType": "dark-matter", "topLayerGroups": {}, "visibleLayerGroups": {"label": true, "road": true, diff --git a/Reading_and_Writing_Data/Creating_Efficient_GeoParquet_Files.ipynb b/Reading_and_Writing_Data/Creating_Efficient_GeoParquet_Files.ipynb new file mode 100644 index 0000000..76c6394 --- /dev/null +++ b/Reading_and_Writing_Data/Creating_Efficient_GeoParquet_Files.ipynb @@ -0,0 +1,660 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "64d6f4dc-0858-4045-b96a-010a3c815824", + "metadata": {}, + 
"source": [ + "# 📄 Notebook Overview: GeoParquet Transformation Pipeline\n", + "\n", + "This notebook demonstrates how to process and optimize geospatial performance data from Ookla using Apache Sedona and Apache Spark. It focuses on reading mobile and fixed broadband datasets stored in Parquet format from a public S3 bucket, performing spatial transformations, and writing the outputs in the GeoParquet format. The main goals of the notebook include:\n", + "\n", + "- Enabling scalable, distributed spatial data processing.\n", + "- Extracting temporal metadata (year, quarter) from file paths.\n", + "- Converting WKT columns into geometries and calculating spatial attributes like bounding boxes and geohashes.\n", + "- Optimizing datasets through strategic repartitioning and sorting.\n", + "- Writing the final results using the GeoParquet specification with custom CRS and compression.\n", + "\n", + "By the end of this workflow, you’ll have a highly optimized, cloud-native format (GeoParquet) ready for scalable querying and analysis, tailored for large-scale spatial datasets." + ] + }, + { + "cell_type": "markdown", + "id": "6333bc18-0936-42db-85b9-113603a07836", + "metadata": {}, + "source": [ + "# 🔧 Setting Up Sedona Connection" + ] + }, + { + "cell_type": "markdown", + "id": "28756d69-dd02-4836-940c-7fde8fd964dd", + "metadata": {}, + "source": [ + "This block initializes **Apache Sedona**, an extension of Apache Spark for spatial data processing. It:\n", + "\n", + "- Imports Sedona and Spark SQL functions.\n", + "- Configures Spark to access the Ookla Open Data S3 bucket anonymously.\n", + "- Creates a `SedonaContext` instance that enables spatial processing capabilities.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "15f3e28b-47d7-47af-acef-f7ec3d60123a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-04-01T18:11:02.134749Z", + "iopub.status.busy": "2025-04-01T18:11:02.133689Z", + "iopub.status.idle": "2025-04-01T18:13:01.546677Z", + "shell.execute_reply": "2025-04-01T18:13:01.545880Z", + "shell.execute_reply.started": "2025-04-01T18:11:02.134723Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + " \r" + ] + } + ], + "source": [ + "from sedona.spark import *\n", + "import pyspark.sql.functions as f\n", + "\n", + "config = SedonaContext. \\\n", + " builder(). \\\n", + " config(\"spark.hadoop.fs.s3a.bucket.ookla-open-data.aws.credentials.provider\",\"org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider\"). 
+    "    .getOrCreate()\n",
+    "\n",
+    "sedona = SedonaContext.create(config)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "517e6f08-8297-4464-b765-add07dd8bc12",
+   "metadata": {},
+   "source": [
+    "# 📱 Loading and Transforming the Mobile and Fixed Parquet Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e801bd37-478d-4c35-9069-1fd08c44403d",
+   "metadata": {},
+   "source": [
+    "The walkthrough below uses the mobile dataset; the two code cells that follow apply the identical pipeline to the fixed and mobile datasets.\n",
+    "\n",
+    "```python\n",
+    "from pyspark.sql.functions import input_file_name, regexp_extract, expr\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Imports three useful functions from PySpark:\n",
+    "  - `input_file_name()`: Captures the full path of the input file each row originated from, which is useful for extracting metadata embedded in the directory structure (e.g., year, quarter).\n",
+    "  - `regexp_extract()`: Applies a regular expression to a string column and extracts specific patterns, used here to pull out the `year` and `quarter` from file paths.\n",
+    "  - `expr()`: Evaluates a SQL expression string, used here to call Sedona's spatial SQL functions.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "mobile = sedona.read.format(\"parquet\") \\\n",
+    "    .load('s3://ookla-open-data/parquet/performance/type=mobile/*/*/*.parquet')\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Loads all mobile performance parquet files from Ookla's public S3 bucket.\n",
+    "- Uses wildcard characters to recursively grab all files regardless of year or quarter.\n",
+    "- Reads the data into a Spark DataFrame using Sedona's reader.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "    .withColumn(\"file_path\", input_file_name()) \\\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Adds a new column called `file_path` containing the S3 path where each row came from.\n",
+    "- This column is used to extract temporal metadata.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "    .withColumn(\"year\", regexp_extract(\"file_path\", r\"year=(\\\\d+)\", 1)) \\\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Extracts the four-digit `year` from the file path using a regex pattern and stores it in a new column.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "    .withColumn(\"quarter\", regexp_extract(\"file_path\", r\"quarter=(\\\\d+)\", 1)) \\\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Extracts the `quarter` (1 to 4) from the file path using another regex pattern.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "    .withColumn(\"geometry\", expr(\"ST_GeomFromText(tile)\")) \\\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Converts the `tile` column (in WKT format) into a geometry column using Sedona's `ST_GeomFromText()` function.\n",
+    "- This enables spatial operations on the geometries.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "    .withColumn(\"bbox\", expr(\"struct(st_xmin(ST_GeomFromText(tile)) as xmin, st_ymin(ST_GeomFromText(tile)) as ymin, st_xmax(ST_GeomFromText(tile)) as xmax, st_ymax(ST_GeomFromText(tile)) as ymax) as bbox\")) \\\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Constructs a bounding box for each geometry by extracting the min and max x and y coordinates.\n",
+    "- Creates a struct (`bbox`) with fields `xmin`, `ymin`, `xmax`, and `ymax` for spatial indexing or filtering.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "    .withColumn(\"geohash\", expr(\"ST_GeoHash(ST_GeomFromText(tile), 10)\")) \\\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Generates a geohash with precision 10 for each geometry using Sedona.\n",
+    "- Geohashing encodes spatial location into alphanumeric strings and is useful for spatial partitioning or clustering.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
.selectExpr(\"*\", ''' \"mobile\" as type''') \\\n", + "```\n", + "**What this does:**\n", + "- Adds a new column called `type` and sets its value to \"mobile\" for all rows.\n", + "- Helps distinguish this dataset from others, such as fixed broadband data.\n", + "\n", + "---\n", + "\n", + "```python\n", + " .orderBy(expr(\"ST_GeoHash(ST_GeomFromText(tile), 6)\")) \\\n", + "```\n", + "**What this does:**\n", + "- Orders the rows based on a geohash of precision 6 (lower precision = larger area).\n", + "- This helps optimize data locality for downstream spatial operations.\n", + "\n", + "---\n", + "\n", + "```python\n", + " .drop(\"file_path\")\n", + "```\n", + "**What this does:**\n", + "- Removes the `file_path` column since it was only used temporarily to extract metadata like year and quarter.\n", + "- Cleans up the final DataFrame for further analysis or saving.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a0ee8c31-7208-49ca-85aa-e4064d6665e0", + "metadata": { + "execution": { + "iopub.execute_input": "2025-04-01T18:13:01.548385Z", + "iopub.status.busy": "2025-04-01T18:13:01.547946Z", + "iopub.status.idle": "2025-04-01T18:13:04.991277Z", + "shell.execute_reply": "2025-04-01T18:13:04.990826Z", + "shell.execute_reply.started": "2025-04-01T18:13:01.548365Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "from pyspark.sql.functions import input_file_name, regexp_extract, expr\n", + "\n", + "fixed = sedona.read.format(\"parquet\")\\\n", + " .load('s3://ookla-open-data/parquet/performance/type=fixed/*/*/*.parquet') \\\n", + "\n", + "fixed = fixed.withColumn(\"file_path\", input_file_name()) \\\n", + " .withColumn(\"year\", regexp_extract(\"file_path\", r\"year=(\\d+)\", 1)) \\\n", + " .withColumn(\"quarter\", regexp_extract(\"file_path\", r\"quarter=(\\d+)\", 1)) \\\n", + " .withColumn(\"geometry\", expr(\"ST_GeomFromText(tile)\")) \\\n", + " .withColumn(\"bbox\", expr(\"struct(st_xmin(ST_GeomFromText(tile)) as xmin, st_ymin(ST_GeomFromText(tile)) as ymin, st_xmax(ST_GeomFromText(tile)) as xmax, st_ymax(ST_GeomFromText(tile)) as ymax) as bbox\")) \\\n", + " .withColumn(\"geohash\", expr(\"ST_GeoHash(ST_GeomFromText(tile), 10)\")) \\\n", + " .selectExpr(\"*\", ''' \"mobile\" as type''') \\\n", + " .orderBy(expr(\"ST_GeoHash(ST_GeomFromText(tile), 6)\")) \\\n", + " .drop(\"file_path\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dc532581-4fdb-46c5-8440-9f276715784d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-04-01T18:13:04.992137Z", + "iopub.status.busy": "2025-04-01T18:13:04.991952Z", + "iopub.status.idle": "2025-04-01T18:13:07.976607Z", + "shell.execute_reply": "2025-04-01T18:13:07.976077Z", + "shell.execute_reply.started": "2025-04-01T18:13:04.992121Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "from pyspark.sql.functions import input_file_name, regexp_extract\n", + "\n", + "mobile = sedona.read.format(\"parquet\")\\\n", + " .load('s3://ookla-open-data/parquet/performance/type=mobile/*/*/*.parquet') \\\n", + "\n", + "mobile = mobile.withColumn(\"file_path\", input_file_name()) \\\n", + " .withColumn(\"year\", regexp_extract(\"file_path\", r\"year=(\\d+)\", 1)) \\\n", + " .withColumn(\"quarter\", regexp_extract(\"file_path\", r\"quarter=(\\d+)\", 1)) \\\n", + " .withColumn(\"geometry\", expr(\"ST_GeomFromText(tile)\")) \\\n", + " .withColumn(\"bbox\", 
expr(\"struct(st_xmin(ST_GeomFromText(tile)) as xmin, st_ymin(ST_GeomFromText(tile)) as ymin, st_xmax(ST_GeomFromText(tile)) as xmax, st_ymax(ST_GeomFromText(tile)) as ymax) as bbox\")) \\\n", + " .withColumn(\"geohash\", expr(\"ST_GeoHash(ST_GeomFromText(tile), 10)\")) \\\n", + " .selectExpr(\"*\", ''' \"mobile\" as type''') \\\n", + " .orderBy(expr(\"ST_GeoHash(ST_GeomFromText(tile), 6)\")) \\\n", + " .drop(\"file_path\")" + ] + }, + { + "cell_type": "markdown", + "id": "f20e57f3-f4df-440c-86cf-ef78cf919e9f", + "metadata": {}, + "source": [ + "# Store the `projjson`" + ] + }, + { + "cell_type": "markdown", + "id": "61983f39-8adb-4408-9d84-9fc6d5c8c667", + "metadata": {}, + "source": [ + "This section stores the projection information of the data in a `json` format that can be passed to the GeoParquet metadata. \n", + "\n", + "You can find this projection and other as [EPSG.io](https://epsg.io/4326)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5d9f361e-3661-458d-ad8a-51217ece4f81", + "metadata": { + "execution": { + "iopub.execute_input": "2025-04-01T18:13:07.978261Z", + "iopub.status.busy": "2025-04-01T18:13:07.978080Z", + "iopub.status.idle": "2025-04-01T18:13:07.982171Z", + "shell.execute_reply": "2025-04-01T18:13:07.981722Z", + "shell.execute_reply.started": "2025-04-01T18:13:07.978246Z" + } + }, + "outputs": [], + "source": [ + "projjson = '''{\n", + " \"$schema\": \"https://proj.org/schemas/v0.7/projjson.schema.json\",\n", + " \"type\": \"GeographicCRS\",\n", + " \"name\": \"WGS 84\",\n", + " \"datum_ensemble\": {\n", + " \"name\": \"World Geodetic System 1984 ensemble\",\n", + " \"members\": [\n", + " {\n", + " \"name\": \"World Geodetic System 1984 (Transit)\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 1166\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"World Geodetic System 1984 (G730)\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 1152\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"World Geodetic System 1984 (G873)\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 1153\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"World Geodetic System 1984 (G1150)\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 1154\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"World Geodetic System 1984 (G1674)\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 1155\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"World Geodetic System 1984 (G1762)\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 1156\n", + " }\n", + " },\n", + " {\n", + " \"name\": \"World Geodetic System 1984 (G2139)\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 1309\n", + " }\n", + " }\n", + " ],\n", + " \"ellipsoid\": {\n", + " \"name\": \"WGS 84\",\n", + " \"semi_major_axis\": 6378137,\n", + " \"inverse_flattening\": 298.257223563\n", + " },\n", + " \"accuracy\": \"2.0\",\n", + " \"id\": {\n", + " \"authority\": \"EPSG\",\n", + " \"code\": 6326\n", + " }\n", + " },\n", + " \"coordinate_system\": {\n", + " \"subtype\": \"ellipsoidal\",\n", + " \"axis\": [\n", + " {\n", + " \"name\": \"Geodetic latitude\",\n", + " \"abbreviation\": \"Lat\",\n", + " \"direction\": \"north\",\n", + " \"unit\": \"degree\"\n", + " },\n", + " {\n", + " \"name\": \"Geodetic longitude\",\n", + " \"abbreviation\": \"Lon\",\n", + " \"direction\": \"east\",\n", + " \"unit\": \"degree\"\n", + " }\n", + " ]\n", + " },\n", + " \"scope\": \"Horizontal component 
+  {
+   "cell_type": "markdown",
+   "id": "009cfb9f-41c9-419a-a9cf-83fa47efadd3",
+   "metadata": {},
+   "source": [
+    "# 📊 GeoParquet Optimizations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4022da3-5a43-445e-bc11-ccc1a490452f",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "sedona.conf.set(\"spark.sql.parquet.page.size\", \"128MB\")\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Adjusts the Parquet page size to 128 MB.\n",
+    "- Larger page sizes reduce metadata overhead and improve read performance for large datasets.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "mobile = mobile.repartition(1)\n",
+    "fixed = fixed.repartition(1)\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Repartitions both the mobile and fixed datasets into a single partition.\n",
+    "- This ensures each is written to a single output file, which is useful for testing but not scalable for large data; the range-based repartitioning in the next step supersedes it.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "mobile = mobile.repartitionByRange(10, \"geohash\") \\\n",
+    "    .sortWithinPartitions(\"geohash\") \\\n",
+    "    .drop(\"geohash\")\n",
+    "\n",
+    "fixed = fixed.repartitionByRange(10, \"geohash\") \\\n",
+    "    .sortWithinPartitions(\"geohash\") \\\n",
+    "    .drop(\"geohash\")\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Repartitions the data into 10 ranges based on `geohash` (*note: you will need to adjust this number to achieve your ideal partitioned file size*).\n",
+    "- Sorts data within each partition by `geohash` for spatial locality.\n",
+    "- Drops the `geohash` column after it's used for partitioning and sorting.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "user_uri = os.getenv(\"USER_S3_PATH\")\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Loads a custom S3 path from an environment variable named `USER_S3_PATH`.\n",
+    "- This is where the final GeoParquet files will be saved.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "mobile.write \\\n",
+    "    .format(\"geoparquet\") \\\n",
+    "    .option(\"geoparquet.version\", \"1.1.0\") \\\n",
+    "    .option(\"geoparquet.crs\", projjson) \\\n",
+    "    .option(\"geoparquet.covering\", \"bbox\") \\\n",
+    "    .save(user_uri + \"ookla_mobile\", mode='overwrite', compression='zstd')\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Writes the `mobile` DataFrame to S3 in the GeoParquet format.\n",
+    "- Sets the GeoParquet version, supplies the CRS as `projjson`, and registers the `bbox` struct as the covering column.\n",
+    "- Uses Zstandard (zstd) compression for efficient storage.\n",
+    "\n",
+    "---\n",
+    "\n",
+    "```python\n",
+    "mobile.count()\n",
+    "fixed.count()\n",
+    "```\n",
+    "**What this does:**\n",
+    "- Triggers a count operation to evaluate and materialize the transformations.\n",
+    "- Useful for logging or validating the number of records written.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b615c2c0-6aeb-49f8-a18f-7e04f3b96fd0",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-01T18:13:07.982969Z",
+     "iopub.status.busy": "2025-04-01T18:13:07.982724Z",
+     "iopub.status.idle": "2025-04-01T18:13:08.059701Z",
+     "shell.execute_reply": "2025-04-01T18:13:08.059207Z",
+     "shell.execute_reply.started": "2025-04-01T18:13:07.982948Z"
+    }
+   },
+   "outputs": [],
+   "source": [
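+    "# Note: Parquet's default page size is on the order of 1 MB; a 128 MB page\n",
+    "# favors large sequential scans at the cost of finer page-level pruning.\n",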
"sedona.conf.set(\"spark.sql.parquet.page.size\", \"128MB\") # Set page size to 128 MB" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cf94c8d5-5d4f-4c14-a7c3-71e9163f30b6", + "metadata": { + "execution": { + "iopub.execute_input": "2025-04-01T18:13:08.061051Z", + "iopub.status.busy": "2025-04-01T18:13:08.060578Z", + "iopub.status.idle": "2025-04-01T18:13:08.144651Z", + "shell.execute_reply": "2025-04-01T18:13:08.144238Z", + "shell.execute_reply.started": "2025-04-01T18:13:08.061026Z" + } + }, + "outputs": [], + "source": [ + "mobile = mobile.repartition(1)\n", + "fixed = fixed.repartition(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9f76f8f2-44de-4b20-a347-dbbc737730a7", + "metadata": { + "execution": { + "iopub.execute_input": "2025-04-01T18:13:08.145917Z", + "iopub.status.busy": "2025-04-01T18:13:08.145369Z", + "iopub.status.idle": "2025-04-01T18:13:08.259268Z", + "shell.execute_reply": "2025-04-01T18:13:08.258637Z", + "shell.execute_reply.started": "2025-04-01T18:13:08.145891Z" + } + }, + "outputs": [], + "source": [ + "mobile = mobile.repartitionByRange(10, \"geohash\") \\\n", + " .sortWithinPartitions(\"geohash\") \\\n", + " .drop(\"geohash\") \n", + "\n", + "fixed = fixed.repartitionByRange(10, \"geohash\") \\\n", + " .sortWithinPartitions(\"geohash\") \\\n", + " .drop(\"geohash\") " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7495dd30-af5c-4f9c-afa6-91f239779b74", + "metadata": { + "execution": { + "iopub.execute_input": "2025-04-01T18:13:08.260256Z", + "iopub.status.busy": "2025-04-01T18:13:08.260097Z", + "iopub.status.idle": "2025-04-01T18:13:08.289013Z", + "shell.execute_reply": "2025-04-01T18:13:08.288482Z", + "shell.execute_reply.started": "2025-04-01T18:13:08.260241Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://wbts-wbc-ymm1bun8sj/jf3gkm4ile/data/customer-besg0oop07pktb/'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "user_uri = os.getenv(\"USER_S3_PATH\")\n", + "user_uri" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cf6dac4-410e-424a-b285-abbdfc25db28", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "mobile.write \\\n", + " .format(\"geoparquet\") \\\n", + " .option(\"geoparquet.version\", \"1.1.0\") \\\n", + " .option(\"geoparquet.crs\", projjson) \\\n", + " .option(\"geoparquet.covering\", \"bbox\") \\\n", + " .save(user_uri + \"ookla_mobile\", mode='overwrite', compression='zstd')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e31467ad-39d7-4402-9580-5933ed4d113d", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "fixed.write \\\n", + " .format(\"geoparquet\") \\\n", + " .option(\"geoparquet.version\", \"1.1.0\") \\\n", + " .option(\"geoparquet.crs\", projjson) \\\n", + " .option(\"geoparquet.covering\", \"bbox\") \\\n", + " .save(user_uri + \"ookla_fixed\", mode='overwrite', compression='zstd')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2dcc2bb-6d30-4ef8-af93-b0bc551cebce", + "metadata": {}, + "outputs": [], + "source": [ + "mobile.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d02a6220-1617-4fd5-a5db-ebe9433e1b84", + "metadata": {}, + "outputs": [], + "source": [ + "fixed.count()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}