Skip to content

Commit

Permalink
Geoparquet simplification and filtering (#895)
Browse files Browse the repository at this point in the history
* Simplifies the object model to minimize the creation of wrappers and arrays to hold values.

* Removes unused classes and methods related to the geoparquet writer.

* Uses lower level ParquetFileReader instead of ParquetReader.

* Adds bbox-based filtering capabilities to the GeoParquet reader, so that only the relevant records are loaded into the database.

* Adds some benchmarks and tests.
  • Loading branch information
bchapuis authored Oct 14, 2024
1 parent 68c9022 commit ba6350b
Show file tree
Hide file tree
Showing 42 changed files with 1,720 additions and 2,644 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ examples/openstreetmap/tiles/

examples/transformation/*.pbf

# Benchmarking
baremaps-benchmarking/data/

# Docs
.jekyll-cache/
_site/
Expand Down
64 changes: 64 additions & 0 deletions baremaps-benchmarking/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.baremaps</groupId>
    <artifactId>baremaps</artifactId>
    <version>0.7.4-SNAPSHOT</version>
  </parent>

  <!-- JMH benchmarks; packaged as a self-contained "benchmarks.jar" and never deployed. -->
  <artifactId>baremaps-benchmarking</artifactId>

  <properties>
    <jmh.version>1.37</jmh.version>
    <maven.deploy.skip>true</maven.deploy.skip>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.baremaps</groupId>
      <artifactId>baremaps-geoparquet</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.baremaps</groupId>
      <artifactId>baremaps-testing</artifactId>
    </dependency>
    <dependency>
      <groupId>org.openjdk.jmh</groupId>
      <artifactId>jmh-core</artifactId>
      <version>${jmh.version}</version>
    </dependency>
    <dependency>
      <groupId>org.openjdk.jmh</groupId>
      <artifactId>jmh-generator-annprocess</artifactId>
      <version>${jmh.version}</version>
      <!-- The annotation processor is only needed at compile time; "provided" keeps it on the
           compile classpath while leaving it out of the shaded jar. -->
      <scope>provided</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.6.0</version>
        <executions>
          <execution>
            <goals>
              <goal>shade</goal>
            </goals>
            <phase>package</phase>
            <configuration>
              <finalName>benchmarks</finalName>
              <transformers>
                <!-- Makes the shaded jar runnable with "java -jar benchmarks.jar". -->
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>org.openjdk.jmh.Main</mainClass>
                </transformer>
                <!-- Merges META-INF/services entries so that service-loader registrations
                     coming from different dependencies are not overwritten by each other. -->
                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
              </transformers>
              <filters>
                <filter>
                  <!-- Signature files of signed dependencies no longer match the repackaged
                       content and would make the shaded jar fail to load. -->
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.baremaps.benchmarking.geoparquet;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import org.apache.baremaps.geoparquet.GeoParquetReader;
import org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;

/**
 * JMH benchmark measuring the time needed by {@link GeoParquetReader} to read the Overture Maps
 * "addresses" theme, sequentially and in parallel.
 *
 * <p>
 * The data set is downloaded anonymously from S3 into a local directory the first time the
 * benchmark runs; subsequent runs reuse the local copy.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 0)
@Measurement(iterations = 1)
public class OvertureMapsBenchmark {

  /** S3 bucket hosting the data set. */
  private static final String BUCKET = "overturemaps-us-west-2";

  /** Key prefix of the objects to download. */
  private static final String PREFIX = "release/2024-09-18.0/theme=addresses/";

  /** Local directory in which the downloaded files are stored. */
  private static final Path DIRECTORY = Path.of("baremaps-benchmarking/data/overturemaps");

  /**
   * Runs the benchmarks of this class in a single forked JVM.
   *
   * @param args the command line arguments (unused)
   * @throws RunnerException if the benchmark run fails
   */
  public static void main(String[] args) throws RunnerException {
    Options opt = new OptionsBuilder()
        .include(OvertureMapsBenchmark.class.getSimpleName())
        .forks(1)
        .build();
    new Runner(opt).run();
  }

  /**
   * Downloads the data set if the local directory does not exist yet.
   *
   * <p>
   * The directory is only created once the listing has succeeded, so a failed listing does not
   * leave an empty directory behind that would disable the download on the next run.
   *
   * @throws IOException if the local directory cannot be created
   */
  @Setup
  public void setup() throws IOException {
    if (Files.exists(DIRECTORY)) {
      // The data is assumed to be present; this keeps cached runs free of network access.
      return;
    }
    try (var client = S3Client.builder()
        .region(Region.US_EAST_1)
        .credentialsProvider(new AnonymousAWSCredentialsProvider())
        .build()) {

      var listRequest = ListObjectsV2Request.builder()
          .bucket(BUCKET)
          .prefix(PREFIX)
          .build();

      // A single listObjectsV2 call returns one page only; the paginator transparently follows
      // the continuation tokens so that no object is silently left out.
      for (var object : client.listObjectsV2Paginator(listRequest).contents()) {
        var key = object.key();
        var name = key.substring(key.lastIndexOf('/') + 1);
        if (name.isEmpty()) {
          // Keys ending with '/' have no file name; resolving them would target the directory.
          continue;
        }
        var file = DIRECTORY.resolve(name);
        Files.createDirectories(DIRECTORY);
        if (!Files.exists(file)) {
          var getRequest = GetObjectRequest.builder()
              .bucket(BUCKET)
              .key(key)
              .build();
          client.getObject(getRequest, file);
        }
      }
    }
  }

  /** Reads all the records sequentially and counts them to force the traversal. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void read() {
    GeoParquetReader reader = new GeoParquetReader(DIRECTORY.toUri());
    reader.read().count();
  }

  /** Reads all the records in parallel and counts them to force the traversal. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void readParallel() {
    GeoParquetReader reader = new GeoParquetReader(DIRECTORY.toUri());
    reader.readParallel().count();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.baremaps.benchmarking.geoparquet;


import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import org.apache.baremaps.geoparquet.GeoParquetReader;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

/**
 * JMH benchmark measuring the time needed by {@link GeoParquetReader} to read many small
 * GeoParquet files, sequentially and in parallel.
 *
 * <p>
 * The data set consists of copies of a single sample file stored in a local directory.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 0)
@Measurement(iterations = 1)
public class SmallFileBenchmark {

  /** Number of copies of the sample file that make up the data set. */
  private static final int FILE_COUNT = 1000;

  /** Sample file that is duplicated to build the data set. */
  private final Path source =
      Path.of("baremaps-testing/data/samples/example.parquet").toAbsolutePath();

  /** Directory holding the copies of the sample file. */
  private final Path directory = Path.of("baremaps-benchmarking/data/small").toAbsolutePath();

  /** Glob URI passed to the reader; derived from {@link #directory} to keep both in sync. */
  private final URI files = directory.resolve("*.parquet").toUri();

  /**
   * Runs the benchmarks of this class in a single forked JVM.
   *
   * @param args the command line arguments (unused)
   * @throws RunnerException if the benchmark run fails
   */
  public static void main(String[] args) throws RunnerException {
    Options opt = new OptionsBuilder()
        .include(SmallFileBenchmark.class.getSimpleName())
        .forks(1)
        .build();
    new Runner(opt).run();
  }

  /**
   * Creates the copies of the sample file that are still missing.
   *
   * <p>
   * The check is done per file rather than on the directory, so that a previous run that was
   * interrupted or that failed does not leave a partial data set that would be used as is.
   *
   * @throws IOException if the directory cannot be created or a file cannot be copied
   */
  @Setup
  public void setup() throws IOException {
    Files.createDirectories(directory);
    for (int i = 0; i < FILE_COUNT; i++) {
      Path target = directory.resolve(i + ".parquet");
      if (!Files.exists(target)) {
        Files.copy(source, target);
      }
    }
  }

  /** Reads all the records sequentially and counts them to force the traversal. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void read() {
    GeoParquetReader reader = new GeoParquetReader(files);
    reader.read().count();
  }

  /** Reads all the records in parallel and counts them to force the traversal. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void readParallel() {
    GeoParquetReader reader = new GeoParquetReader(files);
    reader.readParallel().count();
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,12 @@ public class GeoParquetDataTable implements DataTable {

public GeoParquetDataTable(URI path) {
this.path = path;
}

private GeoParquetReader reader() {
if (reader == null) {
reader = new GeoParquetReader(path);
}
return reader;
this.reader = new GeoParquetReader(path);
}

@Override
public long size() {
return reader().size();
return reader.size();
}

@Override
Expand All @@ -66,8 +60,8 @@ public Stream<DataRow> stream() {

@Override
public Stream<DataRow> parallelStream() {
return reader().readParallel().map(group -> new DataRowImpl(
GeoParquetTypeConversion.asSchema(path.toString(), group.getSchema()),
return reader.readParallel().map(group -> new DataRowImpl(
GeoParquetTypeConversion.asSchema(path.toString(), group.getGeoParquetSchema()),
GeoParquetTypeConversion.asRowValues(group)));
}

Expand All @@ -76,7 +70,6 @@ public void clear() {
if (reader != null) {
reader = null;
}

if (schema != null) {
schema = null;
}
Expand All @@ -87,15 +80,15 @@ public DataSchema schema() {
if (schema == null) {
this.schema = GeoParquetTypeConversion.asSchema(
path.toString(),
reader().getGeoParquetSchema());
reader.getGeoParquetSchema());
return this.schema;
}
return schema;
}

public int srid(String column) {
try {
return reader().getGeoParquetMetadata().getSrid(column);
return reader.getGeoParquetMetadata().getSrid(column);
} catch (Exception e) {
throw new GeoParquetException("Fail to read the SRID from the GeoParquet metadata", e);
}
Expand Down
Loading

0 comments on commit ba6350b

Please sign in to comment.