Skip to content

Commit 8487f71

Browse files
committed
get projjson from metadata
roundtrip projjson sort of actually pass the value of extensions enabled to the Parquet reader remove accidentally added file rename files from geometry to geospatial rename geometry to geospatial more geometry to geospatial maybe better geospatial statistics python output use string view for algorithm_name schema_internal tweaks column writer and metadata geometry_stats -> geospatial_stats fix build reader test comments clarify the point size constant refactor test util for generating WKB better test utils maybe fix test lint slightly better handling of wraparound collapse missing header remove unneded constructors fewer constructors improve comments, equality lowercase trivial methods add experimental note correct comment throw instead of attempt to correctly write stats for dictionaries two more WKB utility edits add upper bound/lower bound more iterating on the geospatial statistics remove one more constructor AppendValues -> Append rename constants fix comment better tostring for boundingbox all the spans, better fix whitespace check bounds empty before emitting offload enum converts ro thrift_internal ensure stats set is checked at least once remove unused variable snake case geospatial stat to geo_stats GeospatialStatistics -> GeoStatistics deparameterize test that no longer depends on data page version remove write_geospatial_types option Update python build for no geospatial logical types and renamed statistics separate JSON-requiring code and ensure tests pass separate json-requiring things to their own file conditionally link to rapidjson fix comment fix tidy link privately where possible add the crs context maybe better context remember to include config to not skip tests Add crs context to writer move crs logic in the other direction to internal_json fix comment use a hash in the projjson key dev lint fix accidental diffs use enum class for EdgeInterpolationAlgorithm pragma once the offending header use k syntax for constant prefixes add an is_empty() revert checked cast export classes for Windows try something for MingW remove unnecessary constructors don't test multiple data page version in reader test
1 parent e9d5180 commit 8487f71

36 files changed

+1640
-1277
lines changed

cpp/src/arrow/dataset/file_parquet.cc

+2
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties(
133133
arrow_properties.set_io_context(
134134
parquet_scan_options.arrow_reader_properties->io_context());
135135
arrow_properties.set_use_threads(options.use_threads);
136+
arrow_properties.set_arrow_extensions_enabled(
137+
parquet_scan_options.arrow_reader_properties->get_arrow_extensions_enabled());
136138
return arrow_properties;
137139
}
138140

cpp/src/parquet/CMakeLists.txt

+10-7
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,9 @@ set(PARQUET_SRCS
171171
exception.cc
172172
file_reader.cc
173173
file_writer.cc
174-
geometry_statistics.cc
175-
geometry_util_internal.cc
174+
geospatial_statistics.cc
175+
geospatial_util_internal.cc
176+
geospatial_util_internal_json.cc
176177
level_comparison.cc
177178
level_conversion.cc
178179
metadata.cc
@@ -261,9 +262,11 @@ endif()
261262
if(NOT PARQUET_MINIMAL_DEPENDENCY)
262263
list(APPEND PARQUET_SHARED_LINK_LIBS arrow_shared)
263264

264-
# TODO(paleolimbot): Remove once sample files are generated
265-
list(APPEND PARQUET_SHARED_LINK_LIBS RapidJSON)
266-
list(APPEND PARQUET_STATIC_LINK_LIBS RapidJSON)
265+
# TODO(paleolimbot): Make sure this is OK or remove if not!
266+
if(ARROW_JSON)
267+
list(APPEND PARQUET_SHARED_PRIVATE_LINK_LIBS RapidJSON)
268+
list(APPEND PARQUET_STATIC_LINK_LIBS RapidJSON)
269+
endif()
267270

268271
# These are libraries that we will link privately with parquet_shared (as they
269272
# do not need to be linked transitively by other linkers)
@@ -378,8 +381,8 @@ add_parquet_test(internals-test
378381
statistics_test.cc
379382
encoding_test.cc
380383
metadata_test.cc
381-
geometry_statistics_test.cc
382-
geometry_util_internal_test.cc
384+
geospatial_statistics_test.cc
385+
geospatial_util_internal_test.cc
383386
page_index_test.cc
384387
public_api_test.cc
385388
size_statistics_test.cc

cpp/src/parquet/api/reader.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include "parquet/column_scanner.h"
2323
#include "parquet/exception.h"
2424
#include "parquet/file_reader.h"
25-
#include "parquet/geometry_statistics.h"
25+
#include "parquet/geospatial_statistics.h"
2626
#include "parquet/metadata.h"
2727
#include "parquet/platform.h"
2828
#include "parquet/printer.h"

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

+8-8
Original file line numberDiff line numberDiff line change
@@ -1489,15 +1489,17 @@ TEST_F(TestGeoArrowParquetIO, GeoArrowExtension) {
14891489

14901490
// Build a binary WKB array with at least one null value
14911491
::arrow::BinaryBuilder builder;
1492-
std::array<char, test::kWkbPointSize> item;
1492+
14931493
for (int k = 0; k < 10; k++) {
1494-
test::GenerateWKBPoint(reinterpret_cast<uint8_t*>(item.data()), k, k + 1);
1495-
ASSERT_OK(builder.AppendValues({std::string(item.data(), item.size())}));
1494+
std::string item = test::MakeWKBPoint(
1495+
{static_cast<double>(k), static_cast<double>(k + 1)}, false, false);
1496+
ASSERT_OK(builder.Append(item));
14961497
}
14971498
ASSERT_OK(builder.AppendNull());
14981499
for (int k = 0; k < 5; k++) {
1499-
test::GenerateWKBPoint(reinterpret_cast<uint8_t*>(item.data()), k, k + 1);
1500-
ASSERT_OK(builder.AppendValues({std::string(item.data(), item.size())}));
1500+
std::string item = test::MakeWKBPoint(
1501+
{static_cast<double>(k), static_cast<double>(k + 1)}, false, false);
1502+
ASSERT_OK(builder.Append(item));
15011503
}
15021504

15031505
ASSERT_OK_AND_ASSIGN(const auto binary_array, builder.Finish());
@@ -1512,9 +1514,7 @@ TEST_F(TestGeoArrowParquetIO, GeoArrowExtension) {
15121514

15131515
// When the original Arrow schema isn't stored and Arrow extensions are disabled,
15141516
// LogicalType::GEOMETRY is read as utf8.
1515-
auto writer_properties = ::parquet::ArrowWriterProperties::Builder()
1516-
.write_geospatial_logical_types()
1517-
->build();
1517+
auto writer_properties = default_arrow_writer_properties();
15181518
this->RoundTripSingleColumn(wkb_array, binary_array, writer_properties);
15191519
this->RoundTripSingleColumn(large_wkb_array, binary_array, writer_properties);
15201520

cpp/src/parquet/arrow/arrow_schema_test.cc

+45-31
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#include <vector>
2020

2121
#include "gmock/gmock-matchers.h"
22-
#include "gmock/gmock.h"
2322
#include "gtest/gtest.h"
2423

2524
#include "parquet/arrow/reader.h"
@@ -32,6 +31,7 @@
3231
#include "parquet/thrift_internal.h"
3332

3433
#include "arrow/array.h"
34+
#include "arrow/config.h"
3535
#include "arrow/extension/json.h"
3636
#include "arrow/ipc/writer.h"
3737
#include "arrow/testing/extension_type.h"
@@ -1251,15 +1251,12 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) {
12511251
}
12521252

12531253
TEST_F(TestConvertArrowSchema, ParquetGeoArrowCrsLonLat) {
1254+
#ifdef ARROW_JSON
12541255
// All the Arrow Schemas below should convert to the type defaults for GEOMETRY
12551256
// and GEOGRAPHY when GeoArrow extension types are registered and the appropriate
12561257
// writer option is set.
12571258
::arrow::ExtensionTypeGuard guard(test::geoarrow_wkb());
12581259

1259-
ArrowWriterProperties::Builder builder;
1260-
builder.write_geospatial_logical_types();
1261-
auto arrow_properties = builder.build();
1262-
12631260
std::vector<NodePtr> parquet_fields;
12641261
parquet_fields.push_back(PrimitiveNode::Make("geometry", Repetition::OPTIONAL,
12651262
LogicalType::Geometry(),
@@ -1289,22 +1286,22 @@ TEST_F(TestConvertArrowSchema, ParquetGeoArrowCrsLonLat) {
12891286
R"(, "edges": "spherical"})"),
12901287
true)};
12911288

1292-
ASSERT_OK(ConvertSchema(arrow_fields, arrow_properties));
1289+
ASSERT_OK(ConvertSchema(arrow_fields));
12931290
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
12941291
}
1292+
#else
1293+
GTEST_SKIP() << "non-default CRS testing requires ARROW_JSON";
1294+
#endif
12951295
}
12961296

12971297
TEST_F(TestConvertArrowSchema, ParquetGeoArrowCrsSrid) {
1298+
#ifdef ARROW_JSON
12981299
// Checks the conversion between GeoArrow's crs_type: srid and Parquet's srid:XXX.
12991300
// SRID (spatial reference identifier) is an opaque application specific identifier
13001301
// that GeoArrow will transport but refuse to resolve if required for a spatial
13011302
// operation.
13021303
::arrow::ExtensionTypeGuard guard(test::geoarrow_wkb());
13031304

1304-
ArrowWriterProperties::Builder builder;
1305-
builder.write_geospatial_logical_types();
1306-
auto arrow_properties = builder.build();
1307-
13081305
std::vector<NodePtr> parquet_fields;
13091306
parquet_fields.push_back(PrimitiveNode::Make("geometry", Repetition::OPTIONAL,
13101307
LogicalType::Geometry("srid:1234"),
@@ -1321,40 +1318,57 @@ TEST_F(TestConvertArrowSchema, ParquetGeoArrowCrsSrid) {
13211318
R"({"crs": "5678", "crs_type": "srid", "edges": "spherical"})"),
13221319
true)};
13231320

1324-
ASSERT_OK(ConvertSchema(arrow_fields, arrow_properties));
1321+
ASSERT_OK(ConvertSchema(arrow_fields));
13251322
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
1323+
#else
1324+
GTEST_SKIP() << "non-default CRS testing requires ARROW_JSON";
1325+
#endif
13261326
}
13271327

13281328
TEST_F(TestConvertArrowSchema, ParquetGeoArrowCrsProjjson) {
1329-
GTEST_SKIP() << "GeoArrow/PROJJSON support not yet implemented";
1330-
1329+
#ifdef ARROW_JSON
13311330
// Checks the conversion between GeoArrow that contains non-lon/lat PROJJSON
13321331
// to Parquet. Almost all GeoArrow types that arrive at the Parquet reader
13331332
// will have their CRS expressed in this way.
13341333
::arrow::ExtensionTypeGuard guard(test::geoarrow_wkb());
13351334

1336-
ArrowWriterProperties::Builder builder;
1337-
builder.write_geospatial_logical_types();
1338-
auto arrow_properties = builder.build();
1335+
std::vector<std::shared_ptr<Field>> arrow_fields = {
1336+
::arrow::field(
1337+
"geometry",
1338+
test::geoarrow_wkb(R"({"crs": {"key0": "value0"}, "crs_type": "projjson"})"),
1339+
true),
1340+
::arrow::field(
1341+
"geography",
1342+
test::geoarrow_wkb(
1343+
R"({"crs": {"key1": "value1"}, "crs_type": "projjson", "edges": "spherical"})"),
1344+
true)};
1345+
1346+
auto arrow_properties = default_arrow_writer_properties();
1347+
ASSERT_OK(ConvertSchema(arrow_fields, arrow_properties));
13391348

1340-
std::vector<NodePtr> parquet_fields;
1341-
parquet_fields.push_back(PrimitiveNode::Make("geometry", Repetition::OPTIONAL,
1342-
LogicalType::Geometry("projjson:1234"),
1343-
ParquetType::BYTE_ARRAY));
1344-
parquet_fields.push_back(PrimitiveNode::Make("geography", Repetition::OPTIONAL,
1345-
LogicalType::Geography("projjson:5678"),
1346-
ParquetType::BYTE_ARRAY));
1349+
std::shared_ptr<::arrow::KeyValueMetadata> crs_metadata =
1350+
::arrow::key_value_metadata({}, {});
1351+
arrow_properties->geo_crs_context()->AddProjjsonCrsFieldsToFileMetadata(
1352+
crs_metadata.get());
13471353

1348-
std::vector<std::shared_ptr<Field>> arrow_fields = {
1349-
::arrow::field("geometry",
1350-
test::geoarrow_wkb(R"({"crs": "1234", "crs_type": "srid"})"), true),
1351-
::arrow::field("geography",
1352-
test::geoarrow_wkb(
1353-
R"({"crs": "5678", "crs_type": "srid", "edges": "spherical"})"),
1354-
true)};
1354+
ASSERT_EQ(crs_metadata->size(), 2);
1355+
ASSERT_EQ(crs_metadata->value(0), R"({"key0":"value0"})");
1356+
ASSERT_EQ(crs_metadata->value(1), R"({"key1":"value1"})");
1357+
1358+
std::vector<NodePtr> parquet_fields;
1359+
parquet_fields.push_back(
1360+
PrimitiveNode::Make("geometry", Repetition::OPTIONAL,
1361+
LogicalType::Geometry("projjson:" + crs_metadata->key(0)),
1362+
ParquetType::BYTE_ARRAY));
1363+
parquet_fields.push_back(
1364+
PrimitiveNode::Make("geography", Repetition::OPTIONAL,
1365+
LogicalType::Geography("projjson:" + crs_metadata->key(1)),
1366+
ParquetType::BYTE_ARRAY));
13551367

1356-
ASSERT_OK(ConvertSchema(arrow_fields, arrow_properties));
13571368
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
1369+
#else
1370+
GTEST_SKIP() << "non-default CRS testing requires ARROW_JSON";
1371+
#endif
13581372
}
13591373

13601374
TEST_F(TestConvertArrowSchema, ParquetLists) {

cpp/src/parquet/arrow/schema.cc

+20-88
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,7 @@
2121
#include <string>
2222
#include <vector>
2323

24-
// TODO(paleolimbot): Remove once example files are generated
25-
#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
26-
27-
#include <rapidjson/document.h>
28-
#include <rapidjson/writer.h>
29-
24+
#include "arrow/config.h"
3025
#include "arrow/extension/json.h"
3126
#include "arrow/extension_type.h"
3227
#include "arrow/io/memory.h"
@@ -42,6 +37,7 @@
4237

4338
#include "parquet/arrow/schema_internal.h"
4439
#include "parquet/exception.h"
40+
#include "parquet/geospatial_util_internal_json.h"
4541
#include "parquet/metadata.h"
4642
#include "parquet/properties.h"
4743
#include "parquet/types.h"
@@ -249,90 +245,21 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type,
249245
return Status::OK();
250246
}
251247

252-
// TODO(paleolimbot): Remove once example files are written
253-
Result<std::string> GeospatialGeoArrowCrsToParquetCrs(
254-
const ::arrow::rapidjson::Document& document,
255-
const ArrowWriterProperties& arrow_properties) {
256-
namespace rj = ::arrow::rapidjson;
257-
258-
std::string crs_type;
259-
if (document.HasMember("crs_type")) {
260-
crs_type = document["crs_type"].GetString();
261-
}
262-
263-
if (!document.HasMember("crs") || document["crs"].IsNull()) {
264-
// Parquet GEOMETRY/GEOGRAPHY do not have a concept of a null/missing
265-
// CRS, but an omitted one is more likely to have meant "lon/lat" than
266-
// a truly unspecified one (i.e., Engineering CRS with arbitrary XY units)
267-
return "";
268-
}
269-
270-
const auto& json_crs = document["crs"];
271-
if (json_crs.IsString() && crs_type == "srid") {
272-
// srid is an application-specific identifier. GeoArrow lets this be propagated via
273-
// "crs_type": "srid".
274-
return std::string("srid:") + json_crs.GetString();
275-
} else if (json_crs.IsString() &&
276-
(json_crs == "EPSG:4326" || json_crs == "OGC:CRS84")) {
277-
// crs can be left empty because these cases both correspond to
278-
// longitude/latitude in WGS84 according to the Parquet specification
279-
return "";
280-
} else if (json_crs.IsObject()) {
281-
if (json_crs.HasMember("id")) {
282-
const auto& identifier = json_crs["id"];
283-
if (identifier.HasMember("authority") && identifier.HasMember("code")) {
284-
if (identifier["authority"] == "OGC" && identifier["code"] == "CRS84") {
285-
// longitude/latitude
286-
return "";
287-
} else if (identifier["authority"] == "EPSG" && identifier["code"] == 4326) {
288-
// longitude/latitude
289-
return "";
290-
}
291-
}
292-
}
293-
294-
// TODO(paleolimbot) this is not quite correct because we're supposed to put this
295-
// in the metadata according to the spec. We need to find a way to put
296-
// this in the arrow_properties/file metadata via a CrsProvider or something.
297-
rj::StringBuffer buffer;
298-
rj::Writer<rj::StringBuffer> writer(buffer);
299-
document.Accept(writer);
300-
return std::string("projjson:") + buffer.GetString();
301-
} else {
302-
// e.g., authority:code, WKT2, arbitrary string. A pluggable CrsProvider
303-
// could handle these and return something we're allowed to write here.
304-
return Status::Invalid("Unsupported GeoArrow CRS for Parquet");
305-
}
306-
}
307-
308248
Result<std::shared_ptr<const LogicalType>> GeospatialLogicalTypeFromArrow(
309249
const std::string& serialized_data, const ArrowWriterProperties& arrow_properties) {
310-
// Parquet has no way to interpret a null or missing CRS; however, it is more likely
311-
// to induce confusion insert the fully specified equivalent of a null CRS (custom
312-
// engineering CRS with unspecified units)
250+
// Without a JSON parser, we can still handle a few trivial cases
313251
if (serialized_data.empty() || serialized_data == "{}") {
314252
return LogicalType::Geometry();
253+
} else if (
254+
serialized_data ==
255+
R"({"edges": "spherical", "crs": "OGC:CRS84", "crs_type": "authority_code"})") {
256+
return LogicalType::Geography();
257+
} else if (serialized_data == R"({"crs": "OGC:CRS84", "crs_type": "authority_code"})") {
258+
return LogicalType::Geometry();
259+
} else {
260+
// Will return an error status if Parquet was not built with ARROW_JSON
261+
return GeospatialLogicalTypeFromGeoArrowJSON(serialized_data, arrow_properties);
315262
}
316-
317-
namespace rj = ::arrow::rapidjson;
318-
rj::Document document;
319-
if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError()) {
320-
return Status::Invalid("Invalid serialized JSON data: ", serialized_data);
321-
}
322-
323-
ARROW_ASSIGN_OR_RAISE(std::string crs,
324-
GeospatialGeoArrowCrsToParquetCrs(document, arrow_properties));
325-
326-
if (document.HasMember("edges") && document["edges"] == "planar") {
327-
return LogicalType::Geometry(crs);
328-
} else if (document.HasMember("edges") && document["edges"] == "spherical") {
329-
return LogicalType::Geography(crs,
330-
LogicalType::EdgeInterpolationAlgorithm::SPHERICAL);
331-
} else if (document.HasMember("edges")) {
332-
return Status::NotImplemented("GeoArrow edge type: ", serialized_data);
333-
}
334-
335-
return LogicalType::Geometry(crs);
336263
}
337264

338265
static constexpr char FIELD_ID_KEY[] = "PARQUET:field_id";
@@ -526,8 +453,7 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
526453
type = ParquetType::BYTE_ARRAY;
527454
logical_type = LogicalType::JSON();
528455
break;
529-
} else if (arrow_properties.write_geospatial_logical_types() &&
530-
ext_type->extension_name() == std::string("geoarrow.wkb")) {
456+
} else if (ext_type->extension_name() == std::string("geoarrow.wkb")) {
531457
type = ParquetType::BYTE_ARRAY;
532458
ARROW_ASSIGN_OR_RAISE(logical_type, GeospatialLogicalTypeFromArrow(
533459
ext_type->Serialize(), arrow_properties));
@@ -562,6 +488,7 @@ struct SchemaTreeContext {
562488
SchemaManifest* manifest;
563489
ArrowReaderProperties properties;
564490
const SchemaDescriptor* schema;
491+
std::shared_ptr<const KeyValueMetadata> metadata;
565492

566493
void LinkParent(const SchemaField* child, const SchemaField* parent) {
567494
manifest->child_to_parent[child] = parent;
@@ -584,7 +511,7 @@ ::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
584511
int column_index, const schema::PrimitiveNode& primitive_node,
585512
SchemaTreeContext* ctx) {
586513
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> storage_type,
587-
GetArrowType(primitive_node, ctx->properties));
514+
GetArrowType(primitive_node, ctx->properties, ctx->metadata));
588515
if (ctx->properties.read_dictionary(column_index) &&
589516
IsDictionaryReadSupported(*storage_type)) {
590517
return ::arrow::dictionary(::arrow::int32(), storage_type);
@@ -1187,6 +1114,10 @@ Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
11871114
const WriterProperties& properties,
11881115
const ArrowWriterProperties& arrow_properties,
11891116
std::shared_ptr<SchemaDescriptor>* out) {
1117+
// TODO(paleolimbot): I'm wondering if this geo_crs_context is reused when testing
1118+
// on MINGW, where we get some failures indicating non-empty metadata where it was
1119+
// expected
1120+
arrow_properties.geo_crs_context()->Clear();
11901121
std::vector<NodePtr> nodes(arrow_schema->num_fields());
11911122
for (int i = 0; i < arrow_schema->num_fields(); i++) {
11921123
RETURN_NOT_OK(
@@ -1249,6 +1180,7 @@ Status SchemaManifest::Make(const SchemaDescriptor* schema,
12491180
ctx.manifest = manifest;
12501181
ctx.properties = properties;
12511182
ctx.schema = schema;
1183+
ctx.metadata = metadata;
12521184
const GroupNode& schema_node = *schema->group_node();
12531185
manifest->descr = schema;
12541186
manifest->schema_fields.resize(schema_node.field_count());

0 commit comments

Comments
 (0)