From cba31c7d0c63f32b2105ab9ca94936fa4ac225f2 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 18 Jan 2025 21:07:19 -0500 Subject: [PATCH 01/20] Support decimal32/64 in schema conversion --- cpp/src/parquet/arrow/arrow_schema_test.cc | 62 ++++++++++++++++++---- cpp/src/parquet/arrow/schema.cc | 4 +- cpp/src/parquet/arrow/schema_internal.cc | 6 ++- 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 535efa0c8e5de..10431c3a11813 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -185,11 +185,13 @@ TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) { {"string", LogicalType::String(), ParquetType::BYTE_ARRAY, -1, ::arrow::utf8()}, {"enum", LogicalType::Enum(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, {"decimal(8, 2)", LogicalType::Decimal(8, 2), ParquetType::INT32, -1, - ::arrow::decimal128(8, 2)}, + ::arrow::decimal32(8, 2)}, {"decimal(16, 4)", LogicalType::Decimal(16, 4), ParquetType::INT64, -1, - ::arrow::decimal128(16, 4)}, + ::arrow::decimal64(16, 4)}, {"decimal(32, 8)", LogicalType::Decimal(32, 8), ParquetType::FIXED_LEN_BYTE_ARRAY, 16, ::arrow::decimal128(32, 8)}, + {"decimal(73, 38)", LogicalType::Decimal(73, 38), ParquetType::FIXED_LEN_BYTE_ARRAY, + 31, ::arrow::decimal256(73, 38)}, {"date", LogicalType::Date(), ParquetType::INT32, -1, ::arrow::date32()}, {"time(ms)", LogicalType::Time(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT32, -1, ::arrow::time32(::arrow::TimeUnit::MILLI)}, @@ -970,12 +972,12 @@ class TestConvertArrowSchema : public ::testing::Test { ::arrow::Status ConvertSchema( const std::vector>& fields, std::shared_ptr<::parquet::ArrowWriterProperties> arrow_properties = - ::parquet::default_arrow_writer_properties()) { + ::parquet::default_arrow_writer_properties(), + std::shared_ptr<::parquet::WriterProperties> parquet_properties = + ::parquet::default_writer_properties()) { arrow_schema_ = ::arrow::schema(fields); - std::shared_ptr<::parquet::WriterProperties> properties = - ::parquet::default_writer_properties(); - return ToParquetSchema(arrow_schema_.get(), *properties.get(), *arrow_properties, - &result_schema_); + return ToParquetSchema(arrow_schema_.get(), *parquet_properties.get(), + *arrow_properties, &result_schema_); } protected: @@ -1069,14 +1071,16 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { {"utf8", ::arrow::utf8(), LogicalType::String(), ParquetType::BYTE_ARRAY, -1}, {"large_utf8", ::arrow::large_utf8(), LogicalType::String(), ParquetType::BYTE_ARRAY, -1}, - {"decimal(1, 0)", ::arrow::decimal128(1, 0), LogicalType::Decimal(1, 0), + {"decimal(1, 0)", ::arrow::decimal32(1, 0), LogicalType::Decimal(1, 0), ParquetType::FIXED_LEN_BYTE_ARRAY, 1}, - {"decimal(8, 2)", ::arrow::decimal128(8, 2), LogicalType::Decimal(8, 2), + {"decimal(8, 2)", ::arrow::decimal32(8, 2), LogicalType::Decimal(8, 2), ParquetType::FIXED_LEN_BYTE_ARRAY, 4}, - {"decimal(16, 4)", ::arrow::decimal128(16, 4), LogicalType::Decimal(16, 4), + {"decimal(16, 4)", ::arrow::decimal64(16, 4), LogicalType::Decimal(16, 4), ParquetType::FIXED_LEN_BYTE_ARRAY, 7}, {"decimal(32, 8)", ::arrow::decimal128(32, 8), LogicalType::Decimal(32, 8), ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, + {"decimal(73, 38)", ::arrow::decimal256(73, 38), LogicalType::Decimal(73, 38), + ParquetType::FIXED_LEN_BYTE_ARRAY, 31}, {"float16", ::arrow::float16(), LogicalType::Float16(), 
ParquetType::FIXED_LEN_BYTE_ARRAY, 2}, {"time32", ::arrow::time32(::arrow::TimeUnit::MILLI), @@ -1134,6 +1138,44 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { // ASSERT_NO_FATAL_FAILURE(); } +TEST_F(TestConvertArrowSchema, ArrowDecimalFieldsStoredAsInt) { + struct FieldConstructionArguments { + std::string name; + std::shared_ptr<::arrow::DataType> datatype; + std::shared_ptr logical_type; + parquet::Type::type physical_type; + int physical_length; + }; + + std::vector cases = { + {"decimal(1, 0)", ::arrow::decimal32(1, 0), LogicalType::Decimal(1, 0), + ParquetType::INT32, 1}, + {"decimal(8, 2)", ::arrow::decimal32(8, 2), LogicalType::Decimal(8, 2), + ParquetType::INT32, 4}, + {"decimal(16, 4)", ::arrow::decimal64(16, 4), LogicalType::Decimal(16, 4), + ParquetType::INT64, 7}, + {"decimal(32, 8)", ::arrow::decimal128(32, 8), LogicalType::Decimal(32, 8), + ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, + {"decimal(73, 38)", ::arrow::decimal256(73, 38), LogicalType::Decimal(73, 38), + ParquetType::FIXED_LEN_BYTE_ARRAY, 31}}; + + std::vector> arrow_fields; + std::vector parquet_fields; + + for (const FieldConstructionArguments& c : cases) { + arrow_fields.push_back(::arrow::field(c.name, c.datatype, false)); + parquet_fields.push_back(PrimitiveNode::Make(c.name, Repetition::REQUIRED, + c.logical_type, c.physical_type, + c.physical_length)); + } + + auto parquet_properties_builder = ::parquet::WriterProperties::Builder(); + ASSERT_OK(ConvertSchema( + arrow_fields, ::parquet::default_arrow_writer_properties(), + parquet_properties_builder.enable_store_decimal_as_integer()->build())); + CheckFlatSchema(parquet_fields); +} + TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) { struct FieldConstructionArguments { std::string name; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index d94c73452c44d..d9b0404f31d29 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -355,13 +355,15 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, static_cast(*field->type()); length = fixed_size_binary_type.byte_width(); } break; + case ArrowTypeId::DECIMAL32: + case ArrowTypeId::DECIMAL64: case ArrowTypeId::DECIMAL128: case ArrowTypeId::DECIMAL256: { const auto& decimal_type = static_cast(*field->type()); precision = decimal_type.precision(); scale = decimal_type.scale(); if (properties.store_decimal_as_integer() && 1 <= precision && precision <= 18) { - type = precision <= 9 ? ParquetType ::INT32 : ParquetType ::INT64; + type = precision <= 9 ? 
ParquetType::INT32 : ParquetType::INT64; } else { type = ParquetType::FIXED_LEN_BYTE_ARRAY; length = DecimalType::DecimalSize(precision); diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index 261a00940654d..d372e2fb681c0 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -34,7 +34,11 @@ using ::arrow::internal::checked_cast; Result> MakeArrowDecimal(const LogicalType& logical_type) { const auto& decimal = checked_cast(logical_type); - if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) { + if (decimal.precision() <= ::arrow::Decimal32Type::kMaxPrecision) { + return ::arrow::Decimal32Type::Make(decimal.precision(), decimal.scale()); + } else if (decimal.precision() <= ::arrow::Decimal64Type::kMaxPrecision) { + return ::arrow::Decimal64Type::Make(decimal.precision(), decimal.scale()); + } else if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) { return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale()); } return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale()); From a9398a260f57c1c7df30f36082a7ab91279de21f Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 18 Jan 2025 21:39:53 -0500 Subject: [PATCH 02/20] Support decimal32/64 in column writer --- cpp/src/parquet/column_writer.cc | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 4998e6f301a00..174db8ab948e3 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1999,10 +1999,20 @@ struct SerializeFunctor< template value_type TransferValue(const uint8_t* in) const { - static_assert(byte_width == 16 || byte_width == 32, - "only 16 and 32 byte Decimals supported"); + static_assert(byte_width == ::arrow::Decimal32Type::kByteWidth || + byte_width == ::arrow::Decimal64Type::kByteWidth || + byte_width == ::arrow::Decimal128Type::kByteWidth || + byte_width == ::arrow::Decimal256Type::kByteWidth, + "only 4/8/16/32 byte Decimals supported"); + value_type value = 0; - if constexpr (byte_width == 16) { + if constexpr (byte_width == ::arrow::Decimal32Type::kByteWidth) { + ::arrow::Decimal32 decimal_value(in); + PARQUET_THROW_NOT_OK(decimal_value.ToInteger(&value)); + } else if constexpr (byte_width == ::arrow::Decimal64Type::kByteWidth) { + ::arrow::Decimal64 decimal_value(in); + PARQUET_THROW_NOT_OK(decimal_value.ToInteger(&value)); + } else if constexpr (byte_width == ::arrow::Decimal128Type::kByteWidth) { ::arrow::Decimal128 decimal_value(in); PARQUET_THROW_NOT_OK(decimal_value.ToInteger(&value)); } else { @@ -2048,6 +2058,8 @@ Status TypedColumnWriterImpl::WriteArrowDense( WRITE_ZERO_COPY_CASE(DATE32, Date32Type, Int32Type) WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type) WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type) + WRITE_SERIALIZE_CASE(DECIMAL32, Decimal32Type, Int32Type) + WRITE_SERIALIZE_CASE(DECIMAL64, Decimal64Type, Int32Type) WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int32Type) WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int32Type) default: @@ -2220,6 +2232,8 @@ Status TypedColumnWriterImpl::WriteArrowDense( WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type) WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type) WRITE_ZERO_COPY_CASE(DURATION, DurationType, Int64Type) + WRITE_SERIALIZE_CASE(DECIMAL32, Decimal32Type, Int64Type) + WRITE_SERIALIZE_CASE(DECIMAL64, 
Decimal64Type, Int64Type) WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int64Type) WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int64Type) default: @@ -2441,6 +2455,8 @@ Status TypedColumnWriterImpl::WriteArrowDense( const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) { switch (array.type()->id()) { WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType) + WRITE_SERIALIZE_CASE(DECIMAL32, Decimal32Type, FLBAType) + WRITE_SERIALIZE_CASE(DECIMAL64, Decimal64Type, FLBAType) WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType) WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType) WRITE_SERIALIZE_CASE(HALF_FLOAT, HalfFloatType, FLBAType) From e1dc0235a1ad900a989a9cb0063bdd5d6bd035e8 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sun, 19 Jan 2025 12:34:21 -0500 Subject: [PATCH 03/20] Restrict column writer with correct decimal types --- cpp/src/parquet/column_writer.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 174db8ab948e3..118dca7cf3d9b 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2059,9 +2059,6 @@ Status TypedColumnWriterImpl::WriteArrowDense( WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type) WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type) WRITE_SERIALIZE_CASE(DECIMAL32, Decimal32Type, Int32Type) - WRITE_SERIALIZE_CASE(DECIMAL64, Decimal64Type, Int32Type) - WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int32Type) - WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int32Type) default: ARROW_UNSUPPORTED() } @@ -2232,10 +2229,7 @@ Status TypedColumnWriterImpl::WriteArrowDense( WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type) WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type) WRITE_ZERO_COPY_CASE(DURATION, DurationType, Int64Type) - WRITE_SERIALIZE_CASE(DECIMAL32, Decimal32Type, Int64Type) WRITE_SERIALIZE_CASE(DECIMAL64, Decimal64Type, Int64Type) - WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int64Type) - WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int64Type) default: ARROW_UNSUPPORTED(); } From 6032b02df9d091353478c94c74e80b895cb3e5b3 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sun, 19 Jan 2025 12:34:43 -0500 Subject: [PATCH 04/20] Support decimal32/64 in reader & vector kernels & tests --- cpp/src/arrow/compute/kernels/vector_hash.cc | 4 +- .../arrow/compute/kernels/vector_selection.cc | 3 +- .../parquet/arrow/arrow_reader_writer_test.cc | 170 +++++++++++------- cpp/src/parquet/arrow/arrow_schema_test.cc | 2 +- cpp/src/parquet/arrow/reader_internal.cc | 122 +++++++++---- cpp/src/parquet/arrow/test_util.h | 130 +++++++++++++- cpp/src/parquet/column_writer.cc | 42 +++-- 7 files changed, 358 insertions(+), 115 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 5067298858132..0d13316001906 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -555,6 +555,7 @@ KernelInit GetHashInit(Type::type type_id) { case Type::DATE32: case Type::TIME32: case Type::INTERVAL_MONTHS: + case Type::DECIMAL32: return HashInit>; case Type::INT64: case Type::UINT64: @@ -564,6 +565,7 @@ KernelInit GetHashInit(Type::type type_id) { case Type::TIMESTAMP: case Type::DURATION: case Type::INTERVAL_DAY_TIME: + case Type::DECIMAL64: return HashInit>; case Type::BINARY: case Type::STRING: @@ 
-707,7 +709,7 @@ void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType out_ty) DCHECK_OK(func->AddKernel(base)); } - for (auto t : {Type::DECIMAL128, Type::DECIMAL256}) { + for (auto t : {Type::DECIMAL32, Type::DECIMAL64, Type::DECIMAL128, Type::DECIMAL256}) { base.init = GetHashInit(t); base.signature = KernelSignature::Make({t}, out_ty); DCHECK_OK(func->AddKernel(base)); diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index b265673e23c86..b902352ee51e8 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -308,7 +308,8 @@ std::shared_ptr MakeIndicesNonZeroFunction(std::string name, AddKernels(NumericTypes()); AddKernels({boolean()}); - for (const auto& ty : {Type::DECIMAL128, Type::DECIMAL256}) { + for (const auto& ty : + {Type::DECIMAL32, Type::DECIMAL64, Type::DECIMAL128, Type::DECIMAL256}) { kernel.signature = KernelSignature::Make({ty}, uint64()); DCHECK_OK(func->AddKernel(kernel)); } diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 27cb849365ca7..0990c73494737 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -181,6 +181,14 @@ std::shared_ptr get_logical_type(const DataType& type) { static_cast(type); return get_logical_type(*dict_type.value_type()); } + case ArrowId::DECIMAL32: { + const auto& dec_type = static_cast(type); + return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); + } + case ArrowId::DECIMAL64: { + const auto& dec_type = static_cast(type); + return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); + } case ArrowId::DECIMAL128: { const auto& dec_type = static_cast(type); return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); @@ -206,9 +214,11 @@ ParquetType::type get_physical_type(const DataType& type) { case ArrowId::INT16: case ArrowId::UINT32: case ArrowId::INT32: + case ArrowId::DECIMAL32: return ParquetType::INT32; case ArrowId::UINT64: case ArrowId::INT64: + case ArrowId::DECIMAL64: return ParquetType::INT64; case ArrowId::FLOAT: return ParquetType::FLOAT; @@ -533,6 +543,8 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, case ::arrow::Type::HALF_FLOAT: byte_width = sizeof(::arrow::HalfFloatType::c_type); break; + case ::arrow::Type::DECIMAL32: + case ::arrow::Type::DECIMAL64: case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(values_type); @@ -548,6 +560,8 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, case ::arrow::Type::HALF_FLOAT: byte_width = sizeof(::arrow::HalfFloatType::c_type); break; + case ::arrow::Type::DECIMAL32: + case ::arrow::Type::DECIMAL64: case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(type); @@ -783,21 +797,55 @@ class TestReadDecimals : public ParquetIOTestBase { // The Decimal roundtrip tests always go through the FixedLenByteArray path, // check the ByteArray case manually. 
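The big-endian byte strings used as test vectors in the diff below are two's-complement integers: {1, 226, 64} is 0x01E240 = 123456 (read back as "123.456" at scale 3) and {255, 254, 29, 192} is the 4-byte encoding of -123456. A minimal decoding sketch for vectors of up to 8 bytes, in plain C++ with an illustrative helper name (not taken from the patch):

#include <cstdint>
#include <vector>

// Decode a big-endian two's-complement byte string into a signed integer,
// which is what the BYTE_ARRAY decimal tests below expect the reader to do
// before applying the decimal scale.
int64_t DecodeBigEndianTwosComplement(const std::vector<uint8_t>& bytes) {
  uint64_t acc = (bytes[0] & 0x80) ? ~uint64_t{0} : 0;  // sign-extend from the MSB
  for (uint8_t b : bytes) {
    acc = (acc << 8) | b;  // shift in one byte at a time
  }
  // {1, 226, 64} -> 123456; {255, 254, 29, 192} -> -123456
  return static_cast<int64_t>(acc);
}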
-TEST_F(TestReadDecimals, Decimal128ByteArray) { +TEST_F(TestReadDecimals, Decimal32ByteArray) { const std::vector> big_endian_decimals = { // 123456 {1, 226, 64}, // 987654 {15, 18, 6}, // -123456 - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192}, + {255, 254, 29, 192}, }; auto expected = - ArrayFromJSON(::arrow::decimal128(6, 3), R"(["123.456", "987.654", "-123.456"])"); + ArrayFromJSON(::arrow::decimal32(6, 3), R"(["123.456", "987.654", "-123.456"])"); CheckReadFromByteArrays(LogicalType::Decimal(6, 3), big_endian_decimals, *expected); } +TEST_F(TestReadDecimals, Decimal64ByteArray) { + const std::vector> big_endian_decimals = { + // 123456 + {1, 226, 64}, + // 987654 + {15, 18, 6}, + // -123456 + {255, 254, 29, 192}, + // -123456 + {255, 255, 255, 255, 255, 254, 29, 192}, + }; + + auto expected = ArrayFromJSON(::arrow::decimal64(16, 3), + R"(["123.456", "987.654", "-123.456", "-123.456"])"); + CheckReadFromByteArrays(LogicalType::Decimal(16, 3), big_endian_decimals, *expected); +} + +TEST_F(TestReadDecimals, Decimal128ByteArray) { + const std::vector> big_endian_decimals = { + // 123456 + {1, 226, 64}, + // 987654 + {15, 18, 6}, + // -123456 + {255, 254, 29, 192}, + // -123456 + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192}, + }; + + auto expected = ArrayFromJSON(::arrow::decimal128(20, 3), + R"(["123.456", "987.654", "-123.456", "-123.456"])"); + CheckReadFromByteArrays(LogicalType::Decimal(20, 3), big_endian_decimals, *expected); +} + TEST_F(TestReadDecimals, Decimal256ByteArray) { const std::vector> big_endian_decimals = { // 123456 @@ -805,12 +853,14 @@ TEST_F(TestReadDecimals, Decimal256ByteArray) { // 987654 {15, 18, 6}, // -123456 + {255, 254, 29, 192}, + // -123456 {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192}, }; - auto expected = - ArrayFromJSON(::arrow::decimal256(40, 3), R"(["123.456", "987.654", "-123.456"])"); + auto expected = ArrayFromJSON(::arrow::decimal256(40, 3), + R"(["123.456", "987.654", "-123.456", "-123.456"])"); CheckReadFromByteArrays(LogicalType::Decimal(40, 3), big_endian_decimals, *expected); } @@ -858,9 +908,9 @@ typedef ::testing::Types< ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::Date32Type, ::arrow::FloatType, ::arrow::DoubleType, ::arrow::StringType, ::arrow::BinaryType, ::arrow::FixedSizeBinaryType, ::arrow::HalfFloatType, - Decimal128WithPrecisionAndScale<1>, Decimal128WithPrecisionAndScale<5>, - Decimal128WithPrecisionAndScale<10>, Decimal128WithPrecisionAndScale<19>, - Decimal128WithPrecisionAndScale<23>, Decimal128WithPrecisionAndScale<27>, + Decimal32WithPrecisionAndScale<1>, Decimal32WithPrecisionAndScale<5>, + Decimal64WithPrecisionAndScale<10>, Decimal64WithPrecisionAndScale<18>, + Decimal128WithPrecisionAndScale<19>, Decimal128WithPrecisionAndScale<27>, Decimal128WithPrecisionAndScale<38>, Decimal256WithPrecisionAndScale<39>, Decimal256WithPrecisionAndScale<56>, Decimal256WithPrecisionAndScale<76>> TestTypes; @@ -903,8 +953,9 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { std::shared_ptr table = MakeSimpleTable(values, false); this->ResetSink(); - ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, - values->length(), default_writer_properties())); + ASSERT_OK_NO_THROW(WriteTable( + *table, ::arrow::default_memory_pool(), this->sink_, values->length(), + 
::parquet::WriterProperties::Builder().enable_store_decimal_as_integer()->build())); std::shared_ptr
out; std::unique_ptr reader; @@ -2944,7 +2995,7 @@ TEST(ArrowReadWrite, Decimal256) { using ::arrow::Decimal256; using ::arrow::field; - auto type = ::arrow::decimal256(8, 4); + auto type = ::arrow::decimal256(48, 4); const char* json = R"(["1.0000", null, "-1.2345", "-1000.5678", "-9999.9999", "9999.9999"])"; @@ -2958,7 +3009,7 @@ TEST(ArrowReadWrite, DecimalStats) { using ::arrow::Decimal128; using ::arrow::field; - auto type = ::arrow::decimal128(/*precision=*/8, /*scale=*/0); + auto type = ::arrow::decimal128(/*precision=*/28, /*scale=*/0); const char* json = R"(["255", "128", null, "0", "1", "-127", "-128", "-129", "-255"])"; auto array = ::arrow::ArrayFromJSON(type, json); @@ -3447,8 +3498,8 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { types.push_back(::arrow::duration(::arrow::TimeUnit::MILLI)); types.push_back(::arrow::duration(::arrow::TimeUnit::MICRO)); types.push_back(::arrow::duration(::arrow::TimeUnit::NANO)); - types.push_back(::arrow::decimal128(3, 2)); - types.push_back(::arrow::decimal256(3, 2)); + types.push_back(::arrow::decimal32(3, 2)); + types.push_back(::arrow::decimal128(23, 2)); types.push_back(::arrow::fixed_size_binary(4)); // Note large variants of types appear to get converted back to regular on read types.push_back(::arrow::dictionary(::arrow::int32(), ::arrow::binary())); @@ -3500,9 +3551,8 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { ByteArray("\x0f\x12\x06"), // 987654 }; const std::vector int32_values = {123456, 987654}; - const std::vector int64_values = {123456, 987654}; - const auto inner_type = ::arrow::decimal128(6, 3); + const auto inner_type = ::arrow::decimal32(6, 3); auto inner_field = ::arrow::field("inner", inner_type, /*nullable=*/false); auto type = ::arrow::struct_({inner_field}); auto field = ::arrow::field("outer", type, /*nullable=*/true); @@ -3512,7 +3562,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { ::arrow::StructArray::Make({inner}, {inner_field}, null_bitmap)); auto table = ::arrow::Table::Make(::arrow::schema({field}), {array}); - for (const auto& encoding : {Type::BYTE_ARRAY, Type::INT32, Type::INT64}) { + for (const auto& encoding : {Type::BYTE_ARRAY, Type::INT32}) { // Manually write out file based on encoding type ARROW_SCOPED_TRACE("Encoding decimals as ", encoding); auto parquet_schema = GroupNode::Make( @@ -3543,12 +3593,6 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { int32_values.data()); break; } - case Type::INT64: { - auto typed_writer = checked_cast(column_writer); - typed_writer->WriteBatch(4, def_levels.data(), /*rep_levels=*/nullptr, - int64_values.data()); - break; - } default: FAIL() << "Invalid encoding"; return; @@ -3562,11 +3606,11 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { } } -TEST(ArrowReadWrite, Decimal256AsInt) { +TEST(ArrowReadWrite, Decimal32AsInt) { using ::arrow::Decimal256; using ::arrow::field; - auto type = ::arrow::decimal256(8, 4); + auto type = ::arrow::decimal32(8, 4); const char* json = R"(["1.0000", null, "-1.2345", "-1000.5678", "-9999.9999", "9999.9999"])"; @@ -4059,7 +4103,7 @@ TEST(TestArrowReaderAdHoc, WriteBatchedNestedNullableStringColumn) { // ARROW-10493 std::vector> fields{ ::arrow::field("s", ::arrow::utf8(), /*nullable=*/true), - ::arrow::field("d", ::arrow::decimal128(4, 2), /*nullable=*/true), + ::arrow::field("d", ::arrow::decimal32(4, 2), /*nullable=*/true), ::arrow::field("b", ::arrow::boolean(), /*nullable=*/true), ::arrow::field("i8", ::arrow::int8(), /*nullable=*/true), ::arrow::field("i64", 
::arrow::int64(), /*nullable=*/true)}; @@ -4222,25 +4266,47 @@ TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) { std::shared_ptr expected_array; - ::arrow::Decimal128Builder builder(decimal_type, pool); - - for (int32_t i = 0; i < expected_length; ++i) { - ::arrow::Decimal128 value((i + 1) * 100); - ASSERT_OK(builder.Append(value)); + if (decimal_type->id() == ::arrow::Decimal32Type::type_id) { + ::arrow::Decimal32Builder builder(decimal_type, pool); + for (int32_t i = 0; i < expected_length; ++i) { + ::arrow::Decimal32 value((i + 1) * 100); + ASSERT_OK(builder.Append(value)); + } + ASSERT_OK(builder.Finish(&expected_array)); + } else if (decimal_type->id() == ::arrow::Decimal64Type::type_id) { + ::arrow::Decimal64Builder builder(decimal_type, pool); + for (int32_t i = 0; i < expected_length; ++i) { + ::arrow::Decimal64 value((i + 1) * 100); + ASSERT_OK(builder.Append(value)); + } + ASSERT_OK(builder.Finish(&expected_array)); + } else if (decimal_type->id() == ::arrow::Decimal128Type::type_id) { + ::arrow::Decimal128Builder builder(decimal_type, pool); + for (int32_t i = 0; i < expected_length; ++i) { + ::arrow::Decimal128 value((i + 1) * 100); + ASSERT_OK(builder.Append(value)); + } + ASSERT_OK(builder.Finish(&expected_array)); + } else { + ::arrow::Decimal256Builder builder(decimal_type, pool); + for (int32_t i = 0; i < expected_length; ++i) { + ::arrow::Decimal256 value((i + 1) * 100); + ASSERT_OK(builder.Append(value)); + } + ASSERT_OK(builder.Finish(&expected_array)); } - ASSERT_OK(builder.Finish(&expected_array)); + AssertArraysEqual(*expected_array, *chunk); } INSTANTIATE_TEST_SUITE_P( ReadDecimals, TestArrowReaderAdHocSparkAndHvr, ::testing::Values( - std::make_tuple("int32_decimal.parquet", ::arrow::decimal128(4, 2)), - std::make_tuple("int64_decimal.parquet", ::arrow::decimal128(10, 2)), + std::make_tuple("int32_decimal.parquet", ::arrow::decimal32(4, 2)), + std::make_tuple("int64_decimal.parquet", ::arrow::decimal64(10, 2)), std::make_tuple("fixed_length_decimal.parquet", ::arrow::decimal128(25, 2)), - std::make_tuple("fixed_length_decimal_legacy.parquet", - ::arrow::decimal128(13, 2)), - std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal128(4, 2)))); + std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal64(13, 2)), + std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal32(4, 2)))); TEST(TestArrowReaderAdHoc, ReadFloat16Files) { using ::arrow::util::Float16; @@ -5264,33 +5330,17 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { this->ReaderFromSink(&reader); this->ReadSingleColumnFile(std::move(reader), &out); - // Reader always read values as DECIMAL128 type - ASSERT_EQ(out->type()->id(), ::arrow::Type::DECIMAL128); - - if (values.type()->id() == ::arrow::Type::DECIMAL128) { - AssertArraysEqual(values, *out); - } else { - auto& expected_values = dynamic_cast(values); - auto& read_values = dynamic_cast(*out); - ASSERT_EQ(expected_values.length(), read_values.length()); - ASSERT_EQ(expected_values.null_count(), read_values.null_count()); - ASSERT_EQ(expected_values.length(), read_values.length()); - for (int64_t i = 0; i < expected_values.length(); ++i) { - ASSERT_EQ(expected_values.IsNull(i), read_values.IsNull(i)); - if (!expected_values.IsNull(i)) { - ASSERT_EQ(::arrow::Decimal256(expected_values.Value(i)).ToString(0), - ::arrow::Decimal128(read_values.Value(i)).ToString(0)); - } - } - } + ASSERT_EQ(out->type()->id(), TestType::type_id); + AssertArraysEqual(values, *out); } }; typedef ::testing::Types< - 
Decimal128WithPrecisionAndScale<1>, Decimal128WithPrecisionAndScale<5>, - Decimal128WithPrecisionAndScale<10>, Decimal128WithPrecisionAndScale<18>, - Decimal256WithPrecisionAndScale<1>, Decimal256WithPrecisionAndScale<5>, - Decimal256WithPrecisionAndScale<10>, Decimal256WithPrecisionAndScale<18>> + Decimal32WithPrecisionAndScale<1>, Decimal32WithPrecisionAndScale<5>, + Decimal64WithPrecisionAndScale<10>, Decimal64WithPrecisionAndScale<18>, + Decimal128WithPrecisionAndScale<19>, Decimal128WithPrecisionAndScale<27>, + Decimal128WithPrecisionAndScale<38>, Decimal256WithPrecisionAndScale<39>, + Decimal256WithPrecisionAndScale<56>, Decimal256WithPrecisionAndScale<76>> DecimalTestTypes; TYPED_TEST_SUITE(TestIntegerAnnotateDecimalTypeParquetIO, DecimalTestTypes); diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 10431c3a11813..4de4e1d3299f9 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -67,7 +67,7 @@ const auto TIMESTAMP_MS = ::arrow::timestamp(TimeUnit::MILLI); const auto TIMESTAMP_US = ::arrow::timestamp(TimeUnit::MICRO); const auto TIMESTAMP_NS = ::arrow::timestamp(TimeUnit::NANO); const auto BINARY = ::arrow::binary(); -const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal128Type>(8, 4); +const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal32Type>(8, 4); class TestConvertParquetSchema : public ::testing::Test { public: diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 59fe2b4600209..2a2c392eb02bc 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -69,6 +69,12 @@ using arrow::Decimal128Type; using arrow::Decimal256; using arrow::Decimal256Array; using arrow::Decimal256Type; +using arrow::Decimal32; +using arrow::Decimal32Array; +using arrow::Decimal32Type; +using arrow::Decimal64; +using arrow::Decimal64Array; +using arrow::Decimal64Type; using arrow::Field; using arrow::Int32Array; using arrow::ListArray; @@ -211,25 +217,35 @@ Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics, std::shared_ptr<::arrow::Scalar>* max) { const DecimalLogicalType& decimal_type = checked_cast(logical_type); - - Result> maybe_type = - Decimal128Type::Make(decimal_type.precision(), decimal_type.scale()); + auto precision = decimal_type.precision(); + auto scale = decimal_type.scale(); std::shared_ptr arrow_type; - if (maybe_type.ok()) { - arrow_type = maybe_type.ValueOrDie(); + + if (precision <= Decimal32Type::kMaxPrecision) { + ARROW_ASSIGN_OR_RAISE(arrow_type, Decimal32Type::Make(precision, scale)); + ARROW_ASSIGN_OR_RAISE( + *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); + ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), + std::move(arrow_type))); + } else if (precision <= Decimal64Type::kMaxPrecision) { + ARROW_ASSIGN_OR_RAISE(arrow_type, Decimal64Type::Make(precision, scale)); + ARROW_ASSIGN_OR_RAISE( + *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); + ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), + std::move(arrow_type))); + } else if (precision <= Decimal128Type::kMaxPrecision) { + ARROW_ASSIGN_OR_RAISE(arrow_type, Decimal128Type::Make(precision, scale)); ARROW_ASSIGN_OR_RAISE( *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), std::move(arrow_type))); - return Status::OK(); + } else { + 
ARROW_ASSIGN_OR_RAISE(arrow_type, Decimal256Type::Make(precision, scale)); + ARROW_ASSIGN_OR_RAISE( + *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); + ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), + std::move(arrow_type))); } - // Fallback to see if Decimal256 can represent the type. - ARROW_ASSIGN_OR_RAISE( - arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale())); - ARROW_ASSIGN_OR_RAISE( - *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); - ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), - std::move(arrow_type))); return Status::OK(); } @@ -574,7 +590,8 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool, } // ---------------------------------------------------------------------- -// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 || Decimal256 +// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY +// -> Decimal32 || Decimal64 || Decimal128 || Decimal256 template Status RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width, @@ -587,6 +604,16 @@ Status RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width, template struct DecimalTypeTrait; +template <> +struct DecimalTypeTrait<::arrow::Decimal32Array> { + using value = ::arrow::Decimal32; +}; + +template <> +struct DecimalTypeTrait<::arrow::Decimal64Array> { + using value = ::arrow::Decimal64; +}; + template <> struct DecimalTypeTrait<::arrow::Decimal128Array> { using value = ::arrow::Decimal128; @@ -705,7 +732,7 @@ struct DecimalConverter { } }; -/// \brief Convert an Int32 or Int64 array into a Decimal128Array +/// \brief Convert an Int32 or Int64 array into a Decimal32Array or Decimal64Array /// The parquet spec allows systems to write decimals in int32, int64 if the values are /// small enough to fit in less 4 bytes or less than 8 bytes, respectively. /// This function implements the conversion from int32 and int64 arrays to decimal arrays. @@ -715,10 +742,10 @@ template < std::is_same::value>> static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, const std::shared_ptr& field, Datum* out) { - // Decimal128 and Decimal256 are only Arrow constructs. Parquet does not + // Decimal32 and Decimal64 are only Arrow constructs. Parquet does not // specifically distinguish between decimal byte widths. 
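  // Per the schema mapping elsewhere in this series (schema_internal.cc), precision <= 9
  // now maps to Decimal32, <= 18 to Decimal64, <= 38 to Decimal128, and anything wider
  // to Decimal256, so the INT32/INT64-backed decimal paths handled here only ever see
  // the two narrow Arrow types.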
- DCHECK(field->type()->id() == ::arrow::Type::DECIMAL128 || - field->type()->id() == ::arrow::Type::DECIMAL256); + DCHECK(field->type()->id() == ::arrow::Type::DECIMAL32 || + field->type()->id() == ::arrow::Type::DECIMAL64); const int64_t length = reader->values_written(); @@ -741,11 +768,11 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, // sign/zero extend int32_t values, otherwise a no-op const auto value = static_cast(values[i]); - if constexpr (std::is_same_v) { - ::arrow::Decimal128 decimal(value); + if constexpr (std::is_same_v) { + ::arrow::Decimal32 decimal(value); decimal.ToBytes(out_ptr); } else { - ::arrow::Decimal256 decimal(value); + ::arrow::Decimal64 decimal(value); decimal.ToBytes(out_ptr); } } @@ -884,40 +911,63 @@ Status TransferColumnData(RecordReader* reader, } RETURN_NOT_OK(TransferHalfFloat(reader, pool, value_field, &result)); } break; - case ::arrow::Type::DECIMAL128: { + case ::arrow::Type::DECIMAL32: { switch (descr->physical_type()) { case ::parquet::Type::INT32: { - auto fn = DecimalIntegerTransfer; + auto fn = DecimalIntegerTransfer; + RETURN_NOT_OK(fn(reader, pool, value_field, &result)); + } break; + case ::parquet::Type::BYTE_ARRAY: { + auto fn = &TransferDecimal; + RETURN_NOT_OK(fn(reader, pool, value_field, &result)); + } break; + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: { + auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); } break; + default: + return Status::Invalid( + "Physical type for decimal32 must be int32, byte array, or fixed length " + "binary"); + } + } break; + case ::arrow::Type::DECIMAL64: { + switch (descr->physical_type()) { case ::parquet::Type::INT64: { - auto fn = &DecimalIntegerTransfer; + auto fn = DecimalIntegerTransfer; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); } break; case ::parquet::Type::BYTE_ARRAY: { - auto fn = &TransferDecimal; + auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); } break; case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: { - auto fn = &TransferDecimal; + auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); } break; default: return Status::Invalid( - "Physical type for decimal128 must be int32, int64, byte array, or fixed " - "length binary"); + "Physical type for decimal64 must be int64, byte array, or fixed length " + "binary"); } } break; - case ::arrow::Type::DECIMAL256: + case ::arrow::Type::DECIMAL128: { switch (descr->physical_type()) { - case ::parquet::Type::INT32: { - auto fn = DecimalIntegerTransfer; + case ::parquet::Type::BYTE_ARRAY: { + auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); } break; - case ::parquet::Type::INT64: { - auto fn = &DecimalIntegerTransfer; + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: { + auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); } break; + default: + return Status::Invalid( + "Physical type for decimal128 must be byte array, or fixed length binary"); + } + } break; + case ::arrow::Type::DECIMAL256: { + switch (descr->physical_type()) { case ::parquet::Type::BYTE_ARRAY: { auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); @@ -928,11 +978,9 @@ Status TransferColumnData(RecordReader* reader, } break; default: return Status::Invalid( - "Physical type for decimal256 must be int32, int64, byte array, or fixed " - "length binary"); + "Physical type for decimal256 must be byte array, or fixed length binary"); } - break; - + } break; case 
::arrow::Type::TIMESTAMP: { const ::arrow::TimestampType& timestamp_type = checked_cast<::arrow::TimestampType&>(*value_field->type()); diff --git a/cpp/src/parquet/arrow/test_util.h b/cpp/src/parquet/arrow/test_util.h index cfc57ce6ea743..8659c6f3f8bdc 100644 --- a/cpp/src/parquet/arrow/test_util.h +++ b/cpp/src/parquet/arrow/test_util.h @@ -47,9 +47,35 @@ using ::arrow::Array; using ::arrow::ChunkedArray; using ::arrow::Status; +template +struct Decimal32WithPrecisionAndScale { + static_assert(PRECISION >= ::arrow::Decimal32Type::kMinPrecision && + PRECISION <= ::arrow::Decimal32Type::kMaxPrecision, + "Invalid precision value"); + + using type = ::arrow::Decimal32Type; + static constexpr ::arrow::Type::type type_id = ::arrow::Decimal32Type::type_id; + static constexpr int32_t precision = PRECISION; + static constexpr int32_t scale = PRECISION - 1; +}; + +template +struct Decimal64WithPrecisionAndScale { + static_assert(PRECISION >= ::arrow::Decimal64Type::kMinPrecision && + PRECISION <= ::arrow::Decimal64Type::kMaxPrecision, + "Invalid precision value"); + + using type = ::arrow::Decimal64Type; + static constexpr ::arrow::Type::type type_id = ::arrow::Decimal64Type::type_id; + static constexpr int32_t precision = PRECISION; + static constexpr int32_t scale = PRECISION - 1; +}; + template struct Decimal128WithPrecisionAndScale { - static_assert(PRECISION >= 1 && PRECISION <= 38, "Invalid precision value"); + static_assert(PRECISION >= ::arrow::Decimal128Type::kMinPrecision && + PRECISION <= ::arrow::Decimal128Type::kMaxPrecision, + "Invalid precision value"); using type = ::arrow::Decimal128Type; static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id; @@ -59,7 +85,9 @@ struct Decimal128WithPrecisionAndScale { template struct Decimal256WithPrecisionAndScale { - static_assert(PRECISION >= 1 && PRECISION <= 76, "Invalid precision value"); + static_assert(PRECISION >= ::arrow::Decimal256Type::kMinPrecision && + PRECISION <= ::arrow::Decimal256Type::kMaxPrecision, + "Invalid precision value"); using type = ::arrow::Decimal256Type; static constexpr ::arrow::Type::type type_id = ::arrow::Decimal256Type::type_id; @@ -154,6 +182,50 @@ static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t std::memcpy(out, decimals->data()->GetValues(1, 0), byte_width * n); } +template +::arrow::enable_if_t< + std::is_same>::value, Status> +NonNullArray(size_t size, std::shared_ptr* out) { + constexpr int32_t kDecimalPrecision = precision; + constexpr int32_t kDecimalScale = Decimal32WithPrecisionAndScale::scale; + + const auto type = ::arrow::decimal32(kDecimalPrecision, kDecimalScale); + ::arrow::Decimal32Builder builder(type); + const int32_t byte_width = + static_cast(*type).byte_width(); + + constexpr int32_t seed = 0; + + ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width)); + random_decimals<::arrow::Decimal32Type::kByteWidth>(size, seed, kDecimalPrecision, + out_buf->mutable_data()); + + RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size)); + return builder.Finish(out); +} + +template +::arrow::enable_if_t< + std::is_same>::value, Status> +NonNullArray(size_t size, std::shared_ptr* out) { + constexpr int32_t kDecimalPrecision = precision; + constexpr int32_t kDecimalScale = Decimal64WithPrecisionAndScale::scale; + + const auto type = ::arrow::decimal64(kDecimalPrecision, kDecimalScale); + ::arrow::Decimal64Builder builder(type); + const int32_t byte_width = + static_cast(*type).byte_width(); + + constexpr int32_t seed = 0; + 
+ ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width)); + random_decimals<::arrow::Decimal64Type::kByteWidth>(size, seed, kDecimalPrecision, + out_buf->mutable_data()); + + RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size)); + return builder.Finish(out); +} + template ::arrow::enable_if_t< std::is_same>::value, Status> @@ -343,6 +415,60 @@ ::arrow::enable_if_fixed_size_binary NullableArray( return builder.Finish(out); } +template +::arrow::enable_if_t< + std::is_same>::value, Status> +NullableArray(size_t size, size_t num_nulls, uint32_t seed, + std::shared_ptr<::arrow::Array>* out) { + std::vector valid_bytes(size, '\1'); + + for (size_t i = 0; i < num_nulls; ++i) { + valid_bytes[i * 2] = '\0'; + } + + constexpr int32_t kDecimalPrecision = precision; + constexpr int32_t kDecimalScale = Decimal32WithPrecisionAndScale::scale; + const auto type = ::arrow::decimal32(kDecimalPrecision, kDecimalScale); + const int32_t byte_width = + static_cast(*type).byte_width(); + + ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width)); + + random_decimals<::arrow::Decimal32Type::kByteWidth>(size, seed, precision, + out_buf->mutable_data()); + + ::arrow::Decimal32Builder builder(type); + RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data())); + return builder.Finish(out); +} + +template +::arrow::enable_if_t< + std::is_same>::value, Status> +NullableArray(size_t size, size_t num_nulls, uint32_t seed, + std::shared_ptr<::arrow::Array>* out) { + std::vector valid_bytes(size, '\1'); + + for (size_t i = 0; i < num_nulls; ++i) { + valid_bytes[i * 2] = '\0'; + } + + constexpr int32_t kDecimalPrecision = precision; + constexpr int32_t kDecimalScale = Decimal64WithPrecisionAndScale::scale; + const auto type = ::arrow::decimal64(kDecimalPrecision, kDecimalScale); + const int32_t byte_width = + static_cast(*type).byte_width(); + + ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width)); + + random_decimals<::arrow::Decimal64Type::kByteWidth>(size, seed, precision, + out_buf->mutable_data()); + + ::arrow::Decimal64Builder builder(type); + RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data())); + return builder.Finish(out); +} + template ::arrow::enable_if_t< std::is_same>::value, Status> diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 118dca7cf3d9b..defac2624239c 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2377,7 +2377,7 @@ struct SerializeFunctor< } // Parquet's Decimal are stored with FixedLength values where the length is - // proportional to the precision. Arrow's Decimal are always stored with 16/32 + // proportional to the precision. Arrow's Decimal are always stored with 4/8/16/32 // bytes. Thus the internal FLBA pointer must be adjusted by the offset calculated // here. 
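  // For reference, the FLBA width chosen for a given precision (DecimalType::DecimalSize)
  // is the smallest byte count that can hold a signed value of that many decimal digits,
  // roughly ceil((precision * log2(10) + 1) / 8): precision 1 -> 1 byte, 8 -> 4, 16 -> 7,
  // 32 -> 14, 73 -> 31, matching the physical lengths expected by the schema tests above,
  // while the Arrow buffers stay at the fixed 4/8/16/32-byte widths noted here.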
int32_t Offset(const Array& array) { @@ -2391,29 +2391,45 @@ struct SerializeFunctor< int64_t non_null_count = array.length() - array.null_count(); int64_t size = non_null_count * ArrowType::kByteWidth; scratch_buffer = AllocateBuffer(ctx->memory_pool, size); - scratch = reinterpret_cast(scratch_buffer->mutable_data()); + scratch_i32 = reinterpret_cast(scratch_buffer->mutable_data()); + scratch_i64 = reinterpret_cast(scratch_buffer->mutable_data()); } template FixedLenByteArray FixDecimalEndianness(const uint8_t* in, int64_t offset) { + static_assert(byte_width == ::arrow::Decimal32Type::kByteWidth || + byte_width == ::arrow::Decimal64Type::kByteWidth || + byte_width == ::arrow::Decimal128Type::kByteWidth || + byte_width == ::arrow::Decimal256Type::kByteWidth, + "only 4/8/16/32 byte Decimals supported"); + + if constexpr (byte_width == ::arrow::Decimal32Type::kByteWidth) { + const auto* u32_in = reinterpret_cast(in); + auto out = reinterpret_cast(scratch_i32) + offset; + *scratch_i32++ = ::arrow::bit_util::ToBigEndian(u32_in[0]); + return FixedLenByteArray(out); + } + const auto* u64_in = reinterpret_cast(in); - auto out = reinterpret_cast(scratch) + offset; - static_assert(byte_width == 16 || byte_width == 32, - "only 16 and 32 byte Decimals supported"); - if (byte_width == 32) { - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[3]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[2]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + auto out = reinterpret_cast(scratch_i64) + offset; + if constexpr (byte_width == ::arrow::Decimal64Type::kByteWidth) { + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + } else if constexpr (byte_width == ::arrow::Decimal128Type::kByteWidth) { + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } else { - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[3]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[2]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } + return FixedLenByteArray(out); } std::shared_ptr scratch_buffer; - int64_t* scratch; + int32_t* scratch_i32; + int64_t* scratch_i64; }; // ---------------------------------------------------------------------- From 290de24a41d3f4236ede206cbe9ffa3f619b4e79 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 25 Jan 2025 19:35:07 -0500 Subject: [PATCH 05/20] Pyarrow parquet to pandas --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 2 ++ python/pyarrow/tests/parquet/test_basic.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index a0f1d5bbbed8b..335fbbdd0f8ca 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -2123,6 +2123,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::STRUCT: // fall through case Type::TIME32: // fall through case Type::TIME64: // fall through + case Type::DECIMAL32: // fall through + case Type::DECIMAL64: // fall through case Type::DECIMAL128: // fall through case Type::DECIMAL256: // fall through case 
Type::INTERVAL_MONTH_DAY_NANO: // fall through diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 6496aa99092b8..1c5add3388a1b 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -364,9 +364,9 @@ def test_byte_stream_split(): def test_store_decimal_as_integer(tempdir): arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))), - type=pa.decimal128(5, 2)) + type=pa.decimal32(5, 2)) arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))), - type=pa.decimal128(16, 9)) + type=pa.decimal64(16, 9)) arr_decimal_gt18 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(22, 2)) arr_bool = pa.array([True, False] * 50) From e5b996e63a4b47afbd1d2c00ca9590b47f9cff25 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Fri, 14 Feb 2025 22:18:58 -0500 Subject: [PATCH 06/20] Address comments --- .../parquet/arrow/arrow_reader_writer_test.cc | 170 +++++++----------- cpp/src/parquet/arrow/arrow_schema_test.cc | 64 ++----- cpp/src/parquet/arrow/reader_internal.cc | 82 +++++---- cpp/src/parquet/arrow/schema_internal.cc | 47 ++--- cpp/src/parquet/arrow/schema_internal.h | 16 +- cpp/src/parquet/column_writer.cc | 5 + cpp/src/parquet/properties.h | 9 +- python/pyarrow/tests/parquet/test_basic.py | 4 +- 8 files changed, 175 insertions(+), 222 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 0990c73494737..27cb849365ca7 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -181,14 +181,6 @@ std::shared_ptr get_logical_type(const DataType& type) { static_cast(type); return get_logical_type(*dict_type.value_type()); } - case ArrowId::DECIMAL32: { - const auto& dec_type = static_cast(type); - return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); - } - case ArrowId::DECIMAL64: { - const auto& dec_type = static_cast(type); - return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); - } case ArrowId::DECIMAL128: { const auto& dec_type = static_cast(type); return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); @@ -214,11 +206,9 @@ ParquetType::type get_physical_type(const DataType& type) { case ArrowId::INT16: case ArrowId::UINT32: case ArrowId::INT32: - case ArrowId::DECIMAL32: return ParquetType::INT32; case ArrowId::UINT64: case ArrowId::INT64: - case ArrowId::DECIMAL64: return ParquetType::INT64; case ArrowId::FLOAT: return ParquetType::FLOAT; @@ -543,8 +533,6 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, case ::arrow::Type::HALF_FLOAT: byte_width = sizeof(::arrow::HalfFloatType::c_type); break; - case ::arrow::Type::DECIMAL32: - case ::arrow::Type::DECIMAL64: case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(values_type); @@ -560,8 +548,6 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, case ::arrow::Type::HALF_FLOAT: byte_width = sizeof(::arrow::HalfFloatType::c_type); break; - case ::arrow::Type::DECIMAL32: - case ::arrow::Type::DECIMAL64: case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(type); @@ -797,38 +783,6 @@ class TestReadDecimals : public ParquetIOTestBase { // The Decimal roundtrip tests always go through the FixedLenByteArray path, // check the ByteArray case manually. 
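A rough C++ counterpart of the writer setup exercised by test_store_decimal_as_integer above, using the same WriterProperties flag touched elsewhere in this series; the function name, chunk size, and table/sink arguments are placeholders, not code from the patch:

#include "arrow/api.h"
#include "arrow/io/api.h"
#include "parquet/arrow/writer.h"
#include "parquet/properties.h"

// With store_decimal_as_integer enabled, decimal32(5, 2) columns are written with
// physical type INT32 and decimal64(16, 9) with INT64, while wider decimals such as
// decimal128(22, 2) still fall back to FIXED_LEN_BYTE_ARRAY.
arrow::Status WriteDecimalsAsIntegers(
    const std::shared_ptr<arrow::Table>& table,
    const std::shared_ptr<arrow::io::OutputStream>& sink) {
  auto properties =
      parquet::WriterProperties::Builder().enable_store_decimal_as_integer()->build();
  return parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/1024, properties);
}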
-TEST_F(TestReadDecimals, Decimal32ByteArray) { - const std::vector> big_endian_decimals = { - // 123456 - {1, 226, 64}, - // 987654 - {15, 18, 6}, - // -123456 - {255, 254, 29, 192}, - }; - - auto expected = - ArrayFromJSON(::arrow::decimal32(6, 3), R"(["123.456", "987.654", "-123.456"])"); - CheckReadFromByteArrays(LogicalType::Decimal(6, 3), big_endian_decimals, *expected); -} - -TEST_F(TestReadDecimals, Decimal64ByteArray) { - const std::vector> big_endian_decimals = { - // 123456 - {1, 226, 64}, - // 987654 - {15, 18, 6}, - // -123456 - {255, 254, 29, 192}, - // -123456 - {255, 255, 255, 255, 255, 254, 29, 192}, - }; - - auto expected = ArrayFromJSON(::arrow::decimal64(16, 3), - R"(["123.456", "987.654", "-123.456", "-123.456"])"); - CheckReadFromByteArrays(LogicalType::Decimal(16, 3), big_endian_decimals, *expected); -} - TEST_F(TestReadDecimals, Decimal128ByteArray) { const std::vector> big_endian_decimals = { // 123456 @@ -836,14 +790,12 @@ TEST_F(TestReadDecimals, Decimal128ByteArray) { // 987654 {15, 18, 6}, // -123456 - {255, 254, 29, 192}, - // -123456 {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192}, }; - auto expected = ArrayFromJSON(::arrow::decimal128(20, 3), - R"(["123.456", "987.654", "-123.456", "-123.456"])"); - CheckReadFromByteArrays(LogicalType::Decimal(20, 3), big_endian_decimals, *expected); + auto expected = + ArrayFromJSON(::arrow::decimal128(6, 3), R"(["123.456", "987.654", "-123.456"])"); + CheckReadFromByteArrays(LogicalType::Decimal(6, 3), big_endian_decimals, *expected); } TEST_F(TestReadDecimals, Decimal256ByteArray) { @@ -853,14 +805,12 @@ TEST_F(TestReadDecimals, Decimal256ByteArray) { // 987654 {15, 18, 6}, // -123456 - {255, 254, 29, 192}, - // -123456 {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192}, }; - auto expected = ArrayFromJSON(::arrow::decimal256(40, 3), - R"(["123.456", "987.654", "-123.456", "-123.456"])"); + auto expected = + ArrayFromJSON(::arrow::decimal256(40, 3), R"(["123.456", "987.654", "-123.456"])"); CheckReadFromByteArrays(LogicalType::Decimal(40, 3), big_endian_decimals, *expected); } @@ -908,9 +858,9 @@ typedef ::testing::Types< ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::Date32Type, ::arrow::FloatType, ::arrow::DoubleType, ::arrow::StringType, ::arrow::BinaryType, ::arrow::FixedSizeBinaryType, ::arrow::HalfFloatType, - Decimal32WithPrecisionAndScale<1>, Decimal32WithPrecisionAndScale<5>, - Decimal64WithPrecisionAndScale<10>, Decimal64WithPrecisionAndScale<18>, - Decimal128WithPrecisionAndScale<19>, Decimal128WithPrecisionAndScale<27>, + Decimal128WithPrecisionAndScale<1>, Decimal128WithPrecisionAndScale<5>, + Decimal128WithPrecisionAndScale<10>, Decimal128WithPrecisionAndScale<19>, + Decimal128WithPrecisionAndScale<23>, Decimal128WithPrecisionAndScale<27>, Decimal128WithPrecisionAndScale<38>, Decimal256WithPrecisionAndScale<39>, Decimal256WithPrecisionAndScale<56>, Decimal256WithPrecisionAndScale<76>> TestTypes; @@ -953,9 +903,8 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { std::shared_ptr
table = MakeSimpleTable(values, false); this->ResetSink(); - ASSERT_OK_NO_THROW(WriteTable( - *table, ::arrow::default_memory_pool(), this->sink_, values->length(), - ::parquet::WriterProperties::Builder().enable_store_decimal_as_integer()->build())); + ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, + values->length(), default_writer_properties())); std::shared_ptr
out; std::unique_ptr reader; @@ -2995,7 +2944,7 @@ TEST(ArrowReadWrite, Decimal256) { using ::arrow::Decimal256; using ::arrow::field; - auto type = ::arrow::decimal256(48, 4); + auto type = ::arrow::decimal256(8, 4); const char* json = R"(["1.0000", null, "-1.2345", "-1000.5678", "-9999.9999", "9999.9999"])"; @@ -3009,7 +2958,7 @@ TEST(ArrowReadWrite, DecimalStats) { using ::arrow::Decimal128; using ::arrow::field; - auto type = ::arrow::decimal128(/*precision=*/28, /*scale=*/0); + auto type = ::arrow::decimal128(/*precision=*/8, /*scale=*/0); const char* json = R"(["255", "128", null, "0", "1", "-127", "-128", "-129", "-255"])"; auto array = ::arrow::ArrayFromJSON(type, json); @@ -3498,8 +3447,8 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { types.push_back(::arrow::duration(::arrow::TimeUnit::MILLI)); types.push_back(::arrow::duration(::arrow::TimeUnit::MICRO)); types.push_back(::arrow::duration(::arrow::TimeUnit::NANO)); - types.push_back(::arrow::decimal32(3, 2)); - types.push_back(::arrow::decimal128(23, 2)); + types.push_back(::arrow::decimal128(3, 2)); + types.push_back(::arrow::decimal256(3, 2)); types.push_back(::arrow::fixed_size_binary(4)); // Note large variants of types appear to get converted back to regular on read types.push_back(::arrow::dictionary(::arrow::int32(), ::arrow::binary())); @@ -3551,8 +3500,9 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { ByteArray("\x0f\x12\x06"), // 987654 }; const std::vector int32_values = {123456, 987654}; + const std::vector int64_values = {123456, 987654}; - const auto inner_type = ::arrow::decimal32(6, 3); + const auto inner_type = ::arrow::decimal128(6, 3); auto inner_field = ::arrow::field("inner", inner_type, /*nullable=*/false); auto type = ::arrow::struct_({inner_field}); auto field = ::arrow::field("outer", type, /*nullable=*/true); @@ -3562,7 +3512,7 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { ::arrow::StructArray::Make({inner}, {inner_field}, null_bitmap)); auto table = ::arrow::Table::Make(::arrow::schema({field}), {array}); - for (const auto& encoding : {Type::BYTE_ARRAY, Type::INT32}) { + for (const auto& encoding : {Type::BYTE_ARRAY, Type::INT32, Type::INT64}) { // Manually write out file based on encoding type ARROW_SCOPED_TRACE("Encoding decimals as ", encoding); auto parquet_schema = GroupNode::Make( @@ -3593,6 +3543,12 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { int32_values.data()); break; } + case Type::INT64: { + auto typed_writer = checked_cast(column_writer); + typed_writer->WriteBatch(4, def_levels.data(), /*rep_levels=*/nullptr, + int64_values.data()); + break; + } default: FAIL() << "Invalid encoding"; return; @@ -3606,11 +3562,11 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) { } } -TEST(ArrowReadWrite, Decimal32AsInt) { +TEST(ArrowReadWrite, Decimal256AsInt) { using ::arrow::Decimal256; using ::arrow::field; - auto type = ::arrow::decimal32(8, 4); + auto type = ::arrow::decimal256(8, 4); const char* json = R"(["1.0000", null, "-1.2345", "-1000.5678", "-9999.9999", "9999.9999"])"; @@ -4103,7 +4059,7 @@ TEST(TestArrowReaderAdHoc, WriteBatchedNestedNullableStringColumn) { // ARROW-10493 std::vector> fields{ ::arrow::field("s", ::arrow::utf8(), /*nullable=*/true), - ::arrow::field("d", ::arrow::decimal32(4, 2), /*nullable=*/true), + ::arrow::field("d", ::arrow::decimal128(4, 2), /*nullable=*/true), ::arrow::field("b", ::arrow::boolean(), /*nullable=*/true), ::arrow::field("i8", ::arrow::int8(), /*nullable=*/true), ::arrow::field("i64", 
::arrow::int64(), /*nullable=*/true)}; @@ -4266,47 +4222,25 @@ TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) { std::shared_ptr expected_array; - if (decimal_type->id() == ::arrow::Decimal32Type::type_id) { - ::arrow::Decimal32Builder builder(decimal_type, pool); - for (int32_t i = 0; i < expected_length; ++i) { - ::arrow::Decimal32 value((i + 1) * 100); - ASSERT_OK(builder.Append(value)); - } - ASSERT_OK(builder.Finish(&expected_array)); - } else if (decimal_type->id() == ::arrow::Decimal64Type::type_id) { - ::arrow::Decimal64Builder builder(decimal_type, pool); - for (int32_t i = 0; i < expected_length; ++i) { - ::arrow::Decimal64 value((i + 1) * 100); - ASSERT_OK(builder.Append(value)); - } - ASSERT_OK(builder.Finish(&expected_array)); - } else if (decimal_type->id() == ::arrow::Decimal128Type::type_id) { - ::arrow::Decimal128Builder builder(decimal_type, pool); - for (int32_t i = 0; i < expected_length; ++i) { - ::arrow::Decimal128 value((i + 1) * 100); - ASSERT_OK(builder.Append(value)); - } - ASSERT_OK(builder.Finish(&expected_array)); - } else { - ::arrow::Decimal256Builder builder(decimal_type, pool); - for (int32_t i = 0; i < expected_length; ++i) { - ::arrow::Decimal256 value((i + 1) * 100); - ASSERT_OK(builder.Append(value)); - } - ASSERT_OK(builder.Finish(&expected_array)); - } + ::arrow::Decimal128Builder builder(decimal_type, pool); + for (int32_t i = 0; i < expected_length; ++i) { + ::arrow::Decimal128 value((i + 1) * 100); + ASSERT_OK(builder.Append(value)); + } + ASSERT_OK(builder.Finish(&expected_array)); AssertArraysEqual(*expected_array, *chunk); } INSTANTIATE_TEST_SUITE_P( ReadDecimals, TestArrowReaderAdHocSparkAndHvr, ::testing::Values( - std::make_tuple("int32_decimal.parquet", ::arrow::decimal32(4, 2)), - std::make_tuple("int64_decimal.parquet", ::arrow::decimal64(10, 2)), + std::make_tuple("int32_decimal.parquet", ::arrow::decimal128(4, 2)), + std::make_tuple("int64_decimal.parquet", ::arrow::decimal128(10, 2)), std::make_tuple("fixed_length_decimal.parquet", ::arrow::decimal128(25, 2)), - std::make_tuple("fixed_length_decimal_legacy.parquet", ::arrow::decimal64(13, 2)), - std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal32(4, 2)))); + std::make_tuple("fixed_length_decimal_legacy.parquet", + ::arrow::decimal128(13, 2)), + std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal128(4, 2)))); TEST(TestArrowReaderAdHoc, ReadFloat16Files) { using ::arrow::util::Float16; @@ -5330,17 +5264,33 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { this->ReaderFromSink(&reader); this->ReadSingleColumnFile(std::move(reader), &out); - ASSERT_EQ(out->type()->id(), TestType::type_id); - AssertArraysEqual(values, *out); + // Reader always read values as DECIMAL128 type + ASSERT_EQ(out->type()->id(), ::arrow::Type::DECIMAL128); + + if (values.type()->id() == ::arrow::Type::DECIMAL128) { + AssertArraysEqual(values, *out); + } else { + auto& expected_values = dynamic_cast(values); + auto& read_values = dynamic_cast(*out); + ASSERT_EQ(expected_values.length(), read_values.length()); + ASSERT_EQ(expected_values.null_count(), read_values.null_count()); + ASSERT_EQ(expected_values.length(), read_values.length()); + for (int64_t i = 0; i < expected_values.length(); ++i) { + ASSERT_EQ(expected_values.IsNull(i), read_values.IsNull(i)); + if (!expected_values.IsNull(i)) { + ASSERT_EQ(::arrow::Decimal256(expected_values.Value(i)).ToString(0), + ::arrow::Decimal128(read_values.Value(i)).ToString(0)); + } + } + } } }; typedef ::testing::Types< - 
Decimal32WithPrecisionAndScale<1>, Decimal32WithPrecisionAndScale<5>, - Decimal64WithPrecisionAndScale<10>, Decimal64WithPrecisionAndScale<18>, - Decimal128WithPrecisionAndScale<19>, Decimal128WithPrecisionAndScale<27>, - Decimal128WithPrecisionAndScale<38>, Decimal256WithPrecisionAndScale<39>, - Decimal256WithPrecisionAndScale<56>, Decimal256WithPrecisionAndScale<76>> + Decimal128WithPrecisionAndScale<1>, Decimal128WithPrecisionAndScale<5>, + Decimal128WithPrecisionAndScale<10>, Decimal128WithPrecisionAndScale<18>, + Decimal256WithPrecisionAndScale<1>, Decimal256WithPrecisionAndScale<5>, + Decimal256WithPrecisionAndScale<10>, Decimal256WithPrecisionAndScale<18>> DecimalTestTypes; TYPED_TEST_SUITE(TestIntegerAnnotateDecimalTypeParquetIO, DecimalTestTypes); diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 4de4e1d3299f9..535efa0c8e5de 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -67,7 +67,7 @@ const auto TIMESTAMP_MS = ::arrow::timestamp(TimeUnit::MILLI); const auto TIMESTAMP_US = ::arrow::timestamp(TimeUnit::MICRO); const auto TIMESTAMP_NS = ::arrow::timestamp(TimeUnit::NANO); const auto BINARY = ::arrow::binary(); -const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal32Type>(8, 4); +const auto DECIMAL_8_4 = std::make_shared<::arrow::Decimal128Type>(8, 4); class TestConvertParquetSchema : public ::testing::Test { public: @@ -185,13 +185,11 @@ TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) { {"string", LogicalType::String(), ParquetType::BYTE_ARRAY, -1, ::arrow::utf8()}, {"enum", LogicalType::Enum(), ParquetType::BYTE_ARRAY, -1, ::arrow::binary()}, {"decimal(8, 2)", LogicalType::Decimal(8, 2), ParquetType::INT32, -1, - ::arrow::decimal32(8, 2)}, + ::arrow::decimal128(8, 2)}, {"decimal(16, 4)", LogicalType::Decimal(16, 4), ParquetType::INT64, -1, - ::arrow::decimal64(16, 4)}, + ::arrow::decimal128(16, 4)}, {"decimal(32, 8)", LogicalType::Decimal(32, 8), ParquetType::FIXED_LEN_BYTE_ARRAY, 16, ::arrow::decimal128(32, 8)}, - {"decimal(73, 38)", LogicalType::Decimal(73, 38), ParquetType::FIXED_LEN_BYTE_ARRAY, - 31, ::arrow::decimal256(73, 38)}, {"date", LogicalType::Date(), ParquetType::INT32, -1, ::arrow::date32()}, {"time(ms)", LogicalType::Time(true, LogicalType::TimeUnit::MILLIS), ParquetType::INT32, -1, ::arrow::time32(::arrow::TimeUnit::MILLI)}, @@ -972,12 +970,12 @@ class TestConvertArrowSchema : public ::testing::Test { ::arrow::Status ConvertSchema( const std::vector>& fields, std::shared_ptr<::parquet::ArrowWriterProperties> arrow_properties = - ::parquet::default_arrow_writer_properties(), - std::shared_ptr<::parquet::WriterProperties> parquet_properties = - ::parquet::default_writer_properties()) { + ::parquet::default_arrow_writer_properties()) { arrow_schema_ = ::arrow::schema(fields); - return ToParquetSchema(arrow_schema_.get(), *parquet_properties.get(), - *arrow_properties, &result_schema_); + std::shared_ptr<::parquet::WriterProperties> properties = + ::parquet::default_writer_properties(); + return ToParquetSchema(arrow_schema_.get(), *properties.get(), *arrow_properties, + &result_schema_); } protected: @@ -1071,16 +1069,14 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { {"utf8", ::arrow::utf8(), LogicalType::String(), ParquetType::BYTE_ARRAY, -1}, {"large_utf8", ::arrow::large_utf8(), LogicalType::String(), ParquetType::BYTE_ARRAY, -1}, - {"decimal(1, 0)", ::arrow::decimal32(1, 0), LogicalType::Decimal(1, 0), + {"decimal(1, 0)", 
::arrow::decimal128(1, 0), LogicalType::Decimal(1, 0), ParquetType::FIXED_LEN_BYTE_ARRAY, 1}, - {"decimal(8, 2)", ::arrow::decimal32(8, 2), LogicalType::Decimal(8, 2), + {"decimal(8, 2)", ::arrow::decimal128(8, 2), LogicalType::Decimal(8, 2), ParquetType::FIXED_LEN_BYTE_ARRAY, 4}, - {"decimal(16, 4)", ::arrow::decimal64(16, 4), LogicalType::Decimal(16, 4), + {"decimal(16, 4)", ::arrow::decimal128(16, 4), LogicalType::Decimal(16, 4), ParquetType::FIXED_LEN_BYTE_ARRAY, 7}, {"decimal(32, 8)", ::arrow::decimal128(32, 8), LogicalType::Decimal(32, 8), ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, - {"decimal(73, 38)", ::arrow::decimal256(73, 38), LogicalType::Decimal(73, 38), - ParquetType::FIXED_LEN_BYTE_ARRAY, 31}, {"float16", ::arrow::float16(), LogicalType::Float16(), ParquetType::FIXED_LEN_BYTE_ARRAY, 2}, {"time32", ::arrow::time32(::arrow::TimeUnit::MILLI), @@ -1138,44 +1134,6 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { // ASSERT_NO_FATAL_FAILURE(); } -TEST_F(TestConvertArrowSchema, ArrowDecimalFieldsStoredAsInt) { - struct FieldConstructionArguments { - std::string name; - std::shared_ptr<::arrow::DataType> datatype; - std::shared_ptr logical_type; - parquet::Type::type physical_type; - int physical_length; - }; - - std::vector cases = { - {"decimal(1, 0)", ::arrow::decimal32(1, 0), LogicalType::Decimal(1, 0), - ParquetType::INT32, 1}, - {"decimal(8, 2)", ::arrow::decimal32(8, 2), LogicalType::Decimal(8, 2), - ParquetType::INT32, 4}, - {"decimal(16, 4)", ::arrow::decimal64(16, 4), LogicalType::Decimal(16, 4), - ParquetType::INT64, 7}, - {"decimal(32, 8)", ::arrow::decimal128(32, 8), LogicalType::Decimal(32, 8), - ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, - {"decimal(73, 38)", ::arrow::decimal256(73, 38), LogicalType::Decimal(73, 38), - ParquetType::FIXED_LEN_BYTE_ARRAY, 31}}; - - std::vector> arrow_fields; - std::vector parquet_fields; - - for (const FieldConstructionArguments& c : cases) { - arrow_fields.push_back(::arrow::field(c.name, c.datatype, false)); - parquet_fields.push_back(PrimitiveNode::Make(c.name, Repetition::REQUIRED, - c.logical_type, c.physical_type, - c.physical_length)); - } - - auto parquet_properties_builder = ::parquet::WriterProperties::Builder(); - ASSERT_OK(ConvertSchema( - arrow_fields, ::parquet::default_arrow_writer_properties(), - parquet_properties_builder.enable_store_decimal_as_integer()->build())); - CheckFlatSchema(parquet_fields); -} - TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) { struct FieldConstructionArguments { std::string name; diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 2a2c392eb02bc..a0b937e149073 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -28,6 +28,7 @@ #include #include "arrow/array.h" +#include "arrow/array/array_decimal.h" #include "arrow/compute/api.h" #include "arrow/datum.h" #include "arrow/io/memory.h" @@ -37,6 +38,7 @@ #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/base64.h" #include "arrow/util/bit_util.h" @@ -217,35 +219,25 @@ Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics, std::shared_ptr<::arrow::Scalar>* max) { const DecimalLogicalType& decimal_type = checked_cast(logical_type); - auto precision = decimal_type.precision(); - auto scale = decimal_type.scale(); - std::shared_ptr arrow_type; - if (precision <= Decimal32Type::kMaxPrecision) { - ARROW_ASSIGN_OR_RAISE(arrow_type, 
Decimal32Type::Make(precision, scale)); - ARROW_ASSIGN_OR_RAISE( - *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); - ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), - std::move(arrow_type))); - } else if (precision <= Decimal64Type::kMaxPrecision) { - ARROW_ASSIGN_OR_RAISE(arrow_type, Decimal64Type::Make(precision, scale)); - ARROW_ASSIGN_OR_RAISE( - *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); - ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), - std::move(arrow_type))); - } else if (precision <= Decimal128Type::kMaxPrecision) { - ARROW_ASSIGN_OR_RAISE(arrow_type, Decimal128Type::Make(precision, scale)); + Result> maybe_type = + Decimal128Type::Make(decimal_type.precision(), decimal_type.scale()); + std::shared_ptr arrow_type; + if (maybe_type.ok()) { + arrow_type = maybe_type.ValueOrDie(); ARROW_ASSIGN_OR_RAISE( *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), std::move(arrow_type))); - } else { - ARROW_ASSIGN_OR_RAISE(arrow_type, Decimal256Type::Make(precision, scale)); - ARROW_ASSIGN_OR_RAISE( - *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); - ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), - std::move(arrow_type))); + return Status::OK(); } + // Fallback to see if Decimal256 can represent the type. + ARROW_ASSIGN_OR_RAISE( + arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale())); + ARROW_ASSIGN_OR_RAISE( + *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); + ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), + std::move(arrow_type))); return Status::OK(); } @@ -732,7 +724,7 @@ struct DecimalConverter { } }; -/// \brief Convert an Int32 or Int64 array into a Decimal32Array or Decimal64Array +/// \brief Convert an Int32 or Int64 array into a Decimal32/64/128/256Array /// The parquet spec allows systems to write decimals in int32, int64 if the values are /// small enough to fit in less 4 bytes or less than 8 bytes, respectively. /// This function implements the conversion from int32 and int64 arrays to decimal arrays. @@ -745,7 +737,9 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, // Decimal32 and Decimal64 are only Arrow constructs. Parquet does not // specifically distinguish between decimal byte widths. 
DCHECK(field->type()->id() == ::arrow::Type::DECIMAL32 || - field->type()->id() == ::arrow::Type::DECIMAL64); + field->type()->id() == ::arrow::Type::DECIMAL64 || + field->type()->id() == ::arrow::Type::DECIMAL128 || + field->type()->id() == ::arrow::Type::DECIMAL256); const int64_t length = reader->values_written(); @@ -771,9 +765,15 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, if constexpr (std::is_same_v) { ::arrow::Decimal32 decimal(value); decimal.ToBytes(out_ptr); - } else { + } else if constexpr (std::is_same_v) { ::arrow::Decimal64 decimal(value); decimal.ToBytes(out_ptr); + } else if constexpr (std::is_same_v) { + ::arrow::Decimal128 decimal(value); + decimal.ToBytes(out_ptr); + } else { + ::arrow::Decimal256 decimal(value); + decimal.ToBytes(out_ptr); } } @@ -933,6 +933,10 @@ Status TransferColumnData(RecordReader* reader, } break; case ::arrow::Type::DECIMAL64: { switch (descr->physical_type()) { + case ::parquet::Type::INT32: { + auto fn = DecimalIntegerTransfer; + RETURN_NOT_OK(fn(reader, pool, value_field, &result)); + } break; case ::parquet::Type::INT64: { auto fn = DecimalIntegerTransfer; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); @@ -947,12 +951,20 @@ Status TransferColumnData(RecordReader* reader, } break; default: return Status::Invalid( - "Physical type for decimal64 must be int64, byte array, or fixed length " - "binary"); + "Physical type for decimal64 must be int32, int64, byte array, or fixed " + "length binary"); } } break; case ::arrow::Type::DECIMAL128: { switch (descr->physical_type()) { + case ::parquet::Type::INT32: { + auto fn = DecimalIntegerTransfer; + RETURN_NOT_OK(fn(reader, pool, value_field, &result)); + } break; + case ::parquet::Type::INT64: { + auto fn = DecimalIntegerTransfer; + RETURN_NOT_OK(fn(reader, pool, value_field, &result)); + } break; case ::parquet::Type::BYTE_ARRAY: { auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); @@ -963,11 +975,20 @@ Status TransferColumnData(RecordReader* reader, } break; default: return Status::Invalid( - "Physical type for decimal128 must be byte array, or fixed length binary"); + "Physical type for decimal128 must be int32, int64, byte array, or fixed " + "length binary"); } } break; case ::arrow::Type::DECIMAL256: { switch (descr->physical_type()) { + case ::parquet::Type::INT32: { + auto fn = DecimalIntegerTransfer; + RETURN_NOT_OK(fn(reader, pool, value_field, &result)); + } break; + case ::parquet::Type::INT64: { + auto fn = DecimalIntegerTransfer; + RETURN_NOT_OK(fn(reader, pool, value_field, &result)); + } break; case ::parquet::Type::BYTE_ARRAY: { auto fn = &TransferDecimal; RETURN_NOT_OK(fn(reader, pool, value_field, &result)); @@ -978,7 +999,8 @@ Status TransferColumnData(RecordReader* reader, } break; default: return Status::Invalid( - "Physical type for decimal256 must be byte array, or fixed length binary"); + "Physical type for decimal256 must be int32, int64, byte array, or fixed " + "length binary"); } } break; case ::arrow::Type::TIMESTAMP: { diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index d372e2fb681c0..aa5182e036634 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -32,16 +32,17 @@ using ::arrow::Result; using ::arrow::Status; using ::arrow::internal::checked_cast; -Result> MakeArrowDecimal(const LogicalType& logical_type) { +Result> MakeArrowDecimal(const LogicalType& logical_type, + bool 
smallest_decimal_enabled) { const auto& decimal = checked_cast(logical_type); - if (decimal.precision() <= ::arrow::Decimal32Type::kMaxPrecision) { - return ::arrow::Decimal32Type::Make(decimal.precision(), decimal.scale()); - } else if (decimal.precision() <= ::arrow::Decimal64Type::kMaxPrecision) { - return ::arrow::Decimal64Type::Make(decimal.precision(), decimal.scale()); + + if (smallest_decimal_enabled) { + return ::arrow::smallest_decimal(decimal.precision(), decimal.scale()); } else if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) { return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale()); + } else { + return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale()); } - return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale()); } Result> MakeArrowInt(const LogicalType& logical_type) { @@ -114,19 +115,20 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray( - const LogicalType& logical_type, const ArrowReaderProperties& reader_properties) { +Result> FromByteArray(const LogicalType& logical_type, + bool arrow_extensions_enabled, + bool smallest_decimal_enabled) { switch (logical_type.type()) { case LogicalType::Type::STRING: return ::arrow::utf8(); case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); + return MakeArrowDecimal(logical_type, smallest_decimal_enabled); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::BSON: return ::arrow::binary(); case LogicalType::Type::JSON: - if (reader_properties.get_arrow_extensions_enabled()) { + if (arrow_extensions_enabled) { return ::arrow::extension::json(::arrow::utf8()); } // When the original Arrow schema isn't stored and Arrow extensions are disabled, @@ -139,10 +141,11 @@ Result> FromByteArray( } Result> FromFLBA(const LogicalType& logical_type, - int32_t physical_length) { + int32_t physical_length, + bool smallest_decimal_enabled) { switch (logical_type.type()) { case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); + return MakeArrowDecimal(logical_type, smallest_decimal_enabled); case LogicalType::Type::FLOAT16: return ::arrow::float16(); case LogicalType::Type::NONE: @@ -156,7 +159,8 @@ Result> FromFLBA(const LogicalType& logical_type, } } -::arrow::Result> FromInt32(const LogicalType& logical_type) { +::arrow::Result> FromInt32(const LogicalType& logical_type, + bool smallest_decimal_enabled) { switch (logical_type.type()) { case LogicalType::Type::INT: return MakeArrowInt(logical_type); @@ -165,7 +169,7 @@ ::arrow::Result> FromInt32(const LogicalType& logical case LogicalType::Type::TIME: return MakeArrowTime32(logical_type); case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); + return MakeArrowDecimal(logical_type, smallest_decimal_enabled); case LogicalType::Type::NONE: return ::arrow::int32(); default: @@ -174,12 +178,13 @@ ::arrow::Result> FromInt32(const LogicalType& logical } } -Result> FromInt64(const LogicalType& logical_type) { +Result> FromInt64(const LogicalType& logical_type, + bool smallest_decimal_enabled) { switch (logical_type.type()) { case LogicalType::Type::INT: return MakeArrowInt64(logical_type); case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type); + return MakeArrowDecimal(logical_type, smallest_decimal_enabled); case LogicalType::Type::TIMESTAMP: return MakeArrowTimestamp(logical_type); case LogicalType::Type::TIME: @@ -199,13 +204,14 @@ Result> GetArrowType( return ::arrow::null(); } + bool 
smallest_decimal_enabled = reader_properties.smallest_decimal_enabled(); switch (physical_type) { case ParquetType::BOOLEAN: return ::arrow::boolean(); case ParquetType::INT32: - return FromInt32(logical_type); + return FromInt32(logical_type, smallest_decimal_enabled); case ParquetType::INT64: - return FromInt64(logical_type); + return FromInt64(logical_type, smallest_decimal_enabled); case ParquetType::INT96: return ::arrow::timestamp(reader_properties.coerce_int96_timestamp_unit()); case ParquetType::FLOAT: @@ -213,9 +219,10 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type, reader_properties); + return FromByteArray(logical_type, reader_properties.get_arrow_extensions_enabled(), + smallest_decimal_enabled); case ParquetType::FIXED_LEN_BYTE_ARRAY: - return FromFLBA(logical_type, type_length); + return FromFLBA(logical_type, type_length, smallest_decimal_enabled); default: { // PARQUET-1565: This can occur if the file is corrupt return Status::IOError("Invalid physical column type: ", diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 58828f85ab8e3..d26d32c536e72 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -29,12 +29,16 @@ namespace parquet::arrow { using ::arrow::Result; -Result> FromByteArray(const LogicalType& logical_type, - bool use_known_arrow_extensions); -Result> FromFLBA(const LogicalType& logical_type, - int32_t physical_length); -Result> FromInt32(const LogicalType& logical_type); -Result> FromInt64(const LogicalType& logical_type); +Result> FromByteArray( + const LogicalType& logical_type, bool arrow_extensions_enabled = false, + bool smallest_decimal_enabled = false); +Result> FromFLBA( + const LogicalType& logical_type, int32_t physical_length, + bool smallest_decimal_enabled = false); +Result> FromInt32( + const LogicalType& logical_type, bool smallest_decimal_enabled = false); +Result> FromInt64( + const LogicalType& logical_type, bool smallest_decimal_enabled = false); Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index defac2624239c..1fc1e42ad717f 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2059,6 +2059,9 @@ Status TypedColumnWriterImpl::WriteArrowDense( WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type) WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type) WRITE_SERIALIZE_CASE(DECIMAL32, Decimal32Type, Int32Type) + WRITE_SERIALIZE_CASE(DECIMAL64, Decimal64Type, Int32Type) + WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int32Type) + WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int32Type) default: ARROW_UNSUPPORTED() } @@ -2230,6 +2233,8 @@ Status TypedColumnWriterImpl::WriteArrowDense( WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type) WRITE_ZERO_COPY_CASE(DURATION, DurationType, Int64Type) WRITE_SERIALIZE_CASE(DECIMAL64, Decimal64Type, Int64Type) + WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int64Type) + WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int64Type) default: ARROW_UNSUPPORTED(); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 19436b84a379b..357e317448e7c 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -914,7 +914,8 @@ class PARQUET_EXPORT ArrowReaderProperties { 
cache_options_(::arrow::io::CacheOptions::LazyDefaults()), coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), arrow_extensions_enabled_(false), - should_load_statistics_(false) {} + should_load_statistics_(false), + smallest_decimal_enabled_(false) {} /// \brief Set whether to use the IO thread pool to parse columns in parallel. /// @@ -1006,6 +1007,11 @@ class PARQUET_EXPORT ArrowReaderProperties { /// Return whether loading statistics as much as possible. bool should_load_statistics() const { return should_load_statistics_; } + void set_smallest_decimal_enabled(bool smallest_decimal_enabled) { + smallest_decimal_enabled_ = smallest_decimal_enabled; + } + bool smallest_decimal_enabled() const { return smallest_decimal_enabled_; } + private: bool use_threads_; std::unordered_set read_dict_indices_; @@ -1016,6 +1022,7 @@ class PARQUET_EXPORT ArrowReaderProperties { ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; bool arrow_extensions_enabled_; bool should_load_statistics_; + bool smallest_decimal_enabled_; }; /// EXPERIMENTAL: Constructs the default ArrowReaderProperties diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 1c5add3388a1b..6496aa99092b8 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -364,9 +364,9 @@ def test_byte_stream_split(): def test_store_decimal_as_integer(tempdir): arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))), - type=pa.decimal32(5, 2)) + type=pa.decimal128(5, 2)) arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))), - type=pa.decimal64(16, 9)) + type=pa.decimal128(16, 9)) arr_decimal_gt18 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(22, 2)) arr_bool = pa.array([True, False] * 50) From 44f1adcbc198c4fc92294ea91598fc1fe8f893e4 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 15 Feb 2025 18:38:28 -0500 Subject: [PATCH 07/20] Add more tests in arrow_schema_test --- cpp/src/parquet/arrow/arrow_schema_test.cc | 119 +++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 535efa0c8e5de..16409d30669a4 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -268,6 +268,43 @@ TEST_F(TestConvertParquetSchema, ParquetAnnotatedFields) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); } +TEST_F(TestConvertParquetSchema, ParquetAnnotatedFieldsSmallestDecimal) { + struct FieldConstructionArguments { + std::string name; + std::shared_ptr logical_type; + parquet::Type::type physical_type; + int physical_length; + std::shared_ptr<::arrow::DataType> datatype; + }; + + std::vector cases = { + {"decimal(8, 2)", LogicalType::Decimal(8, 2), ParquetType::INT32, -1, + ::arrow::decimal32(8, 2)}, + {"decimal(16, 4)", LogicalType::Decimal(16, 4), ParquetType::INT64, -1, + ::arrow::decimal64(16, 4)}, + {"decimal(32, 8)", LogicalType::Decimal(32, 8), ParquetType::FIXED_LEN_BYTE_ARRAY, + 16, ::arrow::decimal128(32, 8)}, + {"decimal(73, 38)", LogicalType::Decimal(73, 38), ParquetType::FIXED_LEN_BYTE_ARRAY, + 31, ::arrow::decimal256(73, 38)}, + }; + + std::vector parquet_fields; + std::vector> arrow_fields; + + for (const FieldConstructionArguments& c : cases) { + parquet_fields.push_back(PrimitiveNode::Make(c.name, Repetition::OPTIONAL, + c.logical_type, c.physical_type, + c.physical_length)); + 
arrow_fields.push_back(::arrow::field(c.name, c.datatype)); + } + + auto reader_props = ArrowReaderProperties(); + reader_props.set_smallest_decimal_enabled(true); + ASSERT_OK(ConvertSchema(parquet_fields, nullptr, reader_props)); + auto arrow_schema = ::arrow::schema(arrow_fields); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); +} + TEST_F(TestConvertParquetSchema, DuplicateFieldNames) { std::vector parquet_fields; std::vector> arrow_fields; @@ -354,6 +391,42 @@ TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) { ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); } +TEST_F(TestConvertParquetSchema, ParquetSmallestDecimals) { + std::vector parquet_fields; + std::vector> arrow_fields; + + parquet_fields.push_back(PrimitiveNode::Make("flba-decimal", Repetition::OPTIONAL, + ParquetType::FIXED_LEN_BYTE_ARRAY, + ConvertedType::DECIMAL, 4, 8, 4)); + arrow_fields.push_back( + ::arrow::field("flba-decimal", std::make_shared<::arrow::Decimal32Type>(8, 4))); + + parquet_fields.push_back(PrimitiveNode::Make("binary-decimal", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, + ConvertedType::DECIMAL, -1, 18, 4)); + arrow_fields.push_back( + ::arrow::field("binary-decimal", std::make_shared<::arrow::Decimal64Type>(18, 4))); + + parquet_fields.push_back(PrimitiveNode::Make("int32-decimal", Repetition::OPTIONAL, + ParquetType::INT32, ConvertedType::DECIMAL, + -1, 38, 4)); + arrow_fields.push_back( + ::arrow::field("int32-decimal", std::make_shared<::arrow::Decimal128Type>(38, 4))); + + parquet_fields.push_back(PrimitiveNode::Make("int64-decimal", Repetition::OPTIONAL, + ParquetType::INT64, ConvertedType::DECIMAL, + -1, 48, 4)); + arrow_fields.push_back( + ::arrow::field("int64-decimal", std::make_shared<::arrow::Decimal256Type>(48, 4))); + + auto arrow_schema = ::arrow::schema(arrow_fields); + auto reader_props = ArrowReaderProperties(); + reader_props.set_smallest_decimal_enabled(true); + ASSERT_OK(ConvertSchema(parquet_fields, nullptr, reader_props)); + + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); +} + TEST_F(TestConvertParquetSchema, ParquetMaps) { std::vector parquet_fields; std::vector> arrow_fields; @@ -1134,6 +1207,52 @@ TEST_F(TestConvertArrowSchema, ArrowFields) { // ASSERT_NO_FATAL_FAILURE(); } +TEST_F(TestConvertArrowSchema, ArrowFieldsStoreSchema) { + struct FieldConstructionArguments { + std::string name; + std::shared_ptr<::arrow::DataType> datatype; + std::shared_ptr logical_type; + parquet::Type::type physical_type; + int physical_length; + }; + + std::vector cases = { + {"decimal(1, 0)", ::arrow::decimal128(1, 0), LogicalType::Decimal(1, 0), + ParquetType::FIXED_LEN_BYTE_ARRAY, 1}, + {"decimal(8, 2)", ::arrow::decimal128(8, 2), LogicalType::Decimal(8, 2), + ParquetType::FIXED_LEN_BYTE_ARRAY, 4}, + {"decimal(16, 4)", ::arrow::decimal128(16, 4), LogicalType::Decimal(16, 4), + ParquetType::FIXED_LEN_BYTE_ARRAY, 7}, + {"decimal(32, 8)", ::arrow::decimal128(32, 8), LogicalType::Decimal(32, 8), + ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, + {"decimal(1, 0)", ::arrow::decimal32(1, 0), LogicalType::Decimal(1, 0), + ParquetType::FIXED_LEN_BYTE_ARRAY, 1}, + {"decimal(8, 2)", ::arrow::decimal32(8, 2), LogicalType::Decimal(8, 2), + ParquetType::FIXED_LEN_BYTE_ARRAY, 4}, + {"decimal(16, 4)", ::arrow::decimal64(16, 4), LogicalType::Decimal(16, 4), + ParquetType::FIXED_LEN_BYTE_ARRAY, 7}, + {"decimal(32, 8)", ::arrow::decimal128(32, 8), LogicalType::Decimal(32, 8), + ParquetType::FIXED_LEN_BYTE_ARRAY, 14}, + {"decimal(73, 38)", ::arrow::decimal256(73, 38), 
LogicalType::Decimal(73, 38), + ParquetType::FIXED_LEN_BYTE_ARRAY, 31}}; + + std::vector> arrow_fields; + std::vector parquet_fields; + + for (const FieldConstructionArguments& c : cases) { + arrow_fields.push_back(::arrow::field(c.name, c.datatype, false)); + parquet_fields.push_back(PrimitiveNode::Make(c.name, Repetition::REQUIRED, + c.logical_type, c.physical_type, + c.physical_length)); + } + + auto writer_props = ::parquet::default_arrow_writer_properties(); + writer_props->store_schema(); + ASSERT_OK(ConvertSchema(arrow_fields, writer_props)); + CheckFlatSchema(parquet_fields); + // ASSERT_NO_FATAL_FAILURE(); +} + TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) { struct FieldConstructionArguments { std::string name; From c017323a213d76b862af0261a04c2920db81807e Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 15 Feb 2025 19:43:27 -0500 Subject: [PATCH 08/20] Add more tests in arrow_reader_writer_test --- .../parquet/arrow/arrow_reader_writer_test.cc | 97 +++++++++++++++++-- 1 file changed, 90 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 27cb849365ca7..195352ef12cf5 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -181,6 +181,14 @@ std::shared_ptr get_logical_type(const DataType& type) { static_cast(type); return get_logical_type(*dict_type.value_type()); } + case ArrowId::DECIMAL32: { + const auto& dec_type = static_cast(type); + return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); + } + case ArrowId::DECIMAL64: { + const auto& dec_type = static_cast(type); + return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); + } case ArrowId::DECIMAL128: { const auto& dec_type = static_cast(type); return LogicalType::Decimal(dec_type.precision(), dec_type.scale()); @@ -206,9 +214,11 @@ ParquetType::type get_physical_type(const DataType& type) { case ArrowId::INT16: case ArrowId::UINT32: case ArrowId::INT32: + case ArrowId::DECIMAL32: return ParquetType::INT32; case ArrowId::UINT64: case ArrowId::INT64: + case ArrowId::DECIMAL64: return ParquetType::INT64; case ArrowId::FLOAT: return ParquetType::FLOAT; @@ -533,6 +543,8 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, case ::arrow::Type::HALF_FLOAT: byte_width = sizeof(::arrow::HalfFloatType::c_type); break; + case ::arrow::Type::DECIMAL32: + case ::arrow::Type::DECIMAL64: case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(values_type); @@ -548,6 +560,8 @@ static std::shared_ptr MakeSimpleSchema(const DataType& type, case ::arrow::Type::HALF_FLOAT: byte_width = sizeof(::arrow::HalfFloatType::c_type); break; + case ::arrow::Type::DECIMAL32: + case ::arrow::Type::DECIMAL64: case ::arrow::Type::DECIMAL128: case ::arrow::Type::DECIMAL256: { const auto& decimal_type = static_cast(type); @@ -646,11 +660,13 @@ class ParquetIOTestBase : public ::testing::Test { ASSERT_OK((*out)->ValidateFull()); } - void ReadAndCheckSingleColumnFile(const Array& values) { + void ReadAndCheckSingleColumnFile( + const Array& values, + const ArrowReaderProperties& properties = default_arrow_reader_properties()) { std::shared_ptr out; std::unique_ptr reader; - ReaderFromSink(&reader); + ReaderFromSink(&reader, properties); ReadSingleColumnFile(std::move(reader), &out); AssertArraysEqual(values, *out); @@ -753,9 +769,10 @@ class 
ParquetIOTestBase : public ::testing::Test { class TestReadDecimals : public ParquetIOTestBase { public: - void CheckReadFromByteArrays(const std::shared_ptr& logical_type, - const std::vector>& values, - const Array& expected) { + void CheckReadFromByteArrays( + const std::shared_ptr& logical_type, + const std::vector>& values, const Array& expected, + const ArrowReaderProperties& properties = default_arrow_reader_properties()) { std::vector byte_arrays(values.size()); std::transform(values.begin(), values.end(), byte_arrays.begin(), [](const std::vector& bytes) { @@ -776,13 +793,12 @@ class TestReadDecimals : public ParquetIOTestBase { column_writer->Close(); file_writer->Close(); - ReadAndCheckSingleColumnFile(expected); + ReadAndCheckSingleColumnFile(expected, properties); } }; // The Decimal roundtrip tests always go through the FixedLenByteArray path, // check the ByteArray case manually. - TEST_F(TestReadDecimals, Decimal128ByteArray) { const std::vector> big_endian_decimals = { // 123456 @@ -814,6 +830,73 @@ TEST_F(TestReadDecimals, Decimal256ByteArray) { CheckReadFromByteArrays(LogicalType::Decimal(40, 3), big_endian_decimals, *expected); } +TEST_F(TestReadDecimals, DecimalByteArraySmallestDecimal) { + const std::vector>> big_endian_decimals_vector = { + { + // 123456 + {1, 226, 64}, + // 987654 + {15, 18, 6}, + // -123456 + {255, 254, 29, 192}, + }, + { + // 123456 + {1, 226, 64}, + // 987654 + {15, 18, 6}, + // -123456 + {255, 254, 29, 192}, + // -123456 + {255, 255, 255, 255, 255, 254, 29, 192}, + }, + { + // 123456 + {1, 226, 64}, + // 987654 + {15, 18, 6}, + // -123456 + {255, 254, 29, 192}, + // -123456 + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 254, 29, 192}, + }, + { + // 123456 + {1, 226, 64}, + // 987654 + {15, 18, 6}, + // -123456 + {255, 254, 29, 192}, + // -123456 + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 254, 29, 192}, + }}; + + const std::vector, std::shared_ptr>> + expected_vector = { + {ArrayFromJSON(::arrow::decimal32(6, 3), + R"(["123.456", "987.654", "-123.456"])"), + LogicalType::Decimal(6, 3)}, + {ArrayFromJSON(::arrow::decimal64(16, 3), + R"(["123.456", "987.654", "-123.456", "-123.456"])"), + LogicalType::Decimal(16, 3)}, + {ArrayFromJSON(::arrow::decimal128(20, 3), + R"(["123.456", "987.654", "-123.456", "-123.456"])"), + LogicalType::Decimal(20, 3)}, + {ArrayFromJSON(::arrow::decimal256(40, 3), + R"(["123.456", "987.654", "-123.456", "-123.456"])"), + LogicalType::Decimal(40, 3)}}; + + ArrowReaderProperties reader_props = default_arrow_reader_properties(); + reader_props.set_smallest_decimal_enabled(true); + for (size_t i = 0; i < expected_vector.size(); ++i) { + CheckReadFromByteArrays(expected_vector.at(i).second, + big_endian_decimals_vector.at(i), + *expected_vector.at(i).first, reader_props); + } +} + template class TestParquetIO : public ParquetIOTestBase { public: From 63d307b8abf228d1c6278ce2b68fd2d1144298ed Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 15 Feb 2025 21:47:41 -0500 Subject: [PATCH 09/20] Add more typed tests for small decimals --- .../parquet/arrow/arrow_reader_writer_test.cc | 185 ++++++++++++------ cpp/src/parquet/arrow/reader.cc | 9 +- cpp/src/parquet/arrow/reader.h | 6 +- cpp/src/parquet/arrow/test_util.h | 54 +++-- cpp/src/parquet/arrow/writer.cc | 23 ++- cpp/src/parquet/arrow/writer.h | 19 +- 6 files changed, 206 insertions(+), 90 
 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 195352ef12cf5..875f8bf75ff34 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "parquet/properties.h"
 #ifdef _MSC_VER
 #  pragma warning(push)
 // Disable forcing value to bool warnings
@@ -395,13 +396,16 @@ using ParquetWriter = TypedColumnWriter>;
 void WriteTableToBuffer(const std::shared_ptr<Table>& table, int64_t row_group_size,
                         const std::shared_ptr<ArrowWriterProperties>& arrow_properties,
-                        std::shared_ptr<Buffer>* out) {
+                        std::shared_ptr<Buffer>* out,
+                        const ArrowReaderProperties& schema_arrow_reader_properties =
+                            default_arrow_reader_properties()) {
   auto sink = CreateOutputStream();
   auto write_props = WriterProperties::Builder().write_batch_size(100)->build();
   ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink,
-                                row_group_size, write_props, arrow_properties));
+                                row_group_size, write_props, arrow_properties,
+                                schema_arrow_reader_properties));
   ASSERT_OK_AND_ASSIGN(*out, sink->Finish());
 }
 
@@ -449,17 +453,19 @@ void CheckConfiguredRoundtrip(
   }
 }
 
-void DoSimpleRoundtrip(const std::shared_ptr<Table>& table, bool use_threads,
-                       int64_t row_group_size, const std::vector<int>& column_subset,
-                       std::shared_ptr<Table>* out,
-                       const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
-                           default_arrow_writer_properties()) {
+void DoSimpleRoundtrip(
+    const std::shared_ptr<Table>& table, bool use_threads, int64_t row_group_size,
+    const std::vector<int>& column_subset, std::shared_ptr<Table>* out,
+    const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
+        default_arrow_writer_properties(),
+    const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
-  ASSERT_NO_FATAL_FAILURE(
-      WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
+  ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size, arrow_properties,
+                                             &buffer, reader_properties));
 
-  ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared<BufferReader>(buffer),
-                                             ::arrow::default_memory_pool()));
+  ASSERT_OK_AND_ASSIGN(auto reader,
+                       OpenFile(std::make_shared<BufferReader>(buffer),
+                                ::arrow::default_memory_pool(), reader_properties));
 
   reader->set_use_threads(use_threads);
 
   if (column_subset.size() > 0) {
@@ -474,18 +480,18 @@ void DoRoundTripWithBatches(
     const std::shared_ptr<Table>& table, bool use_threads, int64_t row_group_size,
     const std::vector<int>& column_subset, std::shared_ptr<Table>* out,
     const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-        default_arrow_writer_properties()) {
+        default_arrow_writer_properties(),
+    ArrowReaderProperties reader_properties = default_arrow_reader_properties()) {
   std::shared_ptr<Buffer> buffer;
-  ASSERT_NO_FATAL_FAILURE(
-      WriteTableToBuffer(table, row_group_size, arrow_writer_properties, &buffer));
+  reader_properties.set_batch_size(row_group_size - 1);
+  ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(
+      table, row_group_size, arrow_writer_properties, &buffer, reader_properties));
 
   std::unique_ptr<FileReader> reader;
   FileReaderBuilder builder;
   ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
-  ArrowReaderProperties arrow_reader_properties;
-  arrow_reader_properties.set_batch_size(row_group_size - 1);
   ASSERT_OK_NO_THROW(builder.memory_pool(::arrow::default_memory_pool())
-                         ->properties(arrow_reader_properties)
+                         ->properties(reader_properties)
                          ->Build(&reader));
   std::unique_ptr<::arrow::RecordBatchReader> batch_reader;
   if (column_subset.size() > 0) {
@@ -506,20 +512,21 @@ void CheckSimpleRoundtrip(
     const std::shared_ptr<Table>& table, int64_t row_group_size,
     const std::shared_ptr<ArrowWriterProperties>& arrow_writer_properties =
-        default_arrow_writer_properties()) {
+        default_arrow_writer_properties(),
+    const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()) {
   std::shared_ptr<Table>
result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result, - arrow_writer_properties)); + arrow_writer_properties, reader_properties)); ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), /*check_metadata=*/false); ASSERT_OK(result->ValidateFull()); ::arrow::AssertTablesEqual(*table, *result, false); - ASSERT_NO_FATAL_FAILURE(DoRoundTripWithBatches(table, false /* use_threads */, - row_group_size, {}, &result, - arrow_writer_properties)); + ASSERT_NO_FATAL_FAILURE( + DoRoundTripWithBatches(table, false /* use_threads */, row_group_size, {}, &result, + arrow_writer_properties, reader_properties)); ::arrow::AssertSchemaEqual(*table->schema(), *result->schema(), /*check_metadata=*/false); ASSERT_OK(result->ValidateFull()); @@ -634,6 +641,30 @@ class ParquetIOTestBase : public ::testing::Test { return ParquetFileWriter::Open(sink_, schema); } + template + ::arrow::enable_if_t< + !std::is_base_of::value, + ArrowReaderProperties> + ReaderPropertiesFromArrowType() { + return default_arrow_reader_properties(); + } + + template + ::arrow::enable_if_t< + std::is_base_of::value, + ArrowReaderProperties> + ReaderPropertiesFromArrowType() { + auto properties = default_arrow_reader_properties(); + properties.set_smallest_decimal_enabled(smallest_decimal_enabled); + return properties; + } + + template + void ReaderFromSinkTemplate(std::unique_ptr* out) { + ReaderFromSink(out, this->template ReaderPropertiesFromArrowType()); + } + void ReaderFromSink( std::unique_ptr* out, const ArrowReaderProperties& properties = default_arrow_reader_properties()) { @@ -660,6 +691,12 @@ class ParquetIOTestBase : public ::testing::Test { ASSERT_OK((*out)->ValidateFull()); } + template + void ReadAndCheckSingleColumnFileTemplate(const Array& values) { + ReadAndCheckSingleColumnFile( + values, this->template ReaderPropertiesFromArrowType()); + } + void ReadAndCheckSingleColumnFile( const Array& values, const ArrowReaderProperties& properties = default_arrow_reader_properties()) { @@ -723,10 +760,11 @@ class ParquetIOTestBase : public ::testing::Test { *out = MakeSimpleTable(lists, true /* nullable_lists */); } + template void ReadAndCheckSingleColumnTable(const std::shared_ptr& values) { std::shared_ptr<::arrow::Table> out; std::unique_ptr reader; - ReaderFromSink(&reader); + ReaderFromSinkTemplate(&reader); ReadTableFromFile(std::move(reader), &out); ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(values->length(), out->num_rows()); @@ -738,8 +776,15 @@ class ParquetIOTestBase : public ::testing::Test { AssertArraysEqual(*values, *result); } - void CheckRoundTrip(const std::shared_ptr
& table) { - CheckSimpleRoundtrip(table, table->num_rows()); + template + void CheckRoundTripTemplate(const std::shared_ptr
& table) { + CheckRoundTrip(table, this->template ReaderPropertiesFromArrowType()); + } + + void CheckRoundTrip(const std::shared_ptr
& table, + const ArrowReaderProperties& reader_properties) { + CheckSimpleRoundtrip(table, table->num_rows(), default_arrow_writer_properties(), + reader_properties); } template @@ -941,6 +986,9 @@ typedef ::testing::Types< ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::Date32Type, ::arrow::FloatType, ::arrow::DoubleType, ::arrow::StringType, ::arrow::BinaryType, ::arrow::FixedSizeBinaryType, ::arrow::HalfFloatType, + Decimal32WithPrecisionAndScale<1, true>, Decimal32WithPrecisionAndScale<5, true>, + Decimal64WithPrecisionAndScale<10, true>, Decimal64WithPrecisionAndScale<18, true>, + Decimal128WithPrecisionAndScale<19, true>, Decimal256WithPrecisionAndScale<39, true>, Decimal128WithPrecisionAndScale<1>, Decimal128WithPrecisionAndScale<5>, Decimal128WithPrecisionAndScale<10>, Decimal128WithPrecisionAndScale<19>, Decimal128WithPrecisionAndScale<23>, Decimal128WithPrecisionAndScale<27>, @@ -958,7 +1006,8 @@ TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) { MakeSimpleSchema(*values->type(), Repetition::REQUIRED); ASSERT_NO_FATAL_FAILURE(this->WriteColumn(schema, values)); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*values)); } TYPED_TEST(TestParquetIO, ZeroChunksTable) { @@ -991,7 +1040,7 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { std::shared_ptr
out; std::unique_ptr reader; - ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader)); + ASSERT_NO_FATAL_FAILURE(this->template ReaderFromSinkTemplate(&reader)); ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out)); ASSERT_EQ(1, out->num_columns()); EXPECT_EQ(table->num_rows(), out->num_rows()); @@ -1012,7 +1061,8 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { MakeSimpleSchema(*values->type(), Repetition::OPTIONAL); ASSERT_NO_FATAL_FAILURE(this->WriteColumn(schema, values)); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*values)); } TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) { @@ -1037,7 +1087,8 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalDictionaryWrite) { MakeSimpleSchema(*dict_values->type(), Repetition::OPTIONAL); ASSERT_NO_FATAL_FAILURE(this->WriteColumn(schema, dict_values)); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*values)); } TYPED_TEST(TestParquetIO, SingleColumnRequiredSliceWrite) { @@ -1048,12 +1099,14 @@ TYPED_TEST(TestParquetIO, SingleColumnRequiredSliceWrite) { std::shared_ptr sliced_values = values->Slice(SMALL_SIZE / 2, SMALL_SIZE); ASSERT_NO_FATAL_FAILURE(this->WriteColumn(schema, sliced_values)); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*sliced_values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*sliced_values)); // Slice offset 1 higher sliced_values = values->Slice(SMALL_SIZE / 2 + 1, SMALL_SIZE); ASSERT_NO_FATAL_FAILURE(this->WriteColumn(schema, sliced_values)); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*sliced_values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*sliced_values)); } TYPED_TEST(TestParquetIO, SingleColumnOptionalSliceWrite) { @@ -1064,12 +1117,14 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalSliceWrite) { std::shared_ptr sliced_values = values->Slice(SMALL_SIZE / 2, SMALL_SIZE); ASSERT_NO_FATAL_FAILURE(this->WriteColumn(schema, sliced_values)); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*sliced_values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*sliced_values)); // Slice offset 1 higher, thus different null bitmap. sliced_values = values->Slice(SMALL_SIZE / 2 + 1, SMALL_SIZE); ASSERT_NO_FATAL_FAILURE(this->WriteColumn(schema, sliced_values)); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*sliced_values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*sliced_values)); } TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { @@ -1078,44 +1133,44 @@ TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { ASSERT_OK(NullableArray(SMALL_SIZE, 10, kDefaultSeed, &values)); std::shared_ptr
<Table> table = MakeSimpleTable(values, true);
-  ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+  ASSERT_NO_FATAL_FAILURE(this->template CheckRoundTripTemplate<TypeParam>(table));
 }
 
 TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) {
   std::shared_ptr<Table> table;
   ASSERT_NO_FATAL_FAILURE(this->PrepareEmptyListsTable(SMALL_SIZE, &table));
-  ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+  ASSERT_NO_FATAL_FAILURE(this->template CheckRoundTripTemplate<TypeParam>(table));
 }
 
 TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) {
   std::shared_ptr<Table> table;
   this->PrepareListTable(SMALL_SIZE, true, true, 10, &table);
-  this->CheckRoundTrip(table);
+  this->template CheckRoundTripTemplate<TypeParam>(table);
 }
 
 TYPED_TEST(TestParquetIO, SingleRequiredListNullableColumnReadWrite) {
   std::shared_ptr<Table> table;
   ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, false, true, 10, &table));
-  ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+  ASSERT_NO_FATAL_FAILURE(this->template CheckRoundTripTemplate<TypeParam>(table));
 }
 
 TYPED_TEST(TestParquetIO, SingleNullableListRequiredColumnReadWrite) {
   std::shared_ptr<Table> table;
   ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, false, 10, &table));
-  ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+  ASSERT_NO_FATAL_FAILURE(this->template CheckRoundTripTemplate<TypeParam>(table));
 }
 
 TYPED_TEST(TestParquetIO, SingleRequiredListRequiredColumnReadWrite) {
   std::shared_ptr<Table> table;
   ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, false, false, 0, &table));
-  ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
+  ASSERT_NO_FATAL_FAILURE(this->template CheckRoundTripTemplate<TypeParam>(table));
 }
 
 TYPED_TEST(TestParquetIO, SingleNullableListRequiredListRequiredColumnReadWrite) {
   std::shared_ptr<Table>
table; ASSERT_NO_FATAL_FAILURE( this->PrepareListOfListTable(SMALL_SIZE, true, false, false, 0, &table)); - ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); + ASSERT_NO_FATAL_FAILURE(this->template CheckRoundTripTemplate(table)); } TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) { @@ -1142,7 +1197,8 @@ TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) { } ASSERT_OK_NO_THROW(writer->Close()); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*values)); } TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) { @@ -1154,7 +1210,8 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) { ASSERT_OK_NO_THROW(WriteTable(*table, default_memory_pool(), this->sink_, 512, default_writer_properties())); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnTable(values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnTable(values)); } TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) { @@ -1164,12 +1221,14 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) { this->ResetSink(); auto buffer = AllocateBuffer(); + auto reader_properties = this->template ReaderPropertiesFromArrowType(); { // BufferOutputStream closed on gc auto arrow_sink_ = std::make_shared<::arrow::io::BufferOutputStream>(buffer); ASSERT_OK_NO_THROW(WriteTable(*table, default_memory_pool(), arrow_sink_, 512, - default_writer_properties())); + default_writer_properties(), + default_arrow_writer_properties(), reader_properties)); // XXX: Remove this after ARROW-455 completed ASSERT_OK(arrow_sink_->Close()); @@ -1179,7 +1238,8 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) { auto source = std::make_shared(pbuffer); std::shared_ptr<::arrow::Table> out; - ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(source, ::arrow::default_memory_pool())); + ASSERT_OK_AND_ASSIGN( + auto reader, OpenFile(source, ::arrow::default_memory_pool(), reader_properties)); ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out)); ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(values->length(), out->num_rows()); @@ -1215,7 +1275,8 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { } ASSERT_OK_NO_THROW(writer->Close()); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnFile(*values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnFileTemplate(*values)); } TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) { @@ -1228,7 +1289,8 @@ TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) { ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_, 512, default_writer_properties())); - ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnTable(values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnTable(values)); } TYPED_TEST(TestParquetIO, FileMetaDataWrite) { @@ -1269,7 +1331,7 @@ TYPED_TEST(TestParquetIO, CheckIterativeColumnRead) { values->length(), default_writer_properties())); std::unique_ptr reader; - this->ReaderFromSink(&reader); + this->template ReaderFromSinkTemplate(&reader); std::unique_ptr column_reader; ASSERT_OK_NO_THROW(reader->GetColumn(0, &column_reader)); ASSERT_NE(nullptr, column_reader.get()); @@ -1362,7 +1424,8 @@ TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compatibility) { ->build(); ASSERT_OK_NO_THROW( WriteTable(*table, default_memory_pool(), this->sink_, 512, properties)); - 
ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleColumnTable(values)); + ASSERT_NO_FATAL_FAILURE( + this->template ReadAndCheckSingleColumnTable<::arrow::UInt32Type>(values)); } using TestDurationParquetIO = TestParquetIO<::arrow::DurationType>; @@ -5332,10 +5395,11 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); std::unique_ptr writer; + auto reader_properties = this->template ReaderPropertiesFromArrowType(); ASSERT_OK_NO_THROW(FileWriter::Make( ::arrow::default_memory_pool(), ParquetFileWriter::Open(this->sink_, schema_node, writer_properties), - arrow_schema, default_arrow_writer_properties(), &writer)); + arrow_schema, default_arrow_writer_properties(), &writer, reader_properties)); ASSERT_OK_NO_THROW(writer->NewRowGroup()); ASSERT_OK_NO_THROW(writer->WriteColumnChunk(*values)); ASSERT_OK_NO_THROW(writer->Close()); @@ -5344,13 +5408,17 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { void ReadAndCheckSingleDecimalColumnFile(const Array& values) { std::shared_ptr out; std::unique_ptr reader; - this->ReaderFromSink(&reader); + auto reader_properties = this->template ReaderPropertiesFromArrowType(); + this->ReaderFromSink(&reader, reader_properties); this->ReadSingleColumnFile(std::move(reader), &out); - // Reader always read values as DECIMAL128 type - ASSERT_EQ(out->type()->id(), ::arrow::Type::DECIMAL128); + auto expected_type_id = reader_properties.smallest_decimal_enabled() + ? TestType::type_id + // Reader always read values as DECIMAL128 type + : ::arrow::Type::DECIMAL128; + ASSERT_EQ(out->type()->id(), expected_type_id); - if (values.type()->id() == ::arrow::Type::DECIMAL128) { + if (values.type()->id() == expected_type_id) { AssertArraysEqual(values, *out); } else { auto& expected_values = dynamic_cast(values); @@ -5370,6 +5438,9 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { }; typedef ::testing::Types< + Decimal32WithPrecisionAndScale<1, true>, Decimal32WithPrecisionAndScale<5, true>, + Decimal64WithPrecisionAndScale<10, true>, Decimal64WithPrecisionAndScale<18, true>, + Decimal128WithPrecisionAndScale<19, true>, Decimal256WithPrecisionAndScale<39, true>, Decimal128WithPrecisionAndScale<1>, Decimal128WithPrecisionAndScale<5>, Decimal128WithPrecisionAndScale<10>, Decimal128WithPrecisionAndScale<18>, Decimal256WithPrecisionAndScale<1>, Decimal256WithPrecisionAndScale<5>, @@ -5402,7 +5473,7 @@ class TestBufferedParquetIO : public TestParquetIO { SchemaDescriptor descriptor; ASSERT_NO_THROW(descriptor.Init(schema)); std::shared_ptr<::arrow::Schema> arrow_schema; - ArrowReaderProperties props; + auto props = this->template ReaderPropertiesFromArrowType(); ASSERT_OK_NO_THROW(FromParquetSchema(&descriptor, props, &arrow_schema)); std::unique_ptr writer; @@ -5427,7 +5498,8 @@ class TestBufferedParquetIO : public TestParquetIO { std::shared_ptr out; std::unique_ptr reader; - this->ReaderFromSink(&reader); + auto props = this->template ReaderPropertiesFromArrowType(); + this->ReaderFromSink(&reader, props); ASSERT_EQ(num_row_groups, reader->num_row_groups()); this->ReadSingleColumnFile(std::move(reader), &out); @@ -5438,7 +5510,8 @@ class TestBufferedParquetIO : public TestParquetIO { int num_row_groups) { std::shared_ptr<::arrow::Table> out; std::unique_ptr reader; - this->ReaderFromSink(&reader); + auto props = this->template ReaderPropertiesFromArrowType(); + this->ReaderFromSink(&reader, props); ASSERT_EQ(num_row_groups, 
reader->num_row_groups()); this->ReadTableFromFile(std::move(reader), &out); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 03b725beb2a01..ca29bf0b8288d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -362,6 +362,10 @@ class FileReaderImpl : public FileReader { reader_properties_.set_batch_size(batch_size); } + void set_smallest_decimal_enabled(bool smallest_decimal_enabled) override { + reader_properties_.set_smallest_decimal_enabled(smallest_decimal_enabled); + } + const ArrowReaderProperties& properties() const override { return reader_properties_; } const SchemaManifest& manifest() const override { return manifest_; } @@ -1401,10 +1405,11 @@ Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* } Result> OpenFile( - std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool) { + std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, + const ArrowReaderProperties& reader_properties) { FileReaderBuilder builder; RETURN_NOT_OK(builder.Open(std::move(file))); - return builder.memory_pool(pool)->Build(); + return builder.memory_pool(pool)->properties(reader_properties)->Build(); } namespace internal { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 476a940bf1696..d35ccc2099cfe 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -303,6 +303,9 @@ class PARQUET_EXPORT FileReader { /// Set number of records to read per batch for the RecordBatchReader. virtual void set_batch_size(int64_t batch_size) = 0; + /// Set whether to enable smallest decimal arrow type + virtual void set_smallest_decimal_enabled(bool smallest_decimal_enabled) = 0; + virtual const ArrowReaderProperties& properties() const = 0; virtual const SchemaManifest& manifest() const = 0; @@ -403,7 +406,8 @@ ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, /// Advanced settings are supported through the FileReaderBuilder class. 
PARQUET_EXPORT ::arrow::Result> OpenFile( - std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator); + std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* pool, + const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()); /// @} diff --git a/cpp/src/parquet/arrow/test_util.h b/cpp/src/parquet/arrow/test_util.h index 8659c6f3f8bdc..d466555879617 100644 --- a/cpp/src/parquet/arrow/test_util.h +++ b/cpp/src/parquet/arrow/test_util.h @@ -47,8 +47,10 @@ using ::arrow::Array; using ::arrow::ChunkedArray; using ::arrow::Status; -template -struct Decimal32WithPrecisionAndScale { +struct BaseDecimalWithPrecisionAndScale {}; + +template +struct Decimal32WithPrecisionAndScale : BaseDecimalWithPrecisionAndScale { static_assert(PRECISION >= ::arrow::Decimal32Type::kMinPrecision && PRECISION <= ::arrow::Decimal32Type::kMaxPrecision, "Invalid precision value"); @@ -57,10 +59,11 @@ struct Decimal32WithPrecisionAndScale { static constexpr ::arrow::Type::type type_id = ::arrow::Decimal32Type::type_id; static constexpr int32_t precision = PRECISION; static constexpr int32_t scale = PRECISION - 1; + static constexpr bool smallest_decimal_enabled = SMALLEST_DECIMAL_ENABLED; }; -template -struct Decimal64WithPrecisionAndScale { +template +struct Decimal64WithPrecisionAndScale : BaseDecimalWithPrecisionAndScale { static_assert(PRECISION >= ::arrow::Decimal64Type::kMinPrecision && PRECISION <= ::arrow::Decimal64Type::kMaxPrecision, "Invalid precision value"); @@ -69,10 +72,11 @@ struct Decimal64WithPrecisionAndScale { static constexpr ::arrow::Type::type type_id = ::arrow::Decimal64Type::type_id; static constexpr int32_t precision = PRECISION; static constexpr int32_t scale = PRECISION - 1; + static constexpr bool smallest_decimal_enabled = SMALLEST_DECIMAL_ENABLED; }; -template -struct Decimal128WithPrecisionAndScale { +template +struct Decimal128WithPrecisionAndScale : BaseDecimalWithPrecisionAndScale { static_assert(PRECISION >= ::arrow::Decimal128Type::kMinPrecision && PRECISION <= ::arrow::Decimal128Type::kMaxPrecision, "Invalid precision value"); @@ -81,10 +85,11 @@ struct Decimal128WithPrecisionAndScale { static constexpr ::arrow::Type::type type_id = ::arrow::Decimal128Type::type_id; static constexpr int32_t precision = PRECISION; static constexpr int32_t scale = PRECISION - 1; + static constexpr bool smallest_decimal_enabled = SMALLEST_DECIMAL_ENABLED; }; -template -struct Decimal256WithPrecisionAndScale { +template +struct Decimal256WithPrecisionAndScale : BaseDecimalWithPrecisionAndScale { static_assert(PRECISION >= ::arrow::Decimal256Type::kMinPrecision && PRECISION <= ::arrow::Decimal256Type::kMaxPrecision, "Invalid precision value"); @@ -93,6 +98,7 @@ struct Decimal256WithPrecisionAndScale { static constexpr ::arrow::Type::type type_id = ::arrow::Decimal256Type::type_id; static constexpr int32_t precision = PRECISION; static constexpr int32_t scale = PRECISION - 1; + static constexpr bool smallest_decimal_enabled = SMALLEST_DECIMAL_ENABLED; }; template @@ -184,7 +190,9 @@ static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t template ::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NonNullArray(size_t size, std::shared_ptr* out) { constexpr int32_t kDecimalPrecision = precision; constexpr int32_t kDecimalScale = Decimal32WithPrecisionAndScale::scale; @@ -206,7 +214,9 @@ NonNullArray(size_t size, std::shared_ptr* out) { template 
::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NonNullArray(size_t size, std::shared_ptr* out) { constexpr int32_t kDecimalPrecision = precision; constexpr int32_t kDecimalScale = Decimal64WithPrecisionAndScale::scale; @@ -228,7 +238,9 @@ NonNullArray(size_t size, std::shared_ptr* out) { template ::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NonNullArray(size_t size, std::shared_ptr* out) { constexpr int32_t kDecimalPrecision = precision; constexpr int32_t kDecimalScale = Decimal128WithPrecisionAndScale::scale; @@ -250,7 +262,9 @@ NonNullArray(size_t size, std::shared_ptr* out) { template ::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NonNullArray(size_t size, std::shared_ptr* out) { constexpr int32_t kDecimalPrecision = precision; constexpr int32_t kDecimalScale = Decimal256WithPrecisionAndScale::scale; @@ -417,7 +431,9 @@ ::arrow::enable_if_fixed_size_binary NullableArray( template ::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) { std::vector valid_bytes(size, '\1'); @@ -444,7 +460,9 @@ NullableArray(size_t size, size_t num_nulls, uint32_t seed, template ::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) { std::vector valid_bytes(size, '\1'); @@ -471,7 +489,9 @@ NullableArray(size_t size, size_t num_nulls, uint32_t seed, template ::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) { std::vector valid_bytes(size, '\1'); @@ -498,7 +518,9 @@ NullableArray(size_t size, size_t num_nulls, uint32_t seed, template ::arrow::enable_if_t< - std::is_same>::value, Status> + std::is_same>::value || + std::is_same>::value, + Status> NullableArray(size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) { std::vector valid_bytes(size, '\1'); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index c6d86648c1d63..23d352ace7924 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -300,9 +300,9 @@ class FileWriterImpl : public FileWriter { } } - Status Init() { + Status Init(const ArrowReaderProperties& schema_arrow_reader_properities) { return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr, - default_arrow_reader_properties(), &schema_manifest_); + schema_arrow_reader_properities, &schema_manifest_); } Status NewRowGroup() override { @@ -515,10 +515,11 @@ Status FileWriter::Make(::arrow::MemoryPool* pool, std::unique_ptr writer, std::shared_ptr<::arrow::Schema> schema, std::shared_ptr arrow_properties, - std::unique_ptr* out) { + std::unique_ptr* out, + const ArrowReaderProperties& schema_arrow_reader_properities) { std::unique_ptr impl(new FileWriterImpl( std::move(schema), pool, std::move(writer), std::move(arrow_properties))); - RETURN_NOT_OK(impl->Init()); + RETURN_NOT_OK(impl->Init(schema_arrow_reader_properities)); *out = std::move(impl); return Status::OK(); } @@ -554,7 +555,8 @@ Result> FileWriter::Open( const 
::arrow::Schema& schema, ::arrow::MemoryPool* pool, std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr properties, - std::shared_ptr arrow_properties) { + std::shared_ptr arrow_properties, + const ArrowReaderProperties& schema_arrow_reader_properities) { std::shared_ptr parquet_schema; RETURN_NOT_OK( ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema)); @@ -572,7 +574,8 @@ Result> FileWriter::Open( std::unique_ptr writer; auto schema_ptr = std::make_shared<::arrow::Schema>(schema); RETURN_NOT_OK(Make(pool, std::move(base_writer), std::move(schema_ptr), - std::move(arrow_properties), &writer)); + std::move(arrow_properties), &writer, + schema_arrow_reader_properities)); return writer; } @@ -592,11 +595,13 @@ Status WriteMetaDataFile(const FileMetaData& file_metadata, Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool, std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size, std::shared_ptr properties, - std::shared_ptr arrow_properties) { + std::shared_ptr arrow_properties, + const ArrowReaderProperties& schema_arrow_reader_properities) { std::unique_ptr writer; ARROW_ASSIGN_OR_RAISE( - writer, FileWriter::Open(*table.schema(), pool, std::move(sink), - std::move(properties), std::move(arrow_properties))); + writer, + FileWriter::Open(*table.schema(), pool, std::move(sink), std::move(properties), + std::move(arrow_properties), schema_arrow_reader_properities)); RETURN_NOT_OK(writer->WriteTable(table, chunk_size)); return writer->Close(); } diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index e36b8f252c750..cf83b6cf4193a 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -53,10 +53,13 @@ namespace arrow { /// file. class PARQUET_EXPORT FileWriter { public: - static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr writer, - std::shared_ptr<::arrow::Schema> schema, - std::shared_ptr arrow_properties, - std::unique_ptr* out); + static ::arrow::Status Make( + MemoryPool* pool, std::unique_ptr writer, + std::shared_ptr<::arrow::Schema> schema, + std::shared_ptr arrow_properties, + std::unique_ptr* out, + const ArrowReaderProperties& schema_arrow_reader_properities = + default_arrow_reader_properties()); /// \brief Try to create an Arrow to Parquet file writer. /// @@ -72,7 +75,9 @@ class PARQUET_EXPORT FileWriter { std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr properties = default_writer_properties(), std::shared_ptr arrow_properties = - default_arrow_writer_properties()); + default_arrow_writer_properties(), + const ArrowReaderProperties& schema_arrow_reader_properities = + default_arrow_reader_properties()); /// Return the Arrow schema to be written to. 
virtual std::shared_ptr<::arrow::Schema> schema() const = 0; @@ -177,7 +182,9 @@ WriteTable(const ::arrow::Table& table, MemoryPool* pool, int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH, std::shared_ptr properties = default_writer_properties(), std::shared_ptr arrow_properties = - default_arrow_writer_properties()); + default_arrow_writer_properties(), + const ArrowReaderProperties& schema_arrow_reader_properities = + default_arrow_reader_properties()); } // namespace arrow } // namespace parquet From 77dd7d3d5dbe4f2e1a2aceb25ac65e3e192e5fac Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sun, 16 Feb 2025 12:12:19 -0500 Subject: [PATCH 10/20] Document new flag --- cpp/src/parquet/properties.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 357e317448e7c..c0c56f7ce47b1 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -1007,6 +1007,10 @@ class PARQUET_EXPORT ArrowReaderProperties { /// Return whether loading statistics as much as possible. bool should_load_statistics() const { return should_load_statistics_; } + /// \brief When enabled, Parquet decimal logical types will always be + /// mapped to the smallest arrow decimal types based on the precision + /// + /// Default is false void set_smallest_decimal_enabled(bool smallest_decimal_enabled) { smallest_decimal_enabled_ = smallest_decimal_enabled; } From d81cf1399491c40e4a4dada2bc0b73c33f3705dc Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sun, 16 Feb 2025 13:03:39 -0500 Subject: [PATCH 11/20] Add decimal32/64 list type support arrow to pandas --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 335fbbdd0f8ca..d45451206bf4c 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -182,6 +182,8 @@ static inline bool ListTypeSupported(const DataType& type) { case Type::HALF_FLOAT: case Type::FLOAT: case Type::DOUBLE: + case Type::DECIMAL32: + case Type::DECIMAL64: case Type::DECIMAL128: case Type::DECIMAL256: case Type::BINARY: From 424472fcdf5475fcc53abe895e0e042bb24a8167 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sun, 16 Feb 2025 16:22:07 -0500 Subject: [PATCH 12/20] Support smallest_decimal_enabled flag in pyarrow --- cpp/src/arrow/dataset/file_parquet.cc | 4 +++- .../parquet/arrow/arrow_reader_writer_test.cc | 1 - cpp/src/parquet/arrow/writer.cc | 16 +++++++------- cpp/src/parquet/arrow/writer.h | 6 +++--- python/pyarrow/_dataset_parquet.pyx | 19 +++++++++++++++-- python/pyarrow/_parquet.pxd | 2 ++ python/pyarrow/_parquet.pyx | 5 ++++- python/pyarrow/parquet/core.py | 21 ++++++++++++++++--- python/pyarrow/tests/parquet/test_basic.py | 14 +++++++++++++ 9 files changed, 69 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index cf51ea18d7a02..e366e98cb012a 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -132,6 +132,8 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties( parquet_scan_options.arrow_reader_properties->cache_options()); arrow_properties.set_io_context( parquet_scan_options.arrow_reader_properties->io_context()); + 
arrow_properties.set_smallest_decimal_enabled( + parquet_scan_options.arrow_reader_properties->smallest_decimal_enabled()); arrow_properties.set_use_threads(options.use_threads); return arrow_properties; } @@ -532,7 +534,7 @@ Future> ParquetFileFormat::GetReader metadata) .Then( [=](const std::unique_ptr& reader) mutable - -> Result> { + -> Result> { auto arrow_properties = MakeArrowReaderProperties( *self, *reader->metadata(), *options, *parquet_scan_options); diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 875f8bf75ff34..d0ccbf4540128 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -#include "parquet/properties.h" #ifdef _MSC_VER # pragma warning(push) // Disable forcing value to bool warnings diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 23d352ace7924..06bc991d61c16 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -300,9 +300,9 @@ class FileWriterImpl : public FileWriter { } } - Status Init(const ArrowReaderProperties& schema_arrow_reader_properities) { + Status Init(const ArrowReaderProperties& schema_arrow_reader_properties) { return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr, - schema_arrow_reader_properities, &schema_manifest_); + schema_arrow_reader_properties, &schema_manifest_); } Status NewRowGroup() override { @@ -516,10 +516,10 @@ Status FileWriter::Make(::arrow::MemoryPool* pool, std::shared_ptr<::arrow::Schema> schema, std::shared_ptr arrow_properties, std::unique_ptr* out, - const ArrowReaderProperties& schema_arrow_reader_properities) { + const ArrowReaderProperties& schema_arrow_reader_properties) { std::unique_ptr impl(new FileWriterImpl( std::move(schema), pool, std::move(writer), std::move(arrow_properties))); - RETURN_NOT_OK(impl->Init(schema_arrow_reader_properities)); + RETURN_NOT_OK(impl->Init(schema_arrow_reader_properties)); *out = std::move(impl); return Status::OK(); } @@ -556,7 +556,7 @@ Result> FileWriter::Open( std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr properties, std::shared_ptr arrow_properties, - const ArrowReaderProperties& schema_arrow_reader_properities) { + const ArrowReaderProperties& schema_arrow_reader_properties) { std::shared_ptr parquet_schema; RETURN_NOT_OK( ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema)); @@ -575,7 +575,7 @@ Result> FileWriter::Open( auto schema_ptr = std::make_shared<::arrow::Schema>(schema); RETURN_NOT_OK(Make(pool, std::move(base_writer), std::move(schema_ptr), std::move(arrow_properties), &writer, - schema_arrow_reader_properities)); + schema_arrow_reader_properties)); return writer; } @@ -596,12 +596,12 @@ Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool, std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size, std::shared_ptr properties, std::shared_ptr arrow_properties, - const ArrowReaderProperties& schema_arrow_reader_properities) { + const ArrowReaderProperties& schema_arrow_reader_properties) { std::unique_ptr writer; ARROW_ASSIGN_OR_RAISE( writer, FileWriter::Open(*table.schema(), pool, std::move(sink), std::move(properties), - std::move(arrow_properties), schema_arrow_reader_properities)); + std::move(arrow_properties), schema_arrow_reader_properties)); RETURN_NOT_OK(writer->WriteTable(table, 
chunk_size)); return writer->Close(); } diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index cf83b6cf4193a..e08675f233bf4 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -58,7 +58,7 @@ class PARQUET_EXPORT FileWriter { std::shared_ptr<::arrow::Schema> schema, std::shared_ptr arrow_properties, std::unique_ptr* out, - const ArrowReaderProperties& schema_arrow_reader_properities = + const ArrowReaderProperties& schema_arrow_reader_properties = default_arrow_reader_properties()); /// \brief Try to create an Arrow to Parquet file writer. @@ -76,7 +76,7 @@ class PARQUET_EXPORT FileWriter { std::shared_ptr properties = default_writer_properties(), std::shared_ptr arrow_properties = default_arrow_writer_properties(), - const ArrowReaderProperties& schema_arrow_reader_properities = + const ArrowReaderProperties& schema_arrow_reader_properties = default_arrow_reader_properties()); /// Return the Arrow schema to be written to. @@ -183,7 +183,7 @@ WriteTable(const ::arrow::Table& table, MemoryPool* pool, std::shared_ptr properties = default_writer_properties(), std::shared_ptr arrow_properties = default_arrow_writer_properties(), - const ArrowReaderProperties& schema_arrow_reader_properities = + const ArrowReaderProperties& schema_arrow_reader_properties = default_arrow_reader_properties()); } // namespace arrow diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 863c928591937..bee05a6f15786 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -703,7 +703,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): cache_options : pyarrow.CacheOptions, default None Cache options used when pre_buffer is enabled. The default values should be good for most use cases. You may want to adjust these for example if - you have exceptionally high latency to the file system. + you have exceptionally high latency to the file system. thrift_string_size_limit : int, default None If not None, override the maximum total string size allocated when decoding Thrift structures. The default limit should be @@ -720,6 +720,11 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): Parquet file. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + smallest_decimal_enabled : bool, default False + If True, always convert to the smallest arrow decimal type based + on precision. 
""" # Avoid mistakingly creating attributes @@ -733,7 +738,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_container_size_limit=None, decryption_config=None, decryption_properties=None, - bint page_checksum_verification=False): + bint page_checksum_verification=False, + bint smallest_decimal_enabled=False): self.init(shared_ptr[CFragmentScanOptions]( new CParquetFragmentScanOptions())) self.use_buffered_stream = use_buffered_stream @@ -752,6 +758,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): if decryption_properties is not None: self.decryption_properties = decryption_properties self.page_checksum_verification = page_checksum_verification + self.smallest_decimal_enabled = smallest_decimal_enabled cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): FragmentScanOptions.init(self, sp) @@ -868,6 +875,14 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): def page_checksum_verification(self, bint page_checksum_verification): self.reader_properties().set_page_checksum_verification(page_checksum_verification) + @property + def smallest_decimal_enabled(self): + return self.arrow_reader_properties().smallest_decimal_enabled() + + @smallest_decimal_enabled.setter + def smallest_decimal_enabled(self, bint smallest_decimal_enabled): + self.arrow_reader_properties().set_smallest_decimal_enabled(smallest_decimal_enabled) + def equals(self, ParquetFragmentScanOptions other): """ Parameters diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index c17c3b70d7f41..241d9d7416d64 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -405,6 +405,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: CCacheOptions cache_options() const void set_coerce_int96_timestamp_unit(TimeUnit unit) TimeUnit coerce_int96_timestamp_unit() const + void set_smallest_decimal_enabled(c_bool smallest_decimal_enabled) + c_bool smallest_decimal_enabled() const ArrowReaderProperties default_arrow_reader_properties() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 2fb1e41641f8e..cd91c507d70ee 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1441,7 +1441,8 @@ cdef class ParquetReader(_Weakrefable): FileDecryptionProperties decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): + page_checksum_verification=False, + smallest_decimal_enabled=False): """ Open a parquet file for reading. @@ -1458,6 +1459,7 @@ cdef class ParquetReader(_Weakrefable): thrift_string_size_limit : int, optional thrift_container_size_limit : int, optional page_checksum_verification : bool, default False + smallest_decimal_enabled : bool, default False """ cdef: shared_ptr[CFileMetaData] c_metadata @@ -1497,6 +1499,7 @@ cdef class ParquetReader(_Weakrefable): decryption_properties.unwrap()) arrow_props.set_pre_buffer(pre_buffer) + arrow_props.set_smallest_decimal_enabled(smallest_decimal_enabled) properties.set_page_checksum_verification(page_checksum_verification) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 6ca6f7089e75c..3589f7e073616 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -255,6 +255,9 @@ class ParquetFile: it will be parsed as an URI to determine the filesystem. page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. 
+ smallest_decimal_enabled : bool, default False + If True, always convert to the smallest arrow decimal type based + on precision. Examples -------- @@ -303,7 +306,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None, pre_buffer=False, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, filesystem=None, - page_checksum_verification=False): + page_checksum_verification=False, smallest_decimal_enabled=False): self._close_source = getattr(source, 'closed', True) @@ -323,6 +326,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, + smallest_decimal_enabled=smallest_decimal_enabled, ) self.common_metadata = common_metadata self._nested_paths_by_prefix = self._build_nested_paths() @@ -1267,6 +1271,9 @@ class ParquetDataset: If True, verify the page checksum for each page read from the file. use_legacy_dataset : bool, optional Deprecated and has no effect from PyArrow version 15.0.0. +smallest_decimal_enabled : bool, default False + If True, always convert to the smallest arrow decimal type based + on precision. Examples -------- @@ -1280,7 +1287,8 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, page_checksum_verification=False, - use_legacy_dataset=None): + use_legacy_dataset=None, + smallest_decimal_enabled=False): if use_legacy_dataset is not None: warnings.warn( @@ -1297,6 +1305,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, "thrift_string_size_limit": thrift_string_size_limit, "thrift_container_size_limit": thrift_container_size_limit, "page_checksum_verification": page_checksum_verification, + "smallest_decimal_enabled": smallest_decimal_enabled, } if buffer_size: read_options.update(use_buffered_stream=True, @@ -1686,6 +1695,9 @@ def partitioning(self): sufficient for most Parquet files. page_checksum_verification : bool, default False If True, verify the checksum for each page read from the file. +smallest_decimal_enabled : bool, default False + If True, always convert to the smallest arrow decimal type based + on precision. 
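For illustration, a minimal usage sketch of the flag (the file name and example values here are assumptions, and the exact result type follows the behavior described above): with the flag enabled the reader picks the narrowest Arrow decimal that holds the declared precision, roughly precision <= 9 -> decimal32, <= 18 -> decimal64, <= 38 -> decimal128, larger -> decimal256.

    import pyarrow as pa
    import pyarrow.parquet as pq
    from decimal import Decimal

    # Write a low-precision decimal column; by default it reads back as decimal128(5, 2).
    table = pa.table({"price": pa.array([Decimal("1.23")], type=pa.decimal128(5, 2))})
    pq.write_table(table, "example.parquet")

    # Precision 5 fits the 32-bit decimal type, so with the flag enabled the column
    # should read back as decimal32(5, 2) rather than decimal128(5, 2).
    result = pq.read_table("example.parquet", smallest_decimal_enabled=True)
    print(result.schema.field("price").type)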
Returns ------- @@ -1781,7 +1793,8 @@ def read_table(source, *, columns=None, use_threads=True, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): + page_checksum_verification=False, + smallest_decimal_enabled=False): if use_legacy_dataset is not None: warnings.warn( @@ -1806,6 +1819,7 @@ def read_table(source, *, columns=None, use_threads=True, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, + smallest_decimal_enabled=smallest_decimal_enabled, ) except ImportError: # fall back on ParquetFile for simple cases when pyarrow.dataset @@ -1838,6 +1852,7 @@ def read_table(source, *, columns=None, use_threads=True, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, + smallest_decimal_enabled=smallest_decimal_enabled, ) return dataset.read(columns=columns, use_threads=use_threads, diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 6496aa99092b8..b0f74808cf60e 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -362,6 +362,20 @@ def test_byte_stream_split(): use_dictionary=False) +def test_smallest_decimal_enabled(tempdir): + arr1 = pa.array(list(map(Decimal, range(100))), type=pa.decimal32(5, 2)) + arr2 = pa.array(list(map(Decimal, range(100))), type=pa.decimal64(16, 9)) + arr3 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(22, 2)) + arr4 = pa.array(list(map(Decimal, range(100))), type=pa.decimal256(48, 2)) + data_decimal = [arr1, arr2, arr3, arr4] + table = pa.Table.from_arrays(data_decimal, names=['a', 'b', 'c', 'd']) + + # Check with smallest_decimal_enabled + _check_roundtrip(table, + expected=table, + read_table_kwargs={"smallest_decimal_enabled": True}) + + def test_store_decimal_as_integer(tempdir): arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(5, 2)) From d1687a74c050d8702cafb4b67fa2b4dab523b5e6 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 8 Mar 2025 21:09:30 -0500 Subject: [PATCH 13/20] Revert writer schema manifest arg passing change --- .../parquet/arrow/arrow_reader_writer_test.cc | 20 +++++++--------- cpp/src/parquet/arrow/writer.cc | 23 ++++++++----------- cpp/src/parquet/arrow/writer.h | 19 +++++---------- 3 files changed, 23 insertions(+), 39 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index d0ccbf4540128..a868df48bdf5a 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -395,16 +395,13 @@ using ParquetWriter = TypedColumnWriter>; void WriteTableToBuffer(const std::shared_ptr
& table, int64_t row_group_size, const std::shared_ptr& arrow_properties, - std::shared_ptr* out, - const ArrowReaderProperties& schema_arrow_reader_properities = - default_arrow_reader_properties()) { + std::shared_ptr* out) { auto sink = CreateOutputStream(); auto write_props = WriterProperties::Builder().write_batch_size(100)->build(); ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, write_props, arrow_properties, - schema_arrow_reader_properities)); + row_group_size, write_props, arrow_properties)); ASSERT_OK_AND_ASSIGN(*out, sink->Finish()); } @@ -459,8 +456,8 @@ void DoSimpleRoundtrip( default_arrow_writer_properties(), const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()) { std::shared_ptr buffer; - ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size, arrow_properties, - &buffer, reader_properties)); + ASSERT_NO_FATAL_FAILURE( + WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer)); ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), @@ -483,8 +480,8 @@ void DoRoundTripWithBatches( ArrowReaderProperties reader_properties = default_arrow_reader_properties()) { std::shared_ptr buffer; reader_properties.set_batch_size(row_group_size - 1); - ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer( - table, row_group_size, arrow_writer_properties, &buffer, reader_properties)); + ASSERT_NO_FATAL_FAILURE( + WriteTableToBuffer(table, row_group_size, arrow_writer_properties, &buffer)); std::unique_ptr reader; FileReaderBuilder builder; @@ -1227,7 +1224,7 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) { auto arrow_sink_ = std::make_shared<::arrow::io::BufferOutputStream>(buffer); ASSERT_OK_NO_THROW(WriteTable(*table, default_memory_pool(), arrow_sink_, 512, default_writer_properties(), - default_arrow_writer_properties(), reader_properties)); + default_arrow_writer_properties())); // XXX: Remove this after ARROW-455 completed ASSERT_OK(arrow_sink_->Close()); @@ -5394,11 +5391,10 @@ class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO { auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); std::unique_ptr writer; - auto reader_properties = this->template ReaderPropertiesFromArrowType(); ASSERT_OK_NO_THROW(FileWriter::Make( ::arrow::default_memory_pool(), ParquetFileWriter::Open(this->sink_, schema_node, writer_properties), - arrow_schema, default_arrow_writer_properties(), &writer, reader_properties)); + arrow_schema, default_arrow_writer_properties(), &writer)); ASSERT_OK_NO_THROW(writer->NewRowGroup()); ASSERT_OK_NO_THROW(writer->WriteColumnChunk(*values)); ASSERT_OK_NO_THROW(writer->Close()); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 06bc991d61c16..c6d86648c1d63 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -300,9 +300,9 @@ class FileWriterImpl : public FileWriter { } } - Status Init(const ArrowReaderProperties& schema_arrow_reader_properties) { + Status Init() { return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr, - schema_arrow_reader_properties, &schema_manifest_); + default_arrow_reader_properties(), &schema_manifest_); } Status NewRowGroup() override { @@ -515,11 +515,10 @@ Status FileWriter::Make(::arrow::MemoryPool* pool, std::unique_ptr writer, std::shared_ptr<::arrow::Schema> schema, std::shared_ptr arrow_properties, - std::unique_ptr* out, - const ArrowReaderProperties& schema_arrow_reader_properties) { + 
std::unique_ptr* out) { std::unique_ptr impl(new FileWriterImpl( std::move(schema), pool, std::move(writer), std::move(arrow_properties))); - RETURN_NOT_OK(impl->Init(schema_arrow_reader_properties)); + RETURN_NOT_OK(impl->Init()); *out = std::move(impl); return Status::OK(); } @@ -555,8 +554,7 @@ Result> FileWriter::Open( const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr properties, - std::shared_ptr arrow_properties, - const ArrowReaderProperties& schema_arrow_reader_properties) { + std::shared_ptr arrow_properties) { std::shared_ptr parquet_schema; RETURN_NOT_OK( ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema)); @@ -574,8 +572,7 @@ Result> FileWriter::Open( std::unique_ptr writer; auto schema_ptr = std::make_shared<::arrow::Schema>(schema); RETURN_NOT_OK(Make(pool, std::move(base_writer), std::move(schema_ptr), - std::move(arrow_properties), &writer, - schema_arrow_reader_properties)); + std::move(arrow_properties), &writer)); return writer; } @@ -595,13 +592,11 @@ Status WriteMetaDataFile(const FileMetaData& file_metadata, Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool, std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size, std::shared_ptr properties, - std::shared_ptr arrow_properties, - const ArrowReaderProperties& schema_arrow_reader_properties) { + std::shared_ptr arrow_properties) { std::unique_ptr writer; ARROW_ASSIGN_OR_RAISE( - writer, - FileWriter::Open(*table.schema(), pool, std::move(sink), std::move(properties), - std::move(arrow_properties), schema_arrow_reader_properties)); + writer, FileWriter::Open(*table.schema(), pool, std::move(sink), + std::move(properties), std::move(arrow_properties))); RETURN_NOT_OK(writer->WriteTable(table, chunk_size)); return writer->Close(); } diff --git a/cpp/src/parquet/arrow/writer.h b/cpp/src/parquet/arrow/writer.h index e08675f233bf4..e36b8f252c750 100644 --- a/cpp/src/parquet/arrow/writer.h +++ b/cpp/src/parquet/arrow/writer.h @@ -53,13 +53,10 @@ namespace arrow { /// file. class PARQUET_EXPORT FileWriter { public: - static ::arrow::Status Make( - MemoryPool* pool, std::unique_ptr writer, - std::shared_ptr<::arrow::Schema> schema, - std::shared_ptr arrow_properties, - std::unique_ptr* out, - const ArrowReaderProperties& schema_arrow_reader_properties = - default_arrow_reader_properties()); + static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr writer, + std::shared_ptr<::arrow::Schema> schema, + std::shared_ptr arrow_properties, + std::unique_ptr* out); /// \brief Try to create an Arrow to Parquet file writer. /// @@ -75,9 +72,7 @@ class PARQUET_EXPORT FileWriter { std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr properties = default_writer_properties(), std::shared_ptr arrow_properties = - default_arrow_writer_properties(), - const ArrowReaderProperties& schema_arrow_reader_properties = - default_arrow_reader_properties()); + default_arrow_writer_properties()); /// Return the Arrow schema to be written to. 
virtual std::shared_ptr<::arrow::Schema> schema() const = 0; @@ -182,9 +177,7 @@ WriteTable(const ::arrow::Table& table, MemoryPool* pool, int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH, std::shared_ptr properties = default_writer_properties(), std::shared_ptr arrow_properties = - default_arrow_writer_properties(), - const ArrowReaderProperties& schema_arrow_reader_properties = - default_arrow_reader_properties()); + default_arrow_writer_properties()); } // namespace arrow } // namespace parquet From 52711d55ee5632bfbf081c0a98bebe96cc160e36 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 22 Mar 2025 16:36:06 -0400 Subject: [PATCH 14/20] Fix lint --- cpp/src/arrow/dataset/file_parquet.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index e366e98cb012a..450723ca6b0b0 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -534,7 +534,7 @@ Future> ParquetFileFormat::GetReader metadata) .Then( [=](const std::unique_ptr& reader) mutable - -> Result> { + -> Result> { auto arrow_properties = MakeArrowReaderProperties( *self, *reader->metadata(), *options, *parquet_scan_options); From f64d6d91d2157cd22429a151c87fa1ced6a0b604 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 22 Mar 2025 17:39:34 -0400 Subject: [PATCH 15/20] Remove extra doc --- python/pyarrow/_dataset_parquet.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 8c310a89f991a..b144a472d81fa 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -720,8 +720,6 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): Parquet file. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. - page_checksum_verification : bool, default False - If True, verify the page checksum for each page read from the file. smallest_decimal_enabled : bool, default False If True, always convert to the smallest arrow decimal type based on precision. From 3fb307e18c73742d6910c32786c9a076c9a79961 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 29 Mar 2025 17:31:06 -0400 Subject: [PATCH 16/20] Revert FileReader changes --- .../parquet/arrow/arrow_reader_writer_test.cc | 19 ++++++++++++++----- cpp/src/parquet/arrow/reader.cc | 9 ++------- cpp/src/parquet/arrow/reader.h | 6 +----- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 5c15bf3b55861..081590d0182b6 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include "arrow/type_fwd.h" #ifdef _MSC_VER # pragma warning(push) // Disable forcing value to bool warnings @@ -459,9 +460,12 @@ void DoSimpleRoundtrip( ASSERT_NO_FATAL_FAILURE( WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer)); - ASSERT_OK_AND_ASSIGN(auto reader, - OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), reader_properties)); + std::unique_ptr reader; + FileReaderBuilder builder; + ASSERT_OK(builder.Open(std::make_shared(buffer))); + ASSERT_OK(builder.properties(reader_properties) + ->memory_pool(::arrow::default_memory_pool()) + ->Build(&reader)); reader->set_use_threads(use_threads); if (column_subset.size() > 0) { @@ -1234,8 +1238,13 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) { auto source = std::make_shared(pbuffer); std::shared_ptr<::arrow::Table> out; - ASSERT_OK_AND_ASSIGN( - auto reader, OpenFile(source, ::arrow::default_memory_pool(), reader_properties)); + std::unique_ptr reader; + FileReaderBuilder builder; + ASSERT_OK(builder.Open(source)); + ASSERT_OK(builder.properties(reader_properties) + ->memory_pool(::arrow::default_memory_pool()) + ->Build(&reader)); + ASSERT_NO_FATAL_FAILURE(this->ReadTableFromFile(std::move(reader), &out)); ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(values->length(), out->num_rows()); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index ca29bf0b8288d..03b725beb2a01 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -362,10 +362,6 @@ class FileReaderImpl : public FileReader { reader_properties_.set_batch_size(batch_size); } - void set_smallest_decimal_enabled(bool smallest_decimal_enabled) override { - reader_properties_.set_smallest_decimal_enabled(smallest_decimal_enabled); - } - const ArrowReaderProperties& properties() const override { return reader_properties_; } const SchemaManifest& manifest() const override { return manifest_; } @@ -1405,11 +1401,10 @@ Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* } Result> OpenFile( - std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool, - const ArrowReaderProperties& reader_properties) { + std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool) { FileReaderBuilder builder; RETURN_NOT_OK(builder.Open(std::move(file))); - return builder.memory_pool(pool)->properties(reader_properties)->Build(); + return builder.memory_pool(pool)->Build(); } namespace internal { diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index d35ccc2099cfe..476a940bf1696 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -303,9 +303,6 @@ class PARQUET_EXPORT FileReader { /// Set number of records to read per batch for the RecordBatchReader. virtual void set_batch_size(int64_t batch_size) = 0; - /// Set whether to enable smallest decimal arrow type - virtual void set_smallest_decimal_enabled(bool smallest_decimal_enabled) = 0; - virtual const ArrowReaderProperties& properties() const = 0; virtual const SchemaManifest& manifest() const = 0; @@ -406,8 +403,7 @@ ::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>, /// Advanced settings are supported through the FileReaderBuilder class. 
PARQUET_EXPORT ::arrow::Result> OpenFile( - std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* pool, - const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()); + std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator); /// @} From f27934908c06388f58cd663c44569be9e8ee5e87 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 29 Mar 2025 18:23:59 -0400 Subject: [PATCH 17/20] Delay scratch buffer pointer cast --- cpp/src/parquet/column_writer.cc | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index ccebec674f70a..f5db865f1dd7a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2404,8 +2404,6 @@ struct SerializeFunctor< int64_t non_null_count = array.length() - array.null_count(); int64_t size = non_null_count * ArrowType::kByteWidth; scratch_buffer = AllocateBuffer(ctx->memory_pool, size); - scratch_i32 = reinterpret_cast(scratch_buffer->mutable_data()); - scratch_i64 = reinterpret_cast(scratch_buffer->mutable_data()); } template @@ -2417,32 +2415,32 @@ struct SerializeFunctor< "only 4/8/16/32 byte Decimals supported"); if constexpr (byte_width == ::arrow::Decimal32Type::kByteWidth) { + int32_t* scratch = reinterpret_cast(scratch_buffer->mutable_data()); const auto* u32_in = reinterpret_cast(in); - auto out = reinterpret_cast(scratch_i32) + offset; - *scratch_i32++ = ::arrow::bit_util::ToBigEndian(u32_in[0]); + auto out = reinterpret_cast(scratch) + offset; + *scratch++ = ::arrow::bit_util::ToBigEndian(u32_in[0]); return FixedLenByteArray(out); } + int64_t* scratch = reinterpret_cast(scratch_buffer->mutable_data()); const auto* u64_in = reinterpret_cast(in); - auto out = reinterpret_cast(scratch_i64) + offset; + auto out = reinterpret_cast(scratch) + offset; if constexpr (byte_width == ::arrow::Decimal64Type::kByteWidth) { - *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } else if constexpr (byte_width == ::arrow::Decimal128Type::kByteWidth) { - *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); - *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); + *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } else { - *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[3]); - *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[2]); - *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); - *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[3]); + *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[2]); + *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); + *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } return FixedLenByteArray(out); } std::shared_ptr scratch_buffer; - int32_t* scratch_i32; - int64_t* scratch_i64; }; // ---------------------------------------------------------------------- From 8a78c72bf945ea56b06d06f6b8f7e0485c2bec72 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Sat, 29 Mar 2025 18:43:56 -0400 Subject: [PATCH 18/20] Use ArrowReaderProperties --- cpp/src/parquet/arrow/schema_internal.cc | 41 ++++++++++++------------ cpp/src/parquet/arrow/schema_internal.h | 12 ++++--- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git 
a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc index aa5182e036634..78432e9f5b438 100644 --- a/cpp/src/parquet/arrow/schema_internal.cc +++ b/cpp/src/parquet/arrow/schema_internal.cc @@ -115,20 +115,21 @@ Result> MakeArrowTimestamp(const LogicalType& logical } } -Result> FromByteArray(const LogicalType& logical_type, - bool arrow_extensions_enabled, - bool smallest_decimal_enabled) { +Result> FromByteArray( + const LogicalType& logical_type, const ArrowReaderProperties& reader_properties + +) { switch (logical_type.type()) { case LogicalType::Type::STRING: return ::arrow::utf8(); case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type, smallest_decimal_enabled); + return MakeArrowDecimal(logical_type, reader_properties.smallest_decimal_enabled()); case LogicalType::Type::NONE: case LogicalType::Type::ENUM: case LogicalType::Type::BSON: return ::arrow::binary(); case LogicalType::Type::JSON: - if (arrow_extensions_enabled) { + if (reader_properties.get_arrow_extensions_enabled()) { return ::arrow::extension::json(::arrow::utf8()); } // When the original Arrow schema isn't stored and Arrow extensions are disabled, @@ -140,12 +141,12 @@ Result> FromByteArray(const LogicalType& logical_type } } -Result> FromFLBA(const LogicalType& logical_type, - int32_t physical_length, - bool smallest_decimal_enabled) { +Result> FromFLBA( + const LogicalType& logical_type, int32_t physical_length, + const ArrowReaderProperties& reader_properties) { switch (logical_type.type()) { case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type, smallest_decimal_enabled); + return MakeArrowDecimal(logical_type, reader_properties.smallest_decimal_enabled()); case LogicalType::Type::FLOAT16: return ::arrow::float16(); case LogicalType::Type::NONE: @@ -159,8 +160,8 @@ Result> FromFLBA(const LogicalType& logical_type, } } -::arrow::Result> FromInt32(const LogicalType& logical_type, - bool smallest_decimal_enabled) { +::arrow::Result> FromInt32( + const LogicalType& logical_type, const ArrowReaderProperties& reader_properties) { switch (logical_type.type()) { case LogicalType::Type::INT: return MakeArrowInt(logical_type); @@ -169,7 +170,7 @@ ::arrow::Result> FromInt32(const LogicalType& logical case LogicalType::Type::TIME: return MakeArrowTime32(logical_type); case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type, smallest_decimal_enabled); + return MakeArrowDecimal(logical_type, reader_properties.smallest_decimal_enabled()); case LogicalType::Type::NONE: return ::arrow::int32(); default: @@ -178,13 +179,13 @@ ::arrow::Result> FromInt32(const LogicalType& logical } } -Result> FromInt64(const LogicalType& logical_type, - bool smallest_decimal_enabled) { +Result> FromInt64( + const LogicalType& logical_type, const ArrowReaderProperties& reader_properties) { switch (logical_type.type()) { case LogicalType::Type::INT: return MakeArrowInt64(logical_type); case LogicalType::Type::DECIMAL: - return MakeArrowDecimal(logical_type, smallest_decimal_enabled); + return MakeArrowDecimal(logical_type, reader_properties.smallest_decimal_enabled()); case LogicalType::Type::TIMESTAMP: return MakeArrowTimestamp(logical_type); case LogicalType::Type::TIME: @@ -204,14 +205,13 @@ Result> GetArrowType( return ::arrow::null(); } - bool smallest_decimal_enabled = reader_properties.smallest_decimal_enabled(); switch (physical_type) { case ParquetType::BOOLEAN: return ::arrow::boolean(); case ParquetType::INT32: - return FromInt32(logical_type, 
smallest_decimal_enabled); + return FromInt32(logical_type, reader_properties); case ParquetType::INT64: - return FromInt64(logical_type, smallest_decimal_enabled); + return FromInt64(logical_type, reader_properties); case ParquetType::INT96: return ::arrow::timestamp(reader_properties.coerce_int96_timestamp_unit()); case ParquetType::FLOAT: @@ -219,10 +219,9 @@ Result> GetArrowType( case ParquetType::DOUBLE: return ::arrow::float64(); case ParquetType::BYTE_ARRAY: - return FromByteArray(logical_type, reader_properties.get_arrow_extensions_enabled(), - smallest_decimal_enabled); + return FromByteArray(logical_type, reader_properties); case ParquetType::FIXED_LEN_BYTE_ARRAY: - return FromFLBA(logical_type, type_length, smallest_decimal_enabled); + return FromFLBA(logical_type, type_length, reader_properties); default: { // PARQUET-1565: This can occur if the file is corrupt return Status::IOError("Invalid physical column type: ", diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index d26d32c536e72..ed16bf66daa8d 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -19,6 +19,7 @@ #include "arrow/result.h" #include "arrow/type_fwd.h" +#include "parquet/properties.h" #include "parquet/schema.h" namespace arrow { @@ -30,15 +31,16 @@ namespace parquet::arrow { using ::arrow::Result; Result> FromByteArray( - const LogicalType& logical_type, bool arrow_extensions_enabled = false, - bool smallest_decimal_enabled = false); + const LogicalType& logical_type, const ArrowReaderProperties& reader_properties); Result> FromFLBA( const LogicalType& logical_type, int32_t physical_length, - bool smallest_decimal_enabled = false); + const ArrowReaderProperties& reader_properties); Result> FromInt32( - const LogicalType& logical_type, bool smallest_decimal_enabled = false); + const LogicalType& logical_type, + const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()); Result> FromInt64( - const LogicalType& logical_type, bool smallest_decimal_enabled = false); + const LogicalType& logical_type, + const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()); Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, From d2e1ffa550e0212f439ab240166f82cd85a265d4 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Thu, 3 Apr 2025 22:37:17 -0400 Subject: [PATCH 19/20] Revert "Delay scratch buffer pointer cast" This reverts commit f27934908c06388f58cd663c44569be9e8ee5e87. 
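For context, a minimal illustration of the byte layout involved (Python used only for illustration, example values assumed): a DECIMAL stored as FIXED_LEN_BYTE_ARRAY holds the unscaled integer in big-endian two's complement, which is what the ToBigEndian word swaps in SerializeFunctor produce.

    from decimal import Decimal

    value = Decimal("1.23")                    # decimal(5, 2)
    unscaled = int(value.scaleb(2))            # 123
    # 4-byte FLBA for a 32-bit decimal: big-endian, two's complement.
    print(unscaled.to_bytes(4, "big", signed=True).hex())   # 0000007b
    # Negative values are sign-extended across the full fixed width.
    print((-1).to_bytes(4, "big", signed=True).hex())       # ffffffff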
--- cpp/src/parquet/column_writer.cc | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f5db865f1dd7a..ccebec674f70a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2404,6 +2404,8 @@ struct SerializeFunctor< int64_t non_null_count = array.length() - array.null_count(); int64_t size = non_null_count * ArrowType::kByteWidth; scratch_buffer = AllocateBuffer(ctx->memory_pool, size); + scratch_i32 = reinterpret_cast(scratch_buffer->mutable_data()); + scratch_i64 = reinterpret_cast(scratch_buffer->mutable_data()); } template @@ -2415,32 +2417,32 @@ struct SerializeFunctor< "only 4/8/16/32 byte Decimals supported"); if constexpr (byte_width == ::arrow::Decimal32Type::kByteWidth) { - int32_t* scratch = reinterpret_cast(scratch_buffer->mutable_data()); const auto* u32_in = reinterpret_cast(in); - auto out = reinterpret_cast(scratch) + offset; - *scratch++ = ::arrow::bit_util::ToBigEndian(u32_in[0]); + auto out = reinterpret_cast(scratch_i32) + offset; + *scratch_i32++ = ::arrow::bit_util::ToBigEndian(u32_in[0]); return FixedLenByteArray(out); } - int64_t* scratch = reinterpret_cast(scratch_buffer->mutable_data()); const auto* u64_in = reinterpret_cast(in); - auto out = reinterpret_cast(scratch) + offset; + auto out = reinterpret_cast(scratch_i64) + offset; if constexpr (byte_width == ::arrow::Decimal64Type::kByteWidth) { - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } else if constexpr (byte_width == ::arrow::Decimal128Type::kByteWidth) { - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } else { - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[3]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[2]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); - *scratch++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[3]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[2]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[1]); + *scratch_i64++ = ::arrow::bit_util::ToBigEndian(u64_in[0]); } return FixedLenByteArray(out); } std::shared_ptr scratch_buffer; + int32_t* scratch_i32; + int64_t* scratch_i64; }; // ---------------------------------------------------------------------- From a8304f311805b9f54e08fd6447b86b5d0f698524 Mon Sep 17 00:00:00 2001 From: Tim Nguyen <6283718+curioustien@users.noreply.github.com> Date: Thu, 3 Apr 2025 23:04:02 -0400 Subject: [PATCH 20/20] Remove mistake include --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 9a0d7fb95b53f..6d0fcffd81de3 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/type_fwd.h" #ifdef _MSC_VER # pragma warning(push) // Disable forcing value to bool warnings