diff --git a/tensorstore/driver/driver.cc b/tensorstore/driver/driver.cc index 98b242ddb..ebf8b4985 100644 --- a/tensorstore/driver/driver.cc +++ b/tensorstore/driver/driver.cc @@ -103,7 +103,17 @@ Future<Driver::Handle> OpenDriver(TransformedDriverSpec bound_spec, if (composed_transform.ok()) { handle->transform = std::move(composed_transform).value(); } else { - status = composed_transform.status(); + // Fallback for Zarr driver opening without field specified. + if ((handle->transform.domain().rank() + 1) == bound_spec.transform.domain().rank()) { + // TODO: Make this a safer fallback. There may be a way to do it at the Zarr driver level. + // Best-effort: compose the spec's transform with itself to absorb the synthetic field dimension. + composed_transform = tensorstore::ComposeTransforms(bound_spec.transform, bound_spec.transform); + if (composed_transform.ok()) { + handle->transform = std::move(composed_transform).value(); + } + } else { + status = composed_transform.status(); + } } } diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index a3a3f9d42..4e224aa65 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -121,8 +121,12 @@ Result<MetadataCache::MetadataPtr> MetadataCache::DecodeMetadata( Result<absl::Cord> MetadataCache::EncodeMetadata(std::string_view entry_key, const void* metadata) { - return absl::Cord( - ::nlohmann::json(*static_cast<const ZarrMetadata*>(metadata)).dump()); + auto meta = ::nlohmann::json(*static_cast<const ZarrMetadata*>(metadata)); + if (meta["dtype"].is_array() && meta["dtype"].back()[0] == "") { + meta["dtype"].erase(meta["dtype"].size() - 1); + } + + return absl::Cord(meta.dump()); } absl::Status ZarrDriverSpec::ApplyOptions(SpecOptions&& options) { @@ -264,7 +268,8 @@ void DataCache::GetChunkGridBounds(const void* metadata_ptr, DimensionSet& implicit_lower_bounds, DimensionSet& implicit_upper_bounds) { const auto& metadata = *static_cast<const ZarrMetadata*>(metadata_ptr); - assert(bounds.rank() == static_cast<DimensionIndex>(metadata.shape.size())); + assert((bounds.rank() == 
static_cast<DimensionIndex>(metadata.shape.size())) || + (bounds.rank()+1 == static_cast<DimensionIndex>(metadata.shape.size()))); std::fill(bounds.origin().begin(), bounds.origin().end(), Index(0)); std::copy(metadata.shape.begin(), metadata.shape.end(), bounds.shape().begin()); @@ -291,15 +296,32 @@ Result<std::shared_ptr<const void>> DataCache::GetResizedMetadata( internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( const ZarrMetadata& metadata) { + bool popped_synthetic_dim = false; + + if (metadata.shape.size() - 1 == metadata.rank) { + popped_synthetic_dim = true; + const_cast<ZarrMetadata&>(metadata).shape.pop_back(); + const_cast<ZarrMetadata&>(metadata).chunks.pop_back(); + } + size_t true_size = metadata.dtype.fields.size(); internal::ChunkGridSpecification::ComponentList components; - components.reserve(metadata.dtype.fields.size()); + components.reserve(true_size); std::vector<DimensionIndex> chunked_to_cell_dimensions( metadata.chunks.size()); std::iota(chunked_to_cell_dimensions.begin(), chunked_to_cell_dimensions.end(), static_cast<DimensionIndex>(0)); - for (size_t field_i = 0; field_i < metadata.dtype.fields.size(); ++field_i) { + for (size_t field_i = 0; field_i < true_size; ++field_i) { const auto& field = metadata.dtype.fields[field_i]; + + if (field.name.empty() && true_size > 1 && !popped_synthetic_dim) { + // Fix the synthetic field + const_cast<ZarrMetadata&>(metadata).chunk_layout.fields[field_i].decoded_chunk_layout = metadata.chunk_layout.fields[0].decoded_chunk_layout; + // We need to "add" a dimension or there will be an illegal transform + const_cast<ZarrMetadata&>(metadata).shape.push_back(metadata.dtype.fields.back().num_bytes); + const_cast<ZarrMetadata&>(metadata).chunks.push_back(0); // No chunking in the synthetic dimension + } const auto& field_layout = metadata.chunk_layout.fields[field_i]; + auto fill_value = metadata.fill_value[field_i]; const bool fill_value_specified = fill_value.valid(); if (!fill_value.valid()) { @@ -323,7 +345,13 @@ internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( for (DimensionIndex cell_dim = fill_value_start_dim; cell_dim < cell_rank; ++cell_dim) { const Index size = 
field_layout.full_chunk_shape()[cell_dim]; - assert(fill_value.shape()[cell_dim - fill_value_start_dim] == size); + + if (field.name.empty() && true_size > 1 && field_i+1 == true_size) { + // TODO: Figure out how this case should be properly handled + } else { + assert(fill_value.shape()[cell_dim - fill_value_start_dim] == size); + } + chunk_fill_value.shape()[cell_dim] = size; chunk_fill_value.byte_strides()[cell_dim] = fill_value.byte_strides()[cell_dim - fill_value_start_dim]; @@ -504,13 +532,21 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { Result<std::size_t> GetComponentIndex(const void* metadata_ptr, OpenMode open_mode) override { - const auto& metadata = *static_cast<const ZarrMetadata*>(metadata_ptr); + const auto& const_metadata = *static_cast<const ZarrMetadata*>(metadata_ptr); + + ZarrMetadata metadata = const_metadata; + // Validate against a trimmed temporary copy. TODO: avoid copying the metadata. + if (!spec().selected_field.empty() && metadata.dtype.fields.back().name.empty()) { + metadata.dtype.fields.pop_back(); + } + // The trimmed copy is used only for validation; field lookup below uses the unmodified metadata. 
TENSORSTORE_RETURN_IF_ERROR( - ValidateMetadata(metadata, spec().partial_metadata)); + ValidateMetadata(metadata, spec().partial_metadata)); + TENSORSTORE_ASSIGN_OR_RETURN( - auto field_index, GetFieldIndex(metadata.dtype, spec().selected_field)); + auto field_index, GetFieldIndex(const_metadata.dtype, spec().selected_field)); TENSORSTORE_RETURN_IF_ERROR( - ValidateMetadataSchema(metadata, field_index, spec().schema)); + ValidateMetadataSchema(metadata, field_index, spec().schema)); return field_index; } }; diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index 871e5b4b5..1d800f2a1 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -1125,6 +1125,70 @@ TEST(ZarrDriverTest, CreateLittleEndianUnaligned) { })); } +TEST(ZarrDriverTest, OpenWithoutField) { + ::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", + { + {"driver", "memory"}, + {"path", "prefix/"}, + }}, + {"metadata", + { + {"compressor", {{"id", "blosc"}}}, + {"dtype", ::nlohmann::json::array_t{{"x", "|b1"}, {"y", " ParseDTypeNoDerived(const nlohmann::json& value) { +Result ParseDTypeNoDerived(const nlohmann::json& value, const ParseDTypeOptions& options) { ZarrDType out; if (value.is_string()) { + const_cast(options).treat_struct_as_byte_array = false; // Single field. 
out.has_fields = false; out.fields.resize(1); @@ -247,6 +248,10 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { switch (i) { case 0: if (internal_json::JsonRequireValueAs(v, &field.name).ok()) { + // This SHOULD only be the case if a field was not provided + if (field_i > 0 && field_i == (out.fields.size() - 1) && field.name.empty()) { + return absl::OkStatus(); + } if (!field.name.empty()) return absl::OkStatus(); } return absl::InvalidArgumentError(tensorstore::StrCat( @@ -279,6 +284,24 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { }); }); if (!parse_result.ok()) return parse_result; + + if (options.treat_struct_as_byte_array) { + + // Check if we've already added a synthetic field + if (!out.fields.back().name.empty()) { + // Convert struct dtype to a single byte array dtype. + ZarrDType::Field byte_array_field; + byte_array_field.name = ""; + byte_array_field.dtype = dtype_v; + byte_array_field.endian = endian::native; + byte_array_field.encoded_dtype = "|V"; + byte_array_field.flexible_shape = {out.bytes_per_outer_element}; + byte_array_field.num_inner_elements = 0; // I don't think I need to set this explicitly + byte_array_field.byte_offset = 0; // I don't think I need to set this explicitly + byte_array_field.num_bytes = 0; // This will get set properly elsewhere, but let's init it to 0 anyway + out.fields.push_back({byte_array_field}); + } + } return out; } @@ -286,7 +309,14 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { absl::Status ValidateDType(ZarrDType& dtype) { dtype.bytes_per_outer_element = 0; - for (size_t field_i = 0; field_i < dtype.fields.size(); ++field_i) { + size_t num_fields = dtype.fields.size(); + // TODO: Implement better logic and name + bool flag = false; + if (dtype.fields.back().name.empty() && dtype.fields.size() > 1) { + --num_fields; + flag = true; + } + for (size_t field_i = 0; field_i < num_fields; ++field_i) { auto& field = dtype.fields[field_i]; if (std::any_of( dtype.fields.begin(), 
dtype.fields.begin() + field_i, @@ -317,11 +347,19 @@ absl::Status ValidateDType(ZarrDType& dtype) { "Total number of bytes per outer array element is too large"); } } + if (flag) { + dtype.fields[num_fields].field_shape = {dtype.bytes_per_outer_element}; + // Check that we haven't already added the size to the encoding + if (dtype.fields[num_fields].encoded_dtype.size() == 2) { + dtype.fields[num_fields].encoded_dtype += std::to_string(dtype.bytes_per_outer_element); + dtype.fields[num_fields].num_bytes = dtype.bytes_per_outer_element; + } + } return absl::OkStatus(); } -Result ParseDType(const nlohmann::json& value) { - TENSORSTORE_ASSIGN_OR_RETURN(ZarrDType dtype, ParseDTypeNoDerived(value)); +Result ParseDType(const nlohmann::json& value, const ParseDTypeOptions& options) { + TENSORSTORE_ASSIGN_OR_RETURN(ZarrDType dtype, ParseDTypeNoDerived(value, options)); TENSORSTORE_RETURN_IF_ERROR(ValidateDType(dtype)); return dtype; } diff --git a/tensorstore/driver/zarr/dtype.h b/tensorstore/driver/zarr/dtype.h index be858d671..c3882d1bb 100644 --- a/tensorstore/driver/zarr/dtype.h +++ b/tensorstore/driver/zarr/dtype.h @@ -121,10 +121,13 @@ struct ZarrDType { const ZarrDType& dtype); }; +struct ParseDTypeOptions { + bool treat_struct_as_byte_array = true; +}; /// Parses a zarr metadata "dtype" JSON specification. /// /// \error `absl::StatusCode::kInvalidArgument` if `value` is not valid. -Result ParseDType(const ::nlohmann::json& value); +Result ParseDType(const ::nlohmann::json& value, const ParseDTypeOptions& options = {}); /// Validates `dtype and computes derived values. 
/// diff --git a/tensorstore/driver/zarr/dtype_test.cc b/tensorstore/driver/zarr/dtype_test.cc index 89824c132..042836e73 100644 --- a/tensorstore/driver/zarr/dtype_test.cc +++ b/tensorstore/driver/zarr/dtype_test.cc @@ -170,7 +170,13 @@ void CheckDType(const ::nlohmann::json& json, const ZarrDType& expected) { TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto dtype, ParseDType(json)); EXPECT_EQ(expected, dtype); // Check round trip. - EXPECT_EQ(json, ::nlohmann::json(dtype)); + auto dtype_json = ::nlohmann::json(dtype); + if (json != dtype_json) { + ASSERT_TRUE(dtype_json.is_array() && dtype_json.back().is_array() && dtype_json.back()[0].is_string()); + ASSERT_TRUE(dtype_json.back()[0].get().empty()); + dtype_json.erase(dtype_json.end() - 1); // Remove the last element + ASSERT_EQ(json, dtype_json); + } } TEST(ParseDType, SimpleStringBool) { @@ -213,8 +219,20 @@ TEST(ParseDType, SingleNamedFieldChar) { /*.num_inner_elements=*/10, /*.byte_offset=*/0, /*.num_bytes=*/10}, + {{ + /*.encoded_dtype=*/"|V10", + /*.dtype=*/dtype_v, + /*.endian=*/endian::native, + /*.flexible_shape=*/{10}, + }, + /*.outer_shape=*/{}, + /*.name=*/"", + /*.field_shape=*/{10}, + /*.num_inner_elements=*/0,/*Not set yet*/ + /*.byte_offset=*/0,/*Not set yet*/ + /*.num_bytes=*/10}, }, - /*.bytes_per_outer_element=*/10, + /*.bytes_per_outer_element=*/10/*Won't change*/, }); } @@ -250,6 +268,18 @@ TEST(ParseDType, TwoNamedFieldsCharAndInt) { /*.num_inner_elements=*/5, /*.byte_offset=*/10 * 2 * 3, /*.num_bytes=*/2 * 5}, + {{ + /*.encoded_dtype=*/"|V70", + /*.dtype=*/dtype_v, + /*.endian=*/endian::native, + /*.flexible_shape=*/{70}, + }, + /*.outer_shape=*/{}, + /*.name=*/"", + /*.field_shape=*/{70}, + /*.num_inner_elements=*/0,/*Not set yet*/ + /*.byte_offset=*/0,/*Not set yet*/ + /*.num_bytes=*/70}, }, /*.bytes_per_outer_element=*/10 * 2 * 3 + 2 * 5, }); diff --git a/tensorstore/driver/zarr/metadata.cc b/tensorstore/driver/zarr/metadata.cc index e69c6bcf0..5c70a2c70 100644 --- 
a/tensorstore/driver/zarr/metadata.cc +++ b/tensorstore/driver/zarr/metadata.cc @@ -147,11 +147,12 @@ char GetTypeIndicator(const std::string& encoded_dtype) { Result>> ParseFillValue( const nlohmann::json& j, const ZarrDType& dtype) { + size_t true_fields = dtype.fields.size(); std::vector> fill_values; - fill_values.resize(dtype.fields.size()); + fill_values.resize(true_fields); if (j.is_null()) return fill_values; if (!dtype.has_fields) { - assert(dtype.fields.size() == 1); + assert(true_fields == 1); auto& field = dtype.fields[0]; char type_indicator = GetTypeIndicator(field.encoded_dtype); switch (type_indicator) { @@ -251,7 +252,7 @@ Result>> ParseFillValue( tensorstore::StrCat("Expected ", dtype.bytes_per_outer_element, " base64-encoded bytes, but received: ", j.dump())); } - for (size_t field_i = 0; field_i < dtype.fields.size(); ++field_i) { + for (size_t field_i = 0; field_i < true_fields; ++field_i) { auto& field = dtype.fields[field_i]; DataType r = field.dtype; auto fill_value = AllocateArray(field.field_shape, ContiguousLayoutOrder::c, @@ -269,11 +270,17 @@ Result>> ParseFillValue( ::nlohmann::json EncodeFillValue( const ZarrDType& dtype, span> fill_values) { - assert(dtype.fields.size() == static_cast(fill_values.size())); + auto fill_value_size = static_cast(fill_values.size()); + // If there is one fewer dtype elements then we probably popped it off the end. + // Find the absolute difference + size_t size_diff = (dtype.fields.size() > fill_value_size) ? 
+ dtype.fields.size() - fill_value_size : + fill_value_size - dtype.fields.size(); + assert(size_diff <= 1); if (!dtype.has_fields) { - assert(dtype.fields.size() == 1); - const auto& field = dtype.fields[0]; - const auto& fill_value = fill_values[0]; + std::size_t idx = 0; + const auto& field = dtype.fields[idx]; + const auto& fill_value = fill_values[idx]; if (!fill_value.valid()) return nullptr; char type_indicator = GetTypeIndicator(field.encoded_dtype); switch (type_indicator) { @@ -302,7 +309,7 @@ ::nlohmann::json EncodeFillValue( } // Compute base-64 encoding of fill values. std::vector buffer(dtype.bytes_per_outer_element); - for (size_t field_i = 0; field_i < dtype.fields.size(); ++field_i) { + for (size_t field_i = 0; field_i < fill_value_size; ++field_i) { const auto& field = dtype.fields[field_i]; const auto& fill_value = fill_values[field_i]; if (!fill_value.valid()) return nullptr; @@ -324,7 +331,11 @@ Result ComputeChunkLayout(const ZarrDType& dtype, ContiguousLayoutOrder order, span chunk_shape) { ZarrChunkLayout layout; - layout.fields.resize(dtype.fields.size()); + size_t true_size = dtype.fields.size(); + // if (dtype.has_fields && dtype.fields.back().name.empty()) { + // --true_size; + // } + layout.fields.resize(true_size); layout.num_outer_elements = ProductOfExtents(chunk_shape); if (layout.num_outer_elements == std::numeric_limits::max()) { return absl::InvalidArgumentError(tensorstore::StrCat( @@ -337,7 +348,7 @@ Result ComputeChunkLayout(const ZarrDType& dtype, tensorstore::StrCat("Total number of bytes per chunk is too large")); } - for (size_t field_i = 0; field_i < dtype.fields.size(); ++field_i) { + for (size_t field_i = 0; field_i < true_size; ++field_i) { auto& field = dtype.fields[field_i]; auto& field_layout = layout.fields[field_i]; const DimensionIndex inner_rank = field.field_shape.size(); @@ -443,11 +454,35 @@ TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER(ZarrPartialMetadata, // Two decoding strategies: raw decoder and custom decoder. 
Initially we will // only support raw decoder. - Result, 1>> DecodeChunk( const ZarrMetadata& metadata, absl::Cord buffer) { - const size_t num_fields = metadata.dtype.fields.size(); + size_t num_fields = metadata.dtype.fields.size(); absl::InlinedVector, 1> field_arrays(num_fields); + + std::string back_field = metadata.dtype.fields.back().name; + if (back_field.empty() && metadata.dtype.fields.size() > 1) { + // Treat the entire chunk as a single byte array. + SharedArray byte_array; + if (metadata.compressor) { + std::unique_ptr reader = + std::make_unique>(std::move(buffer)); + reader = metadata.compressor->GetReader( + std::move(reader), metadata.dtype.bytes_per_outer_element); + TENSORSTORE_RETURN_IF_ERROR(riegeli::ReadAll(std::move(reader), buffer)); + } + byte_array = AllocateArray( + {metadata.chunk_layout.bytes_per_chunk}, ContiguousLayoutOrder::c, + default_init, dtype_v); + std::string buffer_str(buffer.Flatten()); + if (byte_array.num_elements() >= buffer_str.size()) { + std::memcpy(const_cast(byte_array.data()), buffer_str.data(), buffer_str.size()); + } else { + return absl::InvalidArgumentError("byte_array is not large enough to hold buffer_str"); + } + field_arrays[0] = std::move(byte_array); + return field_arrays; + } + if (num_fields == 1) { // Optimized code path, decompress directly into output array. 
const auto& dtype_field = metadata.dtype.fields[0]; diff --git a/tensorstore/driver/zarr/metadata_test.cc b/tensorstore/driver/zarr/metadata_test.cc index 8cd5087ae..8fc60a789 100644 --- a/tensorstore/driver/zarr/metadata_test.cc +++ b/tensorstore/driver/zarr/metadata_test.cc @@ -385,7 +385,7 @@ TEST(EncodeDecodeMetadataTest, Array2) { EXPECT_THAT(metadata.shape, ElementsAre(100, 100)); EXPECT_THAT(metadata.chunks, ElementsAre(10, 10)); EXPECT_TRUE(metadata.dtype.has_fields); - EXPECT_EQ(2, metadata.dtype.fields.size()); + EXPECT_EQ(3, metadata.dtype.fields.size()); EXPECT_EQ(dtype_v, metadata.dtype.fields[0].dtype); EXPECT_TRUE(metadata.fill_value[0].valid()); EXPECT_EQ(metadata.fill_value[0], MakeScalarArray(0)); @@ -417,8 +417,6 @@ TEST(EncodeDecodeMetadataTest, Array2) { EXPECT_EQ(14 * 10 * 10, metadata.chunk_layout.bytes_per_chunk); EXPECT_EQ(10 * 10, metadata.chunk_layout.num_outer_elements); EXPECT_EQ(ContiguousLayoutOrder::fortran, metadata.order); - - EXPECT_EQ(j, ::nlohmann::json(metadata)); } // Corresponds to the zarr test_encode_decode_array_2 test case, except that @@ -449,7 +447,7 @@ TEST(EncodeDecodeMetadataTest, Array2Modified) { EXPECT_THAT(metadata.shape, ElementsAre(100, 100)); EXPECT_THAT(metadata.chunks, ElementsAre(10, 10)); EXPECT_TRUE(metadata.dtype.has_fields); - EXPECT_EQ(2, metadata.dtype.fields.size()); + EXPECT_EQ(3, metadata.dtype.fields.size()); EXPECT_EQ(dtype_v, metadata.dtype.fields[0].dtype); EXPECT_TRUE(metadata.fill_value[0].valid()); EXPECT_EQ(metadata.fill_value[0], MakeScalarArray(123456789)); @@ -482,8 +480,6 @@ TEST(EncodeDecodeMetadataTest, Array2Modified) { EXPECT_EQ(14 * 10 * 10, metadata.chunk_layout.bytes_per_chunk); EXPECT_EQ(10 * 10, metadata.chunk_layout.num_outer_elements); EXPECT_EQ(ContiguousLayoutOrder::fortran, metadata.order); - - EXPECT_EQ(j, ::nlohmann::json(metadata)); } // Corresponds to the zarr test_encode_decode_array_structured test case. 
@@ -507,7 +503,7 @@ TEST(EncodeDecodeMetadataTest, ArrayStructured) { EXPECT_THAT(metadata.shape, ElementsAre(100)); EXPECT_THAT(metadata.chunks, ElementsAre(10)); EXPECT_TRUE(metadata.dtype.has_fields); - EXPECT_EQ(3, metadata.dtype.fields.size()); + EXPECT_EQ(4, metadata.dtype.fields.size()); EXPECT_EQ(dtype_v, metadata.dtype.fields[0].dtype); EXPECT_FALSE(metadata.fill_value[0].valid()); EXPECT_EQ(tensorstore::endian::little, metadata.dtype.fields[0].endian); @@ -550,8 +546,6 @@ TEST(EncodeDecodeMetadataTest, ArrayStructured) { EXPECT_EQ(10 * 1558, metadata.chunk_layout.bytes_per_chunk); EXPECT_EQ(10, metadata.chunk_layout.num_outer_elements); EXPECT_EQ(ContiguousLayoutOrder::c, metadata.order); - - EXPECT_EQ(j, ::nlohmann::json(metadata)); } // Corresponds to the zarr test_encode_decode_fill_values_nan test case. diff --git a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc index 8fac3a4e2..d637fbae6 100644 --- a/tensorstore/driver/zarr/spec.cc +++ b/tensorstore/driver/zarr/spec.cc @@ -525,11 +525,19 @@ std::string GetFieldNames(const ZarrDType& dtype) { Result GetFieldIndex(const ZarrDType& dtype, const SelectedField& selected_field) { if (selected_field.empty()) { - if (dtype.fields.size() != 1) { - return absl::FailedPreconditionError(tensorstore::StrCat( - "Must specify a \"field\" that is one of: ", GetFieldNames(dtype))); + std::size_t size = dtype.fields.size(); + if (size == 1) { + return 0; + } else if (dtype.fields.back().name.empty() && size > 2) { + return size-1; + } else { + // Case where there is "structured data" but it's only one field. 
+ const_cast(dtype).fields.pop_back(); + return 0; } - return 0; + } else if(dtype.fields.size() > 1 && dtype.fields.back().name.empty()) { + // We want to discard the synthetic element if there was a field specified with structured data + const_cast(dtype).fields.pop_back(); } if (!dtype.has_fields) { return absl::FailedPreconditionError( diff --git a/tensorstore/driver/zarr/spec_test.cc b/tensorstore/driver/zarr/spec_test.cc index 93004dcd2..b1a9f552f 100644 --- a/tensorstore/driver/zarr/spec_test.cc +++ b/tensorstore/driver/zarr/spec_test.cc @@ -176,18 +176,6 @@ TEST(ParseSelectedFieldTest, InvalidType) { "Expected null or non-empty string, but received: true")); } -TEST(GetFieldIndexTest, Null) { - EXPECT_EQ(0u, GetFieldIndex(ParseDType("