Skip to content

Commit 5d0149d

Browse files
EnricoMi and pitrou authored
GH-41246: [Docs][C++][Python] Improve docs on column encryption for nested fields (#45411)
### Rationale for this change

Encrypting a column that has nested fields with a column key is not trivial, because only leaf fields are allowed in the column key map. The documentation now emphasizes this fact and provides examples.

### What changes are included in this PR?

This amends the documentation on encryption for C++ and Python.

### Are these changes tested?

Only documentation.

### Are there any user-facing changes?

Only documentation.

* GitHub Issue: #41246

Lead-authored-by: Enrico Minack <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: AlenkaF <[email protected]>
1 parent c753740 commit 5d0149d

File tree

9 files changed

+357
-6
lines changed

9 files changed

+357
-6
lines changed

cpp/examples/arrow/CMakeLists.txt

+9
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,15 @@ if(ARROW_PARQUET AND ARROW_DATASET)
166166
${DATASET_EXAMPLES_LINK_LIBS})
167167
add_dependencies(execution-plan-documentation-examples parquet)
168168

169+
# Register the parquet_column_encryption example only when PARQUET_REQUIRE_ENCRYPTION
# is set. The example compiles the test-only in-memory KMS source
# (test_in_memory_kms.cc) directly into its own target.
if(PARQUET_REQUIRE_ENCRYPTION)
170+
add_arrow_example(parquet_column_encryption
171+
EXTRA_SOURCES
172+
${PROJECT_SOURCE_DIR}/src/parquet/encryption/test_in_memory_kms.cc
173+
EXTRA_LINK_LIBS
174+
${DATASET_EXAMPLES_LINK_LIBS})
175+
add_dependencies(parquet-column-encryption parquet)
176+
endif()
177+
169178
if(ARROW_CSV)
170179
add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS})
171180
add_dependencies(join-example parquet)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "arrow/api.h"
19+
#include "arrow/dataset/file_parquet.h"
20+
#include "arrow/dataset/parquet_encryption_config.h"
21+
#include "arrow/filesystem/localfs.h"
22+
#include "parquet/encryption/crypto_factory.h"
23+
#include "parquet/encryption/test_in_memory_kms.h"
24+
25+
#include <iostream>
26+
27+
namespace fs = arrow::fs;
28+
29+
namespace ds = arrow::dataset;
30+
31+
// Builds the five-row demo table used by the encryption example.
//
// Columns:
//   i: int32
//   s: struct<a: int32, b: int64>
//   m: map<int32, utf8>
//   l: list<int32>
//
// Returns the table, or the first error reported by one of the array builders.
arrow::Result<std::shared_ptr<arrow::Table>> GetTable() {
  auto memory_pool = arrow::default_memory_pool();

  // Column "i": flat int32 values.
  arrow::Int32Builder i_builder;
  std::shared_ptr<arrow::Array> i_column;
  ARROW_RETURN_NOT_OK(i_builder.AppendValues({1, 3, 5, 7, 1}));
  ARROW_RETURN_NOT_OK(i_builder.Finish(&i_column));

  // Column "s": struct rows {a, b} = {2, 20}, {4, 40}, {6, 60}, {8, 80}, {10, 100}.
  auto s_type = arrow::struct_({{"a", arrow::int32()}, {"b", arrow::int64()}});
  auto s_a_builder = std::make_shared<arrow::Int32Builder>();
  auto s_b_builder = std::make_shared<arrow::Int64Builder>();
  arrow::StructBuilder s_builder(s_type, memory_pool, {s_a_builder, s_b_builder});

  std::shared_ptr<arrow::Array> s_column;
  for (const int a_value : {2, 4, 6, 8, 10}) {
    // Open a new struct slot, then fill both child fields; "b" is always 10 * "a".
    ARROW_RETURN_NOT_OK(s_builder.Append());
    ARROW_RETURN_NOT_OK(s_a_builder->Append(a_value));
    ARROW_RETURN_NOT_OK(s_b_builder->Append(a_value * 10));
  }
  ARROW_RETURN_NOT_OK(s_builder.Finish(&s_column));

  // Column "m": map rows {2: "2", 4: "4"}, {6: "6"}, {}, {8: "8", 10: "10"}, {}.
  auto m_type = arrow::map(arrow::int32(), arrow::utf8());
  auto m_key_builder = std::make_shared<arrow::Int32Builder>();
  auto m_item_builder = std::make_shared<arrow::StringBuilder>();
  arrow::MapBuilder m_builder(memory_pool, m_key_builder, m_item_builder, m_type);

  std::shared_ptr<arrow::Array> m_column;
  // Row 0
  ARROW_RETURN_NOT_OK(m_builder.Append());
  ARROW_RETURN_NOT_OK(m_key_builder->AppendValues({2, 4}));
  ARROW_RETURN_NOT_OK(m_item_builder->AppendValues({"2", "4"}));
  // Row 1
  ARROW_RETURN_NOT_OK(m_builder.Append());
  ARROW_RETURN_NOT_OK(m_key_builder->AppendValues({6}));
  ARROW_RETURN_NOT_OK(m_item_builder->AppendValues({"6"}));
  // Row 2 (empty map)
  ARROW_RETURN_NOT_OK(m_builder.Append());
  // Row 3
  ARROW_RETURN_NOT_OK(m_builder.Append());
  ARROW_RETURN_NOT_OK(m_key_builder->AppendValues({8, 10}));
  ARROW_RETURN_NOT_OK(m_item_builder->AppendValues({"8", "10"}));
  // Row 4 (empty map)
  ARROW_RETURN_NOT_OK(m_builder.Append());
  ARROW_RETURN_NOT_OK(m_builder.Finish(&m_column));

  // Column "l": list rows [1, 2, 3], [4, 5, 6], [7], [8], [].
  auto l_type = arrow::list(arrow::int32());
  auto l_value_builder = std::make_shared<arrow::Int32Builder>();
  arrow::ListBuilder l_builder(memory_pool, l_value_builder, l_type);

  std::shared_ptr<arrow::Array> l_column;
  // Row 0
  ARROW_RETURN_NOT_OK(l_builder.Append());
  ARROW_RETURN_NOT_OK(l_value_builder->AppendValues({1, 2, 3}));
  // Row 1
  ARROW_RETURN_NOT_OK(l_builder.Append());
  ARROW_RETURN_NOT_OK(l_value_builder->AppendValues({4, 5, 6}));
  // Row 2
  ARROW_RETURN_NOT_OK(l_builder.Append());
  ARROW_RETURN_NOT_OK(l_value_builder->AppendValues({7}));
  // Row 3
  ARROW_RETURN_NOT_OK(l_builder.Append());
  ARROW_RETURN_NOT_OK(l_value_builder->AppendValues({8}));
  // Row 4 (empty list)
  ARROW_RETURN_NOT_OK(l_builder.Append());
  ARROW_RETURN_NOT_OK(l_builder.Finish(&l_column));

  // Assemble the columns under their top-level field names.
  auto table_schema = arrow::schema({
      arrow::field("i", arrow::int32()),
      arrow::field("s", s_type),
      arrow::field("m", m_type),
      arrow::field("l", l_type),
  });

  return arrow::Table::Make(table_schema, {i_column, s_column, m_column, l_column});
}
106+
107+
std::shared_ptr<parquet::encryption::CryptoFactory> GetCryptoFactory() {
108+
// Configure KMS.
109+
std::unordered_map<std::string, std::string> key_map;
110+
key_map.emplace("footerKeyId", "0123456789012345");
111+
key_map.emplace("columnKeyId", "1234567890123456");
112+
113+
auto crypto_factory = std::make_shared<parquet::encryption::CryptoFactory>();
114+
auto kms_client_factory =
115+
// for testing only, do not use it as an example of KmsClientFactory implementation
116+
std::make_shared<parquet::encryption::TestOnlyInMemoryKmsClientFactory>(
117+
/*wrap_locally=*/true, key_map);
118+
crypto_factory->RegisterKmsClientFactory(std::move(kms_client_factory));
119+
return crypto_factory;
120+
}
121+
122+
// Writes the demo table from GetTable() as an encrypted Parquet dataset.
//
// "footerKeyId" is passed to EncryptionConfiguration as the footer key, and every
// leaf column of the table is listed under "columnKeyId". Nested columns are
// referenced through their leaf fields (s.a, m.key_value.key, l.list.element, ...),
// not through the outer column name.
//
// path_to_file: base directory handed to the dataset writer.
// Returns the status of FileSystemDataset::Write, or the first error hit before it.
arrow::Status WriteEncryptedFile(const std::string& path_to_file) {
  using arrow::internal::checked_pointer_cast;

  // Get a configured crypto factory and kms connection conf.
  auto crypto_factory = GetCryptoFactory();
  auto kms_connection_config =
      std::make_shared<parquet::encryption::KmsConnectionConfig>();

  // NOTE: docs/source/cpp/parquet.rst pulls the statements below in via
  // literalinclude (anchored on the marker comment, with emphasized line numbers).
  // Keep the marker comment and the line layout of this paragraph unchanged.
  // Set write options with encryption configuration.
  auto encryption_config = std::make_shared<parquet::encryption::EncryptionConfiguration>(
      std::string("footerKeyId"));
  encryption_config->column_keys =
      "columnKeyId: i, s.a, s.b, m.key_value.key, m.key_value.value, l.list.element";

  auto parquet_encryption_config = std::make_shared<ds::ParquetEncryptionConfig>();
  // Directly assign shared_ptr objects to ParquetEncryptionConfig members.
  parquet_encryption_config->crypto_factory = crypto_factory;
  parquet_encryption_config->kms_connection_config = kms_connection_config;
  parquet_encryption_config->encryption_config = std::move(encryption_config);

  auto file_format = std::make_shared<ds::ParquetFileFormat>();
  auto parquet_file_write_options = checked_pointer_cast<ds::ParquetFileWriteOptions>(
      file_format->DefaultWriteOptions());
  parquet_file_write_options->parquet_encryption_config =
      std::move(parquet_encryption_config);

  // Write dataset.
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table, GetTable());
  // Print through <iostream>, which this file already includes and uses elsewhere,
  // instead of printf (no <cstdio> include in this file).
  std::cout << table->ToString();
  auto dataset = std::make_shared<ds::InMemoryDataset>(table);
  ARROW_ASSIGN_OR_RAISE(auto scanner_builder, dataset->NewScan());
  ARROW_ASSIGN_OR_RAISE(auto scanner, scanner_builder->Finish());

  auto file_system = std::make_shared<fs::LocalFileSystem>();
  auto partitioning = std::make_shared<ds::HivePartitioning>(
      arrow::schema({arrow::field("part", arrow::utf8())}));

  ds::FileSystemDatasetWriteOptions write_options;
  write_options.file_write_options = parquet_file_write_options;
  write_options.filesystem = file_system;
  write_options.base_dir = path_to_file;
  write_options.partitioning = partitioning;
  write_options.basename_template = "part{i}.parquet";
  return ds::FileSystemDataset::Write(write_options, std::move(scanner));
}
167+
168+
// Reads back every file found under `path_to_file` as one encrypted Parquet dataset
// and prints the number of rows of the resulting table.
//
// path_to_file: base directory that is scanned recursively.
// Returns OK on success, or the first error raised while discovering or scanning.
arrow::Status ReadEncryptedFile(const std::string& path_to_file) {
  // Decryption settings: crypto factory, KMS connection and decryption properties.
  auto decryption_setup = std::make_shared<ds::ParquetDecryptionConfig>();
  decryption_setup->crypto_factory = GetCryptoFactory();
  decryption_setup->kms_connection_config =
      std::make_shared<parquet::encryption::KmsConnectionConfig>();
  decryption_setup->decryption_config =
      std::make_shared<parquet::encryption::DecryptionConfiguration>();

  // Hand the decryption settings to the Parquet format through its scan options.
  auto scan_options = std::make_shared<ds::ParquetFragmentScanOptions>();
  scan_options->parquet_decryption_config = std::move(decryption_setup);

  auto format = std::make_shared<ds::ParquetFileFormat>();
  format->default_fragment_scan_options = std::move(scan_options);

  // Select all files under the base directory on the local filesystem.
  auto local_fs = std::make_shared<fs::LocalFileSystem>();
  fs::FileSelector all_files;
  all_files.base_dir = path_to_file;
  all_files.recursive = true;

  // Discover the dataset, then scan it into a single table.
  ds::FileSystemFactoryOptions discovery_options;
  ARROW_ASSIGN_OR_RAISE(auto factory,
                        ds::FileSystemDatasetFactory::Make(local_fs, all_files, format,
                                                           discovery_options));
  ARROW_ASSIGN_OR_RAISE(auto encrypted_dataset, factory->Finish());
  ARROW_ASSIGN_OR_RAISE(auto scan_builder, encrypted_dataset->NewScan());
  ARROW_ASSIGN_OR_RAISE(auto table_scanner, scan_builder->Finish());
  ARROW_ASSIGN_OR_RAISE(auto decrypted, table_scanner->ToTable());

  std::cout << "Table size: " << decrypted->num_rows() << "\n";
  return arrow::Status::OK();
}
210+
211+
// Runs the write step followed by the read step against the same base directory.
// The read step is skipped if the write step fails.
arrow::Status RunExamples(const std::string& path_to_file) {
  ARROW_RETURN_NOT_OK(WriteEncryptedFile(path_to_file));
  // The read step's status is the overall result.
  return ReadEncryptedFile(path_to_file);
}
216+
217+
int main(int argc, char** argv) {
218+
if (argc != 2) {
219+
// Fake success for CI purposes.
220+
return EXIT_SUCCESS;
221+
}
222+
223+
std::string path_to_file = argv[1];
224+
arrow::Status status = RunExamples(path_to_file);
225+
226+
if (!status.ok()) {
227+
std::cerr << "Error occurred: " << status.message() << std::endl;
228+
return EXIT_FAILURE;
229+
}
230+
return EXIT_SUCCESS;
231+
}

cpp/examples/arrow/parquet_read_write.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ arrow::Status WriteInBatches(std::string path_to_file) {
165165
return arrow::Status::OK();
166166
}
167167

168-
arrow::Status RunExamples(std::string path_to_file) {
168+
arrow::Status RunExamples(const std::string& path_to_file) {
169169
ARROW_RETURN_NOT_OK(WriteFullFile(path_to_file));
170170
ARROW_RETURN_NOT_OK(ReadFullFile(path_to_file));
171171
ARROW_RETURN_NOT_OK(WriteInBatches(path_to_file));

cpp/src/parquet/encryption/crypto_factory.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ struct PARQUET_EXPORT EncryptionConfiguration {
4343
/// ID of the master key for footer encryption/signing
4444
std::string footer_key;
4545

46-
/// List of columns to encrypt, with master key IDs (see HIVE-21848).
47-
/// Format: "masterKeyID:colName,colName;masterKeyID:colName..."
46+
/// List of columns to encrypt, with column master key IDs (see HIVE-21848).
47+
/// Format: "columnKeyID:colName,colName;columnKeyID:colName..."
4848
/// Either
4949
/// (1) column_keys must be set
5050
/// or

docs/source/cpp/dataset.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -445,8 +445,6 @@ storage systems, such as Amazon S3, by passing a different filesystem.
445445
See the :ref:`filesystem <cpp-filesystems>` docs for more details on the available
446446
filesystems.
447447

448-
.. _cpp-dataset-full-example:
449-
450448
A note on transactions & ACID guarantees
451449
----------------------------------------
452450

@@ -467,6 +465,8 @@ Most file formats have magic numbers which are written at the end. This means a
467465
partial file write can safely be detected and discarded. The CSV file format does
468466
not have any such concept and a partially written CSV file may be detected as valid.
469467

468+
.. _cpp-dataset-full-example:
469+
470470
Full Example
471471
------------
472472

docs/source/cpp/examples/index.rst

+1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Examples
2525
compute_and_write_example
2626
dataset_documentation_example
2727
dataset_skyhook_scan_example
28+
parquet_column_encryption
2829
row_columnar_conversion
2930
std::tuple-like ranges to Arrow <tuple_range_conversion>
3031
Converting RecordBatch to Tensor <converting_recordbatch_to_tensor>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
.. Licensed to the Apache Software Foundation (ASF) under one
2+
.. or more contributor license agreements. See the NOTICE file
3+
.. distributed with this work for additional information
4+
.. regarding copyright ownership. The ASF licenses this file
5+
.. to you under the Apache License, Version 2.0 (the
6+
.. "License"); you may not use this file except in compliance
7+
.. with the License. You may obtain a copy of the License at
8+
9+
.. http://www.apache.org/licenses/LICENSE-2.0
10+
11+
.. Unless required by applicable law or agreed to in writing,
12+
.. software distributed under the License is distributed on an
13+
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
.. KIND, either express or implied. See the License for the
15+
.. specific language governing permissions and limitations
16+
.. under the License.
17+
18+
.. default-domain:: cpp
19+
.. highlight:: cpp
20+
21+
Parquet column encryption
22+
=========================
23+
24+
The following example defines a :class:`arrow::Table` instance, and then writes
25+
it to an encrypted Parquet file. Metadata footer and columns are encrypted with
26+
different encryption keys.
27+
28+
.. literalinclude:: ../../../../cpp/examples/arrow/parquet_column_encryption.cc
29+

docs/source/cpp/parquet.rst

+46
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,52 @@ More specifically, Parquet C++ supports:
585585
* EncryptionWithFooterKey and EncryptionWithColumnKey modes.
586586
* Encrypted Footer and Plaintext Footer modes.
587587

588+
Configuration
589+
~~~~~~~~~~~~~
590+
591+
Parquet encryption uses a ``parquet::encryption::CryptoFactory`` that has access to a
592+
Key Management System (KMS), which stores actual encryption keys, referenced by key ids.
593+
The Parquet encryption configuration only uses key ids, no actual keys.
594+
595+
Parquet metadata encryption is configured via ``parquet::encryption::EncryptionConfiguration``:
596+
597+
.. literalinclude:: ../../../cpp/examples/arrow/parquet_column_encryption.cc
598+
:language: cpp
599+
:start-at: // Set write options with encryption configuration
600+
:end-before: encryption_config->column_keys
601+
:dedent: 2
602+
603+
If ``encryption_config->uniform_encryption`` is set to ``true``, then all columns are
604+
encrypted with the same key as the Parquet metadata. Otherwise, individual
605+
columns are encrypted with individual keys as configured via
606+
``encryption_config->column_keys``. This field expects a string of the format
607+
``"columnKeyID1:colName1,colName2;columnKeyID2:colName3..."``.
608+
609+
.. literalinclude:: ../../../cpp/examples/arrow/parquet_column_encryption.cc
610+
:language: cpp
611+
:start-at: // Set write options with encryption configuration
612+
:end-before: auto parquet_encryption_config
613+
:emphasize-lines: 4-5
614+
:dedent: 2
615+
616+
See the full `Parquet column encryption example <examples/parquet_column_encryption.html>`_.
617+
618+
.. note::
619+
620+
Encrypting columns that have nested fields (struct, map or list data types)
621+
requires column keys for the inner fields, not the outer column itself.
622+
Configuring a column key for the outer column causes
623+
this error (here the column name is ``col``):
624+
625+
.. code-block::
626+
627+
OSError: Encrypted column col not in file schema
628+
629+
Conventionally, the key and value fields of a map column ``m`` have the names
630+
``m.key_value.key`` and ``m.key_value.value``, respectively. The inner field of a
631+
list column ``l`` has the name ``l.list.element``. An inner field ``f`` of a struct column ``s`` has
632+
the name ``s.f``.
633+
588634
Miscellaneous
589635
-------------
590636

0 commit comments

Comments
 (0)