From 9c932bfa2294b984bfbbd031fb191330f4cfe3e1 Mon Sep 17 00:00:00 2001 From: Venki Korukanti Date: Mon, 24 Feb 2025 13:16:25 -0800 Subject: [PATCH] [Kernel] Handle the non-uniform value type in map[string, string] in delta commit files (#4182) ## Description See #3888 for details. Some versions of Delta-Spark wrote tables whose commit info contains arbitrary value types in the map of `string -> string`. This has existed for a while. Update the Kernel default delta commit file reader to always parse the value as a `string` if the map's value type is a `string` type. This is not ideal, but there is no other easy way to handle this. ## How was this patch tested? UT. --- .../_delta_log/.00000000000000000000.crc.crc | Bin 0 -> 16 bytes .../_delta_log/.00000000000000000000.json.crc | Bin 0 -> 20 bytes .../_delta_log/.00000000000000000001.crc.crc | Bin 0 -> 20 bytes .../_delta_log/.00000000000000000001.json.crc | Bin 0 -> 16 bytes .../_delta_log/.00000000000000000002.crc.crc | Bin 0 -> 20 bytes .../_delta_log/.00000000000000000002.json.crc | Bin 0 -> 24 bytes .../_delta_log/00000000000000000000.crc | 1 + .../_delta_log/00000000000000000000.json | 4 ++ .../_delta_log/00000000000000000001.crc | 1 + .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/00000000000000000002.crc | 1 + .../_delta_log/00000000000000000002.json | 5 +++ ...-ad22-6b06d871b547.c000.snappy.parquet.crc | Bin 0 -> 12 bytes ...-818f-22d546976866.c000.snappy.parquet.crc | Bin 0 -> 12 bytes ...44df-ad22-6b06d871b547.c000.snappy.parquet | Bin 0 -> 452 bytes ...484c-818f-22d546976866.c000.snappy.parquet | Bin 0 -> 452 bytes ...-be33-fd0289e53614.c000.snappy.parquet.crc | Bin 0 -> 12 bytes ...-9757-873b7f544510.c000.snappy.parquet.crc | Bin 0 -> 12 bytes ...4e46-be33-fd0289e53614.c000.snappy.parquet | Bin 0 -> 452 bytes ...4879-9757-873b7f544510.c000.snappy.parquet | Bin 0 -> 452 bytes .../scala/io/delta/golden/GoldenTables.scala | 18 ++++++++ .../internal/data/DefaultJsonRow.java | 15 ++++++- 
.../kernel/defaults/TableChangesSuite.scala | 11 +++++ .../engine/DefaultJsonHandlerSuite.scala | 40 ++++++++++++++++++ 24 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000000.crc.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000000.json.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000001.crc.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000001.json.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000002.crc.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000002.json.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.json create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.json create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.crc create mode 100644 
connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.json create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/.part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/.part-00000-c5babbd8-6013-484c-818f-22d546976866.c000.snappy.parquet.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/part-00000-c5babbd8-6013-484c-818f-22d546976866.c000.snappy.parquet create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=2/.part-00000-129a0441-5f41-4e46-be33-fd0289e53614.c000.snappy.parquet.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=2/.part-00000-cc2a9650-0450-4879-9757-873b7f544510.c000.snappy.parquet.crc create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=2/part-00000-129a0441-5f41-4e46-be33-fd0289e53614.c000.snappy.parquet create mode 100644 connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=2/part-00000-cc2a9650-0450-4879-9757-873b7f544510.c000.snappy.parquet diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000000.crc.crc 
b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000000.crc.crc new file mode 100644 index 0000000000000000000000000000000000000000..ebff5f46514821523e76fcf3e5d0c2f4d58a232e GIT binary patch literal 16 XcmYc;N@ieSU}CtMTd;)V1&bsAB-{j^ literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000000.json.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000000.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..b277b514f73ff89d78bda0ee7d6c1adeadba7a8b GIT binary patch literal 20 bcmYc;N@ieSU}9KYKXY5tVWr*;waWwmLskca literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000001.crc.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000001.crc.crc new file mode 100644 index 0000000000000000000000000000000000000000..df4e0288890bd05490e6d894d6c56a17c1563ecf GIT binary patch literal 20 bcmYc;N@ieSU}AXIlk_Eq^_Kmc@*|G`MP&$v literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000001.json.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000001.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..4d2fb531f46812259a9be32c0e60c5e439bbccff GIT binary patch literal 16 XcmYc;N@ieSU}CuJ^3+x)?zj&CB+~@} literal 0 HcmV?d00001 diff --git 
a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000002.crc.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000002.crc.crc new file mode 100644 index 0000000000000000000000000000000000000000..bfc205a6dee5f460b8a30c5f7af77e64a35ffc89 GIT binary patch literal 20 ccmYc;N@ieSU}Crwv+$Zlz1o|3E1!k|07O6tD*ylh literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000002.json.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/.00000000000000000002.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..053841500354e6bb7dc4df1d1243e586ea37a9f2 GIT binary patch literal 24 gcmYc;N@ieSU}C6T-ZTHncey9wi>?(`c;1Nu0B->bng9R* literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.crc new file mode 100644 index 00000000000..e0b1bd686c2 --- /dev/null +++ b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ 
+{"txnId":"bda32d72-442d-4705-9a8c-16093eb31744","tableSizeBytes":452,"numFiles":1,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"da00fe29-8b6e-4f3b-b91f-a3729283bc1a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"month\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["month"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1740185389028},"protocol":{"minReaderVersion":1,"minWriterVersion":7,"writerFeatures":["changeDataFeed","appendOnly","invariants"]},"allFiles":[{"path":"month=1/part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet","partitionValues":{"month":"1"},"size":452,"modificationTime":1740185390672,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":1},\"nullCount\":{\"id\":0}}"}]} diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.json b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.json new file mode 100644 index 00000000000..84507c06f03 --- /dev/null +++ b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1740185390903,"operation":"CREATE TABLE AS SELECT","operationParameters":{"partitionBy":"[\"month\"]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.enableChangeDataFeed\":\"true\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"452"},"engineInfo":"Apache-Spark/3.5.3 
Delta-Lake/3.4.0-SNAPSHOT","txnId":"bda32d72-442d-4705-9a8c-16093eb31744"}} +{"metaData":{"id":"da00fe29-8b6e-4f3b-b91f-a3729283bc1a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"month\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["month"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1740185389028}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":7,"writerFeatures":["changeDataFeed","appendOnly","invariants"]}} +{"add":{"path":"month=1/part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet","partitionValues":{"month":"1"},"size":452,"modificationTime":1740185390672,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":1},\"nullCount\":{\"id\":0}}"}} diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.crc new file mode 100644 index 00000000000..67170724936 --- /dev/null +++ b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ 
+{"txnId":"0d7d28b8-55c2-4d8b-b48e-88b22c90aed1","tableSizeBytes":904,"numFiles":2,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"da00fe29-8b6e-4f3b-b91f-a3729283bc1a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"month\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["month"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1740185389028},"protocol":{"minReaderVersion":1,"minWriterVersion":7,"writerFeatures":["changeDataFeed","appendOnly","invariants"]},"allFiles":[{"path":"month=2/part-00000-cc2a9650-0450-4879-9757-873b7f544510.c000.snappy.parquet","partitionValues":{"month":"2"},"size":452,"modificationTime":1740185395663,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2},\"maxValues\":{\"id\":2},\"nullCount\":{\"id\":0}}"},{"path":"month=1/part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet","partitionValues":{"month":"1"},"size":452,"modificationTime":1740185390672,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":1},\"nullCount\":{\"id\":0}}"}]} diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.json b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.json new file mode 100644 index 00000000000..d956ee99b35 --- /dev/null +++ b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ 
+{"commitInfo":{"timestamp":1740185395669,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"452"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.4.0-SNAPSHOT","txnId":"0d7d28b8-55c2-4d8b-b48e-88b22c90aed1"}} +{"add":{"path":"month=2/part-00000-cc2a9650-0450-4879-9757-873b7f544510.c000.snappy.parquet","partitionValues":{"month":"2"},"size":452,"modificationTime":1740185395663,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2},\"maxValues\":{\"id\":2},\"nullCount\":{\"id\":0}}"}} diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.crc new file mode 100644 index 00000000000..cb7d67984c2 --- /dev/null +++ b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ 
+{"txnId":"79b3e3aa-82dc-4c18-b95e-8b50089b55c7","tableSizeBytes":904,"numFiles":2,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"da00fe29-8b6e-4f3b-b91f-a3729283bc1a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"month\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["month"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1740185389028},"protocol":{"minReaderVersion":1,"minWriterVersion":7,"writerFeatures":["changeDataFeed","appendOnly","invariants"]},"allFiles":[{"path":"month=2/part-00000-129a0441-5f41-4e46-be33-fd0289e53614.c000.snappy.parquet","partitionValues":{"month":"2"},"size":452,"modificationTime":1740185397380,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2},\"maxValues\":{\"id\":2},\"nullCount\":{\"id\":0}}"},{"path":"month=1/part-00000-c5babbd8-6013-484c-818f-22d546976866.c000.snappy.parquet","partitionValues":{"month":"1"},"size":452,"modificationTime":1740185397384,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":1},\"nullCount\":{\"id\":0}}"}]} diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.json b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.json new file mode 100644 index 00000000000..41d3ef238c9 --- /dev/null +++ b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/_delta_log/00000000000000000002.json @@ -0,0 +1,5 @@ 
+{"commitInfo":{"timestamp":1740185397394,"operation":"OPTIMIZE","operationParameters":{"predicate":"[]","zOrderBy":"[\"id\"]","clusterBy":"[]","auto":false},"readVersion":1,"isolationLevel":"SnapshotIsolation","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"2","numRemovedBytes":"904","p25FileSize":"452","numDeletionVectorsRemoved":"0","minFileSize":"452","numAddedFiles":"2","maxFileSize":"452","p75FileSize":"452","p50FileSize":"452","numAddedBytes":"904"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.4.0-SNAPSHOT","txnId":"79b3e3aa-82dc-4c18-b95e-8b50089b55c7"}} +{"add":{"path":"month=1/part-00000-c5babbd8-6013-484c-818f-22d546976866.c000.snappy.parquet","partitionValues":{"month":"1"},"size":452,"modificationTime":1740185397384,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":1},\"nullCount\":{\"id\":0}}"}} +{"remove":{"path":"month=1/part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet","deletionTimestamp":1740185396708,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"month":"1"},"size":452,"stats":"{\"numRecords\":1}"}} +{"add":{"path":"month=2/part-00000-129a0441-5f41-4e46-be33-fd0289e53614.c000.snappy.parquet","partitionValues":{"month":"2"},"size":452,"modificationTime":1740185397380,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2},\"maxValues\":{\"id\":2},\"nullCount\":{\"id\":0}}"}} +{"remove":{"path":"month=2/part-00000-cc2a9650-0450-4879-9757-873b7f544510.c000.snappy.parquet","deletionTimestamp":1740185396708,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"month":"2"},"size":452,"stats":"{\"numRecords\":1}"}} diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/.part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet.crc 
b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/.part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..e72389a1212d923337f72a1a112f3879516ede07 GIT binary patch literal 12 TcmYc;N@ieSU}8{Xt5^#F561$G literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/.part-00000-c5babbd8-6013-484c-818f-22d546976866.c000.snappy.parquet.crc b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/.part-00000-c5babbd8-6013-484c-818f-22d546976866.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..e72389a1212d923337f72a1a112f3879516ede07 GIT binary patch literal 12 TcmYc;N@ieSU}8{Xt5^#F561$G literal 0 HcmV?d00001 diff --git a/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet b/connectors/golden-tables/src/main/resources/golden/commit-info-containing-arbitrary-operationParams-types/month=1/part-00000-22d25ea7-a383-44df-ad22-6b06d871b547.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f5bd9a0ce913ddcf370ccdcc3ded1e9fadd5c019 GIT binary patch literal 452 zcmZWm%SyvQ6rGM*g1Qkp!vwMjgO(QJkdVecaHESNE~K~<5t(Gh8cfnQ4@GL%{*8a( zZ;7`}UAeiBGxwfzm^-<+^#~#@!pQgM*Uy1PFtkl#Ahdss5kfnS21N4+n{0Pe)rnw} z0&G!(jG;i#J6DIrT;?y2Mf9vwNicvXnBiC#gExn%i58wkJGacy=eO;zZdh?0Ey#gExn%i58wkJGacy=eO;zZdh?0EyEaUsQ>h)AZ1b#SJSc_>o5@+nITBDqPeSiM!JCq=-QVZbVm;k^|WmWAXV)NZ~w>c=W z8PX23=l~l)+`2l$+({Cewnnr7}cqU%dPkiOf9$~e)kj@o`V@V%Z6 lqfXE^qp%lvy4&$aS{eV$H*K%0b)=fMEaUsQ>h)AZ1b#SJSc_>o5@+nITBDqPeSiM!JCq=-QVZbVm;k^|WmWAXV)NZ~w>c=W z8PX23=l~l)+`2l$+({Cewnnr7}cqU%dPkiOf9$~e)kj@o`V@V%Z6 lqfXE^qp%lvy4&$aS{eV$H*K%0b)=fM + 
spark.sql( + f""" + |CREATE TABLE delta.`$tablePath` + |USING DELTA + |PARTITIONED BY (month) + |TBLPROPERTIES (delta.enableChangeDataFeed = true) + |AS + |SELECT 1 AS id, 1 AS month""".stripMargin) + + // Add some data + spark.sql("INSERT INTO delta.`%s` VALUES (2, 2)".format(tablePath)) + + // Run optimize that generates a commitInfo with arbitrary value types in + // operationParameters + spark.sql("OPTIMIZE delta.`%s` ZORDER BY id".format(tablePath)) + } } case class TestStruct(f1: String, f2: Long) diff --git a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/data/DefaultJsonRow.java b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/data/DefaultJsonRow.java index c30e014990a..ca3457d46bd 100644 --- a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/data/DefaultJsonRow.java +++ b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/data/DefaultJsonRow.java @@ -308,10 +308,23 @@ public ColumnVector getElements() { List values = new ArrayList<>(jsonValue.size()); final Iterator> iter = jsonValue.fields(); + boolean isValueOfStringType = mapType.getValueType() instanceof StringType; while (iter.hasNext()) { Map.Entry entry = iter.next(); String keyParsed = entry.getKey(); - Object valueParsed = decodeElement(entry.getValue(), mapType.getValueType()); + + Object valueParsed = null; + if (isValueOfStringType) { + // Special handling for value which is of type string. Delta tables generated by + // Delta-Spark ended up serializing values as their original type and not + // as string in the Delta commit files. + // Ex. 
{"key": true} instead of {"key": "true"} + if (!entry.getValue().isNull()) { + valueParsed = entry.getValue().asText(); + } + } else { + valueParsed = decodeElement(entry.getValue(), mapType.getValueType()); + } if (valueParsed == null && !mapType.isValueContainsNull()) { throw new RuntimeException( "Map type expects no nulls in values, but " + "received `null` as value"); diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala index b95b6c35723..618b4139fa0 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala @@ -362,6 +362,17 @@ class TableChangesSuite extends AnyFunSuite with TestUtils { }.getMessage.contains("Unsupported Delta protocol reader version")) } + withGoldenTable("commit-info-containing-arbitrary-operationParams-types") { tablePath => + test("getChanges - commit info with arbitrary operationParams types") { + // Check all actions are correctly retrieved + testGetChangesVsSpark( + tablePath, + 0, + 2, + FULL_ACTION_SET) + } + } + ////////////////////////////////////////////////////////////////////////////////// // Helpers to compare actions returned between Kernel and Spark ////////////////////////////////////////////////////////////////////////////////// diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/engine/DefaultJsonHandlerSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/engine/DefaultJsonHandlerSuite.scala index eedd154db3e..4cacff4c1f0 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/engine/DefaultJsonHandlerSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/engine/DefaultJsonHandlerSuite.scala @@ -23,6 +23,7 @@ import scala.collection.JavaConverters._ import 
io.delta.kernel.data.ColumnVector import io.delta.kernel.defaults.utils.{DefaultVectorTestUtils, TestRow, TestUtils} +import io.delta.kernel.internal.actions.CommitInfo import io.delta.kernel.internal.util.InternalUtils.singletonStringColumnVector import io.delta.kernel.types._ @@ -436,4 +437,43 @@ class DefaultJsonHandlerSuite extends AnyFunSuite with TestUtils with DefaultVec writeAndVerify(overwrite = true) } } + + test("parse diverse type values in a map[string, string]") { + val input = + """ + |{ + | "inCommitTimestamp":1740009523401, + | "timestamp":1740009523401, + | "engineInfo":"myengine.com", + | "operation":"WRITE", + | "operationParameters": + | {"mode":"Append","statsOnLoad":false,"partitionBy":"[]"}, + | "isBlindAppend":true, + | "txnId":"cb009f42-5da1-4e7e-b4fa-09de3332f52a", + | "operationMetrics": { + | "numFiles":"1", + | "serializedAsNumber":2, + | "serializedAsBoolean":true + | } + |} + |""".stripMargin + + val output = jsonHandler.parseJson( + stringVector(Seq(input)), + CommitInfo.FULL_SCHEMA, + Optional.empty()) + assert(output.getSize == 1) + val actResult = TestRow(output.getRows.next) + val expResult = TestRow( + 1740009523401L, + 1740009523401L, + "myengine.com", + "WRITE", + Map("mode" -> "Append", "statsOnLoad" -> "false", "partitionBy" -> "[]"), + true, + "cb009f42-5da1-4e7e-b4fa-09de3332f52a", + Map("numFiles" -> "1", "serializedAsNumber" -> "2", "serializedAsBoolean" -> "true")) + + checkAnswer(Seq(actResult), Seq(expResult)) + } }