apache · RussellSpitzer · Oct 7, 2024 · Sep 18, 2024 · Sep 19, 2024 · Sep 22, 2024
diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java
@@ -321,9 +321,9 @@ private CharSeqComparator() {}
      * represented using two Java characters (using UTF-16 surrogate pairs). Character by character
      * comparison may yield incorrect results while comparing a 4 byte UTF-8 character to a java
      * char. Character by character comparison works as expected if both characters are <= 3 byte
-     * UTF-8 character or both characters are 4 byte UTF-8 characters.
-     * isCharInUTF16HighSurrogateRange method detects a 4-byte character and considers that
-     * character to be lexicographically greater than any 3 byte or lower UTF-8 character.
+     * UTF-8 character or both characters are 4 byte UTF-8 characters. The isCharHighSurrogate
+     * method detects a high surrogate (the leading half of a 4-byte character) and considers that
+     * character to be lexicographically greater than any 3 byte or lower UTF-8 character.
      */
     @Override
     public int compare(CharSequence s1, CharSequence s2) {

diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
@@ -82,9 +82,9 @@ public static Literal<CharSequence> truncateStringMax(Literal<CharSequence> inpu
     for (int i = length - 1; i >= 0; i--) {
       // Get the offset in the truncated string buffer where the number of unicode characters = i
       int offsetByCodePoint = truncatedStringBuilder.offsetByCodePoints(0, i);
-      int nextCodePoint = truncatedStringBuilder.codePointAt(offsetByCodePoint) + 1;
+      int nextCodePoint = incrementCodePoint(truncatedStringBuilder.codePointAt(offsetByCodePoint));
       // No overflow
-      if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) {
+      if (nextCodePoint != 0) {
         truncatedStringBuilder.setLength(offsetByCodePoint);
         // Append next code point to the truncated substring
         truncatedStringBuilder.appendCodePoint(nextCodePoint);
@@ -93,4 +93,24 @@ public static Literal<CharSequence> truncateStringMax(Literal<CharSequence> inpu
     }
     return null; // Cannot find a valid upper bound
   }
+
+  private static int incrementCodePoint(int codePoint) {
+    // Returns the Unicode scalar value after codePoint, or 0 on overflow. Surrogates are rejected:
+    // they are not scalar values, and UTF-8 bytes mapping to U+D800..U+DFFF are ill-formed.
+    // see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27288
+    Preconditions.checkArgument(
+        codePoint < Character.MIN_SURROGATE || codePoint > Character.MAX_SURROGATE,
+        "invalid code point: %s",
+        codePoint);
+
+    if (codePoint == Character.MIN_SURROGATE - 1) {
+      // U+D7FF: skip the surrogate block and jump to the next Unicode scalar value (U+E000)
+      return Character.MAX_SURROGATE + 1;
+    } else if (codePoint == Character.MAX_CODE_POINT) {
+      // overflow: nothing follows U+10FFFF; the caller treats 0 as "no successor"
+      return 0;
+    } else {
+      return codePoint + 1;
+    }
+  }
 }
diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
@@ -202,11 +202,20 @@ public void testTruncateStringMax() {
     String test5 = "\uDBFF\uDFFF\uDBFF\uDFFF";
     String test6 = "\uD800\uDFFF\uD800\uDFFF";
     // Increment the previous character
-    String test6_2_expected = "\uD801\uDC00";
+    String test6_1_expected = "\uD801\uDC00";
     String test7 = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02";
     String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03";
     String test7_1_expected = "\uD83D\uDE03";
 
+    // Incrementing the max UTF-8 character will overflow
+    String test8 = "a\uDBFF\uDFFFc";
+    String test8_2_expected = "b";
+
+    // Incrementing skips code points that are not Unicode scalar values, i.e. the range
+    // [Character.MIN_SURROGATE, Character.MAX_SURROGATE]
+    String test9 = "a" + (char) (Character.MIN_SURROGATE - 1) + "b";
+    String test9_2_expected = "a" + (char) (Character.MAX_SURROGATE + 1);
+
     Comparator<CharSequence> cmp = Literal.of(test1).comparator();
     assertThat(cmp.compare(truncateStringMax(Literal.of(test1), 4).value(), test1))
         .as("Truncated upper bound should be greater than or equal to the actual upper bound")
@@ -254,10 +263,10 @@ public void testTruncateStringMax() {
     assertThat(truncateStringMax(Literal.of(test5), 1))
         .as("An upper bound doesn't exist since the first two characters are max UTF-8 characters")
         .isNull();
-    assertThat(cmp.compare(truncateStringMax(Literal.of(test6), 2).value(), test6))
+    assertThat(cmp.compare(truncateStringMax(Literal.of(test6), 1).value(), test6))
         .as("Truncated upper bound should be greater than or equal to the actual upper bound")
         .isGreaterThanOrEqualTo(0);
-    assertThat(cmp.compare(truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected))
+    assertThat(cmp.compare(truncateStringMax(Literal.of(test6), 1).value(), test6_1_expected))
         .as(
             "Test 4 byte UTF-8 character increment. Output must have one character with "
                 + "the first character incremented")
@@ -273,5 +282,24 @@ public void testTruncateStringMax() {
         .as(
             "Test input with multiple 4 byte UTF-8 character where the first unicode character should be incremented")
         .isEqualTo(0);
+
+    assertThat(cmp.compare(truncateStringMax(Literal.of(test8), 2).value(), test8))
+        .as("Truncated upper bound should be greater than or equal to the actual upper bound")
+        .isGreaterThanOrEqualTo(0);
+    assertThat(cmp.compare(truncateStringMax(Literal.of(test8), 2).value(), test8_2_expected))
+        .as(
+            "Test the last character is the 4-byte max UTF-8 character after truncated where the second-to-last "
+                + "character should be incremented")
+        .isEqualTo(0);
+
+    assertThat(cmp.compare(truncateStringMax(Literal.of(test9), 2).value(), test9))
+        .as("Truncated upper bound should be greater than or equal to the actual upper bound")
+        .isGreaterThanOrEqualTo(0);
+
+    assertThat(cmp.compare(truncateStringMax(Literal.of(test9), 2).value(), test9_2_expected))
+        .as(
+            "Test the last character is `Character.MIN_SURROGATE - 1` after truncated, it should be incremented to "
+                + "next valid Unicode scalar value `Character.MAX_SURROGATE + 1`")
+        .isEqualTo(0);
   }
 }