diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..5cec72c
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,2 @@
+github: [fasterthanlime]
+patreon: fasterthanlime
diff --git a/README.md b/README.md
index d0c693e..533857e 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,37 @@
-
-
-
-
-
-
-
-
+# facet-json
-[](https://coveralls.io/github/facet-rs/facet?branch=main)
+[](https://coveralls.io/github/facet-rs/facet?branch=main)
[](https://crates.io/crates/facet-json)
[](https://docs.rs/facet-json)
[](./LICENSE)
[](https://discord.gg/JhD7CwCJ8F)
-_Logo by [Misiasart](https://misiasart.com/)_
+# facet-json
+
+A JSON deserializer based on facet-deserialize
-Thanks to all individual and corporate sponsors, without whom this work could not exist:
+## Sponsors
-
-
-
-
-
-
+Thanks to all individual sponsors:
+
+
+
+
+
+
+
+
+...along with corporate sponsors:
+
+
-
-
+
+
@@ -44,9 +45,11 @@ Thanks to all individual and corporate sponsors, without whom this work could no
-# facet-json
+...without whom this work could not exist.
-A JSON deserializer based on facet-deserialize
+## Special thanks
+
+The facet logo was drawn by [Misiasart](https://misiasart.com/).
## License
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3e2694c..f7287b1 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -331,7 +331,7 @@ impl<'input> Tokenizer<'input> {
};
// Parse hexadecimal value
- let code_point = match u16::from_str_radix(hex_str, 16) {
+ let code_value = match u16::from_str_radix(hex_str, 16) {
Ok(cp) => cp,
Err(_) => {
return Err(TokenError {
@@ -341,9 +341,88 @@ impl<'input> Tokenizer<'input> {
}
};
+ self.pos += 4; // Move past the 4 hex digits
+
+ // Check if this is a UTF-16 surrogate pair
+ let final_code_point = if (0xD800..=0xDBFF).contains(&code_value) {
+ // This is a high surrogate, we need to read a low surrogate
+ // Check for \uXXXX pattern following
+ if self.pos + 6 <= self.input.len()
+ && self.input[self.pos] == b'\\'
+ && self.input[self.pos + 1] == b'u'
+ {
+ // Read the second \uXXXX
+ let low_hex_start = self.pos + 2;
+ let low_hex_digits =
+ &self.input[low_hex_start..low_hex_start + 4];
+ let low_hex_str = match str::from_utf8(low_hex_digits) {
+ Ok(s) => s,
+ Err(_) => {
+ return Err(TokenError {
+ kind: TokenErrorKind::InvalidUtf8(
+ "invalid UTF-8 in Unicode escape".to_string(),
+ ),
+ span: Span::new(low_hex_start, 4),
+ });
+ }
+ };
+
+ let low_value = match u16::from_str_radix(low_hex_str, 16) {
+ Ok(cp) => cp,
+ Err(_) => {
+ return Err(TokenError {
+ kind: TokenErrorKind::UnexpectedCharacter('?'),
+ span: Span::new(low_hex_start, 4),
+ });
+ }
+ };
+
+ // Check if it's a valid low surrogate
+ if (0xDC00..=0xDFFF).contains(&low_value) {
+ // Combine the surrogates into a single code point
+ // Formula: 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF)
+ let high = code_value as u32;
+ let low = low_value as u32;
+ let code_point =
+ 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF);
+
+ self.pos += 6; // Move past \uXXXX
+ code_point
+ } else {
+ // Not a valid low surrogate, treat high surrogate as invalid
+ return Err(TokenError {
+ kind: TokenErrorKind::InvalidUtf8(
+ "high surrogate not followed by low surrogate"
+ .to_string(),
+ ),
+ span: Span::new(hex_start, 4),
+ });
+ }
+ } else {
+ // High surrogate not followed by \uXXXX
+ return Err(TokenError {
+ kind: TokenErrorKind::InvalidUtf8(
+ "high surrogate not followed by low surrogate"
+ .to_string(),
+ ),
+ span: Span::new(hex_start, 4),
+ });
+ }
+ } else if (0xDC00..=0xDFFF).contains(&code_value) {
+ // Low surrogate without high surrogate is invalid
+ return Err(TokenError {
+ kind: TokenErrorKind::InvalidUtf8(
+ "unexpected low surrogate".to_string(),
+ ),
+ span: Span::new(hex_start, 4),
+ });
+ } else {
+ // Regular BMP character
+ code_value as u32
+ };
+
// Convert to UTF-8 and append to buffer
- // Handle basic Unicode code points (BMP)
- let c = match char::from_u32(code_point as u32) {
+ let c = match char::from_u32(final_code_point) {
Some(c) => c,
None => {
return Err(TokenError {
@@ -360,7 +439,7 @@ impl<'input> Tokenizer<'input> {
let utf8_bytes = c.encode_utf8(&mut utf8_buf).as_bytes();
buf.push_owned(utf8_bytes);
- self.pos += 3; // +3 because we'll increment once more below
+ self.pos -= 1; // -1 because we'll increment once more below
}
_ => buf.push_owned(&[esc]), // other escapes
}
diff --git a/tests/escapes.rs b/tests/escapes.rs
index ef3be3d..97644f3 100644
--- a/tests/escapes.rs
+++ b/tests/escapes.rs
@@ -55,6 +55,14 @@ const UNICODE_TEST_CASES: &[(&str, &str)] = &[
("\"Hello\\u0021\"", "Hello!"),
// Mixed normal escapes and Unicode escapes
("\"\\u0048\\tello\\u0021\"", "H\tello!"),
+ // Surrogate pair for U+1F399 (🎙️ microphone emoji)
+ ("\"\\ud83c\\udf99\"", "🎙"),
+ // Surrogate pair for U+1F600 (😀 grinning face emoji)
+ ("\"\\ud83d\\ude00\"", "😀"),
+ // Multiple surrogate pairs in one string
+ ("\"\\ud83d\\ude00\\ud83d\\ude01\"", "😀😁"),
+ // Mixed text and surrogate pairs
+ ("\"Hello \\ud83d\\ude00 World\"", "Hello 😀 World"),
];
#[test]
@@ -67,6 +75,56 @@ fn test_unicode_escapes() -> Result<(), Box<dyn std::error::Error>> {
Ok(())
}
+/// Invalid UTF-16 surrogate escape sequences must be rejected by the deserializer.
+#[test]
+fn test_invalid_surrogate_sequences() {
+    // High surrogate with nothing after it
+    let result = facet_json::from_str::<String>("\"\\ud83c\"");
+    assert!(
+        result.is_err(),
+        "High surrogate without low surrogate should fail"
+    );
+
+    // High surrogate followed by a \u escape that is not a low surrogate
+    let result = facet_json::from_str::<String>("\"\\ud83c\\u0041\"");
+    assert!(
+        result.is_err(),
+        "High surrogate followed by non-surrogate should fail"
+    );
+
+    // Lone low surrogate with no preceding high surrogate
+    let result = facet_json::from_str::<String>("\"\\udc00\"");
+    assert!(
+        result.is_err(),
+        "Low surrogate without high surrogate should fail"
+    );
+
+    // High surrogate as the last escape in a string that has leading text
+    let result = facet_json::from_str::<String>("\"text\\ud83c\"");
+    assert!(
+        result.is_err(),
+        "High surrogate at end of string should fail"
+    );
+}
+
+/// Plain BMP `\uXXXX` escapes (no surrogates) must still decode to the expected text.
+#[test]
+fn test_bmp_characters_after_surrogate_fix() -> Result<(), Box<dyn std::error::Error>> {
+    // ASCII characters via Unicode escapes
+    let parsed = facet_json::from_str::<String>("\"\\u0041\\u0042\\u0043\"")?;
+    assert_eq!(parsed, "ABC");
+
+    // Non-ASCII BMP characters (Chinese): U+4E2D U+6587
+    let parsed = facet_json::from_str::<String>("\"\\u4e2d\\u6587\"")?;
+    assert_eq!(parsed, "中文");
+
+    // Greek letters: U+03B1 U+03B2 U+03B3
+    let parsed = facet_json::from_str::<String>("\"\\u03b1\\u03b2\\u03b3\"")?;
+    assert_eq!(parsed, "αβγ");
+
+    Ok(())
+}
+
/// Test cases for ASCII control character serialization
/// These test the specific code path that generates \u0000 escape sequences
const CONTROL_CHAR_TEST_CASES: &[(char, &str)] = &[