diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..5cec72c --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: [fasterthanlime] +patreon: fasterthanlime diff --git a/README.md b/README.md index d0c693e..533857e 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,37 @@ -

- - - - - Facet logo - a reflection library for Rust - -

+# facet-json -[![Coverage Status](https://coveralls.io/repos/github/facet-rs/facet/badge.svg?branch=main)](https://coveralls.io/github/facet-rs/facet?branch=main) +[![Coverage Status](https://coveralls.io/repos/github/facet-rs/facet-json/badge.svg?branch=main)](https://coveralls.io/github/facet-rs/facet?branch=main) [![crates.io](https://img.shields.io/crates/v/facet-json.svg)](https://crates.io/crates/facet-json) [![documentation](https://docs.rs/facet-json/badge.svg)](https://docs.rs/facet-json) [![MIT/Apache-2.0 licensed](https://img.shields.io/crates/l/facet-json.svg)](./LICENSE) [![Discord](https://img.shields.io/discord/1379550208551026748?logo=discord&label=discord)](https://discord.gg/JhD7CwCJ8F) -_Logo by [Misiasart](https://misiasart.com/)_ +# facet-json + +A JSON deserializer based on facet-deserialize -Thanks to all individual and corporate sponsors, without whom this work could not exist: +## Sponsors -

- - -Ko-fi - - +Thanks to all individual sponsors: + +

GitHub Sponsors + + + Patreon + +

+ +...along with corporate sponsors: + +

- -Patreon + +AWS @@ -44,9 +45,11 @@ Thanks to all individual and corporate sponsors, without whom this work could no

-# facet-json +...without whom this work could not exist. -A JSON deserializer based on facet-deserialize +## Special thanks + +The facet logo was drawn by [Misiasart](https://misiasart.com/). ## License diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 3e2694c..f7287b1 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -331,7 +331,7 @@ impl<'input> Tokenizer<'input> { }; // Parse hexadecimal value - let code_point = match u16::from_str_radix(hex_str, 16) { + let code_value = match u16::from_str_radix(hex_str, 16) { Ok(cp) => cp, Err(_) => { return Err(TokenError { @@ -341,9 +341,88 @@ impl<'input> Tokenizer<'input> { } }; + self.pos += 4; // Move past the 4 hex digits + + // Check if this is a UTF-16 surrogate pair + let final_code_point = if (0xD800..=0xDBFF).contains(&code_value) { + // This is a high surrogate, we need to read a low surrogate + // Check for \uXXXX pattern following + if self.pos + 6 <= self.input.len() + && self.input[self.pos] == b'\\' + && self.input[self.pos + 1] == b'u' + { + // Read the second \uXXXX + let low_hex_start = self.pos + 2; + let low_hex_digits = + &self.input[low_hex_start..low_hex_start + 4]; + let low_hex_str = match str::from_utf8(low_hex_digits) { + Ok(s) => s, + Err(_) => { + return Err(TokenError { + kind: TokenErrorKind::InvalidUtf8( + "invalid UTF-8 in Unicode escape".to_string(), + ), + span: Span::new(low_hex_start, 4), + }); + } + }; + + let low_value = match u16::from_str_radix(low_hex_str, 16) { + Ok(cp) => cp, + Err(_) => { + return Err(TokenError { + kind: TokenErrorKind::UnexpectedCharacter('?'), + span: Span::new(low_hex_start, 4), + }); + } + }; + + // Check if it's a valid low surrogate + if (0xDC00..=0xDFFF).contains(&low_value) { + // Combine the surrogates into a single code point + // Formula: 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF) + let high = code_value as u32; + let low = low_value as u32; + let code_point = + 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF); + + self.pos += 6; // 
Move past \uXXXX + code_point + } else { + // Not a valid low surrogate, treat high surrogate as invalid + return Err(TokenError { + kind: TokenErrorKind::InvalidUtf8( + "high surrogate not followed by low surrogate" + .to_string(), + ), + span: Span::new(hex_start, 4), + }); + } + } else { + // High surrogate not followed by \uXXXX + return Err(TokenError { + kind: TokenErrorKind::InvalidUtf8( + "high surrogate not followed by low surrogate" + .to_string(), + ), + span: Span::new(hex_start, 4), + }); + } + } else if (0xDC00..=0xDFFF).contains(&code_value) { + // Low surrogate without high surrogate is invalid + return Err(TokenError { + kind: TokenErrorKind::InvalidUtf8( + "unexpected low surrogate".to_string(), + ), + span: Span::new(hex_start, 4), + }); + } else { + // Regular BMP character + code_value as u32 + }; + // Convert to UTF-8 and append to buffer - // Handle basic Unicode code points (BMP) - let c = match char::from_u32(code_point as u32) { + let c = match char::from_u32(final_code_point) { Some(c) => c, None => { return Err(TokenError { @@ -360,7 +439,7 @@ impl<'input> Tokenizer<'input> { let utf8_bytes = c.encode_utf8(&mut utf8_buf).as_bytes(); buf.push_owned(utf8_bytes); - self.pos += 3; // +3 because we'll increment once more below + self.pos -= 1; // -1 because we'll increment once more below } _ => buf.push_owned(&[esc]), // other escapes } diff --git a/tests/escapes.rs b/tests/escapes.rs index ef3be3d..97644f3 100644 --- a/tests/escapes.rs +++ b/tests/escapes.rs @@ -55,6 +55,14 @@ const UNICODE_TEST_CASES: &[(&str, &str)] = &[ ("\"Hello\\u0021\"", "Hello!"), // Mixed normal escapes and Unicode escapes ("\"\\u0048\\tello\\u0021\"", "H\tello!"), + // Surrogate pair for U+1F399 (🎙️ microphone emoji) + ("\"\\ud83c\\udf99\"", "🎙"), + // Surrogate pair for U+1F600 (😀 grinning face emoji) + ("\"\\ud83d\\ude00\"", "😀"), + // Multiple surrogate pairs in one string + ("\"\\ud83d\\ude00\\ud83d\\ude01\"", "😀😁"), + // Mixed text and 
surrogate pairs + ("\"Hello \\ud83d\\ude00 World\"", "Hello 😀 World"), ]; #[test] @@ -67,6 +75,56 @@ fn test_unicode_escapes() -> Result<(), Box<dyn std::error::Error>> { Ok(()) } +/// Test invalid surrogate sequences that should fail to parse +#[test] +fn test_invalid_surrogate_sequences() { + // High surrogate without low surrogate + let result = facet_json::from_str::<String>("\"\\ud83c\""); + assert!( + result.is_err(), + "High surrogate without low surrogate should fail" + ); + + // High surrogate followed by non-surrogate Unicode escape + let result = facet_json::from_str::<String>("\"\\ud83c\\u0041\""); + assert!( + result.is_err(), + "High surrogate followed by non-surrogate should fail" + ); + + // Low surrogate without high surrogate + let result = facet_json::from_str::<String>("\"\\udc00\""); + assert!( + result.is_err(), + "Low surrogate without high surrogate should fail" + ); + + // High surrogate at end of string without low surrogate + let result = facet_json::from_str::<String>("\"text\\ud83c\""); + assert!( + result.is_err(), + "High surrogate at end of string should fail" + ); +} + +/// Test that regular BMP characters still work after surrogate pair changes +#[test] +fn test_bmp_characters_after_surrogate_fix() -> Result<(), Box<dyn std::error::Error>> { + // ASCII characters via Unicode escapes + let parsed = facet_json::from_str::<String>("\"\\u0041\\u0042\\u0043\"")?; + assert_eq!(parsed, "ABC"); + + // Non-ASCII BMP characters (Chinese) + let parsed = facet_json::from_str::<String>("\"\\u4e2d\\u6587\"")?; + assert_eq!(parsed, "中文"); + + // Greek letters + let parsed = facet_json::from_str::<String>("\"\\u03b1\\u03b2\\u03b3\"")?; + assert_eq!(parsed, "αβγ"); + + Ok(()) +} + /// Test cases for ASCII control character serialization /// These test the specific code path that generates \u0000 escape sequences const CONTROL_CHAR_TEST_CASES: &[(char, &str)] = &[