Skip to content
This repository was archived by the owner on Nov 27, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/FUNDING.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
github: [fasterthanlime]
patreon: fasterthanlime
45 changes: 24 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,36 +1,37 @@
<h1>
<picture>
<source type="image/webp" media="(prefers-color-scheme: dark)" srcset="https://github.com/facet-rs/facet/raw/main/static/logo-v2/facet-b-dark.webp">
<source type="image/png" media="(prefers-color-scheme: dark)" srcset="https://github.com/facet-rs/facet/raw/main/static/logo-v2/facet-b-dark.png">
<source type="image/webp" srcset="https://github.com/facet-rs/facet/raw/main/static/logo-v2/facet-b-light.webp">
<img src="https://github.com/facet-rs/facet/raw/main/static/logo-v2/facet-b-light.png" height="35" alt="Facet logo - a reflection library for Rust">
</picture>
</h1>
# facet-json

[![Coverage Status](https://coveralls.io/repos/github/facet-rs/facet/badge.svg?branch=main)](https://coveralls.io/github/facet-rs/facet?branch=main)
[![Coverage Status](https://coveralls.io/repos/github/facet-rs/facet-json/badge.svg?branch=main)](https://coveralls.io/github/facet-rs/facet-json?branch=main)
[![crates.io](https://img.shields.io/crates/v/facet-json.svg)](https://crates.io/crates/facet-json)
[![documentation](https://docs.rs/facet-json/badge.svg)](https://docs.rs/facet-json)
[![MIT/Apache-2.0 licensed](https://img.shields.io/crates/l/facet-json.svg)](./LICENSE)
[![Discord](https://img.shields.io/discord/1379550208551026748?logo=discord&label=discord)](https://discord.gg/JhD7CwCJ8F)

_Logo by [Misiasart](https://misiasart.com/)_
# facet-json

A JSON deserializer based on facet-deserialize

Thanks to all individual and corporate sponsors, without whom this work could not exist:
## Sponsors

<p> <a href="https://ko-fi.com/fasterthanlime">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/kofi-dark.svg">
<img src="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/kofi-light.svg" height="40" alt="Ko-fi">
</picture>
</a> <a href="https://github.com/sponsors/fasterthanlime">
Thanks to all individual sponsors:

<p> <a href="https://github.com/sponsors/fasterthanlime">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/github-dark.svg">
<img src="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/github-light.svg" height="40" alt="GitHub Sponsors">
</picture>
</a> <a href="https://patreon.com/fasterthanlime">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/patreon-dark.svg">
<img src="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/patreon-light.svg" height="40" alt="Patreon">
</picture>
</a> </p>

...along with corporate sponsors:

<p> <a href="https://aws.amazon.com">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/patreon-dark.svg">
<img src="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/patreon-light.svg" height="40" alt="Patreon">
<source media="(prefers-color-scheme: dark)" srcset="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/aws-dark.svg">
<img src="https://github.com/facet-rs/facet/raw/main/static/sponsors-v3/aws-light.svg" height="40" alt="AWS">
</picture>
</a> <a href="https://zed.dev">
<picture>
Expand All @@ -44,9 +45,11 @@ Thanks to all individual and corporate sponsors, without whom this work could no
</picture>
</a> </p>

# facet-json
...without whom this work could not exist.

A JSON deserializer based on facet-deserialize
## Special thanks

The facet logo was drawn by [Misiasart](https://misiasart.com/).

## License

Expand Down
87 changes: 83 additions & 4 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ impl<'input> Tokenizer<'input> {
};

// Parse hexadecimal value
let code_point = match u16::from_str_radix(hex_str, 16) {
let code_value = match u16::from_str_radix(hex_str, 16) {
Ok(cp) => cp,
Err(_) => {
return Err(TokenError {
Expand All @@ -341,9 +341,88 @@ impl<'input> Tokenizer<'input> {
}
};

self.pos += 4; // Move past the 4 hex digits

// Check if this is a UTF-16 surrogate pair
let final_code_point = if (0xD800..=0xDBFF).contains(&code_value) {
// This is a high surrogate, we need to read a low surrogate
// Check for \uXXXX pattern following
if self.pos + 6 <= self.input.len()
&& self.input[self.pos] == b'\\'
&& self.input[self.pos + 1] == b'u'
{
// Read the second \uXXXX
let low_hex_start = self.pos + 2;
let low_hex_digits =
&self.input[low_hex_start..low_hex_start + 4];
let low_hex_str = match str::from_utf8(low_hex_digits) {
Ok(s) => s,
Err(_) => {
return Err(TokenError {
kind: TokenErrorKind::InvalidUtf8(
"invalid UTF-8 in Unicode escape".to_string(),
),
span: Span::new(low_hex_start, 4),
});
}
};

let low_value = match u16::from_str_radix(low_hex_str, 16) {
Ok(cp) => cp,
Err(_) => {
return Err(TokenError {
kind: TokenErrorKind::UnexpectedCharacter('?'),
span: Span::new(low_hex_start, 4),
});
}
};

// Check if it's a valid low surrogate
if (0xDC00..=0xDFFF).contains(&low_value) {
// Combine the surrogates into a single code point
// Formula: 0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF)
let high = code_value as u32;
let low = low_value as u32;
let code_point =
0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF);

self.pos += 6; // Move past \uXXXX
code_point
} else {
// Not a valid low surrogate, treat high surrogate as invalid
return Err(TokenError {
kind: TokenErrorKind::InvalidUtf8(
"high surrogate not followed by low surrogate"
.to_string(),
),
span: Span::new(hex_start, 4),
});
}
} else {
// High surrogate not followed by \uXXXX
return Err(TokenError {
kind: TokenErrorKind::InvalidUtf8(
"high surrogate not followed by low surrogate"
.to_string(),
),
span: Span::new(hex_start, 4),
});
}
} else if (0xDC00..=0xDFFF).contains(&code_value) {
// Low surrogate without high surrogate is invalid
return Err(TokenError {
kind: TokenErrorKind::InvalidUtf8(
"unexpected low surrogate".to_string(),
),
span: Span::new(hex_start, 4),
});
} else {
// Regular BMP character
code_value as u32
};

// Convert to UTF-8 and append to buffer
// Handle basic Unicode code points (BMP)
let c = match char::from_u32(code_point as u32) {
let c = match char::from_u32(final_code_point) {
Some(c) => c,
None => {
return Err(TokenError {
Expand All @@ -360,7 +439,7 @@ impl<'input> Tokenizer<'input> {
let utf8_bytes = c.encode_utf8(&mut utf8_buf).as_bytes();
buf.push_owned(utf8_bytes);

self.pos += 3; // +3 because we'll increment once more below
self.pos -= 1; // -1 because we'll increment once more below
}
_ => buf.push_owned(&[esc]), // other escapes
}
Expand Down
58 changes: 58 additions & 0 deletions tests/escapes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ const UNICODE_TEST_CASES: &[(&str, &str)] = &[
("\"Hello\\u0021\"", "Hello!"),
// Mixed normal escapes and Unicode escapes
("\"\\u0048\\tello\\u0021\"", "H\tello!"),
// Surrogate pair for U+1F399 (🎙️ microphone emoji)
("\"\\ud83c\\udf99\"", "🎙"),
// Surrogate pair for U+1F600 (😀 grinning face emoji)
("\"\\ud83d\\ude00\"", "😀"),
// Multiple surrogate pairs in one string
("\"\\ud83d\\ude00\\ud83d\\ude01\"", "😀😁"),
// Mixed text and surrogate pairs
("\"Hello \\ud83d\\ude00 World\"", "Hello 😀 World"),
];

#[test]
Expand All @@ -67,6 +75,56 @@ fn test_unicode_escapes() -> Result<(), Box<dyn std::error::Error>> {
Ok(())
}

/// Malformed UTF-16 surrogate escapes must be rejected by the deserializer.
///
/// Each entry pairs a JSON string literal containing a broken surrogate
/// sequence with the message reported if parsing unexpectedly succeeds.
#[test]
fn test_invalid_surrogate_sequences() {
    const REJECTED: &[(&str, &str)] = &[
        // Lone high surrogate, nothing after it
        (
            "\"\\ud83c\"",
            "High surrogate without low surrogate should fail",
        ),
        // High surrogate whose follow-up escape is an ordinary BMP code unit
        (
            "\"\\ud83c\\u0041\"",
            "High surrogate followed by non-surrogate should fail",
        ),
        // Lone low surrogate with no preceding high surrogate
        (
            "\"\\udc00\"",
            "Low surrogate without high surrogate should fail",
        ),
        // Regular text, then a high surrogate right before the closing quote
        (
            "\"text\\ud83c\"",
            "High surrogate at end of string should fail",
        ),
    ];

    for &(json, failure_message) in REJECTED {
        let outcome = facet_json::from_str::<String>(json);
        assert!(outcome.is_err(), "{}", failure_message);
    }
}

/// Regression guard: plain `\uXXXX` escapes inside the Basic Multilingual
/// Plane must keep decoding correctly now that surrogate pairs are handled.
#[test]
fn test_bmp_characters_after_surrogate_fix() -> Result<(), Box<dyn std::error::Error>> {
    const BMP_CASES: &[(&str, &str)] = &[
        // ASCII letters spelled as Unicode escapes
        ("\"\\u0041\\u0042\\u0043\"", "ABC"),
        // Non-ASCII BMP characters (Chinese)
        ("\"\\u4e2d\\u6587\"", "中文"),
        // Greek lowercase letters
        ("\"\\u03b1\\u03b2\\u03b3\"", "αβγ"),
    ];

    for &(json, expected) in BMP_CASES {
        let decoded = facet_json::from_str::<String>(json)?;
        assert_eq!(decoded, expected);
    }

    Ok(())
}

/// Test cases for ASCII control character serialization
/// These test the specific code path that generates \u0000 escape sequences
const CONTROL_CHAR_TEST_CASES: &[(char, &str)] = &[
Expand Down