diff --git a/Cargo.toml b/Cargo.toml index 414c3a6a..4bdafefd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,8 @@ serde_path_to_error = ["dep:serde_path_to_error"] serde_with-3 = ["dep:serde_with", "dep:serde"] serde = ["dep:serde"] serde_json-1 = ["dep:serde_json"] +# Provides additional functionality for BSON validation. For internal use only. +sfp-internal = [] [lib] name = "bson" diff --git a/src/error.rs b/src/error.rs index fc5ef6dc..ca002545 100644 --- a/src/error.rs +++ b/src/error.rs @@ -121,6 +121,19 @@ pub enum ErrorKind { n: u64, }, + /// A cstring exceeded the maximum parsing length. + #[cfg(feature = "sfp-internal")] + #[error("cstring exceeded the maximum parsing length ({max_parse_len} bytes)")] + #[non_exhaustive] + #[doc(hidden)] + TooLongCStr { + /// The configured maximum parsing length. + max_parse_len: usize, + + /// The bytes parsed before the maximum parsing length was reached. + bytes: Vec, + }, + /// Invalid UTF-8 bytes were encountered. #[error("Invalid UTF-8")] #[non_exhaustive] diff --git a/src/raw.rs b/src/raw.rs index eb3da8b0..f72045e3 100644 --- a/src/raw.rs +++ b/src/raw.rs @@ -159,7 +159,7 @@ pub use self::{ cstr::{assert_valid_cstr, cstr, validate_cstr, CStr, CString, IsValidCStr}, document::RawDocument, document_buf::{BindRawBsonRef, BindValue, RawDocumentBuf}, - iter::{RawElement, RawIter}, + iter::{Iter, RawElement, RawIter}, }; pub(crate) const MIN_BSON_STRING_SIZE: i32 = 4 + 1; // 4 bytes for length, one byte for null terminator diff --git a/src/raw/array.rs b/src/raw/array.rs index 28210a59..c0706692 100644 --- a/src/raw/array.rs +++ b/src/raw/array.rs @@ -97,6 +97,21 @@ impl RawArray { self.into_iter().nth(index).transpose() } + /// Gets a reference to the value at the given index. Returns an error if a cstring is + /// encountered that exceeds the provided `len`. + #[cfg(feature = "sfp-internal")] + #[doc(hidden)] + pub fn get_with_max_cstr_parse_len( + &self, + index: usize, + len: usize, + ) -> RawResult>> { + self.into_iter() + .max_cstr_parse_len(len) + .nth(index) + .transpose() + } + fn get_with<'a, T>( &'a self, index: usize, @@ -269,6 +284,17 @@ pub struct RawArrayIter<'a> { inner: RawIter<'a>, } +impl<'a> RawArrayIter<'a> { + /// The maximum number of bytes the iterator should parse when searching for the null-terminator + /// for a cstring. + #[cfg(feature = "sfp-internal")] + #[doc(hidden)] + pub fn max_cstr_parse_len(mut self, len: impl Into>) -> Self { + self.inner = self.inner.max_cstr_parse_len(len); + self + } +} + impl<'a> Iterator for RawArrayIter<'a> { type Item = RawResult>; diff --git a/src/raw/document.rs b/src/raw/document.rs index e62526ec..c45fc428 100644 --- a/src/raw/document.rs +++ b/src/raw/document.rs @@ -26,7 +26,6 @@ use super::{ RawDocumentBuf, RawIter, RawRegexRef, - Result as RawResult, MIN_BSON_DOCUMENT_SIZE, }; use crate::{oid::ObjectId, spec::ElementType, Document}; @@ -94,7 +93,7 @@ impl RawDocument { /// let doc = RawDocument::from_bytes(b"\x05\0\0\0\0")?; /// # Ok::<(), bson::error::Error>(()) /// ``` - pub fn from_bytes + ?Sized>(data: &D) -> RawResult<&RawDocument> { + pub fn from_bytes + ?Sized>(data: &D) -> Result<&RawDocument> { let data = data.as_ref(); if data.len() < 5 { @@ -145,7 +144,7 @@ impl RawDocument { /// assert!(doc.get("unknown")?.is_none()); /// # Ok::<(), Error>(()) /// ``` - pub fn get(&self, key: impl AsRef) -> RawResult>> { + pub fn get(&self, key: impl AsRef) -> Result>> { for elem in RawIter::new(self) { let elem = elem?; if key.as_ref() == elem.key().as_str() { @@ -155,6 +154,24 @@ impl RawDocument { Ok(None) } + /// Gets a reference to the value corresponding to the given key by iterating until the key is + /// found. Returns an error if a cstring is encountered that exceeds the provided `len`. + #[cfg(feature = "sfp-internal")] + #[doc(hidden)] + pub fn get_with_max_cstr_parse_len( + &self, + key: impl AsRef, + len: usize, + ) -> Result>> { + for elem in RawIter::new(self).max_cstr_parse_len(len) { + let elem = elem?; + if key.as_ref() == elem.key().as_str() { + return Ok(Some(elem.try_into()?)); + } + } + Ok(None) + } + /// Gets an iterator over the elements in the [`RawDocument`] that yields /// `Result<(&str, RawBson<'_>)>`. pub fn iter(&self) -> Iter<'_> { @@ -479,24 +496,41 @@ impl RawDocument { self.as_bytes().len() == MIN_BSON_DOCUMENT_SIZE as usize } - pub(crate) fn cstring_bytes_at(&self, start_at: usize) -> RawResult<&[u8]> { - let buf = &self.as_bytes()[start_at..]; - - let mut splits = buf.splitn(2, |x| *x == 0); - let value = splits - .next() - .ok_or_else(|| RawError::malformed_bytes("no value"))?; - if splits.next().is_some() { - Ok(value) - } else { - Err(RawError::malformed_bytes("expected null terminator")) - } - } - - pub(crate) fn read_cstring_at(&self, start_at: usize) -> RawResult<&CStr> { - let bytes = self.cstring_bytes_at(start_at)?; - let s = try_to_str(bytes)?; - s.try_into() + pub(crate) fn cstring_bytes_at( + &self, + start_at: usize, + max_parse_len: Option, + ) -> Result<&[u8]> { + let data = &self.data; + let end = max_parse_len + .map(|len| std::cmp::min(start_at + len + 1, data.len())) + .unwrap_or(data.len()); + let buf = &data[start_at..end]; + + let Some(index) = buf.iter().position(|b| *b == 0) else { + #[cfg(feature = "sfp-internal")] + if let Some(max_parse_len) = max_parse_len { + return Err(crate::error::ErrorKind::TooLongCStr { + max_parse_len, + bytes: buf.to_vec(), + } + .into()); + } + // Note: This error should never be encountered in practice because the document + // constructors validate that the last byte is 0. + return Err(Error::malformed_bytes("missing null terminator")); + }; + Ok(&buf[..index]) + } + + pub(crate) fn read_cstring_at( + &self, + start_at: usize, + max_parse_len: Option, + ) -> Result<&CStr> { + let bytes = self.cstring_bytes_at(start_at, max_parse_len)?; + let str = try_to_str(bytes)?; + str.try_into() } } @@ -580,7 +614,7 @@ impl<'a> From<&'a RawDocument> for Cow<'a, RawDocument> { impl TryFrom<&RawDocument> for Document { type Error = RawError; - fn try_from(rawdoc: &RawDocument) -> RawResult { + fn try_from(rawdoc: &RawDocument) -> Result { rawdoc .into_iter() .map(|res| res.and_then(|(k, v)| Ok((k.as_str().to_owned(), v.try_into()?)))) @@ -591,7 +625,7 @@ impl TryFrom<&RawDocument> for Document { impl TryFrom<&RawDocument> for Utf8Lossy { type Error = RawError; - fn try_from(rawdoc: &RawDocument) -> RawResult> { + fn try_from(rawdoc: &RawDocument) -> Result> { let mut out = Document::new(); for elem in rawdoc.iter_elements() { let elem = elem?; @@ -602,7 +636,7 @@ impl TryFrom<&RawDocument> for Utf8Lossy { } } -fn deep_utf8_lossy(src: RawBson) -> RawResult { +fn deep_utf8_lossy(src: RawBson) -> Result { match src { RawBson::Array(arr) => { let mut tmp = vec![]; @@ -674,7 +708,7 @@ impl TryFrom<&RawDocumentBuf> for Utf8Lossy { impl<'a> IntoIterator for &'a RawDocument { type IntoIter = Iter<'a>; - type Item = RawResult<(&'a CStr, RawBsonRef<'a>)>; + type Item = Result<(&'a CStr, RawBsonRef<'a>)>; fn into_iter(self) -> Iter<'a> { self.iter() diff --git a/src/raw/iter.rs b/src/raw/iter.rs index e7611029..e418e4cc 100644 --- a/src/raw/iter.rs +++ b/src/raw/iter.rs @@ -30,7 +30,8 @@ use super::{ RawDocument, }; -/// An iterator over the document's entries. +/// An iterator over the key-value pairs in a document. Construct by calling [`RawDocument::iter`] +/// or [`RawDocumentBuf::iter`](crate::RawDocumentBuf::iter). pub struct Iter<'a> { inner: RawIter<'a>, } @@ -41,6 +42,15 @@ impl<'a> Iter<'a> { inner: RawIter::new(doc), } } + + /// The maximum number of bytes the iterator should parse when searching for the null-terminator + /// for a cstring. + #[cfg(feature = "sfp-internal")] + #[doc(hidden)] + pub fn max_cstr_parse_len(mut self, len: impl Into>) -> Self { + self.inner = self.inner.max_cstr_parse_len(len); + self + } } impl<'a> Iterator for Iter<'a> { @@ -58,11 +68,12 @@ impl<'a> Iterator for Iter<'a> { } } -/// An iterator over the document's elements. +/// An iterator over the elements in a document. Construct by calling [`RawDocument::iter_elements`] +/// or [`RawDocumentBuf::iter_elements`](crate::RawDocumentBuf::iter_elements). pub struct RawIter<'a> { doc: &'a RawDocument, offset: usize, - + max_cstr_parse_len: Option, /// Whether the underlying doc is assumed to be valid or if an error has been encountered. /// After an error, all subsequent iterations will return None. valid: bool, @@ -73,10 +84,20 @@ impl<'a> RawIter<'a> { Self { doc, offset: 4, + max_cstr_parse_len: None, valid: true, } } + /// The maximum number of bytes the iterator should parse when searching for the null-terminator + /// for a cstring. + #[cfg(feature = "sfp-internal")] + #[doc(hidden)] + pub fn max_cstr_parse_len(mut self, len: impl Into>) -> Self { + self.max_cstr_parse_len = len.into(); + self + } + fn verify_enough_bytes(&self, start: usize, num_bytes: usize) -> Result<()> { let end = checked_add(start, num_bytes)?; if self.doc.as_bytes().get(start..end).is_none() { @@ -212,12 +233,15 @@ impl<'a> RawElement<'a> { id: self.get_oid_at(self.start_at + (self.size - 12))?, }), ElementType::RegularExpression => { - let pattern = self.doc.read_cstring_at(self.start_at)?; + // Note: the max_cstr_parse_len doesn't need to be passed in here because it is + // already enforced when the iterator determines the total size of the regular + // expression. + let pattern = self.doc.read_cstring_at(self.start_at, None)?; RawBsonRef::RegularExpression(RawRegexRef { pattern, options: self .doc - .read_cstring_at(self.start_at + pattern.len() + 1)?, + .read_cstring_at(self.start_at + pattern.len() + 1, None)?, }) } ElementType::Timestamp => RawBsonRef::Timestamp({ @@ -316,13 +340,18 @@ impl<'a> RawElement<'a> { id: self.get_oid_at(self.start_at + (self.size - 12))?, }), ElementType::RegularExpression => { + // Note: the max_cstr_parse_len doesn't need to be passed in here because it is + // already enforced when the iterator determines the total size of the regular + // expression. let pattern = - String::from_utf8_lossy(self.doc.cstring_bytes_at(self.start_at)?).into_owned(); + String::from_utf8_lossy(self.doc.cstring_bytes_at(self.start_at, None)?) + .into_owned(); let pattern_len = pattern.len(); Utf8LossyBson::RegularExpression(crate::Regex { pattern: pattern.try_into()?, options: String::from_utf8_lossy( - self.doc.cstring_bytes_at(self.start_at + pattern_len + 1)?, + self.doc + .cstring_bytes_at(self.start_at + pattern_len + 1, None)?, ) .into_owned() .try_into()?, @@ -404,8 +433,10 @@ impl RawIter<'_> { ElementType::Array => self.next_document_len(offset)?, ElementType::Binary => self.get_next_length_at(offset)? + 4 + 1, ElementType::RegularExpression => { - let pattern = self.doc.read_cstring_at(offset)?; - let options = self.doc.read_cstring_at(offset + pattern.len() + 1)?; + let pattern = self.doc.read_cstring_at(offset, self.max_cstr_parse_len)?; + let options = self + .doc + .read_cstring_at(offset + pattern.len() + 1, self.max_cstr_parse_len)?; pattern.len() + 1 + options.len() + 1 } ElementType::DbPointer => read_len(&self.doc.as_bytes()[offset..])? + 12, @@ -440,7 +471,10 @@ impl<'a> Iterator for RawIter<'a> { return Some(Err(Error::malformed_bytes("iteration overflowed document"))); } - let key = match self.doc.read_cstring_at(self.offset + 1) { + let key = match self + .doc + .read_cstring_at(self.offset + 1, self.max_cstr_parse_len) + { Ok(k) => k, Err(e) => { self.valid = false; diff --git a/src/raw/test.rs b/src/raw/test.rs index 92311d1d..f94d37f5 100644 --- a/src/raw/test.rs +++ b/src/raw/test.rs @@ -508,3 +508,111 @@ proptest! { prop_assert_eq!(doc, roundtrip); } } + +#[test] +#[cfg(feature = "sfp-internal")] +fn max_cstr_parse_len() { + let key = cstr!("aaaaaaaa"); + let doc = rawdoc! { key: "b" }; + + let mut iter = doc.iter().max_cstr_parse_len(key.len()); + let (k, _) = iter.next().unwrap().unwrap(); + assert_eq!(k, key); + + let mut iter = doc.iter().max_cstr_parse_len(key.len() - 1); + let error = iter.next().unwrap().unwrap_err(); + match error.kind { + ErrorKind::TooLongCStr { + max_parse_len, + bytes, + } => { + assert_eq!(max_parse_len, key.len() - 1); + assert_eq!(bytes.as_slice(), &key.as_str().bytes().collect::>()); + } + other => panic!("expected TooLongCStr, got {}", other), + } + + let mut iter = doc.iter_elements().max_cstr_parse_len(key.len() - 5); + let Err(error) = iter.next().unwrap() else { + panic!("expected error"); + }; + assert!(matches!(error.kind, ErrorKind::TooLongCStr { .. })); + + let b = doc + .get_with_max_cstr_parse_len(key.as_str(), key.len()) + .unwrap() + .unwrap() + .as_str() + .unwrap(); + assert_eq!(b, "b"); + + let error = doc + .get_with_max_cstr_parse_len(key.as_str(), key.len() - 2) + .unwrap_err(); + assert!(matches!(error.kind, ErrorKind::TooLongCStr { .. })); + + // ensure we don't panic on OOB if the max parse len exceeds the document's length + let mut iter = doc.iter().max_cstr_parse_len(doc.as_bytes().len() + 1); + iter.next().unwrap().unwrap(); + + // a long key in a nested document shouldn't impact parsing + let nested_doc = rawdoc! { "nested": doc, "after": "c" }; + for result in nested_doc.iter().max_cstr_parse_len(key.len() - 1) { + result.unwrap(); + } + + // a too-long key earlier in the doc should cause an error + let longer_key = cstr!("aaaaaaaaaaaaaaaa"); + let doc = rawdoc! { longer_key: "b", key: "c" }; + let error = doc.get_with_max_cstr_parse_len(key, key.len()).unwrap_err(); + assert!(matches!(error.kind, ErrorKind::TooLongCStr { .. })); + + let mut bytes = rawdoc! { "array": { key: "element" } }.into_bytes(); + // change the type id for the value from document to array + bytes[4] = 4; + let doc_with_array = RawDocumentBuf::from_bytes(bytes).unwrap(); + let array = doc_with_array.get_array("array").unwrap(); + + let mut iter = array.into_iter().max_cstr_parse_len(key.len()); + let val = iter.next().unwrap().unwrap().as_str().unwrap(); + assert_eq!(val, "element"); + + let mut iter = array.into_iter().max_cstr_parse_len(key.len() - 1); + let error = iter.next().unwrap().unwrap_err(); + assert!(matches!(error.kind, ErrorKind::TooLongCStr { .. })); + + let pattern = cstr!("pattern"); + let options = cstr!("abc"); + let key = cstr!("r"); + let regex = RawRegexRef { pattern, options }; + let doc_with_regex = rawdoc! { key: RawBsonRef::from(regex) }; + + let mut iter = doc_with_regex.iter().max_cstr_parse_len(pattern.len()); + let (key_from_doc, bson) = iter.next().unwrap().unwrap(); + let regex_from_doc = bson.as_regex().unwrap(); + assert_eq!(key_from_doc, key); + assert_eq!(regex_from_doc, regex); + + let mut iter = doc_with_regex.iter().max_cstr_parse_len(pattern.len() - 1); + let error = iter.next().unwrap().unwrap_err(); + match error.kind { + ErrorKind::TooLongCStr { + max_parse_len, + bytes, + } => { + assert_eq!(max_parse_len, pattern.len() - 1); + assert_eq!(bytes, pattern.as_str().bytes().collect::>()); + } + other => panic!("expected TooLongCStr, got {}", other), + } + + let mut iter = doc_with_regex + .iter_elements() + .max_cstr_parse_len(pattern.len() - 1); + // an error occurs here rather than when accessing the value because the iterator needs to read + // the regex's cstrings to determine its overall length + let Err(error) = iter.next().unwrap() else { + panic!("expected error"); + }; + assert!(matches!(error.kind, ErrorKind::TooLongCStr { .. })); +}