|
1 | 1 | //! A module for wrappers that encode / decode data.
|
2 | 2 |
|
3 | 3 | use std::borrow::Cow;
|
| 4 | +use std::io::{self, BufRead, Read}; |
4 | 5 |
|
5 | 6 | #[cfg(feature = "encoding")]
|
6 |
| -use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; |
| 7 | +use encoding_rs::{Decoder as ExtDecoder, Encoding, UTF_16BE, UTF_16LE, UTF_8, CoderResult}; |
7 | 8 |
|
8 | 9 | use crate::{Error, Result};
|
9 | 10 |
|
@@ -184,4 +185,166 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
|
184 | 185 | }
|
185 | 186 | }
|
186 | 187 |
|
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
pub(crate) enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using a BOM or by the XML declaration event (`<?xml encoding=... ?>`).
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by a BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`).
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or from the first bytes
    /// of the content. It can be refined by the XML declaration event
    /// (`<?xml encoding=... ?>`).
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change.
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the encoding carried by this reference, regardless of which
    /// variant holds it.
    #[inline]
    pub(crate) fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(encoding)
            | Self::Explicit(encoding)
            | Self::BomDetected(encoding)
            | Self::XmlDetected(encoding) => encoding,
        }
    }

    /// Returns `true` for the `Implicit` and `BomDetected` variants and `false`
    /// for the `Explicit` and `XmlDetected` variants.
    #[inline]
    pub(crate) fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
| 234 | + |
/// A reader that wraps another reader and keeps a decoder plus a buffer of
/// decoded bytes alongside it.
#[cfg(feature = "encoding")]
struct DecodingBufReader<R> {
    /// The wrapped reader that supplies the raw bytes.
    inner: R,
    /// Storage for the bytes produced by `decoder`.
    decoded_buffer: Vec<u8>,
    /// Cursor into `decoded_buffer`.
    current_pos: usize,

    /// The decoder applied to the bytes coming from `inner`.
    decoder: ExtDecoder,
    /// The encoding currently assumed, together with how it was determined.
    encoding: EncodingRef,
}
| 253 | + |
#[cfg(feature = "encoding")]
impl<R: BufRead> BufRead for DecodingBufReader<R> {
    /// Returns the decoded (UTF-8) bytes that have not been consumed yet.
    ///
    /// When every decoded byte has been consumed, more raw input is pulled from
    /// the inner reader and decoded. An empty slice is returned only once the
    /// inner reader has reached end of input.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors from the inner reader and returns an
    /// [`io::ErrorKind::InvalidData`] error when the input contains malformed
    /// byte sequences. The offending raw bytes are still consumed from the
    /// inner reader, and the decoded output for that chunk is discarded.
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        if self.current_pos >= self.decoded_buffer.len() {
            // Everything decoded so far has been handed out: start over.
            self.decoded_buffer.clear();
            self.current_pos = 0;

            loop {
                // `self.inner`, `self.decoder` and `self.decoded_buffer` are
                // borrowed as disjoint fields, so `raw` may stay alive while
                // the decoder writes into the output buffer.
                let raw = self.inner.fill_buf()?;
                let at_eof = raw.is_empty();
                let (read, malformed) =
                    Self::decode_into(&mut self.decoder, &mut self.decoded_buffer, raw, at_eof)?;
                self.inner.consume(read);

                if malformed {
                    self.decoded_buffer.clear();
                    return Err(Self::decoding_error());
                }
                // A chunk may end in the middle of a multi-byte sequence and
                // yield no output; keep reading, because an empty slice would
                // be taken as end of input by the caller.
                if at_eof || !self.decoded_buffer.is_empty() {
                    break;
                }
            }
        }

        Ok(&self.decoded_buffer[self.current_pos..])
    }

    /// Marks `amt` decoded bytes as consumed. The cursor never moves past the
    /// end of the decoded data.
    fn consume(&mut self, amt: usize) {
        self.current_pos = self
            .current_pos
            .saturating_add(amt)
            .min(self.decoded_buffer.len());
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> Read for DecodingBufReader<R> {
    /// Copies decoded (UTF-8) bytes into `buf` and returns how many bytes were
    /// copied. A multi-byte character may be split across two calls.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() {
            return Ok(0);
        }

        let available = self.fill_buf()?;
        let amount = available.len().min(buf.len());
        buf[..amount].copy_from_slice(&available[..amount]);
        self.consume(amount);
        Ok(amount)
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> DecodingBufReader<R> {
    /// Creates a reader that implicitly assumes UTF-8 input.
    fn new(inner: R) -> Self {
        DecodingBufReader {
            inner,
            decoded_buffer: Vec::new(),
            current_pos: 0,

            decoder: UTF_8.new_decoder(),
            encoding: EncodingRef::Implicit(UTF_8),
        }
    }

    /// Returns the raw, not yet decoded bytes buffered by the inner reader.
    fn get_raw_buffer(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Move unconsumed data to the front of the buffer and reset the cursor.
    fn shuffle(&mut self) {
        if self.current_pos == 0 {
            return;
        }

        // `consume` keeps the cursor within the decoded data; clamp anyway so
        // that `drain` can never be handed an out-of-range bound.
        let consumed = self.current_pos.min(self.decoded_buffer.len());
        self.decoded_buffer.drain(..consumed);
        self.current_pos = 0;
    }

    /// Drops consumed data and shrinks the buffer's capacity towards `size`.
    /// The capacity never drops below the amount of unconsumed data.
    fn shrink_buffer(&mut self, size: usize) {
        self.shuffle();
        self.decoded_buffer.shrink_to(size);
    }

    /// Explicitly selects `encoding` for all input decoded from now on.
    ///
    /// The decoder is replaced as well, so the new encoding actually takes
    /// effect. Bytes that were already decoded stay in the buffer unchanged.
    fn set_encoding(&mut self, encoding: &'static Encoding) {
        self.encoding = EncodingRef::Explicit(encoding);
        // An explicit encoding must not be overridden by a BOM, so a matching
        // BOM is only stripped and never used for sniffing.
        self.decoder = encoding.new_decoder_with_bom_removal();
    }

    /// Decodes `data` and appends the result to the decoded buffer.
    ///
    /// Returns the number of bytes of `data` that were consumed. Passing an
    /// empty slice signals end of input and flushes the decoder.
    ///
    /// # Errors
    ///
    /// Returns an [`io::ErrorKind::InvalidData`] error when `data` contains
    /// malformed sequences; nothing is appended to the buffer in that case.
    fn feed(&mut self, data: &[u8]) -> io::Result<usize> {
        let start = self.decoded_buffer.len();
        let (read, malformed) = Self::decode_into(
            &mut self.decoder,
            &mut self.decoded_buffer,
            data,
            data.is_empty(),
        )?;

        if malformed {
            self.decoded_buffer.truncate(start);
            return Err(Self::decoding_error());
        }
        Ok(read)
    }

    /// Runs `decoder` over `data`, appending the UTF-8 output to `out`.
    ///
    /// Returns the number of input bytes consumed and whether malformed
    /// sequences were encountered (those are replaced with U+FFFD in `out`).
    /// Takes the fields separately so that callers can pass a slice borrowed
    /// from the inner reader.
    fn decode_into(
        decoder: &mut ExtDecoder,
        out: &mut Vec<u8>,
        data: &[u8],
        last: bool,
    ) -> io::Result<(usize, bool)> {
        let mut consumed = 0;
        let mut malformed = false;

        loop {
            let pending = &data[consumed..];
            // Worst-case output size for the remaining input.
            let room = decoder.max_utf8_buffer_length(pending.len()).ok_or_else(|| {
                io::Error::new(io::ErrorKind::Other, "decoded output size overflows usize")
            })?;

            // The decoder writes into an initialized slice, so grow the length
            // (not just the capacity) and cut back to what was really written.
            let filled = out.len();
            out.resize(filled + room, 0);
            let (result, read, written, had_errors) =
                decoder.decode_to_utf8(pending, &mut out[filled..], last);
            out.truncate(filled + written);

            consumed += read;
            malformed |= had_errors;

            match result {
                CoderResult::InputEmpty => break,
                // Not expected with a worst-case sized buffer, but continuing
                // with the remaining input is always correct.
                CoderResult::OutputFull => continue,
            }
        }

        if last {
            // A decoder must not be used again after the final chunk; install
            // a fresh one so that repeated calls at end of input stay valid.
            *decoder = decoder.encoding().new_decoder_without_bom_handling();
        }

        Ok((consumed, malformed))
    }

    /// Builds the error reported when the input contains malformed sequences.
    fn decoding_error() -> io::Error {
        io::Error::new(io::ErrorKind::InvalidData, "Errors decoding")
    }
}
| 342 | + |
// Placeholder for this module's unit tests; intentionally empty for now.
#[cfg(test)]
mod tests {}
| 347 | + |
| 348 | + |
| 349 | + |
187 | 350 | // TODO: add some tests for functions
|
0 commit comments