Skip to content

Commit b16c52e

Browse files
committed
temp
1 parent 6d883b5 commit b16c52e

File tree

2 files changed

+166
-50
lines changed

2 files changed

+166
-50
lines changed

src/encoding.rs

+164-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
//! A module for wrappers that encode / decode data.
22
33
use std::borrow::Cow;
4+
use std::io::{self, BufRead, Read};
45

56
#[cfg(feature = "encoding")]
6-
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
7+
use encoding_rs::{Decoder as ExtDecoder, Encoding, UTF_16BE, UTF_16LE, UTF_8, CoderResult};
78

89
use crate::{Error, Result};
910

@@ -184,4 +185,166 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
184185
}
185186
}
186187

188+
/// An encoding paired with a record of how that encoding was chosen.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
pub(crate) enum EncodingRef {
    /// The encoding is only an assumption. It may still be replaced by a BOM
    /// or by the XML declaration event (`<?xml encoding=... ?>`).
    Implicit(&'static Encoding),
    /// The encoding was set explicitly. Neither a BOM nor the XML declaration
    /// (`<?xml encoding=... ?>`) may replace it.
    Explicit(&'static Encoding),
    /// The encoding was detected from a byte order mark (BOM) or from the first
    /// bytes of the content. The XML declaration event (`<?xml encoding=... ?>`)
    /// may still replace it.
    BomDetected(&'static Encoding),
    /// The encoding was taken from the XML declaration event
    /// (`<?xml encoding=... ?>`). It can no longer change.
    XmlDetected(&'static Encoding),
}

#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    pub(crate) fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(encoding)
            | Self::Explicit(encoding)
            | Self::BomDetected(encoding)
            | Self::XmlDetected(encoding) => encoding,
        }
    }

    /// Returns `true` while the encoding may still be replaced by a more
    /// authoritative source, i.e. for the `Implicit` and `BomDetected` states.
    #[inline]
    pub(crate) fn can_be_refined(&self) -> bool {
        matches!(self, Self::Implicit(_) | Self::BomDetected(_))
    }
}
234+
235+
/// A buffered reader that transcodes the bytes of an inner reader to UTF-8.
///
/// Raw bytes are pulled from `inner`, run through an `encoding_rs` decoder and
/// the resulting UTF-8 bytes are handed out through the [`BufRead`] and
/// [`Read`] implementations.
#[cfg(feature = "encoding")]
struct DecodingBufReader<R> {
    /// Source of the raw (still encoded) bytes.
    inner: R,
    /// Decoded UTF-8 bytes. `decoded_buffer[current_pos..]` is the part that
    /// has not been consumed yet.
    decoded_buffer: Vec<u8>,
    /// Read cursor into `decoded_buffer`; invariant: `current_pos <= decoded_buffer.len()`.
    current_pos: usize,
    /// Set once the decoder has been flushed with `last = true`. A decoder
    /// must not be used any more after the end of its stream.
    finished: bool,

    /// Stateful decoder that converts the input to UTF-8.
    decoder: ExtDecoder,
    /// The encoding in use, together with how it was chosen.
    encoding: EncodingRef,
}

#[cfg(feature = "encoding")]
impl<R: BufRead> BufRead for DecodingBufReader<R> {
    /// Returns the decoded bytes that have not been consumed yet, decoding more
    /// input from the inner reader only when none are pending.
    ///
    /// An empty slice is returned only at the end of the input.
    ///
    /// # Errors
    ///
    /// Propagates errors of the inner reader and fails with "Errors decoding"
    /// when the input contains malformed sequences. In the latter case the
    /// offending input has already been consumed and the lossy output
    /// (malformed sequences replaced by U+FFFD) stays buffered.
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        if self.current_pos >= self.decoded_buffer.len() {
            // Everything was consumed: start over with an empty buffer.
            self.decoded_buffer.clear();
            self.current_pos = 0;

            // A chunk may decode to nothing (e.g. a lone BOM or the first half
            // of a multi-byte sequence). Keep going, because an empty slice
            // would be read as end-of-file by the caller.
            while self.decoded_buffer.is_empty() && !self.finished {
                let data = self.inner.fill_buf()?;
                // An empty chunk means the inner reader is exhausted.
                let last = data.is_empty();
                let (read, had_errors) =
                    decode_chunk(&mut self.decoder, &mut self.decoded_buffer, data, last)?;
                // Consume before reporting errors so that the same bytes are
                // never pushed through the stateful decoder twice.
                self.inner.consume(read);
                self.finished = last;
                if had_errors {
                    return Err(decoding_error());
                }
            }
        }

        Ok(&self.decoded_buffer[self.current_pos..])
    }

    /// Marks `amt` decoded bytes as read. The cursor is clamped to the amount
    /// of decoded data that is actually available.
    fn consume(&mut self, amt: usize) {
        self.current_pos = self
            .current_pos
            .saturating_add(amt)
            .min(self.decoded_buffer.len());
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> Read for DecodingBufReader<R> {
    /// Copies decoded (UTF-8) bytes into `buf` and returns how many were copied.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() {
            return Ok(0);
        }

        let available = self.fill_buf()?;
        let amount = available.len().min(buf.len());
        buf[..amount].copy_from_slice(&available[..amount]);
        self.consume(amount);
        Ok(amount)
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> DecodingBufReader<R> {
    /// Creates a reader that assumes UTF-8 input until told otherwise.
    fn new(inner: R) -> Self {
        DecodingBufReader {
            inner,
            decoded_buffer: Vec::new(),
            current_pos: 0,
            finished: false,

            decoder: UTF_8.new_decoder(),
            encoding: EncodingRef::Implicit(UTF_8),
        }
    }

    /// Gives access to the raw, not yet decoded bytes of the inner reader.
    fn get_raw_buffer(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Move unconsumed data to the front of the buffer and reset the read cursor
    fn shuffle(&mut self) {
        if self.current_pos == 0 {
            return;
        }

        // Drop the consumed prefix; the remaining bytes shift to the front.
        self.decoded_buffer.drain(..self.current_pos);
        self.current_pos = 0;
    }

    /// Discards consumed data and shrinks the buffer capacity towards `size`
    /// (never below what the unconsumed data needs).
    fn shrink_buffer(&mut self, size: usize) {
        self.shuffle();
        self.decoded_buffer.shrink_to(size);
    }

    /// Explicitly selects the encoding used for all input decoded from now on.
    ///
    /// The decoder is replaced as well, otherwise the input would still be
    /// interpreted using the previous encoding. The new decoder strips a BOM
    /// of `encoding` but never switches encoding because of a BOM, matching
    /// the contract of [`EncodingRef::Explicit`].
    ///
    /// NOTE(review): bytes decoded earlier stay in the buffer unchanged and
    /// any partially decoded sequence held by the old decoder is dropped, so
    /// this is presumably meant to be called before reading starts.
    fn set_encoding(&mut self, encoding: &'static Encoding) {
        self.encoding = EncodingRef::Explicit(encoding);
        self.decoder = encoding.new_decoder_with_bom_removal();
        self.finished = false;
    }

    /// Decodes `data` and appends the result to the decoded buffer.
    ///
    /// An empty `data` signals the end of the input and flushes the decoder.
    /// Returns the number of bytes of `data` that were processed.
    ///
    /// # Errors
    ///
    /// Fails with "Errors decoding" if `data` contains malformed sequences and
    /// with "Decoder already finished" if more data is fed after the end of
    /// the input was signalled.
    fn feed(&mut self, data: &[u8]) -> io::Result<usize> {
        let last = data.is_empty();
        if self.finished {
            // The decoder must not be touched again after it was flushed.
            return if last {
                Ok(0)
            } else {
                Err(io::Error::new(
                    io::ErrorKind::Other,
                    "Decoder already finished",
                ))
            };
        }

        let (read, had_errors) =
            decode_chunk(&mut self.decoder, &mut self.decoded_buffer, data, last)?;
        self.finished = last;

        if had_errors {
            Err(decoding_error())
        } else {
            Ok(read)
        }
    }
}

/// Decodes `data` with `decoder` and appends the UTF-8 output to `out`.
///
/// Returns the number of input bytes read and whether malformed sequences
/// were encountered (those are replaced by U+FFFD in the output).
#[cfg(feature = "encoding")]
fn decode_chunk(
    decoder: &mut ExtDecoder,
    out: &mut Vec<u8>,
    data: &[u8],
    last: bool,
) -> io::Result<(usize, bool)> {
    // The decoder writes into an initialized slice, so make room for the
    // worst case up front and cut the unused tail off afterwards.
    let worst_case = decoder
        .max_utf8_buffer_length(data.len())
        .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "Decoded data is too large"))?;
    let start = out.len();
    out.resize(start + worst_case, 0);

    let (result, read, written, had_errors) =
        decoder.decode_to_utf8(data, &mut out[start..], last);
    out.truncate(start + written);

    match result {
        // We have consumed the current input buffer.
        CoderResult::InputEmpty => Ok((read, had_errors)),
        CoderResult::OutputFull => unreachable!("This shouldn't happen, we reserved space"),
    }
}

/// The error reported when the input contains malformed byte sequences.
#[cfg(feature = "encoding")]
fn decoding_error() -> io::Error {
    io::Error::new(io::ErrorKind::Other, "Errors decoding")
}
342+
343+
#[cfg(test)]
344+
mod tests {
// Placeholder: this module does not contain any tests yet.
345+
346+
}
347+
348+
349+
187350
// TODO: add some tests for functions

src/reader/mod.rs

+2-49
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
use std::str::from_utf8;
44

55
#[cfg(feature = "encoding")]
6-
use encoding_rs::{Encoding, UTF_8};
6+
use encoding_rs::UTF_8;
77

88
#[cfg(feature = "encoding")]
9-
use crate::encoding::detect_encoding;
9+
use crate::encoding::{detect_encoding, EncodingRef};
1010
use crate::encoding::Decoder;
1111
use crate::errors::{Error, Result};
1212
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
@@ -179,53 +179,6 @@ enum TagState {
179179
Exit,
180180
}
181181

182-
/// A reference to an encoding together with information about how it was retrieved.
183-
///
184-
/// The state transition diagram:
185-
///
186-
/// ```mermaid
187-
/// flowchart LR
188-
/// Implicit -- from_str --> Explicit
189-
/// Implicit -- BOM --> BomDetected
190-
/// Implicit -- "encoding=..." --> XmlDetected
191-
/// BomDetected -- "encoding=..." --> XmlDetected
192-
/// ```
193-
#[cfg(feature = "encoding")]
194-
#[derive(Clone, Copy)]
195-
enum EncodingRef {
196-
/// Encoding was implicitly assumed to have a specified value. It can be refined
197-
/// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
198-
Implicit(&'static Encoding),
199-
/// Encoding was explicitly set to the desired value. It cannot be changed
200-
/// nor by BOM, nor by parsing XML declaration (`<?xml encoding=... ?>`)
201-
Explicit(&'static Encoding),
202-
/// Encoding was detected from a byte order mark (BOM) or by the first bytes
203-
/// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
204-
BomDetected(&'static Encoding),
205-
/// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
206-
/// It can no longer change
207-
XmlDetected(&'static Encoding),
208-
}
209-
#[cfg(feature = "encoding")]
210-
impl EncodingRef {
211-
#[inline]
212-
fn encoding(&self) -> &'static Encoding {
213-
match self {
214-
Self::Implicit(e) => e,
215-
Self::Explicit(e) => e,
216-
Self::BomDetected(e) => e,
217-
Self::XmlDetected(e) => e,
218-
}
219-
}
220-
#[inline]
221-
fn can_be_refined(&self) -> bool {
222-
match self {
223-
Self::Implicit(_) | Self::BomDetected(_) => true,
224-
Self::Explicit(_) | Self::XmlDetected(_) => false,
225-
}
226-
}
227-
}
228-
229182
////////////////////////////////////////////////////////////////////////////////////////////////////
230183

231184
/// A low level encoding-agnostic XML event reader.

0 commit comments

Comments
 (0)