From c1fe4641cf12eb5f5d5621662f744fe38e7102f9 Mon Sep 17 00:00:00 2001 From: Vladimir Stoiakin Date: Thu, 17 Nov 2022 17:30:48 +0300 Subject: [PATCH] Change parsing of a BOM to make it standard-compliant (#1152) --- src/stream.cpp | 423 ++++++++++++++--------------------------- src/stream.h | 51 ++--- src/streamcharsource.h | 29 ++- 3 files changed, 174 insertions(+), 329 deletions(-) diff --git a/src/stream.cpp b/src/stream.cpp index b1aa092f6..5bc6eed7f 100644 --- a/src/stream.cpp +++ b/src/stream.cpp @@ -1,153 +1,12 @@ -#include - #include "stream.h" #ifndef YAML_PREFETCH_SIZE #define YAML_PREFETCH_SIZE 2048 #endif -#define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A))) -#define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A)) - #define CP_REPLACEMENT_CHARACTER (0xFFFD) namespace YAML { -enum UtfIntroState { - uis_start, - uis_utfbe_b1, - uis_utf32be_b2, - uis_utf32be_bom3, - uis_utf32be, - uis_utf16be, - uis_utf16be_bom1, - uis_utfle_bom1, - uis_utf16le_bom2, - uis_utf32le_bom3, - uis_utf16le, - uis_utf32le, - uis_utf8_imp, - uis_utf16le_imp, - uis_utf32le_imp3, - uis_utf8_bom1, - uis_utf8_bom2, - uis_utf8, - uis_error -}; - -enum UtfIntroCharType { - uict00, - uictBB, - uictBF, - uictEF, - uictFE, - uictFF, - uictAscii, - uictOther, - uictMax -}; - -static bool s_introFinalState[] = { - false, // uis_start - false, // uis_utfbe_b1 - false, // uis_utf32be_b2 - false, // uis_utf32be_bom3 - true, // uis_utf32be - true, // uis_utf16be - false, // uis_utf16be_bom1 - false, // uis_utfle_bom1 - false, // uis_utf16le_bom2 - false, // uis_utf32le_bom3 - true, // uis_utf16le - true, // uis_utf32le - false, // uis_utf8_imp - false, // uis_utf16le_imp - false, // uis_utf32le_imp3 - false, // uis_utf8_bom1 - false, // uis_utf8_bom2 - true, // uis_utf8 - true, // uis_error -}; - -static UtfIntroState s_introTransitions[][uictMax] = { - // uict00, uictBB, uictBF, uictEF, - // uictFE, uictFF, uictAscii, uictOther - {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, - uis_utfle_bom1, uis_utf8_imp, uis_utf8}, - {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf16be, uis_utf8}, - {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, - uis_utf8, uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, - uis_utf8}, - {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, - uis_utf32be, uis_utf32be, uis_utf32be}, - {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, - uis_utf16be, uis_utf16be, uis_utf16be}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, - uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, - uis_utf8, uis_utf8}, - {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, - uis_utf32le, uis_utf32le, uis_utf32le}, - {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8, uis_utf8}, - {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, - uis_utf16le, uis_utf16le, uis_utf16le}, - {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8}, - {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, - uis_utf8}, -}; - -static char s_introUngetCount[][uictMax] = { - // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther - {0, 1, 1, 0, 0, 0, 0, 1}, {0, 2, 2, 2, 2, 2, 2, 2}, - {3, 3, 3, 3, 0, 3, 3, 3}, {4, 4, 4, 4, 4, 0, 4, 4}, - {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1}, - {2, 2, 2, 2, 2, 0, 2, 2}, {2, 2, 2, 2, 0, 2, 2, 2}, - {0, 1, 1, 1, 1, 1, 1, 1}, {0, 2, 2, 2, 2, 2, 2, 2}, - {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1}, - {0, 2, 2, 2, 2, 2, 2, 2}, {0, 3, 3, 3, 3, 3, 3, 3}, - {4, 4, 4, 4, 4, 4, 4, 4}, {2, 0, 2, 2, 2, 2, 2, 2}, - {3, 3, 0, 3, 3, 3, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1}, -}; - -inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) { - if (std::istream::traits_type::eof() == ch) { - return uictOther; - } - - switch (ch) { - case 0: - return uict00; - case 0xBB: - return uictBB; - case 0xBF: - return uictBF; - case 0xEF: - return uictEF; - case 0xFE: - return uictFE; - case 0xFF: - return uictFF; - } - - if ((ch > 0) && (ch < 0xFF)) { - return uictAscii; - } - - return uictOther; -} inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift) { @@ -182,96 +41,116 @@ inline void QueueUnicodeCodepoint(std::deque& q, unsigned long ch) { } } -Stream::Stream(std::istream& input) - : m_input(input), - m_mark{}, - m_charSet{}, - m_readahead{}, - m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]), - m_nPrefetchedAvailable(0), - m_nPrefetchedUsed(0) { - using char_traits = std::istream::traits_type; - - if (!input) - return; +// Determine (or guess) the character-set by reading the BOM, if any. +// See the YAML specification for the determination algorithm. +// Returns the size of detected BOM +uint_fast8_t Stream::CheckBOM(const uint8_t* buffer, std::size_t size) { + if (size >= 4) { + if (buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && + buffer[3] == 0xFF) { + m_charSet = utf32be; + return 4; + } + if (buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0x00) { + m_charSet = utf32be; + return 0; + } - // Determine (or guess) the character-set by reading the BOM, if any. See - // the YAML specification for the determination algorithm. - char_traits::int_type intro[4]{}; - int nIntroUsed = 0; - UtfIntroState state = uis_start; - for (; !s_introFinalState[state];) { - std::istream::int_type ch = input.get(); - intro[nIntroUsed++] = ch; - UtfIntroCharType charType = IntroCharTypeOf(ch); - UtfIntroState newState = s_introTransitions[state][charType]; - int nUngets = s_introUngetCount[state][charType]; - if (nUngets > 0) { - input.clear(); - for (; nUngets > 0; --nUngets) { - if (char_traits::eof() != intro[--nIntroUsed]) - input.putback(char_traits::to_char_type(intro[nIntroUsed])); - } + if (buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && + buffer[3] == 0x00) { + m_charSet = utf32le; + return 4; + } + if (buffer[1] == 0x00 && buffer[2] == 0x00 && buffer[3] == 0x00) { + m_charSet = utf32le; + return 0; } - state = newState; } - switch (state) { - case uis_utf8: - m_charSet = utf8; - break; - case uis_utf16le: - m_charSet = utf16le; - break; - case uis_utf16be: + if (size >= 2) { + if (buffer[0] == 0xFE && buffer[1] == 0xFF) { m_charSet = utf16be; - break; - case uis_utf32le: - m_charSet = utf32le; - break; - case uis_utf32be: - m_charSet = utf32be; - break; - default: + return 2; + } + if (buffer[0] == 0x00) { + m_charSet = utf16be; + return 0; + } + + if (buffer[0] == 0xFF && buffer[1] == 0xFE) { + m_charSet = utf16le; + return 2; + } + if (buffer[1] == 0x00) { + m_charSet = utf16le; + return 0; + } + } + + if (size >= 3) { + if (buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) { m_charSet = utf8; - break; + return 3; + } } - ReadAheadTo(0); + m_charSet = utf8; + return 0; } -Stream::~Stream() { delete[] m_pPrefetched; } +Stream::Stream(std::istream& input) + : m_input(input), + m_pPrefetched(new uint8_t[YAML_PREFETCH_SIZE]), + m_nPrefetchedAvailable(0), + m_nPrefetchedUsed(0), + m_readahead{}, + m_mark{}, + m_charSet(utf8) { -char Stream::peek() const { - if (m_readahead.empty()) { - return Stream::eof(); - } + if (!input.good()) + return; - return m_readahead[0]; + input.read(reinterpret_cast(m_pPrefetched), YAML_PREFETCH_SIZE); + m_nPrefetchedAvailable = input.gcount(); + m_nPrefetchedUsed = CheckBOM(m_pPrefetched, m_nPrefetchedAvailable); } -Stream::operator bool() const { - return m_input.good() || - (!m_readahead.empty() && m_readahead[0] != Stream::eof()); +Stream::~Stream() { delete[] m_pPrefetched; } + +bool Stream::isEmpty() const { + return m_readahead.empty() && m_nPrefetchedUsed >= m_nPrefetchedAvailable && + !m_input.good(); } -// get -// . Extracts a character from the stream and updates our position -char Stream::get() { - char ch = peek(); - AdvanceCurrent(); - m_mark.column++; +Stream::operator bool() const { return !isEmpty(); } - if (ch == '\n') { - m_mark.column = 0; - m_mark.line++; +char Stream::peek(std::size_t i) const { + if (prepare(i)) { + return m_readahead[i]; + } else { + return Stream::eof(); } +} - return ch; +// Extracts a character from the stream and updates our position +char Stream::get() { + if (prepare(0)) { + char c = m_readahead.front(); + m_readahead.pop_front(); + m_mark.pos++; + if (c == '\n') { + m_mark.column = 0; + m_mark.line++; + } else { + m_mark.column++; + } + return c; + } else { + return Stream::eof(); + } } -// get -// . Extracts 'n' characters from the stream and updates our position +// Extracts 'n' characters from the stream and updates our position std::string Stream::get(int n) { std::string ret; if (n > 0) { @@ -282,74 +161,85 @@ std::string Stream::get(int n) { return ret; } -// eat -// . Eats 'n' characters and updates our position. +// Eats 'n' characters and updates our position. void Stream::eat(int n) { for (int i = 0; i < n; i++) get(); } -void Stream::AdvanceCurrent() { - if (!m_readahead.empty()) { - m_readahead.pop_front(); - m_mark.pos++; - } - - ReadAheadTo(0); -} - -bool Stream::_ReadAheadTo(size_t i) const { - while (m_input.good() && (m_readahead.size() <= i)) { +bool Stream::prepare(std::size_t i) const { + while (m_readahead.size() <= i) { + bool resume; switch (m_charSet) { case utf8: - StreamInUtf8(); + resume = StreamInUtf8(); break; case utf16le: - StreamInUtf16(); + resume = StreamInUtf16(); break; case utf16be: - StreamInUtf16(); + resume = StreamInUtf16(); break; case utf32le: - StreamInUtf32(); + resume = StreamInUtf32(); break; case utf32be: - StreamInUtf32(); + resume = StreamInUtf32(); break; } + if (!resume) { + break; + } } - // signal end of stream - if (!m_input.good()) - m_readahead.push_back(Stream::eof()); - return m_readahead.size() > i; } -void Stream::StreamInUtf8() const { - unsigned char b = GetNextByte(); - if (m_input.good()) { - m_readahead.push_back(static_cast(b)); +bool Stream::GetNextByte(uint8_t& byte) const { + if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) { + if (m_input.good()) { + m_input.read(reinterpret_cast(m_pPrefetched), YAML_PREFETCH_SIZE); + m_nPrefetchedAvailable = m_input.gcount(); + m_nPrefetchedUsed = 0; + } else { + m_nPrefetchedAvailable = 0; + } + } + + if (m_nPrefetchedAvailable != 0) { + byte = m_pPrefetched[m_nPrefetchedUsed++]; + return true; + } else { + return false; } } -void Stream::StreamInUtf16() const { +bool Stream::StreamInUtf8() const { + uint8_t byte; + if (GetNextByte(byte)) { + m_readahead.push_back(static_cast(byte)); + return true; + } else { + return false; + } +} + +bool Stream::StreamInUtf16() const { unsigned long ch = 0; - unsigned char bytes[2]; + uint8_t bytes[2]; int nBigEnd = (m_charSet == utf16be) ? 0 : 1; - bytes[0] = GetNextByte(); - bytes[1] = GetNextByte(); - if (!m_input.good()) { - return; + if (!GetNextByte(bytes[0]) || !GetNextByte(bytes[1])) { + return false; } + ch = (static_cast(bytes[nBigEnd]) << 8) | static_cast(bytes[1 ^ nBigEnd]); if (ch >= 0xDC00 && ch < 0xE000) { // Trailing (low) surrogate...ugh, wrong order QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); - return; + return true; } if (ch >= 0xD800 && ch < 0xDC00) { @@ -359,11 +249,8 @@ void Stream::StreamInUtf16() const { // Read the trailing (low) surrogate for (;;) { - bytes[0] = GetNextByte(); - bytes[1] = GetNextByte(); - if (!m_input.good()) { - QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); - return; + if (!GetNextByte(bytes[0]) || !GetNextByte(bytes[1])) { + return false; } unsigned long chLow = (static_cast(bytes[nBigEnd]) << 8) | static_cast(bytes[1 ^ nBigEnd]); @@ -376,7 +263,7 @@ void Stream::StreamInUtf16() const { if (chLow < 0xD800 || chLow >= 0xE000) { // Easiest case: queue the codepoint and return QueueUnicodeCodepoint(m_readahead, ch); - return; + return true; } // Start the loop over with the new high surrogate ch = chLow; @@ -397,43 +284,19 @@ void Stream::StreamInUtf16() const { } QueueUnicodeCodepoint(m_readahead, ch); + return true; } -inline char* ReadBuffer(unsigned char* pBuffer) { - return reinterpret_cast(pBuffer); -} - -unsigned char Stream::GetNextByte() const { - if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) { - std::streambuf* pBuf = m_input.rdbuf(); - m_nPrefetchedAvailable = static_cast( - pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE)); - m_nPrefetchedUsed = 0; - if (!m_nPrefetchedAvailable) { - m_input.setstate(std::ios_base::eofbit); - } - - if (0 == m_nPrefetchedAvailable) { - return 0; - } - } - - return m_pPrefetched[m_nPrefetchedUsed++]; -} - -void Stream::StreamInUtf32() const { +bool Stream::StreamInUtf32() const { static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}}; unsigned long ch = 0; - unsigned char bytes[4]; + uint8_t bytes[4]; int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0]; - bytes[0] = GetNextByte(); - bytes[1] = GetNextByte(); - bytes[2] = GetNextByte(); - bytes[3] = GetNextByte(); - if (!m_input.good()) { - return; + if (!GetNextByte(bytes[0]) || !GetNextByte(bytes[1]) || + !GetNextByte(bytes[2]) || !GetNextByte(bytes[3])) { + return false; } for (int i = 0; i < 4; ++i) { @@ -442,5 +305,7 @@ void Stream::StreamInUtf32() const { } QueueUnicodeCodepoint(m_readahead, ch); + return true; } + } // namespace YAML diff --git a/src/stream.h b/src/stream.h index 2bc7a1521..a442daece 100644 --- a/src/stream.h +++ b/src/stream.h @@ -9,20 +9,16 @@ #include "yaml-cpp/mark.h" #include +#include #include -#include #include -#include #include namespace YAML { -class StreamCharSource; - +// Converts arbitrary UTF-* encoding on input to UTF-8 class Stream { public: - friend class StreamCharSource; - Stream(std::istream& input); Stream(const Stream&) = delete; Stream(Stream&&) = delete; @@ -30,15 +26,15 @@ class Stream { Stream& operator=(Stream&&) = delete; ~Stream(); - operator bool() const; - bool operator!() const { return !static_cast(*this); } + static char eof() { return 0x04; } - char peek() const; + char peek(std::size_t i = 0) const; char get(); std::string get(int n); void eat(int n = 1); - static char eof() { return 0x04; } + bool isEmpty() const; + operator bool() const; const Mark mark() const { return m_mark; } int pos() const { return m_mark.pos; } @@ -47,36 +43,25 @@ class Stream { void ResetColumn() { m_mark.column = 0; } private: - enum CharacterSet { utf8, utf16le, utf16be, utf32le, utf32be }; + uint_fast8_t CheckBOM(const uint8_t* buffer, std::size_t size); + bool prepare(std::size_t i) const; + bool StreamInUtf8() const; + bool StreamInUtf16() const; + bool StreamInUtf32() const; + bool GetNextByte(uint8_t& byte) const; + private: std::istream& m_input; - Mark m_mark; + uint8_t* const m_pPrefetched; + mutable std::size_t m_nPrefetchedAvailable; + mutable std::size_t m_nPrefetchedUsed; - CharacterSet m_charSet; mutable std::deque m_readahead; - unsigned char* const m_pPrefetched; - mutable size_t m_nPrefetchedAvailable; - mutable size_t m_nPrefetchedUsed; + Mark m_mark; - void AdvanceCurrent(); - char CharAt(size_t i) const; - bool ReadAheadTo(size_t i) const; - bool _ReadAheadTo(size_t i) const; - void StreamInUtf8() const; - void StreamInUtf16() const; - void StreamInUtf32() const; - unsigned char GetNextByte() const; + enum { utf8, utf16le, utf16be, utf32le, utf32be } m_charSet; }; -// CharAt -// . Unchecked access -inline char Stream::CharAt(size_t i) const { return m_readahead[i]; } - -inline bool Stream::ReadAheadTo(size_t i) const { - if (m_readahead.size() > i) - return true; - return _ReadAheadTo(i); -} } // namespace YAML #endif // STREAM_H_62B23520_7C8E_11DE_8A39_0800200C9A66 diff --git a/src/streamcharsource.h b/src/streamcharsource.h index 826ba5347..d50c5fcbc 100644 --- a/src/streamcharsource.h +++ b/src/streamcharsource.h @@ -7,8 +7,8 @@ #pragma once #endif -#include "yaml-cpp/noexcept.h" #include "stream.h" +#include "yaml-cpp/noexcept.h" #include namespace YAML { @@ -22,29 +22,24 @@ class StreamCharSource { StreamCharSource& operator=(StreamCharSource&&) = delete; ~StreamCharSource() = default; - operator bool() const; - char operator[](std::size_t i) const { return m_stream.CharAt(m_offset + i); } - bool operator!() const { return !static_cast(*this); } + operator bool() const { return true; } + + char operator[](std::size_t i) const { return m_stream.peek(m_offset + i); } - const StreamCharSource operator+(int i) const; + const StreamCharSource operator+(int i) const { + StreamCharSource source(*this); + if (static_cast(source.m_offset) + i >= 0) + source.m_offset += static_cast(i); + else + source.m_offset = 0; + return source; + } private: std::size_t m_offset; const Stream& m_stream; }; -inline StreamCharSource::operator bool() const { - return m_stream.ReadAheadTo(m_offset); -} - -inline const StreamCharSource StreamCharSource::operator+(int i) const { - StreamCharSource source(*this); - if (static_cast(source.m_offset) + i >= 0) - source.m_offset += static_cast(i); - else - source.m_offset = 0; - return source; -} } // namespace YAML #endif // STREAMCHARSOURCE_H_62B23520_7C8E_11DE_8A39_0800200C9A66