diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt index d3fb58ed0c71c..0828e4057c172 100644 --- a/libc/src/__support/wchar/CMakeLists.txt +++ b/libc/src/__support/wchar/CMakeLists.txt @@ -6,6 +6,19 @@ add_header_library( libc.hdr.types.char32_t ) +add_header_library( + string_converter + HDRS + string_converter.h + DEPENDS + libc.hdr.types.char8_t + libc.hdr.types.char32_t + libc.hdr.types.size_t + libc.src.__support.error_or + .mbstate + .character_converter +) + add_object_library( character_converter HDRS diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 3cacfa5689e4d..8d7c4183c5998 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -14,6 +14,7 @@ #include "src/__support/error_or.h" #include "src/__support/math_extras.h" #include "src/__support/wchar/mbstate.h" +#include #include "character_converter.h" @@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) { state->bytes_stored++; return 0; } + // Invalid byte -> reset the state clear(); return EILSEQ; @@ -130,6 +132,12 @@ ErrorOr CharacterConverter::pop_utf32() { return utf32; } +size_t CharacterConverter::sizeAsUTF32() { + return 1; // one UTF-32 value always holds an entire character +} + +size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; } + ErrorOr CharacterConverter::pop_utf8() { if (isEmpty()) return Error(-1); @@ -156,6 +164,9 @@ ErrorOr CharacterConverter::pop_utf8() { } state->bytes_stored--; + if (state->bytes_stored == 0) + clear(); + return static_cast(output); } diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index d9a63fdc0522c..9e8dd71028002 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -14,6 +14,7 @@ #include "src/__support/common.h" #include 
"src/__support/error_or.h" #include "src/__support/wchar/mbstate.h" +#include namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -30,6 +31,9 @@ class CharacterConverter { bool isEmpty(); bool isValidState(); + size_t sizeAsUTF32(); + size_t sizeAsUTF8(); + int push(char8_t utf8_byte); int push(char32_t utf32); diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h new file mode 100644 index 0000000000000..8c0417a3b7df4 --- /dev/null +++ b/libc/src/__support/wchar/string_converter.h @@ -0,0 +1,113 @@ +//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H +#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H + +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "hdr/types/size_t.h" +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/wchar/character_converter.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +template class StringConverter { +private: + CharacterConverter cr; + const T *src; + size_t src_len; + size_t src_idx; + size_t num_pushed; + size_t num_to_write; + + int pushFullCharacter() { + for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len; + ++num_pushed) { + int err = cr.push(src[src_idx + num_pushed]); + if (err != 0) + return err; + } + + // if we aren't able to read a full character from the source string + if (src_idx + num_pushed == src_len && !cr.isFull()) { + src_idx += num_pushed; + return -1; + } + + return 0; + } + +public: + StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate 
*ps) + : cr(ps), src(s), src_len(srclen), src_idx(0), num_pushed(0), + num_to_write(dstlen) { + pushFullCharacter(); + } + + StringConverter(const T *s, size_t dstlen, mbstate *ps) + : StringConverter(s, SIZE_MAX, dstlen, ps) {} + + ErrorOr popUTF32() { + if (cr.isEmpty()) { + int err = pushFullCharacter(); + if (err != 0) + return Error(err); + + if (cr.sizeAsUTF32() > num_to_write) { + cr.clear(); + return Error(-1); + } + } + + auto out = cr.pop_utf32(); + if (cr.isEmpty()) + src_idx += num_pushed; + + if (out.has_value() && out.value() == L'\0') + src_len = src_idx; + + num_to_write--; + + return out; + } + + ErrorOr popUTF8() { + if (cr.isEmpty()) { + int err = pushFullCharacter(); + if (err != 0) + return Error(err); + + if (cr.sizeAsUTF8() > num_to_write) { + cr.clear(); + return Error(-1); + } + } + + auto out = cr.pop_utf8(); + if (cr.isEmpty()) + src_idx += num_pushed; + + if (out.has_value() && out.value() == '\0') + src_len = src_idx; + + num_to_write--; + + return out; + } + + size_t getSourceIndex() { return src_idx; } +}; + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt index 5176bfd4b024b..6982232d67544 100644 --- a/libc/test/src/__support/wchar/CMakeLists.txt +++ b/libc/test/src/__support/wchar/CMakeLists.txt @@ -19,3 +19,17 @@ add_libc_test( DEPENDS libc.src.__support.wchar.character_converter ) + +add_libc_test( + string_converter_test.cpp + SUITE + libc-support-tests + SRCS + string_converter_test.cpp + DEPENDS + libc.src.__support.wchar.string_converter + libc.src.__support.wchar.mbstate + libc.src.__support.error_or + libc.hdr.errno_macros + libc.hdr.types.char32_t +) diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp new file mode 100644 index 0000000000000..cb908e2fec2a8 --- /dev/null +++ 
b/libc/test/src/__support/wchar/string_converter_test.cpp @@ -0,0 +1,308 @@ +//===-- Unittests for StringConverter class -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/errno_macros.h" +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "src/__support/error_or.h" +#include "src/__support/wchar/mbstate.h" +#include "src/__support/wchar/string_converter.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStringConverterTest, UTF8To32) { + // first 4 bytes are clown emoji (🤡), then next 3 are sigma symbol (∑) + const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91"; + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), SIZE_MAX, &state); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x2211); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 7); + + res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 8); + + res = sc.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(res.error(), -1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 8); +} + +TEST(LlvmLibcStringConverterTest, UTF32To8) { + const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), SIZE_MAX, &state); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + 
ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xE2); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x88); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x91); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 2); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 3); + + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(res.error(), -1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 3); +} + +TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) { + const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), 1, SIZE_MAX, &state); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + 
ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(res.error(), -1); +} + +TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) { + // first 4 bytes are clown emoji, then next 3 are sigma symbol + const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91"; + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), 5, SIZE_MAX, &state); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), -1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 5); +} + +TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) { + const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32 + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), SIZE_MAX, &state); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), EILSEQ); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); +} + +TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) { + // first 4 bytes are clown emoji (🤡) + // next 3 form 
an invalid character + const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30"; + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), SIZE_MAX, &state); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), EILSEQ); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); +} + +TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) { + /* + We do NOT test partially popping a character and expecting the next + StringConverter to continue where we left off. This is not expected to work + and is considered invalid. + */ + const wchar_t *src = L"\x1f921\xff"; // clown emoji, U+00FF (UTF-8: 0xC3 0xBF) + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc1( + reinterpret_cast(src), 1, SIZE_MAX, &state); + + auto res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xF0); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 0); + + res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x9F); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 0); + + res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA4); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 0); + + res = sc1.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xA1); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 1); + + LIBC_NAMESPACE::internal::StringConverter sc2( + reinterpret_cast(src) + sc1.getSourceIndex(), 1, + SIZE_MAX, &state); + + res = sc2.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xC3); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 0); + + res = sc2.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0xBF); + 
ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); +} + +TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) { + const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc1( + reinterpret_cast(src), 2, SIZE_MAX, &state); + + auto res = sc1.popUTF32(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(res.error()), -1); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 2); + + LIBC_NAMESPACE::internal::StringConverter sc2( + reinterpret_cast(src) + sc1.getSourceIndex(), 3, + SIZE_MAX, &state); + + res = sc2.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0x1f921); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 2); + + res = sc2.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(res.value()), 0); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 3); +} + +TEST(LlvmLibcStringConverterTest, DstLimitUTF8To32) { + const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), SIZE_MAX, 1, &state); + + auto res = sc.popUTF32(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); + + res = sc.popUTF32(); // no space to pop this into + ASSERT_FALSE(res.has_value()); +} + +TEST(LlvmLibcStringConverterTest, DstLimitUTF32To8) { + const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis + LIBC_NAMESPACE::internal::mbstate state; + LIBC_NAMESPACE::internal::StringConverter sc( + reinterpret_cast(src), SIZE_MAX, 5, &state); + + auto res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 0); + + res = sc.popUTF8(); + 
ASSERT_TRUE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); + + res = sc.popUTF8(); + ASSERT_FALSE(res.has_value()); + ASSERT_EQ(static_cast(sc.getSourceIndex()), 1); +}