// Review notes for the patch below (the patch text itself is unchanged).
//
// Purpose: on non-Windows builds the std::wstring_convert based bodies of
// wstring_to_string() and string_to_wstring() are removed and replaced with
// hand-written UTF-8 encode / decode loops. The _WIN32 branches, which call
// WideCharToMultiByte / MultiByteToWideChar with CP_UTF8, appear only as
// context lines and are not modified.
//
// Structural changes made by the patch:
//   - Both functions change from ov::util:: qualified definitions to
//     unqualified definitions inside a new namespace ov::util { ... } block.
//   - The OPENVINO_ENABLE_UNICODE_PATH_SUPPORT guard now opens inside that
//     namespace, after the includes, so the includes are no longer guarded.
//   - The GCC diagnostic push / ignored -Wdeprecated-declarations / pop
//     pragmas are dropped together with the wstring_convert usage.
//   - New constants: value_mask (0x3F, the low six bits) and
//     codepoint_2nd_shift / codepoint_3rd_shift / codepoint_4th_shift
//     (6, 12, 18).
//
// wstring_to_string, new #else branch:
//   - Each element of wstr is converted to a uint32_t and emitted as a 1, 2,
//     3 or 4 byte sequence chosen by the thresholds 0x7F, 0x7FF, 0xFFFF and
//     0x10FFFF. Anything larger throws std::runtime_error.
//   - The output is reserved at 4 bytes per element when sizeof(wchar_t) >= 4,
//     otherwise 3, and shrink_to_fit() is called before returning.
//
// string_to_wstring, new #else branch:
//   - The loop runs from string.c_str() up to c_str() + string.size().
//   - The lead byte selects the sequence length via the tests
//     (b & 0xE0) == 0xC0, (b & 0xF0) == 0xE0 and (b & 0xF8) == 0xF0. Bytes at
//     or below 0x7F are taken as is; any other lead byte throws
//     std::runtime_error.
//   - check_utf8_seq_size throws std::runtime_error when fewer bytes remain
//     than the lead byte asks for (1, 2 or 3 trailing bytes).
//
// NOTE(review): items to confirm against the real patch.
//   - Template argument lists and include targets look stripped in this view
//     (static_cast(wc), bare #include, wstring_convert>), so the cast target
//     types and therefore the signedness behaviour cannot be checked here.
//   - The hunk headers jump from old line 36 to old line 39, so the
//     declarations of size_needed and wstrTo in the Windows branch of
//     string_to_wstring are not visible.
//   - Decoder: trailing bytes are masked with value_mask but their 10xxxxxx
//     prefix is never tested, so malformed continuation bytes would decode
//     without an error.
//   - Decoder: there is no rejection of overlong forms, and lead bytes
//     0xF5-0xF7 pass the 4-byte test, so values above 0x10FFFF can be
//     produced. Confirm whether strict validation is wanted.
//   - Encoder: values in 0xD800-0xDFFF take the 3-byte branch with no special
//     handling. Presumably acceptable where wchar_t is 32 bits wide; verify
//     for targets where it is 16 bits.
//   - The Windows branch of string_to_wstring sizes its input with
//     std::strlen(str) while the new branch uses string.size(), so inputs
//     with embedded NUL bytes would convert differently per platform.
diff --git a/src/common/util/src/wstring_convert_util.cpp b/src/common/util/src/wstring_convert_util.cpp index a8411092767e2b..f3e8fbb5d5a537 100644 --- a/src/common/util/src/wstring_convert_util.cpp +++ b/src/common/util/src/wstring_convert_util.cpp @@ -4,33 +4,61 @@ #include "openvino/util/wstring_convert_util.hpp" -#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - -# include -# include +#include +#include +#ifdef _WIN32 +# include +#endif -# ifdef _WIN32 -# include -# endif +namespace ov::util { +#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT -# if defined(__clang__) || defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wdeprecated-declarations" -# endif +constexpr auto value_mask = 0x3FU; +constexpr auto codepoint_2nd_shift = 6U; +constexpr auto codepoint_3rd_shift = 12U; +constexpr auto codepoint_4th_shift = 18U; -std::string ov::util::wstring_to_string(const std::wstring& wstr) { +std::string wstring_to_string(const std::wstring& wstr) { # ifdef _WIN32 int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); std::string strTo(size_needed, 0); WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); return strTo; # else - std::wstring_convert> wstring_decoder; - return wstring_decoder.to_bytes(wstr); + std::string result; + result.reserve(wstr.size() * (sizeof(wchar_t) >= 4 ? 
4 : 3)); // Worst case for UTF-8 + + for (const auto& wc : wstr) { + uint32_t codepoint = static_cast(wc); + + if (codepoint <= 0x7FU) { + // 1-byte sequence (ASCII) + result.push_back(static_cast(codepoint)); + } else if (codepoint <= 0x7FFU) { + // 2-byte sequence + result.push_back(static_cast(0xC0U | ((codepoint >> codepoint_2nd_shift) & 0x1FU))); + result.push_back(static_cast(0x80U | (codepoint & value_mask))); + } else if (codepoint <= 0xFFFFU) { + // 3-byte sequence + result.push_back(static_cast(0xE0U | ((codepoint >> codepoint_3rd_shift) & 0x0FU))); + result.push_back(static_cast(0x80U | ((codepoint >> codepoint_2nd_shift) & value_mask))); + result.push_back(static_cast(0x80U | (codepoint & value_mask))); + } else if (codepoint <= 0x10FFFFU) { + // 4-byte sequence + result.push_back(static_cast(0xF0U | ((codepoint >> codepoint_4th_shift) & 0x07U))); + result.push_back(static_cast(0x80U | ((codepoint >> codepoint_3rd_shift) & value_mask))); + result.push_back(static_cast(0x80U | ((codepoint >> codepoint_2nd_shift) & value_mask))); + result.push_back(static_cast(0x80U | (codepoint & value_mask))); + } else { + throw std::runtime_error("Invalid Unicode codepoint"); + } + } + result.shrink_to_fit(); + return result; # endif } -std::wstring ov::util::string_to_wstring(const std::string& string) { +std::wstring string_to_wstring(const std::string& string) { const char* str = string.c_str(); # ifdef _WIN32 int strSize = static_cast(std::strlen(str)); @@ -39,14 +67,46 @@ std::wstring ov::util::string_to_wstring(const std::string& string) { MultiByteToWideChar(CP_UTF8, 0, str, strSize, &wstrTo[0], size_needed); return wstrTo; # else - std::wstring_convert> wstring_encoder; - std::wstring result = wstring_encoder.from_bytes(str); + + const auto check_utf8_seq_size = [](const char* first, const char* last, const std::ptrdiff_t seq_size) { + if (seq_size > std::distance(first, last)) { + throw std::runtime_error("Invalid UTF-8 sequence"); + } + }; + + std::wstring 
result; + result.reserve(string.size()); + for (const auto last = str + string.size(); str < last;) { + auto codepoint = static_cast(*str++); + if (codepoint <= 0x7FU) { + // 1-byte sequence, nothing to do + } else if ((codepoint & 0xE0U) == 0xC0U) { + // 2-byte sequence + check_utf8_seq_size(str, last, 1); + codepoint = (codepoint & 0x1FU) << codepoint_2nd_shift; + codepoint |= (static_cast(*str++) & value_mask); + } else if ((codepoint & 0xF0U) == 0xE0U) { + // 3-byte sequence + check_utf8_seq_size(str, last, 2); + codepoint = (codepoint & 0x0FU) << codepoint_3rd_shift; + codepoint |= (static_cast(*str++) & value_mask) << codepoint_2nd_shift; + codepoint |= (static_cast(*str++) & value_mask); + } else if ((codepoint & 0xF8U) == 0xF0U) { + // 4-byte sequence + check_utf8_seq_size(str, last, 3); + codepoint = (codepoint & 0x07U) << codepoint_4th_shift; + codepoint |= (static_cast(*str++) & value_mask) << codepoint_3rd_shift; + codepoint |= (static_cast(*str++) & value_mask) << codepoint_2nd_shift; + codepoint |= (static_cast(*str++) & value_mask); + } else { + throw std::runtime_error("Invalid UTF-8 byte"); + } + + result.push_back(static_cast(codepoint)); + } + return result; # endif } - -# if defined(__clang__) || defined(__GNUC__) -# pragma GCC diagnostic pop -# endif - #endif // OPENVINO_ENABLE_UNICODE_PATH_SUPPORT +} // namespace ov::util