Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 82 additions & 22 deletions src/common/util/src/wstring_convert_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,61 @@

#include "openvino/util/wstring_convert_util.hpp"

#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT

# include <codecvt>
# include <locale>
#include <cstdint>
#include <stdexcept>
#ifdef _WIN32
# include <windows.h>
#endif

# ifdef _WIN32
# include <windows.h>
# endif
namespace ov::util {
#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT

# if defined(__clang__) || defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
# endif
// Bit-layout constants shared by the hand-written UTF-8 encoder and decoder in this file.
constexpr auto value_mask = 0x3FU;         // six payload bits carried by every continuation byte
constexpr auto codepoint_2nd_shift = 6U;   // position of the payload held by the 2nd-to-last byte
constexpr auto codepoint_3rd_shift = 12U;  // position of the payload held by the 3rd-to-last byte
constexpr auto codepoint_4th_shift = 18U;  // position of the payload held by the lead byte of a 4-byte sequence

/**
 * @brief Converts a wide string to a UTF-8 encoded narrow string.
 *
 * On Windows the conversion is delegated to WideCharToMultiByte(CP_UTF8). Elsewhere the
 * string is encoded by hand: every element of @p wstr is treated as a Unicode code point
 * (or, when wchar_t is 16 bits wide, as a UTF-16 code unit, so a high/low surrogate pair
 * is merged into a single supplementary code point).
 *
 * @param wstr Wide string to convert.
 * @return UTF-8 encoded copy of @p wstr.
 * @throws std::runtime_error (non-Windows only) if an element is above U+10FFFF or is a
 *         surrogate that is not part of a valid pair; surrogates have no well-formed
 *         UTF-8 encoding.
 */
std::string wstring_to_string(const std::wstring& wstr) {
#    ifdef _WIN32
    const int size_needed =
        WideCharToMultiByte(CP_UTF8, 0, wstr.data(), static_cast<int>(wstr.size()), nullptr, 0, nullptr, nullptr);
    std::string strTo(size_needed, 0);
    WideCharToMultiByte(CP_UTF8,
                        0,
                        wstr.data(),
                        static_cast<int>(wstr.size()),
                        &strTo[0],
                        size_needed,
                        nullptr,
                        nullptr);
    return strTo;
#    else
    constexpr bool utf16_units = sizeof(wchar_t) == 2;
    // Drop sign-extension bits in case a 16-bit wchar_t is a signed type.
    constexpr uint32_t unit_mask = utf16_units ? 0xFFFFU : 0xFFFFFFFFU;
    constexpr uint32_t surrogate_first = 0xD800U;
    constexpr uint32_t high_surrogate_last = 0xDBFFU;
    constexpr uint32_t low_surrogate_first = 0xDC00U;
    constexpr uint32_t surrogate_last = 0xDFFFU;

    std::string result;
    // Worst case for UTF-8: 4 bytes per 32-bit unit; 3 bytes per 16-bit unit
    // (a surrogate pair consumes two units and yields 4 bytes, which still fits).
    result.reserve(wstr.size() * (sizeof(wchar_t) >= 4 ? 4 : 3));

    for (auto it = wstr.begin(); it != wstr.end(); ++it) {
        uint32_t codepoint = static_cast<uint32_t>(*it) & unit_mask;

        if (codepoint >= surrogate_first && codepoint <= surrogate_last) {
            // Only a high surrogate directly followed by a low surrogate in a UTF-16
            // wide string is meaningful; everything else cannot be encoded.
            const auto next = it + 1;
            uint32_t low = 0U;
            bool is_pair = false;
            if (utf16_units && codepoint <= high_surrogate_last && next != wstr.end()) {
                low = static_cast<uint32_t>(*next) & unit_mask;
                is_pair = low >= low_surrogate_first && low <= surrogate_last;
            }
            if (!is_pair) {
                throw std::runtime_error("Invalid Unicode codepoint");
            }
            codepoint = 0x10000U + ((codepoint - surrogate_first) << 10U) + (low - low_surrogate_first);
            it = next;  // the low surrogate has been consumed as well
        }

        if (codepoint <= 0x7FU) {
            // 1-byte sequence (ASCII)
            result.push_back(static_cast<char>(codepoint));
        } else if (codepoint <= 0x7FFU) {
            // 2-byte sequence
            result.push_back(static_cast<char>(0xC0U | ((codepoint >> codepoint_2nd_shift) & 0x1FU)));
            result.push_back(static_cast<char>(0x80U | (codepoint & value_mask)));
        } else if (codepoint <= 0xFFFFU) {
            // 3-byte sequence
            result.push_back(static_cast<char>(0xE0U | ((codepoint >> codepoint_3rd_shift) & 0x0FU)));
            result.push_back(static_cast<char>(0x80U | ((codepoint >> codepoint_2nd_shift) & value_mask)));
            result.push_back(static_cast<char>(0x80U | (codepoint & value_mask)));
        } else if (codepoint <= 0x10FFFFU) {
            // 4-byte sequence
            result.push_back(static_cast<char>(0xF0U | ((codepoint >> codepoint_4th_shift) & 0x07U)));
            result.push_back(static_cast<char>(0x80U | ((codepoint >> codepoint_3rd_shift) & value_mask)));
            result.push_back(static_cast<char>(0x80U | ((codepoint >> codepoint_2nd_shift) & value_mask)));
            result.push_back(static_cast<char>(0x80U | (codepoint & value_mask)));
        } else {
            throw std::runtime_error("Invalid Unicode codepoint");
        }
    }
    // The reservation above is a worst-case estimate; give back the unused tail.
    result.shrink_to_fit();
    return result;
#    endif
}

// Converts a narrow string, interpreted as UTF-8, into a wide string.
// Under _WIN32 the work is handed to MultiByteToWideChar(CP_UTF8); otherwise the bytes are
// decoded by hand, one 1- to 4-byte sequence per output wchar_t.
// Throws std::runtime_error from the hand-written decoder when a sequence is truncated
// ("Invalid UTF-8 sequence") or a lead byte matches no known pattern ("Invalid UTF-8 byte").
std::wstring ov::util::string_to_wstring(const std::string& string) {
std::wstring string_to_wstring(const std::string& string) {
const char* str = string.c_str();
# ifdef _WIN32
// Length is taken with strlen, so anything after the first embedded NUL is not converted
// (the non-Windows branch below walks string.size() bytes instead).
int strSize = static_cast<int>(std::strlen(str));
Expand All @@ -39,14 +67,46 @@ std::wstring ov::util::string_to_wstring(const std::string& string) {
MultiByteToWideChar(CP_UTF8, 0, str, strSize, &wstrTo[0], size_needed);
return wstrTo;
# else
std::wstring_convert<std::codecvt_utf8<wchar_t>> wstring_encoder;
std::wstring result = wstring_encoder.from_bytes(str);

// Guard used before reading continuation bytes: throws if fewer than seq_size bytes
// remain between first and last.
const auto check_utf8_seq_size = [](const char* first, const char* last, const std::ptrdiff_t seq_size) {
if (seq_size > std::distance(first, last)) {
throw std::runtime_error("Invalid UTF-8 sequence");
}
};

std::wstring result;
// Every decoded sequence consumes at least one byte and yields one wchar_t,
// so the byte count is an upper bound for the output length.
result.reserve(string.size());
for (const auto last = str + string.size(); str < last;) {
// Lead byte. If char is signed, bytes >= 0x80 sign-extend here; the branches below
// only test and keep the low bits, so the extra high bits are discarded.
auto codepoint = static_cast<uint32_t>(*str++);
if (codepoint <= 0x7FU) {
// 1-byte sequence, nothing to do
} else if ((codepoint & 0xE0U) == 0xC0U) {
// 2-byte sequence
check_utf8_seq_size(str, last, 1);
codepoint = (codepoint & 0x1FU) << codepoint_2nd_shift;
codepoint |= (static_cast<unsigned char>(*str++) & value_mask);
} else if ((codepoint & 0xF0U) == 0xE0U) {
// 3-byte sequence
check_utf8_seq_size(str, last, 2);
codepoint = (codepoint & 0x0FU) << codepoint_3rd_shift;
codepoint |= (static_cast<unsigned char>(*str++) & value_mask) << codepoint_2nd_shift;
codepoint |= (static_cast<unsigned char>(*str++) & value_mask);
} else if ((codepoint & 0xF8U) == 0xF0U) {
// 4-byte sequence
check_utf8_seq_size(str, last, 3);
codepoint = (codepoint & 0x07U) << codepoint_4th_shift;
codepoint |= (static_cast<unsigned char>(*str++) & value_mask) << codepoint_3rd_shift;
codepoint |= (static_cast<unsigned char>(*str++) & value_mask) << codepoint_2nd_shift;
codepoint |= (static_cast<unsigned char>(*str++) & value_mask);
} else {
// Lead byte is a stray continuation byte (10xxxxxx) or 0xF8 and above.
throw std::runtime_error("Invalid UTF-8 byte");
}

// NOTE(review): continuation bytes are only masked with value_mask; their 10xxxxxx
// marker is never checked, and overlong forms, surrogate values and results above
// 0x10FFFF are not rejected. The cast below also narrows the value if wchar_t is
// smaller than 32 bits. Confirm whether stricter validation is wanted.
result.push_back(static_cast<wchar_t>(codepoint));
}

return result;
# endif
}

# if defined(__clang__) || defined(__GNUC__)
# pragma GCC diagnostic pop
# endif

#endif // OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
} // namespace ov::util
Loading