Skip to content

[libc] Wchar Stringconverter #146388

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions libc/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@ add_header_library(
libc.hdr.types.char32_t
)

add_header_library(
string_converter
HDRS
string_converter.h
DEPENDS
libc.hdr.types.char8_t
libc.hdr.types.char32_t
libc.hdr.types.size_t
libc.src.__support.error_or
.mbstate
.character_converter
)

add_object_library(
character_converter
HDRS
Expand Down
11 changes: 11 additions & 0 deletions libc/src/__support/wchar/character_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "src/__support/error_or.h"
#include "src/__support/math_extras.h"
#include "src/__support/wchar/mbstate.h"
#include <stddef.h>

#include "character_converter.h"

Expand Down Expand Up @@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
state->bytes_stored++;
return 0;
}

// Invalid byte -> reset the state
clear();
return EILSEQ;
Expand Down Expand Up @@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
return utf32;
}

size_t CharacterConverter::sizeAsUTF32() {
return 1; // a single utf-32 value can fit an entire character
}

size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }

ErrorOr<char8_t> CharacterConverter::pop_utf8() {
if (isEmpty())
return Error(-1);
Expand All @@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
}

state->bytes_stored--;
if (state->bytes_stored == 0)
clear();

return static_cast<char8_t>(output);
}

Expand Down
4 changes: 4 additions & 0 deletions libc/src/__support/wchar/character_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
#include <stddef.h>

namespace LIBC_NAMESPACE_DECL {
namespace internal {
Expand All @@ -30,6 +31,9 @@ class CharacterConverter {
bool isEmpty();
bool isValidState();

size_t sizeAsUTF32();
size_t sizeAsUTF8();

int push(char8_t utf8_byte);
int push(char32_t utf32);

Expand Down
113 changes: 113 additions & 0 deletions libc/src/__support/wchar/string_converter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"

namespace LIBC_NAMESPACE_DECL {
namespace internal {

template <typename T> class StringConverter {
private:
CharacterConverter cr;
const T *src;
size_t src_len;
size_t src_idx;
size_t bytes_pushed;
size_t num_to_write;

int pushFullCharacter() {
for (bytes_pushed = 0; !cr.isFull() && src_idx + bytes_pushed < src_len;
++bytes_pushed) {
int err = cr.push(src[src_idx + bytes_pushed]);
if (err != 0)
return err;
}

// if we aren't able to read a full character from the source string
if (src_idx + bytes_pushed == src_len && !cr.isFull()) {
src_idx += bytes_pushed;
return -1;
}

return 0;
}

public:
StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate *ps)
: cr(ps), src(s), src_len(srclen), src_idx(0), bytes_pushed(0),
num_to_write(dstlen) {
pushFullCharacter();
}

StringConverter(const T *s, size_t dstlen, mbstate *ps)
: StringConverter(s, SIZE_MAX, dstlen, ps) {}

ErrorOr<char32_t> popUTF32() {
if (cr.isEmpty()) {
int err = pushFullCharacter();
if (err != 0)
return Error(err);

if (cr.sizeAsUTF32() > num_to_write) {
cr.clear();
return Error(-1);
}
}

auto out = cr.pop_utf32();
if (cr.isEmpty())
src_idx += bytes_pushed;

if (out.has_value() && out.value() == L'\0')
src_len = src_idx;

num_to_write--;

return out;
}

ErrorOr<char8_t> popUTF8() {
if (cr.isEmpty()) {
int err = pushFullCharacter();
if (err != 0)
return Error(err);

if (cr.sizeAsUTF8() > num_to_write) {
cr.clear();
return Error(-1);
}
}

auto out = cr.pop_utf8();
if (cr.isEmpty())
src_idx += bytes_pushed;

if (out.has_value() && out.value() == '\0')
src_len = src_idx;

num_to_write--;

return out;
}

size_t getSourceIndex() { return src_idx; }
};

} // namespace internal
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
14 changes: 14 additions & 0 deletions libc/test/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,17 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.character_converter
)

add_libc_test(
string_converter_test.cpp
SUITE
libc-support-tests
SRCS
string_converter_test.cpp
DEPENDS
libc.src.__support.wchar.string_converter
libc.src.__support.wchar.mbstate
libc.src.__support.error_or
libc.hdr.errno_macros
libc.hdr.types.char32_t
)
Loading
Loading