Skip to content

Commit

Permalink
Update utf8proc to v2.9.0
Browse files Browse the repository at this point in the history
Unicode 15.1 support
  • Loading branch information
koutcher committed Oct 21, 2023
1 parent 7545d7b commit 13f463b
Show file tree
Hide file tree
Showing 3 changed files with 10,959 additions and 10,891 deletions.
61 changes: 42 additions & 19 deletions compat/utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
}

UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
return "15.0.0";
return "15.1.0";
}

UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
Expand Down Expand Up @@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
true; // GB999
}

static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
{
if (state) {
int lbc_override;
if (*state == UTF8PROC_BOUNDCLASS_START)
*state = lbc_override = lbc;
else
lbc_override = *state;
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
if (*state == 0) { /* state initialization */
state_bc = lbc;
state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
}
else { /* lbc and licb are already encoded in *state */
state_bc = *state & 0xff; // 1st byte of state is bound class
state_icb = *state >> 8; // 2nd byte of state is indic conjunct break
}

utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
!(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
&& ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c

// Special support for GB9c. Don't break between two consonants
// separated 1+ linker characters and 0+ extend characters in any order.
// After a consonant, we enter LINKER state after at least one linker.
if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
state_icb = ticb;
else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;

// Special support for GB 12/13 made possible by GB999. After two RI
// class codepoints we want to force a break. Do this by resetting the
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
// after that character according to GB999 (unless of course such a break is
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
state_bc = UTF8PROC_BOUNDCLASS_OTHER;
// Special support for GB11 (emoji extend* zwj / emoji)
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
else
*state = tbc;
state_bc = tbc;
}
else
*state = tbc;
state_bc = tbc;

*state = state_bc + (state_icb << 8);
return break_permitted;
}
else
Expand All @@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {

return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
utf8proc_get_property(c2)->boundclass,
const utf8proc_property_t *p1 = utf8proc_get_property(c1);
const utf8proc_property_t *p2 = utf8proc_get_property(c2);
return grapheme_break_extended(p1->boundclass,
p2->boundclass,
p1->indic_conjunct_break,
p2->indic_conjunct_break,
state);
}

Expand Down Expand Up @@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
}
if (options & UTF8PROC_CHARBOUND) {
utf8proc_bool boundary;
int tbc = property->boundclass;
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
last_boundclass);
if (boundary) {
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
if (bufsize >= 2) dst[1] = uc;
Expand Down
18 changes: 14 additions & 4 deletions compat/utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
/** The MAJOR version number (increased when backwards API compatibility is broken). */
#define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
#define UTF8PROC_VERSION_MINOR 8
#define UTF8PROC_VERSION_MINOR 9
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0
/** @} */
Expand Down Expand Up @@ -273,7 +273,8 @@ typedef struct utf8proc_property_struct {
* Boundclass.
* @see utf8proc_boundclass_t.
*/
unsigned boundclass:8;
unsigned boundclass:6;
unsigned indic_conjunct_break:2;
} utf8proc_property_t;

/** Unicode categories. */
Expand Down Expand Up @@ -388,6 +389,14 @@ typedef enum {
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
} utf8proc_boundclass_t;

/** Indic_Conjunct_Break property. (TR44) */
typedef enum {
UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0,
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1,
UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2,
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3,
} utf8proc_indic_conjunct_break_t;

/**
* Function pointer type passed to @ref utf8proc_map_custom and
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
Expand Down Expand Up @@ -481,8 +490,9 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
* - @ref UTF8PROC_STRIPNA - remove unassigned codepoints
* @param last_boundclass
* Pointer to an integer variable containing
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
* option is used. Otherwise, this parameter is ignored.
* the previous codepoint's (boundclass + indic_conjunct_break << 1) if the @ref UTF8PROC_CHARBOUND
* option is used. If the string is being processed in order, this can be initialized to 0 for
* the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored.
*
* @return
* In case of success, the number of codepoints written is returned; in case
Expand Down
Loading

0 comments on commit 13f463b

Please sign in to comment.