Skip to content

Commit

Permalink
refactor!: use utf8proc full casefolding
Browse files Browse the repository at this point in the history
According to `CaseFolding-15.1.0.txt`, full casefolding should be
preferred over simple casefolding as it's considered to be more correct.
Since utf8proc already provides full casefolding it makes sense to
switch to it. This will also remove a lot of unnecessary build code.

Temporary exceptions are made for two sets of characters:

- `ß` will still be considered `ß` (instead of `ss`) as using full
  casefolding requires interfering with upstream spell files in some
  form.
- `İ` will still be considered `İ` (instead of `i̇`) as using full
  casefolding requires making a value judgement on the "correct"
  behavior. There are two equally valid case-insensitive comparisons for
  this character according to unicode. It is essentially up to the
  implementor to decide which conversion is correct. For this reason it
  might make sense to allow users to decide which conversion should be
  done as an added option to `casemap` in a future PR.
  • Loading branch information
dundargoc committed Aug 7, 2024
1 parent 11a6f3c commit 328ea02
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 1,705 deletions.
50 changes: 0 additions & 50 deletions src/nvim/generators/gen_unicode_tables.lua
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ local get_path = function(fname)
end

-- File names of the Unicode data files used by this generator, each obtained
-- through get_path().
local unicodedata_fname = get_path('UnicodeData.txt')
local casefolding_fname = get_path('CaseFolding.txt')
local eastasianwidth_fname = get_path('EastAsianWidth.txt')
local emoji_fname = get_path('emoji-data.txt')

Expand Down Expand Up @@ -77,10 +76,6 @@ local parse_data_to_props = function(ud_fp)
return fp_lines_to_lists(ud_fp, 15, false)
end

--- Parse the case folding data file into property lists.
-- Thin wrapper around fp_lines_to_lists().
-- @param cf_fp open file handle for the case folding data
-- @return whatever fp_lines_to_lists() returns for this file
local parse_fold_props = function(cf_fp)
  -- NOTE(review): presumably 4 is the number of fields per line and `true`
  -- selects a line-filtering mode of fp_lines_to_lists(); confirm there.
  local n_fields, flag = 4, true
  return fp_lines_to_lists(cf_fp, n_fields, flag)
end

--- Parse the East Asian width data file into property lists.
-- Thin wrapper around fp_lines_to_lists().
-- @param eaw_fp open file handle for the East Asian width data
-- @return whatever fp_lines_to_lists() returns for this file
local parse_width_props = function(eaw_fp)
  -- NOTE(review): presumably 2 is the number of fields per line and `true`
  -- selects a line-filtering mode of fp_lines_to_lists(); confirm there.
  local n_fields, flag = 2, true
  return fp_lines_to_lists(eaw_fp, n_fields, flag)
end
Expand All @@ -97,45 +92,6 @@ local make_range = function(start, end_, step, add)
end
end

--- Write a C `convertStruct` array called `table_name` to `ut_fp`.
--
-- Every entry of `props` accepted by `cond_func` contributes one mapping from
-- the code point in field 1 to the code point in field `nl_index` (both
-- hexadecimal strings). Consecutive mappings that share the same offset and
-- the same distance between code points are merged into a single range, which
-- is formatted by make_range().
--
-- @param ut_fp      writable file handle receiving the generated C source
-- @param props      list of property lists, in the order they should be emitted
-- @param cond_func  predicate selecting the entries to convert
-- @param nl_index   index of the field holding the converted code point
-- @param table_name name of the generated C array
local build_convert_table = function(ut_fp, props, cond_func, nl_index, table_name)
  ut_fp:write('static const convertStruct ' .. table_name .. '[] = {\n')

  -- Range currently being accumulated; first < 0 means "nothing collected yet".
  local first, last, stride, delta = -1, -1, 0, -1

  for _, prop in ipairs(props) do
    if cond_func(prop) then
      local code = tonumber(prop[1], 16)
      local mapped = tonumber(prop[nl_index], 16)
      local offset = mapped - code

      -- The entry extends the open range when the offset is unchanged and the
      -- gap to the previous code point matches the stride (or no stride has
      -- been fixed yet).
      local extends = first >= 0
        and delta == offset
        and (stride == 0 or code - last == stride)

      if extends then
        stride = code - last
        last = code
      else
        if first >= 0 then
          -- Flush the finished range before opening a new one.
          ut_fp:write(make_range(first, last, stride, delta))
        end
        first, last, stride, delta = code, code, 0, offset
      end
    end
  end

  -- Flush the trailing range, if any entry was accepted at all.
  if first >= 0 then
    ut_fp:write(make_range(first, last, stride, delta))
  end
  ut_fp:write('};\n')
end

--- Write the `foldCase` conversion table to `ut_fp`.
-- Only entries whose second field is 'C' or 'S' are used (the common and
-- simple mappings in the CaseFolding.txt format); the folded code point is
-- taken from field 3.
-- @param ut_fp     writable file handle receiving the generated C source
-- @param foldprops property lists parsed from the case folding data
local build_fold_table = function(ut_fp, foldprops)
  local is_single_char_fold = function(prop)
    local status = prop[2]
    return (status == 'C' or status == 'S')
  end
  local folded_index = 3
  return build_convert_table(ut_fp, foldprops, is_single_char_fold, folded_index, 'foldCase')
end

local build_combining_table = function(ut_fp, dataprops)
ut_fp:write('static const struct interval combining[] = {\n')
local start = -1
Expand Down Expand Up @@ -291,12 +247,6 @@ local ut_fp = io.open(utf_tables_fname, 'w')

-- Emit the `combining` interval table built from dataprops.
build_combining_table(ut_fp, dataprops)

-- Read the case folding data (CaseFolding.txt) and emit the `foldCase` table.
-- NOTE(review): the io.open() result is not checked; a missing file surfaces
-- as an error inside the parser or on :close() rather than a clear message.
local cf_fp = io.open(casefolding_fname, 'r')
local foldprops = parse_fold_props(cf_fp)
cf_fp:close()

build_fold_table(ut_fp, foldprops)

-- Read the East Asian width data (EastAsianWidth.txt).
local eaw_fp = io.open(eastasianwidth_fname, 'r')
local widthprops = parse_width_props(eaw_fp)
eaw_fp:close()
Expand Down
53 changes: 25 additions & 28 deletions src/nvim/mbyte.c
Original file line number Diff line number Diff line change
Expand Up @@ -1284,41 +1284,38 @@ bool utf_ambiguous_width(int c)
|| intable(emoji_all, ARRAY_SIZE(emoji_all), c));
}

// Generic table-driven conversion used for case operations.
// "a" is a UCS-4 character and "table" holds "n_items" conversion ranges.
// The entry is located with a binary search on rangeEnd, so the table is
// presumably sorted by rangeEnd (verify against the table generator).
// Returns "a" plus the entry's offset when "a" lies on the entry's step grid
// within [rangeStart, rangeEnd]; otherwise returns "a" unchanged.
static int utf_convert(int a, const convertStruct *const table, size_t n_items)
{
  // Lower-bound search: first entry whose rangeEnd is not below "a".
  size_t lo = 0;
  size_t hi = n_items;
  while (lo < hi) {
    const size_t mid = (hi + lo) / 2;
    if (a > table[mid].rangeEnd) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }

  // Every range ends below "a": no conversion.
  if (lo >= n_items) {
    return a;
  }

  const convertStruct *const entry = &table[lo];

  // "a" falls into a gap before this range.
  if (a < entry->rangeStart || a > entry->rangeEnd) {
    return a;
  }

  // Inside the range, only every "step"-th character is converted.
  if ((a - entry->rangeStart) % entry->step != 0) {
    return a;
  }

  return a + entry->offset;
}

// Return the folded-case equivalent of "a", which is a UCS-4 character. Uses
// simple case folding.
// full case folding.
int utf_fold(int a)
{
if (a < 0x80) {
// be fast for ASCII
return a >= 0x41 && a <= 0x5a ? a + 32 : a;
}
return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));

// TODO(dundargoc): utf8proc only does full case folding, which breaks some tests. This is a
// temporary workaround to circumvent failing tests.
//
// (0xdf) ß == ss in full casefolding. Using this however breaks the vim spell tests and the error
// E763 is thrown. This is due to the test spells relying on the vim spell files.
//
// (0x130) İ == i̇ in full casefolding.
if (a == 0xdf || a == 0x130) {
return a;
}

utf8proc_uint8_t input_str[16] = { 0 };
utf8proc_encode_char(a, input_str);

utf8proc_uint8_t *fold_str_utf;
utf8proc_map((utf8proc_uint8_t *)input_str, 0, &fold_str_utf,
UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD);

int fold_codepoint_utf = utf_ptr2char((char *)fold_str_utf);

xfree(fold_str_utf);

return fold_codepoint_utf;
}

// Vim's own character class functions. These exist because many library
Expand Down
Loading

0 comments on commit 328ea02

Please sign in to comment.