Skip to content

Commit dd5c5c9

Browse files
committed
fix(util): 🐛 check for grapheme cluster in sfx() computation
1 parent 66a2782 commit dd5c5c9

File tree

2 files changed

+46
-6
lines changed

2 files changed

+46
-6
lines changed

packages/json-joy/src/util/diff/__tests__/str.spec.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@ describe('sfx()', () => {
2626
expect(sfx('abc', '_abc')).toEqual(3);
2727
expect(sfx('abc', 'abcd')).toEqual(0);
2828
expect(sfx('👨‍🍳', '👨‍🍳')).toEqual(5);
29-
// expect(sfx('👨‍🍳', '👨‍🍳chef')).toEqual(5);
30-
// expect(sfx('👨‍🍳chef', '👨‍🍳')).toEqual(5);
31-
// expect(sfx('👨‍🍳👨‍🍳', '👨‍🍳')).toEqual(5);
29+
expect(sfx('👨‍🍳', '👨‍🍳chef')).toEqual(0);
30+
expect(sfx('👨‍🍳chef', '👨‍🍳')).toEqual(0);
31+
expect(sfx('👨‍🍳', 'chef👨‍🍳')).toEqual(5);
32+
expect(sfx('chef👨‍🍳', '👨‍🍳')).toEqual(5);
33+
expect(sfx('👨‍🍳👨‍🍳', '👨‍🍳')).toEqual(5);
3234
});
3335
});
3436

@@ -435,6 +437,18 @@ describe('Unicode edge cases', () => {
435437
assertPatch(nfd, nfc);
436438
assertPatch(`hello ${nfc}`, `hello ${nfd}`);
437439
});
440+
441+
test('handles complex emoji with ZWJ sequences', () => {
442+
const chefEmoji = '👨‍🍳'; // chef emoji (man + ZWJ + cooking)
443+
const src = chefEmoji;
444+
const dst = 'chef' + chefEmoji;
445+
const patch = normalize(diff(src, dst));
446+
assertPatch(src, dst, patch);
447+
expect(patch).toEqual([
448+
[PATCH_OP_TYPE.INS, 'chef'],
449+
[PATCH_OP_TYPE.EQL, chefEmoji],
450+
]);
451+
});
438452
});
439453

440454
describe('Algorithm edge cases', () => {

packages/json-joy/src/util/diff/str.ts

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,9 +427,35 @@ export const sfx = (txt1: string, txt2: string): number => {
427427
} else max = mid;
428428
mid = Math.floor((max - min) / 2 + min);
429429
}
430-
const code = txt1.charCodeAt(txt1.length - mid);
431-
const isSurrogatePairEnd = code >= 0xd800 && code <= 0xdbff;
432-
if (isSurrogatePairEnd) mid--;
430+
// Check whether the matched suffix starts inside a surrogate pair or a combining character sequence.
431+
// We need to check the character BEFORE the matched suffix to see if we're
432+
// splitting a grapheme cluster.
433+
if (mid > 0 && mid < txt1.length) {
434+
const boundaryPos = txt1.length - mid - 1;
435+
const code = txt1.charCodeAt(boundaryPos);
436+
const isHighSurrogate = code >= 0xd800 && code <= 0xdbff;
437+
const isCombining =
438+
code === 0x200d || // ZWJ
439+
(code >= 0xfe00 && code <= 0xfe0f) || // Variation selectors
440+
(code >= 0x0300 && code <= 0x036f); // Combining diacritical marks
441+
442+
if (isHighSurrogate || isCombining) {
443+
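// We're splitting a grapheme cluster. Shrink the suffix (each `mid--` moves its start forward) until the code unit before it is no longer a high surrogate, ZWJ, variation selector or combining mark.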
444+
mid--;
445+
while (mid > 0) {
446+
const pos = txt1.length - mid - 1;
447+
if (pos < 0) break;
448+
const prevCode = txt1.charCodeAt(pos);
449+
const isPrevHighSurrogate = prevCode >= 0xd800 && prevCode <= 0xdbff;
450+
const isPrevCombining =
451+
prevCode === 0x200d ||
452+
(prevCode >= 0xfe00 && prevCode <= 0xfe0f) ||
453+
(prevCode >= 0x0300 && prevCode <= 0x036f);
454+
if (!isPrevHighSurrogate && !isPrevCombining) break;
455+
mid--;
456+
}
457+
}
458+
}
433459
return mid;
434460
};
435461

0 commit comments

Comments
 (0)