Skip to content

Commit 646d279

Browse files
committed
Combine segments separated by double-width diacritical mark
1 parent 7d50181 commit 646d279

File tree

1 file changed

+170
-1
lines changed

1 file changed

+170
-1
lines changed

src/util/script_detection.ts

+170-1
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,175 @@ import {unicodeBlockLookup as isChar} from './is_char_in_unicode_block';
44

55
const segmenter = new Intl.Segmenter();
66

7+
/**
 * Code points of combining marks after which the following grapheme segment
 * is merged into the current one by `splitByGraphemeCluster`.
 *
 * The lookup there takes the last UTF-16 code unit of the current segment
 * (`segment.at(-1).codePointAt(0)`) and searches this array with `indexOf`,
 * so every entry has to be a BMP code point (all current entries are).
 *
 * NOTE(review): despite the name, only U+035C–U+0362 are "Combining Double ..."
 * marks; the remaining entries are vowel signs, viramas, tone marks and
 * subjoined letters of various South and Southeast Asian scripts. Confirm
 * that the broader scope is intended.
 */
const doubleWidthDiacritics = [
8+
0x035C, // Combining Double Breve Below
9+
0x035D, // Combining Double Breve
10+
0x035E, // Combining Double Macron
11+
0x035F, // Combining Double Macron Below
12+
0x0360, // Combining Double Tilde
13+
0x0361, // Combining Double Inverted Breve
14+
0x0362, // Combining Double Rightwards Arrow Below
15+
0x0955, // Devanagari Vowel Sign Candra Long E
16+
0x0956, // Devanagari Vowel Sign Ue
17+
0x0957, // Devanagari Vowel Sign Uue
18+
0x0A01, // Gurmukhi Sign Adak Bindi
19+
0x0A51, // Gurmukhi Sign Udaat
20+
0x0A75, // Gurmukhi Sign Yakash
21+
0x0AE2, // Gujarati Vowel Sign Vocalic L
22+
0x0AE3, // Gujarati Vowel Sign Vocalic Ll
23+
0x0B3F, // Oriya Vowel Sign I
24+
0x0B41, // Oriya Vowel Sign U
25+
0x0B42, // Oriya Vowel Sign Uu
26+
0x0B43, // Oriya Vowel Sign Vocalic R
27+
0x0B44, // Oriya Vowel Sign Vocalic Rr
28+
0x0B4D, // Oriya Sign Virama
29+
0x0B55, // Oriya Sign Overline
30+
0x0B56, // Oriya Ai Length Mark
31+
0x0B62, // Oriya Vowel Sign Vocalic L
32+
0x0B63, // Oriya Vowel Sign Vocalic Ll
33+
0x0C48, // Telugu Vowel Sign Ai
34+
0x0C81, // Kannada Sign Candrabindu
35+
0x0CBC, // Kannada Sign Nukta
36+
0x0CBF, // Kannada Vowel Sign I
37+
0x0CC6, // Kannada Vowel Sign E
38+
0x0CCC, // Kannada Vowel Sign Au
39+
0x0CCD, // Kannada Sign Virama
40+
0x0CE2, // Kannada Vowel Sign Vocalic L
41+
0x0CE3, // Kannada Vowel Sign Vocalic Ll
42+
0x0D41, // Malayalam Vowel Sign U
43+
0x0D42, // Malayalam Vowel Sign Uu
44+
0x0D43, // Malayalam Vowel Sign Vocalic R
45+
0x0D44, // Malayalam Vowel Sign Vocalic Rr
46+
0x0D4D, // Malayalam Sign Virama
47+
0x0DCA, // Sinhala Sign Al-Lakuna
48+
0x0DD2, // Sinhala Vowel Sign Ketti Is-Pilla
49+
0x0DD3, // Sinhala Vowel Sign Diga Is-Pilla
50+
0x0DD4, // Sinhala Vowel Sign Ketti Paa-Pilla
51+
0x0DD6, // Sinhala Vowel Sign Diga Paa-Pilla
52+
0x0E31, // Thai Character Mai Han-Akat
53+
0x0E34, // Thai Character Sara I
54+
0x0E35, // Thai Character Sara Ii
55+
0x0E36, // Thai Character Sara Ue
56+
0x0E37, // Thai Character Sara Uee
57+
0x0E38, // Thai Character Sara U
58+
0x0E39, // Thai Character Sara Uu
59+
0x0E3A, // Thai Character Phinthu
60+
0x0E47, // Thai Character Maitaikhu
61+
0x0E48, // Thai Character Mai Ek
62+
0x0E49, // Thai Character Mai Tho
63+
0x0E4A, // Thai Character Mai Tri
64+
0x0E4B, // Thai Character Mai Chattawa
65+
0x0E4C, // Thai Character Thanthakhat
66+
0x0E4D, // Thai Character Nikhahit
67+
0x0E4E, // Thai Character Yamakkan
68+
0x0F71, // Tibetan Vowel Sign Aa
69+
0x0F73, // Tibetan Vowel Sign Ii
70+
0x0F74, // Tibetan Vowel Sign U
71+
0x0F75, // Tibetan Vowel Sign Uu
72+
0x0F76, // Tibetan Vowel Sign Vocalic R
73+
0x0F77, // Tibetan Vowel Sign Vocalic Rr
74+
0x0F78, // Tibetan Vowel Sign Vocalic L
75+
0x0F79, // Tibetan Vowel Sign Vocalic Ll
76+
0x0F81, // Tibetan Vowel Sign Reversed Ii
77+
0x0F8D, // Tibetan Subjoined Sign Lce Tsa Can
78+
0x0F8E, // Tibetan Subjoined Sign Mchu Can
79+
0x0F8F, // Tibetan Subjoined Sign Inverted Mchu Can
80+
0x0F90, // Tibetan Subjoined Letter Ka
81+
0x0F91, // Tibetan Subjoined Letter Kha
82+
0x0F92, // Tibetan Subjoined Letter Ga
83+
0x0F93, // Tibetan Subjoined Letter Gha
84+
0x0F94, // Tibetan Subjoined Letter Nga
85+
0x0F95, // Tibetan Subjoined Letter Ca
86+
0x0F96, // Tibetan Subjoined Letter Cha
87+
0x0F97, // Tibetan Subjoined Letter Ja
88+
0x0F99, // Tibetan Subjoined Letter Nya
89+
0x0F9A, // Tibetan Subjoined Letter Tta
90+
0x0F9B, // Tibetan Subjoined Letter Ttha
91+
0x0F9C, // Tibetan Subjoined Letter Dda
92+
0x0F9D, // Tibetan Subjoined Letter Ddha
93+
0x0F9E, // Tibetan Subjoined Letter Nna
94+
0x0F9F, // Tibetan Subjoined Letter Ta
95+
0x0FA0, // Tibetan Subjoined Letter Tha
96+
0x0FA1, // Tibetan Subjoined Letter Da
97+
0x0FA2, // Tibetan Subjoined Letter Dha
98+
0x0FA3, // Tibetan Subjoined Letter Na
99+
0x0FA4, // Tibetan Subjoined Letter Pa
100+
0x0FA5, // Tibetan Subjoined Letter Pha
101+
0x0FA6, // Tibetan Subjoined Letter Ba
102+
0x0FA7, // Tibetan Subjoined Letter Bha
103+
0x0FA8, // Tibetan Subjoined Letter Ma
104+
0x0FA9, // Tibetan Subjoined Letter Tsa
105+
0x0FAA, // Tibetan Subjoined Letter Tsha
106+
0x0FAB, // Tibetan Subjoined Letter Dza
107+
0x0FAC, // Tibetan Subjoined Letter Dzha
108+
0x0FAD, // Tibetan Subjoined Letter Wa
109+
0x0FAE, // Tibetan Subjoined Letter Zha
110+
0x0FAF, // Tibetan Subjoined Letter Za
111+
0x0FB0, // Tibetan Subjoined Letter -A
112+
0x0FB1, // Tibetan Subjoined Letter Ya
113+
0x0FB2, // Tibetan Subjoined Letter Ra
114+
0x0FB3, // Tibetan Subjoined Letter La
115+
0x0FB4, // Tibetan Subjoined Letter Sha
116+
0x0FB5, // Tibetan Subjoined Letter Ssa
117+
0x0FB6, // Tibetan Subjoined Letter Sa
118+
0x0FB7, // Tibetan Subjoined Letter Ha
119+
0x0FB8, // Tibetan Subjoined Letter A
120+
0x0FB9, // Tibetan Subjoined Letter Kssa
121+
0x0FBA, // Tibetan Subjoined Letter Fixed-Form Wa
122+
0x0FBB, // Tibetan Subjoined Letter Fixed-Form Ya
123+
0x0FBC, // Tibetan Subjoined Letter Fixed-Form Ra
124+
0x102D, // Myanmar Vowel Sign I
125+
0x102E, // Myanmar Vowel Sign Ii
126+
0x102F, // Myanmar Vowel Sign U
127+
0x1030, // Myanmar Vowel Sign Uu
128+
0x1032, // Myanmar Vowel Sign Ai
129+
0x1033, // Myanmar Vowel Sign Mon Ii
130+
0x1034, // Myanmar Vowel Sign Mon O
131+
0x1035, // Myanmar Vowel Sign E Above
132+
0x1036, // Myanmar Sign Anusvara
133+
0x1037, // Myanmar Sign Dot Below
134+
0x1039, // Myanmar Sign Virama
135+
0x103A, // Myanmar Sign Asat
136+
0x103D, // Myanmar Consonant Sign Medial Wa
137+
0x103E, // Myanmar Consonant Sign Medial Ha
138+
0x1058, // Myanmar Vowel Sign Vocalic L
139+
0x1059, // Myanmar Vowel Sign Vocalic Ll
140+
0x105E, // Myanmar Consonant Sign Mon Medial Na
141+
0x105F, // Myanmar Consonant Sign Mon Medial Ma
142+
0x1060, // Myanmar Consonant Sign Mon Medial La
143+
0x1071, // Myanmar Vowel Sign Geba Karen I
144+
0x1072, // Myanmar Vowel Sign Kayah Oe
145+
0x1073, // Myanmar Vowel Sign Kayah U
146+
0x1074, // Myanmar Vowel Sign Kayah Ee
147+
0x1082, // Myanmar Consonant Sign Shan Medial Wa
148+
0x1085, // Myanmar Vowel Sign Shan E Above
149+
0x1086, // Myanmar Vowel Sign Shan Final Y
150+
0x108D, // Myanmar Sign Shan Council Emphatic Tone
151+
0x109D, // Myanmar Vowel Sign Aiton Ai
152+
0x1732, // Hanunoo Vowel Sign I
153+
0x1733, // Hanunoo Vowel Sign U
154+
0x1734, // Hanunoo Sign Pamudpod
155+
0x1772, // Tagbanwa Vowel Sign I
156+
0x1773, // Tagbanwa Vowel Sign U
157+
0x17B7, // Khmer Vowel Sign I
158+
0x17B8, // Khmer Vowel Sign Ii
159+
0x17B9, // Khmer Vowel Sign Y
160+
0x17BA, // Khmer Vowel Sign Yy
161+
0x17BB, // Khmer Vowel Sign U
162+
0x17BC, // Khmer Vowel Sign Uu
163+
0x17BD, // Khmer Vowel Sign Ua
164+
0x17C6, // Khmer Sign Nikahit
165+
0x17CB, // Khmer Sign Bantoc
166+
0x17CD, // Khmer Sign Toandakhiat
167+
0x17CE, // Khmer Sign Kakabat
168+
0x17CF, // Khmer Sign Ahsda
169+
0x17D0, // Khmer Sign Samyok Sannya
170+
0x17D1, // Khmer Sign Viriam
171+
0x17D2, // Khmer Sign Coeng
172+
0x17D3, // Khmer Sign Bathamasat
173+
0x17DD, // Khmer Sign Atthacan
174+
];
175+
7176
export function splitByGraphemeCluster(text: string) {
8177
const segments = segmenter.segment(text)[Symbol.iterator]();
9178
let segment = segments.next();
@@ -14,7 +183,7 @@ export function splitByGraphemeCluster(text: string) {
14183
const baseSegments = [];
15184
while (!segment.done) {
16185
const baseSegment = segment;
17-
while (!nextSegment.done && /^\p{Mc}/u.test(nextSegment.value.segment)) {
186+
while (!nextSegment.done && (/^\p{Mc}/u.test(nextSegment.value.segment) || doubleWidthDiacritics.indexOf(baseSegment.value.segment.at(-1).codePointAt(0)) !== -1)) {
18187
baseSegment.value.segment += nextSegment.value.segment;
19188
segment = segments.next();
20189
nextSegment = nextSegments.next();

0 commit comments

Comments
 (0)