@@ -4,6 +4,175 @@ import {unicodeBlockLookup as isChar} from './is_char_in_unicode_block';
4
4
5
5
const segmenter = new Intl . Segmenter ( ) ;
6
6
7
/**
 * Code points of combining marks that `splitByGraphemeCluster` looks up
 * (via `indexOf`) when deciding whether the following segment should be
 * appended to the current base segment.
 *
 * The list is kept in ascending code point order and contains no duplicates.
 * Each entry is annotated with its Unicode character name.
 *
 * NOTE(review): despite the identifier, only U+035C–U+0362 are "combining
 * double" diacritics; the remaining entries are vowel signs, viramas, tone
 * marks and subjoined letters from Indic, Tibetan and Southeast Asian
 * scripts. Confirm the intended selection criterion with the author.
 */
const doubleWidthDiacritics: readonly number[] = [
    0x035C, // Combining Double Breve Below
    0x035D, // Combining Double Breve
    0x035E, // Combining Double Macron
    0x035F, // Combining Double Macron Below
    0x0360, // Combining Double Tilde
    0x0361, // Combining Double Inverted Breve
    0x0362, // Combining Double Rightwards Arrow Below
    0x0955, // Devanagari Vowel Sign Candra Long E
    0x0956, // Devanagari Vowel Sign Ue
    0x0957, // Devanagari Vowel Sign Uue
    0x0A01, // Gurmukhi Sign Adak Bindi
    0x0A51, // Gurmukhi Sign Udaat
    0x0A75, // Gurmukhi Sign Yakash
    0x0AE2, // Gujarati Vowel Sign Vocalic L
    0x0AE3, // Gujarati Vowel Sign Vocalic Ll
    0x0B3F, // Oriya Vowel Sign I
    0x0B41, // Oriya Vowel Sign U
    0x0B42, // Oriya Vowel Sign Uu
    0x0B43, // Oriya Vowel Sign Vocalic R
    0x0B44, // Oriya Vowel Sign Vocalic Rr
    0x0B4D, // Oriya Sign Virama
    0x0B55, // Oriya Sign Overline
    0x0B56, // Oriya Ai Length Mark
    0x0B62, // Oriya Vowel Sign Vocalic L
    0x0B63, // Oriya Vowel Sign Vocalic Ll
    0x0C48, // Telugu Vowel Sign Ai
    0x0C81, // Kannada Sign Candrabindu
    0x0CBC, // Kannada Sign Nukta
    0x0CBF, // Kannada Vowel Sign I
    0x0CC6, // Kannada Vowel Sign E
    0x0CCC, // Kannada Vowel Sign Au
    0x0CCD, // Kannada Sign Virama
    0x0CE2, // Kannada Vowel Sign Vocalic L
    0x0CE3, // Kannada Vowel Sign Vocalic Ll
    0x0D41, // Malayalam Vowel Sign U
    0x0D42, // Malayalam Vowel Sign Uu
    0x0D43, // Malayalam Vowel Sign Vocalic R
    0x0D44, // Malayalam Vowel Sign Vocalic Rr
    0x0D4D, // Malayalam Sign Virama
    0x0DCA, // Sinhala Sign Al-Lakuna
    0x0DD2, // Sinhala Vowel Sign Ketti Is-Pilla
    0x0DD3, // Sinhala Vowel Sign Diga Is-Pilla
    0x0DD4, // Sinhala Vowel Sign Ketti Paa-Pilla
    0x0DD6, // Sinhala Vowel Sign Diga Paa-Pilla
    0x0E31, // Thai Character Mai Han-Akat
    0x0E34, // Thai Character Sara I
    0x0E35, // Thai Character Sara Ii
    0x0E36, // Thai Character Sara Ue
    0x0E37, // Thai Character Sara Uee
    0x0E38, // Thai Character Sara U
    0x0E39, // Thai Character Sara Uu
    0x0E3A, // Thai Character Phinthu
    0x0E47, // Thai Character Maitaikhu
    0x0E48, // Thai Character Mai Ek
    0x0E49, // Thai Character Mai Tho
    0x0E4A, // Thai Character Mai Tri
    0x0E4B, // Thai Character Mai Chattawa
    0x0E4C, // Thai Character Thanthakhat
    0x0E4D, // Thai Character Nikhahit
    0x0E4E, // Thai Character Yamakkan
    0x0F71, // Tibetan Vowel Sign Aa
    0x0F73, // Tibetan Vowel Sign Ii
    0x0F74, // Tibetan Vowel Sign U
    0x0F75, // Tibetan Vowel Sign Uu
    0x0F76, // Tibetan Vowel Sign Vocalic R
    0x0F77, // Tibetan Vowel Sign Vocalic Rr
    0x0F78, // Tibetan Vowel Sign Vocalic L
    0x0F79, // Tibetan Vowel Sign Vocalic Ll
    0x0F81, // Tibetan Vowel Sign Reversed Ii
    0x0F8D, // Tibetan Subjoined Sign Lce Tsa Can
    0x0F8E, // Tibetan Subjoined Sign Mchu Can
    0x0F8F, // Tibetan Subjoined Sign Inverted Mchu Can
    0x0F90, // Tibetan Subjoined Letter Ka
    0x0F91, // Tibetan Subjoined Letter Kha
    0x0F92, // Tibetan Subjoined Letter Ga
    0x0F93, // Tibetan Subjoined Letter Gha
    0x0F94, // Tibetan Subjoined Letter Nga
    0x0F95, // Tibetan Subjoined Letter Ca
    0x0F96, // Tibetan Subjoined Letter Cha
    0x0F97, // Tibetan Subjoined Letter Ja
    0x0F99, // Tibetan Subjoined Letter Nya
    0x0F9A, // Tibetan Subjoined Letter Tta
    0x0F9B, // Tibetan Subjoined Letter Ttha
    0x0F9C, // Tibetan Subjoined Letter Dda
    0x0F9D, // Tibetan Subjoined Letter Ddha
    0x0F9E, // Tibetan Subjoined Letter Nna
    0x0F9F, // Tibetan Subjoined Letter Ta
    0x0FA0, // Tibetan Subjoined Letter Tha
    0x0FA1, // Tibetan Subjoined Letter Da
    0x0FA2, // Tibetan Subjoined Letter Dha
    0x0FA3, // Tibetan Subjoined Letter Na
    0x0FA4, // Tibetan Subjoined Letter Pa
    0x0FA5, // Tibetan Subjoined Letter Pha
    0x0FA6, // Tibetan Subjoined Letter Ba
    0x0FA7, // Tibetan Subjoined Letter Bha
    0x0FA8, // Tibetan Subjoined Letter Ma
    0x0FA9, // Tibetan Subjoined Letter Tsa
    0x0FAA, // Tibetan Subjoined Letter Tsha
    0x0FAB, // Tibetan Subjoined Letter Dza
    0x0FAC, // Tibetan Subjoined Letter Dzha
    0x0FAD, // Tibetan Subjoined Letter Wa
    0x0FAE, // Tibetan Subjoined Letter Zha
    0x0FAF, // Tibetan Subjoined Letter Za
    0x0FB0, // Tibetan Subjoined Letter -A
    0x0FB1, // Tibetan Subjoined Letter Ya
    0x0FB2, // Tibetan Subjoined Letter Ra
    0x0FB3, // Tibetan Subjoined Letter La
    0x0FB4, // Tibetan Subjoined Letter Sha
    0x0FB5, // Tibetan Subjoined Letter Ssa
    0x0FB6, // Tibetan Subjoined Letter Sa
    0x0FB7, // Tibetan Subjoined Letter Ha
    0x0FB8, // Tibetan Subjoined Letter A
    0x0FB9, // Tibetan Subjoined Letter Kssa
    0x0FBA, // Tibetan Subjoined Letter Fixed-Form Wa
    0x0FBB, // Tibetan Subjoined Letter Fixed-Form Ya
    0x0FBC, // Tibetan Subjoined Letter Fixed-Form Ra
    0x102D, // Myanmar Vowel Sign I
    0x102E, // Myanmar Vowel Sign Ii
    0x102F, // Myanmar Vowel Sign U
    0x1030, // Myanmar Vowel Sign Uu
    0x1032, // Myanmar Vowel Sign Ai
    0x1033, // Myanmar Vowel Sign Mon Ii
    0x1034, // Myanmar Vowel Sign Mon O
    0x1035, // Myanmar Vowel Sign E Above
    0x1036, // Myanmar Sign Anusvara
    0x1037, // Myanmar Sign Dot Below
    0x1039, // Myanmar Sign Virama
    0x103A, // Myanmar Sign Asat
    0x103D, // Myanmar Consonant Sign Medial Wa
    0x103E, // Myanmar Consonant Sign Medial Ha
    0x1058, // Myanmar Vowel Sign Vocalic L
    0x1059, // Myanmar Vowel Sign Vocalic Ll
    0x105E, // Myanmar Consonant Sign Mon Medial Na
    0x105F, // Myanmar Consonant Sign Mon Medial Ma
    0x1060, // Myanmar Consonant Sign Mon Medial La
    0x1071, // Myanmar Vowel Sign Geba Karen I
    0x1072, // Myanmar Vowel Sign Kayah Oe
    0x1073, // Myanmar Vowel Sign Kayah U
    0x1074, // Myanmar Vowel Sign Kayah Ee
    0x1082, // Myanmar Consonant Sign Shan Medial Wa
    0x1085, // Myanmar Vowel Sign Shan E Above
    0x1086, // Myanmar Vowel Sign Shan Final Y
    0x108D, // Myanmar Sign Shan Council Emphatic Tone
    0x109D, // Myanmar Vowel Sign Aiton Ai
    0x1732, // Hanunoo Vowel Sign I
    0x1733, // Hanunoo Vowel Sign U
    0x1734, // Hanunoo Sign Pamudpod
    0x1772, // Tagbanwa Vowel Sign I
    0x1773, // Tagbanwa Vowel Sign U
    0x17B7, // Khmer Vowel Sign I
    0x17B8, // Khmer Vowel Sign Ii
    0x17B9, // Khmer Vowel Sign Y
    0x17BA, // Khmer Vowel Sign Yy
    0x17BB, // Khmer Vowel Sign U
    0x17BC, // Khmer Vowel Sign Uu
    0x17BD, // Khmer Vowel Sign Ua
    0x17C6, // Khmer Sign Nikahit
    0x17CB, // Khmer Sign Bantoc
    0x17CD, // Khmer Sign Toandakhiat
    0x17CE, // Khmer Sign Kakabat
    0x17CF, // Khmer Sign Ahsda
    0x17D0, // Khmer Sign Samyok Sannya
    0x17D1, // Khmer Sign Viriam
    0x17D2, // Khmer Sign Coeng
    0x17D3, // Khmer Sign Bathamasat
    0x17DD, // Khmer Sign Atthacan
];
175
+
7
176
export function splitByGraphemeCluster ( text : string ) {
8
177
const segments = segmenter . segment ( text ) [ Symbol . iterator ] ( ) ;
9
178
let segment = segments . next ( ) ;
@@ -14,7 +183,7 @@ export function splitByGraphemeCluster(text: string) {
14
183
const baseSegments = [ ] ;
15
184
while ( ! segment . done ) {
16
185
const baseSegment = segment ;
17
- while ( ! nextSegment . done && / ^ \p{ Mc} / u. test ( nextSegment . value . segment ) ) {
186
+ while ( ! nextSegment . done && ( / ^ \p{ Mc} / u. test ( nextSegment . value . segment ) || doubleWidthDiacritics . indexOf ( baseSegment . value . segment . at ( - 1 ) . codePointAt ( 0 ) ) !== - 1 ) ) {
18
187
baseSegment . value . segment += nextSegment . value . segment ;
19
188
segment = segments . next ( ) ;
20
189
nextSegment = nextSegments . next ( ) ;
0 commit comments