-
Notifications
You must be signed in to change notification settings - Fork 0
/
options.go
409 lines (402 loc) · 16.9 KB
/
options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
package kana
import (
"strconv"
"strings"
)
// ConvertOptions describes options for [Convert].
//
// It is a set of bit flags: every option constant occupies a single bit,
// so several options are combined with the bitwise OR operator (|).
type ConvertOptions int
const (
// HalfwidthToWide converts characters in halfwidth forms
// to their ordinary, wide versions.
//
// The characters having East_Asian_Width property value of
// H (East Asian Halfwidth) except U+20A9 WON SIGN (₩) are converted.
// That is:
//
//   - U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP (。) to U+FFBE HALFWIDTH HANGUL LETTER HIEUH (ᄒ)
//   - U+FFC2 HALFWIDTH HANGUL LETTER A (ᅡ) to U+FFC7 HALFWIDTH HANGUL LETTER E (ᅦ)
//   - U+FFCA HALFWIDTH HANGUL LETTER YEO (ᅧ) to U+FFCF HALFWIDTH HANGUL LETTER OE (ᅬ)
//   - U+FFD2 HALFWIDTH HANGUL LETTER YO (ᅭ) to U+FFD7 HALFWIDTH HANGUL LETTER YU (ᅲ)
//   - U+FFDA HALFWIDTH HANGUL LETTER EU (ᅳ) to U+FFDC HALFWIDTH HANGUL LETTER I (ᅵ)
//   - U+FFE8 HALFWIDTH FORMS LIGHT VERTICAL (│) to U+FFEE HALFWIDTH WHITE CIRCLE (○)
//
// The conversion is roughly equivalent to NFKC but with some differences:
//
//   - Halfwidth Hangul letters are not fully normalized and instead
//     converted to the corresponding letters
//     in Hangul Compatibility Jamo block.
//
// The following compat flags affect the behavior of this transformation:
//
//   - [CompatVoicedSoundMarks]
//   - [CompatVoicedKanaRestriction]
//   - [CompatKeepHalfwidthHangul]
//   - [CompatKeepHalfwidthSymbols]
HalfwidthToWide ConvertOptions = 1 << iota
// FullwidthToNarrow converts characters in fullwidth forms
// to their ordinary, narrow versions.
//
// The characters having East_Asian_Width property value of
// F (East Asian Fullwidth) are converted.
// That is:
//
//   - U+FF01 FULLWIDTH EXCLAMATION MARK (!) to U+FF60 FULLWIDTH RIGHT WHITE PARENTHESIS (⦆)
//   - U+FFE0 FULLWIDTH CENT SIGN (¢) to U+FFE6 FULLWIDTH WON SIGN (₩)
//
// The conversion is roughly equivalent to NFKC but with some differences:
//
//   - U+FFE3 FULLWIDTH MACRON ( ̄) is not fully normalized and instead
//     converted to U+00AF MACRON (¯).
//
// The following compat flags affect the behavior of this transformation:
//
//   - [CompatQuotes]
//   - [CompatMinus]
//   - [CompatOverline]
//   - [CompatCurrency]
//   - [CompatBrackets]
//   - [CompatKeepSpaces]
//   - [CompatDoubleSpaces]
FullwidthToNarrow
// KatakanaToHiragana converts katakana to hiragana.
//
// Consider it transformation from Script=Katakana to Script=Hiragana,
// but there are a lot of exceptions.
//
// The following characters are converted to a single hiragana character:
//
//   - U+30A1 KATAKANA LETTER SMALL A (ァ) to U+30F6 KATAKANA LETTER SMALL KE (ヶ)
//   - U+30FD KATAKANA ITERATION MARK (ヽ) to U+30FE KATAKANA VOICED ITERATION MARK (ヾ)
//   - U+1B155 KATAKANA LETTER SMALL KO (𛅕)
//   - U+1B164 KATAKANA LETTER SMALL WI (𛅤) to U+1B166 KATAKANA LETTER SMALL WO (𛅦)
//
// The following characters are converted to a sequence of characters:
//
//   - U+30F7 KATAKANA LETTER VA (ヷ) to U+30FA KATAKANA LETTER VO (ヺ)
//
// The following characters are not converted:
//
//   - U+30FF KATAKANA DIGRAPH KOTO (ヿ)
//   - U+31F0 KATAKANA LETTER SMALL KU (ㇰ) to U+31FF KATAKANA LETTER SMALL RO (ㇿ)
//   - U+32D0 CIRCLED KATAKANA A (㋐) to U+32FE CIRCLED KATAKANA WO (㋾)
//   - U+3300 SQUARE APAATO (㌀) to U+3357 SQUARE WATTO (㍗)
//   - U+1AFF0 KATAKANA LETTER MINNAN TONE-2 (𚿰) to U+1AFF3 KATAKANA LETTER MINNAN TONE-5 (𚿳)
//   - U+1AFF5 KATAKANA LETTER MINNAN TONE-7 (𚿵) to U+1AFFB KATAKANA LETTER MINNAN NASALIZED TONE-8 (𚿻)
//   - U+1B000 KATAKANA LETTER ARCHAIC E (𛀀)
//   - U+1B120 KATAKANA LETTER ARCHAIC YI (𛄠) to U+1B122 KATAKANA LETTER ARCHAIC WU (𛄢)
//   - U+1B167 KATAKANA LETTER SMALL N (𛅧)
//
// You need [HalfwidthToWide] to convert the following characters to hiragana:
//
//   - U+FF66 HALFWIDTH KATAKANA LETTER WO (ヲ) to U+FF6F HALFWIDTH KATAKANA LETTER SMALL TU (ッ)
//   - U+FF71 HALFWIDTH KATAKANA LETTER A (ア) to U+FF9D HALFWIDTH KATAKANA LETTER N (ン)
//
// The following compat flags affect the behavior of this transformation:
//
//   - [CompatKanaRestriction]
KatakanaToHiragana
// HiraganaToKatakana converts hiragana to katakana.
//
// Consider it transformation from Script=Hiragana to Script=Katakana,
// but there are a lot of exceptions.
//
// The following characters are converted to a single katakana character:
//
//   - U+3041 HIRAGANA LETTER SMALL A (ぁ) to U+3096 HIRAGANA LETTER SMALL KE (ゖ)
//   - U+309D HIRAGANA ITERATION MARK (ゝ) to U+309E HIRAGANA VOICED ITERATION MARK (ゞ)
//   - U+1B132 HIRAGANA LETTER SMALL KO (𛄲)
//   - U+1B150 HIRAGANA LETTER SMALL WI (𛅐) to U+1B152 HIRAGANA LETTER SMALL WO (𛅒)
//
// The following characters are not converted:
//
//   - U+309F HIRAGANA DIGRAPH YORI (ゟ)
//   - U+1B001 HIRAGANA LETTER ARCHAIC YE (𛀁) to U+1B11F HIRAGANA LETTER ARCHAIC WU (𛄟)
//   - U+1F200 SQUARE HIRAGANA HOKA (🈀)
//
// The following compat flags affect the behavior of this transformation:
//
//   - [CompatKanaRestriction]
HiraganaToKatakana
// CompatWideKatakanaToHalfwidth converts ordinary katakana
// to their halfwidth forms.
//
// This transformation newly introduces compatibility characters
// rather than reducing them in the input string.
// This is against what Unicode intends to do. Therefore,
// the entire transformation mode is considered as a compatibility option.
//
// If you want to normalize between fullwidth and halfwidth katakana,
// you should use [HalfwidthToWide] instead.
//
// The following characters are converted:
//
//   - U+3001 IDEOGRAPHIC COMMA (、) to U+3002 IDEOGRAPHIC FULL STOP (。)
//   - U+300C LEFT CORNER BRACKET (「) to U+300D RIGHT CORNER BRACKET (」)
//   - U+3099 COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK to U+309C KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK (゜)
//   - U+30A1 KATAKANA LETTER SMALL A (ァ) to U+30ED KATAKANA LETTER RO (ロ)
//   - U+30EF KATAKANA LETTER WA (ワ)
//   - U+30F2 KATAKANA LETTER WO (ヲ) to U+30F4 KATAKANA LETTER VU (ヴ)
//   - U+30FB KATAKANA MIDDLE DOT (・) to U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK (ー)
//
// When a character in the list canonically decomposes to a base character
// and a combining voiced or semi-voiced sound mark, the transformation
// is applied after decomposing the character.
//
// Note that U+30F7 KATAKANA LETTER VA (ヷ) and U+30FA KATAKANA LETTER VO (ヺ)
// can also be transformed this way, but they are not included in the list.
// This is because the entire transformation exists for compatibility
// with NKF.
//
// Like other compat options, this is not stable under canonical equivalence.
CompatWideKatakanaToHalfwidth
// CompatQuotes is a compatibility option
// to reproduce NKF's behavior for quotes.
//
// Specifically, the following transformations are additionally applied
// in [FullwidthToNarrow]:
//
//   - U+00B4 ACUTE ACCENT (´) → U+0027 APOSTROPHE (')
//   - U+2018 LEFT SINGLE QUOTATION MARK (‘) → U+0060 GRAVE ACCENT (`)
//   - U+2019 RIGHT SINGLE QUOTATION MARK (’) → U+0027 APOSTROPHE (')
//   - U+201C LEFT DOUBLE QUOTATION MARK (“) → U+0022 QUOTATION MARK (")
//   - U+201D RIGHT DOUBLE QUOTATION MARK (”) → U+0022 QUOTATION MARK (")
//
// While the following transformations are inhibited in [FullwidthToNarrow]:
//
//   - U+FF02 FULLWIDTH QUOTATION MARK (")
//     (usually converted to U+0022 QUOTATION MARK ("))
//   - U+FF07 FULLWIDTH APOSTROPHE (')
//     (usually converted to U+0027 APOSTROPHE ('))
CompatQuotes
// CompatMinus is a compatibility option
// to reproduce NKF's behavior for minus signs, hyphens, and similar symbols.
//
// Specifically, the following transformations are applied:
//
//   - U+2015 HORIZONTAL BAR (―) → U+2014 EM DASH (—)
//   - U+FF0D FULLWIDTH HYPHEN-MINUS (-) → U+2212 MINUS SIGN (−)
//
// and the following transformations are additionally applied
// in [FullwidthToNarrow]:
//
//   - U+2014 EM DASH (—) → U+002D HYPHEN-MINUS (-)
//   - U+2015 HORIZONTAL BAR (―) → U+002D HYPHEN-MINUS (-)
//   - U+2212 MINUS SIGN (−) → U+002D HYPHEN-MINUS (-)
//   - U+FF0D FULLWIDTH HYPHEN-MINUS (-) → U+002D HYPHEN-MINUS (-)
CompatMinus
// CompatOverline is a compatibility option
// to reproduce NKF's behavior for overlines and similar symbols.
//
// Specifically, the following transformations are applied:
//
//   - U+FFE3 FULLWIDTH MACRON ( ̄) → U+203E OVERLINE (‾), which wins over
//     [FullwidthToNarrow], where it is converted to U+00AF MACRON (¯).
//
// Additionally, the following transformations are inhibited in
// [FullwidthToNarrow]:
//
//   - U+FF5E FULLWIDTH TILDE (~)
//     (usually converted to U+007E TILDE (~))
CompatOverline
// CompatCurrency is a compatibility option
// to reproduce NKF's behavior for currency symbols.
//
// Specifically, the following transformations are applied regardless of
// [FullwidthToNarrow]:
//
//   - U+FFE0 FULLWIDTH CENT SIGN (¢) → U+00A2 CENT SIGN (¢)
//   - U+FFE1 FULLWIDTH POUND SIGN (£) → U+00A3 POUND SIGN (£)
//   - U+FFE5 FULLWIDTH YEN SIGN (¥) → U+00A5 YEN SIGN (¥)
//
// and the following transformations are inhibited in [FullwidthToNarrow]:
//
//   - U+FFE6 FULLWIDTH WON SIGN (₩)
//     (usually converted to U+20A9 WON SIGN (₩))
CompatCurrency
// CompatBrackets is a compatibility option
// to reproduce NKF's behavior for brackets and parentheses.
//
// Specifically, the following transformations are additionally applied
// in [FullwidthToNarrow]:
//
//   - U+3008 LEFT ANGLE BRACKET (〈) → U+003C LESS-THAN SIGN (<)
//   - U+3009 RIGHT ANGLE BRACKET (〉) → U+003E GREATER-THAN SIGN (>)
//
// while the following transformations are inhibited
// in [FullwidthToNarrow]:
//
//   - U+FF5F FULLWIDTH LEFT WHITE PARENTHESIS (⦅)
//     (usually converted to U+2985 LEFT WHITE PARENTHESIS (⦅))
//   - U+FF60 FULLWIDTH RIGHT WHITE PARENTHESIS (⦆)
//     (usually converted to U+2986 RIGHT WHITE PARENTHESIS (⦆))
CompatBrackets
// CompatOtherSymbols is a compatibility option
// to reproduce NKF's behavior for miscellaneous symbols.
//
// Specifically, the following transformations are applied regardless of
// [FullwidthToNarrow]:
//
//   - U+FFE2 FULLWIDTH NOT SIGN (¬) → U+00AC NOT SIGN (¬)
//   - U+FFE4 FULLWIDTH BROKEN BAR (¦) → U+00A6 BROKEN BAR (¦)
//
// and the following transformations are also applied regardless of
// [FullwidthToNarrow]:
//
//   - U+2225 PARALLEL TO (∥) → U+2016 DOUBLE VERTICAL LINE (‖)
CompatOtherSymbols
// CompatKeepSpaces is a compatibility option
// to reproduce NKF's behavior for Ideographic Spaces.
//
// Specifically, the following transformations are inhibited in
// [FullwidthToNarrow]:
//
//   - U+3000 IDEOGRAPHIC SPACE ( )
//     (usually converted to U+0020 SPACE ( ))
CompatKeepSpaces
// CompatDoubleSpaces is a compatibility option
// to reproduce NKF's behavior for Ideographic Spaces.
//
// Specifically, if this option is present along with [FullwidthToNarrow],
// U+3000 IDEOGRAPHIC SPACE ( ) is converted to two U+0020 SPACE ( ) characters.
CompatDoubleSpaces
// CompatVoicedSoundMarks is a compatibility option
// to reproduce NKF's behavior for voiced and semi-voiced sound marks.
//
// Specifically, the following transformations are applied in [HalfwidthToWide]:
//
//   - U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK (゙) is converted to
//     U+309B KATAKANA-HIRAGANA VOICED SOUND MARK rather than
//     U+3099 COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK, except when
//     it follows ウ, カ, キ, ク, ケ, コ, サ, シ, ス, セ, ソ, タ, チ, ツ, テ, ト, ハ, ヒ, フ,
//     ヘ, or ホ.
//   - U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK (゚) is converted to
//     U+309C KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK rather than
//     U+309A COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK, except
//     when it follows ハ, ヒ, フ, ヘ, or ホ.
CompatVoicedSoundMarks
// CompatVoicedKanaRestriction is a compatibility option
// to reproduce NKF's behavior for voiced halfwidth katakana letters.
//
// Specifically, the following characters are transformed
// differently in [HalfwidthToWide]:
//
//   - U+FF66 HALFWIDTH KATAKANA LETTER WO (ヲ) followed by
//     U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK (゙) is converted to
//     U+30F2 KATAKANA LETTER WO (ヲ) followed by
//     U+309B KATAKANA-HIRAGANA VOICED SOUND MARK (゛), rather than
//     U+30FA KATAKANA LETTER VO (ヺ).
//   - U+FF9C HALFWIDTH KATAKANA LETTER WA (ワ) followed by
//     U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK (゙) is converted to
//     U+30EF KATAKANA LETTER WA (ワ) followed by
//     U+309B KATAKANA-HIRAGANA VOICED SOUND MARK (゛), rather than
//     U+30F7 KATAKANA LETTER VA (ヷ).
CompatVoicedKanaRestriction
// CompatKeepHalfwidthHangul is a compatibility option
// to reproduce NKF's behavior for halfwidth Hangul letters.
//
// Specifically, the following characters are kept intact
// in [HalfwidthToWide]:
//
//   - U+FFA0 HALFWIDTH HANGUL FILLER (ᅠ)
//   - U+FFA1 HALFWIDTH HANGUL LETTER KIYEOK (ᄀ) to U+FFBE HALFWIDTH HANGUL LETTER HIEUH (ᄒ)
//   - U+FFC2 HALFWIDTH HANGUL LETTER A (ᅡ) to U+FFC7 HALFWIDTH HANGUL LETTER E (ᅦ)
//   - U+FFCA HALFWIDTH HANGUL LETTER YEO (ᅧ) to U+FFCF HALFWIDTH HANGUL LETTER OE (ᅬ)
//   - U+FFD2 HALFWIDTH HANGUL LETTER YO (ᅭ) to U+FFD7 HALFWIDTH HANGUL LETTER YU (ᅲ)
//   - U+FFDA HALFWIDTH HANGUL LETTER EU (ᅳ) to U+FFDC HALFWIDTH HANGUL LETTER I (ᅵ)
CompatKeepHalfwidthHangul
// CompatKeepHalfwidthSymbols is a compatibility option
// to reproduce NKF's behavior for halfwidth symbols.
//
// Specifically, the following characters are kept intact
// in [HalfwidthToWide]:
//
//   - U+FFE8 HALFWIDTH FORMS LIGHT VERTICAL (│)
//   - U+FFE9 HALFWIDTH LEFTWARDS ARROW (←)
//   - U+FFEA HALFWIDTH UPWARDS ARROW (↑)
//   - U+FFEB HALFWIDTH RIGHTWARDS ARROW (→)
//   - U+FFEC HALFWIDTH DOWNWARDS ARROW (↓)
//   - U+FFED HALFWIDTH BLACK SQUARE (■)
//   - U+FFEE HALFWIDTH WHITE CIRCLE (○)
CompatKeepHalfwidthSymbols
// CompatKanaRestriction is a compatibility option
// to reproduce NKF's behavior for hiragana and katakana.
//
// Specifically, the following characters are kept intact
// in [KatakanaToHiragana]:
//
//   - U+30F5 KATAKANA LETTER SMALL KA (ヵ)
//   - U+30F6 KATAKANA LETTER SMALL KE (ヶ)
//   - U+30F7 KATAKANA LETTER VA (ヷ)
//   - U+30F8 KATAKANA LETTER VI (ヸ)
//   - U+30F9 KATAKANA LETTER VE (ヹ)
//   - U+30FA KATAKANA LETTER VO (ヺ)
//   - U+1B155 KATAKANA LETTER SMALL KO (𛅕)
//   - U+1B164 KATAKANA LETTER SMALL WI (𛅤)
//   - U+1B165 KATAKANA LETTER SMALL WE (𛅥)
//   - U+1B166 KATAKANA LETTER SMALL WO (𛅦)
//
// and the following characters are kept intact in [HiraganaToKatakana]:
//
//   - U+3095 HIRAGANA LETTER SMALL KA (ゕ)
//   - U+3096 HIRAGANA LETTER SMALL KE (ゖ)
//   - U+1B132 HIRAGANA LETTER SMALL KO (𛄲)
//   - U+1B150 HIRAGANA LETTER SMALL WI (𛅐)
//   - U+1B151 HIRAGANA LETTER SMALL WE (𛅑)
//   - U+1B152 HIRAGANA LETTER SMALL WO (𛅒)
CompatKanaRestriction
)
// Normalize returns a copy of o in which every compat flag is cleared
// when the transformation flag it is tied to is not set:
//
//   - without [FullwidthToNarrow]: [CompatQuotes], [CompatBrackets],
//     [CompatKeepSpaces] and [CompatDoubleSpaces] are cleared;
//   - with [FullwidthToNarrow] and [CompatKeepSpaces]:
//     [CompatDoubleSpaces] is cleared, so keeping spaces takes precedence;
//   - without [HalfwidthToWide]: [CompatVoicedSoundMarks],
//     [CompatVoicedKanaRestriction], [CompatKeepHalfwidthHangul] and
//     [CompatKeepHalfwidthSymbols] are cleared;
//   - without both [KatakanaToHiragana] and [HiraganaToKatakana]:
//     [CompatKanaRestriction] is cleared.
//
// All other bits are returned unchanged.
func (o ConvertOptions) Normalize() ConvertOptions {
	const (
		narrowOnly = CompatQuotes | CompatBrackets | CompatKeepSpaces | CompatDoubleSpaces
		wideOnly   = CompatVoicedSoundMarks | CompatVoicedKanaRestriction |
			CompatKeepHalfwidthHangul | CompatKeepHalfwidthSymbols
		kanaModes = KatakanaToHiragana | HiraganaToKatakana
	)

	// Collect everything to clear first, then clear it in one step.
	var drop ConvertOptions

	switch {
	case o&FullwidthToNarrow == 0:
		// narrowOnly already contains CompatDoubleSpaces, so the
		// keep-vs-double conflict needs no separate handling here.
		drop |= narrowOnly
	case o&CompatKeepSpaces != 0:
		drop |= CompatDoubleSpaces
	}

	if o&HalfwidthToWide == 0 {
		drop |= wideOnly
	}
	if o&kanaModes == 0 {
		drop |= CompatKanaRestriction
	}

	return o &^ drop
}
// flagNames lists every known option together with its Go identifier,
// in declaration order. String walks the table from top to bottom, so
// this order is also the order of the names in its output.
//
// An entry matches a value v when v&mask == flag. Every entry here has
// mask equal to flag, so each name corresponds to exactly one bit.
var flagNames = []struct {
	name       string
	flag, mask ConvertOptions
}{
	{name: "HalfwidthToWide", flag: HalfwidthToWide, mask: HalfwidthToWide},
	{name: "FullwidthToNarrow", flag: FullwidthToNarrow, mask: FullwidthToNarrow},
	{name: "KatakanaToHiragana", flag: KatakanaToHiragana, mask: KatakanaToHiragana},
	{name: "HiraganaToKatakana", flag: HiraganaToKatakana, mask: HiraganaToKatakana},
	{name: "CompatWideKatakanaToHalfwidth", flag: CompatWideKatakanaToHalfwidth, mask: CompatWideKatakanaToHalfwidth},
	{name: "CompatQuotes", flag: CompatQuotes, mask: CompatQuotes},
	{name: "CompatMinus", flag: CompatMinus, mask: CompatMinus},
	{name: "CompatOverline", flag: CompatOverline, mask: CompatOverline},
	{name: "CompatCurrency", flag: CompatCurrency, mask: CompatCurrency},
	{name: "CompatBrackets", flag: CompatBrackets, mask: CompatBrackets},
	{name: "CompatOtherSymbols", flag: CompatOtherSymbols, mask: CompatOtherSymbols},
	{name: "CompatKeepSpaces", flag: CompatKeepSpaces, mask: CompatKeepSpaces},
	{name: "CompatDoubleSpaces", flag: CompatDoubleSpaces, mask: CompatDoubleSpaces},
	{name: "CompatVoicedSoundMarks", flag: CompatVoicedSoundMarks, mask: CompatVoicedSoundMarks},
	{name: "CompatVoicedKanaRestriction", flag: CompatVoicedKanaRestriction, mask: CompatVoicedKanaRestriction},
	{name: "CompatKeepHalfwidthHangul", flag: CompatKeepHalfwidthHangul, mask: CompatKeepHalfwidthHangul},
	{name: "CompatKeepHalfwidthSymbols", flag: CompatKeepHalfwidthSymbols, mask: CompatKeepHalfwidthSymbols},
	{name: "CompatKanaRestriction", flag: CompatKanaRestriction, mask: CompatKanaRestriction},
}
// String returns a human-readable form of o.
//
// The names of all flags from flagNames that are set in o are joined
// with " | ", in table order. Any bits left over that have no name are
// appended as a single hexadecimal literal with a "0x" prefix. A value
// with no bits set is rendered as "0".
func (o ConvertOptions) String() string {
	var names []string
	rest := o
	for _, n := range flagNames {
		if rest&n.mask == n.flag {
			names = append(names, n.name)
			// Remove the matched bits so that only unnamed bits remain.
			rest &^= n.mask
		}
	}

	switch {
	case rest != 0:
		// Format the leftover bits as an unsigned bit pattern. A signed
		// conversion would print a value with the sign bit set as the
		// malformed literal "0x-...". Going through uint keeps the
		// platform's native int width instead of sign-extending to 64 bits.
		names = append(names, "0x"+strconv.FormatUint(uint64(uint(rest)), 16))
	case len(names) == 0:
		return "0"
	}
	return strings.Join(names, " | ")
}