-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgenerate_unicode_ranges.py
81 lines (64 loc) · 1.92 KB
/
generate_unicode_ranges.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# coding: utf-8
import sys
import unicodedata
# Run-length encoded code-point tables, filled in by add_to_range().
# Each entry is either [code_point] for an isolated character or
# [first, last] (both ends inclusive) for a run of consecutive code points.
ideograph_range = []  # characters whose Unicode name contains 'IDEOGRAPH'
hangul_range = []  # characters whose Unicode name contains 'HANGUL'
hiragana_range = []  # characters whose Unicode name contains 'HIRAGANA'
katakana_range = []  # characters whose Unicode name contains 'KATAKANA'
combined_range = []  # characters that matched at least one of the above
def add_to_range(range, index):
    """Record code point *index* in the run-length list *range*, in place.

    *range* is a list whose entries are ``[code_point]`` (a single value) or
    ``[first, last]`` (an inclusive run).  When *index* directly follows the
    end of the most recent entry, that entry is extended; otherwise a new
    single-value entry is appended.  Nothing is returned.
    """
    tail = range[-1] if range else None

    # Start a fresh entry when there is nothing to extend or when *index*
    # is not the immediate successor of the last recorded code point.
    if tail is None or tail[-1] != index - 1:
        range.append([index])
        return

    if len(tail) == 1:
        # [n] followed by n + 1 becomes the pair [n, n + 1].
        tail.append(index)
    else:
        # [first, last] just moves its inclusive upper bound forward.
        tail[-1] = index
def count_chars(range):
    """Return how many code points the run-length list *range* covers.

    A one-element entry counts as a single code point; a two-element entry
    ``[first, last]`` is inclusive at both ends and counts
    ``last - first + 1`` code points.  An empty list yields 0.
    """
    return sum(
        1 if len(span) == 1 else span[1] - span[0] + 1
        for span in range
    )
def print_range(range):
    """Print every character covered by the run-length list *range*.

    Each entry of *range* is either ``[code_point]`` or an inclusive
    ``[first, last]`` pair.  For every covered code point one line of the
    form ``<decimal code point> <character> <Unicode name>`` is written to
    ``sys.stdout`` (the character is UTF-8 encoded on Python 2).

    The upper bound of a pair is included in the output, matching how
    add_to_range() builds the entries and how count_chars() counts them;
    the previous implementation stopped one code point short of it.

    Raises:
        ValueError: if a covered code point has no Unicode name.
    """
    # ``unichr`` only exists on Python 2; ``chr`` is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    for item in range:
        # item[-1] is the inclusive upper bound for both entry shapes.
        # A while loop is used because the parameter shadows builtin range.
        code_point, last = item[0], item[-1]
        while code_point <= last:
            char = to_char(code_point)
            line = u'%d %s %s\n' % (code_point, char, unicodedata.name(char))
            if not isinstance(line, str):
                # Python 2: ``line`` is unicode; write it as UTF-8 bytes.
                line = line.encode('utf8')
            sys.stdout.write(line)
            code_point += 1
# Walk every code point this interpreter can represent, starting at 32, and
# sort the named characters into the script tables by keywords in their
# Unicode names.  ``sys.maxunicode`` is the largest value unichr() accepts,
# so no sentinel upper bound or ValueError-driven break is needed.
for index in xrange(32, sys.maxunicode + 1):
    char = unichr(index)
    # Unnamed code points (unassigned, surrogates, ...) yield the default
    # instead of raising, so no exception has to be swallowed.
    name = unicodedata.name(char, None)
    if name is None:
        continue

    overlap = []
    for subname, lst in [
            ('IDEOGRAPH', ideograph_range),
            ('HANGUL', hangul_range),
            ('HIRAGANA', hiragana_range),
            ('KATAKANA', katakana_range)]:
        if subname in name:
            add_to_range(lst, index)
            overlap.append(subname)

    # A character matching several keywords is added to each matching table;
    # warn on stderr so the overlap is visible.
    if len(overlap) > 1:
        sys.stderr.write('Character %r %s in %r\n' % (index, name, overlap))
    # combined_range receives each matching character exactly once.
    if overlap:
        add_to_range(combined_range, index)
def report(name, lst):
    """Write the range table *lst* to stdout as a Python assignment.

    Three lines are emitted: a ``# <name> total chars=<count>`` comment
    with the number of code points in *lst*, the assignment
    ``<name> = <lst>``, and an empty separator line.
    """
    summary = '# %s total chars=%s' % (name, count_chars(lst))
    assignment = '%s = %s' % (name, lst)
    sys.stdout.write(summary + '\n' + assignment + '\n\n')
# Emit every table in a fixed order: the four per-script tables first,
# then the combined table.
for table_name, table in (
        ('ideograph_range', ideograph_range),
        ('hangul_range', hangul_range),
        ('hiragana_range', hiragana_range),
        ('katakana_range', katakana_range),
        ('combined_range', combined_range)):
    report(table_name, table)