From f3c0a14c9b4b6d5aba549fc1a48ce0827eade648 Mon Sep 17 00:00:00 2001
From: atoring
Date: Sat, 19 May 2018 15:39:48 +0900
Subject: [PATCH] add: janome fix patch for PyInstaller

janome resolves its compiled system dictionary (sysdic) relative to the
package's __file__ location, which is not available once the application
has been frozen with PyInstaller. Patch load_all_fstdata() in dic.py so
that the FST data is always read from a ./sysdic directory in the current
working directory, where it can be shipped next to the frozen executable.

dic.py.org is the unmodified upstream file, kept for reference. dic.py
additionally tidies the lookup error handlers, which referenced undefined
names and discarded the formatted traceback, and drops a redundant local
import.
---
 lang/janome_fix/dic.py     | 454 +++++++++++++++++++++++++++++++++++++
 lang/janome_fix/dic.py.org | 448 ++++++++++++++++++++++++++++++++++++
 2 files changed, 902 insertions(+)
 create mode 100644 lang/janome_fix/dic.py
 create mode 100644 lang/janome_fix/dic.py.org

diff --git a/lang/janome_fix/dic.py b/lang/janome_fix/dic.py
new file mode 100644
index 0000000..e323f48
--- /dev/null
+++ b/lang/janome_fix/dic.py
@@ -0,0 +1,454 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 moco_beta
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import with_statement
+import os
+import io
+import pickle
+import gzip
+from struct import pack
+from .fst import Matcher, create_minimum_transducer, compileFST, unpack_uint
+import traceback
+import logging
+import sys
+import re
+try:
+    from functools import lru_cache
+except ImportError:
+    from functools import wraps
+    def lru_cache(**kwargs):
+        def _dummy(function):
+            @wraps(function)
+            def __dummy(*args, **kwargs):
+                return function(*args, **kwargs)
+            return __dummy
+        return _dummy
+
+
+PY3 = sys.version_info[0] == 3
+
+SYSDIC_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "sysdic")
+
+FILE_FST_DATA = 'fst.data'
+
+MODULE_ENTRIES_EXTRA = 'entries_extra%d.py'
+MODULE_ENTRIES_COMPACT = 'entries_compact%d.py'
+MODULE_ENTRIES_BUCKETS = 'entries_buckets.py'
+MODULE_CONNECTIONS = 'connections%d.py'
+MODULE_CHARDEFS = 'chardef.py'
+MODULE_UNKNOWNS = 'unknowns.py'
+
+FILE_USER_FST_DATA = 'user_fst.data'
+FILE_USER_ENTRIES_DATA = 'user_entries.data'
+
+def save_fstdata(data, dir, suffix=''):
+    _save(os.path.join(dir, FILE_FST_DATA + suffix), data, 9)
+
+
+def load_all_fstdata():
+    # --- fix for pyinstaller: always load the FST data from ./sysdic in the current working directory
+    if True:  # original condition: not os.path.exists(SYSDIC_DIR)
+        return [_load(os.path.join("./sysdic", data_file))
+                for data_file in os.listdir("./sysdic") if data_file.startswith(FILE_FST_DATA)]
+    # --- fix end
+    return [_load(os.path.join(SYSDIC_DIR, data_file))
+            for data_file in os.listdir(SYSDIC_DIR) if data_file.startswith(FILE_FST_DATA)]
+
+
+def start_save_entries(dir, bucket_num):
+    for i in range(0, bucket_num):
+        _start_entries_as_module(os.path.join(dir, MODULE_ENTRIES_COMPACT % i))
+        _start_entries_as_module(os.path.join(dir, MODULE_ENTRIES_EXTRA % i))
+
+
+def end_save_entries(dir, bucket_num):
+    for i in range(0, bucket_num):
+        _end_entries_as_module(os.path.join(dir, MODULE_ENTRIES_COMPACT % i))
+        _end_entries_as_module(os.path.join(dir, MODULE_ENTRIES_EXTRA % i))
+
+
+def save_entry(dir, bucket_idx, morph_id, entry):
+    _save_entry_as_module_compact(os.path.join(dir, MODULE_ENTRIES_COMPACT % bucket_idx), morph_id, entry)
+    _save_entry_as_module_extra(os.path.join(dir, MODULE_ENTRIES_EXTRA % bucket_idx), morph_id, entry)
+
+
+def save_entry_buckets(dir, buckets):
+    _save_as_module(os.path.join(dir, MODULE_ENTRIES_BUCKETS), buckets)
+
+
+def 
save_connections(connections, dir=u'.'): + # split whole connections to 2 buckets to reduce memory usage while installing. + # TODO: find better ways... + bucket_size = (len(connections) // 2) + 1 + offset = 0 + for i in range(1, 3): + _save_as_module(os.path.join(dir, MODULE_CONNECTIONS % i), + connections[offset:offset+bucket_size]) + offset += bucket_size + + +def save_chardefs(chardefs, dir=u'.'): + _save_as_module(os.path.join(dir, MODULE_CHARDEFS), chardefs) + + +def save_unknowns(unknowns, dir=u'.'): + _save_as_module(os.path.join(dir, MODULE_UNKNOWNS), unknowns) + + +def _save(file, data, compresslevel): + if not data: + return + with gzip.open(file, 'wb', compresslevel) as f: + f.write(data) + f.flush() + + +def _load(file): + if not os.path.exists(file): + return None + with gzip.open(file, 'rb') as f: + data = f.read() + return data + + +def _save_as_module(file, data): + if not data: + return + with open(file, 'w') as f: + f.write(u'DATA=') + f.write(str(data).replace('\\\\', '\\') if PY3 else unicode(data)) + f.flush() + + +def _start_entries_as_module(file): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'w') as f: + with open(idx_file, 'w') as f_idx: + f.write("# -*- coding: utf-8 -*-\n") + f.write('DATA={') + f_idx.write('DATA={') + + +def _end_entries_as_module(file): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'a') as f: + with open(idx_file, 'a') as f_idx: + f.write('}\n') + f_idx.write('}\n') + f.flush() + f_idx.flush() + + +def _save_entry_as_module_compact(file, morph_id, entry): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'a') as f: + with open(idx_file, 'a') as f_idx: + f.write('%d:(' % morph_id) + _pos1 = f.tell() + f_idx.write('%d:%d,' % (morph_id, _pos1)) + s = u"u'%s',%s,%s,%d" % ( + entry[0].encode('unicode_escape').decode('ascii') if PY3 else entry[0].encode('unicode_escape'), + entry[1], + entry[2], + entry[3]) + f.write(s) + f.write('),') + + +def _save_entry_as_module_extra(file, morph_id, entry): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'a') as f: + with open(idx_file, 'a') as f_idx: + f.write('%d:(' % morph_id) + _pos1 = f.tell() + f_idx.write('%d:%d,' % (morph_id, _pos1)) + s = u"u'%s',u'%s',u'%s',u'%s',u'%s',u'%s'" % ( + entry[4].encode('unicode_escape').decode('ascii') if PY3 else entry[4].encode('unicode_escape'), + entry[5].encode('unicode_escape').decode('ascii') if PY3 else entry[5].encode('unicode_escape'), + entry[6].encode('unicode_escape').decode('ascii') if PY3 else entry[6].encode('unicode_escape'), + entry[7].encode('unicode_escape').decode('ascii') if PY3 else entry[7].encode('unicode_escape'), + entry[8].encode('unicode_escape').decode('ascii') if PY3 else entry[8].encode('unicode_escape'), + entry[9].encode('unicode_escape').decode('ascii') if PY3 else entry[9].encode('unicode_escape')) + f.write(s) + f.write('),') + + +class Dictionary(object): + u""" + Base dictionary class + """ + def __init__(self, compiledFST, entries, connections): + self.compiledFST = compiledFST + self.matcher = Matcher(compiledFST) + self.entries = entries + self.connections = connections + + def lookup(self, s): + (matched, outputs) = self.matcher.run(s) + if not matched: + return [] + try: + res = [] + for e in outputs: + num = unpack_uint(e) + res.append((num,) + self.entries[num][:4]) + return res + except Exception as e: + logging.error('Cannot load dictionary data. 
The dictionary may be corrupted?')
+            logging.error('input=%s' % s)
+            logging.error('outputs=%s' % (str(outputs) if PY3 else unicode(outputs)))
+            logging.error(traceback.format_exc())
+            sys.exit(1)
+
+    def lookup_extra(self, num):
+        try:
+            return self.entries[num][4:]
+        except Exception as e:
+            logging.error('Cannot load dictionary data. The dictionary may be corrupted?')
+            logging.error('num=%d' % num)
+            logging.error('error=%s' % str(e))
+            logging.error(traceback.format_exc())
+            sys.exit(1)
+
+    def get_trans_cost(self, id1, id2):
+        return self.connections[id1][id2]
+
+
+class MMapDictionary(object):
+    u"""
+    Base MMap dictionary class
+    """
+    def __init__(self, compiledFST, entries_compact, entries_extra, open_files, connections):
+        self.compiledFST = compiledFST
+        self.matcher = Matcher(compiledFST)
+        self.entries_compact = entries_compact
+        self.entries_extra = entries_extra
+        self.open_files = open_files
+        self.connections = connections
+
+    def lookup(self, s):
+        (matched, outputs) = self.matcher.run(s)
+        if not matched:
+            return []
+        try:
+            matched_entries = []
+            for e in outputs:
+                idx = unpack_uint(e)
+                bucket = next(filter(lambda b: idx >= b[0] and idx < b[1], self.entries_compact.keys())) if PY3 \
+                    else filter(lambda b: idx >= b[0] and idx < b[1], self.entries_compact.keys())[0]
+                mm, mm_idx = self.entries_compact[bucket]
+                _pos1s = mm_idx[idx] + 2
+                _pos1e = mm.find(b"',", _pos1s) if PY3 else mm.find("',", _pos1s)
+                _pos2s = _pos1e + 2
+                _pos2e = mm.find(b",", _pos2s) if PY3 else mm.find(",", _pos2s)
+                _pos3s = _pos2e + 1
+                _pos3e = mm.find(b",", _pos3s) if PY3 else mm.find(",", _pos3s)
+                _pos4s = _pos3e + 1
+                _pos4e = mm.find(b")", _pos4s) if PY3 else mm.find(")", _pos4s)
+                _entry = (mm[_pos1s:_pos1e].decode('unicode_escape'), int(mm[_pos2s:_pos2e]), int(mm[_pos3s:_pos3e]), int(mm[_pos4s:_pos4e]))
+                matched_entries.append((idx,) + _entry)
+            return matched_entries
+        except Exception as e:
+            logging.error('Cannot load dictionary data. The dictionary may be corrupted?')
+            logging.error('input=%s' % s)
+            logging.error('outputs=%s' % (str(outputs) if PY3 else unicode(outputs)))
+            logging.error(traceback.format_exc())
+            sys.exit(1)
+
+    def lookup_extra(self, idx):
+        try:
+            bucket = next(filter(lambda b: idx >= b[0] and idx < b[1], self.entries_extra.keys())) if PY3 \
+                else filter(lambda b: idx >= b[0] and idx < b[1], self.entries_extra.keys())[0]
+            mm, mm_idx = self.entries_extra[bucket]
+            _pos1s = mm_idx[idx] + 2
+            _pos1e = mm.find(b"',u'", _pos1s) if PY3 else mm.find("',u'", _pos1s)
+            _pos2s = _pos1e + 4
+            _pos2e = mm.find(b"',u'", _pos2s) if PY3 else mm.find("',u'", _pos2s)
+            _pos3s = _pos2e + 4
+            _pos3e = mm.find(b"',u'", _pos3s) if PY3 else mm.find("',u'", _pos3s)
+            _pos4s = _pos3e + 4
+            _pos4e = mm.find(b"',u'", _pos4s) if PY3 else mm.find("',u'", _pos4s)
+            _pos5s = _pos4e + 4
+            _pos5e = mm.find(b"',u'", _pos5s) if PY3 else mm.find("',u'", _pos5s)
+            _pos6s = _pos5e + 4
+            _pos6e = mm.find(b"')", _pos6s) if PY3 else mm.find("')", _pos6s)
+            return (
+                mm[_pos1s:_pos1e].decode('unicode_escape'), mm[_pos2s:_pos2e].decode('unicode_escape'), mm[_pos3s:_pos3e].decode('unicode_escape'),
+                mm[_pos4s:_pos4e].decode('unicode_escape'), mm[_pos5s:_pos5e].decode('unicode_escape'), mm[_pos6s:_pos6e].decode('unicode_escape')
+            )
+        except Exception as e:
+            logging.error('Cannot load extra info. The dictionary may be corrupted?')
+            logging.error('idx=%d' % idx)
+            logging.error(traceback.format_exc())
+            sys.exit(1)
+
+    def get_trans_cost(self, id1, id2):
+        return self.connections[id1][id2]
+
+    def __del__(self):
+        for mm, mm_idx in self.entries_compact.values():
+            mm.close()
+        if self.entries_extra:
+            for mm, mm_idx in self.entries_extra.values():
+                mm.close()
+        for fp in self.open_files:
+            fp.close()
+
+
+class UnknownsDictionary(object):
+    def __init__(self, chardefs, unknowns):
+        self.char_categories = chardefs[0]
+        self.char_ranges = chardefs[1]
+        self.unknowns = unknowns
+
+    @lru_cache(maxsize=1024)
+    def get_char_categories(self, c):
+        res = {}
+        for chr_range in self.char_ranges:
+            if chr_range['from'] <= c <= chr_range['to']:
+                cate = chr_range['cate']
+                compat_cates = chr_range['compat_cates'] if 'compat_cates' in chr_range else []
+                res[cate] = compat_cates
+        if not res:
+            res = {u'DEFAULT': []}
+        return res
+
+    def unknown_invoked_always(self, cate):
+        if cate in self.char_categories:
+            return self.char_categories[cate]['INVOKE']
+        return False
+
+    def unknown_grouping(self, cate):
+        if cate in self.char_categories:
+            return self.char_categories[cate]['GROUP']
+        return False
+
+    def unknown_length(self, cate):
+        if cate in self.char_categories:
+            return self.char_categories[cate]['LENGTH']
+        return -1
+
+
+class SystemDictionary(Dictionary, UnknownsDictionary):
+    u"""
+    System dictionary class
+    """
+    def __init__(self, entries, connections, chardefs, unknowns):
+        Dictionary.__init__(self, load_all_fstdata(), entries, connections)
+        UnknownsDictionary.__init__(self, chardefs, unknowns)
+
+
+class MMapSystemDictionary(MMapDictionary, UnknownsDictionary):
+    u"""
+    MMap System dictionary class
+    """
+    def __init__(self, mmap_entries, connections, chardefs, unknowns):
+        MMapDictionary.__init__(self, load_all_fstdata(), mmap_entries[0], mmap_entries[1], mmap_entries[2], connections)
+        UnknownsDictionary.__init__(self, chardefs, unknowns)
+
+
+class UserDictionary(Dictionary):
+    u"""
+    User dictionary class (uncompiled)
+    """
+    def __init__(self, user_dict, enc, type, connections):
+        """
+        Initialize user defined dictionary object.
+
+        :param user_dict: user dictionary file (CSV format)
+        :param enc: character encoding
+        :param type: user dictionary type. supported types are 'ipadic' and 'simpledic'
+        :param connections: connection cost matrix. expected value is SYS_DIC.connections
+
+        .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details of user dictionaries.
+        """
+        build_method = getattr(self, 'build' + type)
+        compiledFST, entries = build_method(user_dict, enc)
+        Dictionary.__init__(self, [compiledFST], entries, connections)
+
+    def buildipadic(self, user_dict, enc):
+        surfaces = []
+        entries = {}
+        with io.open(user_dict, encoding=enc) as f:
+            for line in f:
+                line = line.rstrip()
+                surface, left_id, right_id, cost, \
+                pos_major, pos_minor1, pos_minor2, pos_minor3, \
+                infl_type, infl_form, base_form, reading, phonetic = \
+                line.split(',')
+                part_of_speech = ','.join([pos_major, pos_minor1, pos_minor2, pos_minor3])
+                morph_id = len(surfaces)
+                surfaces.append((surface.encode('utf8'), pack('I', morph_id)))
+                entries[morph_id] = (surface, int(left_id), int(right_id), int(cost), part_of_speech, infl_type, infl_form, base_form, reading, phonetic)
+        inputs = sorted(surfaces)  # inputs must be sorted.
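+        # (create_minimum_transducer() assumes the (surface, id) pairs arrive
+        # in lexicographic order; unsorted input would yield an incorrect FST.)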
+        assert len(surfaces) == len(entries)
+        processed, fst = create_minimum_transducer(inputs)
+        compiledFST = compileFST(fst)
+        return compiledFST, entries
+
+    def buildsimpledic(self, user_dict, enc):
+        surfaces = []
+        entries = {}
+        with io.open(user_dict, encoding=enc) as f:
+            for line in f:
+                line = line.rstrip()
+                surface, pos_major, reading = line.split(',')
+                part_of_speech = ','.join([pos_major, u'*', u'*', u'*'])
+                morph_id = len(surfaces)
+                surfaces.append((surface.encode('utf8'), pack('I', morph_id)))
+                entries[morph_id] = (surface, 0, 0, -100000, part_of_speech, u'*', u'*', surface, reading, reading)
+        inputs = sorted(surfaces)  # inputs must be sorted.
+        assert len(surfaces) == len(entries)
+        processed, fst = create_minimum_transducer(inputs)
+        compiledFST = compileFST(fst)
+        return compiledFST, entries
+
+    def save(self, to_dir, compressionlevel=9):
+        u"""
+        Save compressed compiled dictionary data.
+
+        :param to_dir: directory to save dictionary data
+        :param compressionlevel: (Optional) gzip compression level. default is 9
+        """
+        if os.path.exists(to_dir) and not os.path.isdir(to_dir):
+            raise Exception('Not a directory : %s' % to_dir)
+        elif not os.path.exists(to_dir):
+            os.makedirs(to_dir, mode=int('0755', 8))
+        _save(os.path.join(to_dir, FILE_USER_FST_DATA), self.compiledFST[0], compressionlevel)
+        _save(os.path.join(to_dir, FILE_USER_ENTRIES_DATA), pickle.dumps(self.entries), compressionlevel)
+
+
+class CompiledUserDictionary(Dictionary):
+    u"""
+    User dictionary class (compiled)
+    """
+    def __init__(self, dic_dir, connections):
+        data, entries = self.load_dict(dic_dir)
+        Dictionary.__init__(self, [data], entries, connections)
+
+    def load_dict(self, dic_dir):
+        if not os.path.exists(dic_dir) or not os.path.isdir(dic_dir):
+            raise Exception('No such directory : %s' % dic_dir)
+        data = _load(os.path.join(dic_dir, FILE_USER_FST_DATA))
+        entries = pickle.loads(_load(os.path.join(dic_dir, FILE_USER_ENTRIES_DATA)))
+        return data, entries
+
+
+class LoadingDictionaryError(Exception):
+    def __init__(self):
+        self.message = 'Cannot load dictionary data. Try mmap mode for very large dictionary.'
diff --git a/lang/janome_fix/dic.py.org b/lang/janome_fix/dic.py.org
new file mode 100644
index 0000000..acad55c
--- /dev/null
+++ b/lang/janome_fix/dic.py.org
@@ -0,0 +1,448 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015 moco_beta
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+ +from __future__ import with_statement +import os +import io +import pickle +import gzip +from struct import pack +from .fst import Matcher, create_minimum_transducer, compileFST, unpack_uint +import traceback +import logging +import sys +import re +try: + from functools import lru_cache +except ImportError: + from functools import wraps + def lru_cache(**kwargs): + def _dummy(function): + @wraps(function) + def __dummy(*args, **kwargs): + return function(*args, **kwargs) + return __dummy + return _dummy + + +PY3 = sys.version_info[0] == 3 + +SYSDIC_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "sysdic") + +FILE_FST_DATA = 'fst.data' + +MODULE_ENTRIES_EXTRA = 'entries_extra%d.py' +MODULE_ENTRIES_COMPACT = 'entries_compact%d.py' +MODULE_ENTRIES_BUCKETS = 'entries_buckets.py' +MODULE_CONNECTIONS = 'connections%d.py' +MODULE_CHARDEFS = 'chardef.py' +MODULE_UNKNOWNS = 'unknowns.py' + +FILE_USER_FST_DATA = 'user_fst.data' +FILE_USER_ENTRIES_DATA = 'user_entries.data' + +def save_fstdata(data, dir, suffix=''): + _save(os.path.join(dir, FILE_FST_DATA + suffix), data, 9) + + +def load_all_fstdata(): + return [_load(os.path.join(SYSDIC_DIR, data_file)) + for data_file in os.listdir(SYSDIC_DIR) if data_file.startswith(FILE_FST_DATA)] + + +def start_save_entries(dir, bucket_num): + for i in range(0, bucket_num): + _start_entries_as_module(os.path.join(dir, MODULE_ENTRIES_COMPACT % i)) + _start_entries_as_module(os.path.join(dir, MODULE_ENTRIES_EXTRA % i)) + + +def end_save_entries(dir, bucket_num): + for i in range(0, bucket_num): + _end_entries_as_module(os.path.join(dir, MODULE_ENTRIES_COMPACT % i)) + _end_entries_as_module(os.path.join(dir, MODULE_ENTRIES_EXTRA % i)) + + +def save_entry(dir, bucket_idx, morph_id, entry): + _save_entry_as_module_compact(os.path.join(dir, MODULE_ENTRIES_COMPACT % bucket_idx), morph_id, entry) + _save_entry_as_module_extra(os.path.join(dir, MODULE_ENTRIES_EXTRA % bucket_idx), morph_id, entry) + + +def save_entry_buckets(dir, buckets): + _save_as_module(os.path.join(dir, MODULE_ENTRIES_BUCKETS), buckets) + + +def save_connections(connections, dir=u'.'): + # split whole connections to 2 buckets to reduce memory usage while installing. + # TODO: find better ways... 
+ bucket_size = (len(connections) // 2) + 1 + offset = 0 + for i in range(1, 3): + _save_as_module(os.path.join(dir, MODULE_CONNECTIONS % i), + connections[offset:offset+bucket_size]) + offset += bucket_size + + +def save_chardefs(chardefs, dir=u'.'): + _save_as_module(os.path.join(dir, MODULE_CHARDEFS), chardefs) + + +def save_unknowns(unknowns, dir=u'.'): + _save_as_module(os.path.join(dir, MODULE_UNKNOWNS), unknowns) + + +def _save(file, data, compresslevel): + if not data: + return + with gzip.open(file, 'wb', compresslevel) as f: + f.write(data) + f.flush() + + +def _load(file): + if not os.path.exists(file): + return None + with gzip.open(file, 'rb') as f: + data = f.read() + return data + + +def _save_as_module(file, data): + if not data: + return + with open(file, 'w') as f: + f.write(u'DATA=') + f.write(str(data).replace('\\\\', '\\') if PY3 else unicode(data)) + f.flush() + + +def _start_entries_as_module(file): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'w') as f: + with open(idx_file, 'w') as f_idx: + f.write("# -*- coding: utf-8 -*-\n") + f.write('DATA={') + f_idx.write('DATA={') + + +def _end_entries_as_module(file): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'a') as f: + with open(idx_file, 'a') as f_idx: + f.write('}\n') + f_idx.write('}\n') + f.flush() + f_idx.flush() + + +def _save_entry_as_module_compact(file, morph_id, entry): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'a') as f: + with open(idx_file, 'a') as f_idx: + f.write('%d:(' % morph_id) + _pos1 = f.tell() + f_idx.write('%d:%d,' % (morph_id, _pos1)) + s = u"u'%s',%s,%s,%d" % ( + entry[0].encode('unicode_escape').decode('ascii') if PY3 else entry[0].encode('unicode_escape'), + entry[1], + entry[2], + entry[3]) + f.write(s) + f.write('),') + + +def _save_entry_as_module_extra(file, morph_id, entry): + idx_file = re.sub(r'\.py$', '_idx.py', file) + with open(file, 'a') as f: + with open(idx_file, 'a') as f_idx: + f.write('%d:(' % morph_id) + _pos1 = f.tell() + f_idx.write('%d:%d,' % (morph_id, _pos1)) + s = u"u'%s',u'%s',u'%s',u'%s',u'%s',u'%s'" % ( + entry[4].encode('unicode_escape').decode('ascii') if PY3 else entry[4].encode('unicode_escape'), + entry[5].encode('unicode_escape').decode('ascii') if PY3 else entry[5].encode('unicode_escape'), + entry[6].encode('unicode_escape').decode('ascii') if PY3 else entry[6].encode('unicode_escape'), + entry[7].encode('unicode_escape').decode('ascii') if PY3 else entry[7].encode('unicode_escape'), + entry[8].encode('unicode_escape').decode('ascii') if PY3 else entry[8].encode('unicode_escape'), + entry[9].encode('unicode_escape').decode('ascii') if PY3 else entry[9].encode('unicode_escape')) + f.write(s) + f.write('),') + + +class Dictionary(object): + u""" + Base dictionary class + """ + def __init__(self, compiledFST, entries, connections): + self.compiledFST = compiledFST + self.matcher = Matcher(compiledFST) + self.entries = entries + self.connections = connections + + def lookup(self, s): + (matched, outputs) = self.matcher.run(s) + if not matched: + return [] + try: + res = [] + for e in outputs: + num = unpack_uint(e) + res.append((num,) + self.entries[num][:4]) + return res + except Exception as e: + logging.error('Cannot load dictionary data. 
The dictionary may be corrupted?') + logging.error('input=%s' % s) + logging.error('outputs=%s' % str(outputs) if PY3 else unicode(outputs)) + traceback.format_exc() + sys.exit(1) + + def lookup_extra(self, num): + try: + return self.entries[num][4:] + except Exception as e: + logging.error('Cannot load dictionary data. The dictionary may be corrupted?') + logging.error('input=%s' % s) + logging.error('outputs=%s' % str(outputs) if PY3 else unicode(outputs)) + traceback.format_exc() + sys.exit(1) + + def get_trans_cost(self, id1, id2): + return self.connections[id1][id2] + + +class MMapDictionary(object): + u""" + Base MMap dictionar class + """ + def __init__(self, compiledFST, entries_compact, entries_extra, open_files, connections): + self.compiledFST = compiledFST + self.matcher = Matcher(compiledFST) + self.entries_compact = entries_compact + self.entries_extra = entries_extra + self.open_files = open_files + self.connections = connections + + def lookup(self, s): + (matched, outputs) = self.matcher.run(s) + if not matched: + return [] + try: + matched_entries = [] + for e in outputs: + idx = unpack_uint(e) + bucket = next(filter(lambda b: idx >= b[0] and idx < b[1], self.entries_compact.keys())) if PY3 \ + else filter(lambda b: idx >= b[0] and idx < b[1], self.entries_compact.keys())[0] + mm, mm_idx = self.entries_compact[bucket] + _pos1s = mm_idx[idx] + 2 + _pos1e = mm.find(b"',", _pos1s) if PY3 else mm.find("',", _pos1s) + _pos2s = _pos1e + 2 + _pos2e = mm.find(b",", _pos2s) if PY3 else mm.find(",", _pos2s) + _pos3s = _pos2e + 1 + _pos3e = mm.find(b",", _pos3s) if PY3 else mm.find(",", _pos3s) + _pos4s = _pos3e + 1 + _pos4e = mm.find(b")", _pos4s) if PY3 else mm.find(")", _pos4s) + _entry = (mm[_pos1s:_pos1e].decode('unicode_escape'), int(mm[_pos2s:_pos2e]), int(mm[_pos3s:_pos3e]), int(mm[_pos4s:_pos4e])) + matched_entries.append((idx,) + _entry) + return matched_entries + except Exception as e: + logging.error('Cannot load dictionary data. The dictionary may be corrupted?') + logging.error('input=%s' % s) + logging.error('outputs=%s' % str(outputs) if PY3 else unicode(outputs)) + traceback.format_exc() + sys.exit(1) + + def lookup_extra(self, idx): + try: + bucket = next(filter(lambda b: idx >= b[0] and idx < b[1], self.entries_extra.keys())) if PY3 \ + else filter(lambda b: idx >= b[0] and idx < b[1], self.entries_extra.keys())[0] + mm, mm_idx = self.entries_extra[bucket] + _pos1s = mm_idx[idx] + 2 + _pos1e = mm.find(b"',u'", _pos1s) if PY3 else mm.find("',u'", _pos1s) + _pos2s = _pos1e + 4 + _pos2e = mm.find(b"',u'", _pos2s) if PY3 else mm.find("',u'", _pos2s) + _pos3s = _pos2e + 4 + _pos3e = mm.find(b"',u'", _pos3s) if PY3 else mm.find("',u'", _pos3s) + _pos4s = _pos3e + 4 + _pos4e = mm.find(b"',u'", _pos4s) if PY3 else mm.find("',u'", _pos4s) + _pos5s = _pos4e + 4 + _pos5e = mm.find(b"',u'", _pos5s) if PY3 else mm.find("',u'", _pos5s) + _pos6s = _pos5e + 4 + _pos6e = mm.find(b"')", _pos6s) if PY3 else mm.find("')", _pos6s) + return ( + mm[_pos1s:_pos1e].decode('unicode_escape'), mm[_pos2s:_pos2e].decode('unicode_escape'), mm[_pos3s:_pos3e].decode('unicode_escape'), + mm[_pos4s:_pos4e].decode('unicode_escape'), mm[_pos5s:_pos5e].decode('unicode_escape'), mm[_pos6s:_pos6e].decode('unicode_escape') + ) + except Exception as e: + logging.error('Cannot load extra info. 
The dictionary may be corrupted?') + logging.error('idx=%d' % idx) + traceback.format_exc() + sys.exit(1) + + def get_trans_cost(self, id1, id2): + return self.connections[id1][id2] + + def __del__(self): + for mm, mm_idx in self.entries_compact.values(): + mm.close() + if self.entries_extra: + for mm, mm_idx in self.entries_extra.values(): + mm.close() + for fp in self.open_files: + fp.close() + + +class UnknownsDictionary(object): + def __init__(self, chardefs, unknowns): + self.char_categories = chardefs[0] + self.char_ranges = chardefs[1] + self.unknowns = unknowns + + @lru_cache(maxsize=1024) + def get_char_categories(self, c): + res = {} + for chr_range in self.char_ranges: + if chr_range['from'] <= c <= chr_range['to']: + cate = chr_range['cate'] + compate_cates = chr_range['compat_cates'] if 'compat_cates' in chr_range else [] + res[cate] = compate_cates + if not res: + res = {u'DEFAULT': []} + return res + + def unknown_invoked_always(self, cate): + if cate in self.char_categories: + return self.char_categories[cate]['INVOKE'] + return False + + def unknown_grouping(self, cate): + if cate in self.char_categories: + return self.char_categories[cate]['GROUP'] + return False + + def unknown_length(self, cate): + if cate in self.char_categories: + return self.char_categories[cate]['LENGTH'] + return -1 + + +class SystemDictionary(Dictionary, UnknownsDictionary): + u""" + System dictionary class + """ + def __init__(self, entries, connections, chardefs, unknowns): + Dictionary.__init__(self, load_all_fstdata(), entries, connections) + UnknownsDictionary.__init__(self, chardefs, unknowns) + + +class MMapSystemDictionary(MMapDictionary, UnknownsDictionary): + u""" + MMap System dictionary class + """ + def __init__(self, mmap_entries, connections, chardefs, unknowns): + MMapDictionary.__init__(self, load_all_fstdata(), mmap_entries[0], mmap_entries[1], mmap_entries[2], connections) + UnknownsDictionary.__init__(self, chardefs, unknowns) + + +class UserDictionary(Dictionary): + u""" + User dictionary class (uncompiled) + """ + def __init__(self, user_dict, enc, type, connections): + """ + Initialize user defined dictionary object. + + :param user_dict: user dictionary file (CSV format) + :param enc: character encoding + :param type: user dictionary type. supported types are 'ipadic' and 'simpledic' + :param connections: connection cost matrix. expected value is SYS_DIC.connections + + .. seealso:: See http://mocobeta.github.io/janome/en/#use-with-user-defined-dictionary for details for user dictionary. + """ + build_method = getattr(self, 'build' + type) + compiledFST, entries = build_method(user_dict, enc) + Dictionary.__init__(self, [compiledFST], entries, connections) + + def buildipadic(self, user_dict, enc): + surfaces = [] + entries = {} + with io.open(user_dict, encoding=enc) as f: + for line in f: + line = line.rstrip() + surface, left_id, right_id, cost, \ + pos_major, pos_minor1, pos_minor2, pos_minor3, \ + infl_type, infl_form, base_form, reading, phonetic = \ + line.split(',') + part_of_speech = ','.join([pos_major, pos_minor1, pos_minor2, pos_minor3]) + morph_id = len(surfaces) + surfaces.append((surface.encode('utf8'), pack('I', morph_id))) + entries[morph_id] = (surface, int(left_id), int(right_id), int(cost), part_of_speech, infl_type, infl_form, base_form, reading, phonetic) + inputs = sorted(surfaces) # inputs must be sorted. 
+ assert len(surfaces) == len(entries) + processed, fst = create_minimum_transducer(inputs) + compiledFST = compileFST(fst) + return compiledFST, entries + + def buildsimpledic(self, user_dict, enc): + import sys + surfaces = [] + entries = {} + with io.open(user_dict, encoding=enc) as f: + for line in f: + line = line.rstrip() + surface, pos_major, reading = line.split(',') + part_of_speech = ','.join([pos_major, u'*', u'*', u'*']) + morph_id = len(surfaces) + surfaces.append((surface.encode('utf8'), pack('I', morph_id))) + entries[morph_id] = (surface, 0, 0, -100000, part_of_speech, u'*', u'*', surface, reading, reading) + inputs = sorted(surfaces) # inputs must be sorted. + assert len(surfaces) == len(entries) + processed, fst = create_minimum_transducer(inputs) + compiledFST = compileFST(fst) + return compiledFST, entries + + def save(self, to_dir, compressionlevel=9): + u""" + Save compressed compiled dictionary data. + + :param to_dir: directory to save dictionary data + :compressionlevel: (Optional) gzip compression level. default is 9 + """ + if os.path.exists(to_dir) and not os.path.isdir(to_dir): + raise Exception('Not a directory : %s' % to_dir) + elif not os.path.exists(to_dir): + os.makedirs(to_dir, mode=int('0755', 8)) + _save(os.path.join(to_dir, FILE_USER_FST_DATA), self.compiledFST[0], compressionlevel) + _save(os.path.join(to_dir, FILE_USER_ENTRIES_DATA), pickle.dumps(self.entries), compressionlevel) + + +class CompiledUserDictionary(Dictionary): + u""" + User dictionary class (compiled) + """ + def __init__(self, dic_dir, connections): + data, entries = self.load_dict(dic_dir) + Dictionary.__init__(self, [data], entries, connections) + + def load_dict(self, dic_dir): + if not os.path.exists(dic_dir) or not os.path.isdir(dic_dir): + raise Exception('No such directory : ' % dic_dir) + data = _load(os.path.join(dic_dir, FILE_USER_FST_DATA)) + entries = pickle.loads(_load(os.path.join(dic_dir, FILE_USER_ENTRIES_DATA))) + return data, entries + + +class LoadingDictionaryError(Exception): + def __init__(self): + self.message = 'Cannot load dictionary data. Try mmap mode for very large dictionary.'