-
Notifications
You must be signed in to change notification settings - Fork 16
/
opf-cc.py
executable file
·181 lines (161 loc) · 6.96 KB
/
opf-cc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python
# Convert files in Open Packaging Format from Traditional Chinese
# to Simplified Chinese.
import sys, os, zipfile, re, codecs, subprocess, glob, shutil
import opencc
import mobiunpack
import kindlestrip
import lxml
from lxml.html import parse
# When true, find_output_path() returns an already-existing extraction
# directory as-is and find_paths() skips extracting the book into it again.
debug = True
# File system encoding; find_files_to_convert() uses it to encode manifest
# hrefs before joining them onto on-disk paths.
fsenc = sys.getfilesystemencoding()
def find_paths(converter):
    """Validate the input book named on the command line and extract it.

    The input path is read from ``sys.argv[1]``. It must exist, must not be
    a directory, and must end in ``.epub`` or ``.mobi``; otherwise a message
    is printed and the process exits with status 1.

    Args:
        converter: object whose ``convert(text)`` method is applied to the
            input path to derive the output file path.

    Returns:
        A tuple ``(input_path, output_path, output_file_path)`` where
        ``output_path`` is the directory the book was extracted into and
        ``output_file_path`` is the converted form of the input path.
    """
    input_path = sys.argv[1]
    if not os.path.exists(input_path):
        print("%s does not exist." % input_path)
        sys.exit(1)
    ext = os.path.splitext(input_path)[1]
    if ext not in (".epub", ".mobi") or os.path.isdir(input_path):
        print("%s is not a valid input file." % input_path)
        sys.exit(1)
    # The output file name is the input path run through the converter.
    output_file_path = converter.convert(input_path)
    output_path = find_output_path(input_path)
    # In debug mode an existing extraction directory is reused untouched.
    if not (debug and os.path.isdir(output_path)):
        print("Try extracting %s to %s" % (input_path, output_path))
        if ext == ".epub":
            # Close the archive deterministically instead of leaking the
            # open file handle until garbage collection.
            with zipfile.ZipFile(input_path) as zf:
                zf.extractall(output_path.decode('utf-8'))
        else:
            # Otherwise it's a mobi, use mobiunpack
            mobiunpack.unpackBook(input_path, output_path)
    return (input_path, output_path, output_file_path)
def find_output_path(input_path):
    """Return a path, derived from *input_path*, to extract the book into.

    The extension of *input_path* is stripped once to form the first
    candidate. While the candidate already exists, a numeric ``-N`` suffix
    is appended (or incremented when the candidate already ends in one)
    until an unused path is found.

    When the module-level ``debug`` flag is true and the candidate is an
    existing directory, that directory is returned so a previous extraction
    can be reused.

    Args:
        input_path: path of the input book file.

    Returns:
        The chosen output path.
    """
    # Strip the extension only once; re-running splitext on generated
    # candidates would chop names such as "book.v2-1" down to "book".
    candidate = os.path.splitext(input_path)[0]
    while os.path.exists(candidate):
        # Quick shortcut for debugging, do not attempt to extract
        # the file again, reuse the extracted directory.
        if os.path.isdir(candidate) and debug:
            return candidate
        # Anchor at the end so that only a trailing "-N" counts as the
        # counter; a "-N" inside a parent directory or in the middle of the
        # name must not be rewritten.
        match = re.match(r'(.*)-(\d+)$', candidate)
        if match:
            candidate = "%s-%d" % (match.group(1), int(match.group(2)) + 1)
        else:
            candidate = "%s-%d" % (candidate, 1)
    return candidate
def find_opf_path(input_path):
    """Locate the OPF package file inside an extracted book directory.

    If ``META-INF/container.xml`` exists, its ``rootfile`` entries are
    scanned and the first one whose ``full-path`` resolves to an existing
    file is returned. Without a container file, the first ``*.opf`` file
    directly under *input_path* is used instead.

    Returns:
        The path of the OPF file, or None when none is found.
    """
    container = os.path.join(input_path, "META-INF", "container.xml")
    if not os.path.isfile(container):
        # Not in Open Container Format: fall back to globbing for an opf
        # in the top-level directory.
        matches = glob.glob(os.path.join(input_path, '*.opf'))
        if matches:
            return matches[0]
        return None
    for entry in parse(container).iter('rootfile'):
        resolved = os.path.join(input_path, entry.attrib['full-path'])
        if os.path.isfile(resolved):
            return resolved
    return None
def find_files_to_convert(input_path, opf_path):
    """Collect the files whose text should be converted.

    The OPF file itself always comes first. Every manifest ``item`` whose
    ``media-type`` is NCX, XHTML or OEB1 document is added when its ``href``
    (encoded with the file system encoding and taken relative to the OPF
    file's directory) names an existing file.

    Returns:
        A list of file paths, starting with *opf_path*.
    """
    package = parse(opf_path)
    convertible = ['application/x-dtbncx+xml', 'application/xhtml+xml', 'text/x-oeb1-document']
    opf_dir = os.path.dirname(opf_path)
    found = [opf_path]
    for entry in package.iter('item'):
        if entry.attrib['media-type'] not in convertible:
            continue
        target = os.path.join(opf_dir, entry.attrib['href'].encode(fsenc))
        if os.path.isfile(target):
            found.append(target)
    return found
def _convert_ncx_in_place(converter, path):
    """Convert the content of every namespaced ``text`` element of an NCX file."""
    try:
        ncx = lxml.etree.parse(path)
    except lxml.etree.XMLSyntaxError as e:
        print("%s %s %s %s" % (e.filename, e.lineno, e.msg, e.offset))
        # Nothing was parsed, so there is nothing to convert or write back.
        return
    # Restrict iteration to real elements: comments and processing
    # instructions have a non-string .tag and would break startswith().
    for node in ncx.iter(tag=lxml.etree.Element):
        if not (node.tag.startswith('{') and node.tag.rsplit('}', 1)[-1] == "text"):
            continue
        # Empty <text/> elements have text None; leave them untouched.
        if node.text:
            node.text = converter.convert(node.text.encode('utf-8')).decode('utf-8')
    ncx.write(path, encoding='utf-8', xml_declaration=True, pretty_print=True)


def _convert_opf_in_place(converter, path):
    """Convert the metadata section of an OPF file and retag its language."""
    # Quick and dirty way to convert metadata because lxml.html doesn't work
    with open(path) as opf:
        opf_contents = opf.read().split('</metadata>')
    head = opf_contents[0]
    head = head.replace('<dc:language>zh-TW</dc:language>',
                        '<dc:language>zh-Hans</dc:language>')
    head = head.replace('<dc:language>zh-Hant</dc:language>',
                        '<dc:language>zh-Hans</dc:language>')
    # Only the part before the first </metadata> goes through the converter.
    opf_contents[0] = converter.convert(head)
    with open(path, 'w') as opf:
        opf.write('</metadata>'.join(opf_contents))


def _convert_with_opencc_cli(path):
    """Convert a whole file with the ``opencc`` command-line tool."""
    output_file = '%s.tmp' % path
    # Pass an argument list instead of a shell string so that file names
    # containing quotes or other shell metacharacters are handled safely.
    # check_call raises if opencc fails, before the original is replaced.
    subprocess.check_call(
        ['opencc', '-i', path, '-o', output_file, '-c', 't2s.json'])
    os.rename(output_file, path)


def convert_files_in_place(converter, files):
    """Convert each file in *files*, overwriting it with the result.

    ``.ncx`` files have their ``text`` elements converted, ``.opf`` files
    have their metadata section converted (and a zh-TW / zh-Hant language
    tag rewritten to zh-Hans), and every other file is converted as a whole
    by the external ``opencc`` program.

    Args:
        converter: object with a ``convert(text)`` method.
        files: iterable of file paths to convert.

    Raises:
        subprocess.CalledProcessError: if the ``opencc`` program exits with
            a non-zero status.
    """
    for f in files:
        print('Converting file: %s' % f)
        ext = os.path.splitext(f)[1]
        if ext == '.ncx':
            _convert_ncx_in_place(converter, f)
        elif ext == '.opf':
            _convert_opf_in_place(converter, f)
        else:
            _convert_with_opencc_cli(f)
def add_dir_to_zip(archive, base, current, exclude=()):
    """Recursively add the files under ``base/current`` to *archive*.

    Args:
        archive: an open, writable ``zipfile.ZipFile``.
        base: directory the archive member names are relative to.
        current: sub-directory of *base* to add (``'.'`` for everything).
        exclude: normalized relative file paths to leave out; used for
            entries the caller has already written itself.
    """
    for f in os.listdir(os.path.join(base, current)):
        filename = os.path.join(current, f)
        fullname = os.path.join(base, filename)
        if os.path.isdir(fullname):
            add_dir_to_zip(archive, base, filename, exclude)
        elif os.path.normpath(filename) not in exclude:
            archive.write(fullname, filename)


def repack_files(input_path, output_file_path, opf_path):
    """Pack the converted directory back into a book, then delete the directory.

    An existing file at *output_file_path* is first renamed to
    ``<name>.old<ext>``. A ``.epub`` target is written as a zip archive;
    anything else is treated as mobi and built with the external
    ``kindlegen`` program, then stripped with kindlestrip.

    Args:
        input_path: directory holding the extracted, converted book.
        output_file_path: path of the book file to create.
        opf_path: path of the OPF file (only used for mobi output).
    """
    (trunk, ext) = os.path.splitext(output_file_path)
    if os.path.isfile(output_file_path):
        old_file_path = "%s.old%s" % (trunk, ext)
        print("Renaming existing file to %s" % old_file_path)
        os.rename(output_file_path, old_file_path)
    print("Repacking converted files into %s" % output_file_path)
    if ext == '.epub':
        # epub is just normal zip file with a special extension, except
        # that the "mimetype" entry has to be the first member of the
        # archive and has to be stored without compression.
        with zipfile.ZipFile(output_file_path, "w", zipfile.ZIP_DEFLATED) as epub:
            mimetype_path = os.path.join(input_path, 'mimetype')
            if os.path.isfile(mimetype_path):
                epub.write(mimetype_path, 'mimetype', zipfile.ZIP_STORED)
            add_dir_to_zip(epub, input_path, '.', exclude=('mimetype',))
    else:
        # Otherwise it's a mobi file, use kindlegen to repack
        output_file = os.path.basename(output_file_path)
        # kindlegen runs with cwd=input_path, so a relative opf_path would
        # be resolved against the wrong directory; hand it an absolute one.
        cmd_args = ['kindlegen', os.path.abspath(opf_path), '-c2', '-verbose',
                    '-o', output_file]
        subprocess.call(cmd_args, cwd=input_path)
        # KindleGen puts output file under the same directory as the input file.
        original_output_path = os.path.join(input_path,
                                            os.path.basename(output_file_path))
        # KindleGen introduced redundant data, use kindlestrip to remove that.
        with open(original_output_path, 'rb') as inf:
            data_file = inf.read()
        strippedFile = kindlestrip.SectionStripper(data_file)
        with open(output_file_path, 'wb') as outf:
            outf.write(strippedFile.getResult())
    print("Removing temporary directory %s" % input_path)
    shutil.rmtree(input_path)
# Script entry: convert the book named by the first command-line argument.
if len(sys.argv) < 2:
    # The usage line needs the program name filled in; without the format
    # argument a literal "%s" was printed.
    print("usage: %s <book.epub|book.mobi>" % sys.argv[0])
    sys.exit(1)
with opencc.OpenCC(config="t2s.json") as converter:
    (input_file_path, extracted_path, output_file_path) = find_paths(converter)
    opf_path = find_opf_path(extracted_path)
    if opf_path:
        files = find_files_to_convert(extracted_path, opf_path)
        if len(files):
            convert_files_in_place(converter, files)
        repack_files(extracted_path, output_file_path, opf_path)
    else:
        # Without an OPF file there is no manifest to drive the conversion.
        print("%s is not in Open Packaging Format, abort." % extracted_path)
        sys.exit(1)