-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathfind_header_footer.py
123 lines (102 loc) · 3.78 KB
/
find_header_footer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
from lxml import etree
from diff_match_patch import diff_match_patch
# XML namespace prefix for ABBYY FineReader v6 OCR schema elements.
ns = '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}'
# Header/Footer detection parameters
# Weights to assign to potential headers / footers.
# The first half of the tuple weights top-of-page candidate lines, the
# second half bottom-of-page ones, so len(weights) should be even.
weights = (1.0, .75,
           .75, 1.0)
# weights = (1.0, .75, .5,
#            .5, .75, 1.0)
# allow potential headers/footers with this length difference
max_length_difference = 4
# Fuzzy text matcher shared by text_similarity() for comparing candidate
# lines across neighboring pages.
dmp = diff_match_patch()
dmp.Match_Distance = 2 # number of prepended characters allowed before match
dmp.Match_Threshold = .5 # 0 to 1 ... higher => more fanciful matches,
                         # slower execution.
# minimum match score for a line to be considered a header or footer.
min_score = .9
def annotate_page(page):
    """Record the page's potential header/footer lines on its info dict.

    Stores the list returned by hf_candidates() under the
    'hf_candidates' key so a later pass (guess_hf) can score them.
    """
    # hf_candidates() already returns a list; the original wrapped it in
    # an identity list comprehension, which only made a needless copy.
    page.info['hf_candidates'] = hf_candidates(page)
def hf_candidates(page):
    """Return (line_element, simplified_text) pairs that may be headers/footers.

    Collects the first and last ``len(weights) // 2`` LINE elements of the
    page -- the lines most likely to be running headers or footers.  Slots
    that have no corresponding line on a short page are filled with None,
    so the result always has len(weights) entries indexable in parallel
    with the ``weights`` tuple.
    """
    result = []
    # Use // so this stays an int on Python 3 as well ('/' would yield a
    # float there and break range()).
    hfwin = len(weights) // 2
    lines = page.page.findall('.//LINE')
    # Non-negative indices select header candidates from the top of the
    # page; negative indices select footer candidates from the bottom.
    # list() wrappers keep the concatenation valid on Python 3, where
    # range objects cannot be added.
    for i in list(range(hfwin)) + list(range(-hfwin, 0)):
        if abs(i) < len(lines):
            result.append((lines[i], simplify_line_text(lines[i])))
        else:
            result.append(None)
    return result
# def hf_candidates(page):
# result = []
# lines = [line for line in page.page.findall('.//'+ns+'line')]
# hfwin = 5
# for i in range(hfwin) + range(-hfwin, 0):
# if abs(i) < len(lines):
# result.append((lines[i], simplify_line_text(lines[i])))
# else:
# result.append(None)
# return result
def simplify_line_text(line):
    """Return a normalized lowercase text rendition of an OCR line element.

    Runs of whitespace are collapsed to single spaces and digit-like
    characters (including roman-numeral letters i/v/x) are replaced with
    '@', so running headers that differ only in page number compare as
    similar from page to page.
    """
    # NOTE: 'unicode' keeps this Python-2-only, consistent with the rest
    # of the module.
    text = etree.tostring(line,
                          method='text',
                          encoding=unicode).lower()
    # Collapse whitespace.  In the original this statement sat AFTER the
    # return and was unreachable dead code; it was clearly intended to
    # run before the substitution below, and the two substitutions are
    # order-independent, so moving it up is safe.
    text = re.sub(r'\s+', r' ', text)
    # collape numbers (roman too) to '@' so headers will be more
    # similar from page to page
    return re.sub(r'[ivx\d]', r'@', text)
def guess_hf(pageinfo, pages, window=None):
    """Decide which candidate slots on this page really are headers/footers.

    Scores each entry of pageinfo.info['hf_candidates'] against the same
    slot on neighboring pages (text and geometry similarity, weighted by
    ``weights``).  Indices that score above ``min_score`` are appended to
    the returned list (also stored as pageinfo.info['hf_guesses']);
    candidates that fall short are nulled out so later passes skip them.
    """
    if window is None:
        window = pages.window
    result = []
    pageinfo.info['hf_guesses'] = result
    hf_candidates = pageinfo.info['hf_candidates']
    # A line already recognized as carrying the page number is a near
    # certain header/footer: pre-seed its score above min_score.
    if 'pageno_fmt' in pageinfo.info:
        pageno_fmt = pageinfo.info['pageno_fmt']
        pageno_line = pageno_fmt.getparent()
    else:
        pageno_fmt = None
        pageno_line = None
    for i in range(len(weights)):
        if hf_candidates[i] is None:
            continue
        score = 0
        if hf_candidates[i][0] == pageno_line:
            score = 2
        # if levenshtein(hf_candidates[i][1], 'chapter @') < 5:
        #     score = 2
        for neighbor_info in pages.neighbors(window):
            score += (weights[i]
                      * text_similarity(pageinfo, neighbor_info, i)
                      * geometry_similarity(pageinfo, neighbor_info, i))
            # NOTE(review): the scrape destroyed indentation; this early
            # accept is placed inside the neighbor loop (stop comparing
            # once the threshold is crossed) -- confirm against upstream.
            if score > min_score:
                result.append(i)
                # result.append(hf_candidates[i])
                break
        if score < min_score:
            # remove it from the running, so it doesn't slow down later checks
            hf_candidates[i] = None
    # print 'result' + ' '.join(str(hf[1].encode('utf-8')) for hf in result)
    return result
def text_similarity(pageinfo, neighbor_info, i):
    """Score how textually alike candidate slot ``i`` is across two pages.

    Returns 1 when the simplified candidate texts fuzzily match (and
    their lengths differ by at most ``max_length_difference``),
    otherwise 0.
    """
    other = neighbor_info.info['hf_candidates'][i]
    if other is None:
        # The neighbor has no line in this slot, or it was already pruned.
        return 0
    _other_line, other_text = other
    _own_line, own_text = pageinfo.info['hf_candidates'][i]
    # Cheap length pre-filter before running the fuzzy matcher.
    if abs(len(own_text) - len(other_text)) > max_length_difference:
        return 0
    # diff_match_patch.match_main returns -1 when no acceptable fuzzy
    # match exists.
    return 1 if dmp.match_main(own_text, other_text, 0) != -1 else 0
    # distance = levenshtein(neighbor_text, text)
    # if distance > maxlen:
    #     return 0
    # return (maxlen - distance) / maxlen
def geometry_similarity(pageinfo, neighbor_info, i):
    """Placeholder geometric score: every candidate pair is treated as a
    perfect positional match (constant 1) until real layout comparison
    is implemented."""
    return 1