Skip to content

Commit fd8ab03

Browse files
author
Tom Keefe
committed
allow for including headers/footers via command line
1 parent f929ae1 commit fd8ab03

File tree

11 files changed

+171
-21
lines changed

11 files changed

+171
-21
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,12 @@ Converts the source document to HTML.
285285
* `ignore_empty_paragraphs`: by default, empty paragraphs are ignored.
286286
Set this option to `False` to preserve empty paragraphs in the output.
287287

288+
* `include_headers_and_footers`: by default, headers and footers are not included in the output.
289+
Set this option to `True` to include headers at the start of the output and footers at the end.
290+
291+
* `deduplicate_headers_and_footers`: by default, when headers and footers are included, every one of them is output, even if several have identical text.
292+
Set this option to `True` to output only the first header or footer for each distinct text content.
293+
288294
* `id_prefix`:
289295
a string to prepend to any generated IDs,
290296
such as those used by bookmarks, footnotes and endnotes.

mammoth/conversion.py

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ def convert_document_element_to_html(element,
1616
convert_image=None,
1717
id_prefix=None,
1818
output_format=None,
19-
ignore_empty_paragraphs=True):
19+
ignore_empty_paragraphs=True,
20+
include_headers_and_footers=False,
21+
deduplicate_headers_and_footers=False):
2022

2123
if style_map is None:
2224
style_map = []
@@ -34,14 +36,15 @@ def convert_document_element_to_html(element,
3436
)
3537
else:
3638
comments = {}
37-
3839
messages = []
3940
converter = _DocumentConverter(
4041
messages=messages,
4142
style_map=style_map,
4243
convert_image=convert_image,
4344
id_prefix=id_prefix,
4445
ignore_empty_paragraphs=ignore_empty_paragraphs,
46+
include_headers_and_footers=include_headers_and_footers,
47+
deduplicate_headers_and_footers=deduplicate_headers_and_footers,
4548
note_references=[],
4649
comments=comments,
4750
)
@@ -62,11 +65,22 @@ def copy(self, **kwargs):
6265

6366

6467
class _DocumentConverter(documents.element_visitor(args=1)):
65-
def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
68+
def __init__(self,
69+
messages,
70+
style_map,
71+
convert_image,
72+
id_prefix,
73+
ignore_empty_paragraphs,
74+
include_headers_and_footers,
75+
deduplicate_headers_and_footers,
76+
note_references,
77+
comments):
6678
self._messages = messages
6779
self._style_map = style_map
6880
self._id_prefix = id_prefix
6981
self._ignore_empty_paragraphs = ignore_empty_paragraphs
82+
self._include_headers_and_footers = include_headers_and_footers
83+
self._deduplicate_headers_and_footers = deduplicate_headers_and_footers
7084
self._note_references = note_references
7185
self._referenced_comments = []
7286
self._convert_image = convert_image
@@ -81,17 +95,27 @@ def visit_image(self, image, context):
8195

8296
def visit_document(self, document, context):
8397
nodes = self._visit_all(document.children, context)
98+
99+
headers = []
100+
footers = []
101+
102+
if self._include_headers_and_footers:
103+
headers = self.visit_headers(document.headers, context)
104+
footers = self.visit_footers(document.footers, context)
105+
84106
notes = [
85107
document.notes.resolve(reference)
86108
for reference in self._note_references
87109
]
88110
notes_list = html.element("ol", {}, self._visit_all(notes, context))
111+
89112
comments = html.element("dl", {}, [
90113
html_node
91114
for referenced_comment in self._referenced_comments
92115
for html_node in self.visit_comment(referenced_comment, context)
93116
])
94-
return nodes + [notes_list, comments]
117+
118+
return headers + nodes + [notes_list, comments] + footers
95119

96120

97121
def visit_paragraph(self, paragraph, context):
@@ -300,6 +324,49 @@ def visit_comment(self, referenced_comment, context):
300324
html.element("dd", {}, body),
301325
]
302326

327+
def visit_header(self, header, context):
    """Visit the children of a single header and return the result.

    The work is delegated to ``self._visit_all`` with the same ``context``.
    """
    header_children = header.children
    return self._visit_all(header_children, context)
329+
330+
def visit_headers(self, headers, context):
    """Convert every header, optionally dropping repeated ones.

    Each header is converted with ``visit_header`` and the resulting nodes
    are flattened into a single list, preserving order.

    When ``self._deduplicate_headers_and_footers`` is true, only the first
    node for each distinct ``to_text()`` value is kept, so nodes are
    compared by their text rather than by their markup.
    """
    all_headers = [
        html_node
        for header in headers
        for html_node in self.visit_header(header, context)
    ]

    if not self._deduplicate_headers_and_footers:
        return all_headers

    seen_texts = set()
    unique_headers = []
    for html_node in all_headers:
        # Compute the text once; it is needed for both lookup and insertion.
        text = html_node.to_text()
        if text not in seen_texts:
            seen_texts.add(text)
            unique_headers.append(html_node)

    return unique_headers
348+
349+
def visit_footer(self, footer, context):
    """Visit the children of a single footer and return the result.

    The work is delegated to ``self._visit_all`` with the same ``context``.
    """
    footer_children = footer.children
    return self._visit_all(footer_children, context)
351+
352+
def visit_footers(self, footers, context):
    """Convert every footer, optionally dropping repeated ones.

    Each footer is converted with ``visit_footer`` and the resulting nodes
    are flattened into a single list, preserving order.

    When ``self._deduplicate_headers_and_footers`` is true, only the first
    node for each distinct ``to_text()`` value is kept, so nodes are
    compared by their text rather than by their markup.
    """
    all_footers = [
        html_node
        for footer in footers
        for html_node in self.visit_footer(footer, context)
    ]

    if not self._deduplicate_headers_and_footers:
        return all_footers

    seen_texts = set()
    unique_footers = []
    for html_node in all_footers:
        # Compute the text once; it is needed for both lookup and insertion.
        text = html_node.to_text()
        if text not in seen_texts:
            seen_texts.add(text)
            unique_footers.append(html_node)

    return unique_footers
303370

304371
def _visit_all(self, elements, context):
305372
return [

mammoth/documents.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ class HasChildren(Element):
1414
class Document(HasChildren):
1515
notes = cobble.field()
1616
comments = cobble.field()
17+
headers = cobble.field()
18+
footers = cobble.field()
1719

1820
@cobble.data
1921
class Paragraph(HasChildren):
@@ -97,12 +99,16 @@ class Image(Element):
9799
open = cobble.field()
98100

99101

100-
def document(children, notes=None, comments=None):
102+
def document(children, notes=None, comments=None, headers=None, footers=None):
101103
if notes is None:
102104
notes = Notes({})
103105
if comments is None:
104106
comments = []
105-
return Document(children, notes, comments=comments)
107+
if headers is None:
108+
headers = []
109+
if footers is None:
110+
footers = []
111+
return Document(children, notes, comments=comments, headers=headers, footers=footers)
106112

107113
def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
108114
if indent is None:
@@ -252,5 +258,17 @@ class CommentReference(Element):
252258

253259
comment_reference = CommentReference
254260

261+
@cobble.data
class Header(HasChildren):
    """Document element for a header; declares no fields of its own."""
264+
265+
header = Header
266+
267+
@cobble.data
class Footer(HasChildren):
    """Document element for a footer; declares no fields of its own."""
270+
271+
footer = Footer
272+
255273
def element_visitor(args):
256274
return cobble.visitor(Element, args=args)

mammoth/docx/__init__.py

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from .. import results, lists, zips
77
from .document_xml import read_document_xml_element
8+
from .header_xml import (read_header_xml_element, read_footer_xml_element)
89
from .content_types_xml import empty_content_types, read_content_types_xml_element
910
from .relationships_xml import read_relationships_xml_element, Relationships
1011
from .numbering_xml import read_numbering_xml_element, Numbering
@@ -27,12 +28,13 @@ def read(fileobj):
2728
zip_file,
2829
part_paths=part_paths,
2930
)
30-
3131
return results.combine([
3232
_read_notes(read_part_with_body, part_paths),
3333
_read_comments(read_part_with_body, part_paths),
34+
_read_headers(read_part_with_body, part_paths),
35+
_read_footers(read_part_with_body, part_paths)
3436
]).bind(lambda referents:
35-
_read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths)
37+
_read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], headers=referents[2], footers=referents[3], part_paths=part_paths)
3638
)
3739

3840

@@ -43,6 +45,8 @@ class _PartPaths(object):
4345
endnotes = cobble.field()
4446
footnotes = cobble.field()
4547
numbering = cobble.field()
48+
headers = cobble.field()
49+
footers = cobble.field()
4650
styles = cobble.field()
4751

4852

@@ -55,21 +59,24 @@ def _find_part_paths(zip_file):
5559
_find_relationships_path_for(document_filename),
5660
)
5761

58-
def find(name):
62+
def find(name, multiple=False):
5963
return _find_part_path(
6064
zip_file=zip_file,
6165
relationships=document_relationships,
6266
relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name,
6367
fallback_path="word/{0}.xml".format(name),
6468
base_path=zips.split_path(document_filename)[0],
69+
multiple=multiple
6570
)
66-
71+
6772
return _PartPaths(
6873
main_document=document_filename,
6974
comments=find("comments"),
7075
endnotes=find("endnotes"),
7176
footnotes=find("footnotes"),
7277
numbering=find("numbering"),
78+
headers=find("header", multiple=True),
79+
footers=find("footer", multiple=True),
7380
styles=find("styles"),
7481
)
7582

@@ -88,7 +95,7 @@ def _find_document_filename(zip_file, relationships):
8895
raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
8996

9097

91-
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
98+
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path, multiple=False):
9299
targets = [
93100
zips.join_path(base_path, target).lstrip("/")
94101
for target in relationships.find_targets_by_type(relationship_type)
@@ -97,7 +104,7 @@ def _find_part_path(zip_file, relationships, relationship_type, base_path, fallb
97104
if len(valid_targets) == 0:
98105
return fallback_path
99106
else:
100-
return valid_targets[0]
107+
return valid_targets if multiple else valid_targets[0]
101108

102109

103110
def _read_notes(read_part_with_body, part_paths):
@@ -111,7 +118,6 @@ def _read_notes(read_part_with_body, part_paths):
111118
lambda root, body_reader: read_endnotes_xml_element(root, body_reader=body_reader),
112119
default=_empty_result,
113120
)
114-
115121
return results.combine([footnotes, endnotes]).map(lists.flatten)
116122

117123

@@ -122,14 +128,42 @@ def _read_comments(read_part_with_body, part_paths):
122128
default=_empty_result,
123129
)
124130

131+
def _read_headers(read_part_with_body, part_paths):
    """Read every header part listed in ``part_paths.headers``.

    ``part_paths.headers`` is either a single path string or a collection
    of paths; a lone string is wrapped in a list so both cases are handled
    the same way.

    Returns a list with one read result per path, leaving out any result
    whose ``value`` is an empty list.
    """
    paths = part_paths.headers
    # isinstance instead of an exact type() comparison, so that str
    # subclasses and Python 2 unicode paths still count as a single path
    # rather than being iterated character by character.
    string_types = (str, type(u""))
    if isinstance(paths, string_types):
        header_paths = [paths]
    else:
        header_paths = paths

    def read_header(root, body_reader):
        return read_header_xml_element(root, body_reader=body_reader)

    headers = [
        read_part_with_body(path, read_header, default=_empty_result)
        for path in header_paths
    ]
    # NOTE(review): this returns a plain list of results, not one combined
    # result -- confirm the caller that combines results accepts a list.
    return [header for header in headers if header.value != []]
142+
143+
144+
def _read_footers(read_part_with_body, part_paths):
    """Read every footer part listed in ``part_paths.footers``.

    ``part_paths.footers`` is either a single path string or a collection
    of paths; a lone string is wrapped in a list so both cases are handled
    the same way.

    Returns a list with one read result per path, leaving out any result
    whose ``value`` is an empty list.
    """
    paths = part_paths.footers
    # isinstance instead of an exact type() comparison, so that str
    # subclasses and Python 2 unicode paths still count as a single path
    # rather than being iterated character by character.
    string_types = (str, type(u""))
    if isinstance(paths, string_types):
        footer_paths = [paths]
    else:
        footer_paths = paths

    def read_footer(root, body_reader):
        return read_footer_xml_element(root, body_reader=body_reader)

    footers = [
        read_part_with_body(path, read_footer, default=_empty_result)
        for path in footer_paths
    ]
    # NOTE(review): this returns a plain list of results, not one combined
    # result -- confirm the caller that combines results accepts a list.
    return [footer for footer in footers if footer.value != []]
156+
125157

126-
def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
158+
def _read_document(zip_file, read_part_with_body, notes, comments, headers, footers, part_paths):
127159
return read_part_with_body(
128160
part_paths.main_document,
129161
partial(
130162
read_document_xml_element,
131163
notes=notes,
132164
comments=comments,
165+
headers=headers,
166+
footers=footers
133167
),
134168
)
135169

mammoth/docx/body_xml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def run(element):
7979
.find_child_or_null("w:vertAlign") \
8080
.attributes.get("w:val")
8181
font = properties.find_child_or_null("w:rFonts").attributes.get("w:ascii")
82-
82+
8383
is_bold = read_boolean_element(properties.find_child("w:b"))
8484
is_italic = read_boolean_element(properties.find_child("w:i"))
8585
is_underline = read_boolean_element(properties.find_child("w:u"))

mammoth/docx/document_xml.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ def read_document_xml_element(
55
element,
66
body_reader,
77
notes=None,
8-
comments=None):
8+
comments=None,
9+
headers=None,
10+
footers=None):
911

1012
if notes is None:
1113
notes = []
@@ -17,5 +19,7 @@ def read_document_xml_element(
1719
.map(lambda children: documents.document(
1820
children,
1921
notes=documents.notes(notes),
20-
comments=comments
22+
comments=comments,
23+
headers=headers,
24+
footers=footers
2125
))

mammoth/docx/header_xml.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import functools
2+
from .. import documents
3+
4+
def _read_extremity(extremity, element, body_reader):
    """Read ``element``'s children and wrap them with ``extremity``.

    ``body_reader.read_all`` is applied to ``element.children``; its result
    is then mapped through ``extremity``, which receives the read children
    as its only argument.
    """
    def build(children):
        return extremity(children)

    read_children = body_reader.read_all(element.children)
    return read_children.map(build)
7+
8+
read_header_xml_element = functools.partial(_read_extremity, documents.header)
9+
read_footer_xml_element = functools.partial(_read_extremity, documents.footer)

mammoth/docx/styles_xml.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def read_styles_xml_element(element):
4040
style_set = styles.get(element_type)
4141
if style_set is not None:
4242
style_set[style.style_id] = style
43-
43+
4444
return Styles(
4545
paragraph_styles=paragraph_styles,
4646
character_styles=character_styles,

mammoth/html/nodes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ class Node(object):
99
class TextNode(Node):
1010
value = cobble.field()
1111

12+
def to_text(self):
    """Return this node's ``value`` field unchanged."""
    return self.value
14+
1215

1316
@cobble.data
1417
class Tag(object):
@@ -52,6 +55,8 @@ def separator(self):
5255
def is_void(self):
5356
return not self.children and self.tag_name in self._VOID_TAG_NAMES
5457

58+
def to_text(self):
    """Return the concatenated ``to_text()`` output of this node's children.

    Children are visited in order. A child that does not provide a
    ``to_text`` method contributes no text instead of raising
    ``AttributeError``. An element with no children yields ``""``.
    """
    # NOTE(review): presumably marker nodes such as ForceWrite can appear
    # among the children and have no to_text -- confirm against nodes.py.
    return "".join(
        child.to_text()
        for child in self.children
        if hasattr(child, "to_text")
    )
5560

5661
@cobble.visitable
5762
class ForceWrite(Node):

mammoth/options.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ def read_options(options):
1919
style_map += _default_style_map
2020

2121
options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True)
22+
23+
options["include_headers_and_footers"] = options.get("include_headers_and_footers", False)
24+
options["deduplicate_headers_and_footers"] = options.get("deduplicate_headers_and_footers", False)
25+
2226
options["style_map"] = style_map
2327
return read_style_map_result.map(lambda _: options)
2428

0 commit comments

Comments
 (0)