allow for including headers/footers via command line

Tom Keefe · Tom Keefe · commit fd8ab038b089 · 2018-09-04T14:06:16.000+01:00
diff --git a/README.md b/README.md
@@ -285,6 +285,12 @@ Converts the source document to HTML.
 * `ignore_empty_paragraphs`: by default, empty paragraphs are ignored.
   Set this option to `False` to preserve empty paragraphs in the output.
 
+* `include_headers_and_footers`: by default, headers and footers are not included in the output.
+  Set this option to `True` to include headers at the start of the output and footers at the end.
+
+* `deduplicate_headers_and_footers`: by default, when headers and footers are included, repeated ones are all kept.
+  Set this option to `True` to keep only the first header or footer element with any given text.
+
 * `id_prefix`:
   a string to prepend to any generated IDs,
   such as those used by bookmarks, footnotes and endnotes.
diff --git a/mammoth/conversion.py b/mammoth/conversion.py
@@ -16,7 +16,9 @@ def convert_document_element_to_html(element,
         convert_image=None,
         id_prefix=None,
         output_format=None,
-        ignore_empty_paragraphs=True):
+        ignore_empty_paragraphs=True,
+        include_headers_and_footers=False,
+        deduplicate_headers_and_footers=False):
             
     if style_map is None:
         style_map = []
@@ -34,14 +36,15 @@ def convert_document_element_to_html(element,
         )
     else:
         comments = {}
-
     messages = []
     converter = _DocumentConverter(
         messages=messages,
         style_map=style_map,
         convert_image=convert_image,
         id_prefix=id_prefix,
         ignore_empty_paragraphs=ignore_empty_paragraphs,
+        include_headers_and_footers=include_headers_and_footers,
+        deduplicate_headers_and_footers=deduplicate_headers_and_footers,
         note_references=[],
         comments=comments,
     )
@@ -62,11 +65,22 @@ def copy(self, **kwargs):
 
 
 class _DocumentConverter(documents.element_visitor(args=1)):
-    def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
+    def __init__(self, 
+                messages, 
+                style_map, 
+                convert_image, 
+                id_prefix, 
+                ignore_empty_paragraphs,
+                include_headers_and_footers,
+                deduplicate_headers_and_footers,
+                note_references, 
+                comments):
         self._messages = messages
         self._style_map = style_map
         self._id_prefix = id_prefix
         self._ignore_empty_paragraphs = ignore_empty_paragraphs
+        self._include_headers_and_footers = include_headers_and_footers
+        self._deduplicate_headers_and_footers = deduplicate_headers_and_footers
         self._note_references = note_references
         self._referenced_comments = []
         self._convert_image = convert_image
@@ -81,17 +95,27 @@ def visit_image(self, image, context):
 
     def visit_document(self, document, context):
         nodes = self._visit_all(document.children, context)
+
+        headers = []
+        footers = []
+
+        if self._include_headers_and_footers:
+            headers = self.visit_headers(document.headers, context)
+            footers = self.visit_footers(document.footers, context)
+
         notes = [
             document.notes.resolve(reference)
             for reference in self._note_references
         ]
         notes_list = html.element("ol", {}, self._visit_all(notes, context))
+        
         comments = html.element("dl", {}, [
             html_node
             for referenced_comment in self._referenced_comments
             for html_node in self.visit_comment(referenced_comment, context)
         ])
-        return nodes + [notes_list, comments]
+
+        return headers + nodes + [notes_list, comments] + footers
 
 
     def visit_paragraph(self, paragraph, context):
@@ -300,6 +324,49 @@ def visit_comment(self, referenced_comment, context):
             html.element("dd", {}, body),
         ]
 
+    def visit_header(self, header, context):
+        return self._visit_all(header.children, context)  # a header becomes the HTML nodes of its children
+
+    def visit_headers(self, headers, context):
+        """Convert all headers to one flat list of HTML nodes, optionally de-duplicated by text."""
+        converted = [
+            node
+            for header in headers
+            for node in self.visit_header(header, context)
+        ]
+        if not self._deduplicate_headers_and_footers:
+            return converted
+        # Keep only the first node seen for each distinct text, in original order.
+        seen_texts = set()
+        unique_nodes = []
+        for node in converted:
+            text = node.to_text()
+            if text not in seen_texts:
+                seen_texts.add(text)
+                unique_nodes.append(node)
+        return unique_nodes
+
+    def visit_footer(self, footer, context):
+        return self._visit_all(footer.children, context)  # a footer becomes the HTML nodes of its children
+
+    def visit_footers(self, footers, context):
+        """Convert all footers to one flat list of HTML nodes, optionally de-duplicated by text."""
+        converted = [
+            node
+            for footer in footers
+            for node in self.visit_footer(footer, context)
+        ]
+        if not self._deduplicate_headers_and_footers:
+            return converted
+        # Keep only the first node seen for each distinct text, in original order.
+        seen_texts = set()
+        unique_nodes = []
+        for node in converted:
+            text = node.to_text()
+            if text not in seen_texts:
+                seen_texts.add(text)
+                unique_nodes.append(node)
+        return unique_nodes
 
     def _visit_all(self, elements, context):
         return [
diff --git a/mammoth/documents.py b/mammoth/documents.py
@@ -14,6 +14,8 @@ class HasChildren(Element):
 class Document(HasChildren):
     notes = cobble.field()
     comments = cobble.field()
+    headers = cobble.field()
+    footers = cobble.field()
 
 @cobble.data
 class Paragraph(HasChildren):
@@ -97,12 +99,16 @@ class Image(Element):
     open = cobble.field()
 
 
-def document(children, notes=None, comments=None):
+def document(children, notes=None, comments=None, headers=None, footers=None):
     if notes is None:
         notes = Notes({})
     if comments is None:
         comments = []
-    return Document(children, notes, comments=comments)
+    if headers is None:
+        headers = []
+    if footers is None:
+        footers = []
+    return Document(children, notes, comments=comments, headers=headers, footers=footers)
 
 def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
     if indent is None:
@@ -252,5 +258,17 @@ class CommentReference(Element):
 
 comment_reference = CommentReference
 
+@cobble.data
+class Header(HasChildren):
+    pass  # declares no fields of its own; only what HasChildren provides
+
+header = Header  # lowercase alias used as the constructor name
+
+@cobble.data
+class Footer(HasChildren):
+    pass  # declares no fields of its own; only what HasChildren provides
+
+footer = Footer  # lowercase alias used as the constructor name
+
 def element_visitor(args):
     return cobble.visitor(Element, args=args)
diff --git a/mammoth/docx/__init__.py b/mammoth/docx/__init__.py
@@ -5,6 +5,7 @@
 
 from .. import results, lists, zips
 from .document_xml import read_document_xml_element
+from .header_xml import (read_header_xml_element, read_footer_xml_element)
 from .content_types_xml import empty_content_types, read_content_types_xml_element
 from .relationships_xml import read_relationships_xml_element, Relationships
 from .numbering_xml import read_numbering_xml_element, Numbering
@@ -27,12 +28,13 @@ def read(fileobj):
         zip_file,
         part_paths=part_paths,
     )
-    
     return results.combine([
         _read_notes(read_part_with_body, part_paths),
         _read_comments(read_part_with_body, part_paths),
+        _read_headers(read_part_with_body, part_paths),
+        _read_footers(read_part_with_body, part_paths)
     ]).bind(lambda referents:
-        _read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths)
+        _read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], headers=referents[2], footers=referents[3], part_paths=part_paths)
     )
 
 
@@ -43,6 +45,8 @@ class _PartPaths(object):
     endnotes = cobble.field()
     footnotes = cobble.field()
     numbering = cobble.field()
+    headers = cobble.field()
+    footers = cobble.field()
     styles = cobble.field()
 
 
@@ -55,21 +59,24 @@ def _find_part_paths(zip_file):
         _find_relationships_path_for(document_filename),
     )
     
-    def find(name):
+    def find(name, multiple=False):
         return _find_part_path(
             zip_file=zip_file,
             relationships=document_relationships,
             relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name,
             fallback_path="word/{0}.xml".format(name),
             base_path=zips.split_path(document_filename)[0],
+            multiple=multiple
         )
-    
+
     return _PartPaths(
         main_document=document_filename,
         comments=find("comments"),
         endnotes=find("endnotes"),
         footnotes=find("footnotes"),
         numbering=find("numbering"),
+        headers=find("header", multiple=True),
+        footers=find("footer", multiple=True),
         styles=find("styles"),
     )
 
@@ -88,7 +95,7 @@ def _find_document_filename(zip_file, relationships):
         raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")
 
 
-def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
+def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path, multiple=False):
     targets = [
         zips.join_path(base_path, target).lstrip("/")
         for target in relationships.find_targets_by_type(relationship_type)
@@ -97,7 +104,7 @@ def _find_part_path(zip_file, relationships, relationship_type, base_path, fallb
     if len(valid_targets) == 0:
         return fallback_path
     else:
-        return valid_targets[0]
+        return valid_targets if multiple else valid_targets[0]
 
 
 def _read_notes(read_part_with_body, part_paths):
@@ -111,7 +118,6 @@ def _read_notes(read_part_with_body, part_paths):
         lambda root, body_reader: read_endnotes_xml_element(root, body_reader=body_reader),
         default=_empty_result,
     )
-    
     return results.combine([footnotes, endnotes]).map(lists.flatten)
 
 
@@ -122,14 +128,42 @@ def _read_comments(read_part_with_body, part_paths):
         default=_empty_result,
     )
 
+def _read_headers(read_part_with_body, part_paths):
+    # A lone string is the single fallback path; otherwise this is already a list of header part paths.
+    header_paths = part_paths.headers
+    if isinstance(header_paths, str):
+        header_paths = [header_paths]
+    headers = [
+        read_part_with_body(header,
+            lambda root, body_reader: read_header_xml_element(root, body_reader=body_reader),
+        default=_empty_result) for header in header_paths]
+    # Drop results whose value is an empty list (presumably the `_empty_result` default - confirm).
+    return [h for h in headers if h.value != []]
+
+
+def _read_footers(read_part_with_body, part_paths):
+    # A lone string is the single fallback path; otherwise this is already a list of footer part paths.
+    footer_paths = part_paths.footers
+    if isinstance(footer_paths, str):
+        footer_paths = [footer_paths]
+
+    footers = [
+        read_part_with_body(footer,
+            lambda root, body_reader: read_footer_xml_element(root, body_reader=body_reader),
+        default=_empty_result) for footer in footer_paths]
+    # Drop results whose value is an empty list (presumably the `_empty_result` default - confirm).
+    return [f for f in footers if f.value != []]
+
     
-def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
+def _read_document(zip_file, read_part_with_body, notes, comments, headers, footers, part_paths):
     return read_part_with_body(
         part_paths.main_document,
         partial(
             read_document_xml_element,
             notes=notes,
             comments=comments,
+            headers=headers,
+            footers=footers
         ),
     )
 
diff --git a/mammoth/docx/body_xml.py b/mammoth/docx/body_xml.py
@@ -79,7 +79,7 @@ def run(element):
             .find_child_or_null("w:vertAlign") \
             .attributes.get("w:val")
         font = properties.find_child_or_null("w:rFonts").attributes.get("w:ascii")
-        
+
         is_bold = read_boolean_element(properties.find_child("w:b"))
         is_italic = read_boolean_element(properties.find_child("w:i"))
         is_underline = read_boolean_element(properties.find_child("w:u"))
diff --git a/mammoth/docx/document_xml.py b/mammoth/docx/document_xml.py
@@ -5,7 +5,9 @@ def read_document_xml_element(
         element,
         body_reader,
         notes=None,
-        comments=None):
+        comments=None,
+        headers=None,
+        footers=None):
     
     if notes is None:
         notes = []
@@ -17,5 +19,7 @@ def read_document_xml_element(
         .map(lambda children: documents.document(
             children,
             notes=documents.notes(notes),
-            comments=comments
+            comments=comments,
+            headers=headers,
+            footers=footers
         ))
diff --git a/mammoth/docx/header_xml.py b/mammoth/docx/header_xml.py
@@ -0,0 +1,9 @@
+import functools
+from .. import documents
+
+def _read_extremity(extremity, element, body_reader):
+    # Read the part's child elements, then wrap the result with `extremity` (the header/footer constructor).
+    return body_reader.read_all(element.children).map(extremity)
+
+read_header_xml_element = functools.partial(_read_extremity, documents.header)
+read_footer_xml_element = functools.partial(_read_extremity, documents.footer)
diff --git a/mammoth/docx/styles_xml.py b/mammoth/docx/styles_xml.py
@@ -40,7 +40,7 @@ def read_styles_xml_element(element):
         style_set = styles.get(element_type)
         if style_set is not None:
             style_set[style.style_id] = style
-    
+
     return Styles(
         paragraph_styles=paragraph_styles,
         character_styles=character_styles,
diff --git a/mammoth/html/nodes.py b/mammoth/html/nodes.py
@@ -9,6 +9,9 @@ class Node(object):
 class TextNode(Node):
     value = cobble.field()
 
+    def to_text(self):
+        return self.value  # a text node's text is its stored value
+
 
 @cobble.data
 class Tag(object):
@@ -52,6 +55,8 @@ def separator(self):
     def is_void(self):
         return not self.children and self.tag_name in self._VOID_TAG_NAMES
 
+    def to_text(self):
+        return "".join(child.to_text() for child in self.children)  # NOTE(review): assumes every child has to_text()
 
 @cobble.visitable
 class ForceWrite(Node):
diff --git a/mammoth/options.py b/mammoth/options.py
@@ -19,6 +19,10 @@ def read_options(options):
         style_map += _default_style_map
     
     options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True)
+    
+    options["include_headers_and_footers"] = options.get("include_headers_and_footers", False)
+    options["deduplicate_headers_and_footers"] = options.get("deduplicate_headers_and_footers", False)
+
     options["style_map"] = style_map
     return read_style_map_result.map(lambda _: options)
 
diff --git a/mammoth/results.py b/mammoth/results.py