Merge pull request #57 from torchbox/#42-import-documents

#42 import documents
torchbox · Nov 2, 2021 · 79152ac · 79152ac
2 parents 3bb433f + abe9135
commit 79152ac
Show file tree

Hide file tree

Showing 5 changed files with 142 additions and 41 deletions.
diff --git a/wagtail_wordpress_import/block_builder.py b/wagtail_wordpress_import/block_builder.py
@@ -74,7 +74,7 @@ def build(self):
         """
         soup = self.soup.find("body").findChildren(recursive=False)
         cached_fallback_value = (
-            ""  # keep appending fall back content here, by default is Rich Text block
+            ""  # append fall back content here, by default it's a Rich Text block
         )
         cached_fallback_function = import_string(
             conf_fallback_block()

diff --git a/wagtail_wordpress_import/block_builder_defaults.py b/wagtail_wordpress_import/block_builder_defaults.py
@@ -6,6 +6,7 @@
 from django.core.files import File
 from django.core.files.temp import NamedTemporaryFile
 from wagtail.images.models import Image as ImportedImage
+from wagtail.documents.models import Document as ImportedDocument
 
 """StreamField blocks"""
 
@@ -140,13 +141,16 @@ def conf_fallback_block():
     )
 
 
-def build_none_block_content(cache, blocks):
+def build_none_block_content(html, blocks):
     """
     image_linker is called to link up and retrive the remote image
+    document_linker is called to link up and retrieve the remote documents
     """
-    blocks.append({"type": "rich_text", "value": image_linker(cache)})
-    cache = ""
-    return cache
+    html = image_linker(html)
+    html = document_linker(html)
+    blocks.append({"type": "rich_text", "value": html})
+    html = ""
+    return html
 
 
 """Rich Text Functions"""
@@ -166,6 +170,30 @@ def conf_valid_image_content_types():
     )
 
 
+def conf_valid_document_file_types():
+    return getattr(
+        settings,
+        "",
+        [
+            "pdf",
+            "ppt",
+            "docx",
+        ],
+    )
+
+
+def conf_valid_document_content_types():
+    return getattr(
+        settings,
+        "",
+        [
+            "application/pdf",
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ],
+    )
+
+
 def conf_domain_prefix():
 
     if hasattr(settings, "WAGTAIL_WORDPRESS_IMPORTER_BASE_URL"):
@@ -186,14 +214,14 @@ def image_linker(html):
         string: the html with img tags modified
 
     BS4 performs a find and replace on all img tags found in the HTML.
-    If the image can be retrived from the remote site and saved into a Wagtail ImageModel
+    If the image can be retrieved from the remote site and saved into a Wagtail ImageModel
     the soup is modified.
     """
     soup = BeautifulSoup(html, "html.parser")
     images = soup.find_all("img")
     for image in images:
-        if image.attrs and image.attrs["src"]:
-            image_src = get_abolute_src(image.attrs["src"], conf_domain_prefix())
+        if image.attrs and image.attrs.get("src"):
+            image_src = get_absolute_src(image.attrs["src"], conf_domain_prefix())
             saved_image = get_or_save_image(image_src)
             if saved_image:
                 image_embed = soup.new_tag("embed")
@@ -216,20 +244,31 @@ def get_image_file_name(src):
     return src.split("/")[-1] if src else None  # need the last part
 
 
+def get_document_file_name(src):
+    return src.split("/")[-1] if src else None  # need the last part
+
+
 def image_exists(name):
     try:
         return ImportedImage.objects.get(title=name)
     except ImportedImage.DoesNotExist:
         pass
 
 
+def document_exists(name):
+    try:
+        return ImportedDocument.objects.get(title=name)
+    except ImportedDocument.DoesNotExist:
+        pass
+
+
 def conf_get_requests_settings():
     return getattr(
         settings,
         "WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS",
         {
             "headers": {"User-Agent": "WagtailWordpressImporter"},
-            "timeout": 1,
+            "timeout": 5,
             "stream": False,
         },
     )
@@ -252,7 +291,7 @@ def get_or_save_image(src):
             temp_image.close()
             return retrieved_image
         else:
-            print(f"RECEIVED INVALID RESPONSE: {src}")
+            print(f"RECEIVED INVALID IMAGE RESPONSE: {src}")
     return existing_image
 
 
@@ -265,11 +304,13 @@ def fetch_url(src, r=None, status=False, content_type=None):
             r.headers["content-type"].lower() if r.headers.get("content-type") else ""
         )
     except requests.ConnectTimeout:
-        print(f"THERE WAS A PROBLEM WITH REQUESTS FETCHING: {src}")
+        print(f"CONNECTION TIMEOUT: {src}")
+    except requests.ConnectionError:
+        print(f"CONNECTION ERROR: {src}")
     return r, status, content_type
 
 
-def get_abolute_src(src, domain_prefix=None):
+def get_absolute_src(src, domain_prefix=None):
     src = src.lstrip("/")
     if not src.startswith("http") and domain_prefix:
         return domain_prefix + "/" + src
@@ -286,3 +327,61 @@ def get_alignment_class(image):
             alignment = "right"
 
     return alignment
+
+
+def document_linker(html):
+    """
+    params
+    ======
+        html: html from a single rich_text block
+
+    returns
+    =======
+        string: the html with anchor links modified
+
+    BS4 performs a find and replace on all anchor tags found in the HTML.
+    If the document can be retrieved from the remote site and saved into a Wagtail Document
+    the soup is modified.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    anchors = soup.find_all("a")
+    for anchor in anchors:
+        if anchor.attrs and anchor.attrs.get("href"):
+            anchor_href = get_absolute_src(anchor.attrs["href"], conf_domain_prefix())
+            anchor_inner_content = anchor.text
+            saved_document = get_or_save_document(anchor_href)
+            if saved_document:
+                document_embed = soup.new_tag("a")
+                document_embed.attrs["linktype"] = "document"
+                document_embed.attrs["id"] = saved_document.id
+                document_embed.string = anchor_inner_content
+                # image_embed.attrs["alt"] = get_image_alt(image)
+                # image_embed.attrs["format"] = get_alignment_class(image)
+                anchor.replace_with(document_embed)
+        else:
+            print(f"DOCUMENT HAS NO HREF: {anchor}")
+
+    return str(soup)
+
+
+def get_or_save_document(href):
+    file_type = href.split(".")[-1]
+    if file_type in conf_valid_document_file_types():
+        document_file_name = get_document_file_name(href)
+        existing_document = document_exists(document_file_name)
+        if not existing_document:
+            response, valid, type = fetch_url(href)
+            if valid and (type in conf_valid_document_content_types()):
+                temp_document = NamedTemporaryFile(delete=True)
+                temp_document.name = document_file_name
+                temp_document.write(response.content)
+                temp_document.flush()
+                retrieved_document = ImportedDocument(
+                    file=File(file=temp_document), title=document_file_name
+                )
+                retrieved_document.save()
+                temp_document.close()
+                return retrieved_document
+            else:
+                print(f"RECEIVED INVALID DOCUMENT RESPONSE: {href}")
+        return existing_document
diff --git a/wagtail_wordpress_import/test/fixtures/raw_html.txt b/wagtail_wordpress_import/test/fixtures/raw_html.txt
@@ -3,7 +3,7 @@
 <span style="font-weight: bold;font-style:italic;">Lorem ipsum (xcounterx) dolor sit amet</span>
 
 <a href="#ideas"><strong>Lorem ipsum dolor sit (xcounterx) amet!</strong></a>
-
+<a href="https://www.budgetsaresexy.com/files/personal-finance-culminating-assignment.pdf">Read this</a>
 <h2><strong>Lorem ipsum dolor sit amet?</strong></h2>
 
 <p>Absolute image url.

diff --git a/wagtail_wordpress_import/test/tests/test_block_builder.py b/wagtail_wordpress_import/test/tests/test_block_builder.py
@@ -12,7 +12,7 @@
     build_image_block,
     build_table_block,
     conf_domain_prefix,
-    get_abolute_src,
+    get_absolute_src,
     get_alignment_class,
     get_image_alt,
     get_image_file_name,
@@ -95,6 +95,7 @@ def test_build_iframe_block(self):
         self.assertEqual(output["type"], "raw_html")
         self.assertTrue(output["value"].startswith("<div"))
 
+    # work in progress
     def test_build_image_block(self):
         input = """<img src="http://www.example.com/image.jpg" />"""
         soup = get_soup(input, "html.parser")
@@ -219,55 +220,56 @@ def test_get_image_file_name(self):
         self.assertEqual(get_image_file_name("fakeimage.jpg"), "fakeimage.jpg")
         self.assertEqual(get_image_file_name("folder/fakeimage.jpg"), "fakeimage.jpg")
         self.assertEqual(
-            get_image_file_name(
-                "http://www.example.com/folder1/folder2//fakeimage.jpg"
-            ),
+            get_image_file_name("http://www.example.com/folder1/folder2/fakeimage.jpg"),
             "fakeimage.jpg",
         )
 
-    def test_get_abolute_src(self):
+    def test_get_absolute_src(self):
         self.assertEqual(
-            get_abolute_src("fakeimage.jpg", "http://www.example.com"),
+            get_absolute_src("fakeimage.jpg", "http://www.example.com"),
             "http://www.example.com/fakeimage.jpg",
         )
         self.assertEqual(
-            get_abolute_src("folder/fakeimage.jpg", "http://www.example.com"),
+            get_absolute_src("folder/fakeimage.jpg", "http://www.example.com"),
             "http://www.example.com/folder/fakeimage.jpg",
         )
+
+    def test_get_absolute_src_without_base_url(self):
         self.assertEqual(
-            get_abolute_src("folder/fakeimage.jpg"),
+            get_absolute_src("folder/fakeimage.jpg"),
             "folder/fakeimage.jpg",
-        )  # the test settings has no BASE_URL setting so try having no domain prefix
+        )
 
     def test_get_abolute_src_slashes_at_start(self):
         self.assertEqual(
-            get_abolute_src("//folder/fakeimage.jpg", "http://www.example.com"),
+            get_absolute_src("//folder/fakeimage.jpg", "http://www.example.com"),
             "http://www.example.com/folder/fakeimage.jpg",
         )
 
-    def test_get_alignment_class(self):
-        input = get_soup(
+    def test_get_alignment_class_align_left(self):
+        soup = get_soup(
             '<img src="fakeimage.jpg" alt="image alt" class="align-left" />',
             "html.parser",
         ).find("img")
-        self.assertEqual(get_alignment_class(input), "left")
-        input = get_soup(
+        self.assertEqual(get_alignment_class(soup), "left")
+
+    def test_get_alignment_class_align_right(self):
+        soup = get_soup(
             '<img src="fakeimage.jpg" alt="image alt" class="align-right" />',
             "html.parser",
         ).find("img")
-        self.assertEqual(get_alignment_class(input), "right")
-        input = get_soup(
+        self.assertEqual(get_alignment_class(soup), "right")
+
+    def test_get_alignment_class_not_present(self):
+        soup = get_soup(
             '<img src="fakeimage.jpg" alt="image alt" />',
             "html.parser",
         ).find("img")
-        self.assertEqual(get_alignment_class(input), "fullwidth")
-
-    def test_with_real_image(self):
-        # but we need to test with mocked images if we can.
-        raw_html_file = """
-        <p>Lorem <img src="https://dummyimage.com/600x400/000/fff" alt=""></p>
-        """
-        self.builder = BlockBuilder(raw_html_file, None, None)
-        self.builder.promote_child_tags()
-        self.blocks = self.builder.build()
-        self.assertTrue("<embed" in self.blocks[0]["value"])
+        self.assertEqual(get_alignment_class(soup), "fullwidth")
+
+    """
+    TODO: Add some more tests
+    I need to include tests here for images and documents.
+    I'm not sure how this could be done at the moment.
+    Also applies to: test_images_linked_rich_text() above
+    """
diff --git a/wagtail_wordpress_import/test/tests/test_wordpress_item.py b/wagtail_wordpress_import/test/tests/test_wordpress_item.py
@@ -1,6 +1,6 @@
 import os
 import json
-from django.test import TestCase
+from django.test import TestCase, override_settings
 from datetime import datetime
 from wagtail_wordpress_import.importers.wordpress import WordpressItem
 from wagtail_wordpress_import.logger import Logger