Skip to content

Commit

Permalink
Merge pull request #57 from torchbox/#42-import-documents
Browse files Browse the repository at this point in the history
#42 import documents
  • Loading branch information
nickmoreton authored Nov 2, 2021
2 parents 3bb433f + abe9135 commit 79152ac
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 41 deletions.
2 changes: 1 addition & 1 deletion wagtail_wordpress_import/block_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def build(self):
"""
soup = self.soup.find("body").findChildren(recursive=False)
cached_fallback_value = (
"" # keep appending fall back content here, by default is Rich Text block
"" # append fall back content here, by default it's a Rich Text block
)
cached_fallback_function = import_string(
conf_fallback_block()
Expand Down
121 changes: 110 additions & 11 deletions wagtail_wordpress_import/block_builder_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from django.core.files import File
from django.core.files.temp import NamedTemporaryFile
from wagtail.images.models import Image as ImportedImage
from wagtail.documents.models import Document as ImportedDocument

"""StreamField blocks"""

Expand Down Expand Up @@ -140,13 +141,16 @@ def conf_fallback_block():
)


def build_none_block_content(cache, blocks):
def build_none_block_content(html, blocks):
"""
image_linker is called to link up and retrieve the remote image
document_linker is called to link up and retrieve the remote documents
"""
blocks.append({"type": "rich_text", "value": image_linker(cache)})
cache = ""
return cache
html = image_linker(html)
html = document_linker(html)
blocks.append({"type": "rich_text", "value": html})
html = ""
return html


"""Rich Text Functions"""
Expand All @@ -166,6 +170,30 @@ def conf_valid_image_content_types():
)


def conf_valid_document_file_types():
    """
    Return the file extensions (without a leading dot) that are accepted
    when importing linked documents.

    The list is read from the
    WAGTAIL_WORDPRESS_IMPORTER_VALID_DOCUMENT_FILE_TYPES setting and falls
    back to the defaults below when the setting is not defined.
    """
    # The setting name must be non-empty, otherwise getattr() can never find
    # an override and the defaults are always used.
    # NOTE(review): "ppt" is allowed here but the default content types only
    # list the .pptx MIME type - confirm whether legacy .ppt should import.
    return getattr(
        settings,
        "WAGTAIL_WORDPRESS_IMPORTER_VALID_DOCUMENT_FILE_TYPES",
        [
            "pdf",
            "ppt",
            "docx",
        ],
    )


def conf_valid_document_content_types():
    """
    Return the response content types that are accepted when importing
    linked documents.

    The list is read from the
    WAGTAIL_WORDPRESS_IMPORTER_VALID_DOCUMENT_CONTENT_TYPES setting and falls
    back to the defaults below when the setting is not defined.
    """
    # The setting name must be non-empty, otherwise getattr() can never find
    # an override and the defaults are always used.
    return getattr(
        settings,
        "WAGTAIL_WORDPRESS_IMPORTER_VALID_DOCUMENT_CONTENT_TYPES",
        [
            "application/pdf",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ],
    )


def conf_domain_prefix():

if hasattr(settings, "WAGTAIL_WORDPRESS_IMPORTER_BASE_URL"):
Expand All @@ -186,14 +214,14 @@ def image_linker(html):
string: the html with img tags modified
BS4 performs a find and replace on all img tags found in the HTML.
If the image can be retrived from the remote site and saved into a Wagtail ImageModel
If the image can be retrieved from the remote site and saved into a Wagtail ImageModel
the soup is modified.
"""
soup = BeautifulSoup(html, "html.parser")
images = soup.find_all("img")
for image in images:
if image.attrs and image.attrs["src"]:
image_src = get_abolute_src(image.attrs["src"], conf_domain_prefix())
if image.attrs and image.attrs.get("src"):
image_src = get_absolute_src(image.attrs["src"], conf_domain_prefix())
saved_image = get_or_save_image(image_src)
if saved_image:
image_embed = soup.new_tag("embed")
Expand All @@ -216,20 +244,31 @@ def get_image_file_name(src):
return src.split("/")[-1] if src else None # need the last part


def get_document_file_name(src):
    """Return the last "/"-separated segment of src, or None if src is empty."""
    if not src:
        return None
    # everything after the final slash is treated as the file name
    return src.rsplit("/", 1)[-1]


def image_exists(name):
    """
    Return an already imported image whose title equals name, or None when
    there is no such image.
    """
    # filter().first() instead of get(): get() raises MultipleObjectsReturned
    # when several images share the same title, which would abort the import.
    return ImportedImage.objects.filter(title=name).first()


def document_exists(name):
    """
    Return an already imported document whose title equals name, or None
    when there is no such document.
    """
    # filter().first() instead of get(): get() raises MultipleObjectsReturned
    # when several documents share the same title, which would abort the import.
    return ImportedDocument.objects.filter(title=name).first()


def conf_get_requests_settings():
return getattr(
settings,
"WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS",
{
"headers": {"User-Agent": "WagtailWordpressImporter"},
"timeout": 1,
"timeout": 5,
"stream": False,
},
)
Expand All @@ -252,7 +291,7 @@ def get_or_save_image(src):
temp_image.close()
return retrieved_image
else:
print(f"RECEIVED INVALID RESPONSE: {src}")
print(f"RECEIVED INVALID IMAGE RESPONSE: {src}")
return existing_image


Expand All @@ -265,11 +304,13 @@ def fetch_url(src, r=None, status=False, content_type=None):
r.headers["content-type"].lower() if r.headers.get("content-type") else ""
)
except requests.ConnectTimeout:
print(f"THERE WAS A PROBLEM WITH REQUESTS FETCHING: {src}")
print(f"CONNECTION TIMEOUT: {src}")
except requests.ConnectionError:
print(f"CONNECTION ERROR: {src}")
return r, status, content_type


def get_abolute_src(src, domain_prefix=None):
def get_absolute_src(src, domain_prefix=None):
src = src.lstrip("/")
if not src.startswith("http") and domain_prefix:
return domain_prefix + "/" + src
Expand All @@ -286,3 +327,61 @@ def get_alignment_class(image):
alignment = "right"

return alignment


def document_linker(html):
    """
    params
    ======
        html: html from a single rich_text block
    returns
    =======
        string: the html with anchor links modified

    BS4 walks every anchor tag found in the HTML.
    If get_or_save_document returns a document for the anchor's href, the
    anchor is replaced by a new anchor tag carrying linktype="document" and
    the document id; otherwise the anchor is left as it is.
    """
    soup = BeautifulSoup(html, "html.parser")

    for link in soup.find_all("a"):
        # anchors without a usable href cannot point at a document
        if not (link.attrs and link.attrs.get("href")):
            print(f"DOCUMENT HAS NO HREF: {link}")
            continue

        absolute_href = get_absolute_src(link.attrs["href"], conf_domain_prefix())
        link_text = link.text
        document = get_or_save_document(absolute_href)
        if not document:
            continue

        replacement = soup.new_tag("a")
        replacement.attrs["linktype"] = "document"
        replacement.attrs["id"] = document.id
        replacement.string = link_text
        link.replace_with(replacement)

    return str(soup)


def get_or_save_document(href):
    """
    Return the document for href, importing it first if necessary.

    params
    ======
        href: the url of the linked file
    returns
    =======
        The existing document whose title matches the file name, or a newly
        saved document built from the fetched response, or None when the
        file extension is not allowed or the response is not a valid
        document.
    """
    # the extension is whatever follows the last "." in the href
    file_type = href.split(".")[-1]
    if file_type not in conf_valid_document_file_types():
        return None

    document_file_name = get_document_file_name(href)
    existing_document = document_exists(document_file_name)
    if existing_document:
        return existing_document

    response, valid, content_type = fetch_url(href)
    # Compare only the media type: servers may append parameters such as
    # "; charset=utf-8" which would otherwise never match the allowed list.
    media_type = (content_type or "").split(";")[0].strip()
    if not (valid and media_type in conf_valid_document_content_types()):
        print(f"RECEIVED INVALID DOCUMENT RESPONSE: {href}")
        return None

    # The context manager closes (and so removes) the temporary file even if
    # saving the document raises.
    with NamedTemporaryFile(delete=True) as temp_document:
        temp_document.name = document_file_name
        temp_document.write(response.content)
        temp_document.flush()
        retrieved_document = ImportedDocument(
            file=File(file=temp_document), title=document_file_name
        )
        retrieved_document.save()
    return retrieved_document
2 changes: 1 addition & 1 deletion wagtail_wordpress_import/test/fixtures/raw_html.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<span style="font-weight: bold;font-style:italic;">Lorem ipsum (xcounterx) dolor sit amet</span>

<a href="#ideas"><strong>Lorem ipsum dolor sit (xcounterx) amet!</strong></a>

<a href="https://www.budgetsaresexy.com/files/personal-finance-culminating-assignment.pdf">Read this</a>
<h2><strong>Lorem ipsum dolor sit amet?</strong></h2>

<p>Absolute image url.
Expand Down
56 changes: 29 additions & 27 deletions wagtail_wordpress_import/test/tests/test_block_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
build_image_block,
build_table_block,
conf_domain_prefix,
get_abolute_src,
get_absolute_src,
get_alignment_class,
get_image_alt,
get_image_file_name,
Expand Down Expand Up @@ -95,6 +95,7 @@ def test_build_iframe_block(self):
self.assertEqual(output["type"], "raw_html")
self.assertTrue(output["value"].startswith("<div"))

# work in progress
def test_build_image_block(self):
input = """<img src="http://www.example.com/image.jpg" />"""
soup = get_soup(input, "html.parser")
Expand Down Expand Up @@ -219,55 +220,56 @@ def test_get_image_file_name(self):
self.assertEqual(get_image_file_name("fakeimage.jpg"), "fakeimage.jpg")
self.assertEqual(get_image_file_name("folder/fakeimage.jpg"), "fakeimage.jpg")
self.assertEqual(
get_image_file_name(
"http://www.example.com/folder1/folder2//fakeimage.jpg"
),
get_image_file_name("http://www.example.com/folder1/folder2/fakeimage.jpg"),
"fakeimage.jpg",
)

def test_get_abolute_src(self):
def test_get_absolute_src(self):
self.assertEqual(
get_abolute_src("fakeimage.jpg", "http://www.example.com"),
get_absolute_src("fakeimage.jpg", "http://www.example.com"),
"http://www.example.com/fakeimage.jpg",
)
self.assertEqual(
get_abolute_src("folder/fakeimage.jpg", "http://www.example.com"),
get_absolute_src("folder/fakeimage.jpg", "http://www.example.com"),
"http://www.example.com/folder/fakeimage.jpg",
)

def test_get_absolute_src_without_base_url(self):
self.assertEqual(
get_abolute_src("folder/fakeimage.jpg"),
get_absolute_src("folder/fakeimage.jpg"),
"folder/fakeimage.jpg",
) # the test settings has no BASE_URL setting so try having no domain prefix
)

def test_get_abolute_src_slashes_at_start(self):
self.assertEqual(
get_abolute_src("//folder/fakeimage.jpg", "http://www.example.com"),
get_absolute_src("//folder/fakeimage.jpg", "http://www.example.com"),
"http://www.example.com/folder/fakeimage.jpg",
)

def test_get_alignment_class(self):
input = get_soup(
def test_get_alignment_class_align_left(self):
soup = get_soup(
'<img src="fakeimage.jpg" alt="image alt" class="align-left" />',
"html.parser",
).find("img")
self.assertEqual(get_alignment_class(input), "left")
input = get_soup(
self.assertEqual(get_alignment_class(soup), "left")

def test_get_alignment_class_align_right(self):
soup = get_soup(
'<img src="fakeimage.jpg" alt="image alt" class="align-right" />',
"html.parser",
).find("img")
self.assertEqual(get_alignment_class(input), "right")
input = get_soup(
self.assertEqual(get_alignment_class(soup), "right")

def test_get_alignment_class_not_present(self):
soup = get_soup(
'<img src="fakeimage.jpg" alt="image alt" />',
"html.parser",
).find("img")
self.assertEqual(get_alignment_class(input), "fullwidth")

def test_with_real_image(self):
# but we need to test with mocked images if we can.
raw_html_file = """
<p>Lorem <img src="https://dummyimage.com/600x400/000/fff" alt=""></p>
"""
self.builder = BlockBuilder(raw_html_file, None, None)
self.builder.promote_child_tags()
self.blocks = self.builder.build()
self.assertTrue("<embed" in self.blocks[0]["value"])
self.assertEqual(get_alignment_class(soup), "fullwidth")

"""
TODO: Add some more tests
I need to include tests here for images and documents.
I'm not sure how this could be done at the moment.
Also applies to: test_images_linked_rich_text() above
"""
2 changes: 1 addition & 1 deletion wagtail_wordpress_import/test/tests/test_wordpress_item.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import json
from django.test import TestCase
from django.test import TestCase, override_settings
from datetime import datetime
from wagtail_wordpress_import.importers.wordpress import WordpressItem
from wagtail_wordpress_import.logger import Logger
Expand Down

0 comments on commit 79152ac

Please sign in to comment.