From 8e69411e826fc09e32ca6d289c30ae9595c45a3e Mon Sep 17 00:00:00 2001 From: Cristi Constantin Date: Tue, 16 Jul 2019 14:53:38 +0100 Subject: [PATCH 1/2] Basic ignore empty OpenGraph props --- extruct/_extruct.py | 5 +++++ extruct/opengraph.py | 2 ++ tests/samples/songkick/elysianfields.html | 1 + tests/samples/songkick/elysianfields.json | 3 +++ tests/test_extruct.py | 1 - 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index ba35a6fa..2f206383 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -54,6 +54,7 @@ def extract(htmlstring, if errors not in ['log', 'ignore', 'strict']: raise ValueError('Invalid error command, valid values are either "log"' ', "ignore" or "strict"') + try: tree = parse_xmldom_html(htmlstring, encoding=encoding) except Exception as e: @@ -65,6 +66,7 @@ def extract(htmlstring, return {} if errors == 'strict': raise + processors = [] if 'microdata' in syntaxes: processors.append( @@ -95,6 +97,7 @@ def extract(htmlstring, ('rdfa', RDFaExtractor().extract_items, tree, )) + output = {} for syntax, extract, document in processors: try: @@ -108,6 +111,7 @@ def extract(htmlstring, pass if errors == 'strict': raise + if uniform: uniform_processors = [] if 'microdata' in syntaxes: @@ -131,6 +135,7 @@ def extract(htmlstring, output['opengraph'], None, )) + for syntax, uniform, raw, schema_context in uniform_processors: try: if syntax == 'opengraph': diff --git a/extruct/opengraph.py b/extruct/opengraph.py index 78e836bf..e5b97dae 100644 --- a/extruct/opengraph.py +++ b/extruct/opengraph.py @@ -32,6 +32,8 @@ def extract_items(self, document, base_url=None): for el in head.xpath('meta[@property and @content]'): prop = el.attrib['property'] val = el.attrib['content'] + if prop == '' or val == '': + continue ns = prop.partition(':')[0] if ns in _OG_NAMESPACES: namespaces[ns] = _OG_NAMESPACES[ns] diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html index 790465b3..c7a00b4a 100644 --- a/tests/samples/songkick/elysianfields.html +++ b/tests/samples/songkick/elysianfields.html @@ -28,6 +28,7 @@ + diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json index 4b9f3649..0f94c14e 100644 --- a/tests/samples/songkick/elysianfields.json +++ b/tests/samples/songkick/elysianfields.json @@ -232,6 +232,9 @@ "http://ogp.me/ns#description": [ { "@value": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017." + }, + { + "@value": "" } ], "http://ogp.me/ns#image": [ diff --git a/tests/test_extruct.py b/tests/test_extruct.py index dc08401e..bb79af8f 100644 --- a/tests/test_extruct.py +++ b/tests/test_extruct.py @@ -5,7 +5,6 @@ import pytest import extruct -from extruct import SYNTAXES from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id From d4645ef96bdb8c2ab8129f496660992f48c32abd Mon Sep 17 00:00:00 2001 From: Cristi Constantin Date: Wed, 17 Jul 2019 11:56:36 +0100 Subject: [PATCH 2/2] Strip empty prop and content tags Updated the tests --- extruct/opengraph.py | 4 ++-- requirements.txt | 2 +- tests/samples/songkick/elysianfields.html | 1 + tests/samples/songkick/elysianfields.json | 3 +++ tests/test_extruct.py | 10 +++++++--- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/extruct/opengraph.py b/extruct/opengraph.py index e5b97dae..978d25ab 100644 --- a/extruct/opengraph.py +++ b/extruct/opengraph.py @@ -30,8 +30,8 @@ def extract_items(self, document, base_url=None): namespaces.update(self.get_namespaces(head)) props = [] for el in head.xpath('meta[@property and @content]'): - prop = el.attrib['property'] - val = el.attrib['content'] + prop = el.attrib['property'].strip() + val = el.attrib['content'].strip() if prop == '' or val == '': continue ns = prop.partition(':')[0] diff --git a/requirements.txt b/requirements.txt index 87a27224..820557a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ requests rdflib rdflib-jsonld mf2py>=1.1.0 -six +six>=1.11 w3lib diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html index c7a00b4a..4fa2ba0a 100644 --- a/tests/samples/songkick/elysianfields.html +++ b/tests/samples/songkick/elysianfields.html @@ -27,6 +27,7 @@ + diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json index 0f94c14e..ba8e9f56 100644 --- a/tests/samples/songkick/elysianfields.json +++ b/tests/samples/songkick/elysianfields.json @@ -253,6 +253,9 @@ "http://ogp.me/ns#title": [ { "@value": "Elysian Fields" + }, + { + "@value": " " } ], "http://ogp.me/ns#type": [ diff --git a/tests/test_extruct.py b/tests/test_extruct.py index bb79af8f..a2ba8003 100644 --- a/tests/test_extruct.py +++ b/tests/test_extruct.py @@ -16,9 +16,13 @@ def test_all(self): body = get_testdata('songkick', 'elysianfields.html') expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8')) data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields') - # See test_rdfa_not_preserving_order() - del data['rdfa'][0]['http://ogp.me/ns#image'] - del expected['rdfa'][0]['http://ogp.me/ns#image'] + # Sorting the values here because RDFa is not preserving ordering on duplicated properties. + # See https://github.com/scrapinghub/extruct/issues/116 + # Also see test_rdfa_not_preserving_order() + for rdf in data['rdfa']: + for key, pairs in rdf.items(): + if ':' in key and isinstance(pairs, list): + rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True) self.assertEqual(jsonize_dict(data), expected) @pytest.mark.xfail