From 8e69411e826fc09e32ca6d289c30ae9595c45a3e Mon Sep 17 00:00:00 2001
From: Cristi Constantin <cristi.constantin@sent.com>
Date: Tue, 16 Jul 2019 14:53:38 +0100
Subject: [PATCH 1/2] Ignore empty OpenGraph props

---
 extruct/_extruct.py                       | 5 +++++
 extruct/opengraph.py                      | 2 ++
 tests/samples/songkick/elysianfields.html | 1 +
 tests/samples/songkick/elysianfields.json | 3 +++
 tests/test_extruct.py                     | 1 -
 5 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/extruct/_extruct.py b/extruct/_extruct.py
index ba35a6fa..2f206383 100644
--- a/extruct/_extruct.py
+++ b/extruct/_extruct.py
@@ -54,6 +54,7 @@ def extract(htmlstring,
     if errors not in ['log', 'ignore', 'strict']:
         raise ValueError('Invalid error command, valid values are either "log"'
                          ', "ignore" or "strict"')
+
     try:
         tree = parse_xmldom_html(htmlstring, encoding=encoding)
     except Exception as e:
@@ -65,6 +66,7 @@ def extract(htmlstring,
             return {}
         if errors == 'strict':
             raise
+
     processors = []
     if 'microdata' in syntaxes:
         processors.append(
@@ -95,6 +97,7 @@ def extract(htmlstring,
             ('rdfa', RDFaExtractor().extract_items,
              tree,
              ))
+
     output = {}
     for syntax, extract, document in processors:
         try:
@@ -108,6 +111,7 @@ def extract(htmlstring,
                 pass
             if errors == 'strict':
                 raise
+
     if uniform:
         uniform_processors = []
         if 'microdata' in syntaxes:
@@ -131,6 +135,7 @@ def extract(htmlstring,
                  output['opengraph'],
                  None,
                  ))
+
         for syntax, uniform, raw, schema_context in uniform_processors:
             try:
                 if syntax == 'opengraph':
diff --git a/extruct/opengraph.py b/extruct/opengraph.py
index 78e836bf..e5b97dae 100644
--- a/extruct/opengraph.py
+++ b/extruct/opengraph.py
@@ -32,6 +32,8 @@ def extract_items(self, document, base_url=None):
             for el in head.xpath('meta[@property and @content]'):
                 prop = el.attrib['property']
                 val = el.attrib['content']
+                if prop == '' or val == '':
+                    continue
                 ns = prop.partition(':')[0]
                 if ns in _OG_NAMESPACES:
                     namespaces[ns] = _OG_NAMESPACES[ns]
diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html
index 790465b3..c7a00b4a 100644
--- a/tests/samples/songkick/elysianfields.html
+++ b/tests/samples/songkick/elysianfields.html
@@ -28,6 +28,7 @@
     <meta property="og:type" content="songkick-concerts:artist">
     <meta property="og:title" content="Elysian Fields">
     <meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
+    <meta property="og:description" content="" />
     <meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
     <meta property="og:image" content="http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg">
     <meta property="og:image" content="http://images.sk-static.com/SECONDARY_IMAGE.jpg">
diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json
index 4b9f3649..0f94c14e 100644
--- a/tests/samples/songkick/elysianfields.json
+++ b/tests/samples/songkick/elysianfields.json
@@ -232,6 +232,9 @@
             "http://ogp.me/ns#description": [
                 {
                     "@value": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017."
+                },
+                {
+                    "@value": ""
                 }
             ],
             "http://ogp.me/ns#image": [
diff --git a/tests/test_extruct.py b/tests/test_extruct.py
index dc08401e..bb79af8f 100644
--- a/tests/test_extruct.py
+++ b/tests/test_extruct.py
@@ -5,7 +5,6 @@
 import pytest
 
 import extruct
-from extruct import SYNTAXES
 from tests import get_testdata, jsonize_dict, replace_node_ref_with_node_id
 
 

From d4645ef96bdb8c2ab8129f496660992f48c32abd Mon Sep 17 00:00:00 2001
From: Cristi Constantin <cristi.constantin@sent.com>
Date: Wed, 17 Jul 2019 11:56:36 +0100
Subject: [PATCH 2/2] Strip whitespace from prop and content values before the empty check

Updated the tests
---
 extruct/opengraph.py                      |  4 ++--
 requirements.txt                          |  2 +-
 tests/samples/songkick/elysianfields.html |  1 +
 tests/samples/songkick/elysianfields.json |  3 +++
 tests/test_extruct.py                     | 10 +++++++---
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/extruct/opengraph.py b/extruct/opengraph.py
index e5b97dae..978d25ab 100644
--- a/extruct/opengraph.py
+++ b/extruct/opengraph.py
@@ -30,8 +30,8 @@ def extract_items(self, document, base_url=None):
             namespaces.update(self.get_namespaces(head))
             props = []
             for el in head.xpath('meta[@property and @content]'):
-                prop = el.attrib['property']
-                val = el.attrib['content']
+                prop = el.attrib['property'].strip()
+                val = el.attrib['content'].strip()
                 if prop == '' or val == '':
                     continue
                 ns = prop.partition(':')[0]
diff --git a/requirements.txt b/requirements.txt
index 87a27224..820557a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,5 +7,5 @@ requests
 rdflib
 rdflib-jsonld
 mf2py>=1.1.0
-six
+six>=1.11
 w3lib
diff --git a/tests/samples/songkick/elysianfields.html b/tests/samples/songkick/elysianfields.html
index c7a00b4a..4fa2ba0a 100644
--- a/tests/samples/songkick/elysianfields.html
+++ b/tests/samples/songkick/elysianfields.html
@@ -27,6 +27,7 @@
     <meta property="og:site_name" content="Songkick">
     <meta property="og:type" content="songkick-concerts:artist">
     <meta property="og:title" content="Elysian Fields">
+    <meta property="og:title" content="  ">
     <meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
     <meta property="og:description" content="" />
     <meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
diff --git a/tests/samples/songkick/elysianfields.json b/tests/samples/songkick/elysianfields.json
index 0f94c14e..ba8e9f56 100644
--- a/tests/samples/songkick/elysianfields.json
+++ b/tests/samples/songkick/elysianfields.json
@@ -253,6 +253,9 @@
             "http://ogp.me/ns#title": [
                 {
                     "@value": "Elysian Fields"
+                },
+                {
+                    "@value": "  "
                 }
             ],
             "http://ogp.me/ns#type": [
diff --git a/tests/test_extruct.py b/tests/test_extruct.py
index bb79af8f..a2ba8003 100644
--- a/tests/test_extruct.py
+++ b/tests/test_extruct.py
@@ -16,9 +16,13 @@ def test_all(self):
         body = get_testdata('songkick', 'elysianfields.html')
         expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
         data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
-        # See test_rdfa_not_preserving_order()
-        del data['rdfa'][0]['http://ogp.me/ns#image']
-        del expected['rdfa'][0]['http://ogp.me/ns#image']
+        # Sort the values here because RDFa does not preserve the order of duplicated properties.
+        # See https://github.com/scrapinghub/extruct/issues/116
+        # Also see test_rdfa_not_preserving_order()
+        for rdf in data['rdfa']:
+            for key, pairs in rdf.items():
+                if ':' in key and isinstance(pairs, list):
+                    rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True)
         self.assertEqual(jsonize_dict(data), expected)
 
     @pytest.mark.xfail