From 7e952b02f65cde2d93e0a47a5607f6db8cff748f Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Tue, 4 Jun 2019 12:19:45 +0500 Subject: [PATCH 01/17] [wip] use html_text to get element text content in microdata --- extruct/w3cmicrodata.py | 27 +++++++++++++++++++++++++-- requirements.txt | 1 + setup.py | 1 + 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index a254a5c6..a9e1c507 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -11,6 +11,7 @@ import collections from functools import partial +from copy import deepcopy try: from urlparse import urljoin @@ -18,11 +19,32 @@ from urllib.parse import urljoin import lxml.etree +from lxml.html.clean import Cleaner from w3lib.html import strip_html5_whitespace +import html_text from extruct.utils import parse_html +# Cleaner which is similar to html_text cleaner, but is less aggressive +cleaner = Cleaner( + scripts=True, + javascript=False, # onclick attributes are fine + comments=True, + style=True, + links=True, + meta=True, + page_structure=False, # may be nice to have + processing_instructions=True, + embedded=False, # keep embedded content + frames=False, # keep frames + forms=False, # keep forms + annoying_tags=False, + remove_unknown_tags=False, + safe_attrs_only=False, +) + + class LxmlMicrodataExtractor(object): _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]') _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop], @@ -49,11 +71,12 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"): return self.extract_items(tree, base_url) def extract_items(self, document, base_url): + cleaned_document = cleaner.clean_html(document) items_seen = set() return [ item for item in ( self._extract_item(it, items_seen=items_seen, base_url=base_url) - for it in self._xp_item(document)) + for it in self._xp_item(cleaned_document)) if item] def _extract_item(self, node, items_seen, base_url): @@ -182,7 +205,7 @@
def _extract_property_value(self, node, items_seen, base_url, force=False): return self._extract_textContent(node) def _extract_textContent(self, node): - return u"".join(self._xp_clean_text(node)).strip() + return html_text.etree_to_text(node) MicrodataExtractor = LxmlMicrodataExtractor diff --git a/requirements.txt b/requirements.txt index 87a27224..04bc5c16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ rdflib-jsonld mf2py>=1.1.0 six w3lib +html-text diff --git a/setup.py b/setup.py index 5bbc7553..f7e60387 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def get_version(): 'rdflib-jsonld', 'mf2py', 'w3lib', + 'html-text>=0.5.1', 'six'], extras_require={ 'service': [ From ee4da8bf5c10a85daab6e652994065cb0a422ae4 Mon Sep 17 00:00:00 2001 From: jakubwasikowski <jakub.wasikowski@gmail.com> Date: Mon, 15 Jul 2019 16:50:29 +0200 Subject: [PATCH 02/17] Add new test case --- .../websites/microdata-with-description.html | 3036 +++++++++++++++++ tests/test_microdata.py | 14 + 2 files changed, 3050 insertions(+) create mode 100644 tests/samples/websites/microdata-with-description.html diff --git a/tests/samples/websites/microdata-with-description.html b/tests/samples/websites/microdata-with-description.html new file mode 100644 index 00000000..403adbed --- /dev/null +++ b/tests/samples/websites/microdata-with-description.html @@ -0,0 +1,3036 @@ +<!DOCTYPE html> + +<html> +<head> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><script type="text/javascript">window.NREUM||(NREUM={});NREUM.info = {"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"3ecd0a4901","applicationID":"4254217","transactionName":"MV1UN0ZSWUFUVEALVggXcgxAfVJGdlhaFksJVFoGRhxnQFpTQQFNSXFYB1FL","queueTime":0,"applicationTime":221,"ttGuid":"11507BD9179999DB","agent":""}</script><script 
type="text/javascript">(window.NREUM||(NREUM={})).loader_config={xpid:"UwAPU1NXGwcCUVVQBgY="};window.NREUM||(NREUM={}),__nr_require=function(t,n,e){function r(e){if(!n[e]){var o=n[e]={exports:{}};t[e][0].call(o.exports,function(n){var o=t[e][1][n];return r(o||n)},o,o.exports)}return n[e].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<e.length;o++)r(e[o]);return r}({1:[function(t,n,e){function r(t){try{s.console&&console.log(t)}catch(n){}}var o,i=t("ee"),a=t(15),s={};try{o=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console.log&&(s.console=!0,o.indexOf("dev")!==-1&&(s.dev=!0),o.indexOf("nr_dev")!==-1&&(s.nrDev=!0))}catch(c){}s.nrDev&&i.on("internal-error",function(t){r(t.stack)}),s.dev&&i.on("fn-err",function(t,n,e){r(e.stack)}),s.dev&&(r("NR AGENT IN DEVELOPMENT MODE"),r("flags: "+a(s,function(t,n){return t}).join(", ")))},{}],2:[function(t,n,e){function r(t,n,e,r,s){try{p?p-=1:o(s||new UncaughtException(t,n,e),!0)}catch(u){try{i("ierr",[u,c.now(),!0])}catch(d){}}return"function"==typeof f&&f.apply(this,a(arguments))}function UncaughtException(t,n,e){this.message=t||"Uncaught error with no additional information",this.sourceURL=n,this.line=e}function o(t,n){var e=n?null:c.now();i("err",[t,e])}var i=t("handle"),a=t(16),s=t("ee"),c=t("loader"),u=t("gos"),f=window.onerror,d=!1,l="nr@seenError",p=0;c.features.err=!0,t(1),window.onerror=r;try{throw new Error}catch(h){"stack"in h&&(t(5),t(4),"addEventListener"in window&&t(3),c.xhrWrappable&&t(6),d=!0)}s.on("fn-start",function(t,n,e){d&&(p+=1)}),s.on("fn-err",function(t,n,e){d&&!e[l]&&(u(e,l,function(){return!0}),this.thrown=!0,o(e))}),s.on("fn-end",function(){d&&!this.thrown&&p>0&&(p-=1)}),s.on("internal-error",function(t){i("ierr",[t,c.now(),!0])})},{}],3:[function(t,n,e){function r(t){for(var n=t;n&&!n.hasOwnProperty(f);)n=Object.getPrototypeOf(n);n&&o(n)}function o(t){s.inPlace(t,[f,d],"-",i)}function i(t,n){return t[1]}var 
a=t("ee").get("events"),s=t(18)(a,!0),c=t("gos"),u=XMLHttpRequest,f="addEventListener",d="removeEventListener";n.exports=a,"getPrototypeOf"in Object?(r(document),r(window),r(u.prototype)):u.prototype.hasOwnProperty(f)&&(o(window),o(u.prototype)),a.on(f+"-start",function(t,n){var e=t[1],r=c(e,"nr@wrapped",function(){function t(){if("function"==typeof e.handleEvent)return e.handleEvent.apply(e,arguments)}var n={object:t,"function":e}[typeof e];return n?s(n,"fn-",null,n.name||"anonymous"):e});this.wrapped=t[1]=r}),a.on(d+"-start",function(t){t[1]=this.wrapped||t[1]})},{}],4:[function(t,n,e){var r=t("ee").get("raf"),o=t(18)(r),i="equestAnimationFrame";n.exports=r,o.inPlace(window,["r"+i,"mozR"+i,"webkitR"+i,"msR"+i],"raf-"),r.on("raf-start",function(t){t[0]=o(t[0],"fn-")})},{}],5:[function(t,n,e){function r(t,n,e){t[0]=a(t[0],"fn-",null,e)}function o(t,n,e){this.method=e,this.timerDuration=isNaN(t[1])?0:+t[1],t[0]=a(t[0],"fn-",this,e)}var i=t("ee").get("timer"),a=t(18)(i),s="setTimeout",c="setInterval",u="clearTimeout",f="-start",d="-";n.exports=i,a.inPlace(window,[s,"setImmediate"],s+d),a.inPlace(window,[c],c+d),a.inPlace(window,[u,"clearImmediate"],u+d),i.on(c+f,r),i.on(s+f,o)},{}],6:[function(t,n,e){function r(t,n){d.inPlace(n,["onreadystatechange"],"fn-",s)}function o(){var t=this,n=f.context(t);t.readyState>3&&!n.resolved&&(n.resolved=!0,f.emit("xhr-resolved",[],t)),d.inPlace(t,x,"fn-",s)}function i(t){g.push(t),h&&(b?b.then(a):v?v(a):(E=-E,R.data=E))}function a(){for(var t=0;t<g.length;t++)r([],g[t]);g.length&&(g=[])}function s(t,n){return n}function c(t,n){for(var e in t)n[e]=t[e];return n}t(3);var u=t("ee"),f=u.get("xhr"),d=t(18)(f),l=NREUM.o,p=l.XHR,h=l.MO,m=l.PR,v=l.SI,w="readystatechange",x=["onload","onerror","onabort","onloadstart","onloadend","onprogress","ontimeout"],g=[];n.exports=f;var y=window.XMLHttpRequest=function(t){var n=new p(t);try{f.emit("new-xhr",[n],n),n.addEventListener(w,o,!1)}catch(e){try{f.emit("internal-error",[e])}catch(r){}}return 
n};if(c(p,y),y.prototype=p.prototype,d.inPlace(y.prototype,["open","send"],"-xhr-",s),f.on("send-xhr-start",function(t,n){r(t,n),i(n)}),f.on("open-xhr-start",r),h){var b=m&&m.resolve();if(!v&&!m){var E=1,R=document.createTextNode(E);new h(a).observe(R,{characterData:!0})}}else u.on("fn-end",function(t){t[0]&&t[0].type===w||a()})},{}],7:[function(t,n,e){function r(){var t=window.NREUM,n=t.info.accountID||null,e=t.info.agentID||null,r=t.info.trustKey||null,i="btoa"in window&&"function"==typeof window.btoa;if(!n||!e||!i)return null;var a={v:[0,1],d:{ty:"Browser",ac:n,ap:e,id:o.generateCatId(),tr:o.generateCatId(),ti:Date.now()}};return r&&n!==r&&(a.d.tk=r),btoa(JSON.stringify(a))}var o=t(13);n.exports={generateTraceHeader:r}},{}],8:[function(t,n,e){function r(t){var n=this.params,e=this.metrics;if(!this.ended){this.ended=!0;for(var r=0;r<p;r++)t.removeEventListener(l[r],this.listener,!1);n.aborted||(e.duration=s.now()-this.startTime,this.loadCaptureCalled||4!==t.readyState?null==n.status&&(n.status=0):a(this,t),e.cbTime=this.cbTime,d.emit("xhr-done",[t],t),c("xhr",[n,e,this.startTime]))}}function o(t,n){var e=t.responseType;if("json"===e&&null!==n)return n;var r="arraybuffer"===e||"blob"===e||"json"===e?t.response:t.responseText;return v(r)}function i(t,n){var e=u(n),r=t.params;r.host=e.hostname+":"+e.port,r.pathname=e.pathname,t.sameOrigin=e.sameOrigin}function a(t,n){t.params.status=n.status;var e=o(n,t.lastSize);if(e&&(t.metrics.rxSize=e),t.sameOrigin){var r=n.getResponseHeader("X-NewRelic-App-Data");r&&(t.params.cat=r.split(", ").pop())}t.loadCaptureCalled=!0}var s=t("loader");if(s.xhrWrappable){var c=t("handle"),u=t(9),f=t(7).generateTraceHeader,d=t("ee"),l=["load","error","abort","timeout"],p=l.length,h=t("id"),m=t(12),v=t(11),w=window.XMLHttpRequest;s.features.xhr=!0,t(6),d.on("new-xhr",function(t){var 
n=this;n.totalCbs=0,n.called=0,n.cbTime=0,n.end=r,n.ended=!1,n.xhrGuids={},n.lastSize=null,n.loadCaptureCalled=!1,t.addEventListener("load",function(e){a(n,t)},!1),m&&(m>34||m<10)||window.opera||t.addEventListener("progress",function(t){n.lastSize=t.loaded},!1)}),d.on("open-xhr-start",function(t){this.params={method:t[0]},i(this,t[1]),this.metrics={}}),d.on("open-xhr-end",function(t,n){"loader_config"in NREUM&&"xpid"in NREUM.loader_config&&this.sameOrigin&&n.setRequestHeader("X-NewRelic-ID",NREUM.loader_config.xpid);var e=!1;if("init"in NREUM&&"distributed_tracing"in NREUM.init&&(e=!!NREUM.init.distributed_tracing.enabled),e&&this.sameOrigin){var r=f();r&&n.setRequestHeader("newrelic",r)}}),d.on("send-xhr-start",function(t,n){var e=this.metrics,r=t[0],o=this;if(e&&r){var i=v(r);i&&(e.txSize=i)}this.startTime=s.now(),this.listener=function(t){try{"abort"!==t.type||o.loadCaptureCalled||(o.params.aborted=!0),("load"!==t.type||o.called===o.totalCbs&&(o.onloadCalled||"function"!=typeof n.onload))&&o.end(n)}catch(e){try{d.emit("internal-error",[e])}catch(r){}}};for(var a=0;a<p;a++)n.addEventListener(l[a],this.listener,!1)}),d.on("xhr-cb-time",function(t,n,e){this.cbTime+=t,n?this.onloadCalled=!0:this.called+=1,this.called!==this.totalCbs||!this.onloadCalled&&"function"==typeof e.onload||this.end(e)}),d.on("xhr-load-added",function(t,n){var e=""+h(t)+!!n;this.xhrGuids&&!this.xhrGuids[e]&&(this.xhrGuids[e]=!0,this.totalCbs+=1)}),d.on("xhr-load-removed",function(t,n){var e=""+h(t)+!!n;this.xhrGuids&&this.xhrGuids[e]&&(delete this.xhrGuids[e],this.totalCbs-=1)}),d.on("addEventListener-end",function(t,n){n instanceof w&&"load"===t[0]&&d.emit("xhr-load-added",[t[1],t[2]],n)}),d.on("removeEventListener-end",function(t,n){n instanceof w&&"load"===t[0]&&d.emit("xhr-load-removed",[t[1],t[2]],n)}),d.on("fn-start",function(t,n,e){n instanceof 
w&&("onload"===e&&(this.onload=!0),("load"===(t[0]&&t[0].type)||this.onload)&&(this.xhrCbStart=s.now()))}),d.on("fn-end",function(t,n){this.xhrCbStart&&d.emit("xhr-cb-time",[s.now()-this.xhrCbStart,this.onload,n],n)})}},{}],9:[function(t,n,e){n.exports=function(t){var n=document.createElement("a"),e=window.location,r={};n.href=t,r.port=n.port;var o=n.href.split("://");!r.port&&o[1]&&(r.port=o[1].split("/")[0].split("@").pop().split(":")[1]),r.port&&"0"!==r.port||(r.port="https"===o[0]?"443":"80"),r.hostname=n.hostname||e.hostname,r.pathname=n.pathname,r.protocol=o[0],"/"!==r.pathname.charAt(0)&&(r.pathname="/"+r.pathname);var i=!n.protocol||":"===n.protocol||n.protocol===e.protocol,a=n.hostname===document.domain&&n.port===e.port;return r.sameOrigin=i&&(!n.hostname||a),r}},{}],10:[function(t,n,e){function r(){}function o(t,n,e){return function(){return i(t,[u.now()].concat(s(arguments)),n?null:this,e),n?void 0:this}}var i=t("handle"),a=t(15),s=t(16),c=t("ee").get("tracer"),u=t("loader"),f=NREUM;"undefined"==typeof window.newrelic&&(newrelic=f);var d=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],l="api-",p=l+"ixn-";a(d,function(t,n){f[n]=o(l+n,!0,"api")}),f.addPageAction=o(l+"addPageAction",!0),f.setCurrentRouteName=o(l+"routeName",!0),n.exports=newrelic,f.interaction=function(){return(new r).get()};var h=r.prototype={createTracer:function(t,n){var e={},r=this,o="function"==typeof n;return i(p+"tracer",[u.now(),t,e],r),function(){if(c.emit((o?"":"no-")+"fn-start",[u.now(),r,o],e),o)try{return n.apply(this,arguments)}catch(t){throw c.emit("fn-err",[arguments,this,t],e),t}finally{c.emit("fn-end",[u.now()],e)}}}};a("actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(t,n){h[n]=o(p+n)}),newrelic.noticeError=function(t,n){"string"==typeof t&&(t=new Error(t)),i("err",[t,u.now(),!1,n])}},{}],11:[function(t,n,e){n.exports=function(t){if("string"==typeof t&&t.length)return 
t.length;if("object"==typeof t){if("undefined"!=typeof ArrayBuffer&&t instanceof ArrayBuffer&&t.byteLength)return t.byteLength;if("undefined"!=typeof Blob&&t instanceof Blob&&t.size)return t.size;if(!("undefined"!=typeof FormData&&t instanceof FormData))try{return JSON.stringify(t).length}catch(n){return}}}},{}],12:[function(t,n,e){var r=0,o=navigator.userAgent.match(/Firefox[\/\s](\d+\.\d+)/);o&&(r=+o[1]),n.exports=r},{}],13:[function(t,n,e){function r(){function t(){return n?15&n[e++]:16*Math.random()|0}var n=null,e=0,r=window.crypto||window.msCrypto;r&&r.getRandomValues&&(n=r.getRandomValues(new Uint8Array(31)));for(var o,i="xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx",a="",s=0;s<i.length;s++)o=i[s],"x"===o?a+=t().toString(16):"y"===o?(o=3&t()|8,a+=o.toString(16)):a+=o;return a}function o(){function t(){return n?15&n[e++]:16*Math.random()|0}var n=null,e=0,r=window.crypto||window.msCrypto;r&&r.getRandomValues&&Uint8Array&&(n=r.getRandomValues(new Uint8Array(31)));for(var o=[],i=0;i<16;i++)o.push(t().toString(16));return o.join("")}n.exports={generateUuid:r,generateCatId:o}},{}],14:[function(t,n,e){function r(t,n){if(!o)return!1;if(t!==o)return!1;if(!n)return!0;if(!i)return!1;for(var e=i.split("."),r=n.split("."),a=0;a<r.length;a++)if(r[a]!==e[a])return!1;return!0}var o=null,i=null,a=/Version\/(\S+)\s+Safari/;if(navigator.userAgent){var s=navigator.userAgent,c=s.match(a);c&&s.indexOf("Chrome")===-1&&s.indexOf("Chromium")===-1&&(o="Safari",i=c[1])}n.exports={agent:o,version:i,match:r}},{}],15:[function(t,n,e){function r(t,n){var e=[],r="",i=0;for(r in t)o.call(t,r)&&(e[i]=n(r,t[r]),i+=1);return e}var o=Object.prototype.hasOwnProperty;n.exports=r},{}],16:[function(t,n,e){function r(t,n,e){n||(n=0),"undefined"==typeof e&&(e=t?t.length:0);for(var r=-1,o=e-n||0,i=Array(o<0?0:o);++r<o;)i[r]=t[n+r];return i}n.exports=r},{}],17:[function(t,n,e){n.exports={exists:"undefined"!=typeof window.performance&&window.performance.timing&&"undefined"!=typeof 
window.performance.timing.navigationStart}},{}],18:[function(t,n,e){function r(t){return!(t&&t instanceof Function&&t.apply&&!t[a])}var o=t("ee"),i=t(16),a="nr@original",s=Object.prototype.hasOwnProperty,c=!1;n.exports=function(t,n){function e(t,n,e,o){function nrWrapper(){var r,a,s,c;try{a=this,r=i(arguments),s="function"==typeof e?e(r,a):e||{}}catch(u){l([u,"",[r,a,o],s])}f(n+"start",[r,a,o],s);try{return c=t.apply(a,r)}catch(d){throw f(n+"err",[r,a,d],s),d}finally{f(n+"end",[r,a,c],s)}}return r(t)?t:(n||(n=""),nrWrapper[a]=t,d(t,nrWrapper),nrWrapper)}function u(t,n,o,i){o||(o="");var a,s,c,u="-"===o.charAt(0);for(c=0;c<n.length;c++)s=n[c],a=t[s],r(a)||(t[s]=e(a,u?s+o:o,i,s))}function f(e,r,o){if(!c||n){var i=c;c=!0;try{t.emit(e,r,o,n)}catch(a){l([a,e,r,o])}c=i}}function d(t,n){if(Object.defineProperty&&Object.keys)try{var e=Object.keys(t);return e.forEach(function(e){Object.defineProperty(n,e,{get:function(){return t[e]},set:function(n){return t[e]=n,n}})}),n}catch(r){l([r])}for(var o in t)s.call(t,o)&&(n[o]=t[o]);return n}function l(n){try{t.emit("internal-error",n)}catch(e){}}return t||(t=o),e.inPlace=u,e.flag=a,e}},{}],ee:[function(t,n,e){function r(){}function o(t){function n(t){return t&&t instanceof r?t:t?c(t,s,i):i()}function e(e,r,o,i){if(!l.aborted||i){t&&t(e,r,o);for(var a=n(o),s=m(e),c=s.length,u=0;u<c;u++)s[u].apply(a,r);var d=f[g[e]];return d&&d.push([y,e,r,a]),a}}function p(t,n){x[t]=m(t).concat(n)}function h(t,n){var e=x[t];if(e)for(var r=0;r<e.length;r++)e[r]===n&&e.splice(r,1)}function m(t){return x[t]||[]}function v(t){return d[t]=d[t]||o(e)}function w(t,n){u(t,function(t,e){n=n||"feature",g[e]=n,n in f||(f[n]=[])})}var x={},g={},y={on:p,addEventListener:p,removeEventListener:h,emit:e,get:v,listeners:m,context:n,buffer:w,abort:a,aborted:!1};return y}function i(){return new r}function a(){(f.api||f.feature)&&(l.aborted=!0,f=l.backlog={})}var 
s="nr@context",c=t("gos"),u=t(15),f={},d={},l=n.exports=o();l.backlog=f},{}],gos:[function(t,n,e){function r(t,n,e){if(o.call(t,n))return t[n];var r=e();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(t,n,{value:r,writable:!0,enumerable:!1}),r}catch(i){}return t[n]=r,r}var o=Object.prototype.hasOwnProperty;n.exports=r},{}],handle:[function(t,n,e){function r(t,n,e,r){o.buffer([t],r),o.emit(t,n,e)}var o=t("ee").get("handle");n.exports=r,r.ee=o},{}],id:[function(t,n,e){function r(t){var n=typeof t;return!t||"object"!==n&&"function"!==n?-1:t===window?0:a(t,i,function(){return o++})}var o=1,i="nr@id",a=t("gos");n.exports=r},{}],loader:[function(t,n,e){function r(){if(!E++){var t=b.info=NREUM.info,n=p.getElementsByTagName("script")[0];if(setTimeout(f.abort,3e4),!(t&&t.licenseKey&&t.applicationID&&n))return f.abort();u(g,function(n,e){t[n]||(t[n]=e)}),c("mark",["onload",a()+b.offset],null,"api");var e=p.createElement("script");e.src="https://"+t.agent,n.parentNode.insertBefore(e,n)}}function o(){"complete"===p.readyState&&i()}function i(){c("mark",["domContent",a()+b.offset],null,"api")}function a(){return R.exists&&performance.now?Math.round(performance.now()):(s=Math.max((new Date).getTime(),s))-b.offset}var s=(new Date).getTime(),c=t("handle"),u=t(15),f=t("ee"),d=t(14),l=window,p=l.document,h="addEventListener",m="attachEvent",v=l.XMLHttpRequest,w=v&&v.prototype;NREUM.o={ST:setTimeout,SI:l.setImmediate,CT:clearTimeout,XHR:v,REQ:l.Request,EV:l.Event,PR:l.Promise,MO:l.MutationObserver};var x=""+location,g={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-1123.min.js"},y=v&&w&&w[h]&&!/CriOS/.test(navigator.userAgent),b=n.exports={offset:s,now:a,origin:x,features:{},xhrWrappable:y,userAgent:d};t(10),p[h]?(p[h]("DOMContentLoaded",i,!1),l[h]("load",r,!1)):(p[m]("onreadystatechange",o),l[m]("onload",r)),c("mark",["firstbyte",s],null,"api");var E=0,R=t(17)},{}]},{},["loader",2,8]);</script> + <title>Johnsons 4 Fleas 
Cats & Kittens Tablets From £5.29 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+ +
+

Welcome to Monster! or Join Us

+
+ + +
+
+
+
+ + + +
+
+ +
+
+ + + + + + + + +
+ + + + + +
+
+
+ + +
+
+
+

We’re sorry, but there seems to be an error with some of the information provided. Please check the highlighted fields

+
  • +
+
+
+
+
+

Johnsons 4 Fleas Cats & Kittens Tablets

+ + +

+ + + + + From only £5.29 - £9.56 +

+
+ +
+ + +
+ +
+ +
+ + + +
+
+ +
+ + + + + + Johnsons 4 Fleas Cats & Kittens Tablets + +
+ +
+ +
+ + +
+ +
+ + +
+ +
+ +
+ +
+ +
+
+
+
+ +
+ +
+

+ Save 10% on your first Repeat Delivery + +

+
+ +
+ +
+
+ + +
+ +
+ +
+ +
+ +
+
+ + + + + + +
+ +
+ + +
+
6 Treatment Pack
+ +

£9.56

+
+ +
+

SKU: CS20858_1

+ +

Available

+ +
+ +
+ + + + +
+
+ +
+
+ + + + + +
+ + +
+ + +
+ + +
+ +
+
+
+
+ + + + + + +
+ +
+ + +
+
3 Treatment Pack
+ +

£5.29

+
+ +
+

SKU: CS20858_2

+ +

Available

+ +
+ +
+ + + + +
+
+ +
+
+ + + + + +
+ + +
+ + +
+ + +
+ +
+ +
+ +
+ +
+
+
+ + +
+
+
+
+ +
+ +
+ +
+
+

Johnsons 4 Fleas Cats & Kittens - 3 Treatment Pack, 6 Treatment Pack

For use with Cats and Kittens over 4 weeks of age between 1 and 11kg.

Johnson's 4fleas tablets are an easy to use oral treatment to kill adult fleas found on your pet.

Effects on the fleas may be seen as soon as 15 minutes after administration.

Between 95 - 100% of fleas will be killed off in the first six hours, but ALL adult fleas will be gone after a day.

These tablets can be given directly to the mouth or may be mixed in a small portion f our pet's favourite food and given immediately. Administer a single tablet on an day when fleas are seen on your pet. Repeat on any subsequent day as necessary. Do not give more than one treatment per day.

You may notice your pet scratching more than usual for the first half hour after administration; this is completely normal and caused by the fleas reacting to Johnson's 4Fleas tablets.

While highly effective by themselves, 4Fleas is great when used as part of a programme to eliminate fleas and their larvae from both pets and their surroundings. +
+
+ + +
+
+
+
+ +
+
+ +
+
+
+ + + +
+
+
+ + + + +
+
+
+
+ +
+ + + +
+
+
+
+ + +
+
+ + +
+
+ +
+ +
+ + +
+ + FRONTLINE Plus Flea & Tick Spot On Treatment Cat 3 Pack NFA-C + +
+ Save 41% +
+
+ + + + +
+

Only £12.99

+
+
+

Save £9.13 (41)%

+
+ + +
+ + + +
+ +
+ +
+ +
+
+ + + +
+ +
+
+ +
+ + +
+ + Drontal Cat Worming Tablets 1 tablet NFA-C + +
+ + + + +
+

Only £2.10

+
+
+

Save £0.89 (30)%

+
+ + +
+ + + +
+ +
+ +
+ +
+
+ + + +
+ +
+
+ +
+ + +
+ + Panacur Wormer Granules for Dogs & Cats 1.8g Blue Sachet NFA-DC + +
+ + + + +
+

Only £1.44

+
+
+

Save £0.85 (37)%

+
+ + +
+ + + +
+ +
+ +
+ +
+
+ + + +
+ +
+
+ +
+ + +
+ + Beaphar Calming Cat Treats 35g + +
+ + + + +
+

Only £2.35

+
+
+

Save £0.64 (21)%

+
+ + +
+ + + +
+ +
+ +
+ + + +
+ +
+ +
+
+ +
+
+ + +
+ +
+
+
+
+ + +
+
+
+
+ +
+
+
+
+

Deliveries

+

We strive for all orders with free delivery to be with you within 2 working days of them leaving our warehouse. Spend less than £35 and our delivery prices are as low as £2.99 or £5.99 if the weight is over 500g (Run faster than Postman Pet!)

+
+
+
+
+
+
+ +

Spend £35 or more and we'll deliver it to you, absolutely FREE!

+
+
+
+
+
+
+ +

Upgrade to our super-fast 10am next working day delivery* - £9.99

+
+
+
+
+
+
+ +

OR Upgrade to an optional working day delivery service* - £7.99

+
+
+
+
+
+ +
+
+
+
+
+

*delivery options will change according to the total cost and weight of your order, you'll see your options at the checkout

+
+
+
+
+
+
+
+
+ + + +
+ + + +
+
+ + + + + +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_microdata.py b/tests/test_microdata.py index 19bbbdd3..818b5455 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -184,3 +184,17 @@ def test_join_none(self): mde = MicrodataExtractor() data = mde.extract(body) self.assertEqual(data, expected) + + +class TestMicrodataWithDescription(unittest.TestCase): + maxDiff = None + + def test_if_punctuations_in_description_are_correctly_formatted(self): + body = get_testdata('websites', 'microdata-with-description.html') + expected = json.loads(get_testdata( + 'websites', 'microdata-with-description.json').decode('UTF-8')) + + mde = MicrodataExtractor() + data = mde.extract(body) + + self.assertEqual(data, expected) From 2a7ae11b24579687c5da657edb814fe884e33a38 Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 16:50:38 +0200 Subject: [PATCH 03/17] Add expected value --- .../websites/microdata-with-description.json | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/samples/websites/microdata-with-description.json diff --git a/tests/samples/websites/microdata-with-description.json b/tests/samples/websites/microdata-with-description.json new file mode 100644 index 00000000..9e360eac --- /dev/null +++ b/tests/samples/websites/microdata-with-description.json @@ -0,0 +1,44 @@ +[ + { + "type": "http://schema.org/Product", + "properties": { + "name": "Johnsons 4 Fleas Cats & Kittens Tablets", + "brand": "Johnsons", + "offers": [ + { + "type": "http://schema.org/AggregateOffer", + "properties": { + "offercount": "2", + "priceCurrency": "GBP", + "lowPrice": "5.29", + "highPrice": "9.56" + } + }, + { + "type": "http://schema.org/Offer", + "properties": { + "name": "Johnsons 4 Fleas Cats & Kittens Tablets 6 Treatment Pack", + "sku": "CS20858_1", + "price": "9.56", + "priceCurrency": "GBP", + "availability": 
"http://schema.org/InStock", + "itemCondition": "http://schema.org/NewCondition" + } + }, + { + "type": "http://schema.org/Offer", + "properties": { + "name": "Johnsons 4 Fleas Cats & Kittens Tablets 3 Treatment Pack", + "sku": "CS20858_2", + "price": "5.29", + "priceCurrency": "GBP", + "availability": "http://schema.org/InStock", + "itemCondition": "http://schema.org/NewCondition" + } + } + ], + "image": "https://res.cloudinary.com/monsterpetsupplies/image/upload/f_auto,c_pad,w_500,h_500,q_75/q_100/v1481710164/cat_kitten_flea_tabs_3_eaunpi.jpg", + "description": "Johnsons 4 Fleas Cats & Kittens - 3 Treatment Pack, 6 Treatment Pack For use with Cats and Kittens over 4 weeks of age between 1 and 11kg. Johnson's 4fleas tablets are an easy to use oral treatment to kill adult fleas found on your pet. Effects on the fleas may be seen as soon as 15 minutes after administration. Between 95 - 100% of fleas will be killed off in the first six hours, but ALL adult fleas will be gone after a day. These tablets can be given directly to the mouth or may be mixed in a small portion f our pet's favourite food and given immediately. Administer a single tablet on an day when fleas are seen on your pet. Repeat on any subsequent day as necessary. Do not give more than one treatment per day. You may notice your pet scratching more than usual for the first half hour after administration; this is completely normal and caused by the fleas reacting to Johnson's 4Fleas tablets. While highly effective by themselves, 4Fleas is great when used as part of a programme to eliminate fleas and their larvae from both pets and their surroundings." 
+ } + } +] \ No newline at end of file From e4634140cf43e9b0507ffed7f221adf47832603c Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 18:33:57 +0200 Subject: [PATCH 04/17] Fix test case for description (it should contain new lines) --- tests/samples/websites/microdata-with-description.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/samples/websites/microdata-with-description.json b/tests/samples/websites/microdata-with-description.json index 9e360eac..cf2240ac 100644 --- a/tests/samples/websites/microdata-with-description.json +++ b/tests/samples/websites/microdata-with-description.json @@ -38,7 +38,7 @@ } ], "image": "https://res.cloudinary.com/monsterpetsupplies/image/upload/f_auto,c_pad,w_500,h_500,q_75/q_100/v1481710164/cat_kitten_flea_tabs_3_eaunpi.jpg", - "description": "Johnsons 4 Fleas Cats & Kittens - 3 Treatment Pack, 6 Treatment Pack For use with Cats and Kittens over 4 weeks of age between 1 and 11kg. Johnson's 4fleas tablets are an easy to use oral treatment to kill adult fleas found on your pet. Effects on the fleas may be seen as soon as 15 minutes after administration. Between 95 - 100% of fleas will be killed off in the first six hours, but ALL adult fleas will be gone after a day. These tablets can be given directly to the mouth or may be mixed in a small portion f our pet's favourite food and given immediately. Administer a single tablet on an day when fleas are seen on your pet. Repeat on any subsequent day as necessary. Do not give more than one treatment per day. You may notice your pet scratching more than usual for the first half hour after administration; this is completely normal and caused by the fleas reacting to Johnson's 4Fleas tablets. While highly effective by themselves, 4Fleas is great when used as part of a programme to eliminate fleas and their larvae from both pets and their surroundings." 
+ "description": "Johnsons 4 Fleas Cats & Kittens - 3 Treatment Pack, 6 Treatment Pack\n\nFor use with Cats and Kittens over 4 weeks of age between 1 and 11kg.\nJohnson's 4fleas tablets are an easy to use oral treatment to kill adult fleas found on your pet.\nEffects on the fleas may be seen as soon as 15 minutes after administration.\nBetween 95 - 100% of fleas will be killed off in the first six hours, but ALL adult fleas will be gone after a day.\nThese tablets can be given directly to the mouth or may be mixed in a small portion f our pet's favourite food and given immediately. Administer a single tablet on an day when fleas are seen on your pet. Repeat on any subsequent day as necessary. Do not give more than one treatment per day.\nYou may notice your pet scratching more than usual for the first half hour after administration; this is completely normal and caused by the fleas reacting to Johnson's 4Fleas tablets.\nWhile highly effective by themselves, 4Fleas is great when used as part of a programme to eliminate fleas and their larvae from both pets and their surroundings." 
} } ] \ No newline at end of file From 92d064685659e7ee03ee47e20b5c64636ef2e17f Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 18:36:49 +0200 Subject: [PATCH 05/17] Move cleaning html to extracting content (extracting properties did not work without this) --- extruct/w3cmicrodata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index a9e1c507..75f9ab90 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -71,12 +71,11 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"): return self.extract_items(tree, base_url) def extract_items(self, document, base_url): - cleaned_document = cleaner.clean_html(document) items_seen = set() return [ item for item in ( self._extract_item(it, items_seen=items_seen, base_url=base_url) - for it in self._xp_item(cleaned_document)) + for it in self._xp_item(document)) if item] def _extract_item(self, node, items_seen, base_url): @@ -205,7 +204,8 @@ def _extract_property_value(self, node, items_seen, base_url, force=False): return self._extract_textContent(node) def _extract_textContent(self, node): - return html_text.etree_to_text(node) + clean_node = cleaner.clean_html(node) + return html_text.etree_to_text(clean_node) MicrodataExtractor = LxmlMicrodataExtractor From 4e2dbf9804ee7f21b54a5275bb5539f762420faf Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 18:37:15 +0200 Subject: [PATCH 06/17] Removed unused import --- extruct/w3cmicrodata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 75f9ab90..8c93763f 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -11,7 +11,6 @@ import collections from functools import partial -from copy import deepcopy try: from urlparse import urljoin From 48bb049a0819d052a47529d0cba43793fef2296a Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 18:37:50 +0200 
Subject: [PATCH 07/17] Fix formatting in test --- tests/test_microdata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_microdata.py b/tests/test_microdata.py index 818b5455..3f170798 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -5,6 +5,7 @@ from extruct.w3cmicrodata import MicrodataExtractor from tests import get_testdata + class TestMicrodata(unittest.TestCase): maxDiff = None From c02c3498d39cb83461c38ec6a27ad9ef1548b8c3 Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:04:10 +0200 Subject: [PATCH 08/17] Fix test case for custom url --- tests/samples/schema.org/product_custom_url.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/samples/schema.org/product_custom_url.json b/tests/samples/schema.org/product_custom_url.json index fcf29b22..18399a2f 100644 --- a/tests/samples/schema.org/product_custom_url.json +++ b/tests/samples/schema.org/product_custom_url.json @@ -2,7 +2,7 @@ "properties": {"brand": "ACME", "name": "Executive Anvil", "image": "http://some-example.com/anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": {"type": "http://schema.org/AggregateRating", "properties": {"ratingValue": "4.4", From 86f47d25a099be744575887f6e956090bedffaee Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:14:53 +0200 Subject: [PATCH 09/17] Fix test case with custom url and node id --- tests/samples/schema.org/product_custom_url_and_node_id.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/samples/schema.org/product_custom_url_and_node_id.json b/tests/samples/schema.org/product_custom_url_and_node_id.json index 
3111b628..888859b6 100644 --- a/tests/samples/schema.org/product_custom_url_and_node_id.json +++ b/tests/samples/schema.org/product_custom_url_and_node_id.json @@ -3,7 +3,7 @@ "properties": {"brand": "ACME", "name": "Executive Anvil", "image": "http://some-example.com/anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": {"type": "http://schema.org/AggregateRating", "_nodeId_": "aggregateRating", From c8dc16478e1b8597ec78156739cc87a8a5dc1257 Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:35:50 +0200 Subject: [PATCH 10/17] Fix test for umicrodata --- tests/test_uniform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_uniform.py b/tests/test_uniform.py index fdd1d561..e0e3d76b 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -74,7 +74,7 @@ def test_umicrodata(self): "brand": "ACME", "name": "Executive Anvil", "image": "anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": { "@type": "AggregateRating", From f902dad3058b027493693090367a44119adaac8e Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:39:08 +0200 Subject: [PATCH 11/17] Fix test for product ref --- tests/samples/schema.org/product-ref.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/samples/schema.org/product-ref.json 
b/tests/samples/schema.org/product-ref.json index 7c2bf8d6..1ffcbd58 100644 --- a/tests/samples/schema.org/product-ref.json +++ b/tests/samples/schema.org/product-ref.json @@ -32,7 +32,7 @@ ], "brand": "ACME", "name": "Executive Anvil", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": { "type": "http://schema.org/AggregateRating", From 75686f18cb65fc81702216818b6ff7d323d50eac Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:40:00 +0200 Subject: [PATCH 12/17] Fix test for product join None --- tests/samples/schema.org/product.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/samples/schema.org/product.json b/tests/samples/schema.org/product.json index 52dc974f..dd9846dc 100644 --- a/tests/samples/schema.org/product.json +++ b/tests/samples/schema.org/product.json @@ -2,7 +2,7 @@ "properties": {"brand": "ACME", "name": "Executive Anvil", "image": "anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": {"type": "http://schema.org/AggregateRating", "properties": {"ratingValue": "4.4", From 0f5a632dd0405ef4c94ee82d12038e360dd0e218 Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:49:44 +0200 Subject: [PATCH 13/17] Fix test_w3c_5_2 --- tests/samples/w3c/microdata.5.2.withtext.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tests/samples/w3c/microdata.5.2.withtext.json b/tests/samples/w3c/microdata.5.2.withtext.json index 020c5055..7b3cb19d 100644 --- a/tests/samples/w3c/microdata.5.2.withtext.json +++ b/tests/samples/w3c/microdata.5.2.withtext.json @@ -2,18 +2,18 @@ "name": "Tank Locomotive (DB 80)", "product-code": "33041", "scale": "HO"}, - "textContent": "Name:\n Tank Locomotive (DB 80)\n Product code:\n 33041\n Scale:\n HO\n Digital:\n Delta", + "textContent": "Name:\nTank Locomotive (DB 80)\nProduct code:\n33041\nScale:\nHO\nDigital:\nDelta", "type": ["http://md.example.com/loco", "http://md.example.com/lighting"]}, {"properties": {"name": "Turnout Lantern Kit", "product-code": "74470", "scale": "HO", "track-type": "C"}, - "textContent": "Name:\n Turnout Lantern Kit\n Product code:\n 74470\n Purpose:\n For retrofitting 2 C Track\n turnouts.", + "textContent": "Name:\nTurnout Lantern Kit\nProduct code:\n74470\nPurpose:\nFor retrofitting 2 C Track turnouts.", "type": ["http://md.example.com/track", "http://md.example.com/lighting"]}, {"properties": {"name": "Express Train Passenger Car (DB Am 203)", "product-code": "8710", "scale": "Z"}, - "textContent": "Name:\n Express Train Passenger Car (DB Am 203)\n Product code:\n 8710\n Scale:\n Z", + "textContent": "Name:\nExpress Train Passenger Car (DB Am 203)\nProduct code:\n8710\nScale:\nZ", "type": "http://md.example.com/passengers"}] From c2b59b1f4d5b74ee649c7caa2615ab657d6f9674 Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:53:53 +0200 Subject: [PATCH 14/17] Fix test case for event, fix formatting --- tests/samples/schema.org/Event.002.json | 2 +- tests/test_microdata.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/samples/schema.org/Event.002.json b/tests/samples/schema.org/Event.002.json index e141739e..dc9a914e 100644 --- a/tests/samples/schema.org/Event.002.json +++ b/tests/samples/schema.org/Event.002.json @@ -37,7 +37,7 @@ "offers": "foo-fighters-everlong-buy.html", 
"url": "foo-fighters-everlong.html"}, "type": "http://schema.org/MusicRecording"}], - "video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.", + "video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.", "duration": "T1M33S", "name": "Interview with the Foo Fighters", "thumbnail": "foo-fighters-interview-thumb.jpg"}, diff --git a/tests/test_microdata.py b/tests/test_microdata.py index 3f170798..76361971 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -38,12 +38,13 @@ def test_schemaorg_MusicRecording(self): self.assertEqual(data, expected) def test_schemaorg_Event(self): - for i in [1, 2, 3, 4, 8]: + for i in [1, 2, 3, 4, 8]: body = get_testdata('schema.org', 'Event.{:03d}.html'.format(i)) expected = json.loads(get_testdata('schema.org', 'Event.{:03d}.json'.format(i)).decode('UTF-8')) mde = MicrodataExtractor() data = mde.extract(body) + self.assertEqual(data, expected) def test_w3c_textContent_values(self): From 1e990b31e8133925a97ef71b380aa079112a2bfd Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Mon, 15 Jul 2019 19:55:16 +0200 Subject: [PATCH 15/17] Fix test for music recording --- tests/samples/schema.org/MusicRecording.001.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/samples/schema.org/MusicRecording.001.json b/tests/samples/schema.org/MusicRecording.001.json index 500c8223..385b4b6a 100644 --- a/tests/samples/schema.org/MusicRecording.001.json +++ b/tests/samples/schema.org/MusicRecording.001.json @@ -37,7 +37,7 @@ "offers": "foo-fighters-everlong-buy.html", "url": "foo-fighters-everlong.html"}, "type": "http://schema.org/MusicRecording"}], - "video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.", + "video": {"properties": {"description": "Catch this 
exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.", "duration": "T1M33S", "name": "Interview with the Foo Fighters", "thumbnail": "foo-fighters-interview-thumb.jpg"}, From 17c2982fa4ec11125014f409f61a1626d1edbb47 Mon Sep 17 00:00:00 2001 From: jakubwasikowski Date: Wed, 17 Jul 2019 12:19:29 +0200 Subject: [PATCH 16/17] Add minimal version of six --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 04bc5c16..a92e4eb0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,6 @@ requests rdflib rdflib-jsonld mf2py>=1.1.0 -six +six>=1.11 w3lib html-text From 670702e8dd1800428b73327ced454c33615ca784 Mon Sep 17 00:00:00 2001 From: Jakub Wasikowski Date: Fri, 19 Jul 2019 10:33:01 +0200 Subject: [PATCH 17/17] Fix comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Iván de Prado --- extruct/w3cmicrodata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 8c93763f..cb8c9fa7 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -36,7 +36,7 @@ page_structure=False, # may be nice to have processing_instructions=True, embedded=False, # keep embedded content - frames=False, # keed frames + frames=False, # keep frames forms=False, # keep forms annoying_tags=False, remove_unknown_tags=False,