diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index a254a5c6..cb8c9fa7 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -18,11 +18,32 @@ from urllib.parse import urljoin import lxml.etree +from lxml.html.clean import Cleaner from w3lib.html import strip_html5_whitespace +import html_text from extruct.utils import parse_html +# Cleaner which is similar to html_text cleaner, but is less aggressive +cleaner = Cleaner( + scripts=True, + javascript=False, # onclick attributes are fine + comments=True, + style=True, + links=True, + meta=True, + page_structure=False, # may be nice to have + processing_instructions=True, + embedded=False, # keep embedded content + frames=False, # keep frames + forms=False, # keep forms + annoying_tags=False, + remove_unknown_tags=False, + safe_attrs_only=False, +) + + class LxmlMicrodataExtractor(object): _xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]') _xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop], @@ -182,7 +203,8 @@ def _extract_property_value(self, node, items_seen, base_url, force=False): return self._extract_textContent(node) def _extract_textContent(self, node): - return u"".join(self._xp_clean_text(node)).strip() + clean_node = cleaner.clean_html(node) + return html_text.etree_to_text(clean_node) MicrodataExtractor = LxmlMicrodataExtractor diff --git a/requirements.txt b/requirements.txt index 87a27224..a92e4eb0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,6 @@ requests rdflib rdflib-jsonld mf2py>=1.1.0 -six +six>=1.11 w3lib +html-text diff --git a/setup.py b/setup.py index 5bbc7553..f7e60387 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def get_version(): 'rdflib-jsonld', 'mf2py', 'w3lib', + 'html-text>=0.5.1', 'six'], extras_require={ 'service': [ diff --git a/tests/samples/schema.org/Event.002.json b/tests/samples/schema.org/Event.002.json index e141739e..dc9a914e 100644 --- a/tests/samples/schema.org/Event.002.json +++ b/tests/samples/schema.org/Event.002.json @@ -37,7 +37,7 @@ "offers": "foo-fighters-everlong-buy.html", "url": "foo-fighters-everlong.html"}, "type": "http://schema.org/MusicRecording"}], - "video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.", + "video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.", "duration": "T1M33S", "name": "Interview with the Foo Fighters", "thumbnail": "foo-fighters-interview-thumb.jpg"}, diff --git a/tests/samples/schema.org/MusicRecording.001.json b/tests/samples/schema.org/MusicRecording.001.json index 500c8223..385b4b6a 100644 --- a/tests/samples/schema.org/MusicRecording.001.json +++ b/tests/samples/schema.org/MusicRecording.001.json @@ -37,7 +37,7 @@ "offers": "foo-fighters-everlong-buy.html", "url": "foo-fighters-everlong.html"}, "type": "http://schema.org/MusicRecording"}], - "video": {"properties": {"description": "Catch this exclusive interview with\n Dave Grohl and the Foo Fighters about their new album, Rope.", + "video": {"properties": {"description": "Catch this exclusive interview with Dave Grohl and the Foo Fighters about their new album, Rope.", "duration": "T1M33S", "name": "Interview with the Foo Fighters", "thumbnail": "foo-fighters-interview-thumb.jpg"}, diff --git a/tests/samples/schema.org/product-ref.json b/tests/samples/schema.org/product-ref.json index 7c2bf8d6..1ffcbd58 100644 --- a/tests/samples/schema.org/product-ref.json +++ b/tests/samples/schema.org/product-ref.json @@ -32,7 +32,7 @@ ], "brand": "ACME", "name": "Executive Anvil", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": { "type": "http://schema.org/AggregateRating", diff --git a/tests/samples/schema.org/product.json b/tests/samples/schema.org/product.json index 52dc974f..dd9846dc 100644 --- a/tests/samples/schema.org/product.json +++ b/tests/samples/schema.org/product.json @@ -2,7 +2,7 @@ "properties": {"brand": "ACME", "name": "Executive Anvil", "image": "anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": {"type": "http://schema.org/AggregateRating", "properties": {"ratingValue": "4.4", diff --git a/tests/samples/schema.org/product_custom_url.json b/tests/samples/schema.org/product_custom_url.json index fcf29b22..18399a2f 100644 --- a/tests/samples/schema.org/product_custom_url.json +++ b/tests/samples/schema.org/product_custom_url.json @@ -2,7 +2,7 @@ "properties": {"brand": "ACME", "name": "Executive Anvil", "image": "http://some-example.com/anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": {"type": "http://schema.org/AggregateRating", "properties": {"ratingValue": "4.4", diff --git a/tests/samples/schema.org/product_custom_url_and_node_id.json b/tests/samples/schema.org/product_custom_url_and_node_id.json index 3111b628..888859b6 100644 --- a/tests/samples/schema.org/product_custom_url_and_node_id.json +++ b/tests/samples/schema.org/product_custom_url_and_node_id.json @@ -3,7 +3,7 @@ "properties": {"brand": "ACME", "name": "Executive Anvil", "image": "http://some-example.com/anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": {"type": "http://schema.org/AggregateRating", "_nodeId_": "aggregateRating", diff --git a/tests/samples/w3c/microdata.5.2.withtext.json b/tests/samples/w3c/microdata.5.2.withtext.json index 020c5055..7b3cb19d 100644 --- a/tests/samples/w3c/microdata.5.2.withtext.json +++ b/tests/samples/w3c/microdata.5.2.withtext.json @@ -2,18 +2,18 @@ "name": "Tank Locomotive (DB 80)", "product-code": "33041", "scale": "HO"}, - "textContent": "Name:\n Tank Locomotive (DB 80)\n Product code:\n 33041\n Scale:\n HO\n Digital:\n Delta", + "textContent": "Name:\nTank Locomotive (DB 80)\nProduct code:\n33041\nScale:\nHO\nDigital:\nDelta", "type": ["http://md.example.com/loco", "http://md.example.com/lighting"]}, {"properties": {"name": "Turnout Lantern Kit", "product-code": "74470", "scale": "HO", "track-type": "C"}, - "textContent": "Name:\n Turnout Lantern Kit\n Product code:\n 74470\n Purpose:\n For retrofitting 2 C Track\n turnouts.", + "textContent": "Name:\nTurnout Lantern Kit\nProduct code:\n74470\nPurpose:\nFor retrofitting 2 C Track turnouts.", "type": ["http://md.example.com/track", "http://md.example.com/lighting"]}, {"properties": {"name": "Express Train Passenger Car (DB Am 203)", "product-code": "8710", "scale": "Z"}, - "textContent": "Name:\n Express Train Passenger Car (DB Am 203)\n Product code:\n 8710\n Scale:\n Z", + "textContent": "Name:\nExpress Train Passenger Car (DB Am 203)\nProduct code:\n8710\nScale:\nZ", "type": "http://md.example.com/passengers"}] diff --git a/tests/samples/websites/microdata-with-description.html b/tests/samples/websites/microdata-with-description.html new file mode 100644 index 00000000..403adbed --- /dev/null +++ b/tests/samples/websites/microdata-with-description.html @@ -0,0 +1,3036 @@ +<!DOCTYPE html> + +<html> +<head> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><script type="text/javascript">window.NREUM||(NREUM={});NREUM.info = {"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"3ecd0a4901","applicationID":"4254217","transactionName":"MV1UN0ZSWUFUVEALVggXcgxAfVJGdlhaFksJVFoGRhxnQFpTQQFNSXFYB1FL","queueTime":0,"applicationTime":221,"ttGuid":"11507BD9179999DB","agent":""}</script><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={xpid:"UwAPU1NXGwcCUVVQBgY="};window.NREUM||(NREUM={}),__nr_require=function(t,n,e){function r(e){if(!n[e]){var o=n[e]={exports:{}};t[e][0].call(o.exports,function(n){var o=t[e][1][n];return r(o||n)},o,o.exports)}return n[e].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<e.length;o++)r(e[o]);return r}({1:[function(t,n,e){function r(t){try{s.console&&console.log(t)}catch(n){}}var o,i=t("ee"),a=t(15),s={};try{o=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console.log&&(s.console=!0,o.indexOf("dev")!==-1&&(s.dev=!0),o.indexOf("nr_dev")!==-1&&(s.nrDev=!0))}catch(c){}s.nrDev&&i.on("internal-error",function(t){r(t.stack)}),s.dev&&i.on("fn-err",function(t,n,e){r(e.stack)}),s.dev&&(r("NR AGENT IN DEVELOPMENT MODE"),r("flags: "+a(s,function(t,n){return t}).join(", ")))},{}],2:[function(t,n,e){function r(t,n,e,r,s){try{p?p-=1:o(s||new UncaughtException(t,n,e),!0)}catch(u){try{i("ierr",[u,c.now(),!0])}catch(d){}}return"function"==typeof f&&f.apply(this,a(arguments))}function UncaughtException(t,n,e){this.message=t||"Uncaught error with no additional information",this.sourceURL=n,this.line=e}function o(t,n){var e=n?null:c.now();i("err",[t,e])}var i=t("handle"),a=t(16),s=t("ee"),c=t("loader"),u=t("gos"),f=window.onerror,d=!1,l="nr@seenError",p=0;c.features.err=!0,t(1),window.onerror=r;try{throw new Error}catch(h){"stack"in h&&(t(5),t(4),"addEventListener"in window&&t(3),c.xhrWrappable&&t(6),d=!0)}s.on("fn-start",function(t,n,e){d&&(p+=1)}),s.on("fn-err",function(t,n,e){d&&!e[l]&&(u(e,l,function(){return!0}),this.thrown=!0,o(e))}),s.on("fn-end",function(){d&&!this.thrown&&p>0&&(p-=1)}),s.on("internal-error",function(t){i("ierr",[t,c.now(),!0])})},{}],3:[function(t,n,e){function r(t){for(var n=t;n&&!n.hasOwnProperty(f);)n=Object.getPrototypeOf(n);n&&o(n)}function o(t){s.inPlace(t,[f,d],"-",i)}function i(t,n){return t[1]}var a=t("ee").get("events"),s=t(18)(a,!0),c=t("gos"),u=XMLHttpRequest,f="addEventListener",d="removeEventListener";n.exports=a,"getPrototypeOf"in Object?(r(document),r(window),r(u.prototype)):u.prototype.hasOwnProperty(f)&&(o(window),o(u.prototype)),a.on(f+"-start",function(t,n){var e=t[1],r=c(e,"nr@wrapped",function(){function t(){if("function"==typeof e.handleEvent)return e.handleEvent.apply(e,arguments)}var n={object:t,"function":e}[typeof e];return n?s(n,"fn-",null,n.name||"anonymous"):e});this.wrapped=t[1]=r}),a.on(d+"-start",function(t){t[1]=this.wrapped||t[1]})},{}],4:[function(t,n,e){var r=t("ee").get("raf"),o=t(18)(r),i="equestAnimationFrame";n.exports=r,o.inPlace(window,["r"+i,"mozR"+i,"webkitR"+i,"msR"+i],"raf-"),r.on("raf-start",function(t){t[0]=o(t[0],"fn-")})},{}],5:[function(t,n,e){function r(t,n,e){t[0]=a(t[0],"fn-",null,e)}function o(t,n,e){this.method=e,this.timerDuration=isNaN(t[1])?0:+t[1],t[0]=a(t[0],"fn-",this,e)}var i=t("ee").get("timer"),a=t(18)(i),s="setTimeout",c="setInterval",u="clearTimeout",f="-start",d="-";n.exports=i,a.inPlace(window,[s,"setImmediate"],s+d),a.inPlace(window,[c],c+d),a.inPlace(window,[u,"clearImmediate"],u+d),i.on(c+f,r),i.on(s+f,o)},{}],6:[function(t,n,e){function r(t,n){d.inPlace(n,["onreadystatechange"],"fn-",s)}function o(){var t=this,n=f.context(t);t.readyState>3&&!n.resolved&&(n.resolved=!0,f.emit("xhr-resolved",[],t)),d.inPlace(t,x,"fn-",s)}function i(t){g.push(t),h&&(b?b.then(a):v?v(a):(E=-E,R.data=E))}function a(){for(var t=0;t<g.length;t++)r([],g[t]);g.length&&(g=[])}function s(t,n){return n}function c(t,n){for(var e in t)n[e]=t[e];return n}t(3);var u=t("ee"),f=u.get("xhr"),d=t(18)(f),l=NREUM.o,p=l.XHR,h=l.MO,m=l.PR,v=l.SI,w="readystatechange",x=["onload","onerror","onabort","onloadstart","onloadend","onprogress","ontimeout"],g=[];n.exports=f;var y=window.XMLHttpRequest=function(t){var n=new p(t);try{f.emit("new-xhr",[n],n),n.addEventListener(w,o,!1)}catch(e){try{f.emit("internal-error",[e])}catch(r){}}return n};if(c(p,y),y.prototype=p.prototype,d.inPlace(y.prototype,["open","send"],"-xhr-",s),f.on("send-xhr-start",function(t,n){r(t,n),i(n)}),f.on("open-xhr-start",r),h){var b=m&&m.resolve();if(!v&&!m){var E=1,R=document.createTextNode(E);new h(a).observe(R,{characterData:!0})}}else u.on("fn-end",function(t){t[0]&&t[0].type===w||a()})},{}],7:[function(t,n,e){function r(){var t=window.NREUM,n=t.info.accountID||null,e=t.info.agentID||null,r=t.info.trustKey||null,i="btoa"in window&&"function"==typeof window.btoa;if(!n||!e||!i)return null;var a={v:[0,1],d:{ty:"Browser",ac:n,ap:e,id:o.generateCatId(),tr:o.generateCatId(),ti:Date.now()}};return r&&n!==r&&(a.d.tk=r),btoa(JSON.stringify(a))}var o=t(13);n.exports={generateTraceHeader:r}},{}],8:[function(t,n,e){function r(t){var n=this.params,e=this.metrics;if(!this.ended){this.ended=!0;for(var r=0;r<p;r++)t.removeEventListener(l[r],this.listener,!1);n.aborted||(e.duration=s.now()-this.startTime,this.loadCaptureCalled||4!==t.readyState?null==n.status&&(n.status=0):a(this,t),e.cbTime=this.cbTime,d.emit("xhr-done",[t],t),c("xhr",[n,e,this.startTime]))}}function o(t,n){var e=t.responseType;if("json"===e&&null!==n)return n;var r="arraybuffer"===e||"blob"===e||"json"===e?t.response:t.responseText;return v(r)}function i(t,n){var e=u(n),r=t.params;r.host=e.hostname+":"+e.port,r.pathname=e.pathname,t.sameOrigin=e.sameOrigin}function a(t,n){t.params.status=n.status;var e=o(n,t.lastSize);if(e&&(t.metrics.rxSize=e),t.sameOrigin){var r=n.getResponseHeader("X-NewRelic-App-Data");r&&(t.params.cat=r.split(", ").pop())}t.loadCaptureCalled=!0}var s=t("loader");if(s.xhrWrappable){var c=t("handle"),u=t(9),f=t(7).generateTraceHeader,d=t("ee"),l=["load","error","abort","timeout"],p=l.length,h=t("id"),m=t(12),v=t(11),w=window.XMLHttpRequest;s.features.xhr=!0,t(6),d.on("new-xhr",function(t){var n=this;n.totalCbs=0,n.called=0,n.cbTime=0,n.end=r,n.ended=!1,n.xhrGuids={},n.lastSize=null,n.loadCaptureCalled=!1,t.addEventListener("load",function(e){a(n,t)},!1),m&&(m>34||m<10)||window.opera||t.addEventListener("progress",function(t){n.lastSize=t.loaded},!1)}),d.on("open-xhr-start",function(t){this.params={method:t[0]},i(this,t[1]),this.metrics={}}),d.on("open-xhr-end",function(t,n){"loader_config"in NREUM&&"xpid"in NREUM.loader_config&&this.sameOrigin&&n.setRequestHeader("X-NewRelic-ID",NREUM.loader_config.xpid);var e=!1;if("init"in NREUM&&"distributed_tracing"in NREUM.init&&(e=!!NREUM.init.distributed_tracing.enabled),e&&this.sameOrigin){var r=f();r&&n.setRequestHeader("newrelic",r)}}),d.on("send-xhr-start",function(t,n){var e=this.metrics,r=t[0],o=this;if(e&&r){var i=v(r);i&&(e.txSize=i)}this.startTime=s.now(),this.listener=function(t){try{"abort"!==t.type||o.loadCaptureCalled||(o.params.aborted=!0),("load"!==t.type||o.called===o.totalCbs&&(o.onloadCalled||"function"!=typeof n.onload))&&o.end(n)}catch(e){try{d.emit("internal-error",[e])}catch(r){}}};for(var a=0;a<p;a++)n.addEventListener(l[a],this.listener,!1)}),d.on("xhr-cb-time",function(t,n,e){this.cbTime+=t,n?this.onloadCalled=!0:this.called+=1,this.called!==this.totalCbs||!this.onloadCalled&&"function"==typeof e.onload||this.end(e)}),d.on("xhr-load-added",function(t,n){var e=""+h(t)+!!n;this.xhrGuids&&!this.xhrGuids[e]&&(this.xhrGuids[e]=!0,this.totalCbs+=1)}),d.on("xhr-load-removed",function(t,n){var e=""+h(t)+!!n;this.xhrGuids&&this.xhrGuids[e]&&(delete this.xhrGuids[e],this.totalCbs-=1)}),d.on("addEventListener-end",function(t,n){n instanceof w&&"load"===t[0]&&d.emit("xhr-load-added",[t[1],t[2]],n)}),d.on("removeEventListener-end",function(t,n){n instanceof w&&"load"===t[0]&&d.emit("xhr-load-removed",[t[1],t[2]],n)}),d.on("fn-start",function(t,n,e){n instanceof w&&("onload"===e&&(this.onload=!0),("load"===(t[0]&&t[0].type)||this.onload)&&(this.xhrCbStart=s.now()))}),d.on("fn-end",function(t,n){this.xhrCbStart&&d.emit("xhr-cb-time",[s.now()-this.xhrCbStart,this.onload,n],n)})}},{}],9:[function(t,n,e){n.exports=function(t){var n=document.createElement("a"),e=window.location,r={};n.href=t,r.port=n.port;var o=n.href.split("://");!r.port&&o[1]&&(r.port=o[1].split("/")[0].split("@").pop().split(":")[1]),r.port&&"0"!==r.port||(r.port="https"===o[0]?"443":"80"),r.hostname=n.hostname||e.hostname,r.pathname=n.pathname,r.protocol=o[0],"/"!==r.pathname.charAt(0)&&(r.pathname="/"+r.pathname);var i=!n.protocol||":"===n.protocol||n.protocol===e.protocol,a=n.hostname===document.domain&&n.port===e.port;return r.sameOrigin=i&&(!n.hostname||a),r}},{}],10:[function(t,n,e){function r(){}function o(t,n,e){return function(){return i(t,[u.now()].concat(s(arguments)),n?null:this,e),n?void 0:this}}var i=t("handle"),a=t(15),s=t(16),c=t("ee").get("tracer"),u=t("loader"),f=NREUM;"undefined"==typeof window.newrelic&&(newrelic=f);var d=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],l="api-",p=l+"ixn-";a(d,function(t,n){f[n]=o(l+n,!0,"api")}),f.addPageAction=o(l+"addPageAction",!0),f.setCurrentRouteName=o(l+"routeName",!0),n.exports=newrelic,f.interaction=function(){return(new r).get()};var h=r.prototype={createTracer:function(t,n){var e={},r=this,o="function"==typeof n;return i(p+"tracer",[u.now(),t,e],r),function(){if(c.emit((o?"":"no-")+"fn-start",[u.now(),r,o],e),o)try{return n.apply(this,arguments)}catch(t){throw c.emit("fn-err",[arguments,this,t],e),t}finally{c.emit("fn-end",[u.now()],e)}}}};a("actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(t,n){h[n]=o(p+n)}),newrelic.noticeError=function(t,n){"string"==typeof t&&(t=new Error(t)),i("err",[t,u.now(),!1,n])}},{}],11:[function(t,n,e){n.exports=function(t){if("string"==typeof t&&t.length)return t.length;if("object"==typeof t){if("undefined"!=typeof ArrayBuffer&&t instanceof ArrayBuffer&&t.byteLength)return t.byteLength;if("undefined"!=typeof Blob&&t instanceof Blob&&t.size)return t.size;if(!("undefined"!=typeof FormData&&t instanceof FormData))try{return JSON.stringify(t).length}catch(n){return}}}},{}],12:[function(t,n,e){var r=0,o=navigator.userAgent.match(/Firefox[\/\s](\d+\.\d+)/);o&&(r=+o[1]),n.exports=r},{}],13:[function(t,n,e){function r(){function t(){return n?15&n[e++]:16*Math.random()|0}var n=null,e=0,r=window.crypto||window.msCrypto;r&&r.getRandomValues&&(n=r.getRandomValues(new Uint8Array(31)));for(var o,i="xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx",a="",s=0;s<i.length;s++)o=i[s],"x"===o?a+=t().toString(16):"y"===o?(o=3&t()|8,a+=o.toString(16)):a+=o;return a}function o(){function t(){return n?15&n[e++]:16*Math.random()|0}var n=null,e=0,r=window.crypto||window.msCrypto;r&&r.getRandomValues&&Uint8Array&&(n=r.getRandomValues(new Uint8Array(31)));for(var o=[],i=0;i<16;i++)o.push(t().toString(16));return o.join("")}n.exports={generateUuid:r,generateCatId:o}},{}],14:[function(t,n,e){function r(t,n){if(!o)return!1;if(t!==o)return!1;if(!n)return!0;if(!i)return!1;for(var e=i.split("."),r=n.split("."),a=0;a<r.length;a++)if(r[a]!==e[a])return!1;return!0}var o=null,i=null,a=/Version\/(\S+)\s+Safari/;if(navigator.userAgent){var s=navigator.userAgent,c=s.match(a);c&&s.indexOf("Chrome")===-1&&s.indexOf("Chromium")===-1&&(o="Safari",i=c[1])}n.exports={agent:o,version:i,match:r}},{}],15:[function(t,n,e){function r(t,n){var e=[],r="",i=0;for(r in t)o.call(t,r)&&(e[i]=n(r,t[r]),i+=1);return e}var o=Object.prototype.hasOwnProperty;n.exports=r},{}],16:[function(t,n,e){function r(t,n,e){n||(n=0),"undefined"==typeof e&&(e=t?t.length:0);for(var r=-1,o=e-n||0,i=Array(o<0?0:o);++r<o;)i[r]=t[n+r];return i}n.exports=r},{}],17:[function(t,n,e){n.exports={exists:"undefined"!=typeof window.performance&&window.performance.timing&&"undefined"!=typeof window.performance.timing.navigationStart}},{}],18:[function(t,n,e){function r(t){return!(t&&t instanceof Function&&t.apply&&!t[a])}var o=t("ee"),i=t(16),a="nr@original",s=Object.prototype.hasOwnProperty,c=!1;n.exports=function(t,n){function e(t,n,e,o){function nrWrapper(){var r,a,s,c;try{a=this,r=i(arguments),s="function"==typeof e?e(r,a):e||{}}catch(u){l([u,"",[r,a,o],s])}f(n+"start",[r,a,o],s);try{return c=t.apply(a,r)}catch(d){throw f(n+"err",[r,a,d],s),d}finally{f(n+"end",[r,a,c],s)}}return r(t)?t:(n||(n=""),nrWrapper[a]=t,d(t,nrWrapper),nrWrapper)}function u(t,n,o,i){o||(o="");var a,s,c,u="-"===o.charAt(0);for(c=0;c<n.length;c++)s=n[c],a=t[s],r(a)||(t[s]=e(a,u?s+o:o,i,s))}function f(e,r,o){if(!c||n){var i=c;c=!0;try{t.emit(e,r,o,n)}catch(a){l([a,e,r,o])}c=i}}function d(t,n){if(Object.defineProperty&&Object.keys)try{var e=Object.keys(t);return e.forEach(function(e){Object.defineProperty(n,e,{get:function(){return t[e]},set:function(n){return t[e]=n,n}})}),n}catch(r){l([r])}for(var o in t)s.call(t,o)&&(n[o]=t[o]);return n}function l(n){try{t.emit("internal-error",n)}catch(e){}}return t||(t=o),e.inPlace=u,e.flag=a,e}},{}],ee:[function(t,n,e){function r(){}function o(t){function n(t){return t&&t instanceof r?t:t?c(t,s,i):i()}function e(e,r,o,i){if(!l.aborted||i){t&&t(e,r,o);for(var a=n(o),s=m(e),c=s.length,u=0;u<c;u++)s[u].apply(a,r);var d=f[g[e]];return d&&d.push([y,e,r,a]),a}}function p(t,n){x[t]=m(t).concat(n)}function h(t,n){var e=x[t];if(e)for(var r=0;r<e.length;r++)e[r]===n&&e.splice(r,1)}function m(t){return x[t]||[]}function v(t){return d[t]=d[t]||o(e)}function w(t,n){u(t,function(t,e){n=n||"feature",g[e]=n,n in f||(f[n]=[])})}var x={},g={},y={on:p,addEventListener:p,removeEventListener:h,emit:e,get:v,listeners:m,context:n,buffer:w,abort:a,aborted:!1};return y}function i(){return new r}function a(){(f.api||f.feature)&&(l.aborted=!0,f=l.backlog={})}var s="nr@context",c=t("gos"),u=t(15),f={},d={},l=n.exports=o();l.backlog=f},{}],gos:[function(t,n,e){function r(t,n,e){if(o.call(t,n))return t[n];var r=e();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(t,n,{value:r,writable:!0,enumerable:!1}),r}catch(i){}return t[n]=r,r}var o=Object.prototype.hasOwnProperty;n.exports=r},{}],handle:[function(t,n,e){function r(t,n,e,r){o.buffer([t],r),o.emit(t,n,e)}var o=t("ee").get("handle");n.exports=r,r.ee=o},{}],id:[function(t,n,e){function r(t){var n=typeof t;return!t||"object"!==n&&"function"!==n?-1:t===window?0:a(t,i,function(){return o++})}var o=1,i="nr@id",a=t("gos");n.exports=r},{}],loader:[function(t,n,e){function r(){if(!E++){var t=b.info=NREUM.info,n=p.getElementsByTagName("script")[0];if(setTimeout(f.abort,3e4),!(t&&t.licenseKey&&t.applicationID&&n))return f.abort();u(g,function(n,e){t[n]||(t[n]=e)}),c("mark",["onload",a()+b.offset],null,"api");var e=p.createElement("script");e.src="https://"+t.agent,n.parentNode.insertBefore(e,n)}}function o(){"complete"===p.readyState&&i()}function i(){c("mark",["domContent",a()+b.offset],null,"api")}function a(){return R.exists&&performance.now?Math.round(performance.now()):(s=Math.max((new Date).getTime(),s))-b.offset}var s=(new Date).getTime(),c=t("handle"),u=t(15),f=t("ee"),d=t(14),l=window,p=l.document,h="addEventListener",m="attachEvent",v=l.XMLHttpRequest,w=v&&v.prototype;NREUM.o={ST:setTimeout,SI:l.setImmediate,CT:clearTimeout,XHR:v,REQ:l.Request,EV:l.Event,PR:l.Promise,MO:l.MutationObserver};var x=""+location,g={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-1123.min.js"},y=v&&w&&w[h]&&!/CriOS/.test(navigator.userAgent),b=n.exports={offset:s,now:a,origin:x,features:{},xhrWrappable:y,userAgent:d};t(10),p[h]?(p[h]("DOMContentLoaded",i,!1),l[h]("load",r,!1)):(p[m]("onreadystatechange",o),l[m]("onload",r)),c("mark",["firstbyte",s],null,"api");var E=0,R=t(17)},{}]},{},["loader",2,8]);</script> + <title>Johnsons 4 Fleas Cats & Kittens Tablets From £5.29 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+
+
+
+
+ +
+

Welcome to Monster! or Join Us

+
+ + +
+
+
+
+ + + +
+
+ +
+
+ + + + + + + + +
+ + + + + +
+
+
+ + +
+
+
+

We’re sorry, but there seems to be an error with some of the information provided. Please check the highlighted fields

+
  • +
+
+
+
+
+

Johnsons 4 Fleas Cats & Kittens Tablets

+ + +

+ + + + + From only £5.29 - £9.56 +

+
+ +
+ + +
+ +
+ +
+ + + +
+
+ +
+ + + + + + Johnsons 4 Fleas Cats & Kittens Tablets + +
+ +
+ +
+ + +
+ +
+ + +
+ +
+ +
+ +
+ +
+
+
+
+ +
+ +
+

+ Save 10% on your first Repeat Delivery + +

+
+ +
+ +
+
+ + +
+ +
+ +
+ +
+ +
+
+ + + + + + +
+ +
+ + +
+
6 Treatment Pack
+ +

£9.56

+
+ +
+

SKU: CS20858_1

+ +

Available

+ +
+ +
+ + + + +
+
+ +
+
+ + + + + +
+ + +
+ + +
+ + +
+ +
+
+
+
+ + + + + + +
+ +
+ + +
+
3 Treatment Pack
+ +

£5.29

+
+ +
+

SKU: CS20858_2

+ +

Available

+ +
+ +
+ + + + +
+
+ +
+
+ + + + + +
+ + +
+ + +
+ + +
+ +
+ +
+ +
+ +
+
+
+ + +
+
+
+
+ +
+ +
+ +
+
+

Johnsons 4 Fleas Cats & Kittens - 3 Treatment Pack, 6 Treatment Pack

For use with Cats and Kittens over 4 weeks of age between 1 and 11kg.

Johnson's 4fleas tablets are an easy to use oral treatment to kill adult fleas found on your pet.

Effects on the fleas may be seen as soon as 15 minutes after administration.

Between 95 - 100% of fleas will be killed off in the first six hours, but ALL adult fleas will be gone after a day.

These tablets can be given directly to the mouth or may be mixed in a small portion f our pet's favourite food and given immediately. Administer a single tablet on an day when fleas are seen on your pet. Repeat on any subsequent day as necessary. Do not give more than one treatment per day.

You may notice your pet scratching more than usual for the first half hour after administration; this is completely normal and caused by the fleas reacting to Johnson's 4Fleas tablets.

While highly effective by themselves, 4Fleas is great when used as part of a programme to eliminate fleas and their larvae from both pets and their surroundings. +
+
+ + +
+
+
+
+ +
+
+ +
+
+
+ + + +
+
+
+ + + + +
+
+
+
+ +
+ + + +
+
+
+
+ + +
+
+ + +
+
+ +
+ +
+ + +
+ + FRONTLINE Plus Flea & Tick Spot On Treatment Cat 3 Pack NFA-C + +
+ Save 41% +
+
+ + + + +
+

Only £12.99

+
+
+

Save £9.13 (41)%

+
+ + +
+ + + +
+ +
+ +
+ +
+
+ + + +
+ +
+
+ +
+ + +
+ + Drontal Cat Worming Tablets 1 tablet NFA-C + +
+ + + + +
+

Only £2.10

+
+
+

Save £0.89 (30)%

+
+ + +
+ + + +
+ +
+ +
+ +
+
+ + + +
+ +
+
+ +
+ + +
+ + Panacur Wormer Granules for Dogs & Cats 1.8g Blue Sachet NFA-DC + +
+ + + + +
+

Only £1.44

+
+
+

Save £0.85 (37)%

+
+ + +
+ + + +
+ +
+ +
+ +
+
+ + + +
+ +
+
+ +
+ + +
+ + Beaphar Calming Cat Treats 35g + +
+ + + + +
+

Only £2.35

+
+
+

Save £0.64 (21)%

+
+ + +
+ + + +
+ +
+ +
+ + + +
+ +
+ +
+
+ +
+
+ + +
+ +
+
+
+
+ + +
+
+
+
+ +
+
+
+
+

Deliveries

+

We strive for all orders with free delivery to be with you within 2 working days of them leaving our warehouse. Spend less than £35 and our delivery prices are as low as £2.99 or £5.99 if the weight is over 500g (Run faster than Postman Pet!)

+
+
+
+
+
+
+ +

Spend £35 or more and we'll deliver it to you, absolutely FREE!

+
+
+
+
+
+
+ +

Upgrade to our super-fast 10am next working day delivery* - £9.99

+
+
+
+
+
+
+ +

OR Upgrade to an optional working day delivery service* - £7.99

+
+
+
+
+
+ +
+
+
+
+
+

*delivery options will change according to the total cost and weight of your order, you'll see your options at the checkout

+
+
+
+
+
+
+
+
+ + + +
+ + + +
+
+ + + + + +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/samples/websites/microdata-with-description.json b/tests/samples/websites/microdata-with-description.json new file mode 100644 index 00000000..cf2240ac --- /dev/null +++ b/tests/samples/websites/microdata-with-description.json @@ -0,0 +1,44 @@ +[ + { + "type": "http://schema.org/Product", + "properties": { + "name": "Johnsons 4 Fleas Cats & Kittens Tablets", + "brand": "Johnsons", + "offers": [ + { + "type": "http://schema.org/AggregateOffer", + "properties": { + "offercount": "2", + "priceCurrency": "GBP", + "lowPrice": "5.29", + "highPrice": "9.56" + } + }, + { + "type": "http://schema.org/Offer", + "properties": { + "name": "Johnsons 4 Fleas Cats & Kittens Tablets 6 Treatment Pack", + "sku": "CS20858_1", + "price": "9.56", + "priceCurrency": "GBP", + "availability": "http://schema.org/InStock", + "itemCondition": "http://schema.org/NewCondition" + } + }, + { + "type": "http://schema.org/Offer", + "properties": { + "name": "Johnsons 4 Fleas Cats & Kittens Tablets 3 Treatment Pack", + "sku": "CS20858_2", + "price": "5.29", + "priceCurrency": "GBP", + "availability": "http://schema.org/InStock", + "itemCondition": "http://schema.org/NewCondition" + } + } + ], + "image": "https://res.cloudinary.com/monsterpetsupplies/image/upload/f_auto,c_pad,w_500,h_500,q_75/q_100/v1481710164/cat_kitten_flea_tabs_3_eaunpi.jpg", + "description": "Johnsons 4 Fleas Cats & Kittens - 3 Treatment Pack, 6 Treatment Pack\n\nFor use with Cats and Kittens over 4 weeks of age between 1 and 11kg.\nJohnson's 4fleas tablets are an easy to use oral treatment to kill adult fleas found on your pet.\nEffects on the fleas may be seen as soon as 15 minutes after administration.\nBetween 95 - 100% of fleas will be killed off in the first six hours, but ALL adult fleas will be gone after a day.\nThese tablets can be given directly to the mouth or may be mixed in a small portion f our pet's favourite food and given immediately. Administer a single tablet on an day when fleas are seen on your pet. Repeat on any subsequent day as necessary. Do not give more than one treatment per day.\nYou may notice your pet scratching more than usual for the first half hour after administration; this is completely normal and caused by the fleas reacting to Johnson's 4Fleas tablets.\nWhile highly effective by themselves, 4Fleas is great when used as part of a programme to eliminate fleas and their larvae from both pets and their surroundings." + } + } +] \ No newline at end of file diff --git a/tests/test_microdata.py b/tests/test_microdata.py index 19bbbdd3..76361971 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -5,6 +5,7 @@ from extruct.w3cmicrodata import MicrodataExtractor from tests import get_testdata + class TestMicrodata(unittest.TestCase): maxDiff = None @@ -37,12 +38,13 @@ def test_schemaorg_MusicRecording(self): self.assertEqual(data, expected) def test_schemaorg_Event(self): - for i in [1, 2, 3, 4, 8]: + for i in [1, 2, 3, 4, 8]: body = get_testdata('schema.org', 'Event.{:03d}.html'.format(i)) expected = json.loads(get_testdata('schema.org', 'Event.{:03d}.json'.format(i)).decode('UTF-8')) mde = MicrodataExtractor() data = mde.extract(body) + self.assertEqual(data, expected) def test_w3c_textContent_values(self): @@ -184,3 +186,17 @@ def test_join_none(self): mde = MicrodataExtractor() data = mde.extract(body) self.assertEqual(data, expected) + + +class TestMicrodataWithDescription(unittest.TestCase): + maxDiff = None + + def test_if_punctuations_in_description_are_correctly_formatted(self): + body = get_testdata('websites', 'microdata-with-description.html') + expected = json.loads(get_testdata( + 'websites', 'microdata-with-description.json').decode('UTF-8')) + + mde = MicrodataExtractor() + data = mde.extract(body) + + self.assertEqual(data, expected) diff --git a/tests/test_uniform.py b/tests/test_uniform.py index fdd1d561..e0e3d76b 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -74,7 +74,7 @@ def test_umicrodata(self): "brand": "ACME", "name": "Executive Anvil", "image": "anvil_executive.jpg", - "description": "Sleeker than ACME's Classic Anvil, the\n Executive Anvil is perfect for the business traveler\n looking for something to drop from a height.", + "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.", "mpn": "925872", "aggregateRating": { "@type": "AggregateRating",