From 1b3673386d241112fe15204c75e746a70c733dd2 Mon Sep 17 00:00:00 2001 From: tuxos Date: Tue, 10 Feb 2015 14:40:46 +0100 Subject: [PATCH 1/6] More efficient title extraction and bugs fix --- goose/extractors/title.py | 91 +++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 31d69840..74ccbb2b 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""\ +""" This is a python port of "Goose" orignialy licensed to Gravity.com under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information @@ -21,17 +21,19 @@ limitations under the License. """ import re - +import traceback +import sys from goose.extractors import BaseExtractor -TITLE_SPLITTERS = [u"|", u"-", u"»", u":"] +TITLE_SPLITTERS = set([u"|", u"-", u"»", u":"]) class TitleExtractor(BaseExtractor): def clean_title(self, title): - """Clean title with the use of og:site_name + """ + Clean title with the use of og:site_name in this case try to get ride of site name and use TITLE_SPLITTERS to reformat title """ @@ -51,54 +53,51 @@ def clean_title(self, title): # my wonderfull article | TechCrunch title_words = title.split() - # check for an empty title - # so that we don't get an IndexError below - if len(title_words) == 0: - return u"" - - # check if first letter is in TITLE_SPLITTERS - # if so remove it - if title_words[0] in TITLE_SPLITTERS: - title_words.pop(0) - - # check if last letter is in TITLE_SPLITTERS - # if so remove it - if title_words[-1] in TITLE_SPLITTERS: - title_words.pop(-1) - - # rebuild the title - title = u" ".join(title_words).strip() + title = u"" + # check if first and last words are in TITLE_SPLITTERS + # if so remove them + for i in 0, -1: + if title_words and next (( + True for w in TITLE_SPLITTERS if title_words[i] in w), \ + False): + title_words.pop(i) + # rebuild the title + title = u" ".join(title_words).strip() return title def get_title(self): - """\ + """ Fetch the article title and analyze it """ - title = '' - - # rely on opengraph in case we have the data - if "title" in self.article.opengraph.keys(): - title = self.article.opengraph['title'] - return self.clean_title(title) - - # try to fetch the meta headline - meta_headline = self.parser.getElementsByTag( - self.article.doc, - tag="meta", - attr="name", - value="headline") - if meta_headline is not None and len(meta_headline) > 0: - title = self.parser.getAttribute(meta_headline[0], 'content') - return self.clean_title(title) - - # otherwise use the title meta - title_element = self.parser.getElementsByTag(self.article.doc, tag='title') - if title_element is not None and len(title_element) > 0: - title = self.parser.getText(title_element[0]) - return self.clean_title(title) - - return title + title = u"" + try: + # rely on opengraph in case we have the data + title_ = self.article.opengraph.get('title', '') + if title_: + # handle tags without any title: + return self.clean_title(title_) + + # try to fetch the meta headline + meta_headline = self.parser.getElementsByTag( + self.article.doc, + tag="meta", + attr="name", + value="headline") + if meta_headline: + title_ = self.parser.getAttribute(meta_headline[0], 'content') + if title_: + return self.clean_title(title_) + + # otherwise use the title meta + title_element = self.parser.getElementsByTag(self.article.doc, tag='title') + if title_element: + title_ = self.parser.getText(title_element[0]) + if title_: + return self.clean_title(title_) + except: + print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exec() + return title def extract(self): return self.get_title() From ac13863b552d1016f4d68976fd64d85c30c7bc4d Mon Sep 17 00:00:00 2001 From: tuxos Date: Tue, 10 Feb 2015 14:44:02 +0100 Subject: [PATCH 2/6] Fix maximum recursion depth failure --- goose/__init__.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index 409b5732..ed322364 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -20,16 +20,19 @@ See the License for the specific language governing permissions and limitations under the License. """ +import sys +# the default recursion limit is 1000 +sys.setrecursionlimit(2000) import os import platform from tempfile import mkstemp +import traceback from goose.version import version_info, __version__ from goose.configuration import Configuration from goose.crawler import CrawlCandidate from goose.crawler import Crawler - class Goose(object): """\ @@ -48,7 +51,7 @@ def extend_config(self): self.config = config def extract(self, url=None, raw_html=None): - """\ + """ Main method to extract an article object from a URL, pass in a url and get back a Article """ @@ -61,12 +64,17 @@ def shutdown_network(self): def crawl(self, crawl_candiate): parsers = list(self.config.available_parsers) parsers.remove(self.config.parser_class) + article = None try: crawler = Crawler(self.config) article = crawler.crawl(crawl_candiate) except (UnicodeDecodeError, ValueError): self.config.parser_class = parsers[0] - return self.crawl(crawl_candiate) + try: + if isinstance(crawl_candiate, basestring): + return self.crawl(crawl_candiate) + except exception as e: + print >> sys.stderr, 'Article Crawl error: ', traceback.format_exec() return article def initialize(self): From 45c9acd171be31c0771187a40cb6baa5e6aa2e6b Mon Sep 17 00:00:00 2001 From: tuxos Date: Tue, 10 Feb 2015 23:06:31 +0100 Subject: [PATCH 3/6] Keep the default recursion limit value --- goose/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index ed322364..540a9232 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -20,13 +20,11 @@ See the License for the specific language governing permissions and limitations under the License. """ -import sys -# the default recursion limit is 1000 -sys.setrecursionlimit(2000) import os import platform from tempfile import mkstemp import traceback +import sys from goose.version import version_info, __version__ from goose.configuration import Configuration From 51e89dfbb3c264a89dc4f4e3ed7223ae64c831db Mon Sep 17 00:00:00 2001 From: tuxos Date: Wed, 11 Feb 2015 14:54:25 +0100 Subject: [PATCH 4/6] Fix None title --- goose/extractors/title.py | 41 ++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 74ccbb2b..39b5d9a1 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -76,28 +76,29 @@ def get_title(self): title_ = self.article.opengraph.get('title', '') if title_: # handle tags without any title: - return self.clean_title(title_) - - # try to fetch the meta headline - meta_headline = self.parser.getElementsByTag( - self.article.doc, - tag="meta", - attr="name", - value="headline") - if meta_headline: - title_ = self.parser.getAttribute(meta_headline[0], 'content') - if title_: - return self.clean_title(title_) - - # otherwise use the title meta - title_element = self.parser.getElementsByTag(self.article.doc, tag='title') - if title_element: - title_ = self.parser.getText(title_element[0]) - if title_: - return self.clean_title(title_) + title = self.clean_title(title_) + else: + # try to fetch the meta headline + meta_headline = self.parser.getElementsByTag( + self.article.doc, + tag="meta", + attr="name", + value="headline") + if meta_headline: + title_ = self.parser.getAttribute(meta_headline[0], 'content') + if title_: + title = self.clean_title(title_) + else: + # otherwise use the title meta + title_element = self.parser.getElementsByTag(self.article.doc, tag='title') + if title_element: + title_ = self.parser.getText(title_element[0]) + if title_: + title = self.clean_title(title_) except: print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exec() - return title + + return title def extract(self): return self.get_title() From 8e5eae0b9308e8099dafc68c6d59628c730a68dd Mon Sep 17 00:00:00 2001 From: tuxos Date: Mon, 16 Feb 2015 10:45:07 +0100 Subject: [PATCH 5/6] catch more exceptions --- goose/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index 540a9232..45a0c6a1 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -68,11 +68,10 @@ def crawl(self, crawl_candiate): article = crawler.crawl(crawl_candiate) except (UnicodeDecodeError, ValueError): self.config.parser_class = parsers[0] - try: - if isinstance(crawl_candiate, basestring): - return self.crawl(crawl_candiate) - except exception as e: - print >> sys.stderr, 'Article Crawl error: ', traceback.format_exec() + if isinstance(crawl_candiate, basestring): + return self.crawl(crawl_candiate) + except Exception as e: + print >> sys.stderr, 'Article Crawl error: ', traceback.format_exec() return article def initialize(self): From f7a9b3d907e3ab9a7d30b3d47c5e2c8895fa1bea Mon Sep 17 00:00:00 2001 From: tuxos Date: Mon, 23 Feb 2015 11:10:41 +0100 Subject: [PATCH 6/6] misspelling --- goose/extractors/title.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 39b5d9a1..ac08808f 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -96,7 +96,7 @@ def get_title(self): if title_: title = self.clean_title(title_) except: - print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exec() + print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exc() return title