From 1b3673386d241112fe15204c75e746a70c733dd2 Mon Sep 17 00:00:00 2001
From: tuxos <saloua.litayem@gmail.com>
Date: Tue, 10 Feb 2015 14:40:46 +0100
Subject: [PATCH 1/6] More efficient title extraction and bug fixes

---
 goose/extractors/title.py | 91 +++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 46 deletions(-)
diff --git a/goose/extractors/title.py b/goose/extractors/title.py
index 31d69840..74ccbb2b 100644
--- a/goose/extractors/title.py
+++ b/goose/extractors/title.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""\
+"""
 This is a python port of "Goose" orignialy licensed to Gravity.com
 under one or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -21,17 +21,19 @@
 limitations under the License.
 """
 import re
-
+import traceback
+import sys
 from goose.extractors import BaseExtractor
 
 
-TITLE_SPLITTERS = [u"|", u"-", u"»", u":"]
+TITLE_SPLITTERS = set([u"|", u"-", u"»", u":"])
 
 
 class TitleExtractor(BaseExtractor):
 
     def clean_title(self, title):
-        """Clean title with the use of og:site_name
+        """
+        Clean title with the use of og:site_name
         in this case try to get ride of site name
         and use TITLE_SPLITTERS to reformat title
         """
@@ -51,54 +53,51 @@ def clean_title(self, title):
         # my wonderfull article | TechCrunch
         title_words = title.split()
 
-        # check for an empty title
-        # so that we don't get an IndexError below
-        if len(title_words) == 0:
-            return u""
-
-        # check if first letter is in TITLE_SPLITTERS
-        # if so remove it
-        if title_words[0] in TITLE_SPLITTERS:
-            title_words.pop(0)
-
-        # check if last letter is in TITLE_SPLITTERS
-        # if so remove it
-        if title_words[-1] in TITLE_SPLITTERS:
-            title_words.pop(-1)
-
-        # rebuild the title
-        title = u" ".join(title_words).strip()
+        title = u""
+        # check if first and last words are in TITLE_SPLITTERS
+        # if so remove them
+        for i in 0, -1:
+            if title_words and next ((
+                True for w in TITLE_SPLITTERS if title_words[i] in w), \
+                False):
+                title_words.pop(i)
+            # rebuild the title
+            title = u" ".join(title_words).strip()
 
         return title
 
     def get_title(self):
-        """\
+        """
         Fetch the article title and analyze it
         """
-        title = ''
-
-        # rely on opengraph in case we have the data
-        if "title" in self.article.opengraph.keys():
-            title = self.article.opengraph['title']
-            return self.clean_title(title)
-
-        # try to fetch the meta headline
-        meta_headline = self.parser.getElementsByTag(
-                            self.article.doc,
-                            tag="meta",
-                            attr="name",
-                            value="headline")
-        if meta_headline is not None and len(meta_headline) > 0:
-            title = self.parser.getAttribute(meta_headline[0], 'content')
-            return self.clean_title(title)
-
-        # otherwise use the title meta
-        title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
-        if title_element is not None and len(title_element) > 0:
-            title = self.parser.getText(title_element[0])
-            return self.clean_title(title)
-
-        return title
+        title = u""
+        try:
+            # rely on opengraph in case we have the data
+            title_ = self.article.opengraph.get('title', '')
+            if title_:
+                # handle tags without any title: <meta property="og:title" />
+                return self.clean_title(title_)
+
+            # try to fetch the meta headline
+            meta_headline = self.parser.getElementsByTag(
+                                self.article.doc,
+                                tag="meta",
+                                attr="name",
+                                value="headline")
+            if meta_headline:
+                title_ = self.parser.getAttribute(meta_headline[0], 'content')
+                if title_:
+                    return self.clean_title(title_)
+
+            # otherwise use the title meta
+            title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
+            if title_element:
+                title_ = self.parser.getText(title_element[0])
+                if title_:
+                    return self.clean_title(title_)
+        except:
+            print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exec()
+            return title
 
     def extract(self):
         return self.get_title()

From ac13863b552d1016f4d68976fd64d85c30c7bc4d Mon Sep 17 00:00:00 2001
From: tuxos <saloua.litayem@gmail.com>
Date: Tue, 10 Feb 2015 14:44:02 +0100
Subject: [PATCH 2/6] Fix maximum recursion depth failure

---
 goose/__init__.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/goose/__init__.py b/goose/__init__.py
index 409b5732..ed322364 100644
--- a/goose/__init__.py
+++ b/goose/__init__.py
@@ -20,16 +20,19 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+import sys
+# the default recursion limit is 1000
+sys.setrecursionlimit(2000)
 import os
 import platform
 from tempfile import mkstemp
+import traceback
 
 from goose.version import version_info, __version__
 from goose.configuration import Configuration
 from goose.crawler import CrawlCandidate
 from goose.crawler import Crawler
 
-
 class Goose(object):
     """\
 
@@ -48,7 +51,7 @@ def extend_config(self):
             self.config = config
 
     def extract(self, url=None, raw_html=None):
-        """\
+        """
         Main method to extract an article object from a URL,
         pass in a url and get back a Article
         """
@@ -61,12 +64,17 @@ def shutdown_network(self):
     def crawl(self, crawl_candiate):
         parsers = list(self.config.available_parsers)
         parsers.remove(self.config.parser_class)
+        article = None
         try:
             crawler = Crawler(self.config)
             article = crawler.crawl(crawl_candiate)
         except (UnicodeDecodeError, ValueError):
             self.config.parser_class = parsers[0]
-            return self.crawl(crawl_candiate)
+            try:
+                if isinstance(crawl_candiate, basestring):
+                    return self.crawl(crawl_candiate)
+            except exception as e:
+                print >> sys.stderr, 'Article Crawl error: ', traceback.format_exec()
         return article
 
     def initialize(self):

From 45c9acd171be31c0771187a40cb6baa5e6aa2e6b Mon Sep 17 00:00:00 2001
From: tuxos <saloua.litayem@gmail.com>
Date: Tue, 10 Feb 2015 23:06:31 +0100
Subject: [PATCH 3/6] Keep the default recursion limit value

---
 goose/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/goose/__init__.py b/goose/__init__.py
index ed322364..540a9232 100644
--- a/goose/__init__.py
+++ b/goose/__init__.py
@@ -20,13 +20,11 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-import sys
-# the default recursion limit is 1000
-sys.setrecursionlimit(2000)
 import os
 import platform
 from tempfile import mkstemp
 import traceback
+import sys
 
 from goose.version import version_info, __version__
 from goose.configuration import Configuration

From 51e89dfbb3c264a89dc4f4e3ed7223ae64c831db Mon Sep 17 00:00:00 2001
From: tuxos <saloua.litayem@gmail.com>
Date: Wed, 11 Feb 2015 14:54:25 +0100
Subject: [PATCH 4/6] Fix None title

---
 goose/extractors/title.py | 41 ++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/goose/extractors/title.py b/goose/extractors/title.py
index 74ccbb2b..39b5d9a1 100644
--- a/goose/extractors/title.py
+++ b/goose/extractors/title.py
@@ -76,28 +76,29 @@ def get_title(self):
             title_ = self.article.opengraph.get('title', '')
             if title_:
                 # handle tags without any title: <meta property="og:title" />
-                return self.clean_title(title_)
-
-            # try to fetch the meta headline
-            meta_headline = self.parser.getElementsByTag(
-                                self.article.doc,
-                                tag="meta",
-                                attr="name",
-                                value="headline")
-            if meta_headline:
-                title_ = self.parser.getAttribute(meta_headline[0], 'content')
-                if title_:
-                    return self.clean_title(title_)
-
-            # otherwise use the title meta
-            title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
-            if title_element:
-                title_ = self.parser.getText(title_element[0])
-                if title_:
-                    return self.clean_title(title_)
+                title = self.clean_title(title_)
+            else:
+                # try to fetch the meta headline
+                meta_headline = self.parser.getElementsByTag(
+                                    self.article.doc,
+                                    tag="meta",
+                                    attr="name",
+                                    value="headline")
+                if meta_headline:
+                    title_ = self.parser.getAttribute(meta_headline[0], 'content')
+                    if title_:
+                        title = self.clean_title(title_)
+                else:
+                    # otherwise use the title meta
+                    title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
+                    if title_element:
+                        title_ = self.parser.getText(title_element[0])
+                        if title_:
+                            title = self.clean_title(title_)
         except:
             print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exec()
-            return title
+        
+        return title
 
     def extract(self):
         return self.get_title()

From 8e5eae0b9308e8099dafc68c6d59628c730a68dd Mon Sep 17 00:00:00 2001
From: tuxos <saloua.litayem@gmail.com>
Date: Mon, 16 Feb 2015 10:45:07 +0100
Subject: [PATCH 5/6] catch more exceptions

---
 goose/__init__.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/goose/__init__.py b/goose/__init__.py
index 540a9232..45a0c6a1 100644
--- a/goose/__init__.py
+++ b/goose/__init__.py
@@ -68,11 +68,10 @@ def crawl(self, crawl_candiate):
             article = crawler.crawl(crawl_candiate)
         except (UnicodeDecodeError, ValueError):
             self.config.parser_class = parsers[0]
-            try:
-                if isinstance(crawl_candiate, basestring):
-                    return self.crawl(crawl_candiate)
-            except exception as e:
-                print >> sys.stderr, 'Article Crawl error: ', traceback.format_exec()
+            if isinstance(crawl_candiate, basestring):
+                return self.crawl(crawl_candiate)
+        except Exception:  # best-effort: log the traceback, then fall through to `return article`
+            print >> sys.stderr, 'Article Crawl error: ', traceback.format_exc()
         return article
 
     def initialize(self):

From f7a9b3d907e3ab9a7d30b3d47c5e2c8895fa1bea Mon Sep 17 00:00:00 2001
From: tuxos <saloua.litayem@gmail.com>
Date: Mon, 23 Feb 2015 11:10:41 +0100
Subject: [PATCH 6/6] Fix misspelled traceback.format_exc() call in title extractor

---
 goose/extractors/title.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/goose/extractors/title.py b/goose/extractors/title.py
index 39b5d9a1..ac08808f 100644
--- a/goose/extractors/title.py
+++ b/goose/extractors/title.py
@@ -96,7 +96,7 @@ def get_title(self):
                         if title_:
                             title = self.clean_title(title_)
         except:
-            print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exec()
+            print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exc()
         
         return title