grangier · slitayem · Feb 10, 2015 · Feb 10, 2015 · Feb 10, 2015 · Feb 11, 2015
diff --git a/goose/__init__.py b/goose/__init__.py
@@ -23,13 +23,14 @@
 import os
 import platform
 from tempfile import mkstemp
+import traceback
+import sys
 
 from goose.version import version_info, __version__
 from goose.configuration import Configuration
 from goose.crawler import CrawlCandidate
 from goose.crawler import Crawler
 
-
 class Goose(object):
     """\
 
@@ -48,7 +49,7 @@ def extend_config(self):
             self.config = config
 
     def extract(self, url=None, raw_html=None):
-        """\
+        """
         Main method to extract an article object from a URL,
         pass in a url and get back a Article
         """
@@ -61,12 +62,16 @@ def shutdown_network(self):
     def crawl(self, crawl_candiate):
         parsers = list(self.config.available_parsers)
         parsers.remove(self.config.parser_class)
+        article = None
         try:
             crawler = Crawler(self.config)
             article = crawler.crawl(crawl_candiate)
         except (UnicodeDecodeError, ValueError):
             self.config.parser_class = parsers[0]
-            return self.crawl(crawl_candiate)
+            if isinstance(crawl_candiate, basestring):
+                return self.crawl(crawl_candiate)
+        except Exception as e:
+            print >> sys.stderr, 'Article Crawl error: ', traceback.format_exec()
         return article
 
     def initialize(self):

diff --git a/goose/extractors/title.py b/goose/extractors/title.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""\
+"""
 This is a python port of "Goose" orignialy licensed to Gravity.com
 under one or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
@@ -21,17 +21,19 @@
 limitations under the License.
 """
 import re
-
+import traceback
+import sys
 from goose.extractors import BaseExtractor
 
 
-TITLE_SPLITTERS = [u"|", u"-", u"»", u":"]
+TITLE_SPLITTERS = set([u"|", u"-", u"»", u":"])
 
 
 class TitleExtractor(BaseExtractor):
 
     def clean_title(self, title):
-        """Clean title with the use of og:site_name
+        """
+        Clean title with the use of og:site_name
         in this case try to get ride of site name
         and use TITLE_SPLITTERS to reformat title
         """
@@ -51,53 +53,51 @@ def clean_title(self, title):
         # my wonderfull article | TechCrunch
         title_words = title.split()
 
-        # check for an empty title
-        # so that we don't get an IndexError below
-        if len(title_words) == 0:
-            return u""
-
-        # check if first letter is in TITLE_SPLITTERS
-        # if so remove it
-        if title_words[0] in TITLE_SPLITTERS:
-            title_words.pop(0)
-
-        # check if last letter is in TITLE_SPLITTERS
-        # if so remove it
-        if title_words[-1] in TITLE_SPLITTERS:
-            title_words.pop(-1)
-
-        # rebuild the title
-        title = u" ".join(title_words).strip()
+        title = u""
+        # check if first and last words are in TITLE_SPLITTERS
+        # if so remove them
+        for i in 0, -1:
+            if title_words and next ((
+                True for w in TITLE_SPLITTERS if title_words[i] in w), \
+                False):
+                title_words.pop(i)
+            # rebuild the title
+            title = u" ".join(title_words).strip()
 
         return title
 
     def get_title(self):
-        """\
+        """
         Fetch the article title and analyze it
         """
-        title = ''
-
-        # rely on opengraph in case we have the data
-        if "title" in self.article.opengraph.keys():
-            title = self.article.opengraph['title']
-            return self.clean_title(title)
-
-        # try to fetch the meta headline
-        meta_headline = self.parser.getElementsByTag(
-                            self.article.doc,
-                            tag="meta",
-                            attr="name",
-                            value="headline")
-        if meta_headline is not None and len(meta_headline) > 0:
-            title = self.parser.getAttribute(meta_headline[0], 'content')
-            return self.clean_title(title)
-
-        # otherwise use the title meta
-        title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
-        if title_element is not None and len(title_element) > 0:
-            title = self.parser.getText(title_element[0])
-            return self.clean_title(title)
-
+        title = u""
+        try:
+            # rely on opengraph in case we have the data
+            title_ = self.article.opengraph.get('title', '')
+            if title_:
+                # handle tags without any title: <meta property="og:title" />
+                title = self.clean_title(title_)
+            else:
+                # try to fetch the meta headline
+                meta_headline = self.parser.getElementsByTag(
+                                    self.article.doc,
+                                    tag="meta",
+                                    attr="name",
+                                    value="headline")
+                if meta_headline:
+                    title_ = self.parser.getAttribute(meta_headline[0], 'content')
+                    if title_:
+                        title = self.clean_title(title_)
+                else:
+                    # otherwise use the title meta
+                    title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
+                    if title_element:
+                        title_ = self.parser.getText(title_element[0])
+                        if title_:
+                            title = self.clean_title(title_)
+        except:
+            print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exc()
+
         return title
 
     def extract(self):