Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions goose/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,14 @@
import os
import platform
from tempfile import mkstemp
import traceback
import sys

from goose.version import version_info, __version__
from goose.configuration import Configuration
from goose.crawler import CrawlCandidate
from goose.crawler import Crawler


class Goose(object):
"""\

Expand All @@ -48,7 +49,7 @@ def extend_config(self):
self.config = config

def extract(self, url=None, raw_html=None):
"""\
"""
Main method to extract an article object from a URL,
pass in a url and get back a Article
"""
Expand All @@ -61,12 +62,16 @@ def shutdown_network(self):
def crawl(self, crawl_candiate):
    """
    Crawl ``crawl_candiate`` with the configured parser and return the
    resulting article.

    If the parser fails with a ``UnicodeDecodeError`` or ``ValueError``,
    the configuration is switched to the first alternative parser and,
    when the candidate is a plain string, the crawl is retried.

    Any other error is reported on stderr and ``None`` is returned
    instead of propagating the exception.
    """
    # every available parser except the one currently configured
    parsers = list(self.config.available_parsers)
    parsers.remove(self.config.parser_class)

    article = None
    try:
        crawler = Crawler(self.config)
        article = crawler.crawl(crawl_candiate)
    except (UnicodeDecodeError, ValueError):
        # the current parser could not handle the input: fall back
        self.config.parser_class = parsers[0]
        # NOTE(review): only string candidates are retried; confirm that
        # callers passing other candidate types should get None here.
        if isinstance(crawl_candiate, basestring):
            return self.crawl(crawl_candiate)
    except Exception:
        # best effort: log the traceback and fall through to return None.
        # traceback.format_exc() is the correct name; "format_exec" does
        # not exist and would raise AttributeError inside this handler.
        sys.stderr.write(
            'Article Crawl error:  %s\n' % traceback.format_exc())
    return article

def initialize(self):
Expand Down
90 changes: 45 additions & 45 deletions goose/extractors/title.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
"""\
"""
This is a python port of "Goose" originally licensed to Gravity.com
under one or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
Expand All @@ -21,17 +21,19 @@
limitations under the License.
"""
import re

import traceback
import sys
from goose.extractors import BaseExtractor


# Separator tokens checked against the first and last words of a title.
TITLE_SPLITTERS = {u"|", u"-", u"»", u":"}


class TitleExtractor(BaseExtractor):

def clean_title(self, title):
"""Clean title with the use of og:site_name
"""
Clean title with the use of og:site_name
in this case try to get rid of site name
and use TITLE_SPLITTERS to reformat title
"""
Expand All @@ -51,53 +53,51 @@ def clean_title(self, title):
# my wonderfull article | TechCrunch
title_words = title.split()

# check for an empty title
# so that we don't get an IndexError below
if len(title_words) == 0:
return u""

# check if first letter is in TITLE_SPLITTERS
# if so remove it
if title_words[0] in TITLE_SPLITTERS:
title_words.pop(0)

# check if last letter is in TITLE_SPLITTERS
# if so remove it
if title_words[-1] in TITLE_SPLITTERS:
title_words.pop(-1)

# rebuild the title
title = u" ".join(title_words).strip()
title = u""
# check if first and last words are in TITLE_SPLITTERS
# if so remove them
for i in 0, -1:
if title_words and next ((
True for w in TITLE_SPLITTERS if title_words[i] in w), \
False):
title_words.pop(i)
# rebuild the title
title = u" ".join(title_words).strip()

return title

def get_title(self):
    """
    Fetch the article title and clean it.

    Sources are tried in order and the first non-empty value wins:

    1. the ``title`` entry of the article's opengraph data
    2. the ``content`` attribute of ``<meta name="headline">``
    3. the text of the ``<title>`` element

    Returns the value passed through ``clean_title``, or an empty
    unicode string when no source yields a title. Errors raised while
    looking the title up are reported on stderr and also result in an
    empty string.
    """
    try:
        # rely on opengraph in case we have the data; an empty value
        # (e.g. <meta property="og:title" />) falls through
        raw_title = self.article.opengraph.get('title', '')

        if not raw_title:
            # try to fetch the meta headline
            meta_headline = self.parser.getElementsByTag(
                self.article.doc,
                tag="meta",
                attr="name",
                value="headline")
            if meta_headline:
                raw_title = self.parser.getAttribute(
                    meta_headline[0], 'content')

        if not raw_title:
            # otherwise use the <title> element
            title_element = self.parser.getElementsByTag(
                self.article.doc, tag='title')
            if title_element:
                raw_title = self.parser.getText(title_element[0])

        if raw_title:
            return self.clean_title(raw_title)
    except Exception:
        # best effort: a missing title must not abort the extraction.
        # Catch Exception rather than using a bare except so that
        # SystemExit and KeyboardInterrupt still propagate.
        sys.stderr.write(
            'ERROR when getting title:  %s\n' % traceback.format_exc())

    return u""

def extract(self):
Expand Down