diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e0703d55..6f16f0ca 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -26,9 +26,11 @@ KNOWN_ARTICLE_CONTENT_TAGS = [ + {'tag': 'article'}, {'attr': 'itemprop', 'value': 'articleBody'}, {'attr': 'class', 'value': 'post-content'}, - {'tag': 'article'}, + {'attr': 'class', 'value': "story-body-text"}, + {'attr': 'class', 'value': "story-content"}, ] diff --git a/goose/extractors/title.py b/goose/extractors/title.py index 31d69840..13d2f102 100644 --- a/goose/extractors/title.py +++ b/goose/extractors/title.py @@ -63,7 +63,7 @@ def clean_title(self, title): # check if last letter is in TITLE_SPLITTERS # if so remove it - if title_words[-1] in TITLE_SPLITTERS: + if len(title_words) != 0 and title_words[-1] in TITLE_SPLITTERS: title_words.pop(-1) # rebuild the title diff --git a/goose/version.py b/goose/version.py index fedcbb6d..bfb200a2 100644 --- a/goose/version.py +++ b/goose/version.py @@ -21,5 +21,5 @@ limitations under the License. """ -version_info = (1, 0, 25) +version_info = (1, 0, 26) __version__ = ".".join(map(str, version_info)) diff --git a/tests/data/extractors/content/test_newyorktimes.html b/tests/data/extractors/content/test_newyorktimes.html new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/extractors/content/test_newyorktimes.json b/tests/data/extractors/content/test_newyorktimes.json new file mode 100644 index 00000000..e69de29b