Skip to content

Commit dbbbb7f

Browse files
author
Lol4to
committed
Merge in Python 3 support from Vetal4444
See https://github.com/vetal4444/python-goose/tree/python_3 (grangier#220). Conflicts: goose/text.py
2 parents 964eb48 + 1ef277b commit dbbbb7f

31 files changed

+227
-84
lines changed

.travis.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,10 @@ language: python
33
python:
44
- 2.6
55
- 2.7
6+
- 3.4
67

78
install:
8-
- pip install -r requirements.txt --use-mirrors
9+
- pip install jieba
910
- python setup.py install
1011

1112
script: python setup.py test

goose/__init__.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,6 @@
2121
limitations under the License.
2222
"""
2323
import os
24-
import platform
2524
from tempfile import mkstemp
2625

2726
from goose.version import version_info, __version__
@@ -64,9 +63,12 @@ def crawl(self, crawl_candiate):
6463
try:
6564
crawler = Crawler(self.config)
6665
article = crawler.crawl(crawl_candiate)
67-
except (UnicodeDecodeError, ValueError):
68-
self.config.parser_class = parsers[0]
69-
return self.crawl(crawl_candiate)
66+
except (UnicodeDecodeError, ValueError) as e:
67+
if parsers:
68+
self.config.parser_class = parsers[0]
69+
return self.crawl(crawl_candiate)
70+
else:
71+
raise e
7072
return article
7173

7274
def initialize(self):

goose/cleaners.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020
See the License for the specific language governing permissions and
2121
limitations under the License.
2222
"""
23+
from __future__ import unicode_literals
24+
2325
from goose.utils import ReplaceSequence
2426

2527

goose/configuration.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,9 @@
2222
"""
2323
import os
2424
import tempfile
25+
26+
import six
27+
2528
from goose.text import StopWords
2629
from goose.parsers import Parser
2730
from goose.parsers import ParserSoup
@@ -30,10 +33,12 @@
3033
HTTP_DEFAULT_TIMEOUT = 30
3134

3235
AVAILABLE_PARSERS = {
33-
'lxml': Parser,
34-
'soup': ParserSoup,
36+
'lxml': Parser
3537
}
3638

39+
if six.PY2:
40+
AVAILABLE_PARSERS['soup'] = ParserSoup
41+
3742

3843
class Configuration(object):
3944

goose/extractors/content.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -260,7 +260,7 @@ def update_score(self, node, addToScore):
260260
if score_string:
261261
current_score = int(score_string)
262262

263-
new_score = current_score + addToScore
263+
new_score = current_score + int(addToScore)
264264
self.parser.setAttribute(node, "gravityScore", str(new_score))
265265

266266
def update_node_count(self, node, add_to_count):

goose/extractors/images.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import re
2424
import os
2525

26-
from urlparse import urlparse, urljoin
26+
from six.moves.urllib.parse import urlparse, urljoin
2727

2828
from goose.extractors import BaseExtractor
2929
from goose.image import Image

goose/extractors/metas.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
"""
2323

2424
import re
25-
from urlparse import urljoin
26-
from urlparse import urlparse
25+
26+
from six.moves.urllib.parse import urlparse, urljoin
2727

2828
from goose.extractors import BaseExtractor
2929

goose/image.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def __init__(self):
4646
self.extraction_type = "NA"
4747

4848
# stores how many bytes this image is.
49-
self.bytes = long(0)
49+
self.bytes = 0
5050

5151
def get_src(self):
5252
return self.src
@@ -87,7 +87,7 @@ def set_mime_type(self, mime_type):
8787
class LocallyStoredImage(object):
8888

8989
def __init__(self, src='', local_filename='',
90-
link_hash='', bytes=long(0), file_extension='', height=0, width=0):
90+
link_hash='', bytes=0, file_extension='', height=0, width=0):
9191
self.src = src
9292
self.local_filename = local_filename
9393
self.link_hash = link_hash

goose/network.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,12 @@
2020
See the License for the specific language governing permissions and
2121
limitations under the License.
2222
"""
23-
import urllib2
23+
import six
24+
25+
try:
26+
from urllib2 import urlopen, Request
27+
except ImportError:
28+
from urllib.request import urlopen, Request
2429

2530

2631
class HtmlFetcher(object):
@@ -39,18 +44,14 @@ def get_url(self):
3944

4045
def get_html(self, url):
4146
# utf-8 encode unicode url
42-
if isinstance(url, unicode):
47+
if isinstance(url, six.text_type) and six.PY2:
4348
url = url.encode('utf-8')
4449

4550
# set request
46-
self.request = urllib2.Request(
47-
url,
48-
headers=self.headers)
51+
self.request = Request(url, headers=self.headers)
4952
# do request
5053
try:
51-
self.result = urllib2.urlopen(
52-
self.request,
53-
timeout=self.config.http_timeout)
54+
self.result = urlopen(self.request, timeout=self.config.http_timeout)
5455
except Exception:
5556
self.result = None
5657

goose/outputformatters.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,8 @@
2020
See the License for the specific language governing permissions and
2121
limitations under the License.
2222
"""
23-
from HTMLParser import HTMLParser
23+
from six.moves.html_parser import HTMLParser
24+
2425
from goose.text import innerTrim
2526

2627

0 commit comments

Comments
 (0)