Skip to content

Commit 56864f2

Browse files
author
Burton DeWilde
committed Dec 23, 2016
Added explicit bytes in blocks, absolute imports in init
1 parent 4586f00 commit 56864f2

File tree

3 files changed

+28
-35
lines changed

3 files changed

+28
-35
lines changed
 

‎.gitignore

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
*.pyc
33
*.so
44
build/*
5+
dist/*
56
dragnet.egg-info
67

78
# cython temporary files
@@ -22,4 +23,3 @@ output/*
2223
*.swp
2324

2425
.vagrant
25-

‎dragnet/__init__.py

+9-11
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,12 @@
1-
#! /usr/bin/env python
2-
3-
from .arias import AriasFeatures, Arias
4-
from .blocks import Blockifier, PartialBlock, BlockifyError
5-
from .features import NormalizedFeature, CSSFeatures
6-
from .content_extraction_model import ContentExtractionModel
7-
from .kohlschuetter import kohlschuetter_features, kohlschuetter
8-
from .util import evaluation_metrics
9-
from .weninger import weninger_features_kmeans
10-
from .readability import readability_features
11-
from .models import content_extractor, content_comments_extractor
1+
from dragnet.arias import AriasFeatures, Arias
2+
from dragnet.blocks import Blockifier, PartialBlock, BlockifyError
3+
from dragnet.features import NormalizedFeature, CSSFeatures
4+
from dragnet.content_extraction_model import ContentExtractionModel
5+
from dragnet.kohlschuetter import kohlschuetter_features, kohlschuetter
6+
from dragnet.util import evaluation_metrics
7+
from dragnet.weninger import weninger_features_kmeans
8+
from dragnet.readability import readability_features
9+
from dragnet.models import content_extractor, content_comments_extractor
1210

1311

1412
class AllFeatures(object):

‎dragnet/blocks.pyx

+18-23
Original file line number | Diff line number | Diff line change
@@ -51,23 +51,21 @@ cdef inline int int_min(int a, int b): return a if a <= b else b
5151

5252
# tags we'll ignore completely
5353
cdef cpp_set[string] BLACKLIST
54-
BLACKLIST = set([
55-
'applet', 'area', 'base', 'basefont', 'bdo', 'button',
56-
'caption', 'fieldset', 'fram', 'frameset',
57-
'iframe', 'img', 'input', 'legend', 'link', 'menu', 'meta',
58-
'noframes', 'noscript', 'object', 'optgroup', 'option', 'param',
59-
'script', 'select', 'style', 'textarea', 'var', 'xmp',
60-
'like', 'like-box', 'plusone',
54+
BLACKLIST = {
55+
b'applet', b'area', b'base', b'basefont', b'bdo', b'button',
56+
b'caption', b'fieldset', b'fram', b'frameset',
57+
b'iframe', b'img', b'input', b'legend', b'link', b'menu', b'meta',
58+
b'noframes', b'noscript', b'object', b'optgroup', b'option', b'param',
59+
b'script', b'select', b'style', b'textarea', b'var', b'xmp',
60+
b'like', b'like-box', b'plusone',
6161
# HTML5 vector image tags and math tags
62-
'svg', 'math'
63-
])
62+
b'svg', b'math'
63+
}
6464

6565

6666
# tags defining the blocks we'll extract
6767
cdef cpp_set[string] BLOCKS
68-
BLOCKS = set([
69-
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'table', 'map',
70-
])
68+
BLOCKS = {b'h1', b'h2', b'h3', b'h4', b'h5', b'h6', b'p', b'div', b'table', b'map'}
7169

7270
# define some commonly used strings here, otherwise Cython will always add
7371
# a little python overhead when using them even though they are constant
@@ -87,14 +85,13 @@ re_readability_positive = re.compile('article|body|content|entry|hentry|main|pag
8785
cdef string DIV = <string>'div'
8886

8987
cdef cpp_set[string] READABILITY_PLUS3
90-
READABILITY_PLUS3 = set(["pre", "td", "blockquote"])
88+
READABILITY_PLUS3 = {b'pre', b'td', b'blockquote'}
9189

9290
cdef cpp_set[string] READABILITY_MINUS3
93-
READABILITY_MINUS3 = set(
94-
["address", "ol", "ul", "dl", "dd", "dt", "li", "form"])
91+
READABILITY_MINUS3 = {b'address', b'ol', b'ul', b'dl', b'dd', b'dt', b'li', b'form'}
9592

9693
cdef cpp_set[string] READABILITY_MINUS5
97-
READABILITY_MINUS5 = set(["h1", "h2", "h3", "h4", "h5", "h6", "th"])
94+
READABILITY_MINUS5 = {b'h1', b'h2', b'h3', b'h4', b'h5', b'h6', b'th'}
9895

9996

10097
cdef cpp_set[char] WHITESPACE = set([<char>' ', <char>'\t', <char>'\n',
@@ -571,7 +568,7 @@ cdef class PartialBlock:
571568
# finally store it
572569
self.class_weights.push_back(pair[uint32_t, int](self.tag_id, weight))
573570
self.class_weights_written.insert(self.tag_id)
574-
571+
575572
cdef void reinit_readability(self):
576573
self.ancestors_write = self.ancestors
577574

@@ -695,12 +692,12 @@ cdef class TagCountPB(PartialBlock):
695692
# Since we don't output empty blocks, we also keep track of the
696693
# tag count since the last block we output as an additional feature
697694
#
698-
695+
699696
# _tc = tag count in the current block, since the last <div>, <p>, etc.
700697
# _tc_lb = tag count since last block. This is the tag count in prior
701698
# empty blocks, accumulated since the last block was output, excluding
702699
# the current block
703-
700+
704701
# so tc gets updated with each tag
705702
# tc is reset on block formation, even for empty blocks
706703
#
@@ -776,7 +773,7 @@ xml_re = re.compile('<\?\s*xml[^>]*encoding\s*=\s*"{0,1}\s*([a-zA-Z0-9-]+)', re.
776773
def guess_encoding(s, default='utf-8'):
777774
"""Try to guess the encoding of s -- check the XML declaration
778775
and the HTML meta tag
779-
776+
780777
if default=CHARDET then use chardet to guess the default"""
781778
mo = xml_re.search(s)
782779
if mo:
@@ -820,7 +817,7 @@ class Blockifier(object):
820817
partial_block.add_block_to_results(results)
821818

822819
return results
823-
820+
824821

825822
@staticmethod
826823
def blockify(s, encoding=None,
@@ -878,5 +875,3 @@ class TagCountNoCSSReadabilityBlockifier(Blockifier):
878875
return Blockifier.blockify(s, encoding, pb=TagCountPB,
879876
do_css=False, do_readability=True,
880877
parse_callback=parse_callback)
881-
882-

0 commit comments

Comments
 (0)
Please sign in to comment.