@@ -51,23 +51,21 @@ cdef inline int int_min(int a, int b): return a if a <= b else b
51
51
52
52
# tags we'll ignore completely
53
53
cdef cpp_set[string] BLACKLIST
54
- BLACKLIST = set ([
55
- ' applet' , ' area' , ' base' , ' basefont' , ' bdo' , ' button' ,
56
- ' caption' , ' fieldset' , ' fram' , ' frameset' ,
57
- ' iframe' , ' img' , ' input' , ' legend' , ' link' , ' menu' , ' meta' ,
58
- ' noframes' , ' noscript' , ' object' , ' optgroup' , ' option' , ' param' ,
59
- ' script' , ' select' , ' style' , ' textarea' , ' var' , ' xmp' ,
60
- ' like' , ' like-box' , ' plusone' ,
54
+ BLACKLIST = {
55
+ b ' applet' , b ' area' , b ' base' , b ' basefont' , b ' bdo' , b ' button' ,
56
+ b ' caption' , b ' fieldset' , b ' fram' , b ' frameset' ,
57
+ b ' iframe' , b ' img' , b ' input' , b ' legend' , b ' link' , b ' menu' , b ' meta' ,
58
+ b ' noframes' , b ' noscript' , b ' object' , b ' optgroup' , b ' option' , b ' param' ,
59
+ b ' script' , b ' select' , b ' style' , b ' textarea' , b ' var' , b ' xmp' ,
60
+ b ' like' , b ' like-box' , b ' plusone' ,
61
61
# HTML5 vector image tags and math tags
62
- ' svg' , ' math'
63
- ])
62
+ b ' svg' , b ' math'
63
+ }
64
64
65
65
66
66
# tags defining the blocks we'll extract
67
67
cdef cpp_set[string] BLOCKS
68
- BLOCKS = set ([
69
- ' h1' , ' h2' , ' h3' , ' h4' , ' h5' , ' h6' , ' p' , ' div' , ' table' , ' map' ,
70
- ])
68
+ BLOCKS = {b' h1' , b' h2' , b' h3' , b' h4' , b' h5' , b' h6' , b' p' , b' div' , b' table' , b' map' }
71
69
72
70
# define some commonly used strings here, otherwise Cython will always add
73
71
# a little python overhead when using them even though they are constant
@@ -87,14 +85,13 @@ re_readability_positive = re.compile('article|body|content|entry|hentry|main|pag
87
85
cdef string DIV = < string> ' div'
88
86
89
87
cdef cpp_set[string] READABILITY_PLUS3
90
- READABILITY_PLUS3 = set ([ " pre" , " td " , " blockquote" ])
88
+ READABILITY_PLUS3 = {b ' pre' , b ' td ' , b ' blockquote' }
91
89
92
90
cdef cpp_set[string] READABILITY_MINUS3
93
- READABILITY_MINUS3 = set (
94
- [" address" , " ol" , " ul" , " dl" , " dd" , " dt" , " li" , " form" ])
91
+ READABILITY_MINUS3 = {b' address' , b' ol' , b' ul' , b' dl' , b' dd' , b' dt' , b' li' , b' form' }
95
92
96
93
cdef cpp_set[string] READABILITY_MINUS5
97
- READABILITY_MINUS5 = set ([ " h1 " , " h2 " , " h3 " , " h4 " , " h5 " , " h6 " , " th " ])
94
+ READABILITY_MINUS5 = {b ' h1 ' , b ' h2 ' , b ' h3 ' , b ' h4 ' , b ' h5 ' , b ' h6 ' , b ' th ' }
98
95
99
96
100
97
cdef cpp_set[char ] WHITESPACE = set ([< char > ' ' , < char > ' \t ' , < char > ' \n ' ,
@@ -571,7 +568,7 @@ cdef class PartialBlock:
571
568
# finally store it
572
569
self .class_weights.push_back(pair[uint32_t, int ](self .tag_id, weight))
573
570
self .class_weights_written.insert(self .tag_id)
574
-
571
+
575
572
cdef void reinit_readability(self ):
576
573
self .ancestors_write = self .ancestors
577
574
@@ -695,12 +692,12 @@ cdef class TagCountPB(PartialBlock):
695
692
# Since we don't output empty blocks, we also keep track of the
696
693
# tag count since the last block we output as an additional feature
697
694
#
698
-
695
+
699
696
# _tc = tag count in the current block, since the last <div>, <p>, etc.
700
697
# _tc_lb = tag count since last block. This is the tag count in prior
701
698
# empty blocks, accumulated since the last block was output, excluding
702
699
# the current block
703
-
700
+
704
701
# so tc gets updated with each tag
705
702
# tc is reset on block formation, even for empty blocks
706
703
#
@@ -776,7 +773,7 @@ xml_re = re.compile('<\?\s*xml[^>]*encoding\s*=\s*"{0,1}\s*([a-zA-Z0-9-]+)', re.
776
773
def guess_encoding (s , default = ' utf-8' ):
777
774
""" Try to guess the encoding of s -- check the XML declaration
778
775
and the HTML meta tag
779
-
776
+
780
777
if default=CHARDET then use chardet to guess the default"""
781
778
mo = xml_re.search(s)
782
779
if mo:
@@ -820,7 +817,7 @@ class Blockifier(object):
820
817
partial_block.add_block_to_results(results)
821
818
822
819
return results
823
-
820
+
824
821
825
822
@staticmethod
826
823
def blockify (s , encoding = None ,
@@ -878,5 +875,3 @@ class TagCountNoCSSReadabilityBlockifier(Blockifier):
878
875
return Blockifier.blockify(s, encoding, pb = TagCountPB,
879
876
do_css = False , do_readability = True ,
880
877
parse_callback = parse_callback)
881
-
882
-
0 commit comments