
Commit 964eb48

Lol4to committed
Fix unicode processing + nbsp support
* As STOP_WORDS are stored in unicode, we should keep our candidate words in unicode as well, so that candidates can be compared against the dictionary correctly.
* In some languages, short stopwords are linked to the next word in the sentence with a non-breaking space; Russian is an example. To recognize those stopwords, we should support nbsp when tokenizing.

This fixes grangier#223.
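To make the first point concrete, here is a minimal sketch (illustration only, not part of the commit, written as a Python 3 analogue of the Python 2 issue; the stopword values are made up): a candidate returned as an encoded byte string can never compare equal to a stopword stored as unicode text, so the stopword count silently drops to zero.

STOP_WORDS = {u'в', u'и', u'на'}                  # hypothetical entries; the real lists ship with goose's resources

candidate_bytes = u'в'.encode('utf-8')            # what the old remove_punctuation effectively returned
candidate_text = candidate_bytes.decode('utf-8')  # what the fixed version returns

assert candidate_bytes not in STOP_WORDS          # bytes never match the unicode set
assert candidate_text in STOP_WORDS               # decoded text matches as expected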
1 parent 09023ec commit 964eb48


goose/text.py

Lines changed: 6 additions & 3 deletions
@@ -28,6 +28,7 @@
 from goose.utils.encoding import smart_str
 from goose.utils.encoding import DjangoUnicodeDecodeError

+SPACE_SYMBOLS = re.compile(ur'[\s\xa0\t]')
 TABSSPACE = re.compile(r'[\s\t]+')


@@ -106,12 +107,14 @@ def __init__(self, language='en'):
     def remove_punctuation(self, content):
         # code taken form
         # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
+        translate = lambda data: data.translate(self.TRANS_TABLE, string.punctuation)
         if isinstance(content, unicode):
-            content = content.encode('utf-8')
-        return content.translate(self.TRANS_TABLE, string.punctuation)
+            return translate(content.encode('utf-8')).decode('utf-8')  # Don't forget to decode back if encoded
+        else:
+            return translate(content)

     def candiate_words(self, stripped_input):
-        return stripped_input.split(' ')
+        return re.split(SPACE_SYMBOLS, stripped_input)

     def get_stopword_count(self, content):
         if not content:

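The tokenizing change can be checked in isolation. Below is a minimal sketch (illustration only, Python 3, with a made-up Russian phrase; the commit itself targets Python 2, which is why the diff uses ur'' and unicode). Note that in Python 2 \s does not cover the no-break space unless re.UNICODE is set, which is why \xa0 is listed explicitly in the pattern.

import re

SPACE_SYMBOLS = re.compile(r'[\s\xa0\t]')   # \xa0 is the no-break space (nbsp)

text = u'в\xa0лесу родилась ёлочка'         # the stopword 'в' is glued to the next word with a nbsp

naive = text.split(' ')                     # old behaviour: split on plain spaces only
# ['в\xa0лесу', 'родилась', 'ёлочка'] -- 'в' is never seen as a candidate on its own

aware = [w for w in SPACE_SYMBOLS.split(text) if w]
# ['в', 'лесу', 'родилась', 'ёлочка'] -- 'в' becomes a separate candidate word

assert 'в' not in naive
assert 'в' in aware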