vsraptor · wuben3125 · Mar 26, 2020
diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
-## Personal Search engine
+## Personal Search engine (Python 3)
 ##### Combined Bookmarks and external search
 
+
 ### What is this ?
 
 Aren't you frustrated having a boatload of quality bookmarks, but not using them because it is faster to
@@ -52,6 +53,7 @@ You would need to install scikit-learn (for Tfidf support) and Flask for the web
 > pip install flask-bootstrap
 ```
 
+
 #### Create url.lst file.
 
 Next either create manually url.lst file in the data directory or generate one using bin/bm2urlst.py.
@@ -69,6 +71,9 @@ you will use to do the searches.
 > python idx.py
 ```
 
+(4/14/19 note): Do not skip this step. I forgot to run it and was left scratching my head over an error along the lines of `vocabulary.csv does not exist` (paraphrased); running `python idx.py` first resolves it.
+
+
 ### Run the cmd-line app
 
 There cmd line app, is mainly for testing purposes.

diff --git a/TODO.txt b/TODO.txt
@@ -6,4 +6,6 @@
 6. Add configration pane
 7. Add ability to upload URL-lists, even better manage different URL lists i.e. indecies !?
 8. Rewrite the Bookmark parser in Python
-9. Capture and show "Did you mean" from Google search!
+9. Capture and show "Did you mean" from Google search!
+10. Port main processes to Python 3
+11. Speed up matrix processing
diff --git a/bin/bm2urlst.pl b/bin/bm2urlst.pl
@@ -6,18 +6,18 @@
 
 my $bkmark = $ARGV[0];
 my $grep_str = $ARGV[1];
-unless ($bkmark) { print "Please specify bookmark file\n"; exit};
-unless (-e $bkmark) { print "The Bookmark you specified does not exist\n"; exit};
+unless ($bkmark) { print("Please specify bookmark file\n"); exit};
+unless (-e $bkmark) { print ("The Bookmark you specified does not exist\n"); exit};
 
 undef $/;
-open my $fh, "<$bkmark" or die print $!;
+open my $fh, "<$bkmark" or die print( $!);
 my $str = <$fh>;
 close $fh;
 
-open my $fh, ">url.lst" or die print "cant create url.lst : $!";
+open my $fh, ">url.lst" or die print ("cant create url.lst : $!");
 my $dom = new Mojo::DOM($str);
 for my $el ( $dom->find('a')->each ) {;#->map(attr => 'HREF')->join("\n");
-	print $el->{href} . ' : ' . $el->text . "\n";
-	print $fh $el->{href} . "\n" if $el->{href} =~ /^http/;
+	print ($el->{href} . ' : ' . $el->text . "\n");
+	print($fh $el->{href} . "\n") if $el->{href} =~ /^http/; # the `if` modifier must stay outside the call parens, otherwise Perl reports a syntax error
 }
 close $fh;
diff --git a/bin/bm2urlst.py b/bin/bm2urlst.py
@@ -5,7 +5,7 @@
 
 
 if len(sys.argv) == 1 :
-	print "Please specify bookmark file"
+	print("Please specify bookmark file")
 	sys.exit()
 
 bkmark = sys.argv[1];
@@ -16,4 +16,4 @@
 results = doc.xpath('//a')
 for r in results :
 	h = r.get('href')
-	if re.search(r'^http',h) : print h
+	if re.search(r'^http',h) : print (h)
diff --git a/bin/query.py b/bin/query.py
@@ -12,17 +12,17 @@ def google_search(query):
 	s = GoogleSearch()
 	s.search(query)
 	for info in s.results() :
-		print '-' * 50
-		print info
-		print s.excerpt(info['id'],5,50)
+		print('-' * 50)
+		print(info)
+		print(s.excerpt(info['id'],5,50))
 
 def bmark_search(query):
 	s = BmarkSearch()
 	s.search(query)
 	for info in s.results() :
-		print '-' * 50
-		print info
-		print s.excerpt(info['id'],5,50)
+		print('-' * 50)
+		print(info)
+		print( s.excerpt(info['id'],5,50))
 
 
 def main(arguments):

diff --git a/data/url.lst b/data/url.lst
@@ -22,4 +22,4 @@ http://faculty.msb.edu/hasnasj/GTWebSite/MythWeb.htm
 https://github.com/psyeugenic/eep/blob/egil/maps/eeps/eep-0043.md
 http://www.victorianweb.org/science/ether.htm
 http://www.brighthub.com/science/space/articles/32392.aspx
-http://wfhummel.cnchost.com/bankingbasics.html
+http://wfhummel.cnchost.com/bankingbasics.html
diff --git a/lib/bmark_search.py b/lib/bmark_search.py
@@ -113,7 +113,7 @@ def doc_info(self, idx):
 	def excerpt(self,idx,num_lines=10,char_count=None):
 		file_name = os.path.join(Utils.tmp_dir, str(idx) + ".txt")
 		head = ''
-		with open(file_name,"r") as txt :
+		with open(file_name,"r", encoding = "utf-8") as txt : # text mode with encoding= already decodes to str; no explicit decode needed
 			for x in range(num_lines) : head += txt.read()
 		if char_count == None : return head
 		else : return head[:char_count]

diff --git a/lib/google_search.py b/lib/google_search.py
@@ -18,7 +18,7 @@ def search(self,query):
 		#if isinstance(query,list) : query = ' '.join(query)
 		query = re.sub(r'\+','', query) #remove + you may have used for bmark search
 		url = self.qurl + query
-		print url
+		print(url)
 		resp = requests.get(url, timeout=10, allow_redirects=True, headers=self.user_agent)#, config=debug)
 		if resp.ok :
 			#with open(Utils.tmp_dir + '/google.html', 'w') as html : print html.write(resp.content)

diff --git a/lib/indexer.py b/lib/indexer.py
@@ -18,15 +18,15 @@ class Indexer:
 
 	@staticmethod
 	def urls(url_list):
-		for url in open(url_list,'r'):
+		for url in open(url_list,'r',encoding='utf8'): # added encoding
 			#empty string or comment
 			if not url.strip() or url.strip().startswith('#') : continue
 			yield url.rstrip("\n")
 
 
 	@staticmethod
 	def save2file(name, string):
-		with open(name,'w') as txt: txt.write(string)
+		with open(name,'w', encoding='utf8') as txt: txt.write(string) # added encoding
 
 	def __init__(self,levels=200):
 		self.levels = levels
@@ -46,6 +46,7 @@ def cleanup(self,string):
 			self.log.info('*** The page has no <body> tag, skipping ... ***')
 			return None
 		txt = body[0].text_content().encode(errors='ignore') #encode('utf-8')
+		txt = txt.decode('utf-8') # added decoding        
 		txt = re.sub('\s*\n\s*', '\n', txt)
 		txt = re.sub('[ \t]{2,}', ' ', txt)
 		if not txt : self.log.error("Empty html")
@@ -70,14 +71,14 @@ def fetch(self,url,idx):
 				self.log.error("*(%s) %s : %s" % (resp.status_code, resp.reason, resp.headers['Content-Type']))
 				return None
 		except Exception as e:
-			print Exception(e)
-			print "Err processing %s" % url
+			print(Exception(e))
+			print("Err processing %s" % url)
 
 
 	def files(self,start_dir=Utils.tmp_dir):
 		for f in self.file_list :
-			print "tfidf processing: %s" % f
-			yield open(join(start_dir, f),'r').read()
+			print("tfidf processing: %s" % f)
+			yield open(join(start_dir, f),'r', encoding='utf8').read() # edited for explicit encoding
 
 
 	#the file list has to be in numerical order (so that tfidf matrix doc idx follow fetch sequence) otherwise indexing goes out of touch

diff --git a/lib/utils.py b/lib/utils.py
@@ -36,7 +36,7 @@ def init_logger(log_file='indexer.log'):
 	@staticmethod
 	def dict2csv(ary, path=None, fname='vocabulary.csv'):
 		if not path : path = Utils.data_dir
-		w = csv.writer(open(join(path,fname), "w"), lineterminator="\n")
+		w = csv.writer(open(join(path,fname), "w"), lineterminator="\n") # does this need explicit encoding? 
 		for key, val in ary.items(): w.writerow([key, val])
 
 	@staticmethod
@@ -51,12 +51,12 @@ def csv2dict(path=None, fname='vocabulary.csv', val_int=True):
 	@staticmethod
 	def pkl2file(data,path=None,dst_file='vocabulary.pkl'):
 		if not path : path = Utils.data_dir
-		with open(join(path, dst_file), "wb") as f : pickle.dump(data,f)
+		with open(join(path, dst_file), "wb") as f : pickle.dump(data,f) # binary mode: pickle writes bytes, so no text encoding applies
 
 	@staticmethod
 	def file2pkl(path=None,src_file='vocabulary.pkl'):
 		if not path : path = Utils.data_dir
-		with open(join(path,src_file), "rb") as f : data = pickle.load(f)
+		with open(join(path,src_file), "rb") as f : data = pickle.load(f) # binary mode: pickle reads bytes, so no text encoding applies
 		return data
 
 	@staticmethod

diff --git a/site/app/__init__.py b/site/app/__init__.py
@@ -1,6 +1,7 @@
 from flask import Flask
 from config import config
-from flask.ext.bootstrap import Bootstrap
+# from flask.ext.bootstrap import Bootstrap
+from flask_bootstrap import Bootstrap # updated
 bootstrap = Bootstrap()
 
 def create_app(cfg_name):

diff --git a/site/app/pse/routes.py b/site/app/pse/routes.py
@@ -16,20 +16,20 @@ def search():
 	google = GoogleSearch()
 	bmark = BmarkSearch()
 
-	if request.form.has_key('q') :
+	if 'q' in request.form: # updated from if request.form.has_key('q') :
 		q = request.form['q']
 
 		if len(q) > 0 :
 
 			try:
 				bmark.search(q)
 			except Exception as e :
-				flash('Bmark search: ' + e.message)
+				flash('Bmark search: ' + str(e)) # updated from flash('Bmark search: ' + e.message)
 
 			try :
 				google.search(q)
 			except Exception as e:
-				flash("Google search error : " + e.message)
+				flash("Google search error : " + str(e)) # updated from flash("Google search error : " + e.message)
 
 		else:
 			flash('Interesting what will happen if you search for something rather than nothing !!')

diff --git a/site/manage.py b/site/manage.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 import os
 from app import create_app
-from flask.ext.script import Manager
+# from flask.ext.script import Manager # the flask.ext.* import shim was deprecated and later removed from Flask; import the extension package directly
+from flask_script import Manager # updated
 
 app = create_app(os.getenv('FLASK_CONFIG') or 'default')
 manager = Manager(app)