diff --git a/README.md b/README.md index 82aebd1..9751fde 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ -## Personal Search engine +## Personal Search engine (python 3) ##### Combined Bookmarks and external search + ### What is this ? Aren't you frustrated having a boatload of quality bookmarks, but not using them because it is faster to @@ -52,6 +53,7 @@ You would need to install scikit-learn (for Tfidf support) and Flask for the web > pip install flask-bootstrap ``` + #### Create url.lst file. Next either create manually url.lst file in the data directory or generate one using bin/bm2urlst.py. @@ -69,6 +71,9 @@ you will use to do the searches. > python idx.py ``` +(4/14/19 note): I forgot to perform this step, and was scratching my head over a `vocabulary.csv does not exist` (paraphrased) error. + + ### Run the cmd-line app There cmd line app, is mainly for testing purposes. diff --git a/TODO.txt b/TODO.txt index 7d8d09e..9ce5b64 100644 --- a/TODO.txt +++ b/TODO.txt @@ -6,4 +6,6 @@ 6. Add configration pane 7. Add ability to upload URL-lists, even better manage different URL lists i.e. indecies !? 8. Rewrite the Bookmark parser in Python -9. Capture and show "Did you mean" from Google search! \ No newline at end of file +9. Capture and show "Did you mean" from Google search! +10. Port main processes to python 3 +11. 
Speed up matrix processing \ No newline at end of file diff --git a/bin/bm2urlst.pl b/bin/bm2urlst.pl index 99d3210..fd1e401 100755 --- a/bin/bm2urlst.pl +++ b/bin/bm2urlst.pl @@ -6,18 +6,18 @@ my $bkmark = $ARGV[0]; my $grep_str = $ARGV[1]; -unless ($bkmark) { print "Please specify bookmark file\n"; exit}; -unless (-e $bkmark) { print "The Bookmark you specified does not exist\n"; exit}; +unless ($bkmark) { print("Please specify bookmark file\n"); exit}; +unless (-e $bkmark) { print ("The Bookmark you specified does not exist\n"); exit}; undef $/; -open my $fh, "<$bkmark" or die print $!; +open my $fh, "<$bkmark" or die print( $!); my $str = <$fh>; close $fh; -open my $fh, ">url.lst" or die print "cant create url.lst : $!"; +open my $fh, ">url.lst" or die print ("cant create url.lst : $!"); my $dom = new Mojo::DOM($str); for my $el ( $dom->find('a')->each ) {;#->map(attr => 'HREF')->join("\n"); - print $el->{href} . ' : ' . $el->text . "\n"; - print $fh $el->{href} . "\n" if $el->{href} =~ /^http/; + print ($el->{href} . ' : ' . $el->text . "\n"); + print ($fh $el->{href} . 
"\n" if $el->{href} =~ /^http/); } close $fh; diff --git a/bin/bm2urlst.py b/bin/bm2urlst.py index 49137e3..bb6d11f 100755 --- a/bin/bm2urlst.py +++ b/bin/bm2urlst.py @@ -5,7 +5,7 @@ if len(sys.argv) == 1 : - print "Please specify bookmark file" + print("Please specify bookmark file") sys.exit() bkmark = sys.argv[1]; @@ -16,4 +16,4 @@ results = doc.xpath('//a') for r in results : h = r.get('href') - if re.search(r'^http',h) : print h + if re.search(r'^http',h) : print (h) diff --git a/bin/query.py b/bin/query.py index 18da456..5b97dcf 100755 --- a/bin/query.py +++ b/bin/query.py @@ -12,17 +12,17 @@ def google_search(query): s = GoogleSearch() s.search(query) for info in s.results() : - print '-' * 50 - print info - print s.excerpt(info['id'],5,50) + print('-' * 50) + print(info) + print(s.excerpt(info['id'],5,50)) def bmark_search(query): s = BmarkSearch() s.search(query) for info in s.results() : - print '-' * 50 - print info - print s.excerpt(info['id'],5,50) + print('-' * 50) + print(info) + print( s.excerpt(info['id'],5,50)) def main(arguments): diff --git a/data/url.lst b/data/url.lst index 24d3732..48f2920 100644 --- a/data/url.lst +++ b/data/url.lst @@ -22,4 +22,4 @@ http://faculty.msb.edu/hasnasj/GTWebSite/MythWeb.htm https://github.com/psyeugenic/eep/blob/egil/maps/eeps/eep-0043.md http://www.victorianweb.org/science/ether.htm http://www.brighthub.com/science/space/articles/32392.aspx -http://wfhummel.cnchost.com/bankingbasics.html +http://wfhummel.cnchost.com/bankingbasics.html \ No newline at end of file diff --git a/lib/bmark_search.py b/lib/bmark_search.py index bedcaf7..fe3b84a 100644 --- a/lib/bmark_search.py +++ b/lib/bmark_search.py @@ -113,7 +113,7 @@ def doc_info(self, idx): def excerpt(self,idx,num_lines=10,char_count=None): file_name = os.path.join(Utils.tmp_dir, str(idx) + ".txt") head = '' - with open(file_name,"r") as txt : + with open(file_name,"r", encoding = "utf-8") as txt : # does this need explicit decoding? 
for x in range(num_lines) : head += txt.read() if char_count == None : return head else : return head[:char_count] diff --git a/lib/google_search.py b/lib/google_search.py index ff5cfc3..f4ef399 100644 --- a/lib/google_search.py +++ b/lib/google_search.py @@ -18,7 +18,7 @@ def search(self,query): #if isinstance(query,list) : query = ' '.join(query) query = re.sub(r'\+','', query) #remove + you may have used for bmark search url = self.qurl + query - print url + print(url) resp = requests.get(url, timeout=10, allow_redirects=True, headers=self.user_agent)#, config=debug) if resp.ok : #with open(Utils.tmp_dir + '/google.html', 'w') as html : print html.write(resp.content) diff --git a/lib/indexer.py b/lib/indexer.py index 751a47d..5415081 100644 --- a/lib/indexer.py +++ b/lib/indexer.py @@ -18,7 +18,7 @@ class Indexer: @staticmethod def urls(url_list): - for url in open(url_list,'r'): + for url in open(url_list,'r',encoding='utf8'): # added encoding #empty string or comment if not url.strip() or url.strip().startswith('#') : continue yield url.rstrip("\n") @@ -26,7 +26,7 @@ def urls(url_list): @staticmethod def save2file(name, string): - with open(name,'w') as txt: txt.write(string) + with open(name,'w', encoding='utf8') as txt: txt.write(string) # added encoding def __init__(self,levels=200): self.levels = levels @@ -46,6 +46,7 @@ def cleanup(self,string): self.log.info('*** The page has no tag, skipping ... 
***') return None txt = body[0].text_content().encode(errors='ignore') #encode('utf-8') + txt = txt.decode('utf-8') # added decoding txt = re.sub('\s*\n\s*', '\n', txt) txt = re.sub('[ \t]{2,}', ' ', txt) if not txt : self.log.error("Empty html") @@ -70,14 +71,14 @@ def fetch(self,url,idx): self.log.error("*(%s) %s : %s" % (resp.status_code, resp.reason, resp.headers['Content-Type'])) return None except Exception as e: - print Exception(e) - print "Err processing %s" % url + print(Exception(e)) + print("Err processing %s" % url) def files(self,start_dir=Utils.tmp_dir): for f in self.file_list : - print "tfidf processing: %s" % f - yield open(join(start_dir, f),'r').read() + print("tfidf processing: %s" % f) + yield open(join(start_dir, f),'r', encoding='utf8').read() # edited for explicit encoding #the file list has to be in numerical order (so that tfidf matrix doc idx follow fetch sequence) otherwise indexing goes out of touch diff --git a/lib/utils.py b/lib/utils.py index fe3ff08..4f74b95 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -36,7 +36,7 @@ def init_logger(log_file='indexer.log'): @staticmethod def dict2csv(ary, path=None, fname='vocabulary.csv'): if not path : path = Utils.data_dir - w = csv.writer(open(join(path,fname), "w"), lineterminator="\n") + w = csv.writer(open(join(path,fname), "w"), lineterminator="\n") # text mode: uses the locale's default encoding unless encoding= is passed for key, val in ary.items(): w.writerow([key, val]) @staticmethod @@ -51,12 +51,12 @@ def csv2dict(path=None, fname='vocabulary.csv', val_int=True): @staticmethod def pkl2file(data,path=None,dst_file='vocabulary.pkl'): if not path : path = Utils.data_dir - with open(join(path, dst_file), "wb") as f : pickle.dump(data,f) + with open(join(path, dst_file), "wb") as f : pickle.dump(data,f) # binary mode: pickle writes bytes, so no text encoding applies
@staticmethod def file2pkl(path=None,src_file='vocabulary.pkl'): if not path : path = Utils.data_dir - with open(join(path,src_file), "rb") as f : data = pickle.load(f) + with open(join(path,src_file), "rb") as f : data = pickle.load(f) # binary mode: pickle reads bytes, so no text encoding applies return data @staticmethod diff --git a/site/app/__init__.py b/site/app/__init__.py index a809474..a5fcd4e 100644 --- a/site/app/__init__.py +++ b/site/app/__init__.py @@ -1,6 +1,7 @@ from flask import Flask from config import config -from flask.ext.bootstrap import Bootstrap +# from flask.ext.bootstrap import Bootstrap +from flask_bootstrap import Bootstrap # updated bootstrap = Bootstrap() def create_app(cfg_name): diff --git a/site/app/pse/routes.py b/site/app/pse/routes.py index 4553fef..134a6d9 100644 --- a/site/app/pse/routes.py +++ b/site/app/pse/routes.py @@ -16,7 +16,7 @@ def search(): google = GoogleSearch() bmark = BmarkSearch() - if request.form.has_key('q') : + if 'q' in request.form: # updated from if request.form.has_key('q') : q = request.form['q'] if len(q) > 0 : @@ -24,12 +24,12 @@ def search(): try: bmark.search(q) except Exception as e : - flash('Bmark search: ' + e.message) + flash('Bmark search: ' + str(e)) # updated from flash('Bmark search: ' + e.message) try : google.search(q) except Exception as e: - flash("Google search error : " + e.message) + flash("Google search error : " + str(e)) # updated from flash("Google search error : " + e.message) else: flash('Interesting what will happen if you search for something rather than nothing !!') diff --git a/site/manage.py b/site/manage.py index dc101c6..45bcb0c 100644 --- a/site/manage.py +++ b/site/manage.py @@ -1,7 +1,8 @@ #!/usr/bin/env python import os from app import create_app -from flask.ext.script import Manager +# from flask.ext.script import Manager # the flask.ext.* import shim was deprecated and then removed in Flask 1.0
+from flask_script import Manager # updated app = create_app(os.getenv('FLASK_CONFIG') or 'default') manager = Manager(app)