Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
## Personal Search engine
## Personal Search engine (python 3)
##### Combined Bookmarks and external search


### What is this ?

Aren't you frustrated having a boatload of quality bookmarks, but not using them because it is faster to
Expand Down Expand Up @@ -52,6 +53,7 @@ You would need to install scikit-learn (for Tfidf support) and Flask for the web
> pip install flask-bootstrap
```


#### Create url.lst file.

Next, either manually create the url.lst file in the data directory or generate one using bin/bm2urlst.py.
Expand All @@ -69,6 +71,9 @@ you will use to do the searches.
> python idx.py
```

(4/14/19 note): I forgot to perform this step and was left scratching my head over a `vocabulary.csv does not exist` (paraphrased) error, so make sure to run it before searching.


### Run the cmd-line app

The cmd-line app is mainly for testing purposes.
Expand Down
4 changes: 3 additions & 1 deletion TODO.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@
6. Add configuration pane
7. Add ability to upload URL lists, or even better, manage different URL lists, i.e. indices!?
8. Rewrite the Bookmark parser in Python
9. Capture and show "Did you mean" from Google search!
9. Capture and show "Did you mean" from Google search!
10. Port main processes to python 3
11. Speed up matrix processing
12 changes: 6 additions & 6 deletions bin/bm2urlst.pl
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@

my $bkmark = $ARGV[0];
my $grep_str = $ARGV[1];
unless ($bkmark) { print "Please specify bookmark file\n"; exit};
unless (-e $bkmark) { print "The Bookmark you specified does not exist\n"; exit};
unless ($bkmark) { print("Please specify bookmark file\n"); exit};
unless (-e $bkmark) { print ("The Bookmark you specified does not exist\n"); exit};

undef $/;
open my $fh, "<$bkmark" or die print $!;
open my $fh, "<$bkmark" or die print( $!);
my $str = <$fh>;
close $fh;

open my $fh, ">url.lst" or die print "cant create url.lst : $!";
open my $fh, ">url.lst" or die print ("cant create url.lst : $!");
my $dom = new Mojo::DOM($str);
for my $el ( $dom->find('a')->each ) {;#->map(attr => 'HREF')->join("\n");
print $el->{href} . ' : ' . $el->text . "\n";
print $fh $el->{href} . "\n" if $el->{href} =~ /^http/;
print ($el->{href} . ' : ' . $el->text . "\n");
print ($fh $el->{href} . "\n" if $el->{href} =~ /^http/);
}
close $fh;
4 changes: 2 additions & 2 deletions bin/bm2urlst.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


if len(sys.argv) == 1 :
print "Please specify bookmark file"
print("Please specify bookmark file")
sys.exit()

bkmark = sys.argv[1];
Expand All @@ -16,4 +16,4 @@
results = doc.xpath('//a')
for r in results :
h = r.get('href')
if re.search(r'^http',h) : print h
if re.search(r'^http',h) : print (h)
12 changes: 6 additions & 6 deletions bin/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@ def google_search(query):
s = GoogleSearch()
s.search(query)
for info in s.results() :
print '-' * 50
print info
print s.excerpt(info['id'],5,50)
print('-' * 50)
print(info)
print(s.excerpt(info['id'],5,50))

def bmark_search(query):
s = BmarkSearch()
s.search(query)
for info in s.results() :
print '-' * 50
print info
print s.excerpt(info['id'],5,50)
print('-' * 50)
print(info)
print( s.excerpt(info['id'],5,50))


def main(arguments):
Expand Down
2 changes: 1 addition & 1 deletion data/url.lst
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ http://faculty.msb.edu/hasnasj/GTWebSite/MythWeb.htm
https://github.com/psyeugenic/eep/blob/egil/maps/eeps/eep-0043.md
http://www.victorianweb.org/science/ether.htm
http://www.brighthub.com/science/space/articles/32392.aspx
http://wfhummel.cnchost.com/bankingbasics.html
http://wfhummel.cnchost.com/bankingbasics.html
2 changes: 1 addition & 1 deletion lib/bmark_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def doc_info(self, idx):
def excerpt(self,idx,num_lines=10,char_count=None):
file_name = os.path.join(Utils.tmp_dir, str(idx) + ".txt")
head = ''
with open(file_name,"r") as txt :
with open(file_name,"r", encoding = "utf-8") as txt : # does this need explicit decoding?
for x in range(num_lines) : head += txt.read()
if char_count == None : return head
else : return head[:char_count]
Expand Down
2 changes: 1 addition & 1 deletion lib/google_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def search(self,query):
#if isinstance(query,list) : query = ' '.join(query)
query = re.sub(r'\+','', query) #remove + you may have used for bmark search
url = self.qurl + query
print url
print(url)
resp = requests.get(url, timeout=10, allow_redirects=True, headers=self.user_agent)#, config=debug)
if resp.ok :
#with open(Utils.tmp_dir + '/google.html', 'w') as html : print html.write(resp.content)
Expand Down
13 changes: 7 additions & 6 deletions lib/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ class Indexer:

@staticmethod
def urls(url_list):
for url in open(url_list,'r'):
for url in open(url_list,'r',encoding='utf8'): # added encoding
#empty string or comment
if not url.strip() or url.strip().startswith('#') : continue
yield url.rstrip("\n")


@staticmethod
def save2file(name, string):
with open(name,'w') as txt: txt.write(string)
with open(name,'w', encoding='utf8') as txt: txt.write(string) # added encoding

def __init__(self,levels=200):
self.levels = levels
Expand All @@ -46,6 +46,7 @@ def cleanup(self,string):
self.log.info('*** The page has no <body> tag, skipping ... ***')
return None
txt = body[0].text_content().encode(errors='ignore') #encode('utf-8')
txt = txt.decode('utf-8') # added decoding
txt = re.sub('\s*\n\s*', '\n', txt)
txt = re.sub('[ \t]{2,}', ' ', txt)
if not txt : self.log.error("Empty html")
Expand All @@ -70,14 +71,14 @@ def fetch(self,url,idx):
self.log.error("*(%s) %s : %s" % (resp.status_code, resp.reason, resp.headers['Content-Type']))
return None
except Exception as e:
print Exception(e)
print "Err processing %s" % url
print(Exception(e))
print("Err processing %s" % url)


def files(self,start_dir=Utils.tmp_dir):
for f in self.file_list :
print "tfidf processing: %s" % f
yield open(join(start_dir, f),'r').read()
print("tfidf processing: %s" % f)
yield open(join(start_dir, f),'r', encoding='utf8').read() # edited for explicit encoding


#the file list has to be in numerical order (so that tfidf matrix doc idx follow fetch sequence) otherwise indexing goes out of touch
Expand Down
6 changes: 3 additions & 3 deletions lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def init_logger(log_file='indexer.log'):
@staticmethod
def dict2csv(ary, path=None, fname='vocabulary.csv'):
if not path : path = Utils.data_dir
w = csv.writer(open(join(path,fname), "w"), lineterminator="\n")
w = csv.writer(open(join(path,fname), "w"), lineterminator="\n") # does this need explicit encoding?
for key, val in ary.items(): w.writerow([key, val])

@staticmethod
Expand All @@ -51,12 +51,12 @@ def csv2dict(path=None, fname='vocabulary.csv', val_int=True):
@staticmethod
def pkl2file(data,path=None,dst_file='vocabulary.pkl'):
if not path : path = Utils.data_dir
with open(join(path, dst_file), "wb") as f : pickle.dump(data,f)
with open(join(path, dst_file), "wb") as f : pickle.dump(data,f) # does this need explicit encoding?

@staticmethod
def file2pkl(path=None,src_file='vocabulary.pkl'):
if not path : path = Utils.data_dir
with open(join(path,src_file), "rb") as f : data = pickle.load(f)
with open(join(path,src_file), "rb") as f : data = pickle.load(f) # does this need explicit encoding?
return data

@staticmethod
Expand Down
3 changes: 2 additions & 1 deletion site/app/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from flask import Flask
from config import config
from flask.ext.bootstrap import Bootstrap
# from flask.ext.bootstrap import Bootstrap
from flask_bootstrap import Bootstrap # updated
bootstrap = Bootstrap()

def create_app(cfg_name):
Expand Down
6 changes: 3 additions & 3 deletions site/app/pse/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,20 @@ def search():
google = GoogleSearch()
bmark = BmarkSearch()

if request.form.has_key('q') :
if 'q' in request.form: # updated from if request.form.has_key('q') :
q = request.form['q']

if len(q) > 0 :

try:
bmark.search(q)
except Exception as e :
flash('Bmark search: ' + e.message)
flash('Bmark search: ' + str(e)) # updated from flash('Bmark search: ' + e.message)

try :
google.search(q)
except Exception as e:
flash("Google search error : " + e.message)
flash("Google search error : " + str(e)) # updated from flash("Google search error : " + e.message)

else:
flash('Interesting what will happen if you search for something rather than nothing !!')
Expand Down
3 changes: 2 additions & 1 deletion site/manage.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#!/usr/bin/env python
import os
from app import create_app
from flask.ext.script import Manager
# from flask.ext.script import Manager # deprecated; there's a link I could cite, but forget it.
from flask_script import Manager # updated

app = create_app(os.getenv('FLASK_CONFIG') or 'default')
manager = Manager(app)
Expand Down