fetch and server improvements
Mahmoud Hashemi committed May 19, 2012
1 parent 67f60cf commit c82d975
Showing 4 changed files with 96 additions and 33 deletions.
2 changes: 1 addition & 1 deletion dabase.py
@@ -3,7 +3,7 @@

dab_db = pw.SqliteDatabase(None) #deferred initialization

-def init(db_name='dabase', **kwargs):
+def init(db_name='dab_store', **kwargs):
    dab_db.init(str(db_name)+'.db', **kwargs)
    dab_db.connect()
    Dabblet.create_table(fail_silently=True)
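
A minimal usage sketch of the renamed default (the 'my_dabs' name is hypothetical; with no argument init now opens dab_store.db):

    import dabase
    dabase.init('my_dabs')  # opens my_dabs.db and creates the Dabblet table if needed
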
119 changes: 91 additions & 28 deletions dabnabbit.py
@@ -8,6 +8,7 @@
import json
import time
import random
+from optparse import OptionParser
from pyquery import PyQuery as pq
from collections import namedtuple

@@ -17,8 +18,20 @@
from progress import ProgressMeter

API_URL = "http://en.wikipedia.org/w/api.php"
+DEFAULT_CAT = "Articles_with_links_needing_disambiguation"
+DEFAULT_LIMIT = 100
+DEFAULT_CONC = 100
+DEFAULT_PER_CALL = 4
+DEFAULT_TIMEOUT = 30
+DEFAULT_DB = 'dab_store'
+CAT_CONC = 10
+ALL = 20000


+EDIT_SUMMARY = 'DAB link solved with disambiguity!'



class WikiException(Exception): pass

Page = namedtuple("Page", "title, req_title, pageid, revisionid, revisiontext, is_parsed, fetch_date")
@@ -102,8 +115,6 @@ def get_category(cat_name, count=500, cont_str=""):

    return ret

-CAT_CONC = 10
-ALL = 20000
def get_category_recursive(cat_name, count=None):
    ret = set()
    seen_cats = set()
@@ -144,8 +155,8 @@ def get_category_recursive(cat_name, count=None):
    print 'Done, returning', len(ret),'category members.'
    return list(ret)

-def get_dab_page_ids(date=None, count=500):
-    cat_res = get_category_recursive("Articles_with_links_needing_disambiguation", count)
+def get_dab_page_ids(category=DEFAULT_CAT, count=500):
+    cat_res = get_category_recursive(category, count)
    return [ a.pageid for a in cat_res ]


@@ -246,7 +257,8 @@ def get_context(dab_a):
    d(dab_a).addClass('dab-link')
    link_parents = dab_a.parents()
    cand_contexts = [ p for p in link_parents
-                      if p.text_content() and len(p.text_content().split()) > 30 ]
+                      if getattr(p, "text_content", lambda: False)()
+                      and len(p.text_content().split()) > 30 ]
    chosen_context = cand_contexts[-1]
    d(chosen_context).addClass('dab-context')
    # add upperbound/wrapping div
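
The rewritten filter leans on getattr with a callable default: ancestors that don't expose a text_content method evaluate to False and drop out of the list comprehension instead of raising AttributeError. The idiom in isolation (toy class, hypothetical name):

    class NoText(object):
        pass

    p = NoText()
    print getattr(p, 'text_content', lambda: False)()  # False, not AttributeError
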
@@ -267,18 +279,18 @@ def get_dabblets(parsed_page):
        try:
            dab_link = d(dlm).parents("sup")[0].getprevious() # TODO: remove extra d?
            dab_link = d(dab_link)
-            if dab_link.is_('a'):
-                dab_title = dab_link.attr('title')
-                context = get_context(dab_link)
-                ctx_html = context.outerHtml()
-                ret.append( Dabblet.from_page(title = dab_title,
-                                              context = ctx_html,
-                                              source_page = parsed_page,
-                                              source_order = i,
-                                              source_imgs = images_found))
        except Exception as e:
            print 'nope', e
            continue
+        if dab_link.is_('a'):
+            dab_title = dab_link.attr('title')
+            context = get_context(dab_link)
+            ctx_html = context.outerHtml()
+            ret.append( Dabblet.from_page(title = dab_title,
+                                          context = ctx_html,
+                                          source_page = parsed_page,
+                                          source_order = i,
+                                          source_imgs = images_found))
+        pass

    return ret

@@ -324,23 +336,18 @@ def submit_solution(title, solution):
    resp = api_req('query', params)
    return resp

-P_PER_CALL = 4
-DEFAULT_TIMEOUT = 30
-
-def chunked_pimap(func, els, concurrency=150, chunk_size=P_PER_CALL):
+def chunked_pimap(func, els, concurrency=DEFAULT_CONC, chunk_size=DEFAULT_PER_CALL):
    chunked = (els[i:i + chunk_size]
               for i in xrange(0, len(els), chunk_size))
    pool = Pool(concurrency)
    return pool.imap_unordered(func, chunked)

@dabase.dab_db.commit_on_success
-def save_a_bunch(count=1000, concurrency=100, per_call=P_PER_CALL):
+def save_a_bunch(count=DEFAULT_LIMIT, category=DEFAULT_CAT, concurrency=DEFAULT_CONC,
+                 per_call=DEFAULT_PER_CALL, db_name=DEFAULT_DB):
    import time

-    db_name = 'abunch'
-    dabase.init(db_name)
-
-    page_ids = get_dab_page_ids(count=count)
+    page_ids = get_dab_page_ids(category, count)

    dabblets = []
    dpm = ProgressMeter(total=len(page_ids), unit="articles", ticks=30)
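
chunked_pimap batches a flat list into fixed-size chunks and fans them out over a green-thread pool. A standalone sketch of the same pattern (assumes gevent, which the "green threads" output below suggests; fetch_chunk is a hypothetical stand-in for the API call):

    from gevent.pool import Pool

    def fetch_chunk(ids):
        return ['fetched %s' % i for i in ids]  # pretend this hits the wiki API

    els = range(10)
    chunk_size = 4
    chunked = (els[i:i + chunk_size]
               for i in xrange(0, len(els), chunk_size))
    pool = Pool(100)
    for res in pool.imap_unordered(fetch_chunk, chunked):
        print res
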
@@ -411,12 +418,68 @@ def test():
    title_article = get_articles(titles="Dog", raise_exc=True)
    title_articles = get_articles(titles=["Dog"], raise_exc=True)

+def parse_args():
+    parser = OptionParser()
+    parser.add_option("-d", "--database", dest="database",
+                      type="string", default=DEFAULT_DB,
+                      help="name of sqlite database used for saving Dabblets")
+
+    parser.add_option("-a", "--all", dest="get_all",
+                      action="store_true", default=False,
+                      help="save as many Dabblets as we can find")
+
+    parser.add_option("-l", "--limit", dest="limit",
+                      type="int", default=DEFAULT_LIMIT,
+                      help="max number of articles to search for Dabblets (see -a)")
+
+    parser.add_option("-C", "--category", dest="category",
+                      type="string", default=DEFAULT_CAT,
+                      help="category to search for Dabblets (recursive)")
+
+    parser.add_option("-c", "--concurrency", dest="concurrency",
+                      type="int", default=DEFAULT_CONC,
+                      help="concurrency factor to use when querying the "
+                           "Wikipedia API (simultaneous requests)")
+
+    parser.add_option("-g", "--grouping", dest="grouping",
+                      type="int", default=DEFAULT_PER_CALL,
+                      help="how many sub-responses to request per API call")
+
+    parser.add_option('-D', "--debug", dest="debug",
+                      action="store_true", default=False,
+                      help="enable debugging (and pop up pdb at the end of a successful run)")
+
+    parser.add_option("-q", "--quiet", dest="verbose", action="store_false",
+                      help="suppress output (TODO)")
+    return parser.parse_args()


if __name__ == '__main__':
    import time
-    dcount = 100
+    opts, args = parse_args()
    start = time.time()
-    print 'Saving a bunch (',dcount,') of Dabblets.'
-    dabblets = save_a_bunch(dcount)
+    if opts.get_all:
+        print 'Getting all Dabblets.',
+        opts.limit = None
+    else:
+        print 'Searching up to',opts.limit,'articles for Dabblets.'
+    print 'Using', opts.concurrency, 'green threads. Saving to', opts.database
+
+    dabase.init(opts.database)
+
+    try:
+        dabblets = save_a_bunch(count=opts.limit,
+                                category=opts.category,
+                                concurrency=opts.concurrency,
+                                per_call=opts.grouping,
+                                db_name=opts.database)
+    except Exception as e:
+        #if opts.debug: #TODO
+        #    import pdb;pdb.pm()
+        raise
+
    end = time.time()
    print len(dabblets), 'Dabblets saved in', end-start, 'seconds'
-    import pdb;pdb.set_trace()
+
+    if opts.debug:
+        import pdb;pdb.set_trace()
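
With parse_args wired into __main__, invocations along these lines should work (values are illustrative; long and short forms are equivalent):

    python dabnabbit.py --limit 500 --concurrency 50 --database dab_store
    python dabnabbit.py -a -C Articles_with_links_needing_disambiguation -D
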
2 changes: 1 addition & 1 deletion dabserver.py
@@ -55,7 +55,7 @@ def get_session(session_id=None):
    if not session.get('seq'):
        sequence = ALL_DABBLETS[:]
        random.shuffle(sequence)
-        sequence.sort(key=lambda d: round(d[1], 1)) # difficulty
+        sequence.sort(key=lambda d: round(d[1])) # difficulty
        session['seq'] = [ d[0] for d in sequence ]
    SESSIONS[session_id] = session
    return session
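
Dropping the second argument to round() coarsens the sort key from tenths to whole numbers, so the shuffle above now randomizes order within wider difficulty buckets (list.sort is stable). A toy illustration with made-up difficulties:

    import random
    pairs = [('a', 0.2), ('b', 1.2), ('c', 0.4), ('d', 0.9)]
    random.shuffle(pairs)
    pairs.sort(key=lambda d: round(d[1]))  # 'a','c' -> 0.0 bucket; 'b','d' -> 1.0 bucket
    print [p[0] for p in pairs]            # buckets in order, members shuffled
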
6 changes: 3 additions & 3 deletions progress.py
@@ -37,8 +37,8 @@ def __init__(self, *args, **kw):
        self.rate_refresh = float(kw.get('rate_refresh', .5))
        # Number of ticks in meter
        self.meter_ticks = int(kw.get('ticks', 60))
-        self.meter_division = float(self.total) / self.meter_ticks
-        self.meter_value = int(self.count / self.meter_division)
+        self.meter_division = (float(self.total) / self.meter_ticks) or 1.0
+        self.meter_value = int(self.count / self.meter_division) or 1
        self.last_update = None
        self.rate_history_idx = 0
        self.rate_history_len = 10
@@ -76,7 +76,7 @@ def update(self, count, *args, **kw):
                continue
            cnt += 1
            total += rate
-        rate = total / cnt
+        rate = (total / cnt) or 1
        self.estimated_duration.append((self.total - self.count) / rate)
        self.rate_current = rate
        self.last_update = now
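
Both hunks use the same falsy-fallback idiom to dodge division by zero: an expression that evaluates to 0 is falsy, so `or` substitutes a safe 1 before the value is used as a divisor. When total is 0, meter_division would otherwise be 0.0 and the very next line would raise ZeroDivisionError; likewise a rate of 0 would break the duration estimate. In miniature:

    total, ticks, count = 0, 60, 0
    meter_division = (float(total) / ticks) or 1.0  # 0.0 is falsy -> 1.0
    meter_value = int(count / meter_division) or 1  # no ZeroDivisionError, floor of 1
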
