Skip to content

Commit

Permalink
gevent and database stuff coming along nicely
Browse files Browse the repository at this point in the history
  • Loading branch information
Mahmoud Hashemi committed May 14, 2012
1 parent 8010244 commit 7559663
Show file tree
Hide file tree
Showing 4 changed files with 244 additions and 60 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ pub/
*.json
log*
article_html/*.html
*.db


*.pyc*
*.pyo*
Expand Down
97 changes: 97 additions & 0 deletions dabase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import peewee as pw
from datetime import datetime

dab_db = pw.SqliteDatabase(None) #deferred initialization

def init(db_name, **kwargs):
    """Initialize the deferred SQLite database and create any missing tables.

    :param db_name: base name for the database file; '.db' is appended.
    :param kwargs: extra options forwarded to peewee's Database.init().
    """
    dab_db.init(str(db_name) + '.db', **kwargs)
    dab_db.connect()
    # fail_silently=True makes create_table a no-op when the table exists.
    # FIX: DabbletSolution was defined below but its table was never created.
    for model in (Dabblet, DabChoice, DabbletSolution):
        model.create_table(fail_silently=True)


class DabModel(pw.Model):
    # Shared base model: binds every project model to the single deferred
    # SqliteDatabase above, so init() can choose the db file at runtime.
    class Meta:
        database = dab_db


class Dabblet(DabModel):
    """One disambiguation-link occurrence extracted from a source article."""
    title = pw.CharField()            # title of the ambiguous (dab) page
    context = pw.TextField()          # HTML snippet surrounding the link

    source_title = pw.CharField()     # article the dab link was found in
    source_order = pw.IntegerField()  # position of the link within the source page
    source_pageid = pw.IntegerField()
    source_revid = pw.IntegerField()

    date_created = pw.DateTimeField(db_index=True)

    difficulty = pw.IntegerField()
    viability = pw.IntegerField()

    @classmethod
    def from_page(cls, title, context, source_page, source_order, **kw):
        """Build (but do not save) a Dabblet from a fetched source Page.

        :param source_page: a Page namedtuple with title/pageid/revisionid.
        """
        # TODO: get options
        ret = cls(title=title,
                  context=context,
                  source_title=source_page.title,
                  source_pageid=source_page.pageid,
                  source_revid=source_page.revisionid,
                  source_order=source_order,
                  # NOTE(review): naive local time; consider utcnow() -- confirm
                  date_created=datetime.now())
        # Transient attribute, not a persisted column.
        ret.source_page = source_page
        return ret

    def _asdict(self):
        """Return a JSON-friendly dict of this Dabblet and its choices."""
        # BUG FIX: DabChoice declares related_name='choices', so the reverse
        # accessor is self.choices -- self.options raised AttributeError.
        return {'title': self.title,
                'source_title': self.source_title,
                'context': self.context,
                'options': [c._asdict() for c in self.choices]
                }


class DabChoice(DabModel):
    """One candidate target listed on a disambiguation page for a Dabblet."""
    dabblet = pw.ForeignKeyField(Dabblet, related_name='choices')
    title = pw.CharField()
    text = pw.TextField()

    def _asdict(self):
        """Return a JSON-friendly dict representation of this choice."""
        as_dict = {}
        as_dict['title'] = self.title
        as_dict['text'] = self.text
        as_dict['dab_title'] = self.dabblet.title
        return as_dict


class DabbletSolution(DabModel):
    """Records an answer: which DabChoice a solver picked for a Dabblet."""
    # FIX: related_name is the *reverse* accessor created on the target model;
    # 'dabblet'/'choice' would produce confusing Dabblet.dabblet and
    # DabChoice.choice attributes. Name the reverse relation 'solutions'.
    dabblet = pw.ForeignKeyField(Dabblet, related_name='solutions')
    choice = pw.ForeignKeyField(DabChoice, related_name='solutions')

    solver_ip = pw.CharField()
    date_solved = pw.DateTimeField(db_index=True)


def test():
from datetime import datetime
from dabnabbit import Page

init('dabase_unittest')
sp = Page(0, 'first_source', 0, 'first text', True, datetime.now())

#da2 = Dabblet(title='first', context='first context', source_title='first source', source_pageid=0, source_revid=0, source_order=0, date_created=datetime.now())
da2 = Dabblet.from_page('first dab title', 'first dab context', sp, 0)
da2.save()

dabblets = [ d for d in Dabblet.select() ]
print len(dabblets), 'Dabblets now in the test db'


if __name__ == '__main__':
test()
204 changes: 144 additions & 60 deletions dabnabbit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import gevent
import socket
from gevent import monkey
monkey.patch_all()

Expand All @@ -8,33 +7,17 @@
import time
import random
from pyquery import PyQuery as pq

# stupid ipy notebook

from collections import namedtuple

import dabase
from dabase import Dabblet, DabChoice

API_URL = "http://en.wikipedia.org/w/api.php"

class WikiException(Exception):
    """Signals an error reported by (or while calling) the MediaWiki API."""
    pass

Page = namedtuple("ParsedPage", "pageid, title, revisionid, revisiontext, is_parsed, fetch_date")

DabOption = namedtuple("DabOption", "title, text, dab_title")

class Dabblet(object):
def __init__(self, title, context, source_page, source_order):
self.title = title
self.context = context
self.source_page = source_page
self.source_order = source_order

self.options = get_dab_options(title)
Page = namedtuple("Page", "title, req_title, pageid, revisionid, revisiontext, is_parsed, fetch_date")

def _asdict(self):
return {'title': self.title,
'context': self.context,
'options': [ o._asdict() for o in self.options ]
}

def api_req(action, params=None, raise_exc=False, **kwargs):
all_params = {'format': 'json',
Expand All @@ -57,23 +40,29 @@ def api_req(action, params=None, raise_exc=False, **kwargs):
resp.error = e
return resp

mw_error = resp.headers.get('MediaWiki-API-Error')
if mw_error:
if raise_exc:
raise WikiException(mw_error)
else:
resp.error = mw_error
return resp

try:
resp.results = json.loads(resp.text)
resp.servedby = resp.results.get('servedby')
# TODO: warnings?
except Exception as e:
if raise_exc:
raise
else:
resp.error = e
return resp


mw_error = resp.headers.get('MediaWiki-API-Error')
if mw_error:
error_str = mw_error
error_obj = resp.results.get('error')
if error_obj and error_obj.get('info'):
error_str += ' ' + error_obj.get('info')
if raise_exc:
raise WikiException(error_str)
else:
resp.error = error_str
return resp

return resp


Expand All @@ -93,36 +82,53 @@ def get_dab_page_ids(date=None, count=500):
cat_res.results['query']['categorymembers'] ]


def get_articles(page_id=None, title=None, parsed=True, follow_redirects=False):
def get_articles(page_ids=None, titles=None, parsed=True, follow_redirects=False, **kwargs):
ret = []
params = {'prop': 'revisions',
'rvprop': 'content|ids' }

if page_id:
if not isinstance(page_id, (str,unicode)):
if page_ids:
if not isinstance(page_ids, (str,unicode)):
try:
page_id = "|".join([str(p) for p in page_id])
page_ids = "|".join([str(p) for p in page_ids])
except:
pass
params['pageids'] = str(page_id)
elif title:
params['titles'] = title
params['pageids'] = str(page_ids)
elif titles:
if not isinstance(titles, (str,unicode)):
try:
titles = "|".join([str(t) for t in titles])
except:
print "Couldn't join: ",repr(titles)
params['titles'] = titles
else:
raise Exception('You need to pass in a page id or a title.')

if parsed:
params['rvparse'] = 'true'
if follow_redirects:
params['redirects'] = 'true'

parse_resp = api_req('query', params)
parse_resp = api_req('query', params, **kwargs)
if parse_resp.results:
ret = [Page( pageid = page['pageid'],
title = page['title'],
revisionid = page['revisions'][0]['revid'],
revisiontext = page['revisions'][0]['*'],
is_parsed = parsed,
fetch_date = time.time())
for page in parse_resp.results['query']['pages'].values()]
try:
pages = parse_resp.results['query']['pages'].values()
redirect_list = parse_resp.results['query'].get('redirects', [])
except:
print "Couldn't get_articles() with params: ", params
return ret

redirects = dict([ (r['to'],r['from']) for r in redirect_list ])
for page in pages:
title = page['title']
pa = Page( pageid = page['pageid'],
title = title,
req_title = redirects.get(title, title),
revisionid = page['revisions'][0]['revid'],
revisiontext = page['revisions'][0]['*'],
is_parsed = parsed,
fetch_date = time.time())
ret.append(pa)
return ret


Expand All @@ -132,25 +138,34 @@ def is_fixable_dab_link(parsed_page):
pass


def get_dab_options(dab_page_title):
def get_dab_choices(dabblets): # side effect-y..
ret = []
dab_page = get_articles(title=dab_page_title, follow_redirects=True)[0]
dab_text = dab_page.revisiontext

d = pq(dab_text)
d('table#toc').remove()
liasons = set([ d(a).parents('li')[-1] for a in d('li a') ])
for lia in liasons:
# TODO: better heuristic than ":first" link?
title = d(lia).find('a:first').attr('title')
text = lia.text_content().strip()
ret.append(DabOption(title, text, dab_page.title))
if not dabblets:
return ret
dab_map = dict([(d.title, d) for d in dabblets])
dab_pages = get_articles(titles=dab_map.keys(), follow_redirects=True)

for dp in dab_pages:
dabblet = dab_map.get(dp.req_title) # :/ (worried about special characters)
dab_text = dp.revisiontext

d = pq(dab_text)
d('table#toc').remove()
liasons = set([ d(a).parents('li')[-1] for a in d('li a') ])
for lia in liasons:
# TODO: better heuristic than ":first" link?
title = d(lia).find('a:first').attr('title')
text = lia.text_content().strip()
ret.append(DabChoice(dabblet=dabblet,
title=title,
text=text))

return ret


def get_context(dab_a):
d = dab_a(dab_a.parents()[0])
d(dab_a).addClass('dab-link')
link_parents = dab_a.parents()
cand_contexts = [ p for p in link_parents
if p.text_content() and len(p.text_content().split()) > 30 ]
Expand All @@ -173,15 +188,84 @@ def get_dabblets(parsed_page):
continue
if dab_link.is_('a'):
dab_title = dab_link.attr('title')
d(dab_link).addClass('dab-link')
context = get_context(dab_link)
ret.append( Dabblet(dab_title, context.outerHtml(), d.html(), i) )
ret.append( Dabblet.from_page(dab_title, context.outerHtml(), parsed_page, i) )

return ret

def get_random_dabblets(count=2):
dabblets = []
page_ids = random.sample(get_dab_page_ids(count=count), count)
page_ids = random.sample(get_dab_page_ids(count=count*2), count)
articles = get_articles(page_ids)
dabblets.extend(sum([get_dabblets(a) for a in articles], []))
return dabblets

def save_a_bunch(count=1000):
import time
P_PER_CALL = 4
db_name = 'abunch'
dabase.init(db_name)
dabblets = []

page_ids = get_dab_page_ids(count=count)

print 'fetching', len(page_ids), 'articles...'
start = time.time()
ajobs = [gevent.spawn(get_articles, page_ids[i:i+P_PER_CALL]) for i in range(0, len(page_ids), P_PER_CALL)]
print 'using', len(ajobs), 'green threads.'
gevent.joinall(ajobs, timeout=30)
print 'fetch done (t+', time.time() - start, 'seconds)'
for aj in ajobs:
articles = aj.value
if not articles:
continue
dabblets.extend(sum([get_dabblets(a) for a in articles], []))

get_dab_choices(dabblets[:1])

all_choices = []
print 'fetching choices for', len(dabblets), 'Dabblets.'
choices_start = time.time()

cjobs = [ gevent.spawn(get_dab_choices, dabblets[i:i+P_PER_CALL])
for i in
range(0, len(dabblets), P_PER_CALL) ]

print 'using', len(cjobs), 'green threads.'
gevent.joinall(cjobs, timeout=30)
print 'fetching choices done (t+', time.time() - choices_start, 'seconds)'
for cj in cjobs:
choices = cj.value
if not choices:
continue
all_choices.extend(choices)

for d in dabblets:
d.save()
for c in all_choices:
c.save()

end = time.time()
fetched_len = len(dabblets)

print len(dabblets), 'Dabblets saved to', db_name, 'in', end-start, 'seconds'
print len(set([d.title for d in dabblets])), 'unique titles'
print len(set([d.source_title for d in dabblets])), 'unique source pages'
print len(all_choices), 'dabblet choices fetched and saved.'

print Dabblet.select().count(), 'total records in database'
print len(set([d.title for d in Dabblet.select()])), 'unique titles in database'

return dabblets

def test():
    # Live-API smoke tests for get_articles(); requires network access.
    # raise_exc=True surfaces HTTP/MediaWiki errors instead of returning them.
    print 'getting one article by ID'
    pid_article = get_articles(4269567, raise_exc=True)
    assert len(pid_article) > 0
    print 'getting one article by list of IDs (list of one)'
    pid_articles = get_articles([4269567], raise_exc=True)
    assert len(pid_articles) > 0

if __name__ == '__main__':
    # Entry point: fetch/save a small batch, then drop into a debugger so the
    # results can be inspected interactively.
    dabblets = save_a_bunch(50)
    import pdb;pdb.set_trace()
Loading

0 comments on commit 7559663

Please sign in to comment.