-
Notifications
You must be signed in to change notification settings - Fork 13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding edges() and iteredges() Functions for DAWGs #1
base: master
Are you sure you want to change the base?
Changes from 5 commits
926d6e8
fa6cd76
8e7390a
0211c19
30bf53b
15355be
dee560c
2a93173
c94b4d8
8cb08f3
f3baac8
77f3802
ae7472a
4975f07
1207380
5462916
0b81a9f
2cbd340
f56e2b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -141,6 +141,39 @@ def keys(self, prefix=""): | |
|
||
return res | ||
|
||
def edges(self, prefix=""):
    """Return the edges that leave the node reached by ``prefix``.

    ``prefix`` is encoded as UTF-8 and followed from the dictionary root.
    Each list entry is the tuple produced by the edge follower's
    ``get_cur_edge()``.  The list is empty when the prefix cannot be
    followed or the follower reports no edge.
    """
    encoded = prefix.encode('utf8')
    found = []

    start_index = self.dct.follow_bytes(encoded, self.dct.ROOT)
    if start_index is None:
        return found

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    # A single loop covers both the first edge (start) and the rest (next).
    has_edge = follower.start(start_index, encoded)
    while has_edge:
        found.append(follower.get_cur_edge())
        has_edge = follower.next()

    return found
|
||
def iteredges(self, prefix=""):
    """Lazily yield the edges that leave the node reached by ``prefix``.

    ``prefix`` is encoded as UTF-8 and followed from the dictionary root.
    Each yielded item is the tuple produced by the edge follower's
    ``get_cur_edge()``.  Nothing is yielded when the prefix cannot be
    followed or the follower reports no edge.
    """
    encoded = prefix.encode('utf8')

    start_index = self.dct.follow_bytes(encoded, self.dct.ROOT)
    if start_index is None:
        return

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    # A single loop covers both the first edge (start) and the rest (next).
    has_edge = follower.start(start_index, encoded)
    while has_edge:
        yield follower.get_cur_edge()
        has_edge = follower.next()
|
||
def iterkeys(self, prefix=""): | ||
b_prefix = prefix.encode('utf8') | ||
index = self.dct.follow_bytes(b_prefix, self.dct.ROOT) | ||
|
@@ -279,15 +312,14 @@ def iterkeys(self, prefix=""): | |
yield u_key | ||
|
||
def items(self, prefix=""): | ||
index = self.dct.ROOT | ||
if not isinstance(prefix, bytes): | ||
prefix = prefix.encode('utf8') | ||
res = [] | ||
|
||
index = self.dct.ROOT | ||
if prefix: | ||
index = self.dct.follow_bytes(prefix, index) | ||
if not index: | ||
return res | ||
return | ||
res = [] | ||
|
||
completer = wrapper.Completer(self.dct, self.guide) | ||
completer.start(index, prefix) | ||
|
@@ -301,10 +333,9 @@ def items(self, prefix=""): | |
return res | ||
|
||
def iteritems(self, prefix=""): | ||
index = self.dct.ROOT | ||
if not isinstance(prefix, bytes): | ||
prefix = prefix.encode('utf8') | ||
|
||
index = self.dct.ROOT | ||
if prefix: | ||
index = self.dct.follow_bytes(prefix, index) | ||
if not index: | ||
|
@@ -315,9 +346,52 @@ def iteritems(self, prefix=""): | |
|
||
while completer.next(): | ||
key, value = completer.key.split(self._payload_separator) | ||
item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix | ||
# bytes() cast is a python 2.6 fix | ||
item = (key.decode('utf8'), a2b_base64(bytes(value))) | ||
yield item | ||
|
||
def edges(self, prefix=""):
    """Return the edges directly below ``prefix`` as a list of 2-tuples.

    ``prefix`` may be ``bytes`` or text (text is encoded as UTF-8).  Each
    tuple is ``(key, value)``: ``key`` is the edge follower's decoded key
    and ``value`` is one entry of ``self.b_get_value(key)``, or ``False``
    when that call returns nothing truthy.

    An empty list is always returned when ``prefix`` cannot be followed
    or has no outgoing edge, so callers never have to handle ``None``.
    """
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')

    index = self.dct.ROOT
    if prefix:
        index = self.dct.follow_bytes(prefix, index)
        if not index:
            # Unknown prefix: return an empty list rather than None so
            # the return type is the same on every path.
            return []

    res = []
    edge_follower = wrapper.EdgeFollower(self.dct, self.guide)
    # A single loop covers both the first edge (start) and the rest (next).
    has_edge = edge_follower.start(index, prefix)
    while has_edge:
        key = edge_follower.decoded_key
        vals = self.b_get_value(key) or [False]
        res.extend((key, val) for val in vals)
        has_edge = edge_follower.next()

    return res
|
||
def iteredges(self, prefix=""):
    """Lazily yield ``(key, value)`` pairs for the edges below ``prefix``.

    ``prefix`` may be ``bytes`` or text (text is encoded as UTF-8).  For
    every edge the follower reports, one pair is yielded per entry of
    ``self.b_get_value(key)``; a falsy entry, or no entries at all, is
    reported as ``False``.  Nothing is yielded when the prefix cannot be
    followed or has no outgoing edge.
    """
    start_index = self.dct.ROOT
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')
    if prefix:
        start_index = self.dct.follow_bytes(prefix, start_index)
        if not start_index:
            return

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    # A single loop covers both the first edge (start) and the rest (next).
    has_edge = follower.start(start_index, prefix)
    while has_edge:
        payloads = self.b_get_value(follower.decoded_key) or [False]
        for payload in payloads:
            yield (follower.decoded_key, payload or False)
        has_edge = follower.next()
|
||
def _has_value(self, index):
    """Follow ``PAYLOAD_SEPARATOR`` from ``index`` and return the lookup result."""
    separator_index = self.dct.follow_bytes(PAYLOAD_SEPARATOR, index)
    return separator_index
|
@@ -464,6 +538,43 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG): | |
Dict-like class based on DAWG. | ||
It can store integer values for unicode keys and support key completion. | ||
""" | ||
def edges(self, prefix=""):
    """Return the edges directly below ``prefix`` as a list of 2-tuples.

    ``prefix`` may be ``bytes`` or text (text is encoded as UTF-8).  Each
    tuple is ``(key, value)``: the edge follower's decoded key paired
    with the result of the follower's ``value()`` call for that edge.

    An empty list is always returned when ``prefix`` cannot be followed
    or has no outgoing edge, so callers never have to handle ``None``.
    """
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')

    index = self.dct.ROOT
    if prefix:
        index = self.dct.follow_bytes(prefix, index)
        if not index:
            # Unknown prefix: return an empty list rather than None so
            # the return type is the same on every path.
            return []

    res = []
    edge_follower = wrapper.EdgeFollower(self.dct, self.guide)
    # A single loop covers both the first edge (start) and the rest (next).
    has_edge = edge_follower.start(index, prefix)
    while has_edge:
        res.append((edge_follower.decoded_key, edge_follower.value()))
        has_edge = edge_follower.next()

    return res
|
||
def iteredges(self, prefix=""):
    """Lazily yield ``(key, value)`` pairs for the edges below ``prefix``.

    ``prefix`` may be ``bytes`` or text (text is encoded as UTF-8).  Each
    pair holds the edge follower's decoded key and the result of the
    follower's ``value()`` call.  Nothing is yielded when the prefix
    cannot be followed or has no outgoing edge.
    """
    start_index = self.dct.ROOT
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')
    if prefix:
        start_index = self.dct.follow_bytes(prefix, start_index)
        if not start_index:
            return

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    # A single loop covers both the first edge (start) and the rest (next).
    has_edge = follower.start(start_index, prefix)
    while has_edge:
        yield (follower.decoded_key, follower.value())
        has_edge = follower.next()
|
||
def items(self, prefix=""): | ||
if not isinstance(prefix, bytes): | ||
prefix = prefix.encode('utf8') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,29 +17,29 @@ def __init__(self): | |
"Root index" | ||
|
||
def has_value(self, index): | ||
"Checks if a given index is related to the end of a key." | ||
#Checks if a given index is related to the end of a key. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are comments better than docstrings? It is nice to have some docs available at runtime, e.g. in REPL. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fair. I changed it because PEP8 checks were complaining, and I never use docs at runtime, so forgot about that. I'm pretty neutral on this, so I'll change them back. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think using triple quotes and replacing "checks/gets/reads/..." with "check/get/read" should make them pep8-compatible. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The code currently isn't pep8 compatible anyway because some older lines go over 80 chars, etc... I'm thinking let's leave this as is for now, and I'll do a later run through to get everything pep8 compatible so the whole repo passes pep8 compatibility checks. |
||
return units.has_leaf(self._units[index]) | ||
|
||
def value(self, index):
    """Get a value from a given index."""
    unit = self._units[index]
    # The value unit sits at the masked XOR of the index with the unit's offset.
    value_index = (index ^ units.offset(unit)) & units.PRECISION_MASK
    return units.value(self._units[value_index])
|
||
def read(self, fp):
    """Read a dictionary from an input stream.

    The stream starts with a 4-byte unsigned count (``"=I"``), followed by
    that many units, which are appended to ``self._units``.
    """
    header = fp.read(4)
    (unit_count,) = struct.unpack(str("=I"), header)
    self._units.fromfile(fp, unit_count)
|
||
def contains(self, key):
    """Exact matching: report whether ``key`` ends at a node holding a value."""
    reached = self.follow_bytes(key, self.ROOT)
    return False if reached is None else self.has_value(reached)
|
||
def find(self, key): | ||
"Exact matching (returns value)" | ||
#Exact matching (returns value) | ||
index = self.follow_bytes(key, self.ROOT) | ||
if index is None: | ||
return -1 | ||
|
@@ -48,7 +48,7 @@ def find(self, key): | |
return self.value(index) | ||
|
||
def follow_char(self, label, index): | ||
"Follows a transition" | ||
#Follows a transition | ||
offset = units.offset(self._units[index]) | ||
next_index = (index ^ offset ^ label) & units.PRECISION_MASK | ||
|
||
|
@@ -58,7 +58,7 @@ def follow_char(self, label, index): | |
return next_index | ||
|
||
def follow_bytes(self, s, index): | ||
"Follows transitions." | ||
#Follows transitions. | ||
for ch in s: | ||
index = self.follow_char(int_from_byte(ch), index) | ||
if index is None: | ||
|
@@ -95,6 +95,77 @@ def size(self): | |
return len(self._units) | ||
|
||
|
||
class EdgeFollower(object):
    """Step through the edges that leave one node of a dictionary.

    Call ``start(index, prefix)`` to position on the first edge below the
    node at ``index``, then ``next()`` to advance to each sibling edge.
    After either call returns True, ``decoded_key`` holds the prefix plus
    the edge's character, and ``value()`` / ``get_cur_edge()`` describe
    the node the edge leads to.

    NOTE: when an edge label is only part of a multi-byte UTF-8 character,
    the key is completed by following the *first* child at each deeper
    level; other continuation bytes below the same lead byte are not
    visited.
    """

    def __init__(self, dic=None, guide=None):
        self._dic = dic
        self._guide = guide

    def value(self):
        """Return the value stored at the current edge's node, or False."""
        if self._dic.has_value(self._cur_index):
            return self._dic.value(self._cur_index)
        return False

    def start(self, index, prefix=b""):
        """Position on the first edge below ``index``.

        ``prefix`` is the byte string that led to ``index``; it becomes
        the fixed head of every key this follower reports.

        Returns True when a first edge exists and its key could be
        decoded, False otherwise.
        """
        self.key = bytearray(prefix)
        self.base_key_len = len(self.key)
        self._parent_index = index
        self._sib_index = None
        self._cur_index = None

        if not self._guide.size():
            return False

        child_label = self._guide.child(index)  # UCharType
        if not child_label:
            return False

        # Follow the transition to the first child.  The check must be on
        # the *result* of the transition, not on the starting index.
        next_index = self._dic.follow_char(child_label, index)
        if next_index is None:
            return False

        self._sib_index = next_index
        self._cur_index = next_index
        self.key.append(child_label)
        return self._complete_key()

    def next(self):
        """Advance to the next sibling edge (not necessarily a terminal).

        Returns True when another edge was found and its key could be
        decoded, False when the siblings are exhausted.
        """
        if not self._sib_index:
            return False

        sibling_label = self._guide.sibling(self._sib_index)
        self._sib_index = self._dic.follow_char(sibling_label,
                                                self._parent_index)
        self._cur_index = self._sib_index
        if not self._sib_index:
            return False

        # Drop the previous edge's bytes and append this sibling's label.
        self.key = self.key[:self.base_key_len]
        self.key.append(sibling_label)
        return self._complete_key()

    def get_cur_edge(self):
        """Return ``(decoded_key, has_value)`` for the current edge."""
        return (self.decoded_key, self._dic.has_value(self._cur_index))

    def _complete_key(self):
        """Decode ``self.key``, extending it through children if needed.

        If the key ends in the middle of a multi-byte character, keep
        following the first child of the *current* node until the bytes
        decode.  Returns False when no child is left to follow.
        """
        while True:
            try:
                self.decoded_key = self.key.decode('utf-8')
                return True
            except UnicodeDecodeError:
                pass

            child_label = self._guide.child(self._cur_index)
            self._cur_index = self._dic.follow_char(child_label,
                                                    self._cur_index)
            if not self._cur_index:
                return False
            self.key.append(child_label)
|
||
|
||
class Completer(object): | ||
|
||
def __init__(self, dic=None, guide=None): | ||
|
@@ -105,6 +176,8 @@ def value(self): | |
return self._dic.value(self._last_index) | ||
|
||
def start(self, index, prefix=b""): | ||
#initial setup for a completer next() action on some prefix | ||
|
||
self.key = bytearray(prefix) | ||
|
||
if self._guide.size(): | ||
|
@@ -113,9 +186,8 @@ def start(self, index, prefix=b""): | |
else: | ||
self._index_stack = [] | ||
|
||
|
||
def next(self): | ||
"Gets the next key" | ||
#Gets the next key | ||
|
||
if not self._index_stack: | ||
return False | ||
|
@@ -153,7 +225,6 @@ def next(self): | |
|
||
return self._find_terminal(index) | ||
|
||
|
||
def _follow(self, label, index): | ||
next_index = self._dic.follow_char(label, index) | ||
if next_index is None: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is backwards-incompatible: `.items` should return an empty list here, not `None`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good call. Will fix (and add a test for future).