Skip to content

Commit a4956a9

Browse files
committed
Code for the epsilon removal post
1 parent 0b7c38c commit a4956a9

File tree

1 file changed

+191
-0
lines changed

1 file changed

+191
-0
lines changed

2010/eps-removal/eps_removal.py

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
# Epsilon production removal from grammars
2+
#
3+
# Eli Bendersky [https://eli.thegreenplace.net]
4+
# This code is in the public domain.
5+
import sys, os
6+
from collections import defaultdict
7+
8+
9+
class CFG(object):
    """ A context-free grammar with support for removing epsilon
        productions.

        Productions are kept in 'prod', which maps a nonterminal name
        to a list of productions, each a list of symbols. An empty
        list [] specifies an eps-production.
    """
    def __init__(self):
        self.prod = defaultdict(list)
        self.start = None

    def set_start_symbol(self, start):
        """ Set the start symbol of the grammar.
        """
        self.start = start

    def add_prod(self, lhs, rhs):
        """ Add production to the grammar. 'rhs' can
            be several productions separated by '|'.
            Each production is a sequence of symbols
            separated by whitespace.
            Empty strings are interpreted as an eps-production.

            Usage:
                grammar.add_prod('NT', 'VP PP')
                grammar.add_prod('Digit', '1|2|3|4')

                # Optional Digit: digit or eps
                grammar.add_prod('Digit_opt', 'Digit |')
        """
        # str.split() without arguments drops all whitespace, so an
        # empty (or blank) alternative becomes the empty list [], which
        # is how an eps-production is represented internally.
        #
        for alternative in rhs.split('|'):
            self.prod[lhs].append(alternative.split())

    def remove_eps_productions(self):
        """ Removes epsilon productions from the grammar.

            The algorithm:

            1. Pick a nonterminal p_eps with an epsilon production
            2. Remove that epsilon production
            3. For each production containing p_eps, replace it
               with several productions such that all the
               combinations of p_eps being there or not will be
               represented.
            4. If there are still epsilon productions in the
               grammar, go back to step 1

            The replication can be demonstrated with an example.
            Suppose that A contains an epsilon production, and
            we've found a production B:: [A, k, A]
            Then this production of B will be replaced with these:
            [A, k], [k], [k, A], [A, k, A]

            An epsilon production that the replication re-creates for
            a nonterminal that was already handled is discarded, so
            the loop terminates even for grammars such as
            A:: A | eps, or A:: B | eps together with B:: A.

            Epsilon productions of the start symbol are kept (see
            _find_eps_production). After the call the productions of
            every nonterminal are sorted and free of duplicates.
        """
        # Nonterminals whose epsilon production was already removed.
        # Their nullability has been propagated into every production,
        # so an epsilon production that reappears for them carries no
        # new information and must not be processed again.
        #
        eliminated = set()

        while True:
            # Find an epsilon production
            #
            p_eps, index = self._find_eps_production()

            # No epsilon productions? Then we're done...
            #
            if p_eps is None:
                break

            # Remove the epsilon production
            #
            del self.prod[p_eps][index]
            eliminated.add(p_eps)

            # Now find all the productions that contain the
            # nonterminal whose epsilon production was removed.
            # For each such production, replicate it with all
            # the combinations of that nonterminal being present
            # or absent.
            #
            for lhs in list(self.prod):
                prods = []

                for lhs_prod in self.prod[lhs]:
                    num_p_eps = lhs_prod.count(p_eps)
                    if num_p_eps == 0:
                        prods.append(lhs_prod)
                    else:
                        prods.extend(self._create_prod_combinations(
                            prod=lhs_prod,
                            nt=p_eps,
                            count=num_p_eps))

                # Drop epsilon productions (re-created ones as well as
                # duplicates of the one just removed) of nonterminals
                # that were already handled.
                #
                if lhs in eliminated:
                    prods = [p for p in prods if p]

                # Remove duplicates: after sorting, equal productions
                # are adjacent.
                #
                unique = []
                for p in sorted(prods):
                    if not unique or p != unique[-1]:
                        unique.append(p)

                self.prod[lhs] = unique

    def _find_eps_production(self):
        """ Finds an epsilon production in the grammar. If such
            a production is found, returns the pair (lhs, index):
            the name of the non-terminal that has an epsilon
            production and its index in lhs's list of productions.
            If no epsilon productions were found, returns the
            pair (None, None).

            Note: eps productions in the start symbol will be
            ignored, because we don't want to remove them.
        """
        for lhs in self.prod:
            if self.start is not None and lhs == self.start:
                continue

            for i, p in enumerate(self.prod[lhs]):
                if not p:
                    return lhs, i

        return None, None

    def _create_prod_combinations(self, prod, nt, count):
        """ prod:
                A production (list) that contains at least one
                instance of 'nt'
            nt:
                The non-terminal which should be replicated
            count:
                The amount of times 'nt' appears in 'prod'.
                Assumed to be >= 1

            Returns the generated list of productions: one for each
            way of keeping or dropping every occurrence of 'nt'
            (2^count in total, possibly containing duplicates). All
            other symbols are kept in their original order.
        """
        # The combinations are a kind of a powerset. Membership
        # in a powerset can be checked by using the binary
        # representation of a number: bit k of the mask tells
        # whether the k-th occurrence of 'nt' is kept.
        # There are 2^count possibilities in total.
        #
        numset = 1 << count
        new_prods = []

        for mask in range(numset):
            nth_nt = 0
            new_prod = []

            for s in prod:
                if s == nt:
                    if mask & (1 << nth_nt):
                        new_prod.append(s)
                    nth_nt += 1
                else:
                    new_prod.append(s)

            new_prods.append(new_prod)

        return new_prods
158+
159+
160+
161+
#-----------------------------------------------------------------
162+
if __name__ == "__main__":
    # Demo driver: build a sample grammar, remove its epsilon
    # productions and print the resulting productions. The
    # commented-out grammars below are alternative inputs.
    cfg = CFG()
    #~ cfg.add_prod('B', 'A z A | A p | c r')
    #~ cfg.add_prod('A', 'a | | c k')

    #~ cfg.set_start_symbol('S')
    #~ cfg.add_prod('S', 'A B | A')
    #~ cfg.add_prod('A', 'A a | | C c')
    #~ cfg.add_prod('B', 'C | b')
    #~ cfg.add_prod('C', 'C v | w |')

    cfg.add_prod('func_call', 'identifier ( arguments_opt )')
    cfg.add_prod('arguments_opt', 'arguments_list | ')
    cfg.add_prod('arguments_list', 'argument | argument , arguments_list')

    #~ cfg.add_prod('B', 'A z A')
    #~ cfg.add_prod('A', 'a | ')


    #~ cfg.add_prod('S', 'A B')
    #~ cfg.add_prod('A', 'A A B | | a')
    #~ cfg.add_prod('B', 'C D C | A | b |')
    #~ cfg.add_prod('C', 'c |')
    #~ cfg.add_prod('D', 'd')

    cfg.remove_eps_productions()
    for p in cfg.prod:
        # A single pre-formatted string prints the same text whether
        # 'print' is the Python 2 statement or the Python 3 function.
        print('%s ::  %s' % (p, [' '.join(pr) for pr in cfg.prod[p]]))

    #~ print(cfg._create_prod_combinations(['A', 'b', 'c', 'A'], 'A', 2))

0 commit comments

Comments
 (0)