|
| 1 | +# Epsilon production removal from grammars |
| 2 | +# |
| 3 | +# Eli Bendersky [https://eli.thegreenplace.net] |
| 4 | +# This code is in the public domain. |
| 5 | +import sys, os |
| 6 | +from collections import defaultdict |
| 7 | + |
| 8 | + |
class CFG(object):
    """ A context-free grammar kept as a table of productions.

        Attributes:
            prod:  maps a nonterminal name to its list of productions.
                   Each production is a list of symbol names; the empty
                   list [] stands for an eps-production.
            start: the start symbol, or None when none has been set.
    """
    def __init__(self):
        self.prod = defaultdict(list)
        self.start = None

    def set_start_symbol(self, start):
        """ Set the start symbol of the grammar.
        """
        self.start = start

    def add_prod(self, lhs, rhs):
        """ Add production to the grammar. 'rhs' can
            be several productions separated by '|'.
            Each production is a sequence of symbols
            separated by whitespace.
            Empty strings are interpreted as an eps-production.

            Usage:
                grammar.add_prod('NT', 'VP PP')
                grammar.add_prod('Digit', '1|2|3|4')

                # Optional Digit: digit or eps
                grammar.add_prod('Digit_opt', 'Digit |')
        """
        # str.split() with no arguments drops all whitespace, so an
        # empty or blank alternative becomes [] -- the internal
        # representation of an eps-production.
        #
        for alternative in rhs.split('|'):
            self.prod[lhs].append(alternative.split())

    def remove_eps_productions(self):
        """ Removes epsilon productions from the grammar.

            The algorithm:

            1. Pick a nonterminal p_eps with an epsilon production
            2. Remove that epsilon production
            3. For each production containing p_eps, replace it
               with several productions such that all the
               combinations of p_eps being there or not will be
               represented.
            4. If there are still epsilon productions in the
               grammar, go back to step 1

            The replication can be demonstrated with an example.
            Suppose that A contains an epsilon production, and
            we've found a production B:: [A, k, A]
            Then this production of B will be replaced with these:
            [A, k], [k], [k, A], [A, k, A]

            Epsilon productions of the start symbol are left in
            place (see _find_eps_production).

            A nonterminal is processed at most once. Step 3 can
            re-derive an epsilon production for a nonterminal that
            was already processed (e.g. A:: [A] | [], or A:: [B] | []
            together with B:: [A]). Such a production is dropped
            instead of being processed again: every use of that
            nonterminal already has its "absent" variants, so it
            would add nothing, and re-processing it is what makes
            the naive algorithm loop forever on cyclic grammars.

            The productions of every nonterminal end up sorted and
            free of duplicates.
        """
        # Nonterminals whose epsilon production was already eliminated.
        #
        handled = set()

        while True:
            # Find an epsilon production
            #
            p_eps, index = self._find_eps_production()

            # No epsilon productions? Then we're done...
            #
            if p_eps is None:
                break

            # Remove the epsilon production
            #
            del self.prod[p_eps][index]
            handled.add(p_eps)

            # Now find all the productions that mention p_eps and
            # replicate each of them with all the combinations of
            # p_eps being present or absent.
            # Iterate over a snapshot of the keys since the values
            # are rebound inside the loop.
            #
            for lhs in list(self.prod):
                expanded = []

                for production in self.prod[lhs]:
                    occurrences = production.count(p_eps)
                    if occurrences == 0:
                        expanded.append(production)
                    else:
                        expanded.extend(self._create_prod_combinations(
                            prod=production,
                            nt=p_eps,
                            count=occurrences))

                # Never hand an epsilon production back to a
                # nonterminal that was already processed; this also
                # clears duplicate epsilon productions of p_eps itself.
                #
                if lhs in handled:
                    expanded = [p for p in expanded if p]

                # Remove duplicates: after sorting, equal productions
                # are adjacent.
                #
                expanded.sort()
                self.prod[lhs] = [
                    p for i, p in enumerate(expanded)
                    if i == 0 or p != expanded[i - 1]]

    def _find_eps_production(self):
        """ Finds an epsilon production in the grammar. If such
            a production is found, returns the pair (lhs, index):
            the name of the non-terminal that has an epsilon
            production and its index in lhs's list of productions.
            If no epsilon productions were found, returns the
            pair (None, None).

            Note: eps productions in the start symbol will be
            ignored, because we don't want to remove them.
        """
        for lhs in self.prod:
            if self.start is not None and lhs == self.start:
                continue

            for i, production in enumerate(self.prod[lhs]):
                if not production:
                    return lhs, i

        return None, None

    def _create_prod_combinations(self, prod, nt, count):
        """ prod:
                A production (list) that contains at least one
                instance of 'nt'
            nt:
                The non-terminal which should be replicated
            count:
                The amount of times 'nt' appears in 'prod'.
                Assumed to be >= 1

            Returns the generated list of productions: one for
            every way of keeping or dropping each occurrence of
            'nt' (2^count in total, the all-dropped one first and
            the unchanged production last). Other symbols are
            always kept, in their original order.
        """
        # The combinations are a kind of a powerset. Membership
        # in a powerset can be checked by using the binary
        # representation of a number: bit k of 'mask' decides
        # whether the k-th occurrence of 'nt' is kept.
        #
        new_prods = []

        for mask in range(1 << count):
            nth_nt = 0
            new_prod = []

            for symbol in prod:
                if symbol != nt:
                    new_prod.append(symbol)
                    continue

                if mask & (1 << nth_nt):
                    new_prod.append(symbol)
                nth_nt += 1

            new_prods.append(new_prod)

        return new_prods
| 158 | + |
| 159 | + |
| 160 | + |
| 161 | +#----------------------------------------------------------------- |
if __name__ == "__main__":
    # Demo: build a small grammar, eliminate its epsilon productions
    # and print what is left, one nonterminal per line.
    #
    cfg = CFG()

    # Alternative sample grammars are kept below as '#~' comments.
    # To try one, uncomment its group and comment out the active
    # grammar.
    #
    #~ cfg.add_prod('B', 'A z A | A p | c r')
    #~ cfg.add_prod('A', 'a | | c k')

    #~ cfg.set_start_symbol('S')
    #~ cfg.add_prod('S', 'A B | A')
    #~ cfg.add_prod('A', 'A a | | C c')
    #~ cfg.add_prod('B', 'C | b')
    #~ cfg.add_prod('C', 'C v | w |')

    # Active grammar: a function call with an optional argument list.
    #
    cfg.add_prod('func_call', 'identifier ( arguments_opt )')
    cfg.add_prod('arguments_opt', 'arguments_list | ')
    cfg.add_prod('arguments_list', 'argument | argument , arguments_list')

    #~ cfg.add_prod('B', 'A z A')
    #~ cfg.add_prod('A', 'a | ')


    #~ cfg.add_prod('S', 'A B')
    #~ cfg.add_prod('A', 'A A B | | a')
    #~ cfg.add_prod('B', 'C D C | A | b |')
    #~ cfg.add_prod('C', 'c |')
    #~ cfg.add_prod('D', 'd')

    cfg.remove_eps_productions()
    for p in cfg.prod:
        # A single pre-formatted argument prints the same text whether
        # 'print' is a statement (Python 2) or a function (Python 3).
        #
        print('%s ::  %s' % (p, [' '.join(pr) for pr in cfg.prod[p]]))

    #~ print(cfg._create_prod_combinations(['A', 'b', 'c', 'A'], 'A', 2))
0 commit comments