Skip to content

Commit 7d64a90

Browse files
committed
Rewrite SEQ without using recursion.
Allows for very long regexes to be processed. Doesn't require the result to be reversed afterwards.
1 parent 7ad13f5 commit 7d64a90

File tree

2 files changed

+40
-70
lines changed

2 files changed

+40
-70
lines changed

lexer.lisp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -290,8 +290,7 @@ handled elsewhere."
290290
(return))
291291
(write-char char out))))))
292292
(list (if (char= first-char #\p) :property :inverted-property)
293-
;; we must reverse here because of what PARSE-STRING does
294-
(nreverse name))))
293+
name)))
295294

296295
(defun collect-char-class (lexer)
297296
"Reads and consumes characters from regex string until a right
@@ -571,7 +570,7 @@ closing #\> will also be consumed."
571570
;; back-referencing a named register
572571
(incf (lexer-pos lexer))
573572
(list :back-reference
574-
(nreverse (parse-register-name-aux lexer))))
573+
(parse-register-name-aux lexer)))
575574
(t
576575
;; false alarm, just unescape \k
577576
#\k)))

parser.lisp

Lines changed: 38 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,7 @@ Will return <parse-tree> or \(<grouping-type> <parse-tree>) where
128128
;; have been the "(?:"<regex>")" production
129129
(cons :group (nconc flags (list reg-expr)))
130130
(if (eq open-token :open-paren-less-letter)
131-
(list :named-register
132-
;; every string was reversed, so we have to
133-
;; reverse it back to get the name
134-
(nreverse register-name)
131+
(list :named-register register-name
135132
reg-expr)
136133
(list (case open-token
137134
((:open-paren)
@@ -201,54 +198,42 @@ Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
201198
;; to parse a <seq> or <quant> in order to catch empty regular
202199
;; expressions
203200
(if (start-of-subexpr-p lexer)
204-
(let ((quant (quant lexer)))
205-
(if (start-of-subexpr-p lexer)
206-
(let* ((seq (seq lexer))
207-
(quant-is-char-p (characterp quant))
208-
(seq-is-sequence-p (and (consp seq)
209-
(eq (first seq) :sequence))))
210-
(cond ((and quant-is-char-p
211-
(characterp seq))
212-
(make-array-from-two-chars seq quant))
213-
((and quant-is-char-p
214-
(stringp seq))
215-
(vector-push-extend quant seq)
216-
seq)
217-
((and quant-is-char-p
218-
seq-is-sequence-p
219-
(characterp (second seq)))
220-
(cond ((cddr seq)
221-
(setf (cdr seq)
222-
(cons
223-
(make-array-from-two-chars (second seq)
224-
quant)
225-
(cddr seq)))
226-
seq)
227-
(t (make-array-from-two-chars (second seq) quant))))
228-
((and quant-is-char-p
229-
seq-is-sequence-p
230-
(stringp (second seq)))
231-
(cond ((cddr seq)
232-
(setf (cdr seq)
233-
(cons
234-
(progn
235-
(vector-push-extend quant (second seq))
236-
(second seq))
237-
(cddr seq)))
238-
seq)
239-
(t
240-
(vector-push-extend quant (second seq))
241-
(second seq))))
242-
(seq-is-sequence-p
243-
;; if <seq> is also a :SEQUENCE parse tree we merge
244-
;; both lists into one to avoid unnecessary consing
245-
(setf (cdr seq)
246-
(cons quant (cdr seq)))
247-
seq)
248-
(t (list :sequence quant seq))))
249-
quant))
250-
:void)))
251-
201+
(loop with seq-is-sequence-p = nil
202+
with last-cdr
203+
for quant = (quant lexer)
204+
for quant-is-char-p = (characterp quant)
205+
for seq = quant
206+
then
207+
(cond ((and quant-is-char-p (characterp seq))
208+
(make-array-from-two-chars seq quant))
209+
((and quant-is-char-p (stringp seq))
210+
(vector-push-extend quant seq)
211+
seq)
212+
((not seq-is-sequence-p)
213+
(setf last-cdr (list quant)
214+
seq-is-sequence-p t)
215+
(list* :sequence seq last-cdr))
216+
((and quant-is-char-p
217+
(characterp (car last-cdr)))
218+
(setf (car last-cdr)
219+
(make-array-from-two-chars (car last-cdr)
220+
quant))
221+
seq)
222+
((and quant-is-char-p
223+
(stringp (car last-cdr)))
224+
(vector-push-extend quant (car last-cdr))
225+
seq)
226+
(t
227+
;; if <seq> is also a :SEQUENCE parse tree we merge
228+
;; both lists into one
229+
(let ((cons (list quant)))
230+
(psetf last-cdr cons
231+
(cdr last-cdr) cons))
232+
seq))
233+
while (start-of-subexpr-p lexer)
234+
finally (return seq))
235+
:void)))
236+
252237
(defun reg-expr (lexer)
253238
"Parses and consumes a <regex>, a complete regular expression.
254239
The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
@@ -294,25 +279,11 @@ Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
294279
(setf (lexer-pos lexer) pos)
295280
seq)))))))
296281

297-
(defun reverse-strings (parse-tree)
298-
"Recursively walks through PARSE-TREE and destructively reverses all
299-
strings in it."
300-
(declare #.*standard-optimize-settings*)
301-
(cond ((stringp parse-tree)
302-
(nreverse parse-tree))
303-
((consp parse-tree)
304-
(loop for parse-tree-rest on parse-tree
305-
while parse-tree-rest
306-
do (setf (car parse-tree-rest)
307-
(reverse-strings (car parse-tree-rest))))
308-
parse-tree)
309-
(t parse-tree)))
310-
311282
(defun parse-string (string)
312283
"Translate the regex string STRING into a parse tree."
313284
(declare #.*standard-optimize-settings*)
314285
(let* ((lexer (make-lexer string))
315-
(parse-tree (reverse-strings (reg-expr lexer))))
286+
(parse-tree (reg-expr lexer)))
316287
;; check whether we've consumed the whole regex string
317288
(if (end-of-string-p lexer)
318289
parse-tree

0 commit comments

Comments
 (0)