Skip to content

Commit 88f3b3d

Browse files
committed
Add support for the \\x and \\u metacharacters.
1 parent 0f06934 commit 88f3b3d

File tree

2 files changed

+34
-11
lines changed

2 files changed

+34
-11
lines changed

core/src/main/java/com/github/simy4/coregex/core/CoregexParser.java

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -204,15 +204,14 @@ private Coregex elementaryRE(Context ctx) {
204204
*/
205205
private Coregex.Literal quoted(Context ctx) {
206206
ctx.match('Q');
207-
int comments = ctx.flags & Pattern.COMMENTS;
208-
ctx.flags &= ~Pattern.COMMENTS;
207+
ctx.flags |= Pattern.LITERAL;
209208
StringBuilder literal = new StringBuilder();
210209
do {
211210
literal.append(ctx.span(ch -> '\\' != ch));
212211
ctx.match('\\');
213212
} while ('E' != ctx.peek() && (literal.append('\\') != null));
213+
ctx.flags &= ~Pattern.LITERAL;
214214
ctx.match('E');
215-
ctx.flags |= comments;
216215
return new Coregex.Literal(literal.toString());
217216
}
218217

@@ -574,8 +573,8 @@ private static final class Context {
574573
private static final char EOF = '\uFFFF';
575574

576575
private final String regex;
577-
private final char[] tokens = {SKIP, SKIP};
578-
private int flags, cursor, charsCursor;
576+
private final char[] tokens = {SKIP, SKIP, SKIP, SKIP};
577+
private int flags, cursor, tokensCursor;
579578

580579
Context(String regex, int flags) {
581580
this.regex = regex;
@@ -591,8 +590,8 @@ char peek() {
591590
}
592591

593592
char peek(int i) {
594-
for (; charsCursor < i; charsCursor++) {
595-
tokens[charsCursor] = token();
593+
for (; tokensCursor < i; tokensCursor++) {
594+
tokens[tokensCursor] = token();
596595
}
597596
return tokens[i - 1];
598597
}
@@ -601,7 +600,7 @@ void match(char ch) {
601600
if (ch != peek()) {
602601
error(String.valueOf(ch));
603602
}
604-
charsCursor--;
603+
tokensCursor--;
605604
tokens[0] = tokens[1];
606605
tokens[1] = SKIP;
607606
}
@@ -632,12 +631,12 @@ private char token() {
632631
case '\f':
633632
case '\r':
634633
case '\n':
635-
if (0 != (flags & Pattern.COMMENTS)) {
634+
if (0 != (flags & Pattern.COMMENTS) && 0 == (flags & Pattern.LITERAL)) {
636635
ch = SKIP;
637636
}
638637
break;
639638
case '#':
640-
if (0 != (flags & Pattern.COMMENTS)) {
639+
if (0 != (flags & Pattern.COMMENTS) && 0 == (flags & Pattern.LITERAL)) {
641640
while (cursor < regex.length()
642641
&& ('\n' != (ch = regex.charAt(cursor))
643642
&& (0 == (flags & Pattern.UNIX_LINES) || '\r' != ch))) {
@@ -646,6 +645,26 @@ private char token() {
646645
ch = SKIP;
647646
}
648647
break;
648+
case '\\':
649+
if (0 != (flags & Pattern.LITERAL)) {
650+
break;
651+
}
652+
switch (cursor + 1 < regex.length() ? regex.charAt(cursor + 1) : EOF) {
653+
case 'u':
654+
cursor += 2;
655+
String u = regex.substring(cursor, cursor + 4);
656+
char[] chars = Character.toChars(Integer.parseInt(u, 16));
657+
System.arraycopy(chars, 0, tokens, tokensCursor, chars.length);
658+
ch = chars[0];
659+
cursor += 3;
660+
break;
661+
case 'x':
662+
cursor += 2;
663+
String x = regex.substring(cursor, cursor + 2);
664+
ch = (char) Integer.parseInt(x, 16);
665+
cursor++;
666+
break;
667+
}
649668
}
650669
cursor++;
651670
} while (SKIP == ch);

core/src/test/scala/com/github/simy4/coregex/core/CoregexParserSuite.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,11 @@ class CoregexParserSuite extends ScalaCheckSuite with CoregexArbitraries {
659659
),
660660
3,
661661
6
662-
) -> Pattern.compile("((?i)[a-z]+(?-i)-[A-Z]){3,6}"),
662+
) -> Pattern.compile("((?i)[a-z]+(?-i)-[A-Z]){3,6}"),
663+
new Coregex.Concat(
664+
new Coregex.Set(Set.builder().range(0x61, 0x7a).build()),
665+
new Coregex.Set(Set.builder().range('\u0041', '\u005a').build())
666+
) -> Pattern.compile("[\\x61-\\x7a][\\u0041-\\u005a]"),
663667
Coregex.empty() -> Pattern.compile("^(?:||)$")
664668
)
665669
rng <- List(new RandomRNG())

0 commit comments

Comments
 (0)