Add support for the \\x and \\u metacharacters.

SimY4 · SimY4 · commit 88f3b3dab3db · 2023-08-25T22:01:31.000+10:00
diff --git a/core/src/main/java/com/github/simy4/coregex/core/CoregexParser.java b/core/src/main/java/com/github/simy4/coregex/core/CoregexParser.java
@@ -204,15 +204,14 @@ private Coregex elementaryRE(Context ctx) {
    */
   private Coregex.Literal quoted(Context ctx) {
     ctx.match('Q');
-    int comments = ctx.flags & Pattern.COMMENTS;
-    ctx.flags &= ~Pattern.COMMENTS;
+    ctx.flags |= Pattern.LITERAL;
     StringBuilder literal = new StringBuilder();
     do {
       literal.append(ctx.span(ch -> '\\' != ch));
       ctx.match('\\');
     } while ('E' != ctx.peek() && (literal.append('\\') != null));
+    ctx.flags &= ~Pattern.LITERAL;
     ctx.match('E');
-    ctx.flags |= comments;
     return new Coregex.Literal(literal.toString());
   }
 
@@ -574,8 +573,8 @@ private static final class Context {
     private static final char EOF = '\uFFFF';
 
     private final String regex;
-    private final char[] tokens = {SKIP, SKIP};
-    private int flags, cursor, charsCursor;
+    private final char[] tokens = {SKIP, SKIP, SKIP, SKIP};
+    private int flags, cursor, tokensCursor;
 
     Context(String regex, int flags) {
       this.regex = regex;
@@ -591,8 +590,8 @@ char peek() {
     }
 
     char peek(int i) {
-      for (; charsCursor < i; charsCursor++) {
-        tokens[charsCursor] = token();
+      for (; tokensCursor < i; tokensCursor++) {
+        tokens[tokensCursor] = token();
       }
       return tokens[i - 1];
     }
@@ -601,7 +600,7 @@ void match(char ch) {
       if (ch != peek()) {
         error(String.valueOf(ch));
       }
-      charsCursor--;
+      tokensCursor--;
       tokens[0] = tokens[1];
       tokens[1] = SKIP;
     }
@@ -632,12 +631,12 @@ private char token() {
           case '\f':
           case '\r':
           case '\n':
-            if (0 != (flags & Pattern.COMMENTS)) {
+            if (0 != (flags & Pattern.COMMENTS) && 0 == (flags & Pattern.LITERAL)) {
               ch = SKIP;
             }
             break;
           case '#':
-            if (0 != (flags & Pattern.COMMENTS)) {
+            if (0 != (flags & Pattern.COMMENTS) && 0 == (flags & Pattern.LITERAL)) {
               while (cursor < regex.length()
                   && ('\n' != (ch = regex.charAt(cursor))
                       && (0 == (flags & Pattern.UNIX_LINES) || '\r' != ch))) {
@@ -646,6 +645,26 @@ private char token() {
               ch = SKIP;
             }
             break;
+          case '\\':
+            if (0 != (flags & Pattern.LITERAL)) {
+              break;
+            }
+            switch (cursor + 1 < regex.length() ? regex.charAt(cursor + 1) : EOF) {
+              case 'u':
+                cursor += 2;
+                String u = regex.substring(cursor, cursor + 4);
+                char[] chars = Character.toChars(Integer.parseInt(u, 16));
+                System.arraycopy(chars, 0, tokens, tokensCursor, chars.length);
+                ch = chars[0];
+                cursor += 3;
+                break;
+              case 'x':
+                cursor += 2;
+                String x = regex.substring(cursor, cursor + 2);
+                ch = (char) Integer.parseInt(x, 16);
+                cursor++;
+                break;
+            }
         }
         cursor++;
       } while (SKIP == ch);
diff --git a/core/src/test/scala/com/github/simy4/coregex/core/CoregexParserSuite.scala b/core/src/test/scala/com/github/simy4/coregex/core/CoregexParserSuite.scala
@@ -659,7 +659,11 @@ class CoregexParserSuite extends ScalaCheckSuite with CoregexArbitraries {
           ),
           3,
           6
-        )               -> Pattern.compile("((?i)[a-z]+(?-i)-[A-Z]){3,6}"),
+        ) -> Pattern.compile("((?i)[a-z]+(?-i)-[A-Z]){3,6}"),
+        new Coregex.Concat(
+          new Coregex.Set(Set.builder().range(0x61, 0x7a).build()),
+          new Coregex.Set(Set.builder().range('\u0041', '\u005a').build())
+        )               -> Pattern.compile("[\\x61-\\x7a][\\u0041-\\u005a]"),
         Coregex.empty() -> Pattern.compile("^(?:||)$")
       )
     rng <- List(new RandomRNG())

Original file line number	Diff line number	Diff line change
`@@ -659,7 +659,11 @@ class CoregexParserSuite extends ScalaCheckSuite with CoregexArbitraries {`
`659`	`659`	`),`
`660`	`660`	`3,`
`661`	`661`	`6`
`662`		`- ) -> Pattern.compile("((?i)[a-z]+(?-i)-[A-Z]){3,6}"),`
	`662`	`+ ) -> Pattern.compile("((?i)[a-z]+(?-i)-[A-Z]){3,6}"),`
	`663`	`+ new Coregex.Concat(`
	`664`	`+ new Coregex.Set(Set.builder().range(0x61, 0x7a).build()),`
	`665`	`+ new Coregex.Set(Set.builder().range('\u0041', '\u005a').build())`
	`666`	`+ ) -> Pattern.compile("[\\x61-\\x7a][\\u0041-\\u005a]"),`
`663`	`667`	`Coregex.empty() -> Pattern.compile("^(?:\|\|)$")`
`664`	`668`	`)`
`665`	`669`	`rng <- List(new RandomRNG())`