dfmc-reader: better warning for Tab in string literals

This changes the lexer transitions for strings (standard, raw, multi-line, and symbols) to ALLOW tabs so that, in the post-processing of the token, we can signal a more helpful message. Because all of those kinds of strings are ultimately processed by decode-string, the tab checking only needs to happen in one place. The checking piggy-backs on the string splitter method, find-line-break, since that's already looking at every character in the string. This also renames `<invalid-multi-line-string-literal>` to just `<invalid-string-literal>` because that condition could be signaled for both multi-line and one-line strings, since all string literals pass through the decode-string function. I didn't see a need for two condition classes for invalid string literals. Error message before: `Invalid token beginning """ encountered.` Error message after: `Invalid string literal: tab character at index 1; use \t or spaces instead`. Fixes dylan-lang#425
cgay · Jan 22, 2025 · 8754756 · 8754756
1 parent a95679e
commit 8754756
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 32 deletions.
diff --git a/sources/dfmc/reader/interface.dylan b/sources/dfmc/reader/interface.dylan
@@ -102,8 +102,8 @@ define serious-program-warning <character-code-too-large> (<invalid-token>)
   format-arguments token-string;
 end serious-program-warning;
 
-define serious-program-warning <invalid-multi-line-string-literal> (<invalid-token>)
-  format-string    "Invalid multi-line string literal: %s";
+define serious-program-warning <invalid-string-literal> (<invalid-token>)
+  format-string    "Invalid string literal: %s";
   format-arguments detail;
 end serious-program-warning;
 

diff --git a/sources/dfmc/reader/lexer-transitions.dylan b/sources/dfmc/reader/lexer-transitions.dylan
@@ -16,6 +16,8 @@ define constant $ascii-8-bit-extensions
 
 // Build the state graph and save the initial state.
 // Note that transition strings support ranges, like "A-Z".
+// Note that string literals allow tabs in order to detect them and give
+// a better warning message later.
 //
 define constant $initial-state :: <state>
   = compile-state-machine
@@ -125,14 +127,14 @@ define constant $initial-state :: <state>
        state(#"sharp-double-quote", #f,
              #('"' . #"sharp-2-double-quotes"),
              #('\\' . #"quoted-symbol-escape"),
-             #(" !#-[]-~" . #"quoted-symbol"),
+             #("\t !#-[]-~" . #"quoted-symbol"),
              pair($ascii-8-bit-extensions, #"quoted-symbol")),
        state(#"sharp-2-double-quotes", make-quoted-symbol,
              #('"' . #"3quoted-symbol")),
        state(#"quoted-symbol", #f,
              #('"' . #"quoted-symbol-end"),
              #('\\' . #"quoted-symbol-escape"),
-             #(" !#-[]-~" . #"quoted-symbol"),
+             #("\t !#-[]-~" . #"quoted-symbol"),
              pair($ascii-8-bit-extensions, #"quoted-symbol")),
        state(#"quoted-symbol-escape", #f,
              #("\\abefnrt0\"" . #"quoted-symbol"),
@@ -145,7 +147,7 @@ define constant $initial-state :: <state>
        state(#"quoted-symbol-end", make-quoted-symbol),
        state(#"3quoted-symbol", #f,
              #('"' . #"3quoted-symbol-double-quote"),
-             #("\r\n !#-[]-~" . #"3quoted-symbol"),
+             #("\r\n\t !#-[]-~" . #"3quoted-symbol"),
              #('\\' . #"3quoted-symbol-escape")),
        state(#"3quoted-symbol-escape", #f,
              #("\\abefnrt0\"" . #"3quoted-symbol"),
@@ -157,10 +159,10 @@ define constant $initial-state :: <state>
              #('>' . #"3quoted-symbol")),
        state(#"3quoted-symbol-double-quote", #f,
              #('"' . #"3quoted-symbol-2-double-quotes"),
-             #("\r\n !#-[]-~" . #"3quoted-symbol")),
+             #("\r\n\t !#-[]-~" . #"3quoted-symbol")),
        state(#"3quoted-symbol-2-double-quotes", #f,
              #('"' . #"3quoted-symbol-end"),
-             #("\r\n !#-[]-~" . #"3quoted-symbol")),
+             #("\r\n\t !#-[]-~" . #"3quoted-symbol")),
        state(#"3quoted-symbol-end", make-multi-line-quoted-symbol),
 
        state(#"sharp-b", #f,
@@ -509,12 +511,12 @@ define constant $initial-state :: <state>
        state(#"double-quote", #f,
              #('"' . #"two-double-quotes"),
              #('\\' . #"string-escape"),
-             #(" !#-[]-~" . #"simple-string"),
+             #("\t !#-[]-~" . #"simple-string"),
              pair($ascii-8-bit-extensions, #"simple-string")),
        state(#"simple-string", #f,
              #('"' . #"end-simple-string"),
              #('\\' . #"string-escape"),
-             #(" !#-[]-~" . #"simple-string"),
+             #("\t !#-[]-~" . #"simple-string"),
              pair($ascii-8-bit-extensions, #"simple-string")),
        state(#"end-simple-string", make-string-literal),
        state(#"two-double-quotes", make-string-literal,
@@ -523,7 +525,7 @@ define constant $initial-state :: <state>
        state(#"3string", #f, // seen """
              #('"' . #"close-double-quote"),
              #('\\' . #"3string-escape"),
-             #(" !#-[]-~\r\n" . #"3string"),  // Ranges #-[ and ]-~ exclude backslash
+             #("\t !#-[]-~\r\n" . #"3string"),  // Ranges #-[ and ]-~ exclude backslash
              pair($ascii-8-bit-extensions, #"3string")),
        state(#"3string-escape", #f,
              #("\\'\"abefnrt0" . #"3string"),
@@ -535,41 +537,41 @@ define constant $initial-state :: <state>
              #('>' . #"3string")),
        state(#"close-double-quote", #f,
              #('"' . #"close-double-quote-2"),
-             #(" !#-[]-~\r\n" . #"3string"),
+             #("\t !#-[]-~\r\n" . #"3string"),
              pair($ascii-8-bit-extensions, #"3string")),
        state(#"close-double-quote-2", #f,
              #('"' . #"multi-line-string"),
-             #(" !#-[]-~\r\n" . #"3string"),
+             #("\t !#-[]-~\r\n" . #"3string"),
              pair($ascii-8-bit-extensions, #"3string")),
        state(#"multi-line-string", make-multi-line-string-literal),
 
        // Raw strings
        state(#"raw-string-start", #f,          // seen #r"
              #('"' . #"sharp-r-2-double-quotes"),
-             #(" !#-~" . #"raw-1string"),
+             #("\t !#-~" . #"raw-1string"),
              pair($ascii-8-bit-extensions, #"raw-1string")),
        state(#"sharp-r-2-double-quotes", make-raw-string-literal,
              #('"' . #"raw-3string-start")),
        state(#"raw-1string", #f,       // seen #r" plus one non-" char
              #('"' . #"raw-1string-end"),
-             #(" !#-~" . #"raw-1string"),
+             #("\t !#-~" . #"raw-1string"),
              pair($ascii-8-bit-extensions, #"raw-1string")),
        state(#"raw-1string-end", make-raw-string-literal),
        state(#"raw-3string-start", #f, // seen #r"""
              #('"' . #"raw-3string-double-quote"),
-             #(" !#-~\r\n" . #"raw-3string"),
+             #("\t !#-~\r\n" . #"raw-3string"),
              pair($ascii-8-bit-extensions, #"raw-3string")),
        state(#"raw-3string", #f,
              #('"' . #"raw-3string-double-quote"),
-             #(" !#-~\r\n" . #"raw-3string"),
+             #("\t !#-~\r\n" . #"raw-3string"),
              pair($ascii-8-bit-extensions, #"raw-3string")),
        state(#"raw-3string-double-quote", #f,
              #('"' . #"raw-3string-2-double-quotes"),
-             #(" !#-~\r\n" . #"raw-3string"),
+             #("\t !#-~\r\n" . #"raw-3string"),
              pair($ascii-8-bit-extensions, #"raw-3string")),
        state(#"raw-3string-2-double-quotes", #f,
              #('"' . #"raw-3string-end"),
-             #(" !#-~\r\n" . #"raw-3string"),
+             #("\t !#-~\r\n" . #"raw-3string"),
              pair($ascii-8-bit-extensions, #"raw-3string")),
        state(#"raw-3string-end", make-multi-line-raw-string-literal),
 

diff --git a/sources/dfmc/reader/lexer.dylan b/sources/dfmc/reader/lexer.dylan
@@ -866,16 +866,13 @@ define method decode-string
  => (string :: <byte-string>)
   local
     method fail (format-string, #rest format-args)
-      note(<invalid-multi-line-string-literal>,
+      note(<invalid-string-literal>,
            source-location: source-location,
            token-string: extract-string(source-location),
-           detail: apply(format-to-string,
-                         concatenate("invalid multi-line string literal: ",
-                                     format-string),
-                         format-args));
+           detail: apply(format-to-string, format-string, format-args));
     end,
     method whitespace-code? (c)
-      c == $space-code | c == $tab-code
+      c == $space-code
     end,
     method find-line-break (seq, bpos, epos)
       if (bpos < epos)
@@ -888,6 +885,8 @@ define method decode-string
             else
               values(bpos, bpos + 1)
             end;
+          $tab-code =>
+            fail("tab character at index %d; use \\t or spaces instead", bpos);
           otherwise =>
             find-line-break(seq, bpos + 1, epos);
         end
@@ -968,7 +967,7 @@ define method decode-string
   let contents = source-location.source-location-record.contents;
   let parts = split(contents, find-line-break, start: bpos, end: epos);
   if (parts.size == 1)
-    as(<string>, process-line(#f, parts[0]))      // e.g., """abc"""
+    as(<string>, process-line(#f, parts[0]))      // e.g., "x" or """x"""
   else
     let prefix = parts.last;
     if (~every?(whitespace-code?, prefix))

diff --git a/sources/dfmc/reader/reader-library.dylan b/sources/dfmc/reader/reader-library.dylan
@@ -71,7 +71,7 @@ define module dfmc-reader
         <integer-too-large>,
         <character-code-too-large>,
         <ratios-not-supported>,
-        <invalid-multi-line-string-literal>,
+        <invalid-string-literal>,
       <invalid-end-of-input>,
       <parser-error>,
       <manual-parser-error>,

diff --git a/sources/dfmc/reader/tests/literal-test-suite.dylan b/sources/dfmc/reader/tests/literal-test-suite.dylan
@@ -221,9 +221,15 @@ define test string-literal-test ()
   verify-literal(read-fragment(#:string:{"z\<009f>z"}),
                  map-as(<string>, char, #('z', #x9f, 'z')),
                  <string-fragment>);
-  // A one line string literal can't contain a literal Newline.
-  assert-signals(<invalid-token>, read-fragment("\"\n\""));
-  assert-signals(<invalid-token>, read-fragment(#:string:{"\1<b>"}));
+  assert-signals(<invalid-token>,
+                 read-fragment(#:string:{"\1<b>"}),
+                 "invalid hex escape");
+  assert-signals(<invalid-token>,
+                 read-fragment("\"\n\""),
+                 "Newline not allowed in non-multi-line string literal");
+  assert-signals(<invalid-string-literal>,
+                 read-fragment("\"\t\""),
+                 "Tab not allowed in string literal");
 end test;
 
 // Note: one line as in one line of source code not as in having no newline characters.
@@ -318,17 +324,17 @@ define test test-multi-line-string-delimiter-rules ()
   // sequences instead of with #:string:.
   let frag1 = read-fragment("\"\"\"   \n  abc\n  \"\"\"");
   assert-equal("abc", frag1.fragment-value);
-  assert-signals(<invalid-multi-line-string-literal>,
+  assert-signals(<invalid-string-literal>,
                  read-fragment(#:string:{"""a  (only whitespace allowed after start delim)
 abc
 """}),
                  "junk on first line");
-  assert-signals(<invalid-multi-line-string-literal>,
+  assert-signals(<invalid-string-literal>,
                  read-fragment(#:string:{"""
 abc
 xxx"""}),
                   "junk on last line");
-  assert-signals(<invalid-multi-line-string-literal>,
+  assert-signals(<invalid-string-literal>,
                  read-fragment(#:string:{"""
    abc
   xxx  (this line not indented enough)