Skip to content

Commit

Permalink
dfmc-reader: better warning for Tab in string literals
Browse files Browse the repository at this point in the history
This changes the lexer transitions for string literals (standard, raw, multi-line, and
quoted symbols) to ALLOW tabs, so that when the token is post-processed we can signal a
more helpful warning message.

Because all of the kinds of strings listed above are ultimately processed by decode-string,
the tab checking only needs to happen in one place. The checking piggy-backs on the string
splitter method, find-line-break, since that is already looking at every character in the
string.

This also renames `<invalid-multi-line-string-literal>` to just
`<invalid-string-literal>` because that condition was potentially being signaled for both
multi-line and one-line strings due to the fact that all string literals pass through the
decode-string function. I didn't see a need for two condition classes for invalid string
literals.

Error message before: `Invalid token beginning """ encountered.`

and after: `Invalid string literal: tab character at index 1; use \t or spaces instead`

Fixes dylan-lang#425
  • Loading branch information
cgay committed Jan 22, 2025
1 parent a95679e commit 8754756
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 32 deletions.
4 changes: 2 additions & 2 deletions sources/dfmc/reader/interface.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ define serious-program-warning <character-code-too-large> (<invalid-token>)
format-arguments token-string;
end serious-program-warning;

define serious-program-warning <invalid-multi-line-string-literal> (<invalid-token>)
format-string "Invalid multi-line string literal: %s";
define serious-program-warning <invalid-string-literal> (<invalid-token>)
format-string "Invalid string literal: %s";
format-arguments detail;
end serious-program-warning;

Expand Down
34 changes: 18 additions & 16 deletions sources/dfmc/reader/lexer-transitions.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ define constant $ascii-8-bit-extensions

// Build the state graph and save the initial state.
// Note that transition strings support ranges, like "A-Z".
// Note that string literals allow tabs in order to detect them and give
// a better warning message later.
//
define constant $initial-state :: <state>
= compile-state-machine
Expand Down Expand Up @@ -125,14 +127,14 @@ define constant $initial-state :: <state>
state(#"sharp-double-quote", #f,
#('"' . #"sharp-2-double-quotes"),
#('\\' . #"quoted-symbol-escape"),
#(" !#-[]-~" . #"quoted-symbol"),
#("\t !#-[]-~" . #"quoted-symbol"),
pair($ascii-8-bit-extensions, #"quoted-symbol")),
state(#"sharp-2-double-quotes", make-quoted-symbol,
#('"' . #"3quoted-symbol")),
state(#"quoted-symbol", #f,
#('"' . #"quoted-symbol-end"),
#('\\' . #"quoted-symbol-escape"),
#(" !#-[]-~" . #"quoted-symbol"),
#("\t !#-[]-~" . #"quoted-symbol"),
pair($ascii-8-bit-extensions, #"quoted-symbol")),
state(#"quoted-symbol-escape", #f,
#("\\abefnrt0\"" . #"quoted-symbol"),
Expand All @@ -145,7 +147,7 @@ define constant $initial-state :: <state>
state(#"quoted-symbol-end", make-quoted-symbol),
state(#"3quoted-symbol", #f,
#('"' . #"3quoted-symbol-double-quote"),
#("\r\n !#-[]-~" . #"3quoted-symbol"),
#("\r\n\t !#-[]-~" . #"3quoted-symbol"),
#('\\' . #"3quoted-symbol-escape")),
state(#"3quoted-symbol-escape", #f,
#("\\abefnrt0\"" . #"3quoted-symbol"),
Expand All @@ -157,10 +159,10 @@ define constant $initial-state :: <state>
#('>' . #"3quoted-symbol")),
state(#"3quoted-symbol-double-quote", #f,
#('"' . #"3quoted-symbol-2-double-quotes"),
#("\r\n !#-[]-~" . #"3quoted-symbol")),
#("\r\n\t !#-[]-~" . #"3quoted-symbol")),
state(#"3quoted-symbol-2-double-quotes", #f,
#('"' . #"3quoted-symbol-end"),
#("\r\n !#-[]-~" . #"3quoted-symbol")),
#("\r\n\t !#-[]-~" . #"3quoted-symbol")),
state(#"3quoted-symbol-end", make-multi-line-quoted-symbol),

state(#"sharp-b", #f,
Expand Down Expand Up @@ -509,12 +511,12 @@ define constant $initial-state :: <state>
state(#"double-quote", #f,
#('"' . #"two-double-quotes"),
#('\\' . #"string-escape"),
#(" !#-[]-~" . #"simple-string"),
#("\t !#-[]-~" . #"simple-string"),
pair($ascii-8-bit-extensions, #"simple-string")),
state(#"simple-string", #f,
#('"' . #"end-simple-string"),
#('\\' . #"string-escape"),
#(" !#-[]-~" . #"simple-string"),
#("\t !#-[]-~" . #"simple-string"),
pair($ascii-8-bit-extensions, #"simple-string")),
state(#"end-simple-string", make-string-literal),
state(#"two-double-quotes", make-string-literal,
Expand All @@ -523,7 +525,7 @@ define constant $initial-state :: <state>
state(#"3string", #f, // seen """
#('"' . #"close-double-quote"),
#('\\' . #"3string-escape"),
#(" !#-[]-~\r\n" . #"3string"), // Ranges #-[ and ]-~ exclude backslash
#("\t !#-[]-~\r\n" . #"3string"), // Ranges #-[ and ]-~ exclude backslash
pair($ascii-8-bit-extensions, #"3string")),
state(#"3string-escape", #f,
#("\\'\"abefnrt0" . #"3string"),
Expand All @@ -535,41 +537,41 @@ define constant $initial-state :: <state>
#('>' . #"3string")),
state(#"close-double-quote", #f,
#('"' . #"close-double-quote-2"),
#(" !#-[]-~\r\n" . #"3string"),
#("\t !#-[]-~\r\n" . #"3string"),
pair($ascii-8-bit-extensions, #"3string")),
state(#"close-double-quote-2", #f,
#('"' . #"multi-line-string"),
#(" !#-[]-~\r\n" . #"3string"),
#("\t !#-[]-~\r\n" . #"3string"),
pair($ascii-8-bit-extensions, #"3string")),
state(#"multi-line-string", make-multi-line-string-literal),

// Raw strings
state(#"raw-string-start", #f, // seen #r"
#('"' . #"sharp-r-2-double-quotes"),
#(" !#-~" . #"raw-1string"),
#("\t !#-~" . #"raw-1string"),
pair($ascii-8-bit-extensions, #"raw-1string")),
state(#"sharp-r-2-double-quotes", make-raw-string-literal,
#('"' . #"raw-3string-start")),
state(#"raw-1string", #f, // seen #r" plus one non-" char
#('"' . #"raw-1string-end"),
#(" !#-~" . #"raw-1string"),
#("\t !#-~" . #"raw-1string"),
pair($ascii-8-bit-extensions, #"raw-1string")),
state(#"raw-1string-end", make-raw-string-literal),
state(#"raw-3string-start", #f, // seen #r"""
#('"' . #"raw-3string-double-quote"),
#(" !#-~\r\n" . #"raw-3string"),
#("\t !#-~\r\n" . #"raw-3string"),
pair($ascii-8-bit-extensions, #"raw-3string")),
state(#"raw-3string", #f,
#('"' . #"raw-3string-double-quote"),
#(" !#-~\r\n" . #"raw-3string"),
#("\t !#-~\r\n" . #"raw-3string"),
pair($ascii-8-bit-extensions, #"raw-3string")),
state(#"raw-3string-double-quote", #f,
#('"' . #"raw-3string-2-double-quotes"),
#(" !#-~\r\n" . #"raw-3string"),
#("\t !#-~\r\n" . #"raw-3string"),
pair($ascii-8-bit-extensions, #"raw-3string")),
state(#"raw-3string-2-double-quotes", #f,
#('"' . #"raw-3string-end"),
#(" !#-~\r\n" . #"raw-3string"),
#("\t !#-~\r\n" . #"raw-3string"),
pair($ascii-8-bit-extensions, #"raw-3string")),
state(#"raw-3string-end", make-multi-line-raw-string-literal),

Expand Down
13 changes: 6 additions & 7 deletions sources/dfmc/reader/lexer.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -866,16 +866,13 @@ define method decode-string
=> (string :: <byte-string>)
local
method fail (format-string, #rest format-args)
note(<invalid-multi-line-string-literal>,
note(<invalid-string-literal>,
source-location: source-location,
token-string: extract-string(source-location),
detail: apply(format-to-string,
concatenate("invalid multi-line string literal: ",
format-string),
format-args));
detail: apply(format-to-string, format-string, format-args));
end,
method whitespace-code? (c)
c == $space-code | c == $tab-code
c == $space-code
end,
method find-line-break (seq, bpos, epos)
if (bpos < epos)
Expand All @@ -888,6 +885,8 @@ define method decode-string
else
values(bpos, bpos + 1)
end;
$tab-code =>
fail("tab character at index %d; use \\t or spaces instead", bpos);
otherwise =>
find-line-break(seq, bpos + 1, epos);
end
Expand Down Expand Up @@ -968,7 +967,7 @@ define method decode-string
let contents = source-location.source-location-record.contents;
let parts = split(contents, find-line-break, start: bpos, end: epos);
if (parts.size == 1)
as(<string>, process-line(#f, parts[0])) // e.g., """abc"""
as(<string>, process-line(#f, parts[0])) // e.g., "x" or """x"""
else
let prefix = parts.last;
if (~every?(whitespace-code?, prefix))
Expand Down
2 changes: 1 addition & 1 deletion sources/dfmc/reader/reader-library.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ define module dfmc-reader
<integer-too-large>,
<character-code-too-large>,
<ratios-not-supported>,
<invalid-multi-line-string-literal>,
<invalid-string-literal>,
<invalid-end-of-input>,
<parser-error>,
<manual-parser-error>,
Expand Down
18 changes: 12 additions & 6 deletions sources/dfmc/reader/tests/literal-test-suite.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,15 @@ define test string-literal-test ()
verify-literal(read-fragment(#:string:{"z\<009f>z"}),
map-as(<string>, char, #('z', #x9f, 'z')),
<string-fragment>);
// A one line string literal can't contain a literal Newline.
assert-signals(<invalid-token>, read-fragment("\"\n\""));
assert-signals(<invalid-token>, read-fragment(#:string:{"\1<b>"}));
assert-signals(<invalid-token>,
read-fragment(#:string:{"\1<b>"}),
"invalid hex escape");
assert-signals(<invalid-token>,
read-fragment("\"\n\""),
"Newline not allowed in non-multi-line string literal");
assert-signals(<invalid-string-literal>,
read-fragment("\"\t\""),
"Tab not allowed in string literal");
end test;

// Note: one line as in one line of source code not as in having no newline characters.
Expand Down Expand Up @@ -318,17 +324,17 @@ define test test-multi-line-string-delimiter-rules ()
// sequences instead of with #:string:.
let frag1 = read-fragment("\"\"\" \n abc\n \"\"\"");
assert-equal("abc", frag1.fragment-value);
assert-signals(<invalid-multi-line-string-literal>,
assert-signals(<invalid-string-literal>,
read-fragment(#:string:{"""a (only whitespace allowed after start delim)
abc
"""}),
"junk on first line");
assert-signals(<invalid-multi-line-string-literal>,
assert-signals(<invalid-string-literal>,
read-fragment(#:string:{"""
abc
xxx"""}),
"junk on last line");
assert-signals(<invalid-multi-line-string-literal>,
assert-signals(<invalid-string-literal>,
read-fragment(#:string:{"""
abc
xxx (this line not indented enough)
Expand Down

0 comments on commit 8754756

Please sign in to comment.