diff --git a/core/lexer.cpp b/core/lexer.cpp index 31599f4d7..205b509a9 100644 --- a/core/lexer.cpp +++ b/core/lexer.cpp @@ -217,9 +217,11 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati // https://www.json.org/img/number.png // Note, we deviate from the json.org documentation as follows: - // There is no reason to lex negative numbers as atomic tokens, it is better to parse them - // as a unary operator combined with a numeric literal. This avoids x-1 being tokenized as - // <identifier> <number> instead of the intended <identifier> <binop> <number>. + // * There is no reason to lex negative numbers as atomic tokens, it is better to parse them + // as a unary operator combined with a numeric literal. This avoids x-1 being tokenized as + // <identifier> <number> instead of the intended <identifier> <binop> <number>. + // * We support digit separators using the _ character for readability in + // large numeric literals. enum State { BEGIN, @@ -227,9 +229,11 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati AFTER_ONE_TO_NINE, AFTER_DOT, AFTER_DIGIT, + AFTER_UNDERSCORE, AFTER_E, AFTER_EXP_SIGN, - AFTER_EXP_DIGIT + AFTER_EXP_DIGIT, + AFTER_EXP_UNDERSCORE } state; std::string r; @@ -262,6 +266,8 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati case 'e': case 'E': state = AFTER_E; break; + case '_': state = AFTER_UNDERSCORE; goto skip_char; + default: goto end; } break; @@ -284,6 +290,8 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati case '8': case '9': state = AFTER_ONE_TO_NINE; break; + case '_': state = AFTER_UNDERSCORE; goto skip_char; + default: goto end; } break; @@ -325,10 +333,34 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati case '8': case '9': state = AFTER_DIGIT; break; + case '_': state = AFTER_UNDERSCORE; goto skip_char; + default: goto end; } break; + case AFTER_UNDERSCORE: + switch (*c) { + // The only valid transition from _ is to a digit. 
+ case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': state = AFTER_ONE_TO_NINE; break; + + default: { + std::stringstream ss; + ss << "couldn't lex number, junk after _: " << *c; + throw StaticError(filename, begin, ss.str()); + } + } + break; + case AFTER_E: switch (*c) { case '+': @@ -386,12 +418,38 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati case '7': case '8': case '9': state = AFTER_EXP_DIGIT; break; + + case '_': state = AFTER_EXP_UNDERSCORE; goto skip_char; default: goto end; } break; + + case AFTER_EXP_UNDERSCORE: + switch (*c) { + // The only valid transition from _ is to a digit. + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': state = AFTER_EXP_DIGIT; break; + + default: { + std::stringstream ss; + ss << "couldn't lex number, junk after _: " << *c; + throw StaticError(filename, begin, ss.str()); + } + } + break; } r += *c; + +skip_char: c++; } end: diff --git a/core/lexer_test.cpp b/core/lexer_test.cpp index 5b843e580..e1fba13c3 100644 --- a/core/lexer_test.cpp +++ b/core/lexer_test.cpp @@ -117,6 +117,55 @@ TEST(Lexer, TestNumbers) "number 1e+!:1:1: couldn't lex number, junk after exponent sign: !"); } +TEST(Lexer, TestNumbersWithSeparators) +{ + testLex("number 123_456", "123_456", {Token(Token::Kind::NUMBER, "123456")}, ""); + testLex("number 1_750_000", "1_750_000", {Token(Token::Kind::NUMBER, "1750000")}, ""); + testLex("number 1_2_3", "1_2_3", {Token(Token::Kind::NUMBER, "123")}, ""); + testLex("number 3.141_592", "3.141_592", {Token(Token::Kind::NUMBER, "3.141592")}, ""); + testLex("number 01_100", "01_100", {Token(Token::Kind::NUMBER, "0"), Token(Token::Kind::NUMBER, "1100")}, ""); + testLex("number 1_200.0", "1_200.0", {Token(Token::Kind::NUMBER, "1200.0")}, ""); + testLex("number 0e1_01", "0e1_01", {Token(Token::Kind::NUMBER, "0e101")}, ""); + testLex("number 10_10e3", 
"10_10e3", {Token(Token::Kind::NUMBER, "1010e3")}, ""); + testLex("number 2_3e1_2", "2_3e1_2", {Token(Token::Kind::NUMBER, "23e12")}, ""); + testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, ""); + testLex("number 1.1e-10_1", "1.1e-10_1", {Token(Token::Kind::NUMBER, "1.1e-101")}, ""); + testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, ""); + + testLex("number 123456_!", + "123456_!", + {}, + "number 123456_!:1:1: couldn't lex number, junk after _: !"); + testLex("number 123__456", + "123__456", + {}, + "number 123__456:1:1: couldn't lex number, junk after _: _"); + testLex("number 1_200_.0", + "1_200_.0", + {}, + "number 1_200_.0:1:1: couldn't lex number, junk after _: ."); + testLex("number 1_200._0", + "1_200._0", + {}, + "number 1_200._0:1:1: couldn't lex number, junk after decimal point: _"); + testLex("number 1_200_e2", + "1_200_e2", + {}, + "number 1_200_e2:1:1: couldn't lex number, junk after _: e"); + testLex("number 1_200e_2", + "1_200e_2", + {}, + "number 1_200e_2:1:1: couldn't lex number, junk after 'E': _"); + testLex("number 200e-_2", + "200e-_2", + {}, + "number 200e-_2:1:1: couldn't lex number, junk after exponent sign: _"); + testLex("number 200e+_2", + "200e+_2", + {}, + "number 200e+_2:1:1: couldn't lex number, junk after exponent sign: _"); +} + TEST(Lexer, TestDoubleStrings) { testLex("double string \"hi\"", "\"hi\"", {Token(Token::Kind::STRING_DOUBLE, "hi")}, ""); @@ -328,6 +377,7 @@ TEST(Lexer, TestIdentifier) "foo bar123", {Token(Token::Kind::IDENTIFIER, "foo"), Token(Token::Kind::IDENTIFIER, "bar123")}, ""); + testLex("identifier _123", "_123", {Token(Token::Kind::IDENTIFIER, "_123")}, ""); } TEST(Lexer, TestComments) diff --git a/doc/_includes/examples/syntax.jsonnet b/doc/_includes/examples/syntax.jsonnet index eb1ab0d21..8a5288e6c 100644 --- a/doc/_includes/examples/syntax.jsonnet +++ b/doc/_includes/examples/syntax.jsonnet @@ -3,21 +3,6 @@ { cocktails: 
{ // Ingredient quantities are in fl oz. - 'Tom Collins': { - ingredients: [ - { kind: "Farmer's Gin", qty: 1.5 }, - { kind: 'Lemon', qty: 1 }, - { kind: 'Simple Syrup', qty: 0.5 }, - { kind: 'Soda', qty: 2 }, - { kind: 'Angostura', qty: 'dash' }, - ], - garnish: 'Maraschino Cherry', - served: 'Tall', - description: ||| - The Tom Collins is essentially gin and - lemonade. The bitters add complexity. - |||, - }, Manhattan: { ingredients: [ { kind: 'Rye', qty: 2.5 }, @@ -28,5 +13,19 @@ served: 'Straight Up', description: @'A clear \ red drink.', }, + 'Trinidad Sour': { + ingredients: [ + { kind: 'Angostura bitters', qty: 1.333_333 }, + { kind: 'Rye whiskey', qty: 0.5 }, + { kind: 'Fresh lemon juice', qty: 0.75 }, + { kind: 'Orgeat syrup', qty: 1 }, + ], + garnish: 'Lemon twist', + served: 'chilled Nick & Nora glass', + description: ||| + Boldly balanced: 1 1/3 oz Angostura + transforms bitters into the star spirit. + |||, + }, }, } diff --git a/doc/_includes/examples/syntax.jsonnet.golden b/doc/_includes/examples/syntax.jsonnet.golden index 6108519fd..1e9c7b5bf 100644 --- a/doc/_includes/examples/syntax.jsonnet.golden +++ b/doc/_includes/examples/syntax.jsonnet.golden @@ -19,32 +19,28 @@ ], "served": "Straight Up" }, - "Tom Collins": { - "description": "The Tom Collins is essentially gin and\nlemonade. 
The bitters add complexity.\n", - "garnish": "Maraschino Cherry", + "Trinidad Sour": { + "description": "Boldly balanced: 1 1/3 oz Angostura\ntransforms bitters into the star spirit.\n", + "garnish": "Lemon twist", "ingredients": [ { - "kind": "Farmer's Gin", - "qty": 1.5 - }, - { - "kind": "Lemon", - "qty": 1 + "kind": "Angostura bitters", + "qty": 1.333333 }, { - "kind": "Simple Syrup", + "kind": "Rye whiskey", "qty": 0.5 }, { - "kind": "Soda", - "qty": 2 + "kind": "Fresh lemon juice", + "qty": 0.75 }, { - "kind": "Angostura", - "qty": "dash" + "kind": "Orgeat syrup", + "qty": 1 } ], - "served": "Tall" + "served": "chilled Nick & Nora glass" } } } diff --git a/doc/learning/tutorial.html b/doc/learning/tutorial.html index 3ac5520bf..aa9b4927e 100644 --- a/doc/learning/tutorial.html +++ b/doc/learning/tutorial.html @@ -60,6 +60,9 @@

Syntax

  • Verbatim strings @'foo' and @"foo" are for single lines.
  • +
  • + Large numeric literals may be rendered more readable by using underscores, e.g. 1_000_000. +
  • Using the interactive demo below, try modifying the strings / quantities. Try adding a "Dry diff --git a/doc/ref/spec.html b/doc/ref/spec.html index 9cfd6c01b..0338ca624 100644 --- a/doc/ref/spec.html +++ b/doc/ref/spec.html @@ -142,8 +142,19 @@

    Lexing

  • - number: As defined by JSON but without the leading - minus. + number: As defined by JSON, with two exceptions: +

      +
    • + Numeric literals may be rendered with underscores (_) between any two adjacent 0-9 digits + to improve readability. The underscores are discarded by the lexer. +
      + Examples: 1_000_000, 0.000_001, 6.022_140_76e23 +
    • +
    • + Negative numbers are lexed as the - unary operator applied to a non-negative number to + simplify parsing. +
    • +

  • diff --git a/test_suite/digitsep.jsonnet b/test_suite/digitsep.jsonnet new file mode 100644 index 000000000..5237198ce --- /dev/null +++ b/test_suite/digitsep.jsonnet @@ -0,0 +1,19 @@ +local cases = [ + [123_456, "123_456"], + [1_750_000, "1_750_000"], + [1_2_3, "1_2_3"], + [3.141_592, "3.141_592"], + [1_200.0, "1_200.0"], + [0e1_01, "0e1_01"], + [10_10e3, "10_10e3"], + [2_3e1_2, "2_3e1_2"], + [1.1_2e100, "1.1_2e100"], + [1.1e-10_1, "1.1e-10_1"], + [9.109_383_56e-31, "9.109_383_56e-31"], +]; + +local sepParse(s) = std.parseJson(std.strReplace(s, "_", "")); + +{ + test_results: [std.assertEqual(c[0], sepParse(c[1])) for c in cases], +} diff --git a/test_suite/digitsep.jsonnet.golden b/test_suite/digitsep.jsonnet.golden new file mode 100644 index 000000000..e9795a491 --- /dev/null +++ b/test_suite/digitsep.jsonnet.golden @@ -0,0 +1,15 @@ +{ + "test_results": [ + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true + ] +} diff --git a/test_suite/error.std_parseJson.nodigitsep.jsonnet b/test_suite/error.std_parseJson.nodigitsep.jsonnet new file mode 100644 index 000000000..f50d4f63b --- /dev/null +++ b/test_suite/error.std_parseJson.nodigitsep.jsonnet @@ -0,0 +1 @@ +std.parseJson("987_543") diff --git a/test_suite/error.std_parseJson.nodigitsep.jsonnet.golden b/test_suite/error.std_parseJson.nodigitsep.jsonnet.golden new file mode 100644 index 000000000..4ccf27d5f --- /dev/null +++ b/test_suite/error.std_parseJson.nodigitsep.jsonnet.golden @@ -0,0 +1,2 @@ +RUNTIME ERROR: [json.exception.parse_error.101] parse error at line 1, column 4: syntax error while parsing value - invalid literal; last read: '987_'; expected end of input + error.std_parseJson.nodigitsep.jsonnet:1:1-25