
Commit 1ce6666
Parser: improve error message handling
* use a single `on_error` handler with error level and message arguments
* remove the `Warning` token type; it was never handled anyway
* improve `#error` and `#warning` message parsing consistency
* make `num_error` messages non-fatal
* fix `#warning` behavior, add tests
1 parent e80b1d4 · commit 1ce6666

File tree (7 files changed, +81 -64 lines changed):

ast_utils/constants.c2
parser/c2_parser.c2
parser/c2_tokenizer.c2
parser/token.c2
test/parser/preprocessor_directives.c2
tools/c2cat.c2
tools/tester/test_db.c2

ast_utils/constants.c2

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ public const u32 MaxScopes = 32;
 public const u32 MaxIdentifierLen = 31;
 //public const u32 MaxFeatureName = 31;
 public const u32 MaxFeatureDepth = 6;
-public const u32 MaxErrorMsgLen = 31;  // for #error "msg"
+public const u32 MaxErrorMsgLen = 127; // for #error "msg"

 public const u32 MaxMultiString = 64*1024;
 public const u32 MaxMultiDeclBits = 4;
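With the limit raised from 31 to 127 bytes, longer directive messages now fit. A minimal sketch (message text invented, wrapped in `#if 0` so it stays benign):

#if 0
#error this message is well past the old 31-byte cap but fits comfortably inside the new 127-byte maximum
#endif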

parser/c2_parser.c2

Lines changed: 15 additions & 15 deletions

@@ -142,8 +142,7 @@ public fn void Parser.parse(Parser* p, i32 file_id, bool is_interface, bool is_g
         p.sm.get_offset(p.file_id),
         p.kwinfo,
         p.features,
-        on_tokenizer_error,
-        on_tokenizer_warning,
+        Parser.on_tokenizer_error,
         p,
         false);
     p.tok.init();

@@ -159,17 +158,22 @@ public fn void Parser.parse(Parser* p, i32 file_id, bool is_interface, bool is_g
     buf.free();
 }

-fn void on_tokenizer_error(void* arg, SrcLoc loc) {
+fn void Parser.on_tokenizer_error(void* arg, c2_tokenizer.ErrorLevel level, SrcLoc loc, const char* msg) {
     Parser* p = arg;
-    // NOTE: cannot use p.tok.error_msg, because of possible lookahead (changes token)
-    p.tok.loc = loc;
-    // will longjmp
-    p.error("%s", p.tokenizer.error_msg);
-}

-fn void on_tokenizer_warning(void* arg, SrcLoc loc) {
-    Parser* p = arg;
-    p.diags.error(loc, "%s", p.tokenizer.error_msg);
+    switch (level) {
+    case Note:
+        p.diags.note(loc, "%s", msg);
+        break;
+    case Warning:
+        p.diags.warn(loc, "%s", msg);
+        break;
+    default:
+        p.diags.error(loc, "%s", msg);
+        break;
+    }
+    if (level == c2_tokenizer.ErrorLevel.FatalError)
+        longjmp(&p.jmpbuf, 1);
 }

 fn void Parser.consumeToken(Parser* p) {

@@ -872,10 +876,6 @@ fn void Parser.dump_token(Parser* p, const Token* tok) @(unused) {
         out.add(p.pool.idx2str(tok.text_idx));
         out.add("*/");
         break;
-    case Warning:
-        out.color(color.Yellow);
-        out.add(tok.error_msg);
-        break;
     case Error:
         out.color(color.Red);
         out.add(p.tokenizer.error_msg);
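For illustration, a minimal standalone sketch (not part of the commit) of how another client could supply the single unified callback. The `example` module, the `report` function, and the `src_loc` import are assumptions; the signature follows the `ErrorFn` type added in c2_tokenizer.c2 below:

module example;

import c2_tokenizer local;  // for ErrorLevel and ErrorFn
import src_loc local;       // assumed module providing SrcLoc (u32)
import stdio local;

// matches ErrorFn: fn void (void* arg, ErrorLevel level, SrcLoc loc, const char* msg)
fn void report(void* arg, ErrorLevel level, SrcLoc loc, const char* msg) {
    // arg could carry tool state; unused in this sketch
    switch (level) {
    case Note:
        printf("loc %u: note: %s\n", loc, msg);
        break;
    case Warning:
        printf("loc %u: warning: %s\n", loc, msg);
        break;
    default:    // Error and FatalError
        printf("loc %u: error: %s\n", loc, msg);
        break;
    }
}

Registration would mirror the updated c2cat call site: `tokenizer.init(pool, buf, input, 0, &kwinfo, &features, report, nil, true)`.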

parser/c2_tokenizer.c2

Lines changed: 41 additions & 38 deletions

@@ -253,7 +253,8 @@ public type Feature struct {
     bool is_else; // inside the #else block
 }

-public type HandlerFn fn void (void* arg, SrcLoc loc);
+public type ErrorLevel enum u8 { Note, Warning, Error, FatalError }
+public type ErrorFn fn void (void* arg, ErrorLevel level, SrcLoc loc, const char* msg);

 public type Tokenizer struct {
     const char* cur;

@@ -265,9 +266,8 @@ public type Tokenizer struct {

     string_pool.Pool* pool; // no ownership
     string_buffer.Buf* buf; // no ownership, used for strings and character constants
-    HandlerFn on_error;
-    HandlerFn on_warning;
-    void* fn_arg;
+    ErrorFn on_error;
+    void* on_error_arg;

     // Feature handling
     Feature[constants.MaxFeatureDepth+1] feature_stack;

@@ -278,7 +278,7 @@ public type Tokenizer struct {

     char[256] error_msg;
 }
-static_assert(416, sizeof(Tokenizer));
+static_assert(408, sizeof(Tokenizer));

 public fn void Tokenizer.init(Tokenizer* t,
     string_pool.Pool* pool,

@@ -287,9 +287,8 @@ public fn void Tokenizer.init(Tokenizer* t,
     SrcLoc loc_start,
     const keywords.Info* kwinfo,
     const string_list.List* features,
-    HandlerFn on_error,
-    HandlerFn on_warning,
-    void* fn_arg,
+    ErrorFn on_error,
+    void* on_error_arg,
     bool raw_mode)
 {
     string.memset(t, 0, sizeof(Tokenizer));

@@ -302,8 +301,7 @@ public fn void Tokenizer.init(Tokenizer* t,
     t.pool = pool;
     t.buf = buf;
     t.on_error = on_error;
-    t.on_warning = on_warning;
-    t.fn_arg = fn_arg;
+    t.on_error_arg = on_error_arg;

     t.features = features;
     t.raw_mode = raw_mode;

@@ -684,7 +682,7 @@ fn void Tokenizer.error(Tokenizer* t, Token* result, const char* format @(printf
     result.kind = Kind.Error;
     result.error_msg = t.error_msg;
     result.done = true;
-    if (t.on_error) t.on_error(t.fn_arg, result.loc);
+    if (t.on_error) t.on_error(t.on_error_arg, FatalError, result.loc, t.error_msg);
 }

 // generate an error but keep parsing

@@ -694,8 +692,7 @@ fn void Tokenizer.num_error(Tokenizer* t, Token* result, const char* p, const ch
     vsnprintf(t.error_msg, sizeof(t.error_msg), format, args);
     va_end(args);

-    // XXX: error position should be passed separately from token start
-    result.loc = t.loc_start + (SrcLoc)(p - t.input_start);
+    SrcLoc err_loc = t.loc_start + (SrcLoc)(p - t.input_start);
     // read the rest of the pp-number token
     for (;;) {
         if ((*p == 'e' || *p == 'E' || *p == 'p' || *p == 'P') && (p[1] == '+' || p[1] == '-')) {

@@ -712,7 +709,8 @@
     }
     t.cur = p;
     result.len = (u16)((p - t.input_start) - (result.loc - t.loc_start));
-    if (t.on_warning) t.on_warning(t.fn_arg, result.loc);
+    // This is a non fatal error: keep parsing but do not analyse
+    if (t.on_error) t.on_error(t.on_error_arg, Error, err_loc, t.error_msg);
 }

 fn void Tokenizer.lex_identifier(Tokenizer* t, Token* result) {
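The effect of the `num_error` change: the diagnostic now points at the offending character (`err_loc`) instead of overwriting `result.loc`, it is reported at level `Error` rather than through the removed warning callback, and tokenizing continues. A hypothetical input (exact diagnostic wording not taken from the compiler):

fn u32 example() {
    u32 x = 0x12g4;  // error reported at the 'g'; the rest of the
                     // pp-number is consumed and lexing continues,
                     // so diagnostics after this line still appear
    return x;
}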
@@ -1435,14 +1433,11 @@ fn bool Tokenizer.lex_feature_cmd(Tokenizer* t, Token* result) {
     case Feat_ifdef:
     case Feat_ifndef:
     case Feat_elif:
-        if (t.handle_if(result, kind)) return true;
-        break;
+        return t.handle_if(result, kind);
     case Feat_else:
-        if (t.handle_else(result)) return true;
-        break;
+        return t.handle_else(result);
     case Feat_endif:
-        if (t.handle_endif(result)) return true;
-        break;
+        return t.handle_endif(result);
     case Feat_error:
     case Feat_warning:
         if (!t.is_enabled()) return false; // if disabled, dont care if anything else

@@ -1466,29 +1461,37 @@ fn bool Tokenizer.at_bol(Tokenizer* t) {
 }

 fn bool Tokenizer.parse_error_warn(Tokenizer* t, Token* result, Kind kind) {
-    const char* start = t.cur;
-    while (*t.cur != '\0' && *t.cur != '\r' && *t.cur != '\n')
-        t.cur++;
-    usize len = (usize)(t.cur - start);
-    if (len > constants.MaxErrorMsgLen) {
-        t.error(result, "error msg too long (max %d bytes)", constants.MaxErrorMsgLen);
-        return true;
+    Token tok;
+
+    // parse pptokens instead of raw text
+    string_buffer.Buf msg.init(t.error_msg, elemsof(t.error_msg), false, false, 0);
+    SrcLoc last_loc = 0;
+    while (t.lex_preproc(&tok) != Kind.Eof) {
+        // replace blanks with a single space
+        if (last_loc && last_loc < tok.loc) msg.add1(' ');
+        // copy string text or token source
+        if (tok.kind == Kind.StringLiteral) {
+            msg.add2(t.pool.idx2str(tok.text_idx), tok.text_len);
+        } else {
+            msg.add2(t.input_start + (tok.loc - t.loc_start), tok.len);
+        }
+        last_loc = tok.loc + tok.len;
     }
-    char[constants.MaxErrorMsgLen+1] msg;
-    string.memcpy(msg, start, len);
-    msg[len] = 0;
+    msg.size(); // ensure null terminator

     if (kind == Kind.Feat_error) {
-        t.cur = t.line_start;
-        t.error(result, "%s", msg);
-    } else {
-        // TODO: output diagnostic synchronously
-        string.strcpy(t.error_msg, msg);
-        result.kind = Kind.Warning;
-        result.len = (u16)((t.cur - t.input_start) - (result.loc - t.loc_start));
+        const char* start = t.input_start + (result.loc - t.loc_start);
+        result.kind = Kind.Error;
+        result.done = true;
+        result.len = (u16)(t.cur - start);
         result.error_msg = t.error_msg;
+        t.cur = start; // make #error sticky
+        if (t.on_error) t.on_error(t.on_error_arg, FatalError, result.loc, t.error_msg);
+        return true; // return error token with result.done set
+    } else {
+        if (t.on_error) t.on_error(t.on_error_arg, Warning, result.loc, t.error_msg);
+        return false; // continue reading tokens
     }
-    return true;
 }

 fn bool Tokenizer.is_enabled(const Tokenizer* t) {
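Because the message is now rebuilt from pptokens, comments disappear and whitespace runs collapse to single spaces (and, per the `StringLiteral` branch, string contents are presumably copied without their quotes). A sketch of the expected normalization:

// hypothetical input:
#warning /* comment */ this     is   a warning
// collected message: "this is a warning"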

parser/token.c2

Lines changed: 0 additions & 2 deletions

@@ -146,7 +146,6 @@ public type Kind enum u8 {
     BlockComment,
     // Special Tokens
     Eof,
-    Warning,
     Error,
 }

@@ -285,7 +284,6 @@ const char*[] token_names = {
     [Kind.LineComment] = "l-comment",
     [Kind.BlockComment] = "b-comment",
     [Kind.Eof] = "eof",
-    [Kind.Warning] = "warning",
     [Kind.Error] = "error",
 }

test/parser/preprocessor_directives.c2

Lines changed: 6 additions & 1 deletion

@@ -25,9 +25,14 @@ const u32 Z = 2;
 static_assert(Z, 1);

 #if 0
-#warning /* comment */ this is a warning
+#warning /* comment */ this is a disabled warning
 #endif

+/**/ // @warning{this is a warning} +1
+#warning this is a warning
+/**/ // @warning{this is a warning} +1
+#warning /* comment */ this is a warning
+
 public fn i32 main() {
     return 0;
 }
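The `+1` suffix on each `@warning{...}` tag tells the tester to expect the diagnostic one line below the tag comment, i.e. on the `#warning` line itself; the suffix parsing lives in tools/tester/test_db.c2 below. A hypothetical variant, assuming offsets other than 1 behave the same way:

/**/ // @warning{two lines down} +2
// filler comment line
#warning two lines down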

tools/c2cat.c2

Lines changed: 1 addition & 1 deletion

@@ -306,7 +306,7 @@ public fn i32 c2cat(const char* filename)
     keywords.Info kwinfo;
     kwinfo.init(ctx.pool);
     c2_tokenizer.Tokenizer tokenizer;
-    tokenizer.init(ctx.pool, buf, ctx.input, 0, &kwinfo, &features, nil, nil, nil, true);
+    tokenizer.init(ctx.pool, buf, ctx.input, 0, &kwinfo, &features, nil, nil, true);
     ctx.tokenizer = &tokenizer;

     Token tok;

tools/tester/test_db.c2

Lines changed: 17 additions & 6 deletions

@@ -15,6 +15,7 @@

 module test_db;

+import ctype local;
 import c_errno local;
 import stdio local;
 import stdarg local;

@@ -491,24 +492,34 @@ fn void Db.parseTags(Db* db, const char* start, const char* end) {
     char[128] msg;
     if (!db.readUntil(msg, elemsof(msg), cp, '}', "message"))
         return;
+
+    // adjust line number for some special cases
+    u32 line = db.line_nr - db.line_offset;
+    for (const char* suff = cp + strlen(msg) + 1; suff < end; suff++) {
+        if (!isspace(*suff)) {
+            line += atoi(suff);
+            break;
+        }
+    }
+
     switch (kind) {
     case ERROR:
 #if TesterDebug
-        color_print(color.Blue, " expecting error '%s' at %d", msg, db.line_nr - db.line_offset);
+        color_print(color.Blue, " expecting error '%s' at %d", msg, line);
 #endif
-        db.errors.add(db.current_file, db.line_nr - db.line_offset, msg);
+        db.errors.add(db.current_file, line, msg);
         break;
     case WARNING:
 #if TesterDebug
-        color_print(color.Blue, " expecting warning '%s' at %d", msg, db.line_nr - db.line_offset);
+        color_print(color.Blue, " expecting warning '%s' at %d", msg, line);
 #endif
-        db.warnings.add(db.current_file, db.line_nr - db.line_offset, msg);
+        db.warnings.add(db.current_file, line, msg);
         break;
     case NOTE:
 #if TesterDebug
-        color_print(color.Blue, " expecting note '%s' at %d", msg, db.line_nr - db.line_offset);
+        color_print(color.Blue, " expecting note '%s' at %d", msg, line);
 #endif
-        db.notes.add(db.current_file, db.line_nr - db.line_offset, msg);
+        db.notes.add(db.current_file, line, msg);
         break;
     }
 }
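A worked example of the new offset handling (line numbers invented): after `readUntil` copies the message, `cp + strlen(msg) + 1` points just past the closing `}`; the loop skips whitespace, and `atoi` converts the first non-space run into an offset added to the expected line.

// tag comment sits on line 30 of a test file:
//   /**/ // @warning{this is a warning} +1
// msg    = "this is a warning"
// suffix = "+1" -> atoi() == 1
// expected line = 30 + 1 = 31, the line holding the #warning directive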
