From 649dcb0ce51a695884324e107d04c35da6c941c9 Mon Sep 17 00:00:00 2001
From: Franciszek Stachura
Date: Fri, 25 Apr 2025 01:09:06 +0200
Subject: [PATCH 01/11] filters: Refactor filters list into general project settings

---
 elixir/project_utils.py | 30 +++++++++++++++
 elixir/projects.py      | 82 +++++++++++++++++++++++++++++++++++++++++
 elixir/web.py           |  2 +-
 3 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 elixir/project_utils.py
 create mode 100644 elixir/projects.py

diff --git a/elixir/project_utils.py b/elixir/project_utils.py
new file mode 100644
index 00000000..31523a83
--- /dev/null
+++ b/elixir/project_utils.py
@@ -0,0 +1,30 @@
+import re
+from typing import List
+
+from .filters.utils import Filter, FilterContext
+from .filters import default_filters
+from .projects import projects
+
+# Returns a list of filters applicable to project_name under the provided filter context
+def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]:
+    project_config = projects.get(project_name)
+    if project_config is None or 'filters' not in project_config:
+        filter_classes = default_filters
+    else:
+        filter_classes = project_config['filters']
+
+    filters = []
+
+    for filter_cls in filter_classes:
+        if type(filter_cls) == tuple and len(filter_cls) == 2:
+            cls, kwargs = filter_cls
+            filters.append(cls(**kwargs))
+        elif type(filter_cls) == type:
+            filters.append(filter_cls())
+        else:
+            raise ValueError(f"Invalid filter: {filter_cls}, " \
+                "should be either a two-element tuple or a type. " \
+                "Make sure the filters lists in projects.py are valid.")
+
+    return [f for f in filters if f.check_if_applies(ctx)]
+
diff --git a/elixir/projects.py b/elixir/projects.py
new file mode 100644
index 00000000..90a1ecc3
--- /dev/null
+++ b/elixir/projects.py
@@ -0,0 +1,82 @@
+from .filters import *
+
+# Dictionary of custom per-project settings.
+# filters:
+# Projects not present in this dictionary only use default_filters.
+# Use `*` to unpack the filter lists imported above.
+# You can pass additional options to a filter by putting the Filter
+# class and a dictionary with options in a tuple, like this:
+# (FilterCls, {"option": True}).
+# Check filter files and utils.py for information about available options
+projects = {
+    'amazon-freertos': {
+        'filters': [
+            *default_filters,
+            MakefileSubdirFilter,
+        ],
+    },
+    'arm-trusted-firmware': {
+        'filters': [
+            *default_filters,
+            CppPathIncFilter,
+        ],
+    },
+    'barebox': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            CppPathIncFilter,
+            *common_makefile_filters,
+        ],
+    },
+    'coreboot': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            *common_makefile_filters,
+        ],
+    },
+    'linux': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            *common_makefile_filters,
+            # include/uapi contains includes to user headers under #ifndef __KERNEL__
+            # Our solution is to ignore all includes in such paths
+            (CppPathIncFilter, {"path_exceptions": {'^/include/uapi/.*'}}),
+        ],
+    },
+    'qemu': {
+        'filters': [
+            *default_filters,
+            *common_kconfig_filters,
+        ],
+    },
+    'u-boot': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            CppPathIncFilter,
+            *common_makefile_filters,
+        ],
+    },
+    'uclibc-ng': {
+        'filters': [
+            *default_filters,
+            ConfigInFilter,
+        ],
+    },
+    'zephyr': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            CppPathIncFilter,
+        ],
+    },
+}
+
diff --git a/elixir/web.py b/elixir/web.py
index 2a0cbbbb..514e9cce 100755
--- a/elixir/web.py
+++ b/elixir/web.py
@@ -33,7 +33,7 @@
 from .lib import validFamily
 from .query import Query, SymbolInstance
-from .filters import get_filters
+from .project_utils import get_filters
 from .filters.utils import FilterContext
 from .autocomplete import AutocompleteResource
 from .api import ApiIdentGetterResource

From 4aff55019668eafcd1bbdbeb22674f71b8ea7e85 Mon Sep 17 00:00:00 2001
From: Franciszek Stachura
Date: Thu, 17 Oct 2024 21:26:34 +0200
Subject: [PATCH 02/11] lexers: Add a C lexer

This commit adds a C lexer with a tool to output lexing results.
The lexer is not hooked up to the rest of Elixir yet.

Why:

Currently, Elixir uses a simple, single-regex, Perl-based lexer.
This approach mostly works, but it has a few issues:
* Strings are not parsed correctly - there is a mistake in the regex
  that causes it to get confused by escapes in strings.
* DTS identifiers are not parsed correctly - some valid identifiers are
  not recognized, others are split by otherwise allowed characters
  (mostly commas).
* It barely distinguishes between languages. Comments in Kconfig files
  are not parsed correctly (and are not parseable by a simple regex).
* Only identifiers are handled. Some parts of Elixir (filters,
  doccomments parser) could probably use a more detailed token stream.
  Right now, each of these parts contains a different regex-based
  lexer/parser. This mostly works, but again, it means the same
  functionality is reimplemented in many different parts of Elixir.
* It does not recognize numbers, which means that numbers are looked up
  in the database during updates.
* The keyword blocklist/allowlist is shared between all languages.

Some of these issues could be directly addressed in the regex itself,
and some could be addressed somewhere in Elixir. But since there is a
need for more sophisticated code analysis (ex. compatible filters,
doccomments), and leaving such a crucial part of Elixir to a Perl
one-liner seems quite hacky, a decision was made to implement proper
lexing, with a different lexer for each supported language.

Libraries considered:
* Pygments - Pygments lexers are good for code highlighting.
  It seems that as long as the token stream results in the expected
  identifiers being highlighted, it's good enough for Pygments. That is
  okay. Pygments lexers could be modified to provide a more reliable
  token stream, but the question is - does that help a typical Pygments
  user? Is it worth the maintainers' time? My assumption is that
  Pygments is not meant to be a general code analysis tool, but a code
  highlighter. It does that well, and extending it to a general lexer
  for all languages could be painful.
* PLY - It seems that it's mostly meant for educational purposes and
  isn't maintained anymore. It's not very ergonomic; for example, the
  interface requires each lexer to be in a different file.
* pycparser - good for C, but does not support macros.
* Other parsing libraries - I'm quite sure that at this stage, and for
  Elixir's purposes, a flat token stream is what we want, not a full
  AST. Partial parsing could be done on the token stream later. But if
  more complete analysis is necessary, then it's probably better to
  leave it to tools specific to that language (see ctags). Parser rules
  are also typically more complicated.

Goals:
* Good identifier support - ex. not all DTS identifiers are parsed
  correctly right now.
* Better comment support - Kconfig help texts and GNU assembler
  comments are not parsed well at all.
* A usable token stream that can be reassembled back into a file - some
  code analysis may require information about punctuation or comments,
  besides identifiers. It's also good to be sure that each character
  was considered, especially if code is meant to be modified.

Notes:

The lexers will never be perfect. Languages change, and file extensions
are confusing (.h can mean C, DTS or assembler). The main idea is to
increase the reliability of identifier reference search, but achieving
total correctness may require more work than it's worth. I picked an
approach that should be, I hope, easy to understand, maintain, and
allow sharing as much code as possible between different lexers.
---
 elixir/lexers/__init__.py |   0
 elixir/lexers/__main__.py |  13 +++
 elixir/lexers/lexers.py   |  38 +++++++++
 elixir/lexers/shared.py   |  47 +++++++++++
 elixir/lexers/utils.py    | 171 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 269 insertions(+)
 create mode 100644 elixir/lexers/__init__.py
 create mode 100644 elixir/lexers/__main__.py
 create mode 100644 elixir/lexers/lexers.py
 create mode 100644 elixir/lexers/shared.py
 create mode 100644 elixir/lexers/utils.py

diff --git a/elixir/lexers/__init__.py b/elixir/lexers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py
new file mode 100644
index 00000000..7948eb94
--- /dev/null
+++ b/elixir/lexers/__main__.py
@@ -0,0 +1,13 @@
+if __name__ == "__main__":
+    import sys
+    from . import lexers
+
+    if len(sys.argv) != 2:
+        print("usage:", sys.argv[0], "path/to/file")
+        exit(1)
+
+    with open(sys.argv[1]) as f:
+        lexer = lexers.CLexer(f.read())
+        for token in lexer.lex():
+            print(token.line, token.token_type.name, token.span, token.token.encode())
+
diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py
new file mode 100644
index 00000000..8f021912
--- /dev/null
+++ b/elixir/lexers/lexers.py
@@ -0,0 +1,38 @@
+import re
+
+from . import shared
+from .utils import TokenType, simple_lexer, FirstInLine
+
+# Lexers used to extract possible references from source files
+# Design inspired by Pygments lexers interface
+
+# https://en.cppreference.com/w/c/language
+# https://www.iso-9899.info/wiki/The_Standard
+class CLexer:
+    # NOTE: does not support unicode identifiers
+    c_identifier = r'[a-zA-Z_][a-zA-Z_0-9]*'
+
+    c_punctuation = r'[!#%&`()*+,./:;<=>?\[\]\\^_{|}~-]'
+
+    # NOTE: macros don't always contain C code, but detecting that in practice is hard
+    # without information about context (where the file is included from).
+    c_punctuation_extra = r'[$\\@]'
+
+    rules = [
+        (shared.whitespace, TokenType.WHITESPACE),
+        (shared.common_slash_comment, TokenType.COMMENT),
+        (shared.common_string_and_char, TokenType.STRING),
+        (shared.c_number, TokenType.NUMBER),
+        (c_identifier, TokenType.IDENTIFIER),
+        (FirstInLine(shared.c_preproc_ignore), TokenType.SPECIAL),
+        (c_punctuation, TokenType.PUNCTUATION),
+        (c_punctuation_extra, TokenType.PUNCTUATION),
+    ]
+
+    def __init__(self, code):
+        self.code = code
+
+    def lex(self, **kwargs):
+        return simple_lexer(self.rules, self.code, **kwargs)
+
+
diff --git a/elixir/lexers/shared.py b/elixir/lexers/shared.py
new file mode 100644
index 00000000..96625d30
--- /dev/null
+++ b/elixir/lexers/shared.py
@@ -0,0 +1,47 @@
+from .utils import regex_or, regex_concat
+
+# Regexes shared between lexers
+
+whitespace = r'\s+'
+
+# Building block for comments that start with a character and go until the end of the line
+singleline_comment_with_escapes_base = r'(\\\s*\n|[^\n])*\n'
+
+slash_star_multline_comment = r'/\*(.|\s)*?\*/'
+double_slash_singleline_comment = r'//' + singleline_comment_with_escapes_base
+common_slash_comment = regex_or(slash_star_multline_comment, double_slash_singleline_comment)
+
+common_decimal_integer = r'[0-9][0-9\']*'
+common_hexidecimal_integer = r'0[xX][0-9a-fA-F][0-9a-fA-F\']*'
+common_octal_integer = r'0[0-7][0-7\']*'
+common_binary_integer = r'0[bB][01][01\']*'
+
+c_preproc_include = r'#\s*include\s*(<.*?>|".*?")'
+# match warning and error directives with the error string
+c_preproc_warning_and_error = r'#\s*(warning|error)\s(\\\s*\n|[^\n])*\n'
+# match other preprocessor directives, but don't consume the whole line
+c_preproc_other = r'#\s*[a-z]+'
+c_preproc_ignore = regex_or(c_preproc_include, c_preproc_warning_and_error, c_preproc_other)
+
+# backslash plus any amount of whitespace and a newline, any character that's not a backslash, newline or quote, or any escaped character
+double_quote_string_with_escapes = r'"(\\\s*\n|[^\\"\n]|\\(.|\s))*?"'
+single_quote_string_with_escapes = r"'(\\\s*\n|[^\\'\n]|\\(.|\s))*?'"
+
+common_string_and_char = regex_or(double_quote_string_with_escapes, single_quote_string_with_escapes)
+
+c_exponent = r'([eE][+-]?[0-9][0-9\']*)'
+c_hexidecimal_exponent = r'([pP][+-]?[0-9][0-9\']*)'
+
+c_decimal_double_part = r'\.[0-9\']*' + c_exponent + '?'
+c_octal_double_part = r'\.[0-7\']*' + c_exponent + '?'
+c_hexidecimal_double_part = r'\.[0-9a-fA-F\']*' + c_hexidecimal_exponent + '?'
+
+c_decimal = f'{ common_decimal_integer }({ c_decimal_double_part })?'
+c_hexidecimal = f'{ common_hexidecimal_integer }({ c_hexidecimal_double_part })?'
+c_octal = f'{ common_octal_integer }({ c_octal_double_part })?'
+
+# not entirely correct... 
accepts way more than the standard allows +c_number_suffix = r'([uU]|[lL]|(wb|WB)|[fF]|[zZ]){0,5}' + +c_number = regex_concat(regex_or(c_hexidecimal, common_binary_integer, c_decimal, c_octal), c_number_suffix) + diff --git a/elixir/lexers/utils.py b/elixir/lexers/utils.py new file mode 100644 index 00000000..0290754b --- /dev/null +++ b/elixir/lexers/utils.py @@ -0,0 +1,171 @@ +import re +import enum +from collections import namedtuple + +# Supported token types +class TokenType(enum.Enum): + WHITESPACE = 'whitespace', + COMMENT = 'comment' + STRING = 'string' + NUMBER = 'number' + IDENTIFIER = 'identifier' + # may require extra parsing or context information + SPECIAL = 'special' + PUNCTUATION = 'punctuation' + # lexing failure - should be logged, at least until update jobs are preemptible + ERROR = 'error' + +Token = namedtuple('Token', 'token_type, token, span, line') + +def match_regex(regex): + rule = re.compile(regex, flags=re.MULTILINE) + return lambda code, pos, _: rule.match(code, pos) + +# Interface class that allows to match only if certian conditions, +# hard to express in regex, are true +class Matcher: + def update_after_match(self, code: str, pos: int, line: int, token: Token) -> None: + pass + + def match(self, code: str, pos: int, line: int) -> None | re.Match: + pass + +# Match token only if it's the first token in line (skipping whitespace) +class FirstInLine(Matcher): + whitespace = re.compile(r'\s*') + + def __init__(self, regex): + self.rule = re.compile(regex, flags=re.MULTILINE) + self.first_in_line = True + + def update_after_match(self, code, pos, line, token): + # first token is always first in line + if pos == 0: + self.first_in_line = True + return + + # check if matched token contains a newline + newline_pos = code.rfind('\n', token.span[0], token.span[1]) + + # if it doesn't contain a newline, check the part after newline + if newline_pos != -1: + post_newline_tok = code[newline_pos+1:token.span[1]] + + # if part after newline contains only whitespace (or nothing), the next token is first in line + if self.whitespace.fullmatch(post_newline_tok): + self.first_in_line = True + # if currently matched is the first in line, and only contains whitespace, + # the next token also counts as first in line + elif self.first_in_line and self.whitespace.fullmatch(code, token.span[0], token.span[1]): + self.first_in_line = True + # otherwise reset first in line marker + else: + self.first_in_line = False + + def match(self, code, pos, line): + if self.first_in_line: + return self.rule.match(code, pos) + +class LexerContext: + def self(self, code, pos, line, filter_tokens): + self.code = code + self.pos = pos + self.line = line + self.filter_tokens = filter_tokens + +def simple_lexer(rules, code, filter_tokens=None): + if len(code) == 0: + return + + # to avoid dealing with files without trailing newlines + if code[-1] != '\n': + code += '\n' + + rules_compiled = [] + after_match_hooks = [] + + # compile rules + for rule, action in rules: + # string rules are actually match regex rules + if type(rule) is str: + rules_compiled.append((match_regex(rule), action)) + # rules can also be callables + elif callable(rule): + rules_compiled.append((rule, action)) + # rules can also be matchers - matchers get more information during parsing, + # that information can stored in their state + elif isinstance(rule, Matcher): + rules_compiled.append((rule.match, action)) + after_match_hooks.append(rule.update_after_match) + + # helper function that calls hooks before yielding + def 
yield_token(to_yield): + for hook in after_match_hooks: + hook(code, pos, line, to_yield) + return to_yield + + pos = 0 + line = 1 + while pos < len(code): + rule_matched = False + for rule, action in rules_compiled: + match = rule(code, pos, line) + + if match is not None: + span = match.span() + # if match is empty - continue + if span[0] == span[1]: + continue + + rule_matched = True + + if isinstance(action, TokenType): + # only parse tokens of interest - slices apparently copy + if filter_tokens is None or action in filter_tokens: + token = code[span[0]:span[1]] + else: + token = None + + token_obj = Token(action, token, span, line) + yield yield_token(token_obj) + line += code.count('\n', span[0], span[1]) + pos = span[1] + break + elif callable(action): + last_token = None + for token in action(LexerContext(code, pos, line, filter_tokens), match): + last_token = token + yield yield_token(token) + + if last_token is not None: + pos = last_token.span[1] + line = last_token.line + last_token.token.count('\n') + + break + else: + raise Exception(f"invalid action {action}") + + # if no rules match, an error token with a single character is produced. + # this isn't always a big problem, hence it's the decision of the caller + # to decide whether to quit or continue + if not rule_matched: + token = Token(TokenType.ERROR, code[pos], (pos, pos+1), line) + yield yield_token(token) + if code[pos] == '\n': + line += 1 + pos += 1 + +# Combines regexes passed as arguments with pipe operator +def regex_or(*regexes): + result = '(' + for r in regexes: + result += f'({ r })|' + return result[:-1] + ')' + +# Concatenates regexes, putting each in a separate group +def regex_concat(*regexes): + result = '' + for r in regexes: + result += f'({ r })' + return result + From 174f29e50e23a07822f93463f165efaef2401c21 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Thu, 17 Oct 2024 22:06:16 +0200 Subject: [PATCH 03/11] lexers: Add C lexer tests This commit adds tests for the C lexer. The "architecture" may be controversial - test snippets are stored as strings. This has some drawbacks. * It's pretty ugly * Whitespace sensitive test cases may require extra care * Grepping may get even more annoying (it probably makes sense to skip all files starting with test_). An alternative would be to store each test case and result as a different file. I didn't go with that approach because of the following reasons: * It's harder to use the built-in Python testing framework * Test cases should be short, but it's annoying to navigate between many different small files * Making a readable test result format requres extra parsing work in Elixir. It's doable, but also annoying. 
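
For illustration, a test case under this architecture looks roughly
like the minimal sketch below (class and test names are illustrative;
the real cases live in test_c.py added by this commit):

    from ..lexers import CLexer
    from .base import LexerTest

    class ExampleCLexerTest(LexerTest):
        lexer_cls = CLexer

        def test_simple_declaration(self):
            # the snippet stays inline as a string, right next to the
            # expected [token type, token] pairs it should lex into
            self.lex(r"""
    int x = 1;
    """, [
                ['IDENTIFIER', 'int'],
                ['IDENTIFIER', 'x'],
            ])
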
--- elixir/lexers/tests/__init__.py | 0 elixir/lexers/tests/base.py | 65 ++++ elixir/lexers/tests/test_c.py | 567 ++++++++++++++++++++++++++++++++ 3 files changed, 632 insertions(+) create mode 100644 elixir/lexers/tests/__init__.py create mode 100644 elixir/lexers/tests/base.py create mode 100644 elixir/lexers/tests/test_c.py diff --git a/elixir/lexers/tests/__init__.py b/elixir/lexers/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/elixir/lexers/tests/base.py b/elixir/lexers/tests/base.py new file mode 100644 index 00000000..e234df33 --- /dev/null +++ b/elixir/lexers/tests/base.py @@ -0,0 +1,65 @@ +import unittest + +class LexerTest(unittest.TestCase): + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + # Checks if each token starts in the claimed position of code, if tokens cover all code and if no tokens overlap + def verify_positions(self, code, tokens): + last_token = None + for t in tokens: + if code[t.span[0]:t.span[1]] != t.token: + self.fail(f"token {t} span != code span {code[t.span[0]:t.span[1]].encode()}") + + if last_token is not None and last_token.span[1] != t.span[0]: + self.fail(f"token does not start where the previous token ends. prev: {last_token}, next: {t}") + elif last_token is None and t.span[0] != 0: + self.fail(f"first token does not start at zero: {t}") + + last_token = t + + if last_token.span[1] != len(code): + self.fail(f"code is longer than position of the last token: {t}, code len: {len(code)}") + + # Checks if each token is in the claimed line of code + def verify_lines(self, code, tokens): + lines = [""] + code.split("\n") # zero line is emtpy + last_line_number = None + last_line_contents_left = None + for t in tokens: + if last_line_number != t.line: + last_line_number = t.line + last_line_contents_left = lines[t.line] + + if last_line_contents_left is None: + self.fail(f"nothing left in line {t.line} for {t.token} {t}") + + newline_count = t.token.count("\n") + all_token_lines = last_line_contents_left + "\n" + \ + "\n".join([lines[i] for i in range(t.line+1, t.line+newline_count+1)]) + "\n" + token_pos_in_lines = all_token_lines.find(t.token) + if token_pos_in_lines == -1: + self.fail(f"token {t.token} not found in line {t.line}: {all_token_lines.encode()}") + if token_pos_in_lines < len(last_line_contents_left): + last_line_contents_left = last_line_contents_left[token_pos_in_lines:] + else: + last_line_contents_left = None + + # Lex code, do basic soundness checks on tokens (lines and positions) and compare lexing results with a list of tokens + def lex(self, code, expected, filtered_tokens=None, lexer_options={}): + if filtered_tokens is None: + filtered_tokens = self.default_filtered_tokens + + code = code.lstrip() + tokens = list(self.lexer_cls(code, **lexer_options).lex()) + self.verify_positions(code, tokens) + self.verify_lines(code, tokens) + + tokens = [[type.name, token] for type, token, span, line in tokens] + tokens = [t for t in tokens if t[0] in filtered_tokens] + try: + self.assertEqual(tokens, expected) + except Exception as e: + print() + for t in tokens: print(t, end=",\n") + raise e + diff --git a/elixir/lexers/tests/test_c.py b/elixir/lexers/tests/test_c.py new file mode 100644 index 00000000..ffd48cee --- /dev/null +++ b/elixir/lexers/tests/test_c.py @@ -0,0 +1,567 @@ +from ..lexers import CLexer +from .base import LexerTest + +class CLexerTest(LexerTest): + lexer_cls = CLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", 
"SPECIAL", "ERROR") + + def test_if0(self): + self.lex(r""" +#if 0 +static bool test_v3_0_test(void *h, + enum type_enum e) { + return false; +} +#endif +static bool test_v3_0_test(void *h, + enum type_enum e) { + return false; +} +""", [ + ['SPECIAL', '#if'], + ['NUMBER', '0'], + ['IDENTIFIER', 'static'], + ['IDENTIFIER', 'bool'], + ['IDENTIFIER', 'test_v3_0_test'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'h'], + ['IDENTIFIER', 'enum'], + ['IDENTIFIER', 'type_enum'], + ['IDENTIFIER', 'e'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'false'], + ['SPECIAL', '#endif'], + ['IDENTIFIER', 'static'], + ['IDENTIFIER', 'bool'], + ['IDENTIFIER', 'test_v3_0_test'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'h'], + ['IDENTIFIER', 'enum'], + ['IDENTIFIER', 'type_enum'], + ['IDENTIFIER', 'e'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'false'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_preproc(self): + self.lex(r""" +#include +# include +# include "test.h" +# include "test.h" + +# warning war +# error err + # error err + #warning war + +#error "escaped\ + message" + +#warning "escaped\ + message" + +# if defined(TEST) +# elif defined(TEST2) +#else +""", [ + ['SPECIAL', '#include '], + ['SPECIAL', '# include '], + ['SPECIAL', '# include "test.h"'], + ['SPECIAL', '# include "test.h"'], + ['SPECIAL', '# warning war\n'], + ['SPECIAL', '# error err\n'], + ['SPECIAL', '# error err\n'], + ['SPECIAL', '#warning war\n'], + ['SPECIAL', '#error "escaped\\\n message"\n'], + ['SPECIAL', '#warning "escaped\\ \n message"\n'], + ['SPECIAL', '# if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', '# elif'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', '#else'], + ]) + + def test_defines(self): + self.lex(""" +# define test "long string \ + escaped newline" + + #define test define1 +# define test2 define12323 + +#define func(name, arg1,arg2...) 
\ + void name##f() { \ + return arg1 + arg2; + } +""", [ + ['SPECIAL', '# define'], + ['IDENTIFIER', 'test'], + ['STRING', '"long string escaped newline"'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'define1'], + ['SPECIAL', '# define'], + ['IDENTIFIER', 'test2'], + ['IDENTIFIER', 'define12323'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'func'], + ['IDENTIFIER', 'name'], + ['IDENTIFIER', 'arg1'], + ['IDENTIFIER', 'arg2'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'name'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'arg1'], + ['IDENTIFIER', 'arg2'], + ]) + + def test_strings(self): + self.lex(r""" +"asdsad \ + asdasd"; +'asdsad \ + asdasd'; +u8"test string"; +u"test string"; +u"test string"; +L"test string"; +"test \" string"; +"test ' string"; +"test \' string"; +"test \n string"; +"\xff"; +"test" "string"; +"test""string"; +"test" +""", [ + ['STRING', '"asdsad \\ \n asdasd"'], + ['STRING', "'asdsad \\\n asdasd'"], + ['IDENTIFIER', 'u8'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'u'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'u'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'L'], + ['STRING', '"test string"'], + ['STRING', '"test \\" string"'], + ['STRING', '"test \' string"'], + ['STRING', '"test \\\' string"'], + ['STRING', '"test \\n string"'], + ['STRING', '"\\xff"'], + ['STRING', '"test"'], + ['STRING', '"string"'], + ['STRING', '"test"'], + ['STRING', '"string"'], + ['STRING', '"test"'], + ]) + + def test_strings2(self): + self.lex(r""" + "string"; + char* s1 = "asdjlsajdlksad""asdsajdlsad"; //comment6 + char* s2 = "asdjlsajdlksad" "asdsajdlsad"; // \ + single line comment \ + with escapes + char* s3 = " asdsaldjkas \""; + char* s4 = " asdsaldjkas \" zxclzxclk \" asljda"; + char* s5 = " asdsaldjkas \' zxclzxclk \" asljda"; + char* s6 = " asdsaldjkas \"\"\" zxclzxclk \'\'\' ; asljda"; + char* s7 = u8"test"; +""", [ + ['STRING', '"string"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's1'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '//comment6\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's2'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '// \\\n single line comment \\\n with escapes\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's3'], + ['STRING', '" asdsaldjkas \\""'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's4'], + ['STRING', '" asdsaldjkas \\" zxclzxclk \\" asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's5'], + ['STRING', '" asdsaldjkas \\\' zxclzxclk \\" asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's6'], + ['STRING', '" asdsaldjkas \\"\\"\\" zxclzxclk \\\'\\\'\\\' ; asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's7'], + ['IDENTIFIER', 'u8'], + ['STRING', '"test"'], + ]) + + def test_chars(self): + self.lex(r""" +'a'; +u8'a'; +u'a'; +U'a'; +'\''; +'\"'; +'\\'; +'\n'; +'\f'; +'\U0001f34c'; +'\13'; +'\x1234'; +'\u213'; +u'ą'; +""", [ + ['STRING', "'a'"], + ['IDENTIFIER', 'u8'], + ['STRING', "'a'"], + ['IDENTIFIER', 'u'], + ['STRING', "'a'"], + ['IDENTIFIER', 'U'], + ['STRING', "'a'"], + ['STRING', "'\\''"], + ['STRING', '\'\\"\''], + ['STRING', "'\\\\'"], + ['STRING', "'\\n'"], + ['STRING', "'\\f'"], + ['STRING', "'\\U0001f34c'"], + ['STRING', "'\\13'"], + ['STRING', "'\\x1234'"], + ['STRING', "'\\u213'"], + ['IDENTIFIER', 'u'], + ['STRING', "'ą'"], + ]) + + def test_numbers(self): + self.lex(r""" +1239183; +-1239183; +0xAB08902; +-0xAB08902; +0Xab08902; +-0Xab08902; +0b0101001; +-0b0101001; +0B0101001; 
+-0B0101001; +0231273; +-0231273; +""", [ + ['NUMBER', '1239183'], + ['NUMBER', '1239183'], + ['NUMBER', '0xAB08902'], + ['NUMBER', '0xAB08902'], + ['NUMBER', '0Xab08902'], + ['NUMBER', '0Xab08902'], + ['NUMBER', '0b0101001'], + ['NUMBER', '0b0101001'], + ['NUMBER', '0B0101001'], + ['NUMBER', '0B0101001'], + ['NUMBER', '0231273'], + ['NUMBER', '0231273'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_floats(self): + self.lex(r""" +double e = 0x2ABDEFabcdef; +double + f = 017.048509495; +double -g = 0b1010010; +double g = 0b1010010; +-017.048509495; +017.048509495; +-017.048509495e-12329123; +017.048509495e-12329123; +-0x123.fp34; +0x123.fp34; +-0x123.fP34; +0x123.fP34; +-0x123.fe1p123; +0x123.fe1p123; +-0x123.fe1p123; +0x123.fe1p123; +-.1; +.1; +-1.; +1.; +-0x1.ep+3; +0x1.ep+3; +-0X183083; +0X183083; +-0x213213.1231212'31e21p-2; +0x213213.1231212'31e21p-2; +-123123.123e2; +123123.123e2; +""", [ + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'e'], + ['NUMBER', '0x2ABDEFabcdef'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'f'], + ['NUMBER', '017.048509495'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'g'], + ['NUMBER', '0b1010010'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'g'], + ['NUMBER', '0b1010010'], + ['NUMBER', '017.048509495'], + ['NUMBER', '017.048509495'], + ['NUMBER', '017.048509495e-12329123'], + ['NUMBER', '017.048509495e-12329123'], + ['NUMBER', '0x123.fp34'], + ['NUMBER', '0x123.fp34'], + ['NUMBER', '0x123.fP34'], + ['NUMBER', '0x123.fP34'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '1'], + ['NUMBER', '1'], + ['NUMBER', '1.'], + ['NUMBER', '1.'], + ['NUMBER', '0x1.ep+3'], + ['NUMBER', '0x1.ep+3'], + ['NUMBER', '0X183083'], + ['NUMBER', '0X183083'], + ['NUMBER', "0x213213.1231212'31e21p-2"], + ['NUMBER', "0x213213.1231212'31e21p-2"], + ['NUMBER', '123123.123e2'], + ['NUMBER', '123123.123e2'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_longs(self): + self.lex(r""" +-123213092183ul; +123213092183ul; +-123213092183ull; +123213092183ull; +-123213092183llu; +123213092183llu; +-123213092183uLL; +123213092183uLL; +-123213092183LLU; +123213092183LLU; +-1232'13092183LLU; +1232'13092183LLU; +-1232'1309'2183LLU; +1232'1309'2183LLU; +-1232'1309'218'3LLU; +1232'1309'218'3LLU; +""", [ + ['NUMBER', '123213092183ul'], + ['NUMBER', '123213092183ul'], + ['NUMBER', '123213092183ull'], + ['NUMBER', '123213092183ull'], + ['NUMBER', '123213092183llu'], + ['NUMBER', '123213092183llu'], + ['NUMBER', '123213092183uLL'], + ['NUMBER', '123213092183uLL'], + ['NUMBER', '123213092183LLU'], + ['NUMBER', '123213092183LLU'], + ['NUMBER', "1232'13092183LLU"], + ['NUMBER', "1232'13092183LLU"], + ['NUMBER', "1232'1309'2183LLU"], + ['NUMBER', "1232'1309'2183LLU"], + ['NUMBER', "1232'1309'218'3LLU"], + ['NUMBER', "1232'1309'218'3LLU"], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_comments(self): + self.lex(r""" + /*comment1*/ + /* comment2*/ + /* comment3 */ + /* + * + comment4 + _+}{|":?><~!@#$%&*()_+`123567890-=[];'\,./ + * */ + + /* comment 5 \*\// */ + +// comment5 +char* s2 = "asdjlsajdlksad" "asdsajdlsad"; // \ + single line comment \ + with escapes +char statement; +""", [ + ['COMMENT', '/*comment1*/'], + ['COMMENT', '/* comment2*/'], + ['COMMENT', '/* comment3 */'], + ['COMMENT', '/*\n *\n comment4\n _+}{|":?><~!@#$%&*()_+`123567890-=[];\'\\,./\n * */'], + ['COMMENT', '/* comment 5 \\*\\// */'], + ['COMMENT', '// comment5\n'], + ['IDENTIFIER', 'char'], + 
['IDENTIFIER', 's2'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '// \\\n single line comment \\\n with escapes\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 'statement'], + ]) + + # https://en.cppreference.com/w/cpp/language/pack_indexing + def test_cpp_templates(self): + self.lex(r""" +template +constexpr auto f(Ts&&... ts) { + return sizeof...(Ts); +} + +template +int f() { + std::cout << t << std::endl; + ns1::ns2::type v; + ns1::ns2::type2 v2; + ns1::ns2::type3 v3; +} +""", [ + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'constexpr'], + ['IDENTIFIER', 'auto'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'ts'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'sizeof'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'cout'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'endl'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'v'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type2'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'v2'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type3'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'v3'], + ]) + + # https://en.cppreference.com/w/cpp/language/requires + def test_cpp_concepts(self): + self.lex(r""" +template +concept C = requires(T x) { + {x.count()} -> std::same_as; + requires Same +}; +""", [ + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'concept'], + ['IDENTIFIER', 'C'], + ['IDENTIFIER', 'requires'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'count'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'same_as'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'requires'], + ['IDENTIFIER', 'Same'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'decltype'], + ['IDENTIFIER', 'x'], + ]) + + def test_cpp_class(self): + self.lex(r""" +using namespace std; + +auto f() -> std::string; + +class test { +public: + int operator ""_tx(int); + int a = 123_tx; +}; +""", [ + ['IDENTIFIER', 'using'], + ['IDENTIFIER', 'namespace'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'auto'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'string'], + ['IDENTIFIER', 'class'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'public'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'operator'], + ['STRING', '""'], + ['IDENTIFIER', '_tx'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'a'], + ['IDENTIFIER', '_tx'], + ]) + + def test_cpp_attrs(self): + self.lex(r""" +[[using test: atr1]] [[atr2]] +int f[[atr3]](); +""", [ + ['IDENTIFIER', 'using'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'atr1'], + ['IDENTIFIER', 'atr2'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'atr3'], + ]) + + # https://en.cppreference.com/w/cpp/language/noexcept_spec + def test_cpp_noexpect(self): + self.lex(r""" +void f() noexpect(true) {} +""", [ + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'noexpect'], + ['IDENTIFIER', 'true'], + ]) + + # https://en.cppreference.com/w/cpp/language/coroutines + def test_cpp_coroutines(self): + self.lex(r""" +task<> test() { + co_await test2(); +} +""", [ + 
['IDENTIFIER', 'task'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'co_await'], + ['IDENTIFIER', 'test2'], + ]) + From 2289c4519e5278cac9a88b5ea2342e00f58f02ec Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Fri, 25 Oct 2024 13:10:15 +0200 Subject: [PATCH 04/11] lexers: Add a DTS lexer --- elixir/lexers/__main__.py | 26 +++++++-- elixir/lexers/lexers.py | 110 +++++++++++++++++++++++++++++++++++++- elixir/lexers/utils.py | 30 ++++++++++- 3 files changed, 159 insertions(+), 7 deletions(-) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index 7948eb94..7b9f7a6f 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -2,12 +2,28 @@ import sys from . import lexers - if len(sys.argv) != 2: - print("usage:", sys.argv[0], "path/to/file") + if not (len(sys.argv) == 2 or (len(sys.argv) == 3 and sys.argv[1] == '-s')): + print("usage:", sys.argv[0], "[-s]", "path/to/file") exit(1) - with open(sys.argv[1]) as f: - lexer = lexers.CLexer(f.read()) + short = sys.argv[1] == '-s' + + filename = sys.argv[-1] + + with open(filename) as f: + if filename.endswith(('.c', '.h', '.cpp', '.hpp')): + lexer = lexers.CLexer(f.read()) + elif filename.endswith(('.dts', '.dtsi')): + lexer = lexers.DTSLexer(f.read()) + else: + raise Exception("no lexer for filetype") + for token in lexer.lex(): - print(token.line, token.token_type.name, token.span, token.token.encode()) + if not short: + print(token.line, token.token_type.name, token.span, token.token.encode()) + else: + if token.token_type.name == 'IDENTIFIER' or token.token_type.name == 'STRING': + print(f"|{token.token}|", end='') + else: + print(token.token, end='') diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index 8f021912..b3fb583b 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -1,7 +1,7 @@ import re from . 
import shared -from .utils import TokenType, simple_lexer, FirstInLine +from .utils import TokenType, simple_lexer, FirstInLine, split_by_groups, regex_concat, token_from_string # Lexers used to extract possible references from source files # Design inspired by Pygments lexers interface @@ -36,3 +36,111 @@ def lex(self, **kwargs): return simple_lexer(self.rules, self.code, **kwargs) +# https://www.devicetree.org/specifications/ +class DTSLexer: + # TODO handle macros separately + + # NOTE: previous versions would split identifiers by commas (and other special characters), + # this changes the old behavior + + # 6.2 + # technically shall be 1-31 characters long BUT /linux/v6.9.4/source/arch/arm64/boot/dts/qcom/sm8250.dtsi#L3506 + dts_label = r'[a-zA-Z_][a-zA-Z_0-9]*' + # no whitespace between label and ampersand/colon is allowed + dts_label_reference = f'(&)({ dts_label })' + dts_label_definition = f'({ dts_label })(:)' + + # 2.2.1 + # same with label lenght, just in case + dts_node_name = r'[a-zA-Z0-9,._+-]+' + # can contain macro symbols + dts_unit_address = r'[a-zA-Z0-9,._+-]*' + + dts_node_name_with_unit_address = f'({ dts_node_name })(@)({ dts_unit_address })' + r'(\s*)({)' + dts_node_name_without_unit_address = f'({ dts_node_name })' + r'(\s*)({)' + + # 2.2.4 + dts_property_name = r'[0-9a-zA-Z,._+?#-]+' + dts_property_assignment = f'({ dts_property_name })' + r'(\s*)(=)' + dts_property_empty = f'({ dts_property_name })' + r'(\s*)(;)' + + dts_directive = r'/[a-zA-Z0-9-]+/'; + dts_delete_node = regex_concat(r'/delete-node/\s+', dts_node_name) + dts_delete_property = regex_concat(r'/delete-property/\s+', dts_property_name) + + # 6.3 + dts_node_reference = r'(&)({)([a-zA-Z0-9,._+/@-]+?)(})' + + dts_punctuation = r'[#@:;{}\[\]()^<>=+*/%&\\|~!?,-]' + # other, unknown, identifiers - for exmple macros + dts_default_identifier = r'[0-9a-zA-Z_]+' + + # Parse DTS node reference, ex: &{/path/to/node@20/test} + @staticmethod + def parse_dts_node_reference(ctx, match): + # & + token, ctx = token_from_string(ctx, match.group(1), TokenType.PUNCTUATION) + yield token + + # { + token, ctx = token_from_string(ctx, match.group(2), TokenType.PUNCTUATION) + yield token + + path = match.group(3) + path_part_matcher = re.compile(DTSLexer.dts_unit_address) + strpos = 0 + + while strpos < len(path): + if path[strpos] == '@' or path[strpos] == '/': + token, ctx = token_from_string(ctx, path[strpos], TokenType.PUNCTUATION) + yield token + strpos += 1 + else: + part_match = path_part_matcher.match(path, strpos) + if part_match is None: + token, _ = token_from_string(ctx, TokenType.ERROR, '') + yield token + return None + + token, ctx = token_from_string(ctx, part_match.group(0), TokenType.IDENTIFIER) + yield token + strpos += len(part_match.group(0)) + # } + token, ctx = token_from_string(ctx, match.group(4), TokenType.PUNCTUATION) + yield token + + rules = [ + (shared.whitespace, TokenType.WHITESPACE), + (shared.common_slash_comment, TokenType.COMMENT), + (shared.common_string_and_char, TokenType.STRING), + (shared.c_number, TokenType.NUMBER), + + (dts_label_reference, split_by_groups(TokenType.PUNCTUATION, TokenType.IDENTIFIER)), + (dts_label_definition, split_by_groups(TokenType.IDENTIFIER, TokenType.PUNCTUATION)), + (dts_node_reference, parse_dts_node_reference), + + (dts_property_assignment, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + (dts_property_empty, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + + 
(dts_node_name_with_unit_address, + split_by_groups(TokenType.IDENTIFIER, TokenType.PUNCTUATION, + TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + (dts_node_name_without_unit_address, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + + (dts_directive, TokenType.SPECIAL), + (dts_delete_node, split_by_groups(TokenType.SPECIAL, TokenType.IDENTIFIER)), + (dts_delete_property, split_by_groups(TokenType.SPECIAL, TokenType.IDENTIFIER)), + (dts_default_identifier, TokenType.IDENTIFIER), + (FirstInLine(shared.c_preproc_ignore), TokenType.SPECIAL), + (dts_punctuation, TokenType.PUNCTUATION), + ] + + def __init__(self, code): + self.code = code + + def lex(self, **kwargs): + return simple_lexer(self.rules, self.code, **kwargs) + diff --git a/elixir/lexers/utils.py b/elixir/lexers/utils.py index 0290754b..269af1bc 100644 --- a/elixir/lexers/utils.py +++ b/elixir/lexers/utils.py @@ -21,6 +21,34 @@ def match_regex(regex): rule = re.compile(regex, flags=re.MULTILINE) return lambda code, pos, _: rule.match(code, pos) +def split_by_groups(*token_types): + def split(ctx, match): + pos = ctx.pos + line = ctx.line + for gi in range(len(match.groups())): + token = match.group(gi+1) + if len(token) != 0: + action = token_types[gi] + yield Token(action, token, (pos, pos+len(token)), line) + line += token.count("\n") + pos += len(token) + + return split + +def token_from_match(ctx, match, token_type): + span = match.span() + result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line) + ctx.pos = span[1] + ctx.line = ctx.line+result.token.count('\n') + return result, ctx + +def token_from_string(ctx, match, token_type): + span = (ctx.pos, ctx.pos+len(match)) + result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line) + ctx.pos = span[1] + ctx.line = ctx.line+result.token.count('\n') + return result, ctx + # Interface class that allows to match only if certian conditions, # hard to express in regex, are true class Matcher: @@ -67,7 +95,7 @@ def match(self, code, pos, line): return self.rule.match(code, pos) class LexerContext: - def self(self, code, pos, line, filter_tokens): + def __init__(self, code, pos, line, filter_tokens): self.code = code self.pos = pos self.line = line From e6767b7e60fba6a9f826a66340bb0070d39e17a9 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Fri, 25 Oct 2024 13:11:39 +0200 Subject: [PATCH 05/11] lexers: Add DTS lexer tests --- elixir/lexers/tests/test_dts.py | 271 ++++++++++++++++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 elixir/lexers/tests/test_dts.py diff --git a/elixir/lexers/tests/test_dts.py b/elixir/lexers/tests/test_dts.py new file mode 100644 index 00000000..72f39d7f --- /dev/null +++ b/elixir/lexers/tests/test_dts.py @@ -0,0 +1,271 @@ +from ..lexers import DTSLexer +from .base import LexerTest + +class DTSLexerTests(LexerTest): + lexer_cls = DTSLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + def test_preproc(self): + self.lex(r""" +#include +#include "file2.dtsi" +#error error message asldjlksajdlksad +#warning warning message alsjdlkasjdlksajd +#define MACRO(arg) \ + arg = <3>; +#if 0 +/ { + property = <2>; + MACRO(test) +}; +#endif +""", [ + ['SPECIAL', '#include '], + ['SPECIAL', '#include "file2.dtsi"'], + ['SPECIAL', '#error error message asldjlksajdlksad\n'], + ['SPECIAL', '#warning warning message alsjdlkasjdlksajd\n'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'MACRO'], + ['IDENTIFIER', 'arg'], 
+ ['IDENTIFIER', 'arg'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'property'], + ['IDENTIFIER', 'MACRO'], + ['IDENTIFIER', 'test'], + ['SPECIAL', '#endif'], + ]) + + def test_dts_directives(self): + self.lex(r""" +/include/ "file.dtsi" +/dts-v1/; +/memreserve/ 0x100 0x2; +/ { + test_label: test-node { + test-prop2 = <3>; + }; + test-prop = <2>; + /delete-node/ test-node; + /delete-node/ &test_label; + /delete-property/ test-prop; +}; +""", [ + ['SPECIAL', '/include/'], + ['STRING', '"file.dtsi"'], + ['SPECIAL', '/dts-v1/'], + ['SPECIAL', '/memreserve/'], + ['IDENTIFIER', 'test_label'], + ['IDENTIFIER', 'test-node'], + ['IDENTIFIER', 'test-prop2'], + ['IDENTIFIER', 'test-prop'], + ['SPECIAL', '/delete-node/'], + ['IDENTIFIER', 'test-node'], + ['SPECIAL', '/delete-node/'], + ['IDENTIFIER', 'test_label'], + ['SPECIAL', '/delete-property/'], + ['IDENTIFIER', 'test-prop'], + ]) + + def test_dts_unusual_identifiers(self): + self.lex(r""" +/ { + _test_label: 5id,test._+asd-2 { + property,name = <2>; + 0p,r.o_p+e?r#t-y,name = [1,2,3]; + way_too_long_label_123219380921830218309218309213 : node@234 { + compatible = "asd,zxc"; + } + test = <&way_too_long_label_123219380921830218309218309213>; + }; +}; +""", [ + ['IDENTIFIER', '_test_label'], + ['IDENTIFIER', 'id,test._+asd-2'], + ['IDENTIFIER', 'property,name'], + ['IDENTIFIER', 'p,r.o_p+e?r#t-y,name'], + ['IDENTIFIER', 'way_too_long_label_123219380921830218309218309213'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', '234'], + ['IDENTIFIER', 'compatible'], + ['STRING', '"asd,zxc"'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'way_too_long_label_123219380921830218309218309213'], + ]) + + def test_non_numeric_unit_address(self): + self.lex(r""" +/ { + test: node@test_address { + }; + test2: node@MACRO_ADDRESS(123) { + }; +}; +""", [ + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'test_address'], + ['IDENTIFIER', 'test2'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'MACRO_ADDRESS'], + ]) + + def test_values_with_labels(self): + self.lex(r""" +/ { + prop1 = label1: <0 label2: 0x21323>; + prop2 = [1 2 3 label3: 4]; + prop3 = label4: "val" label5: ; +}; +""", [ + ['PUNCTUATION', '/'], + ['PUNCTUATION', '{'], + ['IDENTIFIER', 'prop1'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'label1'], + ['PUNCTUATION', ':'], + ['PUNCTUATION', '<'], + ['NUMBER', '0'], + ['IDENTIFIER', 'label2'], + ['PUNCTUATION', ':'], + ['NUMBER', '0x21323'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop2'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '['], + ['NUMBER', '1'], + ['NUMBER', '2'], + ['NUMBER', '3'], + ['IDENTIFIER', 'label3'], + ['PUNCTUATION', ':'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop3'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'label4'], + ['PUNCTUATION', ':'], + ['STRING', '"val"'], + ['IDENTIFIER', 'label5'], + ['PUNCTUATION', ':'], + ['PUNCTUATION', ';'], + ['PUNCTUATION', '}'], + ['PUNCTUATION', ';'], + ], self.default_filtered_tokens + ('PUNCTUATION', 'NUMBER')) + + def test_references(self): + self.lex(r""" +/ { + interrupt-parent = < &{/node@c2342/another_node@address(2)/node3} >; + property2 = <&{/node@c2342/another_node@address(2)}>; + power-domains = <&power DEVICE_DOMAIN>; +}; +""", [ + ['IDENTIFIER', 'interrupt-parent'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'c2342'], + ['IDENTIFIER', 'another_node'], + ['IDENTIFIER', 'address'], + ['IDENTIFIER', 'node3'], + ['IDENTIFIER', 'property2'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'c2342'], + ['IDENTIFIER', 
'another_node'], + ['IDENTIFIER', 'address'], + ['IDENTIFIER', 'power-domains'], + ['IDENTIFIER', 'power'], + ['IDENTIFIER', 'DEVICE_DOMAIN'], + ]) + + def test_property_types(self): + self.lex(r""" +/ { + prop1 = <0 0x21323>; + prop2 = [1 2 3 4]; + prop3 = "val", "val4" ; + prop4 = <~1+2-3*4/5%6&7|8^9<<10>>11>; + prop5; +}; +""", [ + ['PUNCTUATION', '/'], + ['PUNCTUATION', '{'], + ['IDENTIFIER', 'prop1'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '<'], + ['NUMBER', '0'], + ['NUMBER', '0x21323'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop2'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '['], + ['NUMBER', '1'], + ['NUMBER', '2'], + ['NUMBER', '3'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop3'], + ['PUNCTUATION', '='], + ['STRING', '"val"'], + ['PUNCTUATION', ','], + ['STRING', '"val4"'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop4'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '~'], + ['NUMBER', '1'], + ['PUNCTUATION', '+'], + ['NUMBER', '2'], + ['PUNCTUATION', '-'], + ['NUMBER', '3'], + ['PUNCTUATION', '*'], + ['NUMBER', '4'], + ['PUNCTUATION', '/'], + ['NUMBER', '5'], + ['PUNCTUATION', '%'], + ['NUMBER', '6'], + ['PUNCTUATION', '&'], + ['NUMBER', '7'], + ['PUNCTUATION', '|'], + ['NUMBER', '8'], + ['PUNCTUATION', '^'], + ['NUMBER', '9'], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '<'], + ['NUMBER', '10'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', '>'], + ['NUMBER', '11'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop5'], + ['PUNCTUATION', ';'], + ['PUNCTUATION', '}'], + ['PUNCTUATION', ';'], + ], self.default_filtered_tokens + ('PUNCTUATION', 'NUMBER')) + + def test_comments(self): + self.lex(r""" +//license info +/ { + interrupts = , /* comment 1 */ + ; // comemnt2 + /* long + * coment + * asdasd + */ +}; +""", [ + ['COMMENT', '//license info\n'], + ['IDENTIFIER', 'interrupts'], + ['IDENTIFIER', 'NAME'], + ['IDENTIFIER', 'TYPE'], + ['COMMENT', '/* comment 1 */'], + ['IDENTIFIER', 'NAME'], + ['IDENTIFIER', 'TYPE'], + ['COMMENT', '// comemnt2\n'], + ['COMMENT', '/* long\n * coment\n * asdasd\n */'], + ], self.default_filtered_tokens) + From ad1e0e2bd452e33bb0e9195a6ee15fdf19a7f58b Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:03:03 +0100 Subject: [PATCH 06/11] lexers: Add a Kconfig lexer --- elixir/lexers/__main__.py | 2 + elixir/lexers/lexers.py | 113 +++++++++++++++++++++++++++++++++++++- elixir/lexers/utils.py | 11 ++++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index 7b9f7a6f..dab00d14 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -15,6 +15,8 @@ lexer = lexers.CLexer(f.read()) elif filename.endswith(('.dts', '.dtsi')): lexer = lexers.DTSLexer(f.read()) + elif filename.endswith('Kconfig'): + lexer = lexers.KconfigLexer(f.read()) else: raise Exception("no lexer for filetype") diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index b3fb583b..06134a53 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -1,7 +1,8 @@ import re from . 
import shared -from .utils import TokenType, simple_lexer, FirstInLine, split_by_groups, regex_concat, token_from_string +from .utils import TokenType, simple_lexer, FirstInLine, split_by_groups, regex_concat, token_from_string, token_from_match, \ + regex_or, match_token, Token # Lexers used to extract possible references from source files # Design inspired by Pygments lexers interface @@ -144,3 +145,113 @@ def __init__(self, code): def lex(self, **kwargs): return simple_lexer(self.rules, self.code, **kwargs) + +# https://www.kernel.org/doc/html/next/kbuild/kconfig-language.html#kconfig-syntax +# https://www.kernel.org/doc/html/next/kbuild/kconfig-language.html#kconfig-hints + +# TODO better macros calls support + +class KconfigLexer: + hash_comment = r'#' + shared.singleline_comment_with_escapes_base + + # NOTE pretty much all kconfig identifiers either start uppercase or with a number. this saves us from parsing macro calls + kconfig_identifier_starts_with_letters = r'[A-Z_][A-Z0-9a-z_-]*' + kconfig_identifier_starts_with_digits = r'[0-9]+[A-Z_a-z-][A-Z0-9a-z_-]*' + kconfig_identifier = regex_or(kconfig_identifier_starts_with_letters, kconfig_identifier_starts_with_digits) + # other perhaps interesting identifiers + kconfig_minor_identifier = r'[a-zA-Z0-9_/][a-zA-Z0-9_/.-]*' + kconfig_punctuation = r'[|&!=$()/_.+<>,-]' + kconfig_number = f'[0-9]+' # TODO does not handle hex numbers + + # NOTE no identifiers are parsed out of KConfig help texts now, this changes the + # old behavior + # for example see all instances of USB in /u-boot/v2024.07/source/drivers/usb/Kconfig#L3 + + @staticmethod + def count_kconfig_help_whitespace(start_whitespace_str): + tabs = start_whitespace_str.count('\t') + spaces = start_whitespace_str.count(' ') + return 8*tabs + spaces + (len(start_whitespace_str)-tabs-spaces) + + @staticmethod + def parse_kconfig_help_text(ctx, match): + # assumes called with matched help keyword, return the keyword + token, ctx = token_from_match(ctx, match, TokenType.SPECIAL) + yield token + + # match whitespace after help + whitespace_after_help, ctx = match_token(ctx, r'\s*?\n', TokenType.WHITESPACE) + if whitespace_after_help is None: + # failed to match whitespace and newline after kconfig help - perhaps it's not the right context (macro call for exapmle) + return + else: + yield whitespace_after_help + + line_matcher = re.compile(r'[^\n]*\n', flags=re.MULTILINE|re.UNICODE) + + start_help_text_pos = ctx.pos + current_pos = ctx.pos + min_whitespace = None + + def collect_tokens(start, end): + return Token(TokenType.COMMENT, ctx.code[start:end], (start, end), ctx.line) + + # match first line with whitespace at the beginning + while current_pos < len(ctx.code): + line = line_matcher.match(ctx.code, current_pos) + if line is None: + yield collect_tokens(start_help_text_pos, current_pos) + return + + token = line.group(0) + span = line.span() + + if token == '\n': + # just an empty line + current_pos = span[1] + continue + else: + start_whitespace = re.match(r'\s*', token) + if start_whitespace is None: + # no whitespace at the beginning of the line + yield collect_tokens(start_help_text_pos, current_pos) + return + elif min_whitespace is None: + # first nonemtpy line - save amount of whitespace + min_whitespace = KconfigLexer.count_kconfig_help_whitespace(start_whitespace.group(0)) + current_pos = span[1] + else: + cur_whitespace = KconfigLexer.count_kconfig_help_whitespace(start_whitespace.group(0)) + if cur_whitespace < min_whitespace: + yield collect_tokens(start_help_text_pos, 
current_pos) + return + else: + current_pos = span[1] + + yield collect_tokens(start_help_text_pos, current_pos) + + rules = [ + (shared.whitespace, TokenType.WHITESPACE), + (hash_comment, TokenType.COMMENT), + (shared.common_string_and_char, TokenType.STRING), + # for whatever reason u-boot kconfigs sometimes use ---help--- instead of help + # /u-boot/v2024.07/source/arch/arm/mach-sunxi/Kconfig#L732 + (FirstInLine('-+help-+'), parse_kconfig_help_text), + (kconfig_punctuation, TokenType.PUNCTUATION), + (FirstInLine('help'), parse_kconfig_help_text), + (kconfig_identifier, TokenType.IDENTIFIER), + (kconfig_number, TokenType.NUMBER), + (kconfig_minor_identifier, TokenType.SPECIAL), + # things that do not match are probably things from a macro call. + # unless the syntax changed, or the help parser got confused. + # https://www.kernel.org/doc/html/next/kbuild/kconfig-macro-language.html + # both shell call and warning/error would require additinal parsing + (r'[^\n]+', TokenType.SPECIAL), + ] + + def __init__(self, code): + self.code = code + + def lex(self): + return simple_lexer(self.rules, self.code) + diff --git a/elixir/lexers/utils.py b/elixir/lexers/utils.py index 269af1bc..7b991dd8 100644 --- a/elixir/lexers/utils.py +++ b/elixir/lexers/utils.py @@ -21,6 +21,17 @@ def match_regex(regex): rule = re.compile(regex, flags=re.MULTILINE) return lambda code, pos, _: rule.match(code, pos) +def match_token(ctx, pattern, token_type): + match = re.compile(pattern).match(ctx.code, ctx.pos) + if match is None: + return None, ctx + else: + span = match.span() + result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line) + ctx.pos = span[1] + ctx.line += result.token.count('\n') + return result, ctx + def split_by_groups(*token_types): def split(ctx, match): pos = ctx.pos From 0eec683a2b1a773cd9fd6412ea321fe2a19f197c Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:07:14 +0100 Subject: [PATCH 07/11] lexers: Add Kconfig lexer tests --- elixir/lexers/tests/test_kconfig.py | 372 ++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 elixir/lexers/tests/test_kconfig.py diff --git a/elixir/lexers/tests/test_kconfig.py b/elixir/lexers/tests/test_kconfig.py new file mode 100644 index 00000000..e0adf379 --- /dev/null +++ b/elixir/lexers/tests/test_kconfig.py @@ -0,0 +1,372 @@ +from ..lexers import KconfigLexer +from .base import LexerTest + +class KconfigLexerTest(LexerTest): + lexer_cls = KconfigLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + # TODO improve macro calls + + def test_comments(self): + self.lex(r""" +# comment1 +config 64BIT # comment2 + bool # comment3 + default "# asd" + default $(shell, \#) + help + asdasdsajdlakjd # not a comment + + asdasdsajdlakjd # not a comment + + # comment 5 + + # comment 6 +""", [ + ['COMMENT', '# comment1\n'], + ['SPECIAL', 'config'], + ['IDENTIFIER', '64BIT'], + ['COMMENT', '# comment2\n'], + ['SPECIAL', 'bool'], + ['COMMENT', '# comment3\n'], + ['SPECIAL', 'default'], + ['STRING', '"# asd"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'shell'], + ['SPECIAL', '\\#)'], + ['SPECIAL', 'help'], + ['COMMENT', ' asdasdsajdlakjd # not a comment\n\n asdasdsajdlakjd # not a comment\n\n # comment 5\n\n'], + ['COMMENT', '# comment 6\n'], + ]) + + + def test_keywords(self): + self.lex(r""", +menu "menu name" + +visible if y + +choice + prompt "test prompt" + default y + +config 86CONIFG + bool "text" + prompt "prompt" + default y + tristate "test" + def_bool 
TEST_bool + depends on TEST + select TEST2 + imply TEST3 + range 5 512 if CONFIG_512 + help + help text + + more help text + +endmenu +""", [ + ['SPECIAL', 'menu'], + ['STRING', '"menu name"'], + ['SPECIAL', 'visible'], + ['SPECIAL', 'if'], + ['SPECIAL', 'y'], + ['SPECIAL', 'choice'], + ['SPECIAL', 'prompt'], + ['STRING', '"test prompt"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'config'], + ['IDENTIFIER', '86CONIFG'], + ['SPECIAL', 'bool'], + ['STRING', '"text"'], + ['SPECIAL', 'prompt'], + ['STRING', '"prompt"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'tristate'], + ['STRING', '"test"'], + ['SPECIAL', 'def_bool'], + ['IDENTIFIER', 'TEST_bool'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', 'imply'], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'range'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'CONFIG_512'], + ['SPECIAL', 'help'], + ['COMMENT', ' help text\n\n more help text\n\n'], + ['SPECIAL', 'endmenu'], + ]) + + def test_conditions(self): + self.lex(r""" +config TEST + select TEST1 if TEST2 = TEST3 + select TEST2 if TEST5 != TEST6 + select TEST7 if TEST8 < TEST9 + select TEST10 if TEST11 > TEST12 + select TEST13 if TEST14 <= TEST15 +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST1'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST2'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST5'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST6'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST7'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST8'], + ['PUNCTUATION', '<'], + ['IDENTIFIER', 'TEST9'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST10'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST11'], + ['PUNCTUATION', '>'], + ['IDENTIFIER', 'TEST12'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST13'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST14'], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST15'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_conditions2(self): + self.lex(r""" +config TEST + select TEST16 if TEST17 >= TEST3 + select TEST17 if (TEST18 = TEST19) + + select TEST20 if !(TEST21 = TEST22) + select TEST23 if TEST24 && TEST25 + select TEST26 if TEST27 || TEST28 +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST16'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST17'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST17'], + ['SPECIAL', 'if'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'TEST18'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST19'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST20'], + ['SPECIAL', 'if'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'TEST21'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST22'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST23'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST24'], + ['PUNCTUATION', '&'], + ['PUNCTUATION', '&'], + ['IDENTIFIER', 'TEST25'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST26'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST27'], + ['PUNCTUATION', '|'], + ['PUNCTUATION', '|'], + ['IDENTIFIER', 'TEST28'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_macros(self): + 
self.lex(r""" +config TEST + depends on $(shell,cat file | grep -vi "option 2") + depends on $(info,info to print) + depends on $(warning-if,a != b,warning to print) +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'shell'], + ['PUNCTUATION', ','], + ['SPECIAL', 'cat'], + ['SPECIAL', 'file'], + ['PUNCTUATION', '|'], + ['SPECIAL', 'grep'], + ['PUNCTUATION', '-'], + ['SPECIAL', 'vi'], + ['STRING', '"option 2"'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'info'], + ['PUNCTUATION', ','], + ['SPECIAL', 'info'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'warning-if'], + ['PUNCTUATION', ','], + ['SPECIAL', 'a'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['SPECIAL', 'b'], + ['PUNCTUATION', ','], + ['SPECIAL', 'warning'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + +def test_macros2(self): + self.lex(r""" +config TEST + depends on $(error-if,a != b,warning to print) + depends on $(filename) + depends on $(lineno) +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'error-if'], + ['PUNCTUATION', ','], + ['SPECIAL', 'a'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['SPECIAL', 'b'], + ['PUNCTUATION', ','], + ['SPECIAL', 'warning'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'filename'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'lineno'], + ['PUNCTUATION', ')'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_help(self): + self.lex(r""" +config + help + help test lasdlkajdk sadlksajd + lsajdlad + + salkdjaldlksajd + + " + asdlkajsdlkjsadlajdsk + + salkdjlsakdj' +config + select TEST +config + ---help--- + help test lasdlkajdk sadlksajd + lsajdlad + + salkdjaldlksajd + +config + select TEST +""", [ + ['SPECIAL', 'config'], + ['SPECIAL', 'help'], + ['COMMENT', ' help test lasdlkajdk sadlksajd\n lsajdlad\n\n salkdjaldlksajd\n\n "\n asdlkajsdlkjsadlajdsk\n\n salkdjlsakdj\'\n'], + ['SPECIAL', 'config'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'config'], + ['SPECIAL', '---help---'], + ['COMMENT', ' help test lasdlkajdk sadlksajd\n lsajdlad\n\n salkdjaldlksajd\n \n'], + ['SPECIAL', 'config'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST'], + ]) + + def test_types(self): + self.lex(r""" +config + bool + default y + +config + tristate + default m + +config + hex + default 0xdfffffff00000000 + +config + string + default "string \" test # \# zxc" + +config + int + default 21312323 +""", [ + ['SPECIAL', 'config'], + ['SPECIAL', 'bool'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'config'], + ['SPECIAL', 'tristate'], + ['SPECIAL', 'default'], + ['SPECIAL', 'm'], + ['SPECIAL', 'config'], + ['SPECIAL', 'hex'], + ['SPECIAL', 'default'], + ['IDENTIFIER', '0xdfffffff00000000'], + ['SPECIAL', 'config'], + ['SPECIAL', 'string'], + ['SPECIAL', 'default'], + ['STRING', '"string \\" test # 
\\# zxc"'], + ['SPECIAL', 'config'], + ['SPECIAL', 'int'], + ['SPECIAL', 'default'], + ]) From 78f10c1ee9e2dd5e8a26f48d591000dd105359c8 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:14:18 +0100 Subject: [PATCH 08/11] lexers: Add a GNU Assembler lexer --- elixir/lexers/__main__.py | 2 + elixir/lexers/lexers.py | 107 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index dab00d14..a39b5b42 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -17,6 +17,8 @@ lexer = lexers.DTSLexer(f.read()) elif filename.endswith('Kconfig'): lexer = lexers.KconfigLexer(f.read()) + elif filename.endswith(('.s', '.S')): + lexer = lexers.GasLexer(f.read()) else: raise Exception("no lexer for filetype") diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index 06134a53..5b4bac4b 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -255,3 +255,110 @@ def __init__(self, code): def lex(self): return simple_lexer(self.rules, self.code) + +# https://sourceware.org/binutils/docs/as.html#Syntax +class GasLexer: + # https://sourceware.org/binutils/docs/as.html#Symbol-Intro + # apparently dots are okay, BUT ctags removes the first dot from labels, for example. same with dollars + # /musl/v1.2.5/source/src/string/aarch64/memcpy.S#L92 + gasm_identifier = r'[a-zA-Z0-9_][a-zA-Z0-9_$.]*' + + gasm_flonum = r'0?[a-zA-Z][+-]?([0-9]|\\s*\n\s*)*\.([0-9]|\\s*\n\s*)*([eE][+-]?[0-9]+)?' + gasm_number = regex_or(gasm_flonum, shared.common_hexidecimal_integer, shared.common_binary_integer, + shared.common_decimal_integer) + + gasm_char = r"'(\\.|.|\n)" + gasm_string = f'(({ shared.double_quote_string_with_escapes })|({ gasm_char }))' + + gasm_comment_chars_map = { + 'generic': (r'#\s',), + + 'nios2': (r'#',), + 'openrisc': (r'#',), + 'powerpc': (r'#',), + 's390': (r'#',), + 'xtensa': (r'#',), + 'microblaze': (r'#',), + 'mips': (r'#',), + 'alpha': (r'#',), + 'csky': (r'#',), + # BUT double pipe in macros is an operator... 
and # not in the first line in + # /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S + 'm68k': ('|', '^#', r'#\s'), + 'arc': ('# ', ';'), + + # https://sourceware.org/binutils/docs/as.html#HPPA-Syntax + # /linux/v6.10.7/source/arch/parisc/kernel/perf_asm.S#L28 + 'parisc': (';',), + 'x86': (';',), + 'tic6x': (';', '*'), # cx6, tms320, although the star is sketchy + + # in below, # can be a comment only if the first character of the line + + # https://sourceware.org/binutils/docs/as.html#SH-Syntax + # /linux/v6.10.7/source/arch/sh/kernel/head_32.S#L58 + 'sh': ('!', '^#'), + # https://sourceware.org/binutils/docs/as.html#Sparc_002dSyntax + # /linux/v6.10.7/source/arch/sparc/lib/memset.S#L125 + 'sparc': ('!', '^#'), + # used in ARM https://sourceware.org/binutils/docs/as.html#ARM-Syntax + # /linux/v6.10.7/source/arch/arm/mach-sa1100/sleep.S#L33 + 'arm32': ('@', '^#'), + 'cris': (';', '^#'), + 'avr': (';', '^#'), + # blackfin, tile + } + + gasm_punctuation = r'[.,\[\]()<>{}%&+*!|@#$;:^/\\=~-]' + # TODO make sure all relevant directives are listed here + gasm_preprocessor = r'#[ \t]*(define|ifdef|ifndef|undef|if|else|elif|endif)' + + rules_before_comments = [ + (shared.whitespace, TokenType.WHITESPACE), + # don't interpret macro concatenate as a comment + ('##', TokenType.PUNCTUATION), + # don't interpret or as a comment + (r'\|\|', TokenType.PUNCTUATION), + (FirstInLine(regex_or(shared.c_preproc_include, shared.c_preproc_warning_and_error)), TokenType.SPECIAL), + (FirstInLine(gasm_preprocessor), TokenType.SPECIAL), + (shared.common_slash_comment, TokenType.COMMENT), + ] + + rules_after_comments = [ + (gasm_string, TokenType.STRING), + (gasm_number, TokenType.NUMBER), + (gasm_identifier, TokenType.IDENTIFIER), + (gasm_punctuation, TokenType.PUNCTUATION), + ] + + def __init__(self, code, arch='generic'): + self.code = code + self.comment_chars = self.gasm_comment_chars_map[arch] + + def get_arch_rules(self): + result = [] + + regex_chars = '*?+^.$\\[]|()' + add_slash = lambda ch: '\\' + ch if ch in regex_chars else ch + + for comment_char in self.comment_chars: + if comment_char[0] == '^': + result.append(( + FirstInLine(add_slash(comment_char[1]) + shared.singleline_comment_with_escapes_base), + TokenType.COMMENT + )) + else: + result.append(( + add_slash(comment_char) + shared.singleline_comment_with_escapes_base, + TokenType.COMMENT) + ) + + return result + + def lex(self): + rules = self.rules_before_comments + \ + self.get_arch_rules() + \ + self.rules_after_comments + + return simple_lexer(rules, self.code) + From d967309c17bd4a29d76d325ec8b1ad4cfdef0057 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:15:28 +0100 Subject: [PATCH 09/11] lexers: Add GNU Assembler lexer tests --- elixir/lexers/tests/test_gas.py | 282 ++++++++++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 elixir/lexers/tests/test_gas.py diff --git a/elixir/lexers/tests/test_gas.py b/elixir/lexers/tests/test_gas.py new file mode 100644 index 00000000..3c541f22 --- /dev/null +++ b/elixir/lexers/tests/test_gas.py @@ -0,0 +1,282 @@ +from ..lexers import GasLexer +from .base import LexerTest + +class GasLexerTest(LexerTest): + lexer_cls = GasLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + def test_comments_m68k(self): + self.lex(r""" +# comment 1 +#comment 2 + clrl d1 | comment 3 + clrl d0 |comment 4 +| comment 4 + + clrl d2 # comment 3 + +#if defined(C1) || !defined(C2) + addql #4,%sp +label: + movel 
#-IDNENT,%sp@(IDENT)| comment 5 +// /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S + test # comment 6 +# endif + +#define macro(x) inst &IDENT,%pc@(ident); inst x +""", [ + ['COMMENT', '# comment 1\n'], + ['COMMENT', '#comment 2\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd1'], + ['COMMENT', '| comment 3\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd0'], + ['COMMENT', '|comment 4\n'], + ['COMMENT', '| comment 4\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd2'], + ['COMMENT', '# comment 3\n'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'C1'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'C2'], + ['IDENTIFIER', 'addql'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'label'], + ['IDENTIFIER', 'movel'], + ['IDENTIFIER', 'IDNENT'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'IDENT'], + ['COMMENT', '| comment 5\n'], + ['COMMENT', '// /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S\n'], + ['IDENTIFIER', 'test'], + ['COMMENT', '# comment 6\n'], + ['SPECIAL', '# endif'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'macro'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'inst'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'pc'], + ['IDENTIFIER', 'ident'], + ['IDENTIFIER', 'inst'], + ['IDENTIFIER', 'x'], + ], lexer_options={"arch": "m68k"}) + + def test_comments_sparc(self): + self.lex(r""" +#define F(i) \ + .type i,@function; + + std t1, [0x00]; + +/*comment default */ +//comment default2 + .type identifier,#function +label: + sethi %hi(IDENT), %g0 !test comment + wrpr %g1, %sp ! test comment +# comment +#comment + sethi %hi(IDENT_1 | IDENT_2), %l0 +""", [ + ['SPECIAL', '#define'], + ['IDENTIFIER', 'F'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'function'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 't1'], + ['COMMENT', '/*comment default */'], + ['COMMENT', '//comment default2\n'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'identifier'], + ['IDENTIFIER', 'function'], + ['IDENTIFIER', 'label'], + ['IDENTIFIER', 'sethi'], + ['IDENTIFIER', 'hi'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'g0'], + ['COMMENT', '!test comment\n'], + ['IDENTIFIER', 'wrpr'], + ['IDENTIFIER', 'g1'], + ['IDENTIFIER', 'sp'], + ['COMMENT', '! 
test comment\n'], + ['COMMENT', '# comment\n'], + ['COMMENT', '#comment\n'], + ['IDENTIFIER', 'sethi'], + ['IDENTIFIER', 'hi'], + ['IDENTIFIER', 'IDENT_1'], + ['IDENTIFIER', 'IDENT_2'], + ['IDENTIFIER', 'l0'], + ], lexer_options={"arch": "sparc"}) + + def test_comments_arm32(self): + self.lex(r""" +// comment default +/* comment default2 */ +test: + bic r0, r1, #10 + # comment 1 + #comment 1 +""" ++ "\t# comment 1" + r""" + moveq r0, #IDENTIFIER @ Comment +# comment 2 +#comment 2 + push {r0} + add \addr, \addr, \tmp @comment3 + ldr r1, =TEST3 + ldr TEST, [sp, IDENT(i)]; + .long PMD_TYPE_SECT | \ + PMD_BIT4 + stmfd sp!, {r0, r1, r2, r3} + eor RT0, d, b; +""", [ + ['COMMENT', '// comment default\n'], + ['COMMENT', '/* comment default2 */'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'bic'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'r1'], + ['NUMBER', '10'], + ['COMMENT', '# comment 1\n'], + ['COMMENT', '#comment 1\n'], + ['COMMENT', '# comment 1\n'], + ['IDENTIFIER', 'moveq'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'IDENTIFIER'], + ['COMMENT', '@ Comment\n'], + ['COMMENT', '# comment 2\n'], + ['COMMENT', '#comment 2\n'], + ['IDENTIFIER', 'push'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'add'], + ['IDENTIFIER', 'addr'], + ['IDENTIFIER', 'addr'], + ['IDENTIFIER', 'tmp'], + ['COMMENT', '@comment3\n'], + ['IDENTIFIER', 'ldr'], + ['IDENTIFIER', 'r1'], + ['IDENTIFIER', 'TEST3'], + ['IDENTIFIER', 'ldr'], + ['IDENTIFIER', 'TEST'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'long'], + ['IDENTIFIER', 'PMD_TYPE_SECT'], + ['IDENTIFIER', 'PMD_BIT4'], + ['IDENTIFIER', 'stmfd'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'r1'], + ['IDENTIFIER', 'r2'], + ['IDENTIFIER', 'r3'], + ['IDENTIFIER', 'eor'], + ['IDENTIFIER', 'RT0'], + ['IDENTIFIER', 'd'], + ['IDENTIFIER', 'b'], + ], self.default_filtered_tokens + ("NUMBER",), {"arch": "arm32"}) + + def test_comments_generic(self): + self.lex(r""" +/* comment + * more comment + * more comment + */ + mov r0, r1 //test + mov x0, #IDENT + stp x1, x2, [sp, #-4]! 
+#if defined(IDENT1) || defined(IDENT2) +#endif +""", [ + ['COMMENT', '/* comment\n * more comment\n * more comment\n */'], + ['IDENTIFIER', 'mov'], + ['IDENTIFIER', 'r0'], + ['PUNCTUATION', ','], + ['IDENTIFIER', 'r1'], + ['COMMENT', '//test\n'], + ['IDENTIFIER', 'mov'], + ['IDENTIFIER', 'x0'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '#'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'stp'], + ['IDENTIFIER', 'x1'], + ['PUNCTUATION', ','], + ['IDENTIFIER', 'x2'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '['], + ['IDENTIFIER', 'sp'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '#'], + ['PUNCTUATION', '-'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', '!'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'IDENT1'], + ['PUNCTUATION', ')'], + ['PUNCTUATION', '||'], + ['IDENTIFIER', 'defined'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'IDENT2'], + ['PUNCTUATION', ')'], + ['SPECIAL', '#endif'], + ], self.default_filtered_tokens + ("PUNCTUATION", "NUMBER")) + + def test_comments_preproc(self): + self.lex(r""" + # error "test" +#warning "test" +#include "test.h" +#include +#if defined(T1) || defined(T2) +#endif +""", [ + ['SPECIAL', '# error "test"\n'], + ['SPECIAL', '#warning "test"\n'], + ['SPECIAL', '#include "test.h"'], + ['SPECIAL', '#include '], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'T1'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'T2'], + ['SPECIAL', '#endif'], + ]) + + def test_comments_literals(self): + self.lex(r""" +.byte 12, 0b1010, 0B1010, 0x34, 0123, 0X45, 'a, '\b +.ascii "asdsad\"zxczc" +.float 0f-12321321030982394324\ + 21321432432.234324324E-14 +.float 0f-123.123213e+13 +.float 0e-123.123213e+13 +""", [ + ['IDENTIFIER', 'byte'], + ['NUMBER', '12'], + ['NUMBER', '0b1010'], + ['NUMBER', '0B1010'], + ['NUMBER', '0x34'], + ['NUMBER', '0123'], + ['NUMBER', '0X45'], + ['STRING', "'a"], + ['STRING', "'\\b"], + ['IDENTIFIER', 'ascii'], + ['STRING', '"asdsad\\"zxczc"'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0f-12321321030982394324\\\n 21321432432.234324324E-14'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0f-123.123213e+13'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0e-123.123213e+13'], + ], self.default_filtered_tokens + ("NUMBER",)) + From 9b0ca9a911ab1bfc6717d8d2849c5d3cd1b7888a Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:17:35 +0100 Subject: [PATCH 10/11] lexers: Add a Makefile lexer --- elixir/lexers/__main__.py | 2 ++ elixir/lexers/lexers.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index a39b5b42..676b2ed4 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -19,6 +19,8 @@ lexer = lexers.KconfigLexer(f.read()) elif filename.endswith(('.s', '.S')): lexer = lexers.GasLexer(f.read()) + elif filename.endswith('Makefile'): + lexer = lexers.MakefileLexer(f.read()) else: raise Exception("no lexer for filetype") diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index 5b4bac4b..b470e749 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -362,3 +362,34 @@ def lex(self): return simple_lexer(rules, self.code) + +# https://www.gnu.org/software/make/manual/make.html +class MakefileLexer: + # https://pubs.opengroup.org/onlinepubs/007904975/utilities/make.html + + # NOTE same as in KConfig, we only care about screaming case names + make_identifier = r'[A-Z0-9_]+' + make_minor_identifier = r'[a-zA-Z0-9_][a-zA-Z0-9-_]*' + 
make_variable = r'(\$\([a-zA-Z0-9_-]\)|\$\{[a-zA-Z0-9_-]\})' + make_single_quote_string = r"'*?'" + make_string = f'(({ make_single_quote_string })|({ shared.double_quote_string_with_escapes }))' + make_escape = r'\\[#"\']' + make_punctuation = r'[~\\`\[\](){}<>.,:;|%$^@&?!+*/=-]' + make_comment = r'(? Date: Mon, 30 Dec 2024 00:28:25 +0100 Subject: [PATCH 11/11] lexers: Integrate new lexers with the rest of Elixir --- elixir/filters/__init__.py | 72 ++++++++++++++++++++++++++------------ elixir/lexers/__init__.py | 10 ++++++ elixir/project_utils.py | 17 +++++++++ elixir/projects.py | 44 +++++++++++++++++++++++ elixir/query.py | 38 ++++++++++++-------- elixir/web.py | 5 +-- update.py | 60 +++++++++++++++++++------------ 7 files changed, 186 insertions(+), 60 deletions(-) diff --git a/elixir/filters/__init__.py b/elixir/filters/__init__.py index b06eae8f..e65e9d08 100755 --- a/elixir/filters/__init__.py +++ b/elixir/filters/__init__.py @@ -1,23 +1,51 @@ -from typing import List - -from .utils import Filter, FilterContext -from .projects import project_filters, default_filters - -# Returns a list of applicable filters for project_name under provided filter context -def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: - filter_classes = project_filters.get(project_name, default_filters) - filters = [] - - for filter_cls in filter_classes: - if type(filter_cls) == tuple and len(filter_cls) == 2: - cls, kwargs = filter_cls - filters.append(cls(**kwargs)) - elif type(filter_cls) == type: - filters.append(filter_cls()) - else: - raise ValueError(f"Invalid filter: {filter_cls}, " \ - "should be either a two element tuple or a type. " \ - "Make sure project_filters in project.py is valid.") - - return [f for f in filters if f.check_if_applies(ctx)] +from .ident import IdentFilter + +from .cppinc import CppIncFilter +from .cpppathinc import CppPathIncFilter + +from .defconfig import DefConfigIdentsFilter +from .configin import ConfigInFilter + +from .kconfig import KconfigFilter +from .kconfigidents import KconfigIdentsFilter + +from .dtsi import DtsiFilter +from .dtscompdocs import DtsCompDocsFilter +from .dtscompcode import DtsCompCodeFilter +from .dtscompdts import DtsCompDtsFilter + +from .makefileo import MakefileOFilter +from .makefiledtb import MakefileDtbFilter +from .makefiledir import MakefileDirFilter +from .makefilesubdir import MakefileSubdirFilter +from .makefilefile import MakefileFileFilter +from .makefilesrctree import MakefileSrcTreeFilter +from .makefilesubdir import MakefileSubdirFilter + + +# List of filters applied to all projects +default_filters = [ + DtsCompCodeFilter, + DtsCompDtsFilter, + DtsCompDocsFilter, + IdentFilter, + CppIncFilter, +] + +# List of filters for Kconfig files +common_kconfig_filters = [ + KconfigFilter, + KconfigIdentsFilter, + DefConfigIdentsFilter, +] + +# List of filters for Makefiles +common_makefile_filters = [ + MakefileOFilter, + MakefileDtbFilter, + MakefileDirFilter, + MakefileFileFilter, + MakefileSubdirFilter, + MakefileSrcTreeFilter, +] diff --git a/elixir/lexers/__init__.py b/elixir/lexers/__init__.py index e69de29b..f4f3fa32 100644 --- a/elixir/lexers/__init__.py +++ b/elixir/lexers/__init__.py @@ -0,0 +1,10 @@ +from .lexers import * + +default_lexers = { + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'.*\.s': GasLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst +} + diff --git a/elixir/project_utils.py b/elixir/project_utils.py index 
31523a83..242a62c1 100644 --- a/elixir/project_utils.py +++ b/elixir/project_utils.py @@ -4,6 +4,7 @@ from .filters.utils import Filter, FilterContext from .filters import default_filters from .projects import projects +from .lexers import default_lexers # Returns a list of applicable filters for project_name under provided filter context def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: @@ -28,3 +29,19 @@ def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: return [f for f in filters if f.check_if_applies(ctx)] +def get_lexer(path: str, project_name: str): + project_config = projects.get(project_name) + if project_config is None or 'lexers' not in project_config: + lexers = default_lexers + else: + lexers = project_config['lexers'] + + path = path.lower() + for regex, lexer in lexers.items(): + if re.match(regex, path): + if type(lexer) == tuple: + lexer_cls, kwargs = lexer + return lambda code: lexer_cls(code, **kwargs) + else: + return lambda code: lexer(code) + diff --git a/elixir/projects.py b/elixir/projects.py index 90a1ecc3..53d4065e 100644 --- a/elixir/projects.py +++ b/elixir/projects.py @@ -1,4 +1,7 @@ from .filters import * +from collections import OrderedDict +from .filters import * +from .lexers import * # Dictionary of custom per-projects settings. # filters: @@ -48,6 +51,29 @@ # Our solution is to ignore all includes in such paths (CppPathIncFilter, {"path_exceptions": {'^/include/uapi/.*'}}), ], + 'lexers': OrderedDict({ + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst + + r'/arch/alpha/.*\.s': (GasLexer, {"arch": "alpha"}), + r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}), + r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}), + r'/arch/csky/.*\.s': (GasLexer, {"arch": "csky"}), + r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}), + r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}), + r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}), + r'/arch/openrisc/.*\.s': (GasLexer, {"arch": "openrisc"}), + r'/arch/parisc/.*\.s': (GasLexer, {"arch": "parisc"}), + r'/arch/s390/.*\.s': (GasLexer, {"arch": "s390"}), + r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}), + r'/arch/sparc/.*\.s': (GasLexer, {"arch": "sparc"}), + r'/arch/um/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}), + r'.*\.s': GasLexer, + }), }, 'qemu': { 'filters': [ @@ -63,6 +89,24 @@ CppPathIncFilter, *common_makefile_filters, ], + 'lexers': OrderedDict({ + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst + + r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}), + r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}), + r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}), + r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}), + r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}), + r'/arch/riscv/.*\.s': (GasLexer, {"arch": "riscv"}), + r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}), + r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/sandbox/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}), + r'.*\.s': GasLexer, + }), }, 'uclibc-ng': { 'filters': [ diff --git a/elixir/query.py b/elixir/query.py index ff14d4b1..5476dc6d 100755 --- a/elixir/query.py +++ b/elixir/query.py @@ -21,7 +21,8 @@ from 
.lib import script, scriptLines, decode from . import lib from . import data -import os +from .lexers import TokenType +import os, sys from collections import OrderedDict from urllib import parse @@ -172,29 +173,38 @@ def query(self, cmd, *args): version = args[0] path = args[1] + lexer = args[2] filename = os.path.basename(path) family = lib.getFileFamily(filename) - if family != None: + if family is not None and lexer is not None: buffer = BytesIO() - tokens = self.scriptLines('tokenize-file', version, path, family) - even = True + code = self.get_file_raw(version, path) prefix = b'' if family == 'K': prefix = b'CONFIG_' - for tok in tokens: - even = not even - tok2 = prefix + tok - if (even and self.db.defs.exists(tok2) and - (lib.compatibleFamily(self.db.defs.get(tok2).get_families(), family) or - lib.compatibleMacro(self.db.defs.get(tok2).get_macros(), family))): - tok = b'\033[31m' + tok2 + b'\033[0m' - else: - tok = lib.unescape(tok) - buffer.write(tok) + for token_type, token, _, line in lexer(code).lex(): + token = token.encode() + + if token_type == TokenType.ERROR: + print("error token: ", token, token_type, filename, line, file=sys.stderr) + elif token_type == TokenType.IDENTIFIER: + token_with_prefix = prefix + token + token_in_db = self.db.defs.exists(token_with_prefix) + if token_in_db: + compatible = \ + lib.compatibleFamily(self.db.defs.get(token_with_prefix).get_families(), family) or \ + lib.compatibleMacro(self.db.defs.get(token_with_prefix).get_macros(), family) + + if compatible: + buffer.write(b'\033[31m' + token_with_prefix + b'\033[0m') + continue + + buffer.write(token) + return decode(buffer.getvalue()) else: return decode(self.script('get-file', version, path)) diff --git a/elixir/web.py b/elixir/web.py index 514e9cce..d25745b0 100755 --- a/elixir/web.py +++ b/elixir/web.py @@ -33,7 +33,7 @@ from .lib import validFamily from .query import Query, SymbolInstance -from .project_utils import get_filters +from .project_utils import get_filters, get_lexer from .filters.utils import FilterContext from .autocomplete import AutocompleteResource from .api import ApiIdentGetterResource @@ -485,7 +485,8 @@ def format_code(filename, code): # version: requested version of the project # path: path to the file in the repository def generate_source(q, project, version, path): - code = q.query('file', version, path) + lexer = get_lexer(path, project) + code = q.query('file', version, path, lexer) _, fname = os.path.split(path) _, extension = os.path.splitext(fname) diff --git a/update.py b/update.py index 79cb4dcf..3d14e8ce 100755 --- a/update.py +++ b/update.py @@ -22,13 +22,16 @@ # Throughout, an "idx" is the sequential number associated with a blob. # This is different from that blob's Git hash. +import sys from sys import argv from threading import Thread, Lock, Event, Condition +from elixir.lexers import TokenType import elixir.lib as lib from elixir.lib import script, scriptLines import elixir.data as data from elixir.data import PathList +from elixir.project_utils import get_lexer from find_compatible_dts import FindCompatibleDTS verbose = False @@ -56,6 +59,7 @@ bindings_idxes = [] # DT bindings documentation files idx_key_mod = 1000000 defs_idxes = {} # Idents definitions stored with (idx*idx_key_mod + line) as the key. 
+file_paths = {} tags_done = False # True if all tags have been added to new_idxes @@ -163,7 +167,7 @@ def run(self): progress('vers: Thread finished', index) def update_versions(self, tag): - global blobs_lock + global blobs_lock, file_paths # Get blob hashes and associated file paths blobs = scriptLines('list-blobs', '-p', tag) @@ -174,12 +178,14 @@ def update_versions(self, tag): with blobs_lock: idx = db.blob.get(hash) buf.append((idx, path)) + file_paths[idx] = path buf = sorted(buf) obj = PathList() for idx, path in buf: obj.append(idx, path) + # Store DT bindings documentation files to parse them later if path[:33] == b'Documentation/devicetree/bindings': bindings_idxes.append(idx) @@ -275,6 +281,7 @@ def run(self): new_idxes[self.index][1].wait() # Make sure the tag is ready new_idxes[self.index][2].wait() # Make sure UpdateDefs processed the tag + new_idxes[self.index][4].wait() # Tell that UpdateVersions processed the tag with tags_refs_lock: tags_refs[0] += 1 @@ -288,45 +295,53 @@ def run(self): progress('refs: Thread ' + str(tags_refs[1]) + '/' + str(self.inc) + ' finished', tags_refs[0]) def update_references(self, idxes): - global hash_file_lock, defs_lock, refs_lock, tags_refs + global hash_file_lock, defs_lock, refs_lock, tags_refs, file_paths for idx in idxes: if idx % 1000 == 0: progress('refs: ' + str(idx), tags_refs[0]) with hash_file_lock: hash = db.hash.get(idx) - filename = db.file.get(idx) + filename = file_paths[idx].decode() family = lib.getFileFamily(filename) if family == None: continue + lexer = get_lexer(filename, project) + if lexer is None: + continue + + try: + code = script('get-blob', hash).decode() + except UnicodeDecodeError: + code = script('get-blob', hash).decode('raw_unicode_escape') + prefix = b'' # Kconfig values are saved as CONFIG_ if family == 'K': prefix = b'CONFIG_' - tokens = scriptLines('tokenize-file', '-b', hash, family) - even = True - line_num = 1 idents = {} with defs_lock: - for tok in tokens: - even = not even - if even: - tok = prefix + tok - - if (db.defs.exists(tok) and - not ( (idx*idx_key_mod + line_num) in defs_idxes and - defs_idxes[idx*idx_key_mod + line_num] == tok ) and - (family != 'M' or tok.startswith(b'CONFIG_'))): - # We only index CONFIG_??? in makefiles - if tok in idents: - idents[tok] += ',' + str(line_num) - else: - idents[tok] = str(line_num) + for token_type, token, _, line in lexer(code).lex(): + if token_type == TokenType.ERROR: + print("error token: ", token, token_type, filename, line, file=sys.stderr) + continue - else: - line_num += tok.count(b'\1') + token = prefix + token.encode() + + if token_type != TokenType.IDENTIFIER: + continue + + if (db.defs.exists(token) and + not ( (idx*idx_key_mod + line) in defs_idxes and + defs_idxes[idx*idx_key_mod + line] == token ) and + (family != 'M' or token.startswith(b'CONFIG_'))): + # We only index CONFIG_??? in makefiles + if token in idents: + idents[token] += ',' + str(line) + else: + idents[token] = str(line) with refs_lock: for ident, lines in idents.items(): @@ -579,6 +594,7 @@ def progress(msg, current): for tag in scriptLines('list-tags'): if not db.vers.exists(tag): tag_buf.append(tag) + break num_tags = len(tag_buf) project = lib.currentProject()
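
Taken together, the series wires the per-project lexer tables in projects.py, the path-based lookup in project_utils.get_lexer, and the lexers' common lex() token stream into query.py and update.py. Below is a minimal sketch of that flow, assuming only the interfaces shown in the diffs above; the helper name list_identifiers is illustrative and is not part of these patches.

from elixir.lexers import TokenType
from elixir.project_utils import get_lexer

def list_identifiers(project, path, code):
    # Pick the lexer configured for this path in projects.py, falling back to
    # default_lexers; get_lexer returns None when no pattern matches the path.
    lexer = get_lexer(path, project)
    if lexer is None:
        return []
    idents = []
    # Each token is (type, text, span, line), as consumed by query.py and update.py.
    for token_type, token, _span, line in lexer(code).lex():
        if token_type == TokenType.IDENTIFIER:
            idents.append((line, token))
    return idents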