From 649dcb0ce51a695884324e107d04c35da6c941c9 Mon Sep 17 00:00:00 2001
From: Franciszek Stachura
Date: Fri, 25 Apr 2025 01:09:06 +0200
Subject: [PATCH 01/11] filters: Refactor filters list into general project settings

---
 elixir/project_utils.py | 30 +++++++++++++++
 elixir/projects.py      | 82 +++++++++++++++++++++++++++++++++++++++++
 elixir/web.py           |  2 +-
 3 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 elixir/project_utils.py
 create mode 100644 elixir/projects.py

diff --git a/elixir/project_utils.py b/elixir/project_utils.py
new file mode 100644
index 00000000..31523a83
--- /dev/null
+++ b/elixir/project_utils.py
@@ -0,0 +1,30 @@
+import re
+from typing import List
+
+from .filters.utils import Filter, FilterContext
+from .filters import default_filters
+from .projects import projects
+
+# Returns a list of filters applicable to project_name under the provided filter context
+def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]:
+    project_config = projects.get(project_name)
+    if project_config is None or 'filters' not in project_config:
+        filter_classes = default_filters
+    else:
+        filter_classes = project_config['filters']
+
+    filters = []
+
+    for filter_cls in filter_classes:
+        if type(filter_cls) == tuple and len(filter_cls) == 2:
+            cls, kwargs = filter_cls
+            filters.append(cls(**kwargs))
+        elif type(filter_cls) == type:
+            filters.append(filter_cls())
+        else:
+            raise ValueError(f"Invalid filter: {filter_cls}, " \
+                "should be either a two-element tuple or a type. " \
+                "Make sure the filters lists in projects.py are valid.")
+
+    return [f for f in filters if f.check_if_applies(ctx)]
+
diff --git a/elixir/projects.py b/elixir/projects.py
new file mode 100644
index 00000000..90a1ecc3
--- /dev/null
+++ b/elixir/projects.py
@@ -0,0 +1,82 @@
+from .filters import *
+
+# Dictionary of custom per-project settings.
+# filters:
+# Projects not present in this dictionary only use default_filters.
+# Use `*` to unpack the filter lists imported above.
+# You can pass additional options to a filter by putting the Filter
+# class and a dictionary with options in a tuple, like this:
+# (FilterCls, {"option": True}).
+# Check filter files and utils.py for information about available options
+projects = {
+    'amazon-freertos': {
+        'filters': [
+            *default_filters,
+            MakefileSubdirFilter,
+        ],
+    },
+    'arm-trusted-firmware': {
+        'filters': [
+            *default_filters,
+            CppPathIncFilter,
+        ],
+    },
+    'barebox': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            CppPathIncFilter,
+            *common_makefile_filters,
+        ],
+    },
+    'coreboot': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            *common_makefile_filters,
+        ],
+    },
+    'linux': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            *common_makefile_filters,
+            # include/uapi contains includes to user headers under #ifndef __KERNEL__
+            # Our solution is to ignore all includes in such paths
+            (CppPathIncFilter, {"path_exceptions": {'^/include/uapi/.*'}}),
+        ],
+    },
+    'qemu': {
+        'filters': [
+            *default_filters,
+            *common_kconfig_filters,
+        ],
+    },
+    'u-boot': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            CppPathIncFilter,
+            *common_makefile_filters,
+        ],
+    },
+    'uclibc-ng': {
+        'filters': [
+            *default_filters,
+            ConfigInFilter,
+        ],
+    },
+    'zephyr': {
+        'filters': [
+            *default_filters,
+            DtsiFilter,
+            *common_kconfig_filters,
+            CppPathIncFilter,
+        ],
+    },
+}
+
diff --git a/elixir/web.py b/elixir/web.py
index 2a0cbbbb..514e9cce 100755
--- a/elixir/web.py
+++ b/elixir/web.py
@@ -33,7 +33,7 @@
 from .lib import validFamily
 from .query import Query, SymbolInstance
-from .filters import get_filters
+from .project_utils import get_filters
 from .filters.utils import FilterContext
 from .autocomplete import AutocompleteResource
 from .api import ApiIdentGetterResource

From 4aff55019668eafcd1bbdbeb22674f71b8ea7e85 Mon Sep 17 00:00:00 2001
From: Franciszek Stachura
Date: Thu, 17 Oct 2024 21:26:34 +0200
Subject: [PATCH 02/11] lexers: Add a C lexer

This commit adds a C lexer with a tool to output lexing results.
The lexer is not hooked up to the rest of Elixir yet.

Why:

Currently, Elixir uses a simple, single-regex, Perl-based lexer.
This approach mostly works, but it has a few issues:
* Strings are not parsed correctly - there is a mistake in the regex
  that causes it to get confused by escapes in strings.
* DTS identifiers are not parsed correctly - some valid identifiers are
  not recognized, others are split by otherwise allowed characters
  (mostly commas).
* It barely distinguishes between languages. Comments in Kconfig files
  are not parsed correctly (and are not parseable by a simple regex).
* Only identifiers are handled. Some parts of Elixir (filters,
  doccomments parser) could probably use a more detailed token stream.
  Right now, each of these parts contains a different regex-based
  lexer/parser. This mostly works, but again, it means the same
  functionality is reimplemented in many different parts of Elixir.
* It does not recognize numbers, which means that numbers are looked up
  in the database during updates.
* The keyword blocklist/allowlist is shared between all languages.

Some of these issues could be directly addressed in the regex itself,
and some could be addressed somewhere in Elixir. But since there is a
need for more sophisticated code analysis (ex. compatible filters,
doccomments), and leaving such a crucial part of Elixir to a Perl
one-liner seems quite hacky, a decision was made to implement proper
lexing, with a different lexer for each supported language.

Libraries considered:
* Pygments - Pygments lexers are good for code highlighting.
  It seems that as long as the token stream results in the expected
  identifiers being highlighted, it's good enough for Pygments. That is
  okay. Pygments lexers could be modified to provide a more reliable
  token stream, but the question is - does that help a typical Pygments
  user? Is it worth the maintainers' time? My assumption is that
  Pygments is not meant to be a general code analysis tool, but a code
  highlighter. It does that well, and extending it to a general lexer
  for all languages could be painful.
* PLY - It seems that it's mostly meant for educational purposes and
  isn't maintained anymore. It's not very ergonomic; for example, the
  interface requires each lexer to be in a different file.
* pycparser - good for C, but does not support macros.
* Other parsing libraries - I'm quite sure that at this stage, and for
  Elixir's purposes, a flat token stream is what we want, not a full
  AST. Partial parsing could be done on the token stream later. But if
  more complete analysis is necessary, then it's probably better to
  leave it to tools specific to that language (see ctags). Parser rules
  are also typically more complicated.

Goals:
* Good identifier support - ex. not all DTS identifiers are parsed
  correctly right now.
* Better comment support - Kconfig help texts and GNU assembler
  comments are not parsed well at all.
* A usable token stream that can be reassembled back into a file - some
  code analysis may require information about punctuation or comments,
  besides identifiers. It's also good to be sure that each character
  was considered, especially if code is meant to be modified.

Notes:

The lexers will never be perfect. Languages change, and file extensions
are confusing (.h can mean C, DTS or assembler). The main idea is to
increase the reliability of identifier reference search, but achieving
total correctness may require more work than it's worth. I picked an
approach that should be, I hope, easy to understand, maintain, and
allow sharing as much code as possible between different lexers.
---
 elixir/lexers/__init__.py |   0
 elixir/lexers/__main__.py |  13 +++
 elixir/lexers/lexers.py   |  38 +++++++++
 elixir/lexers/shared.py   |  47 +++++++++++
 elixir/lexers/utils.py    | 171 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 269 insertions(+)
 create mode 100644 elixir/lexers/__init__.py
 create mode 100644 elixir/lexers/__main__.py
 create mode 100644 elixir/lexers/lexers.py
 create mode 100644 elixir/lexers/shared.py
 create mode 100644 elixir/lexers/utils.py

diff --git a/elixir/lexers/__init__.py b/elixir/lexers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py
new file mode 100644
index 00000000..7948eb94
--- /dev/null
+++ b/elixir/lexers/__main__.py
@@ -0,0 +1,13 @@
+if __name__ == "__main__":
+    import sys
+    from . import lexers
+
+    if len(sys.argv) != 2:
+        print("usage:", sys.argv[0], "path/to/file")
+        exit(1)
+
+    with open(sys.argv[1]) as f:
+        lexer = lexers.CLexer(f.read())
+        for token in lexer.lex():
+            print(token.line, token.token_type.name, token.span, token.token.encode())
+
diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py
new file mode 100644
index 00000000..8f021912
--- /dev/null
+++ b/elixir/lexers/lexers.py
@@ -0,0 +1,38 @@
+import re
+
+from . import shared
+from .utils import TokenType, simple_lexer, FirstInLine
+
+# Lexers used to extract possible references from source files
+# Design inspired by Pygments lexers interface
+
+# https://en.cppreference.com/w/c/language
+# https://www.iso-9899.info/wiki/The_Standard
+class CLexer:
+    # NOTE: does not support unicode identifiers
+    c_identifier = r'[a-zA-Z_][a-zA-Z_0-9]*'
+
+    c_punctuation = r'[!#%&`()*+,./:;<=>?\[\]\\^_{|}~-]'
+
+    # NOTE: macros don't always contain C code, but detecting that in practice is hard
+    # without information about context (where the file is included from).
+    c_punctuation_extra = r'[$\\@]'
+
+    rules = [
+        (shared.whitespace, TokenType.WHITESPACE),
+        (shared.common_slash_comment, TokenType.COMMENT),
+        (shared.common_string_and_char, TokenType.STRING),
+        (shared.c_number, TokenType.NUMBER),
+        (c_identifier, TokenType.IDENTIFIER),
+        (FirstInLine(shared.c_preproc_ignore), TokenType.SPECIAL),
+        (c_punctuation, TokenType.PUNCTUATION),
+        (c_punctuation_extra, TokenType.PUNCTUATION),
+    ]
+
+    def __init__(self, code):
+        self.code = code
+
+    def lex(self, **kwargs):
+        return simple_lexer(self.rules, self.code, **kwargs)
+
+
diff --git a/elixir/lexers/shared.py b/elixir/lexers/shared.py
new file mode 100644
index 00000000..96625d30
--- /dev/null
+++ b/elixir/lexers/shared.py
@@ -0,0 +1,47 @@
+from .utils import regex_or, regex_concat
+
+# Regexes shared between lexers
+
+whitespace = r'\s+'
+
+# Building block for comments that start with a character and go until the end of the line
+singleline_comment_with_escapes_base = r'(\\\s*\n|[^\n])*\n'
+
+slash_star_multline_comment = r'/\*(.|\s)*?\*/'
+double_slash_singleline_comment = r'//' + singleline_comment_with_escapes_base
+common_slash_comment = regex_or(slash_star_multline_comment, double_slash_singleline_comment)
+
+common_decimal_integer = r'[0-9][0-9\']*'
+common_hexidecimal_integer = r'0[xX][0-9a-fA-F][0-9a-fA-F\']*'
+common_octal_integer = r'0[0-7][0-7\']*'
+common_binary_integer = r'0[bB][01][01\']*'
+
+c_preproc_include = r'#\s*include\s*(<.*?>|".*?")'
+# match warning and error directives with the error string
+c_preproc_warning_and_error = r'#\s*(warning|error)\s(\\\s*\n|[^\n])*\n'
+# match other preprocessor directives, but don't consume the whole line
+c_preproc_other = r'#\s*[a-z]+'
+c_preproc_ignore = regex_or(c_preproc_include, c_preproc_warning_and_error, c_preproc_other)
+
+# backslash plus any amount of whitespace and a newline, any character that's not a backslash, newline or quote, or any escaped character
+double_quote_string_with_escapes = r'"(\\\s*\n|[^\\"\n]|\\(.|\s))*?"'
+single_quote_string_with_escapes = r"'(\\\s*\n|[^\\'\n]|\\(.|\s))*?'"
+
+common_string_and_char = regex_or(double_quote_string_with_escapes, single_quote_string_with_escapes)
+
+c_exponent = r'([eE][+-]?[0-9][0-9\']*)'
+c_hexidecimal_exponent = r'([pP][+-]?[0-9][0-9\']*)'
+
+c_decimal_double_part = r'\.[0-9\']*' + c_exponent + '?'
+c_octal_double_part = r'\.[0-7\']*' + c_exponent + '?'
+c_hexidecimal_double_part = r'\.[0-9a-fA-F\']*' + c_hexidecimal_exponent + '?'
+
+c_decimal = f'{ common_decimal_integer }({ c_decimal_double_part })?'
+c_hexidecimal = f'{ common_hexidecimal_integer }({ c_hexidecimal_double_part })?'
+c_octal = f'{ common_octal_integer }({ c_octal_double_part })?'
+
+# not entirely correct... 
accepts way more than the standard allows +c_number_suffix = r'([uU]|[lL]|(wb|WB)|[fF]|[zZ]){0,5}' + +c_number = regex_concat(regex_or(c_hexidecimal, common_binary_integer, c_decimal, c_octal), c_number_suffix) + diff --git a/elixir/lexers/utils.py b/elixir/lexers/utils.py new file mode 100644 index 00000000..0290754b --- /dev/null +++ b/elixir/lexers/utils.py @@ -0,0 +1,171 @@ +import re +import enum +from collections import namedtuple + +# Supported token types +class TokenType(enum.Enum): + WHITESPACE = 'whitespace', + COMMENT = 'comment' + STRING = 'string' + NUMBER = 'number' + IDENTIFIER = 'identifier' + # may require extra parsing or context information + SPECIAL = 'special' + PUNCTUATION = 'punctuation' + # lexing failure - should be logged, at least until update jobs are preemptible + ERROR = 'error' + +Token = namedtuple('Token', 'token_type, token, span, line') + +def match_regex(regex): + rule = re.compile(regex, flags=re.MULTILINE) + return lambda code, pos, _: rule.match(code, pos) + +# Interface class that allows to match only if certian conditions, +# hard to express in regex, are true +class Matcher: + def update_after_match(self, code: str, pos: int, line: int, token: Token) -> None: + pass + + def match(self, code: str, pos: int, line: int) -> None | re.Match: + pass + +# Match token only if it's the first token in line (skipping whitespace) +class FirstInLine(Matcher): + whitespace = re.compile(r'\s*') + + def __init__(self, regex): + self.rule = re.compile(regex, flags=re.MULTILINE) + self.first_in_line = True + + def update_after_match(self, code, pos, line, token): + # first token is always first in line + if pos == 0: + self.first_in_line = True + return + + # check if matched token contains a newline + newline_pos = code.rfind('\n', token.span[0], token.span[1]) + + # if it doesn't contain a newline, check the part after newline + if newline_pos != -1: + post_newline_tok = code[newline_pos+1:token.span[1]] + + # if part after newline contains only whitespace (or nothing), the next token is first in line + if self.whitespace.fullmatch(post_newline_tok): + self.first_in_line = True + # if currently matched is the first in line, and only contains whitespace, + # the next token also counts as first in line + elif self.first_in_line and self.whitespace.fullmatch(code, token.span[0], token.span[1]): + self.first_in_line = True + # otherwise reset first in line marker + else: + self.first_in_line = False + + def match(self, code, pos, line): + if self.first_in_line: + return self.rule.match(code, pos) + +class LexerContext: + def self(self, code, pos, line, filter_tokens): + self.code = code + self.pos = pos + self.line = line + self.filter_tokens = filter_tokens + +def simple_lexer(rules, code, filter_tokens=None): + if len(code) == 0: + return + + # to avoid dealing with files without trailing newlines + if code[-1] != '\n': + code += '\n' + + rules_compiled = [] + after_match_hooks = [] + + # compile rules + for rule, action in rules: + # string rules are actually match regex rules + if type(rule) is str: + rules_compiled.append((match_regex(rule), action)) + # rules can also be callables + elif callable(rule): + rules_compiled.append((rule, action)) + # rules can also be matchers - matchers get more information during parsing, + # that information can stored in their state + elif isinstance(rule, Matcher): + rules_compiled.append((rule.match, action)) + after_match_hooks.append(rule.update_after_match) + + # helper function that calls hooks before yielding + def 
yield_token(to_yield): + for hook in after_match_hooks: + hook(code, pos, line, to_yield) + return to_yield + + pos = 0 + line = 1 + while pos < len(code): + rule_matched = False + for rule, action in rules_compiled: + match = rule(code, pos, line) + + if match is not None: + span = match.span() + # if match is empty - continue + if span[0] == span[1]: + continue + + rule_matched = True + + if isinstance(action, TokenType): + # only parse tokens of interest - slices apparently copy + if filter_tokens is None or action in filter_tokens: + token = code[span[0]:span[1]] + else: + token = None + + token_obj = Token(action, token, span, line) + yield yield_token(token_obj) + line += code.count('\n', span[0], span[1]) + pos = span[1] + break + elif callable(action): + last_token = None + for token in action(LexerContext(code, pos, line, filter_tokens), match): + last_token = token + yield yield_token(token) + + if last_token is not None: + pos = last_token.span[1] + line = last_token.line + last_token.token.count('\n') + + break + else: + raise Exception(f"invalid action {action}") + + # if no rules match, an error token with a single character is produced. + # this isn't always a big problem, hence it's the decision of the caller + # to decide whether to quit or continue + if not rule_matched: + token = Token(TokenType.ERROR, code[pos], (pos, pos+1), line) + yield yield_token(token) + if code[pos] == '\n': + line += 1 + pos += 1 + +# Combines regexes passed as arguments with pipe operator +def regex_or(*regexes): + result = '(' + for r in regexes: + result += f'({ r })|' + return result[:-1] + ')' + +# Concatenates regexes, putting each in a separate group +def regex_concat(*regexes): + result = '' + for r in regexes: + result += f'({ r })' + return result + From 174f29e50e23a07822f93463f165efaef2401c21 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Thu, 17 Oct 2024 22:06:16 +0200 Subject: [PATCH 03/11] lexers: Add C lexer tests This commit adds tests for the C lexer. The "architecture" may be controversial - test snippets are stored as strings. This has some drawbacks. * It's pretty ugly * Whitespace sensitive test cases may require extra care * Grepping may get even more annoying (it probably makes sense to skip all files starting with test_). An alternative would be to store each test case and result as a different file. I didn't go with that approach because of the following reasons: * It's harder to use the built-in Python testing framework * Test cases should be short, but it's annoying to navigate between many different small files * Making a readable test result format requres extra parsing work in Elixir. It's doable, but also annoying. 
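
For illustration, a test case under this architecture looks roughly
like the minimal sketch below (class and test names are illustrative;
the real cases live in test_c.py added by this commit):

    from ..lexers import CLexer
    from .base import LexerTest

    class ExampleCLexerTest(LexerTest):
        lexer_cls = CLexer

        def test_simple_declaration(self):
            # the snippet stays inline as a string, right next to the
            # expected [token type, token] pairs it should lex into
            self.lex(r"""
    int x = 1;
    """, [
                ['IDENTIFIER', 'int'],
                ['IDENTIFIER', 'x'],
            ])
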
--- elixir/lexers/tests/__init__.py | 0 elixir/lexers/tests/base.py | 65 ++++ elixir/lexers/tests/test_c.py | 567 ++++++++++++++++++++++++++++++++ 3 files changed, 632 insertions(+) create mode 100644 elixir/lexers/tests/__init__.py create mode 100644 elixir/lexers/tests/base.py create mode 100644 elixir/lexers/tests/test_c.py diff --git a/elixir/lexers/tests/__init__.py b/elixir/lexers/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/elixir/lexers/tests/base.py b/elixir/lexers/tests/base.py new file mode 100644 index 00000000..e234df33 --- /dev/null +++ b/elixir/lexers/tests/base.py @@ -0,0 +1,65 @@ +import unittest + +class LexerTest(unittest.TestCase): + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + # Checks if each token starts in the claimed position of code, if tokens cover all code and if no tokens overlap + def verify_positions(self, code, tokens): + last_token = None + for t in tokens: + if code[t.span[0]:t.span[1]] != t.token: + self.fail(f"token {t} span != code span {code[t.span[0]:t.span[1]].encode()}") + + if last_token is not None and last_token.span[1] != t.span[0]: + self.fail(f"token does not start where the previous token ends. prev: {last_token}, next: {t}") + elif last_token is None and t.span[0] != 0: + self.fail(f"first token does not start at zero: {t}") + + last_token = t + + if last_token.span[1] != len(code): + self.fail(f"code is longer than position of the last token: {t}, code len: {len(code)}") + + # Checks if each token is in the claimed line of code + def verify_lines(self, code, tokens): + lines = [""] + code.split("\n") # zero line is emtpy + last_line_number = None + last_line_contents_left = None + for t in tokens: + if last_line_number != t.line: + last_line_number = t.line + last_line_contents_left = lines[t.line] + + if last_line_contents_left is None: + self.fail(f"nothing left in line {t.line} for {t.token} {t}") + + newline_count = t.token.count("\n") + all_token_lines = last_line_contents_left + "\n" + \ + "\n".join([lines[i] for i in range(t.line+1, t.line+newline_count+1)]) + "\n" + token_pos_in_lines = all_token_lines.find(t.token) + if token_pos_in_lines == -1: + self.fail(f"token {t.token} not found in line {t.line}: {all_token_lines.encode()}") + if token_pos_in_lines < len(last_line_contents_left): + last_line_contents_left = last_line_contents_left[token_pos_in_lines:] + else: + last_line_contents_left = None + + # Lex code, do basic soundness checks on tokens (lines and positions) and compare lexing results with a list of tokens + def lex(self, code, expected, filtered_tokens=None, lexer_options={}): + if filtered_tokens is None: + filtered_tokens = self.default_filtered_tokens + + code = code.lstrip() + tokens = list(self.lexer_cls(code, **lexer_options).lex()) + self.verify_positions(code, tokens) + self.verify_lines(code, tokens) + + tokens = [[type.name, token] for type, token, span, line in tokens] + tokens = [t for t in tokens if t[0] in filtered_tokens] + try: + self.assertEqual(tokens, expected) + except Exception as e: + print() + for t in tokens: print(t, end=",\n") + raise e + diff --git a/elixir/lexers/tests/test_c.py b/elixir/lexers/tests/test_c.py new file mode 100644 index 00000000..ffd48cee --- /dev/null +++ b/elixir/lexers/tests/test_c.py @@ -0,0 +1,567 @@ +from ..lexers import CLexer +from .base import LexerTest + +class CLexerTest(LexerTest): + lexer_cls = CLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", 
"SPECIAL", "ERROR") + + def test_if0(self): + self.lex(r""" +#if 0 +static bool test_v3_0_test(void *h, + enum type_enum e) { + return false; +} +#endif +static bool test_v3_0_test(void *h, + enum type_enum e) { + return false; +} +""", [ + ['SPECIAL', '#if'], + ['NUMBER', '0'], + ['IDENTIFIER', 'static'], + ['IDENTIFIER', 'bool'], + ['IDENTIFIER', 'test_v3_0_test'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'h'], + ['IDENTIFIER', 'enum'], + ['IDENTIFIER', 'type_enum'], + ['IDENTIFIER', 'e'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'false'], + ['SPECIAL', '#endif'], + ['IDENTIFIER', 'static'], + ['IDENTIFIER', 'bool'], + ['IDENTIFIER', 'test_v3_0_test'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'h'], + ['IDENTIFIER', 'enum'], + ['IDENTIFIER', 'type_enum'], + ['IDENTIFIER', 'e'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'false'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_preproc(self): + self.lex(r""" +#include +# include +# include "test.h" +# include "test.h" + +# warning war +# error err + # error err + #warning war + +#error "escaped\ + message" + +#warning "escaped\ + message" + +# if defined(TEST) +# elif defined(TEST2) +#else +""", [ + ['SPECIAL', '#include '], + ['SPECIAL', '# include '], + ['SPECIAL', '# include "test.h"'], + ['SPECIAL', '# include "test.h"'], + ['SPECIAL', '# warning war\n'], + ['SPECIAL', '# error err\n'], + ['SPECIAL', '# error err\n'], + ['SPECIAL', '#warning war\n'], + ['SPECIAL', '#error "escaped\\\n message"\n'], + ['SPECIAL', '#warning "escaped\\ \n message"\n'], + ['SPECIAL', '# if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', '# elif'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', '#else'], + ]) + + def test_defines(self): + self.lex(""" +# define test "long string \ + escaped newline" + + #define test define1 +# define test2 define12323 + +#define func(name, arg1,arg2...) 
\ + void name##f() { \ + return arg1 + arg2; + } +""", [ + ['SPECIAL', '# define'], + ['IDENTIFIER', 'test'], + ['STRING', '"long string escaped newline"'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'define1'], + ['SPECIAL', '# define'], + ['IDENTIFIER', 'test2'], + ['IDENTIFIER', 'define12323'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'func'], + ['IDENTIFIER', 'name'], + ['IDENTIFIER', 'arg1'], + ['IDENTIFIER', 'arg2'], + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'name'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'arg1'], + ['IDENTIFIER', 'arg2'], + ]) + + def test_strings(self): + self.lex(r""" +"asdsad \ + asdasd"; +'asdsad \ + asdasd'; +u8"test string"; +u"test string"; +u"test string"; +L"test string"; +"test \" string"; +"test ' string"; +"test \' string"; +"test \n string"; +"\xff"; +"test" "string"; +"test""string"; +"test" +""", [ + ['STRING', '"asdsad \\ \n asdasd"'], + ['STRING', "'asdsad \\\n asdasd'"], + ['IDENTIFIER', 'u8'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'u'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'u'], + ['STRING', '"test string"'], + ['IDENTIFIER', 'L'], + ['STRING', '"test string"'], + ['STRING', '"test \\" string"'], + ['STRING', '"test \' string"'], + ['STRING', '"test \\\' string"'], + ['STRING', '"test \\n string"'], + ['STRING', '"\\xff"'], + ['STRING', '"test"'], + ['STRING', '"string"'], + ['STRING', '"test"'], + ['STRING', '"string"'], + ['STRING', '"test"'], + ]) + + def test_strings2(self): + self.lex(r""" + "string"; + char* s1 = "asdjlsajdlksad""asdsajdlsad"; //comment6 + char* s2 = "asdjlsajdlksad" "asdsajdlsad"; // \ + single line comment \ + with escapes + char* s3 = " asdsaldjkas \""; + char* s4 = " asdsaldjkas \" zxclzxclk \" asljda"; + char* s5 = " asdsaldjkas \' zxclzxclk \" asljda"; + char* s6 = " asdsaldjkas \"\"\" zxclzxclk \'\'\' ; asljda"; + char* s7 = u8"test"; +""", [ + ['STRING', '"string"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's1'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '//comment6\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's2'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '// \\\n single line comment \\\n with escapes\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's3'], + ['STRING', '" asdsaldjkas \\""'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's4'], + ['STRING', '" asdsaldjkas \\" zxclzxclk \\" asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's5'], + ['STRING', '" asdsaldjkas \\\' zxclzxclk \\" asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's6'], + ['STRING', '" asdsaldjkas \\"\\"\\" zxclzxclk \\\'\\\'\\\' ; asljda"'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 's7'], + ['IDENTIFIER', 'u8'], + ['STRING', '"test"'], + ]) + + def test_chars(self): + self.lex(r""" +'a'; +u8'a'; +u'a'; +U'a'; +'\''; +'\"'; +'\\'; +'\n'; +'\f'; +'\U0001f34c'; +'\13'; +'\x1234'; +'\u213'; +u'ą'; +""", [ + ['STRING', "'a'"], + ['IDENTIFIER', 'u8'], + ['STRING', "'a'"], + ['IDENTIFIER', 'u'], + ['STRING', "'a'"], + ['IDENTIFIER', 'U'], + ['STRING', "'a'"], + ['STRING', "'\\''"], + ['STRING', '\'\\"\''], + ['STRING', "'\\\\'"], + ['STRING', "'\\n'"], + ['STRING', "'\\f'"], + ['STRING', "'\\U0001f34c'"], + ['STRING', "'\\13'"], + ['STRING', "'\\x1234'"], + ['STRING', "'\\u213'"], + ['IDENTIFIER', 'u'], + ['STRING', "'ą'"], + ]) + + def test_numbers(self): + self.lex(r""" +1239183; +-1239183; +0xAB08902; +-0xAB08902; +0Xab08902; +-0Xab08902; +0b0101001; +-0b0101001; +0B0101001; 
+-0B0101001; +0231273; +-0231273; +""", [ + ['NUMBER', '1239183'], + ['NUMBER', '1239183'], + ['NUMBER', '0xAB08902'], + ['NUMBER', '0xAB08902'], + ['NUMBER', '0Xab08902'], + ['NUMBER', '0Xab08902'], + ['NUMBER', '0b0101001'], + ['NUMBER', '0b0101001'], + ['NUMBER', '0B0101001'], + ['NUMBER', '0B0101001'], + ['NUMBER', '0231273'], + ['NUMBER', '0231273'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_floats(self): + self.lex(r""" +double e = 0x2ABDEFabcdef; +double + f = 017.048509495; +double -g = 0b1010010; +double g = 0b1010010; +-017.048509495; +017.048509495; +-017.048509495e-12329123; +017.048509495e-12329123; +-0x123.fp34; +0x123.fp34; +-0x123.fP34; +0x123.fP34; +-0x123.fe1p123; +0x123.fe1p123; +-0x123.fe1p123; +0x123.fe1p123; +-.1; +.1; +-1.; +1.; +-0x1.ep+3; +0x1.ep+3; +-0X183083; +0X183083; +-0x213213.1231212'31e21p-2; +0x213213.1231212'31e21p-2; +-123123.123e2; +123123.123e2; +""", [ + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'e'], + ['NUMBER', '0x2ABDEFabcdef'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'f'], + ['NUMBER', '017.048509495'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'g'], + ['NUMBER', '0b1010010'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'g'], + ['NUMBER', '0b1010010'], + ['NUMBER', '017.048509495'], + ['NUMBER', '017.048509495'], + ['NUMBER', '017.048509495e-12329123'], + ['NUMBER', '017.048509495e-12329123'], + ['NUMBER', '0x123.fp34'], + ['NUMBER', '0x123.fp34'], + ['NUMBER', '0x123.fP34'], + ['NUMBER', '0x123.fP34'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '0x123.fe1p123'], + ['NUMBER', '1'], + ['NUMBER', '1'], + ['NUMBER', '1.'], + ['NUMBER', '1.'], + ['NUMBER', '0x1.ep+3'], + ['NUMBER', '0x1.ep+3'], + ['NUMBER', '0X183083'], + ['NUMBER', '0X183083'], + ['NUMBER', "0x213213.1231212'31e21p-2"], + ['NUMBER', "0x213213.1231212'31e21p-2"], + ['NUMBER', '123123.123e2'], + ['NUMBER', '123123.123e2'], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_longs(self): + self.lex(r""" +-123213092183ul; +123213092183ul; +-123213092183ull; +123213092183ull; +-123213092183llu; +123213092183llu; +-123213092183uLL; +123213092183uLL; +-123213092183LLU; +123213092183LLU; +-1232'13092183LLU; +1232'13092183LLU; +-1232'1309'2183LLU; +1232'1309'2183LLU; +-1232'1309'218'3LLU; +1232'1309'218'3LLU; +""", [ + ['NUMBER', '123213092183ul'], + ['NUMBER', '123213092183ul'], + ['NUMBER', '123213092183ull'], + ['NUMBER', '123213092183ull'], + ['NUMBER', '123213092183llu'], + ['NUMBER', '123213092183llu'], + ['NUMBER', '123213092183uLL'], + ['NUMBER', '123213092183uLL'], + ['NUMBER', '123213092183LLU'], + ['NUMBER', '123213092183LLU'], + ['NUMBER', "1232'13092183LLU"], + ['NUMBER', "1232'13092183LLU"], + ['NUMBER', "1232'1309'2183LLU"], + ['NUMBER', "1232'1309'2183LLU"], + ['NUMBER', "1232'1309'218'3LLU"], + ['NUMBER', "1232'1309'218'3LLU"], + ], self.default_filtered_tokens + ("NUMBER",)) + + def test_comments(self): + self.lex(r""" + /*comment1*/ + /* comment2*/ + /* comment3 */ + /* + * + comment4 + _+}{|":?><~!@#$%&*()_+`123567890-=[];'\,./ + * */ + + /* comment 5 \*\// */ + +// comment5 +char* s2 = "asdjlsajdlksad" "asdsajdlsad"; // \ + single line comment \ + with escapes +char statement; +""", [ + ['COMMENT', '/*comment1*/'], + ['COMMENT', '/* comment2*/'], + ['COMMENT', '/* comment3 */'], + ['COMMENT', '/*\n *\n comment4\n _+}{|":?><~!@#$%&*()_+`123567890-=[];\'\\,./\n * */'], + ['COMMENT', '/* comment 5 \\*\\// */'], + ['COMMENT', '// comment5\n'], + ['IDENTIFIER', 'char'], + 
['IDENTIFIER', 's2'], + ['STRING', '"asdjlsajdlksad"'], + ['STRING', '"asdsajdlsad"'], + ['COMMENT', '// \\\n single line comment \\\n with escapes\n'], + ['IDENTIFIER', 'char'], + ['IDENTIFIER', 'statement'], + ]) + + # https://en.cppreference.com/w/cpp/language/pack_indexing + def test_cpp_templates(self): + self.lex(r""" +template +constexpr auto f(Ts&&... ts) { + return sizeof...(Ts); +} + +template +int f() { + std::cout << t << std::endl; + ns1::ns2::type v; + ns1::ns2::type2 v2; + ns1::ns2::type3 v3; +} +""", [ + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'constexpr'], + ['IDENTIFIER', 'auto'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'ts'], + ['IDENTIFIER', 'return'], + ['IDENTIFIER', 'sizeof'], + ['IDENTIFIER', 'Ts'], + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'cout'], + ['IDENTIFIER', 't'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'endl'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'v'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type2'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'v2'], + ['IDENTIFIER', 'ns1'], + ['IDENTIFIER', 'ns2'], + ['IDENTIFIER', 'type3'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'double'], + ['IDENTIFIER', 'v3'], + ]) + + # https://en.cppreference.com/w/cpp/language/requires + def test_cpp_concepts(self): + self.lex(r""" +template +concept C = requires(T x) { + {x.count()} -> std::same_as; + requires Same +}; +""", [ + ['IDENTIFIER', 'template'], + ['IDENTIFIER', 'typename'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'concept'], + ['IDENTIFIER', 'C'], + ['IDENTIFIER', 'requires'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'count'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'same_as'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'requires'], + ['IDENTIFIER', 'Same'], + ['IDENTIFIER', 'T'], + ['IDENTIFIER', 'decltype'], + ['IDENTIFIER', 'x'], + ]) + + def test_cpp_class(self): + self.lex(r""" +using namespace std; + +auto f() -> std::string; + +class test { +public: + int operator ""_tx(int); + int a = 123_tx; +}; +""", [ + ['IDENTIFIER', 'using'], + ['IDENTIFIER', 'namespace'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'auto'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 'string'], + ['IDENTIFIER', 'class'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'public'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'operator'], + ['STRING', '""'], + ['IDENTIFIER', '_tx'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'a'], + ['IDENTIFIER', '_tx'], + ]) + + def test_cpp_attrs(self): + self.lex(r""" +[[using test: atr1]] [[atr2]] +int f[[atr3]](); +""", [ + ['IDENTIFIER', 'using'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'atr1'], + ['IDENTIFIER', 'atr2'], + ['IDENTIFIER', 'int'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'atr3'], + ]) + + # https://en.cppreference.com/w/cpp/language/noexcept_spec + def test_cpp_noexpect(self): + self.lex(r""" +void f() noexpect(true) {} +""", [ + ['IDENTIFIER', 'void'], + ['IDENTIFIER', 'f'], + ['IDENTIFIER', 'noexpect'], + ['IDENTIFIER', 'true'], + ]) + + # https://en.cppreference.com/w/cpp/language/coroutines + def test_cpp_coroutines(self): + self.lex(r""" +task<> test() { + co_await test2(); +} +""", [ + 
['IDENTIFIER', 'task'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'co_await'], + ['IDENTIFIER', 'test2'], + ]) + From 2289c4519e5278cac9a88b5ea2342e00f58f02ec Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Fri, 25 Oct 2024 13:10:15 +0200 Subject: [PATCH 04/11] lexers: Add a DTS lexer --- elixir/lexers/__main__.py | 26 +++++++-- elixir/lexers/lexers.py | 110 +++++++++++++++++++++++++++++++++++++- elixir/lexers/utils.py | 30 ++++++++++- 3 files changed, 159 insertions(+), 7 deletions(-) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index 7948eb94..7b9f7a6f 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -2,12 +2,28 @@ import sys from . import lexers - if len(sys.argv) != 2: - print("usage:", sys.argv[0], "path/to/file") + if not (len(sys.argv) == 2 or (len(sys.argv) == 3 and sys.argv[1] == '-s')): + print("usage:", sys.argv[0], "[-s]", "path/to/file") exit(1) - with open(sys.argv[1]) as f: - lexer = lexers.CLexer(f.read()) + short = sys.argv[1] == '-s' + + filename = sys.argv[-1] + + with open(filename) as f: + if filename.endswith(('.c', '.h', '.cpp', '.hpp')): + lexer = lexers.CLexer(f.read()) + elif filename.endswith(('.dts', '.dtsi')): + lexer = lexers.DTSLexer(f.read()) + else: + raise Exception("no lexer for filetype") + for token in lexer.lex(): - print(token.line, token.token_type.name, token.span, token.token.encode()) + if not short: + print(token.line, token.token_type.name, token.span, token.token.encode()) + else: + if token.token_type.name == 'IDENTIFIER' or token.token_type.name == 'STRING': + print(f"|{token.token}|", end='') + else: + print(token.token, end='') diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index 8f021912..b3fb583b 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -1,7 +1,7 @@ import re from . 
import shared -from .utils import TokenType, simple_lexer, FirstInLine +from .utils import TokenType, simple_lexer, FirstInLine, split_by_groups, regex_concat, token_from_string # Lexers used to extract possible references from source files # Design inspired by Pygments lexers interface @@ -36,3 +36,111 @@ def lex(self, **kwargs): return simple_lexer(self.rules, self.code, **kwargs) +# https://www.devicetree.org/specifications/ +class DTSLexer: + # TODO handle macros separately + + # NOTE: previous versions would split identifiers by commas (and other special characters), + # this changes the old behavior + + # 6.2 + # technically shall be 1-31 characters long BUT /linux/v6.9.4/source/arch/arm64/boot/dts/qcom/sm8250.dtsi#L3506 + dts_label = r'[a-zA-Z_][a-zA-Z_0-9]*' + # no whitespace between label and ampersand/colon is allowed + dts_label_reference = f'(&)({ dts_label })' + dts_label_definition = f'({ dts_label })(:)' + + # 2.2.1 + # same with label lenght, just in case + dts_node_name = r'[a-zA-Z0-9,._+-]+' + # can contain macro symbols + dts_unit_address = r'[a-zA-Z0-9,._+-]*' + + dts_node_name_with_unit_address = f'({ dts_node_name })(@)({ dts_unit_address })' + r'(\s*)({)' + dts_node_name_without_unit_address = f'({ dts_node_name })' + r'(\s*)({)' + + # 2.2.4 + dts_property_name = r'[0-9a-zA-Z,._+?#-]+' + dts_property_assignment = f'({ dts_property_name })' + r'(\s*)(=)' + dts_property_empty = f'({ dts_property_name })' + r'(\s*)(;)' + + dts_directive = r'/[a-zA-Z0-9-]+/'; + dts_delete_node = regex_concat(r'/delete-node/\s+', dts_node_name) + dts_delete_property = regex_concat(r'/delete-property/\s+', dts_property_name) + + # 6.3 + dts_node_reference = r'(&)({)([a-zA-Z0-9,._+/@-]+?)(})' + + dts_punctuation = r'[#@:;{}\[\]()^<>=+*/%&\\|~!?,-]' + # other, unknown, identifiers - for exmple macros + dts_default_identifier = r'[0-9a-zA-Z_]+' + + # Parse DTS node reference, ex: &{/path/to/node@20/test} + @staticmethod + def parse_dts_node_reference(ctx, match): + # & + token, ctx = token_from_string(ctx, match.group(1), TokenType.PUNCTUATION) + yield token + + # { + token, ctx = token_from_string(ctx, match.group(2), TokenType.PUNCTUATION) + yield token + + path = match.group(3) + path_part_matcher = re.compile(DTSLexer.dts_unit_address) + strpos = 0 + + while strpos < len(path): + if path[strpos] == '@' or path[strpos] == '/': + token, ctx = token_from_string(ctx, path[strpos], TokenType.PUNCTUATION) + yield token + strpos += 1 + else: + part_match = path_part_matcher.match(path, strpos) + if part_match is None: + token, _ = token_from_string(ctx, TokenType.ERROR, '') + yield token + return None + + token, ctx = token_from_string(ctx, part_match.group(0), TokenType.IDENTIFIER) + yield token + strpos += len(part_match.group(0)) + # } + token, ctx = token_from_string(ctx, match.group(4), TokenType.PUNCTUATION) + yield token + + rules = [ + (shared.whitespace, TokenType.WHITESPACE), + (shared.common_slash_comment, TokenType.COMMENT), + (shared.common_string_and_char, TokenType.STRING), + (shared.c_number, TokenType.NUMBER), + + (dts_label_reference, split_by_groups(TokenType.PUNCTUATION, TokenType.IDENTIFIER)), + (dts_label_definition, split_by_groups(TokenType.IDENTIFIER, TokenType.PUNCTUATION)), + (dts_node_reference, parse_dts_node_reference), + + (dts_property_assignment, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + (dts_property_empty, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + + 
(dts_node_name_with_unit_address, + split_by_groups(TokenType.IDENTIFIER, TokenType.PUNCTUATION, + TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + (dts_node_name_without_unit_address, + split_by_groups(TokenType.IDENTIFIER, TokenType.WHITESPACE, TokenType.PUNCTUATION)), + + (dts_directive, TokenType.SPECIAL), + (dts_delete_node, split_by_groups(TokenType.SPECIAL, TokenType.IDENTIFIER)), + (dts_delete_property, split_by_groups(TokenType.SPECIAL, TokenType.IDENTIFIER)), + (dts_default_identifier, TokenType.IDENTIFIER), + (FirstInLine(shared.c_preproc_ignore), TokenType.SPECIAL), + (dts_punctuation, TokenType.PUNCTUATION), + ] + + def __init__(self, code): + self.code = code + + def lex(self, **kwargs): + return simple_lexer(self.rules, self.code, **kwargs) + diff --git a/elixir/lexers/utils.py b/elixir/lexers/utils.py index 0290754b..269af1bc 100644 --- a/elixir/lexers/utils.py +++ b/elixir/lexers/utils.py @@ -21,6 +21,34 @@ def match_regex(regex): rule = re.compile(regex, flags=re.MULTILINE) return lambda code, pos, _: rule.match(code, pos) +def split_by_groups(*token_types): + def split(ctx, match): + pos = ctx.pos + line = ctx.line + for gi in range(len(match.groups())): + token = match.group(gi+1) + if len(token) != 0: + action = token_types[gi] + yield Token(action, token, (pos, pos+len(token)), line) + line += token.count("\n") + pos += len(token) + + return split + +def token_from_match(ctx, match, token_type): + span = match.span() + result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line) + ctx.pos = span[1] + ctx.line = ctx.line+result.token.count('\n') + return result, ctx + +def token_from_string(ctx, match, token_type): + span = (ctx.pos, ctx.pos+len(match)) + result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line) + ctx.pos = span[1] + ctx.line = ctx.line+result.token.count('\n') + return result, ctx + # Interface class that allows to match only if certian conditions, # hard to express in regex, are true class Matcher: @@ -67,7 +95,7 @@ def match(self, code, pos, line): return self.rule.match(code, pos) class LexerContext: - def self(self, code, pos, line, filter_tokens): + def __init__(self, code, pos, line, filter_tokens): self.code = code self.pos = pos self.line = line From e6767b7e60fba6a9f826a66340bb0070d39e17a9 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Fri, 25 Oct 2024 13:11:39 +0200 Subject: [PATCH 05/11] lexers: Add DTS lexer tests --- elixir/lexers/tests/test_dts.py | 271 ++++++++++++++++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 elixir/lexers/tests/test_dts.py diff --git a/elixir/lexers/tests/test_dts.py b/elixir/lexers/tests/test_dts.py new file mode 100644 index 00000000..72f39d7f --- /dev/null +++ b/elixir/lexers/tests/test_dts.py @@ -0,0 +1,271 @@ +from ..lexers import DTSLexer +from .base import LexerTest + +class DTSLexerTests(LexerTest): + lexer_cls = DTSLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + def test_preproc(self): + self.lex(r""" +#include +#include "file2.dtsi" +#error error message asldjlksajdlksad +#warning warning message alsjdlkasjdlksajd +#define MACRO(arg) \ + arg = <3>; +#if 0 +/ { + property = <2>; + MACRO(test) +}; +#endif +""", [ + ['SPECIAL', '#include '], + ['SPECIAL', '#include "file2.dtsi"'], + ['SPECIAL', '#error error message asldjlksajdlksad\n'], + ['SPECIAL', '#warning warning message alsjdlkasjdlksajd\n'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'MACRO'], + ['IDENTIFIER', 'arg'], 
+ ['IDENTIFIER', 'arg'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'property'], + ['IDENTIFIER', 'MACRO'], + ['IDENTIFIER', 'test'], + ['SPECIAL', '#endif'], + ]) + + def test_dts_directives(self): + self.lex(r""" +/include/ "file.dtsi" +/dts-v1/; +/memreserve/ 0x100 0x2; +/ { + test_label: test-node { + test-prop2 = <3>; + }; + test-prop = <2>; + /delete-node/ test-node; + /delete-node/ &test_label; + /delete-property/ test-prop; +}; +""", [ + ['SPECIAL', '/include/'], + ['STRING', '"file.dtsi"'], + ['SPECIAL', '/dts-v1/'], + ['SPECIAL', '/memreserve/'], + ['IDENTIFIER', 'test_label'], + ['IDENTIFIER', 'test-node'], + ['IDENTIFIER', 'test-prop2'], + ['IDENTIFIER', 'test-prop'], + ['SPECIAL', '/delete-node/'], + ['IDENTIFIER', 'test-node'], + ['SPECIAL', '/delete-node/'], + ['IDENTIFIER', 'test_label'], + ['SPECIAL', '/delete-property/'], + ['IDENTIFIER', 'test-prop'], + ]) + + def test_dts_unusual_identifiers(self): + self.lex(r""" +/ { + _test_label: 5id,test._+asd-2 { + property,name = <2>; + 0p,r.o_p+e?r#t-y,name = [1,2,3]; + way_too_long_label_123219380921830218309218309213 : node@234 { + compatible = "asd,zxc"; + } + test = <&way_too_long_label_123219380921830218309218309213>; + }; +}; +""", [ + ['IDENTIFIER', '_test_label'], + ['IDENTIFIER', 'id,test._+asd-2'], + ['IDENTIFIER', 'property,name'], + ['IDENTIFIER', 'p,r.o_p+e?r#t-y,name'], + ['IDENTIFIER', 'way_too_long_label_123219380921830218309218309213'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', '234'], + ['IDENTIFIER', 'compatible'], + ['STRING', '"asd,zxc"'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'way_too_long_label_123219380921830218309218309213'], + ]) + + def test_non_numeric_unit_address(self): + self.lex(r""" +/ { + test: node@test_address { + }; + test2: node@MACRO_ADDRESS(123) { + }; +}; +""", [ + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'test_address'], + ['IDENTIFIER', 'test2'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'MACRO_ADDRESS'], + ]) + + def test_values_with_labels(self): + self.lex(r""" +/ { + prop1 = label1: <0 label2: 0x21323>; + prop2 = [1 2 3 label3: 4]; + prop3 = label4: "val" label5: ; +}; +""", [ + ['PUNCTUATION', '/'], + ['PUNCTUATION', '{'], + ['IDENTIFIER', 'prop1'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'label1'], + ['PUNCTUATION', ':'], + ['PUNCTUATION', '<'], + ['NUMBER', '0'], + ['IDENTIFIER', 'label2'], + ['PUNCTUATION', ':'], + ['NUMBER', '0x21323'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop2'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '['], + ['NUMBER', '1'], + ['NUMBER', '2'], + ['NUMBER', '3'], + ['IDENTIFIER', 'label3'], + ['PUNCTUATION', ':'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop3'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'label4'], + ['PUNCTUATION', ':'], + ['STRING', '"val"'], + ['IDENTIFIER', 'label5'], + ['PUNCTUATION', ':'], + ['PUNCTUATION', ';'], + ['PUNCTUATION', '}'], + ['PUNCTUATION', ';'], + ], self.default_filtered_tokens + ('PUNCTUATION', 'NUMBER')) + + def test_references(self): + self.lex(r""" +/ { + interrupt-parent = < &{/node@c2342/another_node@address(2)/node3} >; + property2 = <&{/node@c2342/another_node@address(2)}>; + power-domains = <&power DEVICE_DOMAIN>; +}; +""", [ + ['IDENTIFIER', 'interrupt-parent'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'c2342'], + ['IDENTIFIER', 'another_node'], + ['IDENTIFIER', 'address'], + ['IDENTIFIER', 'node3'], + ['IDENTIFIER', 'property2'], + ['IDENTIFIER', 'node'], + ['IDENTIFIER', 'c2342'], + ['IDENTIFIER', 
'another_node'], + ['IDENTIFIER', 'address'], + ['IDENTIFIER', 'power-domains'], + ['IDENTIFIER', 'power'], + ['IDENTIFIER', 'DEVICE_DOMAIN'], + ]) + + def test_property_types(self): + self.lex(r""" +/ { + prop1 = <0 0x21323>; + prop2 = [1 2 3 4]; + prop3 = "val", "val4" ; + prop4 = <~1+2-3*4/5%6&7|8^9<<10>>11>; + prop5; +}; +""", [ + ['PUNCTUATION', '/'], + ['PUNCTUATION', '{'], + ['IDENTIFIER', 'prop1'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '<'], + ['NUMBER', '0'], + ['NUMBER', '0x21323'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop2'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '['], + ['NUMBER', '1'], + ['NUMBER', '2'], + ['NUMBER', '3'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop3'], + ['PUNCTUATION', '='], + ['STRING', '"val"'], + ['PUNCTUATION', ','], + ['STRING', '"val4"'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop4'], + ['PUNCTUATION', '='], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '~'], + ['NUMBER', '1'], + ['PUNCTUATION', '+'], + ['NUMBER', '2'], + ['PUNCTUATION', '-'], + ['NUMBER', '3'], + ['PUNCTUATION', '*'], + ['NUMBER', '4'], + ['PUNCTUATION', '/'], + ['NUMBER', '5'], + ['PUNCTUATION', '%'], + ['NUMBER', '6'], + ['PUNCTUATION', '&'], + ['NUMBER', '7'], + ['PUNCTUATION', '|'], + ['NUMBER', '8'], + ['PUNCTUATION', '^'], + ['NUMBER', '9'], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '<'], + ['NUMBER', '10'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', '>'], + ['NUMBER', '11'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', ';'], + ['IDENTIFIER', 'prop5'], + ['PUNCTUATION', ';'], + ['PUNCTUATION', '}'], + ['PUNCTUATION', ';'], + ], self.default_filtered_tokens + ('PUNCTUATION', 'NUMBER')) + + def test_comments(self): + self.lex(r""" +//license info +/ { + interrupts = , /* comment 1 */ + ; // comemnt2 + /* long + * coment + * asdasd + */ +}; +""", [ + ['COMMENT', '//license info\n'], + ['IDENTIFIER', 'interrupts'], + ['IDENTIFIER', 'NAME'], + ['IDENTIFIER', 'TYPE'], + ['COMMENT', '/* comment 1 */'], + ['IDENTIFIER', 'NAME'], + ['IDENTIFIER', 'TYPE'], + ['COMMENT', '// comemnt2\n'], + ['COMMENT', '/* long\n * coment\n * asdasd\n */'], + ], self.default_filtered_tokens) + From ad1e0e2bd452e33bb0e9195a6ee15fdf19a7f58b Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:03:03 +0100 Subject: [PATCH 06/11] lexers: Add a Kconfig lexer --- elixir/lexers/__main__.py | 2 + elixir/lexers/lexers.py | 113 +++++++++++++++++++++++++++++++++++++- elixir/lexers/utils.py | 11 ++++ 3 files changed, 125 insertions(+), 1 deletion(-) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index 7b9f7a6f..dab00d14 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -15,6 +15,8 @@ lexer = lexers.CLexer(f.read()) elif filename.endswith(('.dts', '.dtsi')): lexer = lexers.DTSLexer(f.read()) + elif filename.endswith('Kconfig'): + lexer = lexers.KconfigLexer(f.read()) else: raise Exception("no lexer for filetype") diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index b3fb583b..06134a53 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -1,7 +1,8 @@ import re from . 
import shared -from .utils import TokenType, simple_lexer, FirstInLine, split_by_groups, regex_concat, token_from_string +from .utils import TokenType, simple_lexer, FirstInLine, split_by_groups, regex_concat, token_from_string, token_from_match, \ + regex_or, match_token, Token # Lexers used to extract possible references from source files # Design inspired by Pygments lexers interface @@ -144,3 +145,113 @@ def __init__(self, code): def lex(self, **kwargs): return simple_lexer(self.rules, self.code, **kwargs) + +# https://www.kernel.org/doc/html/next/kbuild/kconfig-language.html#kconfig-syntax +# https://www.kernel.org/doc/html/next/kbuild/kconfig-language.html#kconfig-hints + +# TODO better macros calls support + +class KconfigLexer: + hash_comment = r'#' + shared.singleline_comment_with_escapes_base + + # NOTE pretty much all kconfig identifiers either start uppercase or with a number. this saves us from parsing macro calls + kconfig_identifier_starts_with_letters = r'[A-Z_][A-Z0-9a-z_-]*' + kconfig_identifier_starts_with_digits = r'[0-9]+[A-Z_a-z-][A-Z0-9a-z_-]*' + kconfig_identifier = regex_or(kconfig_identifier_starts_with_letters, kconfig_identifier_starts_with_digits) + # other perhaps interesting identifiers + kconfig_minor_identifier = r'[a-zA-Z0-9_/][a-zA-Z0-9_/.-]*' + kconfig_punctuation = r'[|&!=$()/_.+<>,-]' + kconfig_number = f'[0-9]+' # TODO does not handle hex numbers + + # NOTE no identifiers are parsed out of KConfig help texts now, this changes the + # old behavior + # for example see all instances of USB in /u-boot/v2024.07/source/drivers/usb/Kconfig#L3 + + @staticmethod + def count_kconfig_help_whitespace(start_whitespace_str): + tabs = start_whitespace_str.count('\t') + spaces = start_whitespace_str.count(' ') + return 8*tabs + spaces + (len(start_whitespace_str)-tabs-spaces) + + @staticmethod + def parse_kconfig_help_text(ctx, match): + # assumes called with matched help keyword, return the keyword + token, ctx = token_from_match(ctx, match, TokenType.SPECIAL) + yield token + + # match whitespace after help + whitespace_after_help, ctx = match_token(ctx, r'\s*?\n', TokenType.WHITESPACE) + if whitespace_after_help is None: + # failed to match whitespace and newline after kconfig help - perhaps it's not the right context (macro call for exapmle) + return + else: + yield whitespace_after_help + + line_matcher = re.compile(r'[^\n]*\n', flags=re.MULTILINE|re.UNICODE) + + start_help_text_pos = ctx.pos + current_pos = ctx.pos + min_whitespace = None + + def collect_tokens(start, end): + return Token(TokenType.COMMENT, ctx.code[start:end], (start, end), ctx.line) + + # match first line with whitespace at the beginning + while current_pos < len(ctx.code): + line = line_matcher.match(ctx.code, current_pos) + if line is None: + yield collect_tokens(start_help_text_pos, current_pos) + return + + token = line.group(0) + span = line.span() + + if token == '\n': + # just an empty line + current_pos = span[1] + continue + else: + start_whitespace = re.match(r'\s*', token) + if start_whitespace is None: + # no whitespace at the beginning of the line + yield collect_tokens(start_help_text_pos, current_pos) + return + elif min_whitespace is None: + # first nonemtpy line - save amount of whitespace + min_whitespace = KconfigLexer.count_kconfig_help_whitespace(start_whitespace.group(0)) + current_pos = span[1] + else: + cur_whitespace = KconfigLexer.count_kconfig_help_whitespace(start_whitespace.group(0)) + if cur_whitespace < min_whitespace: + yield collect_tokens(start_help_text_pos, 
current_pos) + return + else: + current_pos = span[1] + + yield collect_tokens(start_help_text_pos, current_pos) + + rules = [ + (shared.whitespace, TokenType.WHITESPACE), + (hash_comment, TokenType.COMMENT), + (shared.common_string_and_char, TokenType.STRING), + # for whatever reason u-boot kconfigs sometimes use ---help--- instead of help + # /u-boot/v2024.07/source/arch/arm/mach-sunxi/Kconfig#L732 + (FirstInLine('-+help-+'), parse_kconfig_help_text), + (kconfig_punctuation, TokenType.PUNCTUATION), + (FirstInLine('help'), parse_kconfig_help_text), + (kconfig_identifier, TokenType.IDENTIFIER), + (kconfig_number, TokenType.NUMBER), + (kconfig_minor_identifier, TokenType.SPECIAL), + # things that do not match are probably things from a macro call. + # unless the syntax changed, or the help parser got confused. + # https://www.kernel.org/doc/html/next/kbuild/kconfig-macro-language.html + # both shell call and warning/error would require additinal parsing + (r'[^\n]+', TokenType.SPECIAL), + ] + + def __init__(self, code): + self.code = code + + def lex(self): + return simple_lexer(self.rules, self.code) + diff --git a/elixir/lexers/utils.py b/elixir/lexers/utils.py index 269af1bc..7b991dd8 100644 --- a/elixir/lexers/utils.py +++ b/elixir/lexers/utils.py @@ -21,6 +21,17 @@ def match_regex(regex): rule = re.compile(regex, flags=re.MULTILINE) return lambda code, pos, _: rule.match(code, pos) +def match_token(ctx, pattern, token_type): + match = re.compile(pattern).match(ctx.code, ctx.pos) + if match is None: + return None, ctx + else: + span = match.span() + result = Token(token_type, ctx.code[span[0]:span[1]], span, ctx.line) + ctx.pos = span[1] + ctx.line += result.token.count('\n') + return result, ctx + def split_by_groups(*token_types): def split(ctx, match): pos = ctx.pos From 0eec683a2b1a773cd9fd6412ea321fe2a19f197c Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:07:14 +0100 Subject: [PATCH 07/11] lexers: Add Kconfig lexer tests --- elixir/lexers/tests/test_kconfig.py | 372 ++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 elixir/lexers/tests/test_kconfig.py diff --git a/elixir/lexers/tests/test_kconfig.py b/elixir/lexers/tests/test_kconfig.py new file mode 100644 index 00000000..e0adf379 --- /dev/null +++ b/elixir/lexers/tests/test_kconfig.py @@ -0,0 +1,372 @@ +from ..lexers import KconfigLexer +from .base import LexerTest + +class KconfigLexerTest(LexerTest): + lexer_cls = KconfigLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + # TODO improve macro calls + + def test_comments(self): + self.lex(r""" +# comment1 +config 64BIT # comment2 + bool # comment3 + default "# asd" + default $(shell, \#) + help + asdasdsajdlakjd # not a comment + + asdasdsajdlakjd # not a comment + + # comment 5 + + # comment 6 +""", [ + ['COMMENT', '# comment1\n'], + ['SPECIAL', 'config'], + ['IDENTIFIER', '64BIT'], + ['COMMENT', '# comment2\n'], + ['SPECIAL', 'bool'], + ['COMMENT', '# comment3\n'], + ['SPECIAL', 'default'], + ['STRING', '"# asd"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'shell'], + ['SPECIAL', '\\#)'], + ['SPECIAL', 'help'], + ['COMMENT', ' asdasdsajdlakjd # not a comment\n\n asdasdsajdlakjd # not a comment\n\n # comment 5\n\n'], + ['COMMENT', '# comment 6\n'], + ]) + + + def test_keywords(self): + self.lex(r""", +menu "menu name" + +visible if y + +choice + prompt "test prompt" + default y + +config 86CONIFG + bool "text" + prompt "prompt" + default y + tristate "test" + def_bool 
TEST_bool + depends on TEST + select TEST2 + imply TEST3 + range 5 512 if CONFIG_512 + help + help text + + more help text + +endmenu +""", [ + ['SPECIAL', 'menu'], + ['STRING', '"menu name"'], + ['SPECIAL', 'visible'], + ['SPECIAL', 'if'], + ['SPECIAL', 'y'], + ['SPECIAL', 'choice'], + ['SPECIAL', 'prompt'], + ['STRING', '"test prompt"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'config'], + ['IDENTIFIER', '86CONIFG'], + ['SPECIAL', 'bool'], + ['STRING', '"text"'], + ['SPECIAL', 'prompt'], + ['STRING', '"prompt"'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'tristate'], + ['STRING', '"test"'], + ['SPECIAL', 'def_bool'], + ['IDENTIFIER', 'TEST_bool'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', 'imply'], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'range'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'CONFIG_512'], + ['SPECIAL', 'help'], + ['COMMENT', ' help text\n\n more help text\n\n'], + ['SPECIAL', 'endmenu'], + ]) + + def test_conditions(self): + self.lex(r""" +config TEST + select TEST1 if TEST2 = TEST3 + select TEST2 if TEST5 != TEST6 + select TEST7 if TEST8 < TEST9 + select TEST10 if TEST11 > TEST12 + select TEST13 if TEST14 <= TEST15 +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST1'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST2'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST2'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST5'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST6'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST7'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST8'], + ['PUNCTUATION', '<'], + ['IDENTIFIER', 'TEST9'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST10'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST11'], + ['PUNCTUATION', '>'], + ['IDENTIFIER', 'TEST12'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST13'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST14'], + ['PUNCTUATION', '<'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST15'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_conditions2(self): + self.lex(r""" +config TEST + select TEST16 if TEST17 >= TEST3 + select TEST17 if (TEST18 = TEST19) + + select TEST20 if !(TEST21 = TEST22) + select TEST23 if TEST24 && TEST25 + select TEST26 if TEST27 || TEST28 +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST16'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST17'], + ['PUNCTUATION', '>'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST3'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST17'], + ['SPECIAL', 'if'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'TEST18'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST19'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST20'], + ['SPECIAL', 'if'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'TEST21'], + ['PUNCTUATION', '='], + ['IDENTIFIER', 'TEST22'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST23'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST24'], + ['PUNCTUATION', '&'], + ['PUNCTUATION', '&'], + ['IDENTIFIER', 'TEST25'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST26'], + ['SPECIAL', 'if'], + ['IDENTIFIER', 'TEST27'], + ['PUNCTUATION', '|'], + ['PUNCTUATION', '|'], + ['IDENTIFIER', 'TEST28'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_macros(self): + 
self.lex(r""" +config TEST + depends on $(shell,cat file | grep -vi "option 2") + depends on $(info,info to print) + depends on $(warning-if,a != b,warning to print) +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'shell'], + ['PUNCTUATION', ','], + ['SPECIAL', 'cat'], + ['SPECIAL', 'file'], + ['PUNCTUATION', '|'], + ['SPECIAL', 'grep'], + ['PUNCTUATION', '-'], + ['SPECIAL', 'vi'], + ['STRING', '"option 2"'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'info'], + ['PUNCTUATION', ','], + ['SPECIAL', 'info'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'warning-if'], + ['PUNCTUATION', ','], + ['SPECIAL', 'a'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['SPECIAL', 'b'], + ['PUNCTUATION', ','], + ['SPECIAL', 'warning'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + +def test_macros2(self): + self.lex(r""" +config TEST + depends on $(error-if,a != b,warning to print) + depends on $(filename) + depends on $(lineno) +""", [ + ['SPECIAL', 'config'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'error-if'], + ['PUNCTUATION', ','], + ['SPECIAL', 'a'], + ['PUNCTUATION', '!'], + ['PUNCTUATION', '='], + ['SPECIAL', 'b'], + ['PUNCTUATION', ','], + ['SPECIAL', 'warning'], + ['SPECIAL', 'to'], + ['SPECIAL', 'print'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'filename'], + ['PUNCTUATION', ')'], + ['SPECIAL', 'depends'], + ['SPECIAL', 'on'], + ['PUNCTUATION', '$'], + ['PUNCTUATION', '('], + ['SPECIAL', 'lineno'], + ['PUNCTUATION', ')'], + ], self.default_filtered_tokens + ("PUNCTUATION",)) + + def test_help(self): + self.lex(r""" +config + help + help test lasdlkajdk sadlksajd + lsajdlad + + salkdjaldlksajd + + " + asdlkajsdlkjsadlajdsk + + salkdjlsakdj' +config + select TEST +config + ---help--- + help test lasdlkajdk sadlksajd + lsajdlad + + salkdjaldlksajd + +config + select TEST +""", [ + ['SPECIAL', 'config'], + ['SPECIAL', 'help'], + ['COMMENT', ' help test lasdlkajdk sadlksajd\n lsajdlad\n\n salkdjaldlksajd\n\n "\n asdlkajsdlkjsadlajdsk\n\n salkdjlsakdj\'\n'], + ['SPECIAL', 'config'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST'], + ['SPECIAL', 'config'], + ['SPECIAL', '---help---'], + ['COMMENT', ' help test lasdlkajdk sadlksajd\n lsajdlad\n\n salkdjaldlksajd\n \n'], + ['SPECIAL', 'config'], + ['SPECIAL', 'select'], + ['IDENTIFIER', 'TEST'], + ]) + + def test_types(self): + self.lex(r""" +config + bool + default y + +config + tristate + default m + +config + hex + default 0xdfffffff00000000 + +config + string + default "string \" test # \# zxc" + +config + int + default 21312323 +""", [ + ['SPECIAL', 'config'], + ['SPECIAL', 'bool'], + ['SPECIAL', 'default'], + ['SPECIAL', 'y'], + ['SPECIAL', 'config'], + ['SPECIAL', 'tristate'], + ['SPECIAL', 'default'], + ['SPECIAL', 'm'], + ['SPECIAL', 'config'], + ['SPECIAL', 'hex'], + ['SPECIAL', 'default'], + ['IDENTIFIER', '0xdfffffff00000000'], + ['SPECIAL', 'config'], + ['SPECIAL', 'string'], + ['SPECIAL', 'default'], + ['STRING', '"string \\" test # 
\\# zxc"'], + ['SPECIAL', 'config'], + ['SPECIAL', 'int'], + ['SPECIAL', 'default'], + ]) From 78f10c1ee9e2dd5e8a26f48d591000dd105359c8 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:14:18 +0100 Subject: [PATCH 08/11] lexers: Add a GNU Assembler lexer --- elixir/lexers/__main__.py | 2 + elixir/lexers/lexers.py | 107 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index dab00d14..a39b5b42 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -17,6 +17,8 @@ lexer = lexers.DTSLexer(f.read()) elif filename.endswith('Kconfig'): lexer = lexers.KconfigLexer(f.read()) + elif filename.endswith(('.s', '.S')): + lexer = lexers.GasLexer(f.read()) else: raise Exception("no lexer for filetype") diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index 06134a53..5b4bac4b 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -255,3 +255,110 @@ def __init__(self, code): def lex(self): return simple_lexer(self.rules, self.code) + +# https://sourceware.org/binutils/docs/as.html#Syntax +class GasLexer: + # https://sourceware.org/binutils/docs/as.html#Symbol-Intro + # apparently dots are okay, BUT ctags removes the first dot from labels, for example. same with dollars + # /musl/v1.2.5/source/src/string/aarch64/memcpy.S#L92 + gasm_identifier = r'[a-zA-Z0-9_][a-zA-Z0-9_$.]*' + + gasm_flonum = r'0?[a-zA-Z][+-]?([0-9]|\\s*\n\s*)*\.([0-9]|\\s*\n\s*)*([eE][+-]?[0-9]+)?' + gasm_number = regex_or(gasm_flonum, shared.common_hexidecimal_integer, shared.common_binary_integer, + shared.common_decimal_integer) + + gasm_char = r"'(\\.|.|\n)" + gasm_string = f'(({ shared.double_quote_string_with_escapes })|({ gasm_char }))' + + gasm_comment_chars_map = { + 'generic': (r'#\s',), + + 'nios2': (r'#',), + 'openrisc': (r'#',), + 'powerpc': (r'#',), + 's390': (r'#',), + 'xtensa': (r'#',), + 'microblaze': (r'#',), + 'mips': (r'#',), + 'alpha': (r'#',), + 'csky': (r'#',), + # BUT double pipe in macros is an operator... 
and # not in the first line in + # /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S + 'm68k': ('|', '^#', r'#\s'), + 'arc': ('# ', ';'), + + # https://sourceware.org/binutils/docs/as.html#HPPA-Syntax + # /linux/v6.10.7/source/arch/parisc/kernel/perf_asm.S#L28 + 'parisc': (';',), + 'x86': (';',), + 'tic6x': (';', '*'), # cx6, tms320, although the star is sketchy + + # in below, # can be a comment only if the first character of the line + + # https://sourceware.org/binutils/docs/as.html#SH-Syntax + # /linux/v6.10.7/source/arch/sh/kernel/head_32.S#L58 + 'sh': ('!', '^#'), + # https://sourceware.org/binutils/docs/as.html#Sparc_002dSyntax + # /linux/v6.10.7/source/arch/sparc/lib/memset.S#L125 + 'sparc': ('!', '^#'), + # used in ARM https://sourceware.org/binutils/docs/as.html#ARM-Syntax + # /linux/v6.10.7/source/arch/arm/mach-sa1100/sleep.S#L33 + 'arm32': ('@', '^#'), + 'cris': (';', '^#'), + 'avr': (';', '^#'), + # blackfin, tile + } + + gasm_punctuation = r'[.,\[\]()<>{}%&+*!|@#$;:^/\\=~-]' + # TODO make sure all relevant directives are listed here + gasm_preprocessor = r'#[ \t]*(define|ifdef|ifndef|undef|if|else|elif|endif)' + + rules_before_comments = [ + (shared.whitespace, TokenType.WHITESPACE), + # don't interpret macro concatenate as a comment + ('##', TokenType.PUNCTUATION), + # don't interpret or as a comment + (r'\|\|', TokenType.PUNCTUATION), + (FirstInLine(regex_or(shared.c_preproc_include, shared.c_preproc_warning_and_error)), TokenType.SPECIAL), + (FirstInLine(gasm_preprocessor), TokenType.SPECIAL), + (shared.common_slash_comment, TokenType.COMMENT), + ] + + rules_after_comments = [ + (gasm_string, TokenType.STRING), + (gasm_number, TokenType.NUMBER), + (gasm_identifier, TokenType.IDENTIFIER), + (gasm_punctuation, TokenType.PUNCTUATION), + ] + + def __init__(self, code, arch='generic'): + self.code = code + self.comment_chars = self.gasm_comment_chars_map[arch] + + def get_arch_rules(self): + result = [] + + regex_chars = '*?+^.$\\[]|()' + add_slash = lambda ch: '\\' + ch if ch in regex_chars else ch + + for comment_char in self.comment_chars: + if comment_char[0] == '^': + result.append(( + FirstInLine(add_slash(comment_char[1]) + shared.singleline_comment_with_escapes_base), + TokenType.COMMENT + )) + else: + result.append(( + add_slash(comment_char) + shared.singleline_comment_with_escapes_base, + TokenType.COMMENT) + ) + + return result + + def lex(self): + rules = self.rules_before_comments + \ + self.get_arch_rules() + \ + self.rules_after_comments + + return simple_lexer(rules, self.code) + From d967309c17bd4a29d76d325ec8b1ad4cfdef0057 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:15:28 +0100 Subject: [PATCH 09/11] lexers: Add GNU Assembler lexer tests --- elixir/lexers/tests/test_gas.py | 282 ++++++++++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 elixir/lexers/tests/test_gas.py diff --git a/elixir/lexers/tests/test_gas.py b/elixir/lexers/tests/test_gas.py new file mode 100644 index 00000000..3c541f22 --- /dev/null +++ b/elixir/lexers/tests/test_gas.py @@ -0,0 +1,282 @@ +from ..lexers import GasLexer +from .base import LexerTest + +class GasLexerTest(LexerTest): + lexer_cls = GasLexer + default_filtered_tokens = ("SPECIAL", "COMMENT", "STRING", "IDENTIFIER", "SPECIAL", "ERROR") + + def test_comments_m68k(self): + self.lex(r""" +# comment 1 +#comment 2 + clrl d1 | comment 3 + clrl d0 |comment 4 +| comment 4 + + clrl d2 # comment 3 + +#if defined(C1) || !defined(C2) + addql #4,%sp +label: + movel 
#-IDNENT,%sp@(IDENT)| comment 5 +// /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S + test # comment 6 +# endif + +#define macro(x) inst &IDENT,%pc@(ident); inst x +""", [ + ['COMMENT', '# comment 1\n'], + ['COMMENT', '#comment 2\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd1'], + ['COMMENT', '| comment 3\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd0'], + ['COMMENT', '|comment 4\n'], + ['COMMENT', '| comment 4\n'], + ['IDENTIFIER', 'clrl'], + ['IDENTIFIER', 'd2'], + ['COMMENT', '# comment 3\n'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'C1'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'C2'], + ['IDENTIFIER', 'addql'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'label'], + ['IDENTIFIER', 'movel'], + ['IDENTIFIER', 'IDNENT'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'IDENT'], + ['COMMENT', '| comment 5\n'], + ['COMMENT', '// /linux/v6.10.7/source/arch/m68k/ifpsp060/src/fplsp.S\n'], + ['IDENTIFIER', 'test'], + ['COMMENT', '# comment 6\n'], + ['SPECIAL', '# endif'], + ['SPECIAL', '#define'], + ['IDENTIFIER', 'macro'], + ['IDENTIFIER', 'x'], + ['IDENTIFIER', 'inst'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'pc'], + ['IDENTIFIER', 'ident'], + ['IDENTIFIER', 'inst'], + ['IDENTIFIER', 'x'], + ], lexer_options={"arch": "m68k"}) + + def test_comments_sparc(self): + self.lex(r""" +#define F(i) \ + .type i,@function; + + std t1, [0x00]; + +/*comment default */ +//comment default2 + .type identifier,#function +label: + sethi %hi(IDENT), %g0 !test comment + wrpr %g1, %sp ! test comment +# comment +#comment + sethi %hi(IDENT_1 | IDENT_2), %l0 +""", [ + ['SPECIAL', '#define'], + ['IDENTIFIER', 'F'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'function'], + ['IDENTIFIER', 'std'], + ['IDENTIFIER', 't1'], + ['COMMENT', '/*comment default */'], + ['COMMENT', '//comment default2\n'], + ['IDENTIFIER', 'type'], + ['IDENTIFIER', 'identifier'], + ['IDENTIFIER', 'function'], + ['IDENTIFIER', 'label'], + ['IDENTIFIER', 'sethi'], + ['IDENTIFIER', 'hi'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'g0'], + ['COMMENT', '!test comment\n'], + ['IDENTIFIER', 'wrpr'], + ['IDENTIFIER', 'g1'], + ['IDENTIFIER', 'sp'], + ['COMMENT', '! 
test comment\n'], + ['COMMENT', '# comment\n'], + ['COMMENT', '#comment\n'], + ['IDENTIFIER', 'sethi'], + ['IDENTIFIER', 'hi'], + ['IDENTIFIER', 'IDENT_1'], + ['IDENTIFIER', 'IDENT_2'], + ['IDENTIFIER', 'l0'], + ], lexer_options={"arch": "sparc"}) + + def test_comments_arm32(self): + self.lex(r""" +// comment default +/* comment default2 */ +test: + bic r0, r1, #10 + # comment 1 + #comment 1 +""" ++ "\t# comment 1" + r""" + moveq r0, #IDENTIFIER @ Comment +# comment 2 +#comment 2 + push {r0} + add \addr, \addr, \tmp @comment3 + ldr r1, =TEST3 + ldr TEST, [sp, IDENT(i)]; + .long PMD_TYPE_SECT | \ + PMD_BIT4 + stmfd sp!, {r0, r1, r2, r3} + eor RT0, d, b; +""", [ + ['COMMENT', '// comment default\n'], + ['COMMENT', '/* comment default2 */'], + ['IDENTIFIER', 'test'], + ['IDENTIFIER', 'bic'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'r1'], + ['NUMBER', '10'], + ['COMMENT', '# comment 1\n'], + ['COMMENT', '#comment 1\n'], + ['COMMENT', '# comment 1\n'], + ['IDENTIFIER', 'moveq'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'IDENTIFIER'], + ['COMMENT', '@ Comment\n'], + ['COMMENT', '# comment 2\n'], + ['COMMENT', '#comment 2\n'], + ['IDENTIFIER', 'push'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'add'], + ['IDENTIFIER', 'addr'], + ['IDENTIFIER', 'addr'], + ['IDENTIFIER', 'tmp'], + ['COMMENT', '@comment3\n'], + ['IDENTIFIER', 'ldr'], + ['IDENTIFIER', 'r1'], + ['IDENTIFIER', 'TEST3'], + ['IDENTIFIER', 'ldr'], + ['IDENTIFIER', 'TEST'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'i'], + ['IDENTIFIER', 'long'], + ['IDENTIFIER', 'PMD_TYPE_SECT'], + ['IDENTIFIER', 'PMD_BIT4'], + ['IDENTIFIER', 'stmfd'], + ['IDENTIFIER', 'sp'], + ['IDENTIFIER', 'r0'], + ['IDENTIFIER', 'r1'], + ['IDENTIFIER', 'r2'], + ['IDENTIFIER', 'r3'], + ['IDENTIFIER', 'eor'], + ['IDENTIFIER', 'RT0'], + ['IDENTIFIER', 'd'], + ['IDENTIFIER', 'b'], + ], self.default_filtered_tokens + ("NUMBER",), {"arch": "arm32"}) + + def test_comments_generic(self): + self.lex(r""" +/* comment + * more comment + * more comment + */ + mov r0, r1 //test + mov x0, #IDENT + stp x1, x2, [sp, #-4]! 
+#if defined(IDENT1) || defined(IDENT2) +#endif +""", [ + ['COMMENT', '/* comment\n * more comment\n * more comment\n */'], + ['IDENTIFIER', 'mov'], + ['IDENTIFIER', 'r0'], + ['PUNCTUATION', ','], + ['IDENTIFIER', 'r1'], + ['COMMENT', '//test\n'], + ['IDENTIFIER', 'mov'], + ['IDENTIFIER', 'x0'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '#'], + ['IDENTIFIER', 'IDENT'], + ['IDENTIFIER', 'stp'], + ['IDENTIFIER', 'x1'], + ['PUNCTUATION', ','], + ['IDENTIFIER', 'x2'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '['], + ['IDENTIFIER', 'sp'], + ['PUNCTUATION', ','], + ['PUNCTUATION', '#'], + ['PUNCTUATION', '-'], + ['NUMBER', '4'], + ['PUNCTUATION', ']'], + ['PUNCTUATION', '!'], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'IDENT1'], + ['PUNCTUATION', ')'], + ['PUNCTUATION', '||'], + ['IDENTIFIER', 'defined'], + ['PUNCTUATION', '('], + ['IDENTIFIER', 'IDENT2'], + ['PUNCTUATION', ')'], + ['SPECIAL', '#endif'], + ], self.default_filtered_tokens + ("PUNCTUATION", "NUMBER")) + + def test_comments_preproc(self): + self.lex(r""" + # error "test" +#warning "test" +#include "test.h" +#include +#if defined(T1) || defined(T2) +#endif +""", [ + ['SPECIAL', '# error "test"\n'], + ['SPECIAL', '#warning "test"\n'], + ['SPECIAL', '#include "test.h"'], + ['SPECIAL', '#include '], + ['SPECIAL', '#if'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'T1'], + ['IDENTIFIER', 'defined'], + ['IDENTIFIER', 'T2'], + ['SPECIAL', '#endif'], + ]) + + def test_comments_literals(self): + self.lex(r""" +.byte 12, 0b1010, 0B1010, 0x34, 0123, 0X45, 'a, '\b +.ascii "asdsad\"zxczc" +.float 0f-12321321030982394324\ + 21321432432.234324324E-14 +.float 0f-123.123213e+13 +.float 0e-123.123213e+13 +""", [ + ['IDENTIFIER', 'byte'], + ['NUMBER', '12'], + ['NUMBER', '0b1010'], + ['NUMBER', '0B1010'], + ['NUMBER', '0x34'], + ['NUMBER', '0123'], + ['NUMBER', '0X45'], + ['STRING', "'a"], + ['STRING', "'\\b"], + ['IDENTIFIER', 'ascii'], + ['STRING', '"asdsad\\"zxczc"'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0f-12321321030982394324\\\n 21321432432.234324324E-14'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0f-123.123213e+13'], + ['IDENTIFIER', 'float'], + ['NUMBER', '0e-123.123213e+13'], + ], self.default_filtered_tokens + ("NUMBER",)) + From 9b0ca9a911ab1bfc6717d8d2849c5d3cd1b7888a Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 30 Dec 2024 00:17:35 +0100 Subject: [PATCH 10/11] lexers: Add a Makefile lexer --- elixir/lexers/__main__.py | 2 ++ elixir/lexers/lexers.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/elixir/lexers/__main__.py b/elixir/lexers/__main__.py index a39b5b42..676b2ed4 100644 --- a/elixir/lexers/__main__.py +++ b/elixir/lexers/__main__.py @@ -19,6 +19,8 @@ lexer = lexers.KconfigLexer(f.read()) elif filename.endswith(('.s', '.S')): lexer = lexers.GasLexer(f.read()) + elif filename.endswith('Makefile'): + lexer = lexers.MakefileLexer(f.read()) else: raise Exception("no lexer for filetype") diff --git a/elixir/lexers/lexers.py b/elixir/lexers/lexers.py index 5b4bac4b..b470e749 100644 --- a/elixir/lexers/lexers.py +++ b/elixir/lexers/lexers.py @@ -362,3 +362,34 @@ def lex(self): return simple_lexer(rules, self.code) + +# https://www.gnu.org/software/make/manual/make.html +class MakefileLexer: + # https://pubs.opengroup.org/onlinepubs/007904975/utilities/make.html + + # NOTE same as in KConfig, we only care about screaming case names + make_identifier = r'[A-Z0-9_]+' + make_minor_identifier = r'[a-zA-Z0-9_][a-zA-Z0-9-_]*' + 
make_variable = r'(\$\([a-zA-Z0-9_-]\)|\$\{[a-zA-Z0-9_-]\})' + make_single_quote_string = r"'*?'" + make_string = f'(({ make_single_quote_string })|({ shared.double_quote_string_with_escapes }))' + make_escape = r'\\[#"\']' + make_punctuation = r'[~\\`\[\](){}<>.,:;|%$^@&?!+*/=-]' + make_comment = r'(? Date: Mon, 30 Dec 2024 00:28:25 +0100 Subject: [PATCH 11/11] lexers: Integrate new lexers with the rest of Elixir --- elixir/filters/__init__.py | 72 ++++++++++++++++++++++++++------------ elixir/lexers/__init__.py | 10 ++++++ elixir/project_utils.py | 17 +++++++++ elixir/projects.py | 44 +++++++++++++++++++++++ elixir/query.py | 38 ++++++++++++-------- elixir/web.py | 5 +-- update.py | 60 +++++++++++++++++++------------ 7 files changed, 186 insertions(+), 60 deletions(-) diff --git a/elixir/filters/__init__.py b/elixir/filters/__init__.py index b06eae8f..e65e9d08 100755 --- a/elixir/filters/__init__.py +++ b/elixir/filters/__init__.py @@ -1,23 +1,51 @@ -from typing import List - -from .utils import Filter, FilterContext -from .projects import project_filters, default_filters - -# Returns a list of applicable filters for project_name under provided filter context -def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: - filter_classes = project_filters.get(project_name, default_filters) - filters = [] - - for filter_cls in filter_classes: - if type(filter_cls) == tuple and len(filter_cls) == 2: - cls, kwargs = filter_cls - filters.append(cls(**kwargs)) - elif type(filter_cls) == type: - filters.append(filter_cls()) - else: - raise ValueError(f"Invalid filter: {filter_cls}, " \ - "should be either a two element tuple or a type. " \ - "Make sure project_filters in project.py is valid.") - - return [f for f in filters if f.check_if_applies(ctx)] +from .ident import IdentFilter + +from .cppinc import CppIncFilter +from .cpppathinc import CppPathIncFilter + +from .defconfig import DefConfigIdentsFilter +from .configin import ConfigInFilter + +from .kconfig import KconfigFilter +from .kconfigidents import KconfigIdentsFilter + +from .dtsi import DtsiFilter +from .dtscompdocs import DtsCompDocsFilter +from .dtscompcode import DtsCompCodeFilter +from .dtscompdts import DtsCompDtsFilter + +from .makefileo import MakefileOFilter +from .makefiledtb import MakefileDtbFilter +from .makefiledir import MakefileDirFilter +from .makefilesubdir import MakefileSubdirFilter +from .makefilefile import MakefileFileFilter +from .makefilesrctree import MakefileSrcTreeFilter +from .makefilesubdir import MakefileSubdirFilter + + +# List of filters applied to all projects +default_filters = [ + DtsCompCodeFilter, + DtsCompDtsFilter, + DtsCompDocsFilter, + IdentFilter, + CppIncFilter, +] + +# List of filters for Kconfig files +common_kconfig_filters = [ + KconfigFilter, + KconfigIdentsFilter, + DefConfigIdentsFilter, +] + +# List of filters for Makefiles +common_makefile_filters = [ + MakefileOFilter, + MakefileDtbFilter, + MakefileDirFilter, + MakefileFileFilter, + MakefileSubdirFilter, + MakefileSrcTreeFilter, +] diff --git a/elixir/lexers/__init__.py b/elixir/lexers/__init__.py index e69de29b..f4f3fa32 100644 --- a/elixir/lexers/__init__.py +++ b/elixir/lexers/__init__.py @@ -0,0 +1,10 @@ +from .lexers import * + +default_lexers = { + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'.*\.s': GasLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst +} + diff --git a/elixir/project_utils.py b/elixir/project_utils.py index 
31523a83..242a62c1 100644 --- a/elixir/project_utils.py +++ b/elixir/project_utils.py @@ -4,6 +4,7 @@ from .filters.utils import Filter, FilterContext from .filters import default_filters from .projects import projects +from .lexers import default_lexers # Returns a list of applicable filters for project_name under provided filter context def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: @@ -28,3 +29,19 @@ def get_filters(ctx: FilterContext, project_name: str) -> List[Filter]: return [f for f in filters if f.check_if_applies(ctx)] +def get_lexer(path: str, project_name: str): + project_config = projects.get(project_name) + if project_config is None or 'lexers' not in project_config: + lexers = default_lexers + else: + lexers = project_config['lexers'] + + path = path.lower() + for regex, lexer in lexers.items(): + if re.match(regex, path): + if type(lexer) == tuple: + lexer_cls, kwargs = lexer + return lambda code: lexer_cls(code, **kwargs) + else: + return lambda code: lexer(code) + diff --git a/elixir/projects.py b/elixir/projects.py index 90a1ecc3..53d4065e 100644 --- a/elixir/projects.py +++ b/elixir/projects.py @@ -1,4 +1,7 @@ from .filters import * +from collections import OrderedDict +from .filters import * +from .lexers import * # Dictionary of custom per-projects settings. # filters: @@ -48,6 +51,29 @@ # Our solution is to ignore all includes in such paths (CppPathIncFilter, {"path_exceptions": {'^/include/uapi/.*'}}), ], + 'lexers': OrderedDict({ + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst + + r'/arch/alpha/.*\.s': (GasLexer, {"arch": "alpha"}), + r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}), + r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}), + r'/arch/csky/.*\.s': (GasLexer, {"arch": "csky"}), + r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}), + r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}), + r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}), + r'/arch/openrisc/.*\.s': (GasLexer, {"arch": "openrisc"}), + r'/arch/parisc/.*\.s': (GasLexer, {"arch": "parisc"}), + r'/arch/s390/.*\.s': (GasLexer, {"arch": "s390"}), + r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}), + r'/arch/sparc/.*\.s': (GasLexer, {"arch": "sparc"}), + r'/arch/um/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}), + r'.*\.s': GasLexer, + }), }, 'qemu': { 'filters': [ @@ -63,6 +89,24 @@ CppPathIncFilter, *common_makefile_filters, ], + 'lexers': OrderedDict({ + r'.*\.(c|h|cpp|hpp|c++|cxx|cc)': CLexer, + r'makefile\..*': MakefileLexer, + r'.*\.dts(i)?': DTSLexer, + r'kconfig.*': KconfigLexer, #TODO negative lookahead for .rst + + r'/arch/arc/.*\.s': (GasLexer, {"arch": "arc"}), + r'/arch/arm/.*\.s': (GasLexer, {"arch": "arm32"}), + r'/arch/m68k/.*\.s': (GasLexer, {"arch": "m68k"}), + r'/arch/microblaze/.*\.s': (GasLexer, {"arch": "microblaze"}), + r'/arch/mips/.*\.s': (GasLexer, {"arch": "mips"}), + r'/arch/riscv/.*\.s': (GasLexer, {"arch": "riscv"}), + r'/arch/sh/.*\.s': (GasLexer, {"arch": "sh"}), + r'/arch/x86/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/sandbox/.*\.s': (GasLexer, {"arch": "x86"}), + r'/arch/xtensa/.*\.s': (GasLexer, {"arch": "xtensa"}), + r'.*\.s': GasLexer, + }), }, 'uclibc-ng': { 'filters': [ diff --git a/elixir/query.py b/elixir/query.py index ff14d4b1..5476dc6d 100755 --- a/elixir/query.py +++ b/elixir/query.py @@ -21,7 +21,8 @@ from 
.lib import script, scriptLines, decode from . import lib from . import data -import os +from .lexers import TokenType +import os, sys from collections import OrderedDict from urllib import parse @@ -172,29 +173,38 @@ def query(self, cmd, *args): version = args[0] path = args[1] + lexer = args[2] filename = os.path.basename(path) family = lib.getFileFamily(filename) - if family != None: + if family is not None and lexer is not None: buffer = BytesIO() - tokens = self.scriptLines('tokenize-file', version, path, family) - even = True + code = self.get_file_raw(version, path) prefix = b'' if family == 'K': prefix = b'CONFIG_' - for tok in tokens: - even = not even - tok2 = prefix + tok - if (even and self.db.defs.exists(tok2) and - (lib.compatibleFamily(self.db.defs.get(tok2).get_families(), family) or - lib.compatibleMacro(self.db.defs.get(tok2).get_macros(), family))): - tok = b'\033[31m' + tok2 + b'\033[0m' - else: - tok = lib.unescape(tok) - buffer.write(tok) + for token_type, token, _, line in lexer(code).lex(): + token = token.encode() + + if token_type == TokenType.ERROR: + print("error token: ", token, token_type, filename, line, file=sys.stderr) + elif token_type == TokenType.IDENTIFIER: + token_with_prefix = prefix + token + token_in_db = self.db.defs.exists(token_with_prefix) + if token_in_db: + compatible = \ + lib.compatibleFamily(self.db.defs.get(token_with_prefix).get_families(), family) or \ + lib.compatibleMacro(self.db.defs.get(token_with_prefix).get_macros(), family) + + if compatible: + buffer.write(b'\033[31m' + token_with_prefix + b'\033[0m') + continue + + buffer.write(token) + return decode(buffer.getvalue()) else: return decode(self.script('get-file', version, path)) diff --git a/elixir/web.py b/elixir/web.py index 514e9cce..d25745b0 100755 --- a/elixir/web.py +++ b/elixir/web.py @@ -33,7 +33,7 @@ from .lib import validFamily from .query import Query, SymbolInstance -from .project_utils import get_filters +from .project_utils import get_filters, get_lexer from .filters.utils import FilterContext from .autocomplete import AutocompleteResource from .api import ApiIdentGetterResource @@ -485,7 +485,8 @@ def format_code(filename, code): # version: requested version of the project # path: path to the file in the repository def generate_source(q, project, version, path): - code = q.query('file', version, path) + lexer = get_lexer(path, project) + code = q.query('file', version, path, lexer) _, fname = os.path.split(path) _, extension = os.path.splitext(fname) diff --git a/update.py b/update.py index 79cb4dcf..3d14e8ce 100755 --- a/update.py +++ b/update.py @@ -22,13 +22,16 @@ # Throughout, an "idx" is the sequential number associated with a blob. # This is different from that blob's Git hash. +import sys from sys import argv from threading import Thread, Lock, Event, Condition +from elixir.lexers import TokenType import elixir.lib as lib from elixir.lib import script, scriptLines import elixir.data as data from elixir.data import PathList +from elixir.project_utils import get_lexer from find_compatible_dts import FindCompatibleDTS verbose = False @@ -56,6 +59,7 @@ bindings_idxes = [] # DT bindings documentation files idx_key_mod = 1000000 defs_idxes = {} # Idents definitions stored with (idx*idx_key_mod + line) as the key. 
+file_paths = {} tags_done = False # True if all tags have been added to new_idxes @@ -163,7 +167,7 @@ def run(self): progress('vers: Thread finished', index) def update_versions(self, tag): - global blobs_lock + global blobs_lock, file_paths # Get blob hashes and associated file paths blobs = scriptLines('list-blobs', '-p', tag) @@ -174,12 +178,14 @@ def update_versions(self, tag): with blobs_lock: idx = db.blob.get(hash) buf.append((idx, path)) + file_paths[idx] = path buf = sorted(buf) obj = PathList() for idx, path in buf: obj.append(idx, path) + # Store DT bindings documentation files to parse them later if path[:33] == b'Documentation/devicetree/bindings': bindings_idxes.append(idx) @@ -275,6 +281,7 @@ def run(self): new_idxes[self.index][1].wait() # Make sure the tag is ready new_idxes[self.index][2].wait() # Make sure UpdateDefs processed the tag + new_idxes[self.index][4].wait() # Tell that UpdateVersions processed the tag with tags_refs_lock: tags_refs[0] += 1 @@ -288,45 +295,53 @@ def run(self): progress('refs: Thread ' + str(tags_refs[1]) + '/' + str(self.inc) + ' finished', tags_refs[0]) def update_references(self, idxes): - global hash_file_lock, defs_lock, refs_lock, tags_refs + global hash_file_lock, defs_lock, refs_lock, tags_refs, file_paths for idx in idxes: if idx % 1000 == 0: progress('refs: ' + str(idx), tags_refs[0]) with hash_file_lock: hash = db.hash.get(idx) - filename = db.file.get(idx) + filename = file_paths[idx].decode() family = lib.getFileFamily(filename) if family == None: continue + lexer = get_lexer(filename, project) + if lexer is None: + continue + + try: + code = script('get-blob', hash).decode() + except UnicodeDecodeError: + code = script('get-blob', hash).decode('raw_unicode_escape') + prefix = b'' # Kconfig values are saved as CONFIG_ if family == 'K': prefix = b'CONFIG_' - tokens = scriptLines('tokenize-file', '-b', hash, family) - even = True - line_num = 1 idents = {} with defs_lock: - for tok in tokens: - even = not even - if even: - tok = prefix + tok - - if (db.defs.exists(tok) and - not ( (idx*idx_key_mod + line_num) in defs_idxes and - defs_idxes[idx*idx_key_mod + line_num] == tok ) and - (family != 'M' or tok.startswith(b'CONFIG_'))): - # We only index CONFIG_??? in makefiles - if tok in idents: - idents[tok] += ',' + str(line_num) - else: - idents[tok] = str(line_num) + for token_type, token, _, line in lexer(code).lex(): + if token_type == TokenType.ERROR: + print("error token: ", token, token_type, filename, line, file=sys.stderr) + continue - else: - line_num += tok.count(b'\1') + token = prefix + token.encode() + + if token_type != TokenType.IDENTIFIER: + continue + + if (db.defs.exists(token) and + not ( (idx*idx_key_mod + line) in defs_idxes and + defs_idxes[idx*idx_key_mod + line] == token ) and + (family != 'M' or token.startswith(b'CONFIG_'))): + # We only index CONFIG_??? in makefiles + if token in idents: + idents[token] += ',' + str(line) + else: + idents[token] = str(line) with refs_lock: for ident, lines in idents.items(): @@ -579,6 +594,7 @@ def progress(msg, current): for tag in scriptLines('list-tags'): if not db.vers.exists(tag): tag_buf.append(tag) + break num_tags = len(tag_buf) project = lib.currentProject()
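
Taken together, the series wires the per-project lexer tables in projects.py, the path-based lookup in project_utils.get_lexer, and the lexers' common lex() token stream into query.py and update.py. Below is a minimal sketch of that flow, assuming only the interfaces shown in the diffs above; the helper name list_identifiers is illustrative and is not part of these patches.

from elixir.lexers import TokenType
from elixir.project_utils import get_lexer

def list_identifiers(project, path, code):
    # Pick the lexer configured for this path in projects.py, falling back to
    # default_lexers; get_lexer returns None when no pattern matches the path.
    lexer = get_lexer(path, project)
    if lexer is None:
        return []
    idents = []
    # Each token is (type, text, span, line), as consumed by query.py and update.py.
    for token_type, token, _span, line in lexer(code).lex():
        if token_type == TokenType.IDENTIFIER:
            idents.append((line, token))
    return idents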