From 127bdabf60ab748c0247e7cb3d0e9769d46e0f76 Mon Sep 17 00:00:00 2001 From: Taus Date: Mon, 30 Jun 2025 14:23:53 +0000 Subject: [PATCH 1/5] Python: Update `tree-sitter` dependency Updates the Python extractor to depend on version 0.24.7 of tree-sitter (and 0.12.0 of tree-sitter-graph). A few changes were needed in order to make the code build and run after updating the dependencies: - In `main.rs`, the `Language` parameter is now passed as a reference. - In `python.tsg`, many queries had captures that were not actually used in the body of the stanza. This is no longer allowed (unless the captures start with an underscore), as it may indicate an error. To fix this, I added underscores in the appropriate places (and verified that none of these unused captures were in fact bugs). --- python/extractor/tsg-python/Cargo.lock | 52 ++++++++-------------- python/extractor/tsg-python/Cargo.toml | 4 +- python/extractor/tsg-python/python.tsg | 30 ++++++------- python/extractor/tsg-python/src/main.rs | 2 +- python/extractor/tsg-python/tsp/Cargo.toml | 2 +- 5 files changed, 37 insertions(+), 53 deletions(-) diff --git a/python/extractor/tsg-python/Cargo.lock b/python/extractor/tsg-python/Cargo.lock index 16849dc7f4d7..f3604d1a3870 100644 --- a/python/extractor/tsg-python/Cargo.lock +++ b/python/extractor/tsg-python/Cargo.lock @@ -1,12 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 - -[[package]] -name = "ahash" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0453232ace82dee0dd0b4c87a59bd90f7b53b314f3e0f61fe2ee7c8a16482289" +version = 4 [[package]] name = "aho-corasick" @@ -82,12 +76,6 @@ dependencies = [ "shlex", ] -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - [[package]] name = "clap" version = "4.5.30" @@ -121,15 +109,6 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" -[[package]] -name = "hashbrown" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" -dependencies = [ - "ahash", -] - [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -258,14 +237,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] -name = "string-interner" -version = "0.12.2" +name = "streaming-iterator" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "383196d1876517ee6f9f0864d1fc1070331b803335d3c6daaa04bbcccd823c08" -dependencies = [ - "cfg-if", - "hashbrown", -] +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" [[package]] name = "strsim" @@ -306,30 +281,39 @@ dependencies = [ [[package]] name = "tree-sitter" -version = "0.20.4" +version = "0.24.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e34327f8eac545e3f037382471b2b19367725a242bba7bc45edb9efb49fe39a" +checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75" dependencies = [ "cc", "regex", + "regex-syntax", + "streaming-iterator", + 
"tree-sitter-language", ] [[package]] name = "tree-sitter-graph" -version = "0.7.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "639d21e886f581d293de5f5081f09af003c54607ff3fa85efa159b243ba1f97a" +checksum = "63f86eb73c7d891c4b9b6fe4d4e63dd94c506e4788af7c2296afdcfbeea626cc" dependencies = [ "log", "regex", "serde", "serde_json", "smallvec", - "string-interner", + "streaming-iterator", "thiserror", "tree-sitter", ] +[[package]] +name = "tree-sitter-language" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8" + [[package]] name = "tsg-python" version = "0.1.0" diff --git a/python/extractor/tsg-python/Cargo.toml b/python/extractor/tsg-python/Cargo.toml index 7ad2c1c949e6..f02fb06931b2 100644 --- a/python/extractor/tsg-python/Cargo.toml +++ b/python/extractor/tsg-python/Cargo.toml @@ -10,7 +10,7 @@ edition = "2024" [dependencies] anyhow = "1.0" regex = "1" -tree-sitter = "=0.20.4" -tree-sitter-graph = "0.7.0" +tree-sitter = "=0.24.7" +tree-sitter-graph = "0.12.0" tsp = {path = "tsp"} clap = "4.5" diff --git a/python/extractor/tsg-python/python.tsg b/python/extractor/tsg-python/python.tsg index 7ad0f3f14244..8dec9ad5d3ef 100644 --- a/python/extractor/tsg-python/python.tsg +++ b/python/extractor/tsg-python/python.tsg @@ -416,13 +416,13 @@ attr (@if.node) _location_end = (location-end @expr) } -(generator_expression . "(" . (comment)* . (expression) @start (for_in_clause) @child [(for_in_clause) (if_clause)] @end . (comment)* . ")" .) @genexpr +(generator_expression . "(" . (comment)* . (expression) @start (for_in_clause) @child [(for_in_clause) (if_clause)] @end . (comment)* . ")" .) @_genexpr { attr (@child.node) _location_start = (location-start @start) attr (@child.node) _location_end = (location-end @end) } -(generator_expression . "(" . (comment)* . (expression) @start (for_in_clause) @end . (comment)* . 
")" .) @genexpr +(generator_expression . "(" . (comment)* . (expression) @start (for_in_clause) @end . (comment)* . ")" .) @_genexpr { attr (@end.node) _location_start = (location-start @start) attr (@end.node) _location_end = (location-end @end) @@ -524,7 +524,7 @@ attr (@del.node -> @target.node) targets = (named-child-index @target) } -(delete_statement target: (_) @target) @del +(delete_statement target: (_) @target) @_del { attr (@target.node) ctx = "del" } @@ -798,8 +798,8 @@ (dictionary_comprehension body: (pair - key: (_) @key - value: (_) @value + key: (_) @_key + value: (_) @_value ) ) @genexpr { @@ -1299,7 +1299,7 @@ ; the index of the left-hand side of the current assignment. ; Base case, for the outermost assignment we set the outermost node to this node, and the index to zero. -(expression_statement (assignment !type) @assign) @expr +(expression_statement (assignment !type) @assign) @_expr { let @assign.outermost_assignment = @assign.node let @assign.target_index = 0 @@ -1358,7 +1358,7 @@ } (assignment - left: (_) @target + left: (_) @_target type: (_) right: (_) @value ) @assign @@ -2330,7 +2330,7 @@ attr (@operand.node) ctx = "load" } -(unary_operator "~" @op) @unaryop +(unary_operator "~" @_op) @unaryop { attr (@unaryop.node) op = "~" } @@ -2614,7 +2614,7 @@ ; Async status ; NOTE: We only set the `is_async` field on the _first_ clause of the `with` statement, ; as this is the behaviour of the old parser. -(with_statement "async" "with" @with_keyword (with_clause . (with_item) @with)) +(with_statement "async" "with" @_with_keyword (with_clause . (with_item) @with)) { attr (@with.node) is_async = #true } @@ -2800,7 +2800,7 @@ (identifier) @obj . (identifier) @attr -) @match_value_pattern +) @_match_value_pattern { let attribute = (ast-node @attr "Attribute") attr (@attr.node) _skip_to = attribute @@ -2814,7 +2814,7 @@ (match_value_pattern . 
(identifier) @id -) @match_value_pattern +) @_match_value_pattern { attr (@id.node) ctx = "load" } @@ -3267,8 +3267,8 @@ (decorated_definition (decorator (expression) @exp1) @dec1 . (comment)* . - (decorator (expression) @exp2) @dec2 -) @decorator + (decorator (expression) @_exp2) @dec2 +) @_decorator { attr (@dec1.node) func = @exp1.node edge @dec1.node -> @dec2.node @@ -3279,7 +3279,7 @@ (decorator (expression) @exp) @last . (comment)* . definition: (function_definition) @funcdef -) @decorator +) @_decorator { attr (@last.node) func = @exp.node edge @last.node -> @funcdef.funcexpr @@ -3291,7 +3291,7 @@ (decorator (expression) @exp) @last . (comment)* . definition: (class_definition) @class -) @decorator +) @_decorator { attr (@last.node) func = @exp.node edge @last.node -> @class.class_expr diff --git a/python/extractor/tsg-python/src/main.rs b/python/extractor/tsg-python/src/main.rs index 6b72efdb6ef6..c99145132f76 100644 --- a/python/extractor/tsg-python/src/main.rs +++ b/python/extractor/tsg-python/src/main.rs @@ -502,7 +502,7 @@ fn main() -> Result<()> { let source_path = Path::new(matches.get_one::<String>("source").unwrap()); let language = tsp::language(); let mut parser = Parser::new(); - parser.set_language(language)?; + parser.set_language(&language)?; // Statically include `python.tsg`: let tsg = if matches.contains_id("tsg") { std::fs::read(&tsg_path).with_context(|| format!("Error reading TSG file {}", tsg_path))? 
diff --git a/python/extractor/tsg-python/tsp/Cargo.toml b/python/extractor/tsg-python/tsp/Cargo.toml index e36144566627..3d8587bbe5ab 100644 --- a/python/extractor/tsg-python/tsp/Cargo.toml +++ b/python/extractor/tsg-python/tsp/Cargo.toml @@ -26,7 +26,7 @@ path = "bindings/rust/lib.rs" ## When updating these dependencies, run `misc/bazel/3rdparty/update_cargo_deps.sh` [dependencies] -tree-sitter = ">= 0.20, < 0.21" +tree-sitter = "=0.24.7" [build-dependencies] cc = "1.2" From e07e2136f97eddf5489f4bac5547041744a5069c Mon Sep 17 00:00:00 2001 From: Taus Date: Mon, 30 Jun 2025 14:57:42 +0000 Subject: [PATCH 2/5] Python: Update bazel dependencies --- MODULE.bazel | 4 +- misc/bazel/3rdparty/py_deps/BUILD.bazel | 12 +-- .../py_deps/BUILD.hashbrown-0.9.1.bazel | 90 ------------------ ...l => BUILD.streaming-iterator-0.1.9.bazel} | 8 +- .../BUILD.string-interner-0.12.2.bazel | 92 ------------------- ...4.bazel => BUILD.tree-sitter-0.24.7.bazel} | 22 ++++- ...l => BUILD.tree-sitter-graph-0.12.0.bazel} | 6 +- ...=> BUILD.tree-sitter-language-0.1.5.bazel} | 10 +- misc/bazel/3rdparty/py_deps/defs.bzl | 80 ++++++---------- 9 files changed, 67 insertions(+), 257 deletions(-) delete mode 100644 misc/bazel/3rdparty/py_deps/BUILD.hashbrown-0.9.1.bazel rename misc/bazel/3rdparty/py_deps/{BUILD.ahash-0.4.8.bazel => BUILD.streaming-iterator-0.1.9.bazel} (96%) delete mode 100644 misc/bazel/3rdparty/py_deps/BUILD.string-interner-0.12.2.bazel rename misc/bazel/3rdparty/py_deps/{BUILD.tree-sitter-0.20.4.bazel => BUILD.tree-sitter-0.24.7.bazel} (90%) rename misc/bazel/3rdparty/py_deps/{BUILD.tree-sitter-graph-0.7.0.bazel => BUILD.tree-sitter-graph-0.12.0.bazel} (96%) rename misc/bazel/3rdparty/py_deps/{BUILD.cfg-if-1.0.0.bazel => BUILD.tree-sitter-language-0.1.5.bazel} (95%) diff --git a/MODULE.bazel b/MODULE.bazel index c30304156e37..f1c0c9ca12df 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -62,8 +62,8 @@ use_repo( "vendor_py__cc-1.2.14", "vendor_py__clap-4.5.30", 
"vendor_py__regex-1.11.1", - "vendor_py__tree-sitter-0.20.4", - "vendor_py__tree-sitter-graph-0.7.0", + "vendor_py__tree-sitter-0.24.7", + "vendor_py__tree-sitter-graph-0.12.0", ) # deps for ruby+rust diff --git a/misc/bazel/3rdparty/py_deps/BUILD.bazel b/misc/bazel/3rdparty/py_deps/BUILD.bazel index f756b4a519f7..86bfde266419 100644 --- a/misc/bazel/3rdparty/py_deps/BUILD.bazel +++ b/misc/bazel/3rdparty/py_deps/BUILD.bazel @@ -80,26 +80,26 @@ alias( ) alias( - name = "tree-sitter-0.20.4", - actual = "@vendor_py__tree-sitter-0.20.4//:tree_sitter", + name = "tree-sitter-0.24.7", + actual = "@vendor_py__tree-sitter-0.24.7//:tree_sitter", tags = ["manual"], ) alias( name = "tree-sitter", - actual = "@vendor_py__tree-sitter-0.20.4//:tree_sitter", + actual = "@vendor_py__tree-sitter-0.24.7//:tree_sitter", tags = ["manual"], ) alias( - name = "tree-sitter-graph-0.7.0", - actual = "@vendor_py__tree-sitter-graph-0.7.0//:tree_sitter_graph", + name = "tree-sitter-graph-0.12.0", + actual = "@vendor_py__tree-sitter-graph-0.12.0//:tree_sitter_graph", tags = ["manual"], ) alias( name = "tree-sitter-graph", - actual = "@vendor_py__tree-sitter-graph-0.7.0//:tree_sitter_graph", + actual = "@vendor_py__tree-sitter-graph-0.12.0//:tree_sitter_graph", tags = ["manual"], ) diff --git a/misc/bazel/3rdparty/py_deps/BUILD.hashbrown-0.9.1.bazel b/misc/bazel/3rdparty/py_deps/BUILD.hashbrown-0.9.1.bazel deleted file mode 100644 index 7f29950f9a30..000000000000 --- a/misc/bazel/3rdparty/py_deps/BUILD.hashbrown-0.9.1.bazel +++ /dev/null @@ -1,90 +0,0 @@ -############################################################################### -# @generated -# DO NOT MODIFY: This file is auto-generated by a crate_universe tool. 
To -# regenerate this file, run the following: -# -# bazel run @@//misc/bazel/3rdparty:vendor_py_deps -############################################################################### - -load("@rules_rust//rust:defs.bzl", "rust_library") - -package(default_visibility = ["//visibility:public"]) - -rust_library( - name = "hashbrown", - srcs = glob( - include = ["**/*.rs"], - allow_empty = True, - ), - compile_data = glob( - include = ["**"], - allow_empty = True, - exclude = [ - "**/* *", - ".tmp_git_root/**/*", - "BUILD", - "BUILD.bazel", - "WORKSPACE", - "WORKSPACE.bazel", - ], - ), - crate_features = [ - "ahash", - "inline-more", - ], - crate_root = "src/lib.rs", - edition = "2018", - rustc_flags = [ - "--cap-lints=allow", - ], - tags = [ - "cargo-bazel", - "crate-name=hashbrown", - "manual", - "noclippy", - "norustfmt", - ], - target_compatible_with = select({ - "@rules_rust//rust/platform:aarch64-apple-darwin": [], - "@rules_rust//rust/platform:aarch64-apple-ios": [], - "@rules_rust//rust/platform:aarch64-apple-ios-sim": [], - "@rules_rust//rust/platform:aarch64-linux-android": [], - "@rules_rust//rust/platform:aarch64-pc-windows-msvc": [], - "@rules_rust//rust/platform:aarch64-unknown-fuchsia": [], - "@rules_rust//rust/platform:aarch64-unknown-linux-gnu": [], - "@rules_rust//rust/platform:aarch64-unknown-nixos-gnu": [], - "@rules_rust//rust/platform:aarch64-unknown-nto-qnx710": [], - "@rules_rust//rust/platform:aarch64-unknown-uefi": [], - "@rules_rust//rust/platform:arm-unknown-linux-gnueabi": [], - "@rules_rust//rust/platform:armv7-linux-androideabi": [], - "@rules_rust//rust/platform:armv7-unknown-linux-gnueabi": [], - "@rules_rust//rust/platform:i686-apple-darwin": [], - "@rules_rust//rust/platform:i686-linux-android": [], - "@rules_rust//rust/platform:i686-pc-windows-msvc": [], - "@rules_rust//rust/platform:i686-unknown-freebsd": [], - "@rules_rust//rust/platform:i686-unknown-linux-gnu": [], - "@rules_rust//rust/platform:powerpc-unknown-linux-gnu": [], - 
"@rules_rust//rust/platform:riscv32imc-unknown-none-elf": [], - "@rules_rust//rust/platform:riscv64gc-unknown-none-elf": [], - "@rules_rust//rust/platform:s390x-unknown-linux-gnu": [], - "@rules_rust//rust/platform:thumbv7em-none-eabi": [], - "@rules_rust//rust/platform:thumbv8m.main-none-eabi": [], - "@rules_rust//rust/platform:wasm32-unknown-unknown": [], - "@rules_rust//rust/platform:wasm32-wasip1": [], - "@rules_rust//rust/platform:x86_64-apple-darwin": [], - "@rules_rust//rust/platform:x86_64-apple-ios": [], - "@rules_rust//rust/platform:x86_64-linux-android": [], - "@rules_rust//rust/platform:x86_64-pc-windows-msvc": [], - "@rules_rust//rust/platform:x86_64-unknown-freebsd": [], - "@rules_rust//rust/platform:x86_64-unknown-fuchsia": [], - "@rules_rust//rust/platform:x86_64-unknown-linux-gnu": [], - "@rules_rust//rust/platform:x86_64-unknown-nixos-gnu": [], - "@rules_rust//rust/platform:x86_64-unknown-none": [], - "@rules_rust//rust/platform:x86_64-unknown-uefi": [], - "//conditions:default": ["@platforms//:incompatible"], - }), - version = "0.9.1", - deps = [ - "@vendor_py__ahash-0.4.8//:ahash", - ], -) diff --git a/misc/bazel/3rdparty/py_deps/BUILD.ahash-0.4.8.bazel b/misc/bazel/3rdparty/py_deps/BUILD.streaming-iterator-0.1.9.bazel similarity index 96% rename from misc/bazel/3rdparty/py_deps/BUILD.ahash-0.4.8.bazel rename to misc/bazel/3rdparty/py_deps/BUILD.streaming-iterator-0.1.9.bazel index 8ce7511104c6..7f9939822a64 100644 --- a/misc/bazel/3rdparty/py_deps/BUILD.ahash-0.4.8.bazel +++ b/misc/bazel/3rdparty/py_deps/BUILD.streaming-iterator-0.1.9.bazel @@ -11,7 +11,7 @@ load("@rules_rust//rust:defs.bzl", "rust_library") package(default_visibility = ["//visibility:public"]) rust_library( - name = "ahash", + name = "streaming_iterator", srcs = glob( include = ["**/*.rs"], allow_empty = True, @@ -29,13 +29,13 @@ rust_library( ], ), crate_root = "src/lib.rs", - edition = "2018", + edition = "2021", rustc_flags = [ "--cap-lints=allow", ], tags = [ 
"cargo-bazel", - "crate-name=ahash", + "crate-name=streaming-iterator", "manual", "noclippy", "norustfmt", @@ -79,5 +79,5 @@ rust_library( "@rules_rust//rust/platform:x86_64-unknown-uefi": [], "//conditions:default": ["@platforms//:incompatible"], }), - version = "0.4.8", + version = "0.1.9", ) diff --git a/misc/bazel/3rdparty/py_deps/BUILD.string-interner-0.12.2.bazel b/misc/bazel/3rdparty/py_deps/BUILD.string-interner-0.12.2.bazel deleted file mode 100644 index b3804389f4a5..000000000000 --- a/misc/bazel/3rdparty/py_deps/BUILD.string-interner-0.12.2.bazel +++ /dev/null @@ -1,92 +0,0 @@ -############################################################################### -# @generated -# DO NOT MODIFY: This file is auto-generated by a crate_universe tool. To -# regenerate this file, run the following: -# -# bazel run @@//misc/bazel/3rdparty:vendor_py_deps -############################################################################### - -load("@rules_rust//rust:defs.bzl", "rust_library") - -package(default_visibility = ["//visibility:public"]) - -rust_library( - name = "string_interner", - srcs = glob( - include = ["**/*.rs"], - allow_empty = True, - ), - compile_data = glob( - include = ["**"], - allow_empty = True, - exclude = [ - "**/* *", - ".tmp_git_root/**/*", - "BUILD", - "BUILD.bazel", - "WORKSPACE", - "WORKSPACE.bazel", - ], - ), - crate_features = [ - "backends", - "inline-more", - "std", - ], - crate_root = "src/lib.rs", - edition = "2018", - rustc_flags = [ - "--cap-lints=allow", - ], - tags = [ - "cargo-bazel", - "crate-name=string-interner", - "manual", - "noclippy", - "norustfmt", - ], - target_compatible_with = select({ - "@rules_rust//rust/platform:aarch64-apple-darwin": [], - "@rules_rust//rust/platform:aarch64-apple-ios": [], - "@rules_rust//rust/platform:aarch64-apple-ios-sim": [], - "@rules_rust//rust/platform:aarch64-linux-android": [], - "@rules_rust//rust/platform:aarch64-pc-windows-msvc": [], - 
"@rules_rust//rust/platform:aarch64-unknown-fuchsia": [], - "@rules_rust//rust/platform:aarch64-unknown-linux-gnu": [], - "@rules_rust//rust/platform:aarch64-unknown-nixos-gnu": [], - "@rules_rust//rust/platform:aarch64-unknown-nto-qnx710": [], - "@rules_rust//rust/platform:aarch64-unknown-uefi": [], - "@rules_rust//rust/platform:arm-unknown-linux-gnueabi": [], - "@rules_rust//rust/platform:armv7-linux-androideabi": [], - "@rules_rust//rust/platform:armv7-unknown-linux-gnueabi": [], - "@rules_rust//rust/platform:i686-apple-darwin": [], - "@rules_rust//rust/platform:i686-linux-android": [], - "@rules_rust//rust/platform:i686-pc-windows-msvc": [], - "@rules_rust//rust/platform:i686-unknown-freebsd": [], - "@rules_rust//rust/platform:i686-unknown-linux-gnu": [], - "@rules_rust//rust/platform:powerpc-unknown-linux-gnu": [], - "@rules_rust//rust/platform:riscv32imc-unknown-none-elf": [], - "@rules_rust//rust/platform:riscv64gc-unknown-none-elf": [], - "@rules_rust//rust/platform:s390x-unknown-linux-gnu": [], - "@rules_rust//rust/platform:thumbv7em-none-eabi": [], - "@rules_rust//rust/platform:thumbv8m.main-none-eabi": [], - "@rules_rust//rust/platform:wasm32-unknown-unknown": [], - "@rules_rust//rust/platform:wasm32-wasip1": [], - "@rules_rust//rust/platform:x86_64-apple-darwin": [], - "@rules_rust//rust/platform:x86_64-apple-ios": [], - "@rules_rust//rust/platform:x86_64-linux-android": [], - "@rules_rust//rust/platform:x86_64-pc-windows-msvc": [], - "@rules_rust//rust/platform:x86_64-unknown-freebsd": [], - "@rules_rust//rust/platform:x86_64-unknown-fuchsia": [], - "@rules_rust//rust/platform:x86_64-unknown-linux-gnu": [], - "@rules_rust//rust/platform:x86_64-unknown-nixos-gnu": [], - "@rules_rust//rust/platform:x86_64-unknown-none": [], - "@rules_rust//rust/platform:x86_64-unknown-uefi": [], - "//conditions:default": ["@platforms//:incompatible"], - }), - version = "0.12.2", - deps = [ - "@vendor_py__cfg-if-1.0.0//:cfg_if", - 
"@vendor_py__hashbrown-0.9.1//:hashbrown", - ], -) diff --git a/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-0.20.4.bazel b/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-0.24.7.bazel similarity index 90% rename from misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-0.20.4.bazel rename to misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-0.24.7.bazel index 3b5db51f105a..53109d9d1505 100644 --- a/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-0.20.4.bazel +++ b/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-0.24.7.bazel @@ -29,8 +29,12 @@ rust_library( "WORKSPACE.bazel", ], ), + crate_features = [ + "default", + "std", + ], crate_root = "binding_rust/lib.rs", - edition = "2018", + edition = "2021", rustc_flags = [ "--cap-lints=allow", ], @@ -80,10 +84,13 @@ rust_library( "@rules_rust//rust/platform:x86_64-unknown-uefi": [], "//conditions:default": ["@platforms//:incompatible"], }), - version = "0.20.4", + version = "0.24.7", deps = [ "@vendor_py__regex-1.11.1//:regex", - "@vendor_py__tree-sitter-0.20.4//:build_script_build", + "@vendor_py__regex-syntax-0.8.5//:regex_syntax", + "@vendor_py__streaming-iterator-0.1.9//:streaming_iterator", + "@vendor_py__tree-sitter-0.24.7//:build_script_build", + "@vendor_py__tree-sitter-language-0.1.5//:tree_sitter_language", ], ) @@ -106,6 +113,10 @@ cargo_build_script( "WORKSPACE.bazel", ], ), + crate_features = [ + "default", + "std", + ], crate_name = "build_script_build", crate_root = "binding_rust/build.rs", data = glob( @@ -120,7 +131,8 @@ cargo_build_script( "WORKSPACE.bazel", ], ), - edition = "2018", + edition = "2021", + links = "tree-sitter", pkg_name = "tree-sitter", rustc_flags = [ "--cap-lints=allow", @@ -132,7 +144,7 @@ cargo_build_script( "noclippy", "norustfmt", ], - version = "0.20.4", + version = "0.24.7", visibility = ["//visibility:private"], deps = [ "@vendor_py__cc-1.2.14//:cc", diff --git a/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-graph-0.7.0.bazel b/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-graph-0.12.0.bazel 
similarity index 96% rename from misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-graph-0.7.0.bazel rename to misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-graph-0.12.0.bazel index adb4680650dd..824040b4486c 100644 --- a/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-graph-0.7.0.bazel +++ b/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-graph-0.12.0.bazel @@ -79,15 +79,15 @@ rust_library( "@rules_rust//rust/platform:x86_64-unknown-uefi": [], "//conditions:default": ["@platforms//:incompatible"], }), - version = "0.7.0", + version = "0.12.0", deps = [ "@vendor_py__log-0.4.25//:log", "@vendor_py__regex-1.11.1//:regex", "@vendor_py__serde-1.0.217//:serde", "@vendor_py__serde_json-1.0.138//:serde_json", "@vendor_py__smallvec-1.14.0//:smallvec", - "@vendor_py__string-interner-0.12.2//:string_interner", + "@vendor_py__streaming-iterator-0.1.9//:streaming_iterator", "@vendor_py__thiserror-1.0.69//:thiserror", - "@vendor_py__tree-sitter-0.20.4//:tree_sitter", + "@vendor_py__tree-sitter-0.24.7//:tree_sitter", ], ) diff --git a/misc/bazel/3rdparty/py_deps/BUILD.cfg-if-1.0.0.bazel b/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-language-0.1.5.bazel similarity index 95% rename from misc/bazel/3rdparty/py_deps/BUILD.cfg-if-1.0.0.bazel rename to misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-language-0.1.5.bazel index 40e9d8ed53fe..152b9e078016 100644 --- a/misc/bazel/3rdparty/py_deps/BUILD.cfg-if-1.0.0.bazel +++ b/misc/bazel/3rdparty/py_deps/BUILD.tree-sitter-language-0.1.5.bazel @@ -11,7 +11,7 @@ load("@rules_rust//rust:defs.bzl", "rust_library") package(default_visibility = ["//visibility:public"]) rust_library( - name = "cfg_if", + name = "tree_sitter_language", srcs = glob( include = ["**/*.rs"], allow_empty = True, @@ -28,14 +28,14 @@ rust_library( "WORKSPACE.bazel", ], ), - crate_root = "src/lib.rs", - edition = "2018", + crate_root = "language.rs", + edition = "2021", rustc_flags = [ "--cap-lints=allow", ], tags = [ "cargo-bazel", - "crate-name=cfg-if", + 
"crate-name=tree-sitter-language", "manual", "noclippy", "norustfmt", @@ -79,5 +79,5 @@ rust_library( "@rules_rust//rust/platform:x86_64-unknown-uefi": [], "//conditions:default": ["@platforms//:incompatible"], }), - version = "1.0.0", + version = "0.1.5", ) diff --git a/misc/bazel/3rdparty/py_deps/defs.bzl b/misc/bazel/3rdparty/py_deps/defs.bzl index 9d78e46b2f88..ea04edbae037 100644 --- a/misc/bazel/3rdparty/py_deps/defs.bzl +++ b/misc/bazel/3rdparty/py_deps/defs.bzl @@ -298,13 +298,13 @@ _NORMAL_DEPENDENCIES = { "anyhow": Label("@vendor_py__anyhow-1.0.95//:anyhow"), "clap": Label("@vendor_py__clap-4.5.30//:clap"), "regex": Label("@vendor_py__regex-1.11.1//:regex"), - "tree-sitter": Label("@vendor_py__tree-sitter-0.20.4//:tree_sitter"), - "tree-sitter-graph": Label("@vendor_py__tree-sitter-graph-0.7.0//:tree_sitter_graph"), + "tree-sitter": Label("@vendor_py__tree-sitter-0.24.7//:tree_sitter"), + "tree-sitter-graph": Label("@vendor_py__tree-sitter-graph-0.12.0//:tree_sitter_graph"), }, }, "python/extractor/tsg-python/tsp": { _COMMON_CONDITION: { - "tree-sitter": Label("@vendor_py__tree-sitter-0.20.4//:tree_sitter"), + "tree-sitter": Label("@vendor_py__tree-sitter-0.24.7//:tree_sitter"), }, }, } @@ -452,16 +452,6 @@ def crate_repositories(): Returns: A list of repos visible to the module through the module extension. 
""" - maybe( - http_archive, - name = "vendor_py__ahash-0.4.8", - sha256 = "0453232ace82dee0dd0b4c87a59bd90f7b53b314f3e0f61fe2ee7c8a16482289", - type = "tar.gz", - urls = ["https://static.crates.io/crates/ahash/0.4.8/download"], - strip_prefix = "ahash-0.4.8", - build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.ahash-0.4.8.bazel"), - ) - maybe( http_archive, name = "vendor_py__aho-corasick-1.1.3", @@ -542,16 +532,6 @@ def crate_repositories(): build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.cc-1.2.14.bazel"), ) - maybe( - http_archive, - name = "vendor_py__cfg-if-1.0.0", - sha256 = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd", - type = "tar.gz", - urls = ["https://static.crates.io/crates/cfg-if/1.0.0/download"], - strip_prefix = "cfg-if-1.0.0", - build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.cfg-if-1.0.0.bazel"), - ) - maybe( http_archive, name = "vendor_py__clap-4.5.30", @@ -592,16 +572,6 @@ def crate_repositories(): build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.colorchoice-1.0.3.bazel"), ) - maybe( - http_archive, - name = "vendor_py__hashbrown-0.9.1", - sha256 = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04", - type = "tar.gz", - urls = ["https://static.crates.io/crates/hashbrown/0.9.1/download"], - strip_prefix = "hashbrown-0.9.1", - build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.hashbrown-0.9.1.bazel"), - ) - maybe( http_archive, name = "vendor_py__is_terminal_polyfill-1.70.1", @@ -764,12 +734,12 @@ def crate_repositories(): maybe( http_archive, - name = "vendor_py__string-interner-0.12.2", - sha256 = "383196d1876517ee6f9f0864d1fc1070331b803335d3c6daaa04bbcccd823c08", + name = "vendor_py__streaming-iterator-0.1.9", + sha256 = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520", type = "tar.gz", - urls = ["https://static.crates.io/crates/string-interner/0.12.2/download"], - strip_prefix = "string-interner-0.12.2", - build_file = 
Label("//misc/bazel/3rdparty/py_deps:BUILD.string-interner-0.12.2.bazel"), + urls = ["https://static.crates.io/crates/streaming-iterator/0.1.9/download"], + strip_prefix = "streaming-iterator-0.1.9", + build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.streaming-iterator-0.1.9.bazel"), ) maybe( @@ -814,22 +784,32 @@ def crate_repositories(): maybe( http_archive, - name = "vendor_py__tree-sitter-0.20.4", - sha256 = "4e34327f8eac545e3f037382471b2b19367725a242bba7bc45edb9efb49fe39a", + name = "vendor_py__tree-sitter-0.24.7", + sha256 = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75", + type = "tar.gz", + urls = ["https://static.crates.io/crates/tree-sitter/0.24.7/download"], + strip_prefix = "tree-sitter-0.24.7", + build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.tree-sitter-0.24.7.bazel"), + ) + + maybe( + http_archive, + name = "vendor_py__tree-sitter-graph-0.12.0", + sha256 = "63f86eb73c7d891c4b9b6fe4d4e63dd94c506e4788af7c2296afdcfbeea626cc", type = "tar.gz", - urls = ["https://static.crates.io/crates/tree-sitter/0.20.4/download"], - strip_prefix = "tree-sitter-0.20.4", - build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.tree-sitter-0.20.4.bazel"), + urls = ["https://static.crates.io/crates/tree-sitter-graph/0.12.0/download"], + strip_prefix = "tree-sitter-graph-0.12.0", + build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.tree-sitter-graph-0.12.0.bazel"), ) maybe( http_archive, - name = "vendor_py__tree-sitter-graph-0.7.0", - sha256 = "639d21e886f581d293de5f5081f09af003c54607ff3fa85efa159b243ba1f97a", + name = "vendor_py__tree-sitter-language-0.1.5", + sha256 = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8", type = "tar.gz", - urls = ["https://static.crates.io/crates/tree-sitter-graph/0.7.0/download"], - strip_prefix = "tree-sitter-graph-0.7.0", - build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.tree-sitter-graph-0.7.0.bazel"), + urls = 
["https://static.crates.io/crates/tree-sitter-language/0.1.5/download"], + strip_prefix = "tree-sitter-language-0.1.5", + build_file = Label("//misc/bazel/3rdparty/py_deps:BUILD.tree-sitter-language-0.1.5.bazel"), ) maybe( @@ -957,6 +937,6 @@ def crate_repositories(): struct(repo = "vendor_py__cc-1.2.14", is_dev_dep = False), struct(repo = "vendor_py__clap-4.5.30", is_dev_dep = False), struct(repo = "vendor_py__regex-1.11.1", is_dev_dep = False), - struct(repo = "vendor_py__tree-sitter-0.20.4", is_dev_dep = False), - struct(repo = "vendor_py__tree-sitter-graph-0.7.0", is_dev_dep = False), + struct(repo = "vendor_py__tree-sitter-0.24.7", is_dev_dep = False), + struct(repo = "vendor_py__tree-sitter-graph-0.12.0", is_dev_dep = False), ] From 0a3cadd46888442f7decf67bae0c342fb1d59fef Mon Sep 17 00:00:00 2001 From: Taus Date: Mon, 30 Jun 2025 15:10:27 +0000 Subject: [PATCH 3/5] Python: Update parser test output It seems that with a newer version of tree-sitter, we no longer parse the (not actually valid!) syntax `Spam[**P2]` as if the `**` is an exponentiation operation (with a missing left operand). 
--- python/extractor/tests/parser/types_new.expected | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/python/extractor/tests/parser/types_new.expected b/python/extractor/tests/parser/types_new.expected index a390ed1aae2a..ac470c08cf03 100644 --- a/python/extractor/tests/parser/types_new.expected +++ b/python/extractor/tests/parser/types_new.expected @@ -338,16 +338,9 @@ Module: [1, 0] - [23, 0] variable: Variable('Spam', None) ctx: Load index: - BinOp: [20, 36] - [20, 40] - left: - Name: [20, 36] - [20, 36] - variable: Variable('', None) - ctx: Load - op: Pow - right: - Name: [20, 38] - [20, 40] - variable: Variable('P2', None) - ctx: Load + Name: [20, 38] - [20, 40] + variable: Variable('P2', None) + ctx: Load ctx: Load TypeAlias: [21, 0] - [21, 41] name: From afa7195819964e26f032e4f8a7a5466e17699e3b Mon Sep 17 00:00:00 2001 From: Taus Date: Wed, 2 Jul 2025 12:33:39 +0000 Subject: [PATCH 4/5] Python: Improve handling of syntax errors Rather than relying on matching arbitrary nodes inside tree-sitter-graph and then checking whether they are of type ERROR or MISSING (which seems to have stopped working in later versions of tree-sitter), we now explicitly go through the tree-sitter tree, locating all of the error and missing nodes along the way. We then add these on to the graph output in the same format as was previously produced by tree-sitter-graph. Note that it's very likely that some of the syntax errors will move around a bit as a consequence of this change. In general, we don't expect syntax errors to have stable locations, as small changes in the grammar can cause an error to appear in a different position, even if the underlying (erroneous) code has not changed. 
--- python/extractor/tsg-python/python.tsg | 10 --- python/extractor/tsg-python/src/main.rs | 107 +++++++++++++++++++++++- 2 files changed, 104 insertions(+), 13 deletions(-) diff --git a/python/extractor/tsg-python/python.tsg b/python/extractor/tsg-python/python.tsg index 8dec9ad5d3ef..963df06fd7c7 100644 --- a/python/extractor/tsg-python/python.tsg +++ b/python/extractor/tsg-python/python.tsg @@ -6,16 +6,6 @@ (module) @mod { let @mod.node = (ast-node @mod "Module") } -(_) @anynode -{ - scan (node-type @anynode) { - "^(ERROR|MISSING)$" { - let @anynode.node = (ast-node @anynode "SyntaxErrorNode") - attr (@anynode.node) source = (source-text @anynode) - } - } -} - (parenthesized_expression) @nd { let @nd.node = (ast-node @nd "Expr") } diff --git a/python/extractor/tsg-python/src/main.rs b/python/extractor/tsg-python/src/main.rs index c99145132f76..a94d93ba2092 100644 --- a/python/extractor/tsg-python/src/main.rs +++ b/python/extractor/tsg-python/src/main.rs @@ -480,6 +480,99 @@ pub mod extra_functions { } } +struct TreeIterator<'a> { + nodes_to_visit: Vec<tree_sitter::Node<'a>>, +} + +impl<'a> TreeIterator<'a> { + fn new(root: tree_sitter::Node<'a>) -> Self { + Self { + nodes_to_visit: vec![root], + } + } +} + +impl<'a> Iterator for TreeIterator<'a> { + type Item = tree_sitter::Node<'a>; + + fn next(&mut self) -> Option<Self::Item> { + if let Some(node) = self.nodes_to_visit.pop() { + // Add all children to the queue for processing + self.nodes_to_visit + .extend((0..node.child_count()).rev().filter_map(|i| node.child(i))); + Some(node) + } else { + None + } + } +} + +#[derive(Debug, Clone)] +struct SyntaxError { + start_pos: tree_sitter::Point, + end_pos: tree_sitter::Point, + source: String, +} + +fn syntax_errors_from_tree<'a>( + root: tree_sitter::Node<'a>, + source: &'a str, +) -> impl Iterator<Item = SyntaxError> + 'a { + TreeIterator::new(root) + .filter(|&node| node.is_error() || node.is_missing()) + .map(move |node| { + let start_pos = node.start_position(); + let end_pos = node.end_position(); + let text = 
&source[node.byte_range()]; + SyntaxError { + start_pos, + end_pos, + source: text.to_string(), + } + }) +} + +fn add_syntax_error_nodes(graph: &mut tree_sitter_graph::graph::Graph, errors: &[SyntaxError]) { + for error in errors { + let error_node = graph.add_graph_node(); + + // Add _kind attribute + graph[error_node] + .attributes + .add( + tree_sitter_graph::Identifier::from("_kind"), + tree_sitter_graph::graph::Value::String("SyntaxErrorNode".to_string()), + ) + .expect("Fresh node should not have duplicate attributes"); + + // Add _location attribute + let location = tree_sitter_graph::graph::Value::List( + vec![ + error.start_pos.row, + error.start_pos.column, + error.end_pos.row, + error.end_pos.column, + ] + .into_iter() + .map(|v| tree_sitter_graph::graph::Value::from(v as u32)) + .collect(), + ); + graph[error_node] + .attributes + .add(tree_sitter_graph::Identifier::from("_location"), location) + .expect("Fresh node should not have duplicate attributes"); + + // Add source attribute + graph[error_node] + .attributes + .add( + tree_sitter_graph::Identifier::from("source"), + tree_sitter_graph::graph::Value::String(error.source.clone()), + ) + .expect("Fresh node should not have duplicate attributes"); + } +} + fn main() -> Result<()> { let matches = Command::new("tsg-python") .version(BUILD_VERSION) @@ -581,10 +674,18 @@ fn main() -> Result<()> { ); let globals = Variables::new(); - let mut config = ExecutionConfig::new(&mut functions, &globals).lazy(false); - let graph = file - .execute(&tree, &source, &mut config, &NoCancellation) + let config = ExecutionConfig::new(&functions, &globals).lazy(false); + let mut graph = file + .execute(&tree, &source, &config, &NoCancellation) .with_context(|| format!("Could not execute TSG file {}", tsg_path))?; + + // Collect and add syntax error nodes to the graph + if tree.root_node().has_error() { + let syntax_errors: Vec<SyntaxError> = + syntax_errors_from_tree(tree.root_node(), &source).collect(); + add_syntax_error_nodes(&mut 
graph, &syntax_errors); + } + print!("{}", graph.pretty_print()); Ok(()) } From b90a3588f2774bebc7b4d468fd87f12c301eb87b Mon Sep 17 00:00:00 2001 From: Taus Date: Wed, 2 Jul 2025 12:53:39 +0000 Subject: [PATCH 5/5] Python: Update `types_new.py` and test output --- .../extractor/tests/parser/types_new.expected | 34 ------------------- python/extractor/tests/parser/types_new.py | 2 +- 2 files changed, 1 insertion(+), 35 deletions(-) diff --git a/python/extractor/tests/parser/types_new.expected b/python/extractor/tests/parser/types_new.expected index ac470c08cf03..de4f44862800 100644 --- a/python/extractor/tests/parser/types_new.expected +++ b/python/extractor/tests/parser/types_new.expected @@ -308,40 +308,6 @@ Module: [1, 0] - [23, 0] ] ctx: Load ctx: Load - TypeAlias: [20, 0] - [20, 41] - name: - Name: [20, 5] - [20, 9] - variable: Variable('Baz2', None) - ctx: Store - type_parameters: [ - ParamSpec: [20, 10] - [20, 27] - name: - Name: [20, 12] - [20, 14] - variable: Variable('P2', None) - ctx: Store - default: - List: [20, 17] - [20, 27] - elts: [ - Name: [20, 18] - [20, 21] - variable: Variable('int', None) - ctx: Load - Name: [20, 23] - [20, 26] - variable: Variable('str', None) - ctx: Load - ] - ctx: Load - ] - value: - Subscript: [20, 31] - [20, 41] - value: - Name: [20, 31] - [20, 35] - variable: Variable('Spam', None) - ctx: Load - index: - Name: [20, 38] - [20, 40] - variable: Variable('P2', None) - ctx: Load - ctx: Load TypeAlias: [21, 0] - [21, 41] name: Name: [21, 5] - [21, 9] diff --git a/python/extractor/tests/parser/types_new.py b/python/extractor/tests/parser/types_new.py index 12e5eac0556c..9709d0f30449 100644 --- a/python/extractor/tests/parser/types_new.py +++ b/python/extractor/tests/parser/types_new.py @@ -17,6 +17,6 @@ class Qux1[*Ts1 = *tuple[int, bool]]: ... 
# TypeAliases type Foo2[T15, U1 = str] = Bar1[T15, U1] -type Baz2[**P2 = [int, str]] = Spam[**P2] +# type Baz2[**P2 = [int, str]] = Spam[**P2] # From the PEP, but this is not actually valid syntax! type Qux2[*Ts2 = *tuple[str]] = Ham[*Ts2] type Rab[U2, T15 = str] = Bar2[T15, U2]