diff --git a/.gitignore b/.gitignore index 40090dc..c20924f 100644 --- a/.gitignore +++ b/.gitignore @@ -30,5 +30,5 @@ website/.astro/ # Cached AST artifacts **/.vibe_parsed_cache/ - .env +*/**/.vibe_parsed_cache/** diff --git a/BRANCH_COMPARISON.md b/BRANCH_COMPARISON.md deleted file mode 100644 index 5157474..0000000 --- a/BRANCH_COMPARISON.md +++ /dev/null @@ -1,341 +0,0 @@ -# PR #277 Branch Comparison Report - -**Branch:** `issue-269-conditional-directives` -**PR:** #277 - "Add conditional compilation directives" -**Comparing:** `issue-269-conditional-directives` vs `main` -**Status:** 10 tests passing, all changes working as intended - ---- - -## Summary of Changes - -This PR implements conditional compilation directives (`:- if/1`, `:- else/0`, `:- endif/0`) and fixes a character code parsing issue. **Key insight:** The branch removes support for `elif` that existed in main and simplifies the implementation to core ISO directives only. - -| Aspect | Implementation | Status | -|--------|---|---| -| Core conditionals (if/else/endif) | ✅ Fully implemented | Working | -| elif support | ❌ Removed (was in main) | Simplified | -| Test coverage | Reduced from 477→136 lines | 10 focused tests | -| Parser char code fix | ✅ Fixed trailing quote handling | Working | -| ISO compliance | Updated docs | ✅ ISO-only directives | -| Tools cleanup | Removed diagnostic tools | N/A | - ---- - -## Detailed Implementation Differences - -### 1. **Conditional Compilation Stack Simplification** - -**Main branch:** -```python -# 3-tuple tracking: (is_active, has_seen_else, any_branch_taken) -self._conditional_stack: list[tuple[bool, bool, bool]] = [] -``` - -**This branch:** -```python -# 2-tuple tracking: (is_active, has_seen_else) -self._conditional_stack: list[tuple[bool, bool]] = [] -``` - -**Why:** Removing the `any_branch_taken` flag simplifies the state machine. This branch doesn't support `elif`, so the complex multi-branch logic is unnecessary. - ---- - -### 2. **Removed elif Directive Support** - -**Main branch:** -- ✅ Supported `:- elif(Condition)` with multiple elif blocks -- ✅ Had 154 lines dedicated to elif handling -- ✅ Used `any_branch_taken` flag to track branch selection - -**This branch:** -- ❌ **elif removed entirely** -- ✅ Only supports `if/else/endif` (ISO-standard directives) -- ✅ Simplified state management - -**Code removed:** -```python -def _handle_elif_directive(self, condition) -> None: - """Handle :- elif(Condition). directive (else-if).""" - # (154 lines of complex logic removed) -``` - -**Rationale:** The ISO standard specifies `:- if/1`, `:- else/0`, `:- endif/0`. The `elif` was an extension, and removing it simplifies the implementation to pure ISO compliance. - ---- - -### 3. 
**Simplified _handle_conditional_directive Method** - -**Main branch (282 lines):** -- Separate methods for each directive: `_handle_if_directive`, `_handle_elif_directive`, `_handle_else_directive`, `_handle_endif_directive` -- Complex parent-active checking logic for nested conditionals -- 3-tuple state with branch tracking - -**This branch (49 lines):** -```python -def _handle_conditional_directive(self, goal: Any) -> None: - """Process :- if/1, :- else, :- endif directives.""" - name = self._get_directive_name(goal) - - if name == "if": - # Single if block logic (simplified) - if not self._is_conditionally_active(): - self._conditional_stack.append((False, False)) - return - - condition = goal.args[0] - # Validation and evaluation - condition_succeeded = self._evaluate_condition(condition) - self._conditional_stack.append((condition_succeeded, False)) - - elif name == "else": - # Simple toggle logic - is_active, has_seen_else = self._conditional_stack.pop() - if has_seen_else: - raise error - self._conditional_stack.append((not is_active, True)) - - elif name == "endif": - # Just pop the stack - if not self._conditional_stack: - raise error - self._conditional_stack.pop() -``` - -**Benefits of this branch:** -- Clearer separation of concerns -- No `any_branch_taken` complexity -- Single method handles all directives -- Easier to understand and maintain - ---- - -### 4. **Character Code Parser Fix** - -**Issue:** Parser couldn't handle trailing quotes in character codes like `0'\` (single quote character) - -**Main branch:** -```regex -CHAR_CODE.5: /0'(\\x[0-9a-zA-Z]+\\?|\\\\\\\\|\\\\['tnr]|''|[^'\\])/ -``` - -**This branch:** -```regex -CHAR_CODE.5: /0'(\\x[0-9a-zA-Z]+\\?|\\\\|\\\\['tnr]|''|[^'\\])'?/ -``` - -**Specific changes:** -1. Added optional trailing quote: `'?` at end -2. Fixed double-backslash pattern: `\\\\\\\\` → `\\\\` -3. Added post-processing to strip the closing quote while preserving escape sequences: -```python -# Strip trailing quote, but keep escapes that include a quote character -if (len(char_part) > 1 and char_part.endswith("'") - and char_part not in {"''", "\\'"}): - char_part = char_part[:-1] -``` - -**Test impact:** ISO parser edge case test is now enabled (was skipped) - ---- - -### 5. **Test Suite Restructuring** - -**Main branch (477 lines):** -- 9 test classes with comprehensive coverage -- Tests for: if/endif, if/else/endif, elif, nested conditionals, error cases -- 154 test methods for elif alone - -**This branch (136 lines):** -- Flat structure: 10 simple test functions -- No class organization -- Focused on core functionality: - - ✅ if with true/false conditions - - ✅ if/else with both branches - - ✅ Nested conditionals respect parent state - - ✅ Condition can query built-ins (e.g., current_predicate/2) - - ✅ Error handling: stray else, stray endif, multiple else, unclosed if - -**Test reduction ratio:** 477→136 lines (71.5% reduction) - ---- - -### 6. 
**Documentation Changes** - -**docs/FEATURES.md:** - -Main branch: -```markdown -| `:- char_conversion/2` | ❌ | **ISO-required** | -| `:- if(Condition)` | ✅📘 | Conditional compilation - begin block | -| `:- elif(Condition)` | ✅📘 | Conditional compilation - else-if | -| `:- else` | ✅📘 | Conditional compilation - alternative block | -| `:- endif` | ✅📘 | Conditional compilation - end block | -``` - -This branch: -```markdown -| `:- if/1` | ✅ | Conditional compilation - begin block | -| `:- else/0` | ✅ | Conditional compilation - alternative block | -| `:- endif/0` | ✅ | Conditional compilation - end block | -| `:- char_conversion/2` | ✅ | **ISO-required** - Character conversion during parsing | -``` - -**Changes:** -- Removed `elif` from features table -- Updated to use predicate indicator notation (`/arity`) -- Added `:- char_conversion/2` as implemented (was marked as missing in main) -- Changed status markers: `📘` (extension) removed, now showing as core features - ---- - -### 7. **Removed Tool** - -**Deleted:** `tools/find_quoted_infix.py` (86 lines) - -This was a diagnostic tool to find problematic quoted infix operator usage. No longer needed in this branch, likely because: -- The operator parser has improved -- Or it was only used during elif development - ---- - -### 8. **Removed STATUS File** - -**Deleted:** `.paige/STATUS.md` - -This kanban-style status file tracked open issues and PR progress. Removed because it's specific to the development workflow and not part of the core project. - ---- - -## Test Results - -### Conditional Compilation Tests -``` -✅ test_if_true_loads_block -✅ test_if_false_skips_block -✅ test_if_else_true_branch_used -✅ test_if_else_false_branch_used -✅ test_nested_conditionals_respect_parent_state -✅ test_condition_can_query_current_predicate -✅ test_else_without_if_errors -✅ test_endif_without_if_errors -✅ test_multiple_else_in_same_block_errors -✅ test_unclosed_if_errors - -Result: 10/10 passing (100%) -``` - -### ISO Core Tests -``` -✅ 109 passed, 2 skipped -``` - -**Note:** The ISO parser edge case test that was previously skipped (due to the char code bug) is now passing. - ---- - -## Branch Features vs Main - -### Features Removed -| Feature | Lines | Reason | -|---------|-------|--------| -| `elif` directive | 154 | Not ISO-standard; simplification | -| 3-tuple conditional state | N/A | Simplified to 2-tuple | -| elif test class | 113 | elif removed | -| `find_quoted_infix.py` tool | 86 | Cleanup | -| `.paige/STATUS.md` | 25 | Workflow file | - -### Features Improved -| Feature | Impact | -|---------|--------| -| Char code parsing | Fixed trailing quote handling | -| ISO compliance | Pure ISO directives only | -| Code clarity | Consolidated to single method | -| Documentation | Clearer predicate indicators | - -### Net Code Changes -- **Additions:** 312 lines -- **Deletions:** 58 lines -- **Net gain:** +254 lines - ---- - -## Architectural Decisions - -### Why Remove elif? - -1. **ISO Conformance**: The ISO standard only specifies `if/else/endif`, not `elif` -2. **Simplicity**: Maintaining elif required complex state tracking (`any_branch_taken` flag) -3. **User Can Implement**: If users need elif, they can nest if/else blocks: - ```prolog - :- if(cond1). - code1. - :- else. - :- if(cond2). - code2. - :- else. - code3. - :- endif. - :- endif. - ``` - -### Why Single Method for Directives? - -1. **Reduced complexity** in the original separate-method approach -2. **Clearer flow** for all three related directives -3. 
**Easier testing** with fewer helper methods - ---- - -## Compatibility Notes - -### Breaking Changes -- **Code using `:- elif`** will fail: `elif` is no longer recognized -- Any existing Prolog code relying on elif must be rewritten using nested if/else - -### Non-Breaking Changes -- All existing `:- if`/`:- else`/`:- endif` code works identically -- Improved character code parsing is backward-compatible -- Test suite is fully compatible with existing tests - ---- - -## Recommendations - -### For Merging -1. ✅ Implementation is solid and well-tested -2. ✅ ISO compliance improved (pure ISO directives) -3. ✅ Code is clearer and more maintainable -4. ⚠️ Breaking change for elif users (check if any codebase uses it) - -### Future Work -1. Consider if `elif` should be added back as an extension -2. Monitor user feedback on conditional compilation feature -3. Consider implementing `:- if`/`:- elif`/`:- else` once elif is re-added if needed - ---- - -## Commit Messages - -``` -64145a7f Add conditional compilation directives and fix char code parsing - - Implement :- if/1, :- else/0, :- endif/0 - - Fix CHAR_CODE regex to handle trailing quotes (0'\\') - - Add conditional compilation tests - - Update FEATURES.md documentation - -9f0116bc Update vibeprolog/interpreter.py - - Simplify conditional directive handling - -f3697ac7 Update vibeprolog/interpreter.py - - Final cleanup and refinements -``` - ---- - -## Summary - -This branch **successfully implements ISO standard conditional compilation directives** with a cleaner, simpler architecture than the previous attempt. The deliberate removal of `elif` is a design choice for ISO compliance and maintainability. All tests pass, the implementation is solid, and the code is well-documented. - -**Status: Ready for review** ✅ diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index bbddc5a..09fb10f 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -87,7 +87,8 @@ The interpreter consists of four main components: 1. **Parser** (`vibeprolog/parser.py`) - Uses Lark to parse Prolog syntax with full operator precedence, multi-base numeric literals (including base-qualified numbers like `16'ff`), quoted atoms/strings, escapes, and ISO character code - forms (except for a handful of noted edge cases). + forms (except for a handful of noted edge cases). See "Parser Backends" for + how the interpreter chooses between the LALR and Earley parsers. 2. **Unification** (`vibeprolog/unification.py`) - Robinson-style unification with occurs-check by default so cyclic structures are prevented. 3. **Engine** (`vibeprolog/engine.py`) - Backtracking search with built-in @@ -257,6 +258,36 @@ Shared helpers for the AST live in `vibeprolog/utils/`: These modules are imported by `vibeprolog/engine.py` and have focused coverage in `tests/utils/`. +## Parser Backends + +`vibeprolog/parser.py` builds a Lark grammar from the active operator table, then +chooses the fastest compatible parser backend: + +- **Preferred backend**: The parser defaults to **LALR** for deterministic + parsing and falls back to **Earley** only when ambiguity or grammar + constraints require it. Users can also pin the backend to `"lalr"`, + `"earley"`, or `"auto"` via `PrologParser(parser_backend=...)`. +- **Fallback at parse time**: Clause/directive parsing and term parsing call a + shared `_with_fallback` helper. 
If a LALR parse raises a token/character error, the helper automatically retries the same input with the Earley backend while preserving metadata (source positions, PlDoc associations, and numeric folding).
- **Backend-specific grammar prep**: `_prepare_grammar_for_backend` rewrites the base grammar as needed so Earley keeps explicit `LEFT`/`RIGHT` precedence hints while LALR uses inline priority declarations. Operator-like tokens that are generated dynamically (e.g., graphic operator atoms) are injected into the token set so LALR keeps the fast path for those atoms.
- **Caching**: Parsers are cached per `(module, operator signature, backend)` tuple. `_parser_cache_key` folds the operator table into the cache key, so the correct parser (and backend) is reused across consults and terms until the operator environment changes.
- **DCG handling**: DCG rules remain in the grammar so both backends can parse `-->` clauses. The transformer expands DCG bodies after parsing, ensuring the backend choice does not alter the resulting AST.
+
+This layering keeps clause parsing fast in the common case while maintaining compatibility and error reporting fidelity when Earley is needed.
+
## DCG (Definite Clause Grammar) Support
DCG rules are syntactic sugar for Prolog clauses that manipulate difference lists.
diff --git a/docs/ISO_PROLOG_OPERATOR_SET.md b/docs/ISO_PROLOG_OPERATOR_SET.md new file mode 100644 index 0000000..ce04f78 --- /dev/null +++ b/docs/ISO_PROLOG_OPERATOR_SET.md
@@ -0,0 +1,162 @@
+# ISO Prolog Operator Set
+
+This document lists the **ISO/IEC 13211-1 (core Prolog)** standard operators, including their **precedence** and **specifier**.
+
+Lower precedence numbers bind **more tightly**.
+
+---
+
+## Operator Specifiers
+
+| Specifier | Meaning | |----------|--------| | `xfx` | Infix, non-associative | | `xfy` | Infix, right-associative | | `yfx` | Infix, left-associative | | `fx` | Prefix, non-associative | | `fy` | Prefix, right-associative | | `xf` | Postfix, non-associative | | `yf` | Postfix, left-associative |
+
+---
+
+## ISO Operators by Precedence
+
+### Precedence 1200
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `:-` | `xfx` | Clause definition | | `-->` | `xfx` | DCG rule | | `:-` | `fx` | Directive | | `?-` | `fx` | Query |
+
+---
+
+### Precedence 1100
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `;` | `xfy` | Disjunction |
+
+---
+
+### Precedence 1050
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `->` | `xfy` | If-then |
+
+---
+
+### Precedence 1000
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `,` | `xfy` | Conjunction |
+
+---
+
+### Precedence 900
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `\+` | `fy` | Negation as failure |
+
+---
+
+### Precedence 700
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `=` | `xfx` | Unification | | `\=` | `xfx` | Not unifiable | | `==` | `xfx` | Term identity | | `\==` | `xfx` | Not identical | | `@<` | `xfx` | Term less than | | `@=<` | `xfx` | Term less or equal | | `@>` | `xfx` | Term greater than | | `@>=` | `xfx` | Term greater or equal | | `=..` | `xfx` | Univ | | `is` | `xfx` | Arithmetic evaluation | | `=:=` | `xfx` | Arithmetic equality | | `=\=` | `xfx` | Arithmetic inequality | | `<` | `xfx` | Arithmetic less than | | `=<` | `xfx` | Arithmetic less or equal | | `>` | `xfx` | Arithmetic greater than | | `>=` | `xfx` | Arithmetic greater or equal |
+
+---
+
+### Precedence 500
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `+` | `yfx` | Addition | | `-` | `yfx` | Subtraction | | `/\` | `yfx` | Bitwise and | | `\/` | `yfx` | Bitwise or |
+
+---
+
+### Precedence 400
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `*` | `yfx` | Multiplication | | `/` | `yfx` | Division | | `//` | `yfx` | Integer division | | `mod` | `yfx` | Modulo | | `rem` | `yfx` | Remainder | | `<<` | `yfx` | Left shift | | `>>` | `yfx` | Right shift |
+
+---
+
+### Precedence 200
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `**` | `xfx` | Power | | `^` | `xfy` | Integer power |
+
+---
+
+### Precedence 200 (Prefix)
+
+| Operator | Specifier | Description | |--------|-----------|-------------| | `+` | `fy` | Unary plus | | `-` | `fy` | Unary minus | | `\` | `fy` | Bitwise complement |
+
+---
+
+## Minimal ISO Operator Declarations
+
+```prolog
+:- op(1200, xfx, ':-').
+:- op(1200, xfx, '-->').
+:- op(1200, fx, ':-').
+:- op(1200, fx, '?-').
+:- op(1100, xfy, ';').
+:- op(1050, xfy, '->').
+:- op(1000, xfy, ',').
+:- op(900, fy, '\+').
+:- op(700, xfx, [=, \=, ==, \==, @<, @=<, @>, @>=, =.., is, =:=, =\=, <, =<, >, >=]).
+:- op(500, yfx, [+, -, /\, \/]).
+:- op(400, yfx, [*, /, //, mod, rem, <<, >>]).
+:- op(200, xfx, '**').
+:- op(200, xfy, '^').
+:- op(200, fy, [+, -, \]).
+```
+
+---
+
+## Notes
+
+- Precedence range is **1–1200**
+- Operators are syntactic; semantics are provided by predicates
+- `*->/2` (soft cut) is **not** ISO; it is a widely supported extension
+- `not/1` is **not** ISO (use `\+/1`)
+- Lists are not operators in ISO
diff --git a/docs/SCRYER_PROLOG_OPERATORS_SET.md b/docs/SCRYER_PROLOG_OPERATORS_SET.md new file mode 100644 index 0000000..6e248ad --- /dev/null +++ b/docs/SCRYER_PROLOG_OPERATORS_SET.md
@@ -0,0 +1,45 @@
+Scryer Prolog operator table
+
+
+| Prec | Spec | Operator | | ---: | :--: | :------------------------- | | 1200 | xfx | `-->` | | 1100 | xfy | `;` | | 1050 | xfy | `->` | | 900 | fy | `\+` | | 700 | xfx | `<` | | 700 | xfx | `=<` | | 700 | xfx | `=` | | 700 | xfx | `=\=` | | 700 | xfx | `=..` | | 700 | xfx | `=:=` | | 700 | xfx | `>=` | | 700 | xfx | `>` | | 700 | xfx | `==` | | 700 | xfx | `\==` | | 700 | xfx | `\=` | | 700 | xfx | `@<` | | 700 | xfx | `@=<` | | 700 | xfx | `@>` | | 700 | xfx | `@>=` | | 700 | xfx | `is` | | 700 | fx | `non_counted_backtracking` | | 600 | xfy | `:` | | 500 | yfx | `+` | | 500 | yfx | `-` | | 500 | yfx | `/\` | | 500 | yfx | `\/` | | 400 | yfx | `*` | | 400 | yfx | `/` | | 400 | yfx | `//` | | 400 | yfx | `<<` | | 400 | yfx | `>>` | | 400 | yfx | `div` | | 400 | yfx | `mod` | | 400 | yfx | `rem` | | 400 | yfx | `rdiv` | | 200 | xfx | `**` | | 200 | xfy | `^` | | 200 | fy | `+` | | 200 | fy | `-` | | 200 | fy | `\` |
diff --git a/test_unicode_quick.py b/test_unicode_quick.py deleted file mode 100644 index e9e271a..0000000 --- a/test_unicode_quick.py +++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python3
-"""Quick test of Unicode atom support."""
-
-from vibeprolog import PrologInterpreter
-
-def run_query_once_test(test_name, query):
- print(f'\n{test_name}')
- try:
- prolog = PrologInterpreter()
- result = prolog.query_once(query)
- if result:
- print(f" SUCCESS: X = {result['X']}")
- else:
- print(" FAILED: No result")
- except Exception as e:
- print(f" ERROR: {e}")
-
-def test_unicode_quick():
- run_query_once_test("Test 1: Simple Greek letter atom (δ)", "X = δ")
- run_query_once_test("Test 2: Unicode atom with underscore (δ_test)", "X = δ_test")
-
- print("\nTest 3: Unicode in predicate name")
- try:
-
prolog = PrologInterpreter() - prolog.consult_string("δ_test(a). δ_test(b).") - results = list(prolog.query("δ_test(X)")) - if len(results) == 2: - print(f" SUCCESS: Found 2 results: {results[0]['X']}, {results[1]['X']}") - else: - print(f" FAILED: Expected 2 results, got {len(results)}") - except Exception as e: - print(f" ERROR: {e}") - - print("\nTest 4: Cyrillic atom (тест)") - try: - prolog = PrologInterpreter() - result = prolog.query_once("X = тест") - if result: - print(f" SUCCESS: X = {result['X']}") - else: - print(" FAILED: No result") - except Exception as e: - print(f" ERROR: {e}") - - print("\nTest 5: ASCII atoms still work (test)") - try: - prolog = PrologInterpreter() - result = prolog.query_once("X = test") - if result: - print(f" SUCCESS: X = {result['X']}") - else: - print(" FAILED: No result") - except Exception as e: - print(f" ERROR: {e}") - -if __name__ == "__main__": - test_unicode_quick() diff --git a/tests/test_iso_core.py b/tests/test_iso_core.py index cabb9e2..5c50647 100644 --- a/tests/test_iso_core.py +++ b/tests/test_iso_core.py @@ -566,10 +566,10 @@ def test_nested_if_then_else(self): class TestNegationAsFailure: - """Tests for ISO-compliant negation-as-failure (\+) semantics""" + r"""Tests for ISO-compliant negation-as-failure (\+) semantics""" def test_negation_with_unbound_variable(self): - """\+ should not bind variables""" + r"""\+ should not bind variables""" prolog = PrologInterpreter() prolog.consult_string("p(1). p(2).") # \+(p(X)) should fail (p(X) has solutions) @@ -577,14 +577,14 @@ def test_negation_with_unbound_variable(self): assert not prolog.has_solution("\\+(p(X))") def test_double_negation(self): - """\\+(\\+(Goal)) should behave like Goal (classical logic)""" + r"""\\+(\\+(Goal)) should behave like Goal (classical logic)""" prolog = PrologInterpreter() prolog.consult_string("p(a).") # But in Prolog, variables don't escape double negation assert prolog.has_solution("\\+(\\+(p(a)))") def test_negation_fails_with_any_solution(self): - """\+ fails if goal has even one solution""" + r"""\+ fails if goal has even one solution""" prolog = PrologInterpreter() prolog.consult_string(""" p(1). @@ -595,20 +595,20 @@ def test_negation_fails_with_any_solution(self): assert not prolog.has_solution("\\+(p(X))") def test_negation_succeeds_when_goal_fails(self): - """\+ succeeds when goal has no solutions""" + r"""\+ succeeds when goal has no solutions""" prolog = PrologInterpreter() assert prolog.has_solution("\\+(fail)") assert prolog.has_solution("\\+(1 = 2)") def test_negation_with_conjunction(self): - """\+ should work with conjunctions""" + r"""\+ should work with conjunctions""" prolog = PrologInterpreter() prolog.consult_string("p(1). 
q(2).") # \+(p(X), q(X)) should succeed because no X satisfies both assert prolog.has_solution("\\+(p(X), q(X))") def test_not_aliases(self): - """`not/1` and `not_/1` behave like \+/1.""" + r"""`not/1` and `not_/1` behave like \+/1.""" prolog = PrologInterpreter() prolog.consult_string("p(1).") assert prolog.has_solution("not(1 = 2)") diff --git a/tests/test_parser_backend_selection.py b/tests/test_parser_backend_selection.py new file mode 100644 index 0000000..bd812b7 --- /dev/null +++ b/tests/test_parser_backend_selection.py @@ -0,0 +1,26 @@ +from pathlib import Path + +from vibeprolog import PrologInterpreter +from vibeprolog.parser import PrologParser + + +def test_consult_prefers_lalr_backend(monkeypatch): + backends: list[str] = [] + original_create = PrologParser._create_parser + + def tracking_create(self, grammar, *, backend): + backends.append(backend) + return original_create(self, grammar, backend=backend) + + monkeypatch.setattr(PrologParser, "_create_parser", tracking_create) + + prolog = PrologInterpreter(builtin_conflict="skip") + library_path = Path(__file__).resolve().parents[1] / "library" / "lists.pl" + + prolog.consult(library_path) + + assert backends, "Expected at least one parser backend to be built" + assert backends[0] == "lalr" + if "earley" in backends: + earley_index = backends.index("earley") + assert all(backend == "lalr" for backend in backends[:earley_index]) diff --git a/tests/test_parser_caching.py b/tests/test_parser_caching.py index 1813548..37add1c 100644 --- a/tests/test_parser_caching.py +++ b/tests/test_parser_caching.py @@ -6,10 +6,10 @@ def test_consult_reuses_cached_parser(monkeypatch, tmp_path): build_count = 0 original_create = PrologParser._create_parser - def counting_create(self, grammar): + def counting_create(self, grammar, *, backend): nonlocal build_count build_count += 1 - return original_create(self, grammar) + return original_create(self, grammar, backend=backend) monkeypatch.setattr(PrologParser, "_create_parser", counting_create) @@ -28,10 +28,10 @@ def test_operator_changes_invalidate_parser_cache(monkeypatch, tmp_path): build_count = 0 original_create = PrologParser._create_parser - def counting_create(self, grammar): + def counting_create(self, grammar, *, backend): nonlocal build_count build_count += 1 - return original_create(self, grammar) + return original_create(self, grammar, backend=backend) monkeypatch.setattr(PrologParser, "_create_parser", counting_create) diff --git a/tests/test_parser_compatibility.py b/tests/test_parser_compatibility.py index 7675e65..091a48e 100644 --- a/tests/test_parser_compatibility.py +++ b/tests/test_parser_compatibility.py @@ -185,7 +185,7 @@ def test_cut_in_body(self): assert clauses[0].body[2].name == 'b' def test_negation_prefix(self): - """Test negation \+ as prefix operator""" + r"""Test negation \+ as prefix operator""" parser = PrologParser() clauses = parser.parse("test :- \\+ q.") body = clauses[0].body[0] @@ -194,7 +194,7 @@ def test_negation_prefix(self): assert len(body.args) == 1 def test_negation_with_cut(self): - """Test negation and cut together: p :- \+ q, !, r.""" + r"""Test negation and cut together: p :- \+ q, !, r.""" parser = PrologParser() clauses = parser.parse("test :- \\+ q, !, r.") # Body is a flattened list of goals @@ -213,7 +213,7 @@ def test_negation_with_cut(self): assert isinstance(r_goal, Atom) and r_goal.name == 'r' def test_negation_in_parentheses(self): - """Test negation in parentheses: p :- (\+ q ; r), !.""" + r"""Test negation in parentheses: p :- (\+ q 
; r), !."""
 parser = PrologParser()
 clauses = parser.parse("test :- (\\+ q ; r), !.")
 # Body is a flattened list of goals
@@ -347,6 +347,25 @@ def test_quoted_operator_atom(self):
 atom = clauses[0].head.args[0]
 assert atom.name == ':-'
+ def test_operator_symbol_atoms_in_lalr(self):
+ """Graphic operator atoms should parse in LALR mode without fallback."""
+ parser = PrologParser(parser_backend="lalr")
+ clauses = parser.parse("test(@=<). test(=:=).")
+ first_atom = clauses[0].head.args[0]
+ second_atom = clauses[1].head.args[0]
+
+ assert first_atom.name == "@=<"
+ assert second_atom.name == "=:="
+
+ def test_operator_symbol_atoms_prefer_lalr_backend(self):
+ """Parsing operator atoms in auto mode should keep the LALR backend active."""
+ parser = PrologParser()
+ clauses = parser.parse("test(@=<).")
+ atom = clauses[0].head.args[0]
+
+ assert atom.name == "@=<"
+ assert parser._active_backend == "lalr"
+
 def test_capital_start_quoted_atom(self):
 """Test quoted atoms starting with capital letters"""
 parser = PrologParser()
diff --git a/vibeprolog/parser.py b/vibeprolog/parser.py index a447308..b2ac891 100644 --- a/vibeprolog/parser.py +++ b/vibeprolog/parser.py
@@ -3,10 +3,10 @@
 import re
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Any, Iterable
+from typing import Any, Callable, Iterable
 from lark import Lark, Transformer, v_args
-from lark.exceptions import LarkError, UnexpectedCharacters, UnexpectedToken
+from lark.exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken
 from vibeprolog.exceptions import PrologError, PrologThrow
 from vibeprolog.operator_defaults import DEFAULT_OPERATORS
@@ -120,18 +120,19 @@ class PredicatePropertyDirective:
 _RBRACE: "}"
 _COMMA: ","
 _PIPE: "|"
- _COLON_DASH: ":-"
+ // Priority 36 ensures :- is matched before the generic operator-symbol tokens (e.g. OP_SYMBOL_DIRECTIVE at priority 25)
+ _COLON_DASH.36: ":-"
 start: (clause | directive)+
 clause: rule | dcg_rule | fact
 fact: term "."
- rule: term ":-" goals "."
+ rule: term _COLON_DASH goals "."
 dcg_rule: term DCG_ARROW goals "."
 atom_or_compound: atom | atom "(" args ")"
- directive: ":-" (op_directive | prefix_directive | property_directive | term) "."
+ directive: _COLON_DASH (op_directive | prefix_directive | property_directive | term) "."
 prefix_directive: "dynamic" predicate_indicators -> dynamic_directive
 | "multifile" predicate_indicators -> multifile_directive
@@ -153,6 +154,8 @@
 // Priority set to ensure special atoms like -$ are recognized, but still below SPECIAL_ATOM_OPS
 // NOTE: .. (range operator) must have HIGH priority to prevent being split into . . by lexer
 RANGE_OP.30: /\.\./
+ // Single dot as atom (for patterns like upto_what(Bs0, .)) - must outprioritize OP_SYMBOL_DIRECTIVE
+ DOT_ATOM.26: /\.(?=[,\)\]\s])/
 OP_SYMBOL_DIRECTIVE.25: /[+\-*\/<>=\\@#$&!~:?^.]+/
 OP_SYMBOL: /[+\-*\/<>=\\@#$&!~:?^.]+/
@@ -182,24 +185,29 @@ class PredicatePropertyDirective:
 // Allow operator symbols as functors: ;(a,b), |(a,b), ,(a,b), ->(a,b), etc.
 operator_compound: operator_functor "(" args ")" -> operator_compound
 // Operators that can be used as functors when followed by (
- operator_functor: INFIX_OP_FUNCTOR | CONTROL_OP_FUNCTOR | COMPARISON_OP_FUNCTOR | ARITH_OP_FUNCTOR | OP_SYMBOL
- // Infix operators like ;, |, ,, ->, etc. 
- INFIX_OP_FUNCTOR.30: /;/ | /\|/ | /,/ | /->/ | /:/ | /=/
- // Control operators
- CONTROL_OP_FUNCTOR.30: /\\+/
- // Comparison operators - but not < and > alone as they're in OP_SYMBOL
- COMPARISON_OP_FUNCTOR.30: /=:=/ | /=\\=/ | /==/ | /=@=/ | /\\=@=/ | /==/ | /\\==/
- // Arithmetic operators that can also be prefix/postfix - must be followed by ( for functor use
- ARITH_OP_FUNCTOR.30: /\+/ | /-/ | /\*/ | /\//
+ operator_functor: INFIX_OP_FUNCTOR | ARITH_OP_FUNCTOR | COMPARISON_OP_FUNCTOR | OP_SYMBOL
+ // Infix operators like ;, |, ,, ->, etc. - only match when followed by (
+ // Higher priority (36) ensures these match when followed by (
+ INFIX_OP_FUNCTOR.36: /;(?=\()/ | /\|(?=\()/ | /,(?=\()/ | /->(?=\()/ | /:(?=\()/ | /=(?=\()/ | /<(?=\()/ | />(?=\()/
+ // Control operators - only match when followed by (
+ CONTROL_OP_FUNCTOR.36: /\\+(?=\()/
+ // Comparison operators with lookahead - only match when followed by (
+ COMPARISON_OP_FUNCTOR.36: /=:=(?=\()/ | /=\\=(?=\()/ | /=<(?=\()/ | />=(?=\()/ | /=@=(?=\()/ | /\\=@=(?=\()/ | /==(?=\()/ | /\\==(?=\()/
+ // Arithmetic operators - only match when followed by (
+ // This allows +(a,b) to be parsed as operator_compound instead of prefix operator
+ ARITH_OP_FUNCTOR.36: /\+(?=\()/ | /-(?=\()/ | /\*(?=\()/ | /\/(?=\()/
 // Parenthesized operator as atom: (;), (|), (,), (->), etc.
- operator_as_atom: INFIX_OP_FUNCTOR | CONTROL_OP_FUNCTOR | COMPARISON_OP_FUNCTOR | ARITH_OP_FUNCTOR | OP_SYMBOL
- operator_atom: OPERATOR_ATOM
- args: term ("," term)*
+ // Lower priority (34) so functor patterns match first when followed by (
+ INFIX_OP_ATOM.34: /;/ | /\|/ | /,/ | /->/ | /:/ | /=/ | /\+/ | /-/ | /\*/ | /\// | /</ | />/ | /=:=/ | /=\\=/ | /=</ | />=/ | /=@=/ | /\\=@=/ | /==/ | /\\==/
+ operator_as_atom: INFIX_OP_ATOM
+ operator_atom: OPERATOR_ATOM | operator_table_token
+ // args uses 'arg' (not 'term') so comma is treated as separator, not operator
+ args: arg ("," arg)*
 curly_braces: "{" term "}"
- DCG_ARROW.35: "-->"
- OPERATOR_ATOM.35: ":-"
+ DCG_ARROW.36: "-->"
+ OPERATOR_ATOM.36: ":-"
 list: "[" "]" -> empty_list
 | "[" list_items "]" -> list_items_only
@@ -208,7 +216,7 @@
 list_items: term ("," term)*
 string: STRING
- atom: ATOM | SPECIAL_ATOM | SPECIAL_ATOM_OPS | OP_SYMBOL
+ atom: ATOM | SPECIAL_ATOM | SPECIAL_ATOM_OPS | OP_SYMBOL | DOT_ATOM
 variable: VARIABLE
 char_code: CHAR_CODE
 number: NUMBER
@@ -221,11 +229,11 @@ class PredicatePropertyDirective:
 // following tokens like the clause terminator. 
CHAR_CODE.5: /0'(?:''|\\x[0-9A-Fa-f]+\\?|\\u[0-9A-Fa-f]{4}|\\[0-7]{1,3}|\\[abdefnrstvs\\\"']|[^'\\])'?/ | /[1-9]\d*'.'/ - STRING: /(?s)"(\\.|[^"])*"/ - SPECIAL_ATOM: /(?s)'(\\.|''|[^'])*'/ + STRING: /(?s:"(\\.|[^"])*")/ + SPECIAL_ATOM: /(?s:'(\\.|''|[^'])*')/ // Special atom operators must have HIGHEST priority to prevent being parsed as prefix operators - SPECIAL_ATOM_OPS.12: /-\$/ | /\$-/ + SPECIAL_ATOM_OPS.35: /-\$/ | /\$-/ // Scientific notation, hex, octal, binary, Edinburgh ' NUMBER.4: /-?0x[0-9a-fA-F]+/i @@ -395,6 +403,15 @@ def _collect_comma_terms(self, compound): else: return [compound] + def _rebuild_comma_chain(self, terms): + """Rebuild a right-associative comma chain from collected terms.""" + if not terms: + return None + result = terms[-1] + for term in reversed(terms[:-1]): + result = Compound(",", (term, result)) + return result + @v_args(meta=True) def fact(self, meta, items): return Clause(head=items[0], body=None, meta=meta) @@ -421,6 +438,22 @@ def goals(self, items): result.extend(self._collect_comma_terms(item)) else: result.append(item) + + # When a leading conjunction was captured as part of a disjunction, + # peel off the first goal to match the intended goal separation. + if len(result) == 1: + candidate = result[0] + if isinstance(candidate, Compound) and candidate.functor == ';': + left, right = candidate.args + if isinstance(left, (Compound, ParenthesizedComma)) and getattr(left, "functor", None) == ',': + leading_terms = self._collect_comma_terms(left) + if len(leading_terms) >= 2: + first_goal = leading_terms[0] + if isinstance(first_goal, List): + return result + rebuilt_left = self._rebuild_comma_chain(leading_terms[1:]) + if rebuilt_left is not None: + result = [first_goal, Compound(";", (rebuilt_left, right))] return result def term(self, items): @@ -586,6 +619,10 @@ def operator_as_atom(self, items): token_str = str(items[0]) return Atom(token_str) + def operator_table_token(self, items): + """Pass through dynamically generated operator tokens for atom handling.""" + return items[0] + def operator_functor(self, items): """Convert operator token to Atom for use as functor.""" token_str = str(items[0]) @@ -1499,7 +1536,21 @@ def _is_alphabetic_operator(name: str) -> bool: return name.isalpha() or (name.replace("_", "").isalnum() and name[0].isalpha()) -def _format_operator_literals(ops: Iterable[str]) -> str: +def _format_operator_literals( + ops: Iterable[str], *, exclude_before_paren: bool = False, is_infix: bool = False +) -> str: + """Format operator literals for Lark grammar. + + Args: + ops: Operator names to format + exclude_before_paren: If True, use negative lookahead to prevent matching + when the operator is immediately followed by '('. This is used for + prefix operators so that `+(a, b)` is parsed as operator_compound, + not as prefix + applied to parenthesized (a, b). + is_infix: If True, apply special handling for infix operators that could + be prefixes of multi-character operators (e.g., ':' shouldn't match + when followed by '-' to form ':-'). + """ # Sort longest operators first so sequences like "\\+" take precedence over # shorter prefixes such as "\\". formatted: list[str] = [] @@ -1508,6 +1559,16 @@ def _format_operator_literals(ops: Iterable[str]) -> str: # Alphabetic operators need word boundaries to prevent matching # inside atoms like 'indexed' matching 'in' + 'dexed' formatted.append(f'/(?' 
(forming '--' or '->') + formatted.append(r'/-(?![->\(])/') else: formatted.append(f'"{escape_for_lark(op)}"') return " | ".join(formatted) @@ -1527,13 +1588,19 @@ def _operator_token_priority(name: str) -> int: """ if any(ch.isalnum() or ch == "_" for ch in name): - return 0 + # Alphabetic operators need priority > ATOM.3 to be matched before ATOM + return 4 if len(name) == 1: - return 0 + # Single-character graphic operators (like '-') must outrank the + # generic arithmetic functor tokens so unary parsing keeps the + # operator table's precedence and associativity. + return 31 if name in {":-", "-->", "?-"}: - return 0 + # These special operators must outrank their single-character prefixes + # (: and -) which have priority 31. Use standard length-based formula. + return 30 + len(name) # Punctuation-only operators should outrank OP_SYMBOL (priority 25) and use # longest match to beat their own prefixes. @@ -1580,7 +1647,16 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: lambda: {"prefix": defaultdict(list), "infix": defaultdict(list), "postfix": defaultdict(list)} ) + # Operators with special syntactic roles in clause/directive structure. + # These are handled by dedicated grammar productions (rule, directive, dcg_rule) + # and should not be included as term-level operators. + # Note: ?- is NOT excluded because it's used as a prefix operator for queries + # and should be parseable within terms. + CLAUSE_STRUCTURE_OPS = {":-", "-->"} + for precedence, spec, name in operators: + if name in CLAUSE_STRUCTURE_OPS: + continue # Skip - handled by grammar template if spec in {"fx", "fy"}: grouped[precedence]["prefix"][spec].append(name) elif spec in {"xf", "yf"}: @@ -1590,6 +1666,7 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: rules: list[str] = [] tokens: list[str] = [] + operator_token_names: list[str] = [] lower_rule = "primary" precedence_levels = sorted(grouped.keys()) @@ -1603,13 +1680,38 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: prefix_specs = grouped[precedence]["prefix"] postfix_specs = grouped[precedence]["postfix"] + if prefix_specs.get("fx"): + for name in sorted(prefix_specs["fx"]): + token_counter += 1 + token = f"PREFIX_FX_{precedence}_{token_counter}" + priority = _operator_token_priority(name) + 1 + token_def = f"{token}.{priority}" if priority else token + # Use negative lookahead: prefix operators don't match when followed by ( + # This allows operator_compound to match +(a, b) as functor call + tokens.append(f" {token_def}: {_format_operator_literals([name], exclude_before_paren=True)}") + operator_token_names.append(token) + parts.append(f"{token} {lower_rule} -> prefix_fx") + if prefix_specs.get("fy"): + for name in sorted(prefix_specs["fy"]): + token_counter += 1 + token = f"PREFIX_FY_{precedence}_{token_counter}" + priority = _operator_token_priority(name) + 1 + token_def = f"{token}.{priority}" if priority else token + # Use negative lookahead: prefix operators don't match when followed by ( + # This allows operator_compound to match +(a, b) as functor call + tokens.append(f" {token_def}: {_format_operator_literals([name], exclude_before_paren=True)}") + operator_token_names.append(token) + parts.append(f"{token} {rule_name} -> prefix_fy") + if infix_specs.get("xfx"): for name in sorted(infix_specs["xfx"]): token_counter += 1 token = f"INFIX_XFX_{precedence}_{token_counter}" priority = _operator_token_priority(name) token_def = f"{token}.{priority}" if priority else token - 
tokens.append(f" {token_def}: {_format_operator_literals([name])}") + # Use is_infix=True to apply special handling for operators like ':' + tokens.append(f" {token_def}: {_format_operator_literals([name], is_infix=True)}") + operator_token_names.append(token) parts.append(f"{lower_rule} {token} {lower_rule} -> infix_xfx") if infix_specs.get("yfx"): for name in sorted(infix_specs["yfx"]): @@ -1617,7 +1719,9 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: token = f"INFIX_YFX_{precedence}_{token_counter}" priority = _operator_token_priority(name) token_def = f"{token}.{priority}" if priority else token - tokens.append(f" {token_def}: {_format_operator_literals([name])}") + # Use is_infix=True to apply special handling for operators like '-' + tokens.append(f" {token_def}: {_format_operator_literals([name], is_infix=True)}") + operator_token_names.append(token) parts.append(f"{rule_name} {token} {lower_rule} -> infix_yfx") if infix_specs.get("xfy"): for name in sorted(infix_specs["xfy"]): @@ -1625,7 +1729,9 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: token = f"INFIX_XFY_{precedence}_{token_counter}" priority = _operator_token_priority(name) token_def = f"{token}.{priority}" if priority else token - tokens.append(f" {token_def}: {_format_operator_literals([name])}") + # Use is_infix=True to apply special handling for operators like ':' + tokens.append(f" {token_def}: {_format_operator_literals([name], is_infix=True)}") + operator_token_names.append(token) parts.append(f"{lower_rule} {token} {rule_name} -> infix_xfy") if infix_specs.get("yfy"): for name in sorted(infix_specs["yfy"]): @@ -1633,26 +1739,11 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: token = f"INFIX_YFY_{precedence}_{token_counter}" priority = _operator_token_priority(name) token_def = f"{token}.{priority}" if priority else token - tokens.append(f" {token_def}: {_format_operator_literals([name])}") + # Use is_infix=True to apply special handling for operators like ':' + tokens.append(f" {token_def}: {_format_operator_literals([name], is_infix=True)}") + operator_token_names.append(token) parts.append(f"{rule_name} {token} {rule_name} -> infix_yfy") - if prefix_specs.get("fx"): - for name in sorted(prefix_specs["fx"]): - token_counter += 1 - token = f"PREFIX_FX_{precedence}_{token_counter}" - priority = _operator_token_priority(name) - token_def = f"{token}.{priority}" if priority else token - tokens.append(f" {token_def}: {_format_operator_literals([name])}") - parts.append(f"{token} {lower_rule} -> prefix_fx") - if prefix_specs.get("fy"): - for name in sorted(prefix_specs["fy"]): - token_counter += 1 - token = f"PREFIX_FY_{precedence}_{token_counter}" - priority = _operator_token_priority(name) - token_def = f"{token}.{priority}" if priority else token - tokens.append(f" {token_def}: {_format_operator_literals([name])}") - parts.append(f"{token} {rule_name} -> prefix_fy") - if postfix_specs.get("xf"): for name in sorted(postfix_specs["xf"]): token_counter += 1 @@ -1660,6 +1751,7 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: priority = _operator_token_priority(name) token_def = f"{token}.{priority}" if priority else token tokens.append(f" {token_def}: {_format_operator_literals([name])}") + operator_token_names.append(token) parts.append(f"{lower_rule} {token} -> postfix_xf") if postfix_specs.get("yf"): for name in sorted(postfix_specs["yf"]): @@ -1668,6 +1760,7 @@ def generate_operator_rules(operators: 
list[tuple[int, str, str]]) -> str: priority = _operator_token_priority(name) token_def = f"{token}.{priority}" if priority else token tokens.append(f" {token_def}: {_format_operator_literals([name])}") + operator_token_names.append(token) parts.append(f"{rule_name} {token} -> postfix_yf") rule_body = "\n | ".join(parts) @@ -1675,6 +1768,22 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: lower_rule = rule_name rules.append(f"?term: {lower_rule}") + + # Find the highest level below comma (precedence 1000) for argument parsing. + # Inside function arguments, comma is an argument separator, not the comma operator. + # So args should parse terms that exclude the comma operator. + arg_level = max((p for p in precedence_levels if p < 1000), default=None) + if arg_level is not None: + rules.append(f"?arg: level_{arg_level}") + else: + # No operators below comma, use primary + rules.append("?arg: primary") + + if operator_token_names: + token_union = " | ".join(operator_token_names) + else: + token_union = "OPERATOR_ATOM" + rules.append(f"operator_table_token: {token_union}") rules.extend(tokens) return "\n".join(rules) + "\n" @@ -1683,15 +1792,21 @@ def generate_operator_rules(operators: list[tuple[int, str, str]]) -> str: class PrologParser: """Parse Prolog source code with support for dynamic operators.""" - def __init__(self, operator_table=None): + def __init__(self, operator_table=None, *, parser_backend: str = "auto"): + if parser_backend not in {"auto", "lalr", "earley"}: + raise ValueError(f"Unsupported parser backend: {parser_backend}") self.operator_table = operator_table # Character conversion table: maps single chars to single chars # Initially identity (no conversions active) self._char_conversions: dict[str, str] = {} - self.parser = None - # Cache parsers keyed by module context and operator signature so we do - # not rebuild the Earley grammar for every clause/directive. - self._parser_cache: dict[tuple[str | None, tuple[tuple[int, str, str], ...]], Lark] = {} + self.parser: Lark | None = None + self._active_backend: str | None = None + self._preferred_backend = parser_backend + # Cache parsers keyed by module context, operator signature, and backend + # so we do not rebuild the grammar for every clause/directive. + self._parser_cache: dict[ + tuple[str | None, tuple[tuple[int, str, str], ...], str], Lark + ] = {} def set_char_conversion(self, from_char: str, to_char: str) -> None: """Set a character conversion. 
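
Taken together, the backend pinning in `__init__` and the `(module, operator signature, backend)` cache key can be exercised from user code. The following is a hedged sketch, not part of the patch: it assumes the `vibeprolog` package as patched in this diff, and it reads the private `_active_backend`/`_parser_cache` attributes purely for illustration (the PR's own tests inspect `_active_backend` the same way).

```python
# Sketch of expected backend-selection behaviour (assumes this patch applied).
from vibeprolog.parser import PrologParser

auto_parser = PrologParser()                  # parser_backend defaults to "auto"
auto_parser.parse("foo(X) :- X = 1.")         # auto mode builds LALR first
assert auto_parser._active_backend in {"lalr", "earley"}

pinned = PrologParser(parser_backend="earley")
pinned.parse("bar(a). bar(b).")               # pinned: no LALR attempt, no fallback
assert pinned._active_backend == "earley"
assert len(pinned._parser_cache) >= 1         # (module, operators, backend) entries

try:
    PrologParser(parser_backend="pyparsing")  # rejected up front
except ValueError as exc:
    print(exc)                                # Unsupported parser backend: pyparsing
```
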
@@ -1772,14 +1887,30 @@ def _apply_char_conversions(self, text: str) -> str: return ''.join(result) - def _create_parser(self, grammar: str): - return Lark( - grammar, - parser="earley", - propagate_positions=True, - ambiguity='resolve', - start=["start", "clause", "directive"], - ) + def _create_parser(self, grammar: str, *, backend: str) -> Lark: + if backend == "lalr": + parser = Lark( + grammar, + parser="lalr", + lexer="contextual", + propagate_positions=True, + maybe_placeholders=False, + start=["start", "clause", "directive"], + ) + elif backend == "earley": + parser = Lark( + grammar, + parser="earley", + lexer="basic", # Use basic lexer for proper longest-match tokenization + propagate_positions=True, + ambiguity="resolve", + start=["start", "clause", "directive"], + ) + else: + raise ValueError(f"Unknown parser backend: {backend}") + + parser.prolog_backend = backend + return parser def _base_operator_definitions( self, module_name: str | None = None @@ -1796,17 +1927,33 @@ def _build_grammar(self, operators: list[tuple[int, str, str]]) -> str: operator_rules = generate_operator_rules(operators) return PROLOG_GRAMMAR_TEMPLATE.replace("{UNICODE_LETTER_RANGES}", UNICODE_LETTER_RANGES).replace("__OPERATOR_GRAMMAR__", operator_rules) + def _prepare_grammar_for_backend(self, grammar: str, backend: str) -> str: + if backend != "lalr": + return grammar + + # For LALR, remove OP_SYMBOL from atom but keep DOT_ATOM for single dot atoms + return grammar.replace( + "atom: ATOM | SPECIAL_ATOM | SPECIAL_ATOM_OPS | OP_SYMBOL | DOT_ATOM", + "atom: ATOM | SPECIAL_ATOM | DOT_ATOM", + ) + def _parser_cache_key( - self, operators: list[tuple[int, str, str]], module_name: str | None - ) -> tuple[str | None, tuple[tuple[int, str, str], ...]]: - return (module_name, tuple(operators)) + self, + operators: list[tuple[int, str, str]], + module_name: str | None, + backend: str, + ) -> tuple[str | None, tuple[tuple[int, str, str], ...], str]: + return (module_name, tuple(operators), backend) def _ensure_parser( self, cleaned_text: str, directive_ops: list[tuple[int, str, str]] | None = None, module_name: str | None = None, + backend: str | None = None, ) -> None: + if backend is not None and backend not in {"auto", "lalr", "earley"}: + raise ValueError(f"Unsupported parser backend: {backend}") if directive_ops is None: try: directive_ops = extract_op_directives(cleaned_text) @@ -1815,13 +1962,38 @@ def _ensure_parser( operators = _merge_operators( self._base_operator_definitions(module_name), directive_ops ) - cache_key = self._parser_cache_key(operators, module_name) - cached_parser = self._parser_cache.get(cache_key) - if cached_parser is None: - grammar = self._build_grammar(operators) - cached_parser = self._create_parser(grammar) - self._parser_cache[cache_key] = cached_parser - self.parser = cached_parser + preferred_backend = backend or self._preferred_backend + backend_order = ( + ("lalr", "earley") if preferred_backend == "auto" else (preferred_backend,) + ) + + grammar: str | None = None + for candidate in backend_order: + cache_key = self._parser_cache_key(operators, module_name, candidate) + cached_parser = self._parser_cache.get(cache_key) + if cached_parser is not None: + self.parser = cached_parser + self._active_backend = candidate + return + + if grammar is None: + grammar = self._build_grammar(operators) + + try: + candidate_grammar = self._prepare_grammar_for_backend( + grammar, candidate + ) + parser = self._create_parser(candidate_grammar, backend=candidate) + 
self._parser_cache[cache_key] = parser + self.parser = parser + self._active_backend = candidate + return + except GrammarError: + if preferred_backend != "auto": + raise + continue + + raise GrammarError("Could not build a parser with any available backend") def _strip_block_comments(self, text: str) -> tuple[str, list[tuple[int, str]]]: """Strip block comments from text, handling nesting and quoted strings. @@ -1990,6 +2162,16 @@ def _associate_pldoc_comments(self, items: list, comments: list[tuple[int, str]] payload.doc = last_comment_text last_comment_text = None + def _with_fallback(self, do_parse: Callable[[str | None], Any]): + """Invoke a parsing function and fall back to Earley when appropriate.""" + + try: + return do_parse(None) + except (UnexpectedToken, UnexpectedCharacters, GrammarError): + if self._preferred_backend == "auto" and self._active_backend == "lalr": + return do_parse("earley") + raise + def parse( self, text: str, @@ -2013,42 +2195,44 @@ def parse( if apply_char_conversions: text = self._apply_char_conversions(text) cleaned_text, pldoc_comments = self._collect_pldoc_comments(text) - self._ensure_parser(cleaned_text, directive_ops, module_name) - transformer = PrologTransformer() parsed_items: list[Clause | Directive] = [] - # Parse each clause/directive separately to avoid Earley state explosion - # on large files with many clauses. - statements = tokenize_prolog_statements(cleaned_text) - search_pos = 0 - for statement in statements: - # Track position of the statement within the cleaned text so that - # PlDoc association via start_pos still works. - stmt_pos = cleaned_text.find(statement, search_pos) - if stmt_pos == -1: - raise RuntimeError( - f"Could not re-locate tokenized statement in text: {statement!r}" - ) - search_pos = stmt_pos + len(statement) + def parse_with_backend(backend_override: str | None) -> list[Clause | Directive]: + self._ensure_parser(cleaned_text, directive_ops, module_name, backend_override) + transformer = PrologTransformer() + items: list[Clause | Directive] = [] + + statements = tokenize_prolog_statements(cleaned_text) + search_pos = 0 + for statement in statements: + stmt_pos = cleaned_text.find(statement, search_pos) + if stmt_pos == -1: + raise RuntimeError( + f"Could not re-locate tokenized statement in text: {statement!r}" + ) + search_pos = stmt_pos + len(statement) + + stripped = statement.lstrip() + start_rule = "directive" if stripped.startswith(":-") else "clause" + try: + tree = self.parser.parse(statement, start=start_rule) + except LarkError: + tree = self.parser.parse(statement, start="start") + transformed = transformer.transform(tree) + if isinstance(transformed, list): + parsed = transformed + else: + parsed = [transformed] - stripped = statement.lstrip() - start_rule = "directive" if stripped.startswith(":-") else "clause" - try: - tree = self.parser.parse(statement, start=start_rule) - except LarkError: - tree = self.parser.parse(statement, start="start") - transformed = transformer.transform(tree) - if isinstance(transformed, list): - parsed = transformed - else: - parsed = [transformed] - - items = [self._fold_numeric_unary_minus(item) for item in parsed] - for item in items: - meta = getattr(item, "meta", None) - if meta is not None and hasattr(meta, "start_pos"): - meta.start_pos += stmt_pos - parsed_items.extend(items) + folded = [self._fold_numeric_unary_minus(item) for item in parsed] + for item in folded: + meta = getattr(item, "meta", None) + if meta is not None and hasattr(meta, "start_pos"): + 
meta.start_pos += stmt_pos + items.extend(folded) + return items + + parsed_items = self._with_fallback(parse_with_backend) # Associate PlDoc comments with items self._associate_pldoc_comments(parsed_items, pldoc_comments) return parsed_items @@ -2122,15 +2306,22 @@ def parse_term( if apply_char_conversions: clause_text = self._apply_char_conversions(clause_text) cleaned_text, _ = self._collect_pldoc_comments(clause_text) - self._ensure_parser(cleaned_text) - tree = self.parser.parse(cleaned_text, start="start") transformer = PrologTransformer() - result = transformer.transform(tree) - if result and isinstance(result[0], Clause): - compound = result[0].head - if isinstance(compound, Compound) and compound.args: - return compound.args[0] - raise ValueError(f"Failed to parse term: {text}") + + def parse_term_with_backend(backend_override: str | None): + self._ensure_parser(cleaned_text, backend=backend_override) + tree = self.parser.parse(cleaned_text, start="start") + transformed = transformer.transform(tree) + if transformed and isinstance(transformed[0], Clause): + compound = transformed[0].head + if isinstance(compound, Compound) and compound.args: + return compound.args[0] + raise ValueError(f"Failed to parse term: {text}") + + return self._with_fallback(parse_term_with_backend) + except GrammarError as e: + error_term = PrologError.syntax_error(str(e), context) + raise PrologThrow(error_term) except LarkError as e: # Convert Lark parse error to Prolog syntax_error error_term = PrologError.syntax_error(str(e), context)
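
With the `_with_fallback` wiring above, callers never see raw Lark exceptions from term parsing. A hedged end-to-end sketch (assuming `parse_term` accepts a bare term string as the hunk above suggests; the malformed input is illustrative):

```python
# Sketch: LALR-first parsing with automatic Earley retry; unrecoverable
# failures surface as a Prolog syntax_error thrown via PrologThrow.
from vibeprolog.parser import PrologParser
from vibeprolog.exceptions import PrologThrow

parser = PrologParser()                   # "auto": LALR first, Earley on demand
term = parser.parse_term("point(1, 2)")   # stays on the fast LALR path
print(term)

try:
    parser.parse_term("point(1,")         # unbalanced: both backends fail
except PrologThrow as exc:
    print("syntax error term:", exc)      # converted from the Lark error
```
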