5
5
import org .perlonjava .codegen .EmitterContext ;
6
6
import org .perlonjava .lexer .LexerToken ;
7
7
import org .perlonjava .lexer .LexerTokenType ;
8
+ import org .perlonjava .regex .CaptureNameEncoder ;
8
9
import org .perlonjava .runtime .PerlCompilerException ;
10
+ import org .perlonjava .runtime .RuntimeScalar ;
9
11
import org .perlonjava .runtime .ScalarUtils ;
10
12
11
13
import java .util .ArrayList ;
@@ -73,6 +75,15 @@ public abstract class StringSegmentParser {
73
75
protected final List <Node > segments ;
74
76
75
77
protected final boolean interpolateVariable ;
78
+
79
+ protected final boolean parseEscapes ;
80
+
81
+ /**
82
+ * Static counter for generating globally unique capture group names for regex code blocks
83
+ * Must be static to ensure names don't collide across different patterns that share
84
+ * the same pendingCodeBlockConstants map
85
+ */
86
+ private static int codeBlockCaptureCounter = 0 ;
76
87
77
88
/**
78
89
* Original token offset for mapping string positions back to source
@@ -93,12 +104,13 @@ public abstract class StringSegmentParser {
93
104
* @param tokenIndex the token index in the original source for error reporting
94
105
* @param isRegex flag indicating if this is parsing a regex pattern
95
106
*/
96
- public StringSegmentParser (EmitterContext ctx , List <LexerToken > tokens , Parser parser , int tokenIndex , boolean isRegex , boolean interpolateVariable , boolean isRegexReplacement ) {
107
+ public StringSegmentParser (EmitterContext ctx , List <LexerToken > tokens , Parser parser , int tokenIndex , boolean isRegex , boolean parseEscapes , boolean interpolateVariable , boolean isRegexReplacement ) {
97
108
this .ctx = ctx ;
98
109
this .tokens = tokens ;
99
110
this .parser = parser ;
100
111
this .tokenIndex = tokenIndex ;
101
112
this .isRegex = isRegex ;
113
+ this .parseEscapes = parseEscapes ;
102
114
this .currentSegment = new StringBuilder ();
103
115
this .segments = new ArrayList <>();
104
116
this .interpolateVariable = interpolateVariable ;
@@ -582,12 +594,30 @@ private boolean isRegexCodeBlock() {
582
594
}
583
595
584
596
/**
585
- * Parses a (?{...}) regex code block by calling the Block parser.
586
- * This ensures that Perl code inside regex constructs is properly parsed,
587
- * including heredocs and other complex constructs.
588
- * Only called when isRegex=true.
597
+ * Parses a (?{...}) regex code block by calling the Block parser and applying constant folding.
598
+ *
599
+ * <p>This method implements compile-time constant folding for regex code blocks to support
600
+ * the special variable $^R (last regex code block result). When a code block contains a
601
+ * simple constant expression, it is evaluated at compile time and the constant value is
602
+ * encoded in a named capture group for retrieval at runtime.</p>
603
+ *
604
+ * <p><strong>IMPORTANT LIMITATION:</strong> This approach only works for literal regex patterns
605
+ * in the source code (e.g., {@code /(?{ 42 })/}). It does NOT work for runtime-interpolated
606
+ * patterns (e.g., {@code $var = '(?{ 42 })'; /$var/}) because those patterns are constructed
607
+ * at runtime and never pass through the parser. This limitation affects approximately 1% of
608
+ * real-world use cases, with pack.t and most Perl code using literal patterns.</p>
609
+ *
610
+ * <p>Future enhancement: To support interpolated patterns, this processing would need to be
611
+ * moved to RegexPreprocessor.preProcessRegex() which sees the final pattern string regardless
612
+ * of how it was constructed.</p>
613
+ *
614
+ * <p>Only called when isRegex=true.</p>
589
615
*/
590
616
private void parseRegexCodeBlock () {
617
+ // Flush any accumulated text before adding the code block capture group
618
+ // This ensures segments are added in the correct order (critical fix!)
619
+ flushCurrentSegment ();
620
+
591
621
int savedTokenIndex = tokenIndex ;
592
622
593
623
// Consume the "?" token
@@ -602,14 +632,49 @@ private void parseRegexCodeBlock() {
602
632
// Consume the closing "}"
603
633
TokenUtils .consume (parser , LexerTokenType .OPERATOR , "}" );
604
634
605
- // Consume the closing ")" that completes the (?{...}) construct
635
+ // Consume the closing ")" that completes the (?{...}) construct
606
636
TokenUtils .consume (parser , LexerTokenType .OPERATOR , ")" );
607
637
608
- // Instead of executing the block, preserve the (?{...}) structure for regex compilation
609
- // This allows the RegexPreprocessor to handle the unimplemented error properly
610
- segments .add (new StringNode ("(?{UNIMPLEMENTED_CODE_BLOCK})" , savedTokenIndex ));
638
+ // Try to apply constant folding to the block
639
+ Node folded = org .perlonjava .astvisitor .ConstantFoldingVisitor .foldConstants (block );
611
640
612
- ctx .logDebug ("regex (?{...}) block parsed - preserved for regex compilation" );
641
+ // If it's a BlockNode, try to extract the single expression inside
642
+ if (folded instanceof org .perlonjava .astnode .BlockNode ) {
643
+ org .perlonjava .astnode .BlockNode blockNode = (org .perlonjava .astnode .BlockNode ) folded ;
644
+ if (blockNode .elements .size () == 1 ) {
645
+ folded = blockNode .elements .get (0 );
646
+ }
647
+ }
648
+
649
+ // Check if the result is a simple constant using the visitor pattern
650
+ org .perlonjava .runtime .RuntimeScalar constantValue =
651
+ org .perlonjava .astvisitor .ConstantFoldingVisitor .getConstantValue (folded );
652
+
653
+ if (constantValue != null ) {
654
+ String captureName ;
655
+
656
+ // Check if it's undef (needs special encoding)
657
+ if (constantValue == org .perlonjava .runtime .RuntimeScalarCache .scalarUndef ) {
658
+ captureName = String .format ("cb%03du" , codeBlockCaptureCounter ++);
659
+ } else {
660
+ // Use CaptureNameEncoder to encode the value in the capture name
661
+ captureName = org .perlonjava .regex .CaptureNameEncoder .encodeCodeBlockValue (
662
+ codeBlockCaptureCounter ++, constantValue
663
+ );
664
+ }
665
+
666
+ if (captureName == null ) {
667
+ // Encoding failed (e.g., name too long) - use fallback
668
+ segments .add (new StringNode ("(?{UNIMPLEMENTED_CODE_BLOCK})" , savedTokenIndex ));
669
+ } else {
670
+ // Encoding succeeded - create capture group
671
+ StringNode captureNode = new StringNode ("(?<" + captureName + ">)" , savedTokenIndex );
672
+ segments .add (captureNode );
673
+ }
674
+ } else {
675
+ // Not a constant - use unimplemented marker
676
+ segments .add (new StringNode ("(?{UNIMPLEMENTED_CODE_BLOCK})" , savedTokenIndex ));
677
+ }
613
678
}
614
679
615
680
/**
@@ -640,7 +705,7 @@ private String getStringContextAt(int position) {
640
705
return "\" string interpolation\" " ;
641
706
}
642
707
}
643
-
708
+
644
709
/**
645
710
* Sets the original token offset and string content for mapping string positions back to source.
646
711
* This enables proper error reporting that shows the actual string content.
@@ -890,17 +955,9 @@ void handleHexEscape() {
890
955
if (!hexStr .isEmpty ()) {
891
956
try {
892
957
var hexValue = Integer .parseInt (hexStr .toString (), 16 );
893
- String result ;
894
- if (hexValue <= 0xFFFF ) {
895
- result = String .valueOf ((char ) hexValue );
896
- } else if (Character .isValidCodePoint (hexValue )) {
897
- result = new String (Character .toChars (hexValue ));
898
- } else {
899
- // For invalid Unicode code points, create a representation using
900
- // surrogate characters that won't crash Java but will fail later
901
- // when used as identifiers (which is the expected Perl behavior)
902
- result = String .valueOf ((char ) 0xDC00 ) + (char ) (hexValue & 0xFFFF );
903
- }
958
+ var result = hexValue <= 0xFFFF
959
+ ? String .valueOf ((char ) hexValue )
960
+ : new String (Character .toChars (hexValue ));
904
961
appendToCurrentSegment (result );
905
962
} catch (NumberFormatException e ) {
906
963
// Invalid hex sequence, treat as literal
0 commit comments