Factor positive lookaheads better into find optimizations (#112107)

* Factor positive lookaheads better into find optimizations: A positive lookahead at the start of a pattern can be used for determining find optimizations even when the non-zero-width portions of the pattern can't be. This helps particularly in cases where the positive lookahead contains an anchor or a literal. Also extends the existing alternation reduction optimization to factor out anchors that begin every branch of an alternation.
dotnet · Feb 5, 2025 · 2faef6d · 2faef6d
1 parent bcf880c
commit 2faef6d
Show file tree

Hide file tree

Showing 6 changed files with 235 additions and 22 deletions.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -1,7 +1,9 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+#if SYSTEM_TEXT_REGULAREXPRESSIONS
 using System.Buffers;
+#endif
 using System.Collections.Generic;
 using System.Diagnostics;
 
@@ -10,28 +12,55 @@ namespace System.Text.RegularExpressions
     /// <summary>Contains state and provides operations related to finding the next location a match could possibly begin.</summary>
     internal sealed class RegexFindOptimizations
     {
-        /// <summary>True if the input should be processed right-to-left rather than left-to-right.</summary>
-        private readonly bool _rightToLeft;
         /// <summary>Lookup table used for optimizing ASCII when doing set queries.</summary>
         private readonly uint[]?[]? _asciiLookups;
 
-        public RegexFindOptimizations(RegexNode root, RegexOptions options)
+        public static RegexFindOptimizations Create(RegexNode root, RegexOptions options)
+        {
+            RegexFindOptimizations opts = new(root, options, isLeadingPartial: false);
+
+            if ((options & RegexOptions.RightToLeft) == 0 &&
+                !opts.IsUseful &&
+                RegexPrefixAnalyzer.FindLeadingPositiveLookahead(root) is RegexNode positiveLookahead)
+            {
+                RegexFindOptimizations positiveLookaheadOpts = new(positiveLookahead.Child(0), options, isLeadingPartial: true);
+
+                // Fixups to incorporate relevant information from the original optimizations.
+                // - If the original has a larger minimum length than the lookahead, use it. Lookaheads don't currently factor into
+                //   the computation of the minimum as it complicates the logic due to them possibly overlapping with other portions.
+                // - Use whatever max came from the original, if any. We shouldn't have computed a max for the lookahead because
+                //   it's partial.
+                positiveLookaheadOpts.MinRequiredLength = Math.Max(opts.MinRequiredLength, positiveLookaheadOpts.MinRequiredLength);
+                positiveLookaheadOpts.MaxPossibleLength = opts.MaxPossibleLength;
+
+                opts = positiveLookaheadOpts;
+            }
+
+            return opts;
+        }
+
+        /// <summary>Creates optimization information for searching with the pattern represented by <paramref name="root"/>.</summary>
+        /// <param name="root">The root of the pattern node tree.</param>
+        /// <param name="options">Options used when creating the regex.</param>
+        /// <param name="isLeadingPartial">true if <paramref name="root"/> may not represent the whole pattern, only a leading node in it.</param>
+        private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLeadingPartial)
         {
-            _rightToLeft = (options & RegexOptions.RightToLeft) != 0;
+            bool rightToLeft = (options & RegexOptions.RightToLeft) != 0;
+            Debug.Assert(!isLeadingPartial || !rightToLeft, "RightToLeft unexpected when isLeadingPartial");
 
             MinRequiredLength = root.ComputeMinLength();
 
             // Compute any anchor starting the expression.  If there is one, we won't need to search for anything,
             // as we can just match at that single location.
             LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root);
-            if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
+            if (rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
             {
                 // Filter out Bol for RightToLeft, as we don't currently optimize for it.
                 LeadingAnchor = RegexNodeKind.Unknown;
             }
             if (LeadingAnchor is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.EndZ or RegexNodeKind.End)
             {
-                FindMode = (LeadingAnchor, _rightToLeft) switch
+                FindMode = (LeadingAnchor, rightToLeft) switch
                 {
                     (RegexNodeKind.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning,
                     (RegexNodeKind.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning,
@@ -47,7 +76,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
 
             // Compute any anchor trailing the expression.  If there is one, and we can also compute a fixed length
             // for the whole expression, we can use that to quickly jump to the right location in the input.
-            if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL
+            if (!rightToLeft && // haven't added FindNextStartingPositionMode trailing anchor support for RTL
+                !isLeadingPartial) // trailing anchors in a partial root aren't relevant
             {
                 TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root);
                 if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ &&
@@ -70,7 +100,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
             if (prefix.Length > 1)
             {
                 LeadingPrefix = prefix;
-                FindMode = _rightToLeft ?
+                FindMode = rightToLeft ?
                     FindNextStartingPositionMode.LeadingString_RightToLeft :
                     FindNextStartingPositionMode.LeadingString_LeftToRight;
                 return;
@@ -89,7 +119,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
             // more expensive; someone who wants to pay to do more work can specify Compiled.  So for the interpreter
             // we focus only on creating a set for the first character.  Same for right-to-left, which is used very
             // rarely and thus we don't need to invest in special-casing it.
-            if (_rightToLeft)
+            if (rightToLeft)
             {
                 // Determine a set for anything that can possibly start the expression.
                 if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
@@ -253,21 +283,21 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
         public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;
 
         /// <summary>Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
-        public RegexNodeKind LeadingAnchor { get; }
+        public RegexNodeKind LeadingAnchor { get; private set; }
 
         /// <summary>Gets the trailing anchor (e.g. RegexNodeKind.End) if one exists and was computed.</summary>
         public RegexNodeKind TrailingAnchor { get; }
 
         /// <summary>Gets the minimum length an input must be in order to match the pattern.</summary>
         /// <remarks>0 is a valid minimum length.  This value may also be the max (and hence fixed) length of the expression.</remarks>
-        public int MinRequiredLength { get; }
+        public int MinRequiredLength { get; private set; }
 
         /// <summary>The maximum possible length an input could be to match the pattern.</summary>
         /// <remarks>
         /// This is currently only set when <see cref="TrailingAnchor"/> is found to be an end anchor.
         /// That can be expanded in the future as needed.
         /// </remarks>
-        public int? MaxPossibleLength { get; }
+        public int? MaxPossibleLength { get; private set; }
 
         /// <summary>Gets the leading prefix.  May be an empty string.</summary>
         public string LeadingPrefix { get; } = string.Empty;

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -940,7 +940,7 @@ private RegexNode ReduceAlternation()
                         node = ExtractCommonPrefixText(node);
                         if (node.Kind == RegexNodeKind.Alternate)
                         {
-                            node = ExtractCommonPrefixOneNotoneSet(node);
+                            node = ExtractCommonPrefixNode(node);
                             if (node.Kind == RegexNodeKind.Alternate)
                             {
                                 node = RemoveRedundantEmptiesAndNothings(node);
@@ -1072,7 +1072,7 @@ void ReduceSingleLetterAndNestedAlternations()
             // This function optimizes out prefix nodes from alternation branches that are
             // the same across multiple contiguous branches.
             // e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
-            static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
+            static RegexNode ExtractCommonPrefixNode(RegexNode alternation)
             {
                 Debug.Assert(alternation.Kind == RegexNodeKind.Alternate);
                 Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
@@ -1097,7 +1097,7 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
                 {
                     Debug.Assert(children[startingIndex].Children is List<RegexNode> { Count: >= 2 });
 
-                    // Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop).
+                    // Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor.
                     // Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
                     // it for non-atomic variable length loops could change behavior as each branch could otherwise have a
                     // different number of characters consumed by the loop based on what's after it.
@@ -1107,6 +1107,10 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
                         case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set:
                         case RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic:
                         case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when required.M == required.N:
+                        case RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol
+                             or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Eol
+                             or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
+                             or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
                             break;
 
                         default: