Skip to content

Commit

Permalink
Factor positive lookaheads better into find optimizations (#112107)
Browse files Browse the repository at this point in the history
* Factor positive lookaheads better into find optimizations

A positive lookahead at the start of a pattern can be used for determining find optimizations even when the non-zero-width portions of the pattern aren't. This helps particularly in cases where the positive lookahead contains an anchor or a literal.

Also extends the existing alternation reduction optimization to factor out anchors that begin every branch of an alternation.
  • Loading branch information
stephentoub authored Feb 5, 2025
1 parent bcf880c commit 2faef6d
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 22 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#if SYSTEM_TEXT_REGULAREXPRESSIONS
using System.Buffers;
#endif
using System.Collections.Generic;
using System.Diagnostics;

Expand All @@ -10,28 +12,55 @@ namespace System.Text.RegularExpressions
/// <summary>Contains state and provides operations related to finding the next location a match could possibly begin.</summary>
internal sealed class RegexFindOptimizations
{
/// <summary>True if the input should be processed right-to-left rather than left-to-right.</summary>
private readonly bool _rightToLeft;
/// <summary>Lookup table used for optimizing ASCII when doing set queries.</summary>
private readonly uint[]?[]? _asciiLookups;

public RegexFindOptimizations(RegexNode root, RegexOptions options)
public static RegexFindOptimizations Create(RegexNode root, RegexOptions options)
{
RegexFindOptimizations opts = new(root, options, isLeadingPartial: false);

if ((options & RegexOptions.RightToLeft) == 0 &&
!opts.IsUseful &&
RegexPrefixAnalyzer.FindLeadingPositiveLookahead(root) is RegexNode positiveLookahead)
{
RegexFindOptimizations positiveLookaheadOpts = new(positiveLookahead.Child(0), options, isLeadingPartial: true);

// Fixups to incorporate relevant information from the original optimizations.
// - If the original has a larger minimum length than the lookahead, use it. Lookaheads don't currently factor into
// the computation of the minimum as it complicates the logic due to them possibly overlapping with other portions.
// - Use whatever max came from the original, if any. We shouldn't have computed a max for the lookahead because
// it's partial.
positiveLookaheadOpts.MinRequiredLength = Math.Max(opts.MinRequiredLength, positiveLookaheadOpts.MinRequiredLength);
positiveLookaheadOpts.MaxPossibleLength = opts.MaxPossibleLength;

opts = positiveLookaheadOpts;
}

return opts;
}

/// <summary>Creates optimization information for searching with the pattern represented by <paramref name="root"/>.</summary>
/// <param name="root">The root of the pattern node tree.</param>
/// <param name="options">Options used when creating the regex.</param>
/// <param name="isLeadingPartial">true if <paramref name="root"/> may not represent the whole pattern, only a leading node in it.</param>
private RegexFindOptimizations(RegexNode root, RegexOptions options, bool isLeadingPartial)
{
_rightToLeft = (options & RegexOptions.RightToLeft) != 0;
bool rightToLeft = (options & RegexOptions.RightToLeft) != 0;
Debug.Assert(!isLeadingPartial || !rightToLeft, "RightToLeft unexpected when isLeadingPartial");

MinRequiredLength = root.ComputeMinLength();

// Compute any anchor starting the expression. If there is one, we won't need to search for anything,
// as we can just match at that single location.
LeadingAnchor = RegexPrefixAnalyzer.FindLeadingAnchor(root);
if (_rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
if (rightToLeft && LeadingAnchor == RegexNodeKind.Bol)
{
// Filter out Bol for RightToLeft, as we don't currently optimize for it.
LeadingAnchor = RegexNodeKind.Unknown;
}
if (LeadingAnchor is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.EndZ or RegexNodeKind.End)
{
FindMode = (LeadingAnchor, _rightToLeft) switch
FindMode = (LeadingAnchor, rightToLeft) switch
{
(RegexNodeKind.Beginning, false) => FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning,
(RegexNodeKind.Beginning, true) => FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning,
Expand All @@ -47,7 +76,8 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)

// Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length
// for the whole expression, we can use that to quickly jump to the right location in the input.
if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL
if (!rightToLeft && // haven't added FindNextStartingPositionMode trailing anchor support for RTL
!isLeadingPartial) // trailing anchors in a partial root aren't relevant
{
TrailingAnchor = RegexPrefixAnalyzer.FindTrailingAnchor(root);
if (TrailingAnchor is RegexNodeKind.End or RegexNodeKind.EndZ &&
Expand All @@ -70,7 +100,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
if (prefix.Length > 1)
{
LeadingPrefix = prefix;
FindMode = _rightToLeft ?
FindMode = rightToLeft ?
FindNextStartingPositionMode.LeadingString_RightToLeft :
FindNextStartingPositionMode.LeadingString_LeftToRight;
return;
Expand All @@ -89,7 +119,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
// more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter
// we focus only on creating a set for the first character. Same for right-to-left, which is used very
// rarely and thus we don't need to invest in special-casing it.
if (_rightToLeft)
if (rightToLeft)
{
// Determine a set for anything that can possibly start the expression.
if (RegexPrefixAnalyzer.FindFirstCharClass(root) is string charClass)
Expand Down Expand Up @@ -253,21 +283,21 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;

/// <summary>Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
public RegexNodeKind LeadingAnchor { get; }
public RegexNodeKind LeadingAnchor { get; private set; }

/// <summary>Gets the trailing anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
public RegexNodeKind TrailingAnchor { get; }

/// <summary>Gets the minimum required length an input need be to match the pattern.</summary>
/// <remarks>0 is a valid minimum length. This value may also be the max (and hence fixed) length of the expression.</remarks>
public int MinRequiredLength { get; }
public int MinRequiredLength { get; private set; }

/// <summary>The maximum possible length an input could be to match the pattern.</summary>
/// <remarks>
/// This is currently only set when <see cref="TrailingAnchor"/> is found to be an end anchor.
/// That can be expanded in the future as needed.
/// </remarks>
public int? MaxPossibleLength { get; }
public int? MaxPossibleLength { get; private set; }

/// <summary>Gets the leading prefix. May be an empty string.</summary>
public string LeadingPrefix { get; } = string.Empty;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -940,7 +940,7 @@ private RegexNode ReduceAlternation()
node = ExtractCommonPrefixText(node);
if (node.Kind == RegexNodeKind.Alternate)
{
node = ExtractCommonPrefixOneNotoneSet(node);
node = ExtractCommonPrefixNode(node);
if (node.Kind == RegexNodeKind.Alternate)
{
node = RemoveRedundantEmptiesAndNothings(node);
Expand Down Expand Up @@ -1072,7 +1072,7 @@ void ReduceSingleLetterAndNestedAlternations()
// This function optimizes out prefix nodes from alternation branches that are
// the same across multiple contiguous branches.
// e.g. \w12|\d34|\d56|\w78|\w90 => \w12|\d(?:34|56)|\w(?:78|90)
static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
static RegexNode ExtractCommonPrefixNode(RegexNode alternation)
{
Debug.Assert(alternation.Kind == RegexNodeKind.Alternate);
Debug.Assert(alternation.Children is List<RegexNode> { Count: >= 2 });
Expand All @@ -1097,7 +1097,7 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
{
Debug.Assert(children[startingIndex].Children is List<RegexNode> { Count: >= 2 });

// Only handle the case where each branch begins with the same One, Notone, or Set (individual or loop).
// Only handle the case where each branch begins with the same One, Notone, Set (individual or loop), or Anchor.
// Note that while we can do this for individual characters, fixed length loops, and atomic loops, doing
// it for non-atomic variable length loops could change behavior as each branch could otherwise have a
// different number of characters consumed by the loop based on what's after it.
Expand All @@ -1107,6 +1107,10 @@ static RegexNode ExtractCommonPrefixOneNotoneSet(RegexNode alternation)
case RegexNodeKind.One or RegexNodeKind.Notone or RegexNodeKind.Set:
case RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloopatomic:
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when required.M == required.N:
case RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol
or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Eol
or RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary
or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
break;

default:
Expand Down
Loading

0 comments on commit 2faef6d

Please sign in to comment.