Fix a few recent perf regressions in Regex (#78003)

* Split TryFindNextStartingPosition into LeftToRight and RightToLeft versions A previous change increased the size of the method. Splitting it to reduce the size (and how many locals needed to be zerod) as well as to reduce some branching. * Avoid generating match-all code for starting "any" sets * Avoid using IndexOfAnyExcept('\n') for TryFindNextPossibleStartingPosition It's common at the beginning of a pattern and it's rare to have long sequences of newlines that benefit from IndexOf. Also tweak how we prioritize sets; if two sets have ranges, prefer the one whose range is less inclusive / has a lower chance of matching (based purely on count). * Address PR feedback
author: Stephen Toub <stoub@microsoft.com> 2022-11-10 15:18:51 +0300
committer: GitHub <noreply@github.com> 2022-11-10 15:18:51 +0300
commit: 1205538e7a619e294fec7dc059df66072d302462 (patch)
tree: 409c9b18663432e60da8f309b2300c1ea5578c18
parent: 21700ffcb647081207aff9cba00eb59f976c061e (diff)
7 files changed, 201 insertions, 152 deletions
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index 04ec466a090..9e08b3ee1d2 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -779,8 +779,12 @@ namespace System.Text.RegularExpressions.Generator
 
                 // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix.
                 // We can use it if this is a case-sensitive class with a small number of characters in the class.
+                // We avoid using it for the relatively common case of the starting set being '.', aka anything other than
+                // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
                 int setIndex = 0;
-                bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null;
+                bool canUseIndexOf =
+                    primarySet.Set != RegexCharClass.NotNewLineClass &&
+                    (primarySet.Chars is not null || primarySet.Range is not null);
                 bool needLoop = !canUseIndexOf || setsToUse > 1;
 
                 FinishEmitBlock loopBlock = default;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index aa60e511742..70d2e6a7261 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -97,6 +97,8 @@ namespace System.Text.RegularExpressions
         internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitRanges;
         internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitRanges;
 
+        internal const string NotNewLineClass = "\x01\x02\x00\x0A\x0B";
+
         internal const string AnyClass = "\x00\x01\x00\x00";
         private const string EmptyClass = "\x00\x00\x00";
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 0ed046e282f..3ae0451e2e5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -824,8 +824,12 @@ namespace System.Text.RegularExpressions
 
                 // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix.
                 // We can use it if this is a case-sensitive class with a small number of characters in the class.
+                // We avoid using it for the relatively common case of the starting set being '.', aka anything other than
+                // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
                 int setIndex = 0;
-                bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null;
+                bool canUseIndexOf =
+                    primarySet.Set != RegexCharClass.NotNewLineClass &&
+                    (primarySet.Chars is not null || primarySet.Range is not null);
                 bool needLoop = !canUseIndexOf || setsToUse > 1;
 
                 Label checkSpanLengthLabel = default;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
index 823e3617e3b..099073da0cb 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -194,12 +194,12 @@ namespace System.Text.RegularExpressions
             }
         }
 
-        /// <summary>true iff <see cref="TryFindNextStartingPosition"/> might advance the position.</summary>
+        /// <summary>true iff <see cref="TryFindNextStartingPositionLeftToRight"/> might advance the position.</summary>
         public bool IsUseful =>
             FindMode != FindNextStartingPositionMode.NoSearch || // there's a searching scheme available
             LeadingAnchor == RegexNodeKind.Bol; // there's a leading BOL anchor we can otherwise search for
 
-        /// <summary>Gets the selected mode for performing the next <see cref="TryFindNextStartingPosition"/> operation</summary>
+        /// <summary>Gets the selected mode for performing the next <see cref="TryFindNextStartingPositionLeftToRight"/> or <see cref="TryFindNextStartingPositionRightToLeft"/> operation</summary>
         public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;
 
         /// <summary>Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
@@ -313,27 +313,138 @@ namespace System.Text.RegularExpressions
         /// <param name="pos">The position in <paramref name="textSpan"/>.  This is updated with the found position.</param>
         /// <param name="start">The index in <paramref name="textSpan"/> to consider the start for start anchor purposes.</param>
         /// <returns>true if a position to attempt a match was found; false if none was found.</returns>
-        public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos, int start)
+        public bool TryFindNextStartingPositionRightToLeft(ReadOnlySpan<char> textSpan, ref int pos, int start)
         {
             // Return early if we know there's not enough input left to match.
-            if (!_rightToLeft)
+            if (pos < MinRequiredLength)
             {
-                if (pos > textSpan.Length - MinRequiredLength)
-                {
-                    pos = textSpan.Length;
-                    return false;
-                }
+                pos = 0;
+                return false;
             }
-            else
+
+            Debug.Assert(LeadingAnchor != RegexNodeKind.Bol, "BOL isn't enabled for RTL");
+
+            switch (FindMode)
             {
-                if (pos < MinRequiredLength)
-                {
-                    pos = 0;
-                    return false;
-                }
+                // There's an anchor.  For some, we can simply compare against the current position.
+                // For others, we can jump to the relevant location.
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
+                    if (pos != 0)
+                    {
+                        // If we're not currently at the beginning, skip ahead (or, rather, backwards)
+                        // since nothing until then can possibly match. (We're iterating from the end
+                        // to the beginning in RightToLeft mode.)
+                        pos = 0;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
+                    if (pos != start)
+                    {
+                        // If we're not currently at the starting position, we'll never be, so fail immediately.
+                        // This is different from beginning, since beginning is the fixed location of 0 whereas
+                        // start is wherever the iteration actually starts from; in left-to-right, that's often
+                        // the same as beginning, but in RightToLeft it rarely is.
+                        pos = 0;
+                        return false;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
+                    if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n'))
+                    {
+                        // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
+                        // so fail immediately.
+                        pos = 0;
+                        return false;
+                    }
+                    return true;
+
+                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
+                    if (pos < textSpan.Length)
+                    {
+                        // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
+                        // so fail immediately.
+                        pos = 0;
+                        return false;
+                    }
+                    return true;
+
+                // There's a case-sensitive prefix.  Search for it with ordinal IndexOf.
+
+                case FindNextStartingPositionMode.LeadingString_RightToLeft:
+                    {
+                        int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan());
+                        if (i >= 0)
+                        {
+                            pos = i + LeadingPrefix.Length;
+                            return true;
+                        }
+
+                        pos = 0;
+                        return false;
+                    }
+
+                // There's a literal at the beginning of the pattern.  Search for it.
+
+                case FindNextStartingPositionMode.LeadingChar_RightToLeft:
+                    {
+                        int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char);
+                        if (i >= 0)
+                        {
+                            pos = i + 1;
+                            return true;
+                        }
+
+                        pos = 0;
+                        return false;
+                    }
+
+                // There's a set at the beginning of the pattern.  Search for it.
+
+                case FindNextStartingPositionMode.LeadingSet_RightToLeft:
+                    {
+                        ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+                        string set = FixedDistanceSets![0].Set;
+
+                        ReadOnlySpan<char> span = textSpan.Slice(0, pos);
+                        for (int i = span.Length - 1; i >= 0; i--)
+                        {
+                            if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+                            {
+                                pos = i + 1;
+                                return true;
+                            }
+                        }
+
+                        pos = 0;
+                        return false;
+                    }
+
+                // Nothing special to look for.  Just return true indicating this is a valid position to try to match.
+
+                default:
+                    Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
+                    return true;
             }
+        }
 
-            // Optimize the handling of a Beginning-Of-Line (BOL) anchor (only for left-to-right).  BOL is special, in that unlike
+        /// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
+        /// <param name="textSpan">The text to search.</param>
+        /// <param name="pos">The position in <paramref name="textSpan"/>.  This is updated with the found position.</param>
+        /// <param name="start">The index in <paramref name="textSpan"/> to consider the start for start anchor purposes.</param>
+        /// <returns>true if a position to attempt a match was found; false if none was found.</returns>
+        public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan, ref int pos, int start)
+        {
+            // Return early if we know there's not enough input left to match.
+            if (pos > textSpan.Length - MinRequiredLength)
+            {
+                pos = textSpan.Length;
+                return false;
+            }
+
+            // Optimize the handling of a Beginning-Of-Line (BOL) anchor.  BOL is special, in that unlike
             // other anchors like Beginning, there are potentially multiple places a BOL can match.  So unlike
             // the other anchors, which all skip all subsequent processing if found, with BOL we just use it
             // to boost our position to the next line, and then continue normally with any searches.
@@ -342,7 +453,6 @@ namespace System.Text.RegularExpressions
                 // If we're not currently positioned at the beginning of a line (either
                 // the beginning of the string or just after a line feed), find the next
                 // newline and position just after it.
-                Debug.Assert(!_rightToLeft);
                 int posm1 = pos - 1;
                 if ((uint)posm1 < (uint)textSpan.Length && textSpan[posm1] != '\n')
                 {
@@ -369,22 +479,22 @@ namespace System.Text.RegularExpressions
                 // For others, we can jump to the relevant location.
 
                 case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
-                    if (pos != 0)
+                    // If we're not currently at the beginning, we'll never be, so fail immediately.
+                    if (pos == 0)
                     {
-                        // If we're not currently at the beginning, we'll never be, so fail immediately.
-                        pos = textSpan.Length;
-                        return false;
+                        return true;
                     }
-                    return true;
+                    pos = textSpan.Length;
+                    return false;
 
                 case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start:
-                    if (pos != start)
+                    // If we're not currently at the start, we'll never be, so fail immediately.
+                    if (pos == start)
                     {
-                        // If we're not currently at the start, we'll never be, so fail immediately.
-                        pos = textSpan.Length;
-                        return false;
+                        return true;
                     }
-                    return true;
+                    pos = textSpan.Length;
+                    return false;
 
                 case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ:
                     if (pos < textSpan.Length - 1)
@@ -404,48 +514,6 @@ namespace System.Text.RegularExpressions
                     }
                     return true;
 
-                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
-                    if (pos != 0)
-                    {
-                        // If we're not currently at the beginning, skip ahead (or, rather, backwards)
-                        // since nothing until then can possibly match. (We're iterating from the end
-                        // to the beginning in RightToLeft mode.)
-                        pos = 0;
-                    }
-                    return true;
-
-                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
-                    if (pos != start)
-                    {
-                        // If we're not currently at the starting position, we'll never be, so fail immediately.
-                        // This is different from beginning, since beginning is the fixed location of 0 whereas
-                        // start is wherever the iteration actually starts from; in left-to-right, that's often
-                        // the same as beginning, but in RightToLeft it rarely is.
-                        pos = 0;
-                        return false;
-                    }
-                    return true;
-
-                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
-                    if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n'))
-                    {
-                        // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
-                        // so fail immediately.
-                        pos = 0;
-                        return false;
-                    }
-                    return true;
-
-                case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
-                    if (pos < textSpan.Length)
-                    {
-                        // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
-                        // so fail immediately.
-                        pos = 0;
-                        return false;
-                    }
-                    return true;
-
                 case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ:
                     if (pos < textSpan.Length - MinRequiredLength - 1)
                     {
@@ -475,34 +543,6 @@ namespace System.Text.RegularExpressions
                         return false;
                     }
 
-                case FindNextStartingPositionMode.LeadingString_RightToLeft:
-                    {
-                        int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan());
-                        if (i >= 0)
-                        {
-                            pos = i + LeadingPrefix.Length;
-                            return true;
-                        }
-
-                        pos = 0;
-                        return false;
-                    }
-
-                // There's a literal at the beginning of the pattern.  Search for it.
-
-                case FindNextStartingPositionMode.LeadingChar_RightToLeft:
-                    {
-                        int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char);
-                        if (i >= 0)
-                        {
-                            pos = i + 1;
-                            return true;
-                        }
-
-                        pos = 0;
-                        return false;
-                    }
-
                 // There's a set at the beginning of the pattern.  Search for it.
 
                 case FindNextStartingPositionMode.LeadingSet_LeftToRight:
@@ -538,25 +578,6 @@ namespace System.Text.RegularExpressions
                         return false;
                     }
 
-                case FindNextStartingPositionMode.LeadingSet_RightToLeft:
-                    {
-                        ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
-                        string set = FixedDistanceSets![0].Set;
-
-                        ReadOnlySpan<char> span = textSpan.Slice(0, pos);
-                        for (int i = span.Length - 1; i >= 0; i--)
-                        {
-                            if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
-                            {
-                                pos = i + 1;
-                                return true;
-                            }
-                        }
-
-                        pos = 0;
-                        return false;
-                    }
-
                 // There's a literal at a fixed offset from the beginning of the pattern.  Search for it.
 
                 case FindNextStartingPositionMode.FixedDistanceChar_LeftToRight:
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
index cd70f00b222..bdb5cdb3834 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
@@ -339,31 +339,41 @@ namespace System.Text.RegularExpressions
             Debug.Assert(runstack is not null);
             Debug.Assert(runcrawl is not null);
 
-            // Configure the additional value to "bump" the position along each time we loop around
-            // to call TryFindNextStartingPosition again, as well as the stopping position for the loop.  We generally
-            // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump
-            // by -1 and stop at textbeg.
-            int bump = 1, stoppos = text.Length;
             if (runregex.RightToLeft)
             {
-                bump = -1;
-                stoppos = 0;
-            }
+                while (_code.FindOptimizations.TryFindNextStartingPositionRightToLeft(text, ref runtextpos, runtextstart))
+                {
+                    CheckTimeout();
 
-            while (_code.FindOptimizations.TryFindNextStartingPosition(text, ref runtextpos, runtextstart))
-            {
-                CheckTimeout();
+                    if (TryMatchAtCurrentPosition(text) || runtextpos == 0)
+                    {
+                        return;
+                    }
 
-                if (TryMatchAtCurrentPosition(text) || runtextpos == stoppos)
-                {
-                    return;
+                    // Reset state for another iteration.
+                    runtrackpos = runtrack.Length;
+                    runstackpos = runstack.Length;
+                    runcrawlpos = runcrawl.Length;
+                    runtextpos--;
                 }
+            }
+            else
+            {
+                while (_code.FindOptimizations.TryFindNextStartingPositionLeftToRight(text, ref runtextpos, runtextstart))
+                {
+                    CheckTimeout();
 
-                // Reset state for another iteration.
-                runtrackpos = runtrack.Length;
-                runstackpos = runstack.Length;
-                runcrawlpos = runcrawl.Length;
-                runtextpos += bump;
+                    if (TryMatchAtCurrentPosition(text) || runtextpos == text.Length)
+                    {
+                        return;
+                    }
+
+                    // Reset state for another iteration.
+                    runtrackpos = runtrack.Length;
+                    runstackpos = runstack.Length;
+                    runcrawlpos = runcrawl.Length;
+                    runtextpos++;
+                }
             }
         }
 
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index 1aeb8f6d4fd..cacf02d321e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -170,36 +170,28 @@ namespace System.Text.RegularExpressions
             TryFindFixedSets(root, results, ref distance, thorough);
 
             // Remove any sets that match everything; they're not helpful.  (This check exists primarily to weed
-            // out use of . in Singleline mode.)
-            bool hasAny = false;
+            // out use of . in Singleline mode, but also filters out explicit sets like [\s\S].)
             for (int i = 0; i < results.Count; i++)
             {
                 if (results[i].Set == RegexCharClass.AnyClass)
                 {
-                    hasAny = true;
+                    results.RemoveAll(s => s.Set == RegexCharClass.AnyClass);
                     break;
                 }
             }
-            if (hasAny)
-            {
-                results.RemoveAll(s => s.Set == RegexCharClass.AnyClass);
-            }
 
             // If we don't have any results, try harder to compute one for the starting character.
             // This is a more involved computation that can find things the fixed-distance investigation
             // doesn't.
             if (results.Count == 0)
             {
-                string? charClass = FindFirstCharClass(root);
-                if (charClass is not null)
-                {
-                    results.Add(new RegexFindOptimizations.FixedDistanceSet(null, charClass, 0));
-                }
-
-                if (results.Count == 0)
+                if (FindFirstCharClass(root) is not string charClass ||
+                    charClass == RegexCharClass.AnyClass) // weed out match-all, same as above
                 {
                     return null;
                 }
+
+                results.Add(new RegexFindOptimizations.FixedDistanceSet(null, charClass, 0));
             }
 
             // For every entry, try to get the chars that make up the set, if there are few enough.
@@ -484,6 +476,22 @@ namespace System.Text.RegularExpressions
                     return s1.Range is not null ? -1 : 1;
                 }
 
+                // If both have ranges, prefer the one that includes fewer characters.
+                if (s1.Range is not null)
+                {
+                    return
+                        GetRangeLength(s1.Range.GetValueOrDefault()).CompareTo(
+                        GetRangeLength(s2.Range.GetValueOrDefault()));
+
+                    static int GetRangeLength((char LowInclusive, char HighInclusive, bool Negated) range)
+                    {
+                        int length = range.HighInclusive - range.LowInclusive + 1;
+                        return range.Negated ?
+                            char.MaxValue + 1 - length :
+                            length;
+                    }
+                }
+
                 // As a tiebreaker, prioritize the earlier one.
                 return s1.Distance.CompareTo(s2.Distance);
             });
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 89650f3e27a..107beaa4e38 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -1271,7 +1271,7 @@ namespace System.Text.RegularExpressions.Symbolic
                 where TInputReader : struct, IInputReader
             {
                 // Find the first position that matches with some likely character.
-                if (!matcher._findOpts!.TryFindNextStartingPosition(input, ref pos, 0))
+                if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
                 {
                     // No match exists
                     return false;
author	Stephen Toub <stoub@microsoft.com>	2022-11-10 15:18:51 +0300
committer	GitHub <noreply@github.com>	2022-11-10 15:18:51 +0300
commit	1205538e7a619e294fec7dc059df66072d302462 (patch)
tree	409c9b18663432e60da8f309b2300c1ea5578c18
parent	21700ffcb647081207aff9cba00eb59f976c061e (diff)