diff options
author | Stephen Toub <stoub@microsoft.com> | 2022-11-10 15:18:51 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-11-10 15:18:51 +0300 |
commit | 1205538e7a619e294fec7dc059df66072d302462 (patch) | |
tree | 409c9b18663432e60da8f309b2300c1ea5578c18 | |
parent | 21700ffcb647081207aff9cba00eb59f976c061e (diff) |
Fix a few recent perf regressions in Regex (#78003)
* Split TryFindNextStartingPosition into LeftToRight and RightToLeft versions
A previous change increased the size of the method. Splitting it to reduce the size (and how many locals needed to be zerod) as well as to reduce some branching.
* Avoid generating match-all code for starting "any" sets
* Avoid using IndexOfAnyExcept('\n') for TryFindNextPossibleStartingPosition
It's common at the beginning of a pattern and it's rare to have long sequences of newlines that benefit from IndexOf.
Also tweak how we prioritize sets; if two sets have ranges, prefer the one whose range is less inclusive / has a lower chance of matching (based purely on count).
* Address PR feedback
7 files changed, 201 insertions, 152 deletions
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 04ec466a090..9e08b3ee1d2 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -779,8 +779,12 @@ namespace System.Text.RegularExpressions.Generator // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. + // We avoid using it for the relatively common case of the starting set being '.', aka anything other than + // a newline, as it's very rare to have long, uninterrupted sequences of newlines. int setIndex = 0; - bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null; + bool canUseIndexOf = + primarySet.Set != RegexCharClass.NotNewLineClass && + (primarySet.Chars is not null || primarySet.Range is not null); bool needLoop = !canUseIndexOf || setsToUse > 1; FinishEmitBlock loopBlock = default; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index aa60e511742..70d2e6a7261 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -97,6 +97,8 @@ namespace System.Text.RegularExpressions internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitRanges; internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitRanges; + internal const string NotNewLineClass = "\x01\x02\x00\x0A\x0B"; + internal const string AnyClass = "\x00\x01\x00\x00"; private const string EmptyClass = "\x00\x00\x00"; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 0ed046e282f..3ae0451e2e5 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -824,8 +824,12 @@ namespace System.Text.RegularExpressions // If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix. // We can use it if this is a case-sensitive class with a small number of characters in the class. + // We avoid using it for the relatively common case of the starting set being '.', aka anything other than + // a newline, as it's very rare to have long, uninterrupted sequences of newlines. int setIndex = 0; - bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null; + bool canUseIndexOf = + primarySet.Set != RegexCharClass.NotNewLineClass && + (primarySet.Chars is not null || primarySet.Range is not null); bool needLoop = !canUseIndexOf || setsToUse > 1; Label checkSpanLengthLabel = default; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index 823e3617e3b..099073da0cb 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -194,12 +194,12 @@ namespace System.Text.RegularExpressions } } - /// <summary>true iff <see cref="TryFindNextStartingPosition"/> might advance the position.</summary> + /// <summary>true iff <see cref="TryFindNextStartingPositionLeftToRight"/> might advance the position.</summary> public bool IsUseful => FindMode != FindNextStartingPositionMode.NoSearch || // there's a searching scheme available LeadingAnchor == RegexNodeKind.Bol; // there's a leading BOL anchor we can otherwise search for - /// <summary>Gets the selected mode for performing the next <see cref="TryFindNextStartingPosition"/> operation</summary> + /// <summary>Gets the selected mode for performing the next <see cref="TryFindNextStartingPositionLeftToRight"/> or <see cref="TryFindNextStartingPositionRightToLeft"/> operation</summary> public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch; /// <summary>Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary> @@ -313,27 +313,138 @@ namespace System.Text.RegularExpressions /// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param> /// <param name="start">The index in <paramref name="textSpan"/> to consider the start for start anchor purposes.</param> /// <returns>true if a position to attempt a match was found; false if none was found.</returns> - public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos, int start) + public bool TryFindNextStartingPositionRightToLeft(ReadOnlySpan<char> textSpan, ref int pos, int start) { // Return early if we know there's not enough input left to match. - if (!_rightToLeft) + if (pos < MinRequiredLength) { - if (pos > textSpan.Length - MinRequiredLength) - { - pos = textSpan.Length; - return false; - } + pos = 0; + return false; } - else + + Debug.Assert(LeadingAnchor != RegexNodeKind.Bol, "BOL isn't enabled for RTL"); + + switch (FindMode) { - if (pos < MinRequiredLength) - { - pos = 0; - return false; - } + // There's an anchor. For some, we can simply compare against the current position. + // For others, we can jump to the relevant location. + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: + if (pos != 0) + { + // If we're not currently at the beginning, skip ahead (or, rather, backwards) + // since nothing until then can possibly match. (We're iterating from the end + // to the beginning in RightToLeft mode.) + pos = 0; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: + if (pos != start) + { + // If we're not currently at the starting position, we'll never be, so fail immediately. + // This is different from beginning, since beginning is the fixed location of 0 whereas + // start is wherever the iteration actually starts from; in left-to-right, that's often + // the same as beginning, but in RightToLeft it rarely is. + pos = 0; + return false; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: + if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n')) + { + // If we're not currently at the end, we'll never be (we're iterating from end to beginning), + // so fail immediately. + pos = 0; + return false; + } + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: + if (pos < textSpan.Length) + { + // If we're not currently at the end, we'll never be (we're iterating from end to beginning), + // so fail immediately. + pos = 0; + return false; + } + return true; + + // There's a case-sensitive prefix. Search for it with ordinal IndexOf. + + case FindNextStartingPositionMode.LeadingString_RightToLeft: + { + int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan()); + if (i >= 0) + { + pos = i + LeadingPrefix.Length; + return true; + } + + pos = 0; + return false; + } + + // There's a literal at the beginning of the pattern. Search for it. + + case FindNextStartingPositionMode.LeadingChar_RightToLeft: + { + int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char); + if (i >= 0) + { + pos = i + 1; + return true; + } + + pos = 0; + return false; + } + + // There's a set at the beginning of the pattern. Search for it. + + case FindNextStartingPositionMode.LeadingSet_RightToLeft: + { + ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; + string set = FixedDistanceSets![0].Set; + + ReadOnlySpan<char> span = textSpan.Slice(0, pos); + for (int i = span.Length - 1; i >= 0; i--) + { + if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup)) + { + pos = i + 1; + return true; + } + } + + pos = 0; + return false; + } + + // Nothing special to look for. Just return true indicating this is a valid position to try to match. + + default: + Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch); + return true; } + } - // Optimize the handling of a Beginning-Of-Line (BOL) anchor (only for left-to-right). BOL is special, in that unlike + /// <summary>Try to advance to the next starting position that might be a location for a match.</summary> + /// <param name="textSpan">The text to search.</param> + /// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param> + /// <param name="start">The index in <paramref name="textSpan"/> to consider the start for start anchor purposes.</param> + /// <returns>true if a position to attempt a match was found; false if none was found.</returns> + public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan, ref int pos, int start) + { + // Return early if we know there's not enough input left to match. + if (pos > textSpan.Length - MinRequiredLength) + { + pos = textSpan.Length; + return false; + } + + // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike // the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any searches. @@ -342,7 +453,6 @@ namespace System.Text.RegularExpressions // If we're not currently positioned at the beginning of a line (either // the beginning of the string or just after a line feed), find the next // newline and position just after it. - Debug.Assert(!_rightToLeft); int posm1 = pos - 1; if ((uint)posm1 < (uint)textSpan.Length && textSpan[posm1] != '\n') { @@ -369,22 +479,22 @@ namespace System.Text.RegularExpressions // For others, we can jump to the relevant location. case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: - if (pos != 0) + // If we're not currently at the beginning, we'll never be, so fail immediately. + if (pos == 0) { - // If we're not currently at the beginning, we'll never be, so fail immediately. - pos = textSpan.Length; - return false; + return true; } - return true; + pos = textSpan.Length; + return false; case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: - if (pos != start) + // If we're not currently at the start, we'll never be, so fail immediately. + if (pos == start) { - // If we're not currently at the start, we'll never be, so fail immediately. - pos = textSpan.Length; - return false; + return true; } - return true; + pos = textSpan.Length; + return false; case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: if (pos < textSpan.Length - 1) @@ -404,48 +514,6 @@ namespace System.Text.RegularExpressions } return true; - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: - if (pos != 0) - { - // If we're not currently at the beginning, skip ahead (or, rather, backwards) - // since nothing until then can possibly match. (We're iterating from the end - // to the beginning in RightToLeft mode.) - pos = 0; - } - return true; - - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: - if (pos != start) - { - // If we're not currently at the starting position, we'll never be, so fail immediately. - // This is different from beginning, since beginning is the fixed location of 0 whereas - // start is wherever the iteration actually starts from; in left-to-right, that's often - // the same as beginning, but in RightToLeft it rarely is. - pos = 0; - return false; - } - return true; - - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: - if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n')) - { - // If we're not currently at the end, we'll never be (we're iterating from end to beginning), - // so fail immediately. - pos = 0; - return false; - } - return true; - - case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: - if (pos < textSpan.Length) - { - // If we're not currently at the end, we'll never be (we're iterating from end to beginning), - // so fail immediately. - pos = 0; - return false; - } - return true; - case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ: if (pos < textSpan.Length - MinRequiredLength - 1) { @@ -475,34 +543,6 @@ namespace System.Text.RegularExpressions return false; } - case FindNextStartingPositionMode.LeadingString_RightToLeft: - { - int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan()); - if (i >= 0) - { - pos = i + LeadingPrefix.Length; - return true; - } - - pos = 0; - return false; - } - - // There's a literal at the beginning of the pattern. Search for it. - - case FindNextStartingPositionMode.LeadingChar_RightToLeft: - { - int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char); - if (i >= 0) - { - pos = i + 1; - return true; - } - - pos = 0; - return false; - } - // There's a set at the beginning of the pattern. Search for it. case FindNextStartingPositionMode.LeadingSet_LeftToRight: @@ -538,25 +578,6 @@ namespace System.Text.RegularExpressions return false; } - case FindNextStartingPositionMode.LeadingSet_RightToLeft: - { - ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; - string set = FixedDistanceSets![0].Set; - - ReadOnlySpan<char> span = textSpan.Slice(0, pos); - for (int i = span.Length - 1; i >= 0; i--) - { - if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup)) - { - pos = i + 1; - return true; - } - } - - pos = 0; - return false; - } - // There's a literal at a fixed offset from the beginning of the pattern. Search for it. case FindNextStartingPositionMode.FixedDistanceChar_LeftToRight: diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index cd70f00b222..bdb5cdb3834 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -339,31 +339,41 @@ namespace System.Text.RegularExpressions Debug.Assert(runstack is not null); Debug.Assert(runcrawl is not null); - // Configure the additional value to "bump" the position along each time we loop around - // to call TryFindNextStartingPosition again, as well as the stopping position for the loop. We generally - // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump - // by -1 and stop at textbeg. - int bump = 1, stoppos = text.Length; if (runregex.RightToLeft) { - bump = -1; - stoppos = 0; - } + while (_code.FindOptimizations.TryFindNextStartingPositionRightToLeft(text, ref runtextpos, runtextstart)) + { + CheckTimeout(); - while (_code.FindOptimizations.TryFindNextStartingPosition(text, ref runtextpos, runtextstart)) - { - CheckTimeout(); + if (TryMatchAtCurrentPosition(text) || runtextpos == 0) + { + return; + } - if (TryMatchAtCurrentPosition(text) || runtextpos == stoppos) - { - return; + // Reset state for another iteration. + runtrackpos = runtrack.Length; + runstackpos = runstack.Length; + runcrawlpos = runcrawl.Length; + runtextpos--; } + } + else + { + while (_code.FindOptimizations.TryFindNextStartingPositionLeftToRight(text, ref runtextpos, runtextstart)) + { + CheckTimeout(); - // Reset state for another iteration. - runtrackpos = runtrack.Length; - runstackpos = runstack.Length; - runcrawlpos = runcrawl.Length; - runtextpos += bump; + if (TryMatchAtCurrentPosition(text) || runtextpos == text.Length) + { + return; + } + + // Reset state for another iteration. + runtrackpos = runtrack.Length; + runstackpos = runstack.Length; + runcrawlpos = runcrawl.Length; + runtextpos++; + } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index 1aeb8f6d4fd..cacf02d321e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -170,36 +170,28 @@ namespace System.Text.RegularExpressions TryFindFixedSets(root, results, ref distance, thorough); // Remove any sets that match everything; they're not helpful. (This check exists primarily to weed - // out use of . in Singleline mode.) - bool hasAny = false; + // out use of . in Singleline mode, but also filters out explicit sets like [\s\S].) for (int i = 0; i < results.Count; i++) { if (results[i].Set == RegexCharClass.AnyClass) { - hasAny = true; + results.RemoveAll(s => s.Set == RegexCharClass.AnyClass); break; } } - if (hasAny) - { - results.RemoveAll(s => s.Set == RegexCharClass.AnyClass); - } // If we don't have any results, try harder to compute one for the starting character. // This is a more involved computation that can find things the fixed-distance investigation // doesn't. if (results.Count == 0) { - string? charClass = FindFirstCharClass(root); - if (charClass is not null) - { - results.Add(new RegexFindOptimizations.FixedDistanceSet(null, charClass, 0)); - } - - if (results.Count == 0) + if (FindFirstCharClass(root) is not string charClass || + charClass == RegexCharClass.AnyClass) // weed out match-all, same as above { return null; } + + results.Add(new RegexFindOptimizations.FixedDistanceSet(null, charClass, 0)); } // For every entry, try to get the chars that make up the set, if there are few enough. @@ -484,6 +476,22 @@ namespace System.Text.RegularExpressions return s1.Range is not null ? -1 : 1; } + // If both have ranges, prefer the one that includes fewer characters. + if (s1.Range is not null) + { + return + GetRangeLength(s1.Range.GetValueOrDefault()).CompareTo( + GetRangeLength(s2.Range.GetValueOrDefault())); + + static int GetRangeLength((char LowInclusive, char HighInclusive, bool Negated) range) + { + int length = range.HighInclusive - range.LowInclusive + 1; + return range.Negated ? + char.MaxValue + 1 - length : + length; + } + } + // As a tiebreaker, prioritize the earlier one. return s1.Distance.CompareTo(s2.Distance); }); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs index 89650f3e27a..107beaa4e38 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs @@ -1271,7 +1271,7 @@ namespace System.Text.RegularExpressions.Symbolic where TInputReader : struct, IInputReader { // Find the first position that matches with some likely character. - if (!matcher._findOpts!.TryFindNextStartingPosition(input, ref pos, 0)) + if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0)) { // No match exists return false; |