Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/dotnet/runtime.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Toub <stoub@microsoft.com>2022-11-10 15:18:51 +0300
committerGitHub <noreply@github.com>2022-11-10 15:18:51 +0300
commit1205538e7a619e294fec7dc059df66072d302462 (patch)
tree409c9b18663432e60da8f309b2300c1ea5578c18
parent21700ffcb647081207aff9cba00eb59f976c061e (diff)
Fix a few recent perf regressions in Regex (#78003)
* Split TryFindNextStartingPosition into LeftToRight and RightToLeft versions A previous change increased the size of the method. Splitting it to reduce the size (and how many locals needed to be zerod) as well as to reduce some branching. * Avoid generating match-all code for starting "any" sets * Avoid using IndexOfAnyExcept('\n') for TryFindNextPossibleStartingPosition It's common at the beginning of a pattern and it's rare to have long sequences of newlines that benefit from IndexOf. Also tweak how we prioritize sets; if two sets have ranges, prefer the one whose range is less inclusive / has a lower chance of matching (based purely on count). * Address PR feedback
-rw-r--r--src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs6
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs2
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs6
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs253
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs48
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs36
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs2
7 files changed, 201 insertions, 152 deletions
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index 04ec466a090..9e08b3ee1d2 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -779,8 +779,12 @@ namespace System.Text.RegularExpressions.Generator
// If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix.
// We can use it if this is a case-sensitive class with a small number of characters in the class.
+ // We avoid using it for the relatively common case of the starting set being '.', aka anything other than
+ // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
int setIndex = 0;
- bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null;
+ bool canUseIndexOf =
+ primarySet.Set != RegexCharClass.NotNewLineClass &&
+ (primarySet.Chars is not null || primarySet.Range is not null);
bool needLoop = !canUseIndexOf || setsToUse > 1;
FinishEmitBlock loopBlock = default;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index aa60e511742..70d2e6a7261 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -97,6 +97,8 @@ namespace System.Text.RegularExpressions
internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitRanges;
internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitRanges;
+ internal const string NotNewLineClass = "\x01\x02\x00\x0A\x0B";
+
internal const string AnyClass = "\x00\x01\x00\x00";
private const string EmptyClass = "\x00\x00\x00";
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 0ed046e282f..3ae0451e2e5 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -824,8 +824,12 @@ namespace System.Text.RegularExpressions
// If we can use IndexOf{Any}, try to accelerate the skip loop via vectorization to match the first prefix.
// We can use it if this is a case-sensitive class with a small number of characters in the class.
+ // We avoid using it for the relatively common case of the starting set being '.', aka anything other than
+ // a newline, as it's very rare to have long, uninterrupted sequences of newlines.
int setIndex = 0;
- bool canUseIndexOf = primarySet.Chars is not null || primarySet.Range is not null;
+ bool canUseIndexOf =
+ primarySet.Set != RegexCharClass.NotNewLineClass &&
+ (primarySet.Chars is not null || primarySet.Range is not null);
bool needLoop = !canUseIndexOf || setsToUse > 1;
Label checkSpanLengthLabel = default;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
index 823e3617e3b..099073da0cb 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -194,12 +194,12 @@ namespace System.Text.RegularExpressions
}
}
- /// <summary>true iff <see cref="TryFindNextStartingPosition"/> might advance the position.</summary>
+ /// <summary>true iff <see cref="TryFindNextStartingPositionLeftToRight"/> might advance the position.</summary>
public bool IsUseful =>
FindMode != FindNextStartingPositionMode.NoSearch || // there's a searching scheme available
LeadingAnchor == RegexNodeKind.Bol; // there's a leading BOL anchor we can otherwise search for
- /// <summary>Gets the selected mode for performing the next <see cref="TryFindNextStartingPosition"/> operation</summary>
+ /// <summary>Gets the selected mode for performing the next <see cref="TryFindNextStartingPositionLeftToRight"/> or <see cref="TryFindNextStartingPositionRightToLeft"/> operation</summary>
public FindNextStartingPositionMode FindMode { get; } = FindNextStartingPositionMode.NoSearch;
/// <summary>Gets the leading anchor (e.g. RegexNodeKind.Bol) if one exists and was computed.</summary>
@@ -313,27 +313,138 @@ namespace System.Text.RegularExpressions
/// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param>
/// <param name="start">The index in <paramref name="textSpan"/> to consider the start for start anchor purposes.</param>
/// <returns>true if a position to attempt a match was found; false if none was found.</returns>
- public bool TryFindNextStartingPosition(ReadOnlySpan<char> textSpan, ref int pos, int start)
+ public bool TryFindNextStartingPositionRightToLeft(ReadOnlySpan<char> textSpan, ref int pos, int start)
{
// Return early if we know there's not enough input left to match.
- if (!_rightToLeft)
+ if (pos < MinRequiredLength)
{
- if (pos > textSpan.Length - MinRequiredLength)
- {
- pos = textSpan.Length;
- return false;
- }
+ pos = 0;
+ return false;
}
- else
+
+ Debug.Assert(LeadingAnchor != RegexNodeKind.Bol, "BOL isn't enabled for RTL");
+
+ switch (FindMode)
{
- if (pos < MinRequiredLength)
- {
- pos = 0;
- return false;
- }
+ // There's an anchor. For some, we can simply compare against the current position.
+ // For others, we can jump to the relevant location.
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
+ if (pos != 0)
+ {
+ // If we're not currently at the beginning, skip ahead (or, rather, backwards)
+ // since nothing until then can possibly match. (We're iterating from the end
+ // to the beginning in RightToLeft mode.)
+ pos = 0;
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
+ if (pos != start)
+ {
+ // If we're not currently at the starting position, we'll never be, so fail immediately.
+ // This is different from beginning, since beginning is the fixed location of 0 whereas
+ // start is wherever the iteration actually starts from; in left-to-right, that's often
+ // the same as beginning, but in RightToLeft it rarely is.
+ pos = 0;
+ return false;
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
+ if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n'))
+ {
+ // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
+ // so fail immediately.
+ pos = 0;
+ return false;
+ }
+ return true;
+
+ case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
+ if (pos < textSpan.Length)
+ {
+ // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
+ // so fail immediately.
+ pos = 0;
+ return false;
+ }
+ return true;
+
+ // There's a case-sensitive prefix. Search for it with ordinal IndexOf.
+
+ case FindNextStartingPositionMode.LeadingString_RightToLeft:
+ {
+ int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan());
+ if (i >= 0)
+ {
+ pos = i + LeadingPrefix.Length;
+ return true;
+ }
+
+ pos = 0;
+ return false;
+ }
+
+ // There's a literal at the beginning of the pattern. Search for it.
+
+ case FindNextStartingPositionMode.LeadingChar_RightToLeft:
+ {
+ int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char);
+ if (i >= 0)
+ {
+ pos = i + 1;
+ return true;
+ }
+
+ pos = 0;
+ return false;
+ }
+
+ // There's a set at the beginning of the pattern. Search for it.
+
+ case FindNextStartingPositionMode.LeadingSet_RightToLeft:
+ {
+ ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
+ string set = FixedDistanceSets![0].Set;
+
+ ReadOnlySpan<char> span = textSpan.Slice(0, pos);
+ for (int i = span.Length - 1; i >= 0; i--)
+ {
+ if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
+ {
+ pos = i + 1;
+ return true;
+ }
+ }
+
+ pos = 0;
+ return false;
+ }
+
+ // Nothing special to look for. Just return true indicating this is a valid position to try to match.
+
+ default:
+ Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
+ return true;
}
+ }
- // Optimize the handling of a Beginning-Of-Line (BOL) anchor (only for left-to-right). BOL is special, in that unlike
+ /// <summary>Try to advance to the next starting position that might be a location for a match.</summary>
+ /// <param name="textSpan">The text to search.</param>
+ /// <param name="pos">The position in <paramref name="textSpan"/>. This is updated with the found position.</param>
+ /// <param name="start">The index in <paramref name="textSpan"/> to consider the start for start anchor purposes.</param>
+ /// <returns>true if a position to attempt a match was found; false if none was found.</returns>
+ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan, ref int pos, int start)
+ {
+ // Return early if we know there's not enough input left to match.
+ if (pos > textSpan.Length - MinRequiredLength)
+ {
+ pos = textSpan.Length;
+ return false;
+ }
+
+ // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike
// other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
// to boost our position to the next line, and then continue normally with any searches.
@@ -342,7 +453,6 @@ namespace System.Text.RegularExpressions
// If we're not currently positioned at the beginning of a line (either
// the beginning of the string or just after a line feed), find the next
// newline and position just after it.
- Debug.Assert(!_rightToLeft);
int posm1 = pos - 1;
if ((uint)posm1 < (uint)textSpan.Length && textSpan[posm1] != '\n')
{
@@ -369,22 +479,22 @@ namespace System.Text.RegularExpressions
// For others, we can jump to the relevant location.
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning:
- if (pos != 0)
+ // If we're not currently at the beginning, we'll never be, so fail immediately.
+ if (pos == 0)
{
- // If we're not currently at the beginning, we'll never be, so fail immediately.
- pos = textSpan.Length;
- return false;
+ return true;
}
- return true;
+ pos = textSpan.Length;
+ return false;
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start:
- if (pos != start)
+ // If we're not currently at the start, we'll never be, so fail immediately.
+ if (pos == start)
{
- // If we're not currently at the start, we'll never be, so fail immediately.
- pos = textSpan.Length;
- return false;
+ return true;
}
- return true;
+ pos = textSpan.Length;
+ return false;
case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ:
if (pos < textSpan.Length - 1)
@@ -404,48 +514,6 @@ namespace System.Text.RegularExpressions
}
return true;
- case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning:
- if (pos != 0)
- {
- // If we're not currently at the beginning, skip ahead (or, rather, backwards)
- // since nothing until then can possibly match. (We're iterating from the end
- // to the beginning in RightToLeft mode.)
- pos = 0;
- }
- return true;
-
- case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start:
- if (pos != start)
- {
- // If we're not currently at the starting position, we'll never be, so fail immediately.
- // This is different from beginning, since beginning is the fixed location of 0 whereas
- // start is wherever the iteration actually starts from; in left-to-right, that's often
- // the same as beginning, but in RightToLeft it rarely is.
- pos = 0;
- return false;
- }
- return true;
-
- case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ:
- if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n'))
- {
- // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
- // so fail immediately.
- pos = 0;
- return false;
- }
- return true;
-
- case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End:
- if (pos < textSpan.Length)
- {
- // If we're not currently at the end, we'll never be (we're iterating from end to beginning),
- // so fail immediately.
- pos = 0;
- return false;
- }
- return true;
-
case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ:
if (pos < textSpan.Length - MinRequiredLength - 1)
{
@@ -475,34 +543,6 @@ namespace System.Text.RegularExpressions
return false;
}
- case FindNextStartingPositionMode.LeadingString_RightToLeft:
- {
- int i = textSpan.Slice(0, pos).LastIndexOf(LeadingPrefix.AsSpan());
- if (i >= 0)
- {
- pos = i + LeadingPrefix.Length;
- return true;
- }
-
- pos = 0;
- return false;
- }
-
- // There's a literal at the beginning of the pattern. Search for it.
-
- case FindNextStartingPositionMode.LeadingChar_RightToLeft:
- {
- int i = textSpan.Slice(0, pos).LastIndexOf(FixedDistanceLiteral.Char);
- if (i >= 0)
- {
- pos = i + 1;
- return true;
- }
-
- pos = 0;
- return false;
- }
-
// There's a set at the beginning of the pattern. Search for it.
case FindNextStartingPositionMode.LeadingSet_LeftToRight:
@@ -538,25 +578,6 @@ namespace System.Text.RegularExpressions
return false;
}
- case FindNextStartingPositionMode.LeadingSet_RightToLeft:
- {
- ref uint[]? startingAsciiLookup = ref _asciiLookups![0];
- string set = FixedDistanceSets![0].Set;
-
- ReadOnlySpan<char> span = textSpan.Slice(0, pos);
- for (int i = span.Length - 1; i >= 0; i--)
- {
- if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup))
- {
- pos = i + 1;
- return true;
- }
- }
-
- pos = 0;
- return false;
- }
-
// There's a literal at a fixed offset from the beginning of the pattern. Search for it.
case FindNextStartingPositionMode.FixedDistanceChar_LeftToRight:
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
index cd70f00b222..bdb5cdb3834 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs
@@ -339,31 +339,41 @@ namespace System.Text.RegularExpressions
Debug.Assert(runstack is not null);
Debug.Assert(runcrawl is not null);
- // Configure the additional value to "bump" the position along each time we loop around
- // to call TryFindNextStartingPosition again, as well as the stopping position for the loop. We generally
- // bump by 1 and stop at textend, but if we're examining right-to-left, we instead bump
- // by -1 and stop at textbeg.
- int bump = 1, stoppos = text.Length;
if (runregex.RightToLeft)
{
- bump = -1;
- stoppos = 0;
- }
+ while (_code.FindOptimizations.TryFindNextStartingPositionRightToLeft(text, ref runtextpos, runtextstart))
+ {
+ CheckTimeout();
- while (_code.FindOptimizations.TryFindNextStartingPosition(text, ref runtextpos, runtextstart))
- {
- CheckTimeout();
+ if (TryMatchAtCurrentPosition(text) || runtextpos == 0)
+ {
+ return;
+ }
- if (TryMatchAtCurrentPosition(text) || runtextpos == stoppos)
- {
- return;
+ // Reset state for another iteration.
+ runtrackpos = runtrack.Length;
+ runstackpos = runstack.Length;
+ runcrawlpos = runcrawl.Length;
+ runtextpos--;
}
+ }
+ else
+ {
+ while (_code.FindOptimizations.TryFindNextStartingPositionLeftToRight(text, ref runtextpos, runtextstart))
+ {
+ CheckTimeout();
- // Reset state for another iteration.
- runtrackpos = runtrack.Length;
- runstackpos = runstack.Length;
- runcrawlpos = runcrawl.Length;
- runtextpos += bump;
+ if (TryMatchAtCurrentPosition(text) || runtextpos == text.Length)
+ {
+ return;
+ }
+
+ // Reset state for another iteration.
+ runtrackpos = runtrack.Length;
+ runstackpos = runstack.Length;
+ runcrawlpos = runcrawl.Length;
+ runtextpos++;
+ }
}
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index 1aeb8f6d4fd..cacf02d321e 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -170,36 +170,28 @@ namespace System.Text.RegularExpressions
TryFindFixedSets(root, results, ref distance, thorough);
// Remove any sets that match everything; they're not helpful. (This check exists primarily to weed
- // out use of . in Singleline mode.)
- bool hasAny = false;
+ // out use of . in Singleline mode, but also filters out explicit sets like [\s\S].)
for (int i = 0; i < results.Count; i++)
{
if (results[i].Set == RegexCharClass.AnyClass)
{
- hasAny = true;
+ results.RemoveAll(s => s.Set == RegexCharClass.AnyClass);
break;
}
}
- if (hasAny)
- {
- results.RemoveAll(s => s.Set == RegexCharClass.AnyClass);
- }
// If we don't have any results, try harder to compute one for the starting character.
// This is a more involved computation that can find things the fixed-distance investigation
// doesn't.
if (results.Count == 0)
{
- string? charClass = FindFirstCharClass(root);
- if (charClass is not null)
- {
- results.Add(new RegexFindOptimizations.FixedDistanceSet(null, charClass, 0));
- }
-
- if (results.Count == 0)
+ if (FindFirstCharClass(root) is not string charClass ||
+ charClass == RegexCharClass.AnyClass) // weed out match-all, same as above
{
return null;
}
+
+ results.Add(new RegexFindOptimizations.FixedDistanceSet(null, charClass, 0));
}
// For every entry, try to get the chars that make up the set, if there are few enough.
@@ -484,6 +476,22 @@ namespace System.Text.RegularExpressions
return s1.Range is not null ? -1 : 1;
}
+ // If both have ranges, prefer the one that includes fewer characters.
+ if (s1.Range is not null)
+ {
+ return
+ GetRangeLength(s1.Range.GetValueOrDefault()).CompareTo(
+ GetRangeLength(s2.Range.GetValueOrDefault()));
+
+ static int GetRangeLength((char LowInclusive, char HighInclusive, bool Negated) range)
+ {
+ int length = range.HighInclusive - range.LowInclusive + 1;
+ return range.Negated ?
+ char.MaxValue + 1 - length :
+ length;
+ }
+ }
+
// As a tiebreaker, prioritize the earlier one.
return s1.Distance.CompareTo(s2.Distance);
});
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index 89650f3e27a..107beaa4e38 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -1271,7 +1271,7 @@ namespace System.Text.RegularExpressions.Symbolic
where TInputReader : struct, IInputReader
{
// Find the first position that matches with some likely character.
- if (!matcher._findOpts!.TryFindNextStartingPosition(input, ref pos, 0))
+ if (!matcher._findOpts!.TryFindNextStartingPositionLeftToRight(input, ref pos, 0))
{
// No match exists
return false;