Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/dotnet/runtime.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Toub <stoub@microsoft.com>2022-11-07 18:23:44 +0300
committerGitHub <noreply@github.com>2022-11-07 18:23:44 +0300
commit62e06de8f2876a71615fb5d696c76fbae2a5b38e (patch)
treebc71521369f5227c46a43394597f3d1d929dab5e
parente7a304c942735951bbdb32cde7ba9c37b845f32e (diff)
Improve IndexOf handling in regex source generator / compiler (#77925)
This PR does a few related things: 1. Consolidates _most_ (but not all) use of IndexOf variants into a single helper that can then be used from multiple locations to avoid code duplication and make it easier for us to extend in the future with additional IndexOf variants. 2. Stops using IndexOf when doing lazy backtracking in an optional. 3. Special-cases "any" repeaters to not do any character checking. 4. Adds use of IndexOf (via the new helper) into repeaters.
-rw-r--r--src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs262
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs492
-rw-r--r--src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs71
3 files changed, 433 insertions, 392 deletions
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index e758cac88d6..16bf09065ae 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1,19 +1,16 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
-using System.Buffers.Binary;
using System.CodeDom.Compiler;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.IO;
using System.Linq;
-using System.Net.Cache;
-using System.Runtime.InteropServices;
using System.Threading;
-using System.Web;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
@@ -2891,33 +2888,19 @@ namespace System.Text.RegularExpressions.Generator
// We're backtracking. Check the timeout.
EmitTimeoutCheckIfNeeded(writer, rm);
- if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal)
+ if (!rtl &&
+ node.N > 1 && // no point in using IndexOf for small loops, in particular optionals
+ subsequent?.FindStartingLiteralNode() is RegexNode literalNode &&
+ TryEmitIndexOf(literalNode, useLast: true, negate: false, out int literalLength, out string indexOfExpr))
{
writer.WriteLine($"if ({startingPos} >= {endingPos} ||");
- (string lastIndexOfName, string lastIndexOfAnyName) = !literal.Negated ?
- ("LastIndexOf", "LastIndexOfAny") :
- ("LastIndexOfAnyExcept", "LastIndexOfAnyExcept");
string setEndingPosCondition = $" ({endingPos} = inputSpan.Slice({startingPos}, ";
- if (literal.String is not null)
- {
- setEndingPosCondition += $"Math.Min(inputSpan.Length, {endingPos} + {literal.String.Length - 1}) - {startingPos}).{lastIndexOfName}({Literal(literal.String)}";
- }
- else
- {
- setEndingPosCondition += $"{endingPos} - {startingPos}).";
- setEndingPosCondition += literal.SetChars is not null ? literal.SetChars.Length switch
- {
- 2 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}",
- 3 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])}",
- _ => $"{lastIndexOfAnyName}({Literal(literal.SetChars)}",
- } :
- literal.Range.LowInclusive == literal.Range.HighInclusive ? $"{lastIndexOfName}({Literal(literal.Range.LowInclusive)}" :
- $"{lastIndexOfAnyName}InRange({Literal(literal.Range.LowInclusive)}, {Literal(literal.Range.HighInclusive)}";
- }
- setEndingPosCondition += ")) < 0)";
+ setEndingPosCondition = literalLength > 1 ?
+ $"{setEndingPosCondition}Math.Min(inputSpan.Length, {endingPos} + {literalLength - 1}) - {startingPos})" :
+ $"{setEndingPosCondition}{endingPos} - {startingPos})";
- using (EmitBlock(writer, setEndingPosCondition))
+ using (EmitBlock(writer, $"{setEndingPosCondition}.{indexOfExpr}) < 0)"))
{
Goto(doneLabel);
}
@@ -3098,7 +3081,7 @@ namespace System.Text.RegularExpressions.Generator
(false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});",
});
}
- else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char
+ else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
{
overlap = literal.Range.LowInclusive == node.Ch;
writer.WriteLine(overlap ?
@@ -3131,26 +3114,13 @@ namespace System.Text.RegularExpressions.Generator
else if (iterationCount is null &&
node.Kind is RegexNodeKind.Setlazy &&
node.Str == RegexCharClass.AnyClass &&
- subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2)
+ subsequent?.FindStartingLiteralNode() is RegexNode literal2 &&
+ TryEmitIndexOf(literal2, useLast: false, negate: false, out _, out string? indexOfExpr))
{
// e.g. ".*?string" with RegexOptions.Singleline
// This lazy loop will consume all characters until the subsequent literal. If the subsequent literal
// isn't found, the loop fails. We can implement it to just search for that literal.
- (string indexOfName, string indexOfAnyName) = !literal2.Negated ?
- ("IndexOf", "IndexOfAny") :
- ("IndexOfAnyExcept", "IndexOfAnyExcept");
- writer.WriteLine($"{startingPos} = {sliceSpan}.");
- writer.WriteLine(
- literal2.String is not null ? $"{indexOfName}({Literal(literal2.String)});" :
- literal2.SetChars is not null ? literal2.SetChars.Length switch
- {
- 2 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])});",
- 3 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])}, {Literal(literal2.SetChars[2])});",
- _ => $"{indexOfAnyName}({Literal(literal2.SetChars)});",
- } :
- literal2.Range.LowInclusive == literal2.Range.HighInclusive ? $"{indexOfName}({Literal(literal2.Range.LowInclusive)});" :
- $"{indexOfAnyName}InRange({Literal(literal2.Range.LowInclusive)}, {Literal(literal2.Range.HighInclusive)});");
-
+ writer.WriteLine($"{startingPos} = {sliceSpan}.{indexOfExpr};");
using (EmitBlock(writer, $"if ({startingPos} < 0)"))
{
Goto(doneLabel);
@@ -3543,6 +3513,15 @@ namespace System.Text.RegularExpressions.Generator
EmitSingleChar(node);
}
}
+ else if (node.IsSetFamily && node.Str == RegexCharClass.AnyClass)
+ {
+ // This is a repeater for anything, which means we only care about length and can jump past that length.
+ if (emitLengthCheck)
+ {
+ EmitSpanLengthCheck(iterations);
+ }
+ sliceStaticPos += iterations;
+ }
else if (iterations <= MaxUnrollSize)
{
// if ((uint)(sliceStaticPos + iterations - 1) >= (uint)slice.Length ||
@@ -3577,20 +3556,37 @@ namespace System.Text.RegularExpressions.Generator
if (emitLengthCheck)
{
EmitSpanLengthCheck(iterations);
+ writer.WriteLine();
}
- string repeaterSpan = "repeaterSlice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything
- writer.WriteLine($"ReadOnlySpan<char> {repeaterSpan} = {sliceSpan}.Slice({sliceStaticPos}, {iterations});");
- using (EmitBlock(writer, $"for (int i = 0; i < {repeaterSpan}.Length; i++)"))
+ // If we're able to vectorize the search, do so. Otherwise, fall back to a loop.
+ // For the loop, we're validating that each char matches the target node.
+ // For IndexOf, we're looking for the first thing that _doesn't_ match the target node,
+ // and thus similarly validating that everything does.
+ if (TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? indexOfExpr))
{
- string tmpTextSpanLocal = sliceSpan; // we want EmitSingleChar to refer to this temporary
- int tmpSliceStaticPos = sliceStaticPos;
- sliceSpan = repeaterSpan;
- sliceStaticPos = 0;
- EmitSingleChar(node, emitLengthCheck: false, offset: "i");
- sliceSpan = tmpTextSpanLocal;
- sliceStaticPos = tmpSliceStaticPos;
+ using (EmitBlock(writer, $"if ({sliceSpan}.Slice({sliceStaticPos}, {iterations}).{indexOfExpr} >= 0)"))
+ {
+ Goto(doneLabel);
+ }
}
+ else
+ {
+ string repeaterSpan = "repeaterSlice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything
+ writer.WriteLine($"ReadOnlySpan<char> {repeaterSpan} = {sliceSpan}.Slice({sliceStaticPos}, {iterations});");
+
+ using (EmitBlock(writer, $"for (int i = 0; i < {repeaterSpan}.Length; i++)"))
+ {
+ string tmpTextSpanLocal = sliceSpan; // we want EmitSingleChar to refer to this temporary
+ int tmpSliceStaticPos = sliceStaticPos;
+ sliceSpan = repeaterSpan;
+ sliceStaticPos = 0;
+ EmitSingleChar(node, emitLengthCheck: false, offset: "i");
+ sliceSpan = tmpTextSpanLocal;
+ sliceStaticPos = tmpSliceStaticPos;
+ }
+ }
+
sliceStaticPos += iterations;
}
}
@@ -3618,9 +3614,6 @@ namespace System.Text.RegularExpressions.Generator
int minIterations = node.M;
int maxIterations = node.N;
bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;
-
- Span<char> setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today
- int numSetChars = 0;
string iterationLocal = ReserveName("iteration");
if (rtl)
@@ -3655,61 +3648,6 @@ namespace System.Text.RegularExpressions.Generator
writer.WriteLine();
}
}
- else if ((node.IsOneFamily || node.IsNotoneFamily) && maxIterations == int.MaxValue)
- {
- // For One or Notone, we're looking for a specific character, as everything until we find
- // it (or its negation in the case of One) is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive,
- // we can use the vectorized IndexOf{AnyExcept} to do the search, rather than open-coding it. The unbounded
- // restriction is purely for simplicity; it could be removed in the future with additional code to
- // handle the unbounded case.
-
- writer.Write($"int {iterationLocal} = {sliceSpan}");
- if (sliceStaticPos > 0)
- {
- writer.Write($".Slice({sliceStaticPos})");
- }
- string op = node.IsNotoneFamily ? "IndexOf" : "IndexOfAnyExcept";
- writer.WriteLine($".{op}({Literal(node.Ch)});");
-
- using (EmitBlock(writer, $"if ({iterationLocal} < 0)"))
- {
- writer.WriteLine(sliceStaticPos > 0 ?
- $"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" :
- $"{iterationLocal} = {sliceSpan}.Length;");
- }
- writer.WriteLine();
- }
- else if (node.IsSetFamily &&
- maxIterations == int.MaxValue &&
- (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
- {
- // If the set contains only a few characters (if it contained 1 and was negated, it should
- // have been reduced to a Notone), we can use an IndexOfAny{Except} to find any of the target characters.
- // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity.
- Debug.Assert(numSetChars > 1);
-
- writer.Write($"int {iterationLocal} = {sliceSpan}");
- if (sliceStaticPos != 0)
- {
- writer.Write($".Slice({sliceStaticPos})");
- }
- writer.WriteLine((numSetChars, RegexCharClass.IsNegated(node.Str!)) switch
- {
- (2, true) => $".IndexOfAny({Literal(setChars[0])}, {Literal(setChars[1])});",
- (3, true) => $".IndexOfAny({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});",
- (_, true) => $".IndexOfAny({Literal(setChars.Slice(0, numSetChars).ToString())});",
- (2, false) => $".IndexOfAnyExcept({Literal(setChars[0])}, {Literal(setChars[1])});",
- (3, false) => $".IndexOfAnyExcept({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});",
- (_, false) => $".IndexOfAnyExcept({Literal(setChars.Slice(0, numSetChars).ToString())});",
- });
- using (EmitBlock(writer, $"if ({iterationLocal} < 0)"))
- {
- writer.WriteLine(sliceStaticPos > 0 ?
- $"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" :
- $"{iterationLocal} = {sliceSpan}.Length;");
- }
- writer.WriteLine();
- }
else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass)
{
// .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end.
@@ -3718,20 +3656,18 @@ namespace System.Text.RegularExpressions.Generator
TransferSliceStaticPosToPos();
writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;");
}
- else if (node.IsSetFamily &&
- maxIterations == int.MaxValue &&
- RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive))
+ else if (maxIterations == int.MaxValue && TryEmitIndexOf(node, useLast: false, negate: true, out _, out string indexOfExpr))
{
- // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters.
- // As with the cases above, the unbounded constraint is purely for simplicity.
- string indexOfMethod = RegexCharClass.IsNegated(node.Str!) ? "IndexOfAnyInRange" : "IndexOfAnyExceptInRange";
+ // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
+ // purely for simplicity; it could be removed in the future with additional code to handle that case.
writer.Write($"int {iterationLocal} = {sliceSpan}");
if (sliceStaticPos != 0)
{
writer.Write($".Slice({sliceStaticPos})");
}
- writer.WriteLine($".{indexOfMethod}({Literal(rangeLowInclusive)}, {Literal(rangeHighInclusive)});");
+ writer.WriteLine($".{indexOfExpr};");
+
using (EmitBlock(writer, $"if ({iterationLocal} < 0)"))
{
writer.WriteLine(sliceStaticPos > 0 ?
@@ -3745,14 +3681,9 @@ namespace System.Text.RegularExpressions.Generator
// For everything else, do a normal loop.
string expr = $"{sliceSpan}[{iterationLocal}]";
- if (node.IsSetFamily)
- {
- expr = MatchCharacterClass(options, expr, node.Str!, negate: false, additionalDeclarations, requiredHelpers);
- }
- else
- {
- expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}";
- }
+ expr = node.IsSetFamily ?
+ MatchCharacterClass(options, expr, node.Str!, negate: false, additionalDeclarations, requiredHelpers) :
+ $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}";
if (minIterations != 0 || maxIterations != int.MaxValue)
{
@@ -4348,6 +4279,85 @@ namespace System.Text.RegularExpressions.Generator
}
}
+ /// <summary>Attempts to build the text of an IndexOf-style span call that searches for what the node matches.</summary>
+ /// <param name="node">The RegexNode to search for. For a loop, only its one/notone/set character aspect is considered.</param>
+ /// <param name="useLast">true to produce the "Last"-prefixed method names; false to produce the forward-searching names.</param>
+ /// <param name="negate">true to search for the opposite of what the node matches.</param>
+ /// <param name="literalLength">0 when false is returned; otherwise the string length for a multi, or 1 for every other supported node.</param>
+ /// <param name="indexOfExpr">The call expression (method name plus argument list) when true is returned; otherwise, null.</param>
+ /// <returns>true if an expression was produced; otherwise, false.</returns>
+ private static bool TryEmitIndexOf(
+     RegexNode node,
+     bool useLast, bool negate,
+     out int literalLength, [NotNullWhen(true)] out string? indexOfExpr)
+ {
+     // Prepended to every method name generated below.
+     string prefix = useLast ? "Last" : "";
+
+     // Multi: search for the whole string.
+     if (node.Kind == RegexNodeKind.Multi)
+     {
+         Debug.Assert(!negate, "Negation isn't appropriate for a multi");
+         literalLength = node.Str.Length;
+         indexOfExpr = $"{prefix}IndexOf({Literal(node.Str)})";
+         return true;
+     }
+
+     // One: search for the char, or for anything except it when negated.
+     if (node.IsOneFamily)
+     {
+         string method = negate ? "IndexOfAnyExcept" : "IndexOf";
+         literalLength = 1;
+         indexOfExpr = $"{prefix}{method}({Literal(node.Ch)})";
+         return true;
+     }
+
+     // Notone: the mirror image of One, so the requested negation flips the other way.
+     if (node.IsNotoneFamily)
+     {
+         string method = negate ? "IndexOf" : "IndexOfAnyExcept";
+         literalLength = 1;
+         indexOfExpr = $"{prefix}{method}({Literal(node.Ch)})";
+         return true;
+     }
+
+     if (node.IsSetFamily)
+     {
+         // The set's own negation and the requested negation cancel each other out.
+         bool useExcept = RegexCharClass.IsNegated(node.Str) ^ negate;
+
+         Span<char> buffer = stackalloc char[5]; // current max that's vectorized
+         int count = RegexCharClass.GetSetChars(node.Str, buffer);
+         if (count > 0)
+         {
+             // The set was enumerated into a handful of chars; pass them individually for up to three,
+             // and as a single string beyond that.
+             Span<char> chars = buffer.Slice(0, count);
+             string singleName = useExcept ? "IndexOfAnyExcept" : "IndexOf";
+             string anyName = useExcept ? "IndexOfAnyExcept" : "IndexOfAny";
+
+             literalLength = 1;
+             indexOfExpr = count switch
+             {
+                 1 => $"{prefix}{singleName}({Literal(chars[0])})",
+                 2 => $"{prefix}{anyName}({Literal(chars[0])}, {Literal(chars[1])})",
+                 3 => $"{prefix}{anyName}({Literal(chars[0])}, {Literal(chars[1])}, {Literal(chars[2])})",
+                 _ => $"{prefix}{anyName}({Literal(chars.ToString())})",
+             };
+             return true;
+         }
+
+         if (RegexCharClass.TryGetSingleRange(node.Str, out char low, out char high))
+         {
+             // The set is one contiguous range; use the range-based search.
+             string rangeName = useExcept ? "IndexOfAnyExceptInRange" : "IndexOfAnyInRange";
+
+             literalLength = 1;
+             indexOfExpr = $"{prefix}{rangeName}({Literal(low)}, {Literal(high)})";
+             return true;
+         }
+     }
+
+     // Nothing above applied, so no expression can be produced.
+     literalLength = 0;
+     indexOfExpr = null;
+     return false;
+ }
+
private static string MatchCharacterClass(RegexOptions options, string chExpr, string charClass, bool negate, HashSet<string> additionalDeclarations, Dictionary<string, string[]> requiredHelpers)
{
// We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass),
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index 2eee61f47e2..0ed046e282f 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -2792,7 +2792,7 @@ namespace System.Text.RegularExpressions
// if (loadedChar != ch) goto doneLabel;
if (node.IsSetFamily)
{
- EmitMatchCharacterClass(node.Str!);
+ EmitMatchCharacterClass(node.Str);
BrfalseFar(doneLabel);
}
else
@@ -3176,7 +3176,10 @@ namespace System.Text.RegularExpressions
BleFar(doneLabel);
}
- if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal)
+ if (!rtl &&
+ node.N > 1 &&
+ subsequent?.FindStartingLiteralNode() is RegexNode literal &&
+ CanEmitIndexOf(literal, out int literalLength))
{
// endingPos = inputSpan.Slice(startingPos, Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos).LastIndexOf(literal);
// if (endingPos < 0)
@@ -3185,65 +3188,28 @@ namespace System.Text.RegularExpressions
// }
Ldloca(inputSpan);
Ldloc(startingPos);
- if (literal.String is not null)
+ if (literalLength > 1)
{
- Debug.Assert(!literal.Negated, "strings should not be negated");
+ // Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos
Ldloca(inputSpan);
Call(s_spanGetLengthMethod);
Ldloc(endingPos);
- Ldc(literal.String.Length - 1);
+ Ldc(literalLength - 1);
Add();
Call(s_mathMinIntInt);
- Ldloc(startingPos);
- Sub();
- Call(s_spanSliceIntIntMethod);
- Ldstr(literal.String);
- Call(s_stringAsSpanMethod);
- Call(s_spanLastIndexOfSpan);
}
else
{
+ // endingPos - startingPos
Ldloc(endingPos);
- Ldloc(startingPos);
- Sub();
- Call(s_spanSliceIntIntMethod);
- if (literal.SetChars is not null)
- {
- switch (literal.SetChars.Length)
- {
- case 2:
- Ldc(literal.SetChars[0]);
- Ldc(literal.SetChars[1]);
- Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharChar : s_spanLastIndexOfAnyCharChar);
- break;
-
- case 3:
- Ldc(literal.SetChars[0]);
- Ldc(literal.SetChars[1]);
- Ldc(literal.SetChars[2]);
- Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharCharChar : s_spanLastIndexOfAnyCharCharChar);
- break;
-
- default:
- Ldstr(literal.SetChars);
- Call(s_stringAsSpanMethod);
- Call(literal.Negated ? s_spanLastIndexOfAnyExceptSpan : s_spanLastIndexOfAnySpan);
- break;
- }
- }
- else if (literal.Range.LowInclusive == literal.Range.HighInclusive)
- {
- Ldc(literal.Range.LowInclusive);
- Call(literal.Negated ? s_spanLastIndexOfAnyExceptChar : s_spanLastIndexOfChar);
- }
- else
- {
- Ldc(literal.Range.LowInclusive);
- Ldc(literal.Range.HighInclusive);
- Call(literal.Negated ? s_spanLastIndexOfAnyExceptInRange : s_spanLastIndexOfAnyInRange);
- }
}
+ Ldloc(startingPos);
+ Sub();
+ Call(s_spanSliceIntIntMethod);
+
+ EmitIndexOf(literal, useLast: true, negate: false);
Stloc(endingPos);
+
Ldloc(endingPos);
Ldc(0);
BltFar(doneLabel);
@@ -3487,7 +3453,7 @@ namespace System.Text.RegularExpressions
break;
}
}
- else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // char literal
+ else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One
{
overlap = literal.Range.LowInclusive == node.Ch;
if (overlap)
@@ -3557,7 +3523,8 @@ namespace System.Text.RegularExpressions
iterationCount is null &&
node.Kind is RegexNodeKind.Setlazy &&
node.Str == RegexCharClass.AnyClass &&
- subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2)
+ subsequent?.FindStartingLiteralNode() is RegexNode literal2 &&
+ CanEmitIndexOf(literal2, out _))
{
// e.g. ".*?string" with RegexOptions.Singleline
// This lazy loop will consume all characters until the subsequent literal. If the subsequent literal
@@ -3565,50 +3532,7 @@ namespace System.Text.RegularExpressions
// startingPos = slice.IndexOf(literal);
Ldloc(slice);
- if (literal2.String is not null)
- {
- Debug.Assert(!literal2.Negated, "strings should not be negated");
- Ldstr(literal2.String);
- Call(s_stringAsSpanMethod);
- Call(s_spanIndexOfSpan);
- }
- else if (literal2.SetChars is not null)
- {
- switch (literal2.SetChars.Length)
- {
- case 2:
- Ldc(literal2.SetChars[0]);
- Ldc(literal2.SetChars[1]);
- Call(literal2.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar);
- break;
-
- case 3:
- Ldc(literal2.SetChars[0]);
- Ldc(literal2.SetChars[1]);
- Ldc(literal2.SetChars[2]);
- Call(literal2.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
- break;
-
- default:
- Ldstr(literal2.SetChars);
- Call(s_stringAsSpanMethod);
- Call(literal2.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
- break;
- }
- }
- else
- {
- Ldc(literal2.Range.LowInclusive);
- if (literal2.Range.LowInclusive == literal2.Range.HighInclusive)
- {
- Call(literal2.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar);
- }
- else
- {
- Ldc(literal2.Range.HighInclusive);
- Call(literal2.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange);
- }
- }
+ // Search for the literal that follows the loop (the node CanEmitIndexOf was checked against above),
+ // not the lazy loop's own node.
+ EmitIndexOf(literal2, useLast: false, negate: false);
Stloc(startingPos);
// if (startingPos < 0) goto doneLabel;
@@ -4114,6 +4038,13 @@ namespace System.Text.RegularExpressions
EmitSpanLengthCheck(iterations);
}
+ // If this is a repeater for anything, we only care about length and can jump past that length.
+ if (node.IsSetFamily && node.Str == RegexCharClass.AnyClass)
+ {
+ sliceStaticPos += iterations;
+ return;
+ }
+
// Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated
// code with other costs, like the (small) overhead of slicing to create the temp span to iterate.
const int MaxUnrollSize = 16;
@@ -4132,48 +4063,61 @@ namespace System.Text.RegularExpressions
else
{
// ReadOnlySpan<char> tmp = slice.Slice(sliceStaticPos, iterations);
- // for (int i = 0; i < tmp.Length; i++)
- // {
- // TimeoutCheck();
- // if (tmp[i] != ch) goto Done;
- // }
- // sliceStaticPos += iterations;
-
- Label conditionLabel = DefineLabel();
- Label bodyLabel = DefineLabel();
-
- using RentedLocalBuilder spanLocal = RentReadOnlySpanCharLocal();
Ldloca(slice);
Ldc(sliceStaticPos);
Ldc(iterations);
Call(s_spanSliceIntIntMethod);
- Stloc(spanLocal);
- using RentedLocalBuilder iterationLocal = RentInt32Local();
- Ldc(0);
- Stloc(iterationLocal);
- BrFar(conditionLabel);
+ // If we're able to vectorize the search, do so. Otherwise, fall back to a loop.
+ // For the loop, we're validating that each char matches the target node.
+ // For IndexOf, we're looking for the first thing that _doesn't_ match the target node,
+ // and thus similarly validating that everything does.
+ if (CanEmitIndexOf(node, out _))
+ {
+ // if (tmp.IndexOf(...) >= 0) goto doneLabel;
+ EmitIndexOf(node, useLast: false, negate: true);
+ Ldc(0);
+ BgeFar(doneLabel);
+ }
+ else
+ {
+ using RentedLocalBuilder spanLocal = RentReadOnlySpanCharLocal();
+ Stloc(spanLocal);
- MarkLabel(bodyLabel);
+ // for (int i = 0; i < tmp.Length; i++)
+ // {
+ // if (tmp[i] != ch) goto Done;
+ // }
- LocalBuilder tmpTextSpanLocal = slice; // we want EmitSingleChar to refer to this temporary
- int tmpTextSpanPos = sliceStaticPos;
- slice = spanLocal;
- sliceStaticPos = 0;
- EmitSingleChar(node, emitLengthCheck: false, offset: iterationLocal);
- slice = tmpTextSpanLocal;
- sliceStaticPos = tmpTextSpanPos;
+ Label conditionLabel = DefineLabel();
+ Label bodyLabel = DefineLabel();
- Ldloc(iterationLocal);
- Ldc(1);
- Add();
- Stloc(iterationLocal);
+ using RentedLocalBuilder iterationLocal = RentInt32Local();
+ Ldc(0);
+ Stloc(iterationLocal);
+ BrFar(conditionLabel);
- MarkLabel(conditionLabel);
- Ldloc(iterationLocal);
- Ldloca(spanLocal);
- Call(s_spanGetLengthMethod);
- BltFar(bodyLabel);
+ MarkLabel(bodyLabel);
+
+ LocalBuilder tmpTextSpanLocal = slice; // we want EmitSingleChar to refer to this temporary
+ int tmpTextSpanPos = sliceStaticPos;
+ slice = spanLocal;
+ sliceStaticPos = 0;
+ EmitSingleChar(node, emitLengthCheck: false, offset: iterationLocal);
+ slice = tmpTextSpanLocal;
+ sliceStaticPos = tmpTextSpanPos;
+
+ Ldloc(iterationLocal);
+ Ldc(1);
+ Add();
+ Stloc(iterationLocal);
+
+ MarkLabel(conditionLabel);
+ Ldloc(iterationLocal);
+ Ldloca(spanLocal);
+ Call(s_spanGetLengthMethod);
+ BltFar(bodyLabel);
+ }
sliceStaticPos += iterations;
}
@@ -4202,14 +4146,9 @@ namespace System.Text.RegularExpressions
int minIterations = node.M;
int maxIterations = node.N;
bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;
-
using RentedLocalBuilder iterationLocal = RentInt32Local();
-
Label atomicLoopDoneLabel = DefineLabel();
- Span<char> setChars = stackalloc char[5]; // max optimized by IndexOfAny today
- int numSetChars = 0;
-
if (rtl)
{
TransferSliceStaticPosToPos(); // we don't use static position for rtl
@@ -4242,7 +4181,7 @@ namespace System.Text.RegularExpressions
LdindU2();
if (node.IsSetFamily)
{
- EmitMatchCharacterClass(node.Str!);
+ EmitMatchCharacterClass(node.Str);
BrfalseFar(atomicLoopDoneLabel);
}
else
@@ -4277,103 +4216,6 @@ namespace System.Text.RegularExpressions
BrFar(bodyLabel);
}
}
- else if ((node.IsOneFamily || node.IsNotoneFamily) && maxIterations == int.MaxValue)
- {
- // For One or Notone, we're looking for a specific character, as everything until we find
- // it (or its negation in the case of One) is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive,
- // we can use the vectorized IndexOf{AnyExcept} to do the search, rather than open-coding it. The unbounded
- // restriction is purely for simplicity; it could be removed in the future with additional code to
- // handle the unbounded case.
-
- // int i = slice.Slice(sliceStaticPos).IndexOf(char);
- if (sliceStaticPos > 0)
- {
- Ldloca(slice);
- Ldc(sliceStaticPos);
- Call(s_spanSliceIntMethod);
- }
- else
- {
- Ldloc(slice);
- }
- Ldc(node.Ch);
- Call(node.IsNotoneFamily ? s_spanIndexOfChar : s_spanIndexOfAnyExceptChar);
- Stloc(iterationLocal);
-
- // if (i >= 0) goto atomicLoopDoneLabel;
- Ldloc(iterationLocal);
- Ldc(0);
- BgeFar(atomicLoopDoneLabel);
-
- // i = slice.Length - sliceStaticPos;
- Ldloca(slice);
- Call(s_spanGetLengthMethod);
- if (sliceStaticPos > 0)
- {
- Ldc(sliceStaticPos);
- Sub();
- }
- Stloc(iterationLocal);
- }
- else if (node.IsSetFamily &&
- maxIterations == int.MaxValue &&
- (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
- {
- // If the set contains only a few characters (if it contained 1 and was negated, it should
- // have been reduced to a Notone), we can use an IndexOfAny{Except} to find any of the target characters.
- // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity.
- Debug.Assert(numSetChars > 1);
- bool negated = RegexCharClass.IsNegated(node.Str!);
-
- // int i = slice.Slice(sliceStaticPos).IndexOfAny(ch1, ch2, ...);
- if (sliceStaticPos > 0)
- {
- Ldloca(slice);
- Ldc(sliceStaticPos);
- Call(s_spanSliceIntMethod);
- }
- else
- {
- Ldloc(slice);
- }
- switch (numSetChars)
- {
- case 2:
- Ldc(setChars[0]);
- Ldc(setChars[1]);
- Call(negated ? s_spanIndexOfAnyCharChar : s_spanIndexOfAnyExceptCharChar);
- break;
-
- case 3:
- Ldc(setChars[0]);
- Ldc(setChars[1]);
- Ldc(setChars[2]);
- Call(negated ? s_spanIndexOfAnyCharCharChar : s_spanIndexOfAnyExceptCharCharChar);
- break;
-
- default:
- Ldstr(setChars.Slice(0, numSetChars).ToString());
- Call(s_stringAsSpanMethod);
- Call(negated ? s_spanIndexOfAnySpan : s_spanIndexOfAnyExceptSpan);
- break;
- }
- Stloc(iterationLocal);
-
- // if (i >= 0) goto atomicLoopDoneLabel;
- Ldloc(iterationLocal);
- Ldc(0);
- BgeFar(atomicLoopDoneLabel);
-
- // i = slice.Length - sliceStaticPos;
- Ldloca(slice);
- Call(s_spanGetLengthMethod);
- if (sliceStaticPos > 0)
- {
- Ldc(sliceStaticPos);
- Sub();
- }
- Stloc(iterationLocal);
- }
else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass)
{
// .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end.
@@ -4387,14 +4229,12 @@ namespace System.Text.RegularExpressions
Sub();
Stloc(iterationLocal);
}
- else if (node.IsSetFamily &&
- maxIterations == int.MaxValue &&
- RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive))
+ else if (maxIterations == int.MaxValue && CanEmitIndexOf(node, out _))
{
- // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters.
- // As with the cases above, the unbounded constraint is purely for simplicity.
+ // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is
+ // purely for simplicity; it could be removed in the future with additional code to handle that case.
- // int i = slice.Slice(sliceStaticPos).IndexOfAny{Except}InRange(rangeLowInclusive, rangeHighInclusive);
+ // int i = slice.Slice(sliceStaticPos).IndexOf(...);
if (sliceStaticPos > 0)
{
Ldloca(slice);
@@ -4405,9 +4245,8 @@ namespace System.Text.RegularExpressions
{
Ldloc(slice);
}
- Ldc(rangeLowInclusive);
- Ldc(rangeHighInclusive);
- Call(RegexCharClass.IsNegated(node.Str!) ? s_spanIndexOfAnyInRange : s_spanIndexOfAnyExceptInRange);
+
+ EmitIndexOf(node, useLast: false, negate: true);
Stloc(iterationLocal);
// if (i >= 0) goto atomicLoopDoneLabel;
@@ -4457,7 +4296,7 @@ namespace System.Text.RegularExpressions
LdindU2();
if (node.IsSetFamily)
{
- EmitMatchCharacterClass(node.Str!);
+ EmitMatchCharacterClass(node.Str);
BrfalseFar(atomicLoopDoneLabel);
}
else
@@ -4579,7 +4418,7 @@ namespace System.Text.RegularExpressions
LdindU2();
if (node.IsSetFamily)
{
- EmitMatchCharacterClass(node.Str!);
+ EmitMatchCharacterClass(node.Str);
BrfalseFar(skipUpdatesLabel);
}
else
@@ -5013,6 +4852,175 @@ namespace System.Text.RegularExpressions
}
}
+ // <summary>Gets whether an IndexOf expression can be emitted for the node.</summary>
+ // <param name="node">The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in.</param>
+ // <param name="literalLength">0 if this method returns false. If it returns true, the length of the node's string for a multi, otherwise 1.</param>
+ // <returns>true if an IndexOf can be emitted; otherwise, false.</returns>
+ bool CanEmitIndexOf(RegexNode node, out int literalLength)
+ {
+ if (node.Kind == RegexNodeKind.Multi)
+ {
+ literalLength = node.Str!.Length;
+ return true;
+ }
+
+ if (node.IsOneFamily || node.IsNotoneFamily)
+ {
+ literalLength = 1;
+ return true;
+ }
+
+ if (node.IsSetFamily)
+ {
+ Span<char> setChars = stackalloc char[5]; // current max that's vectorized
+ int setCharsCount;
+ if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
+ {
+ literalLength = 1;
+ return true;
+ }
+
+ if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+ {
+ literalLength = 1;
+ return true;
+ }
+ }
+
+ literalLength = 0;
+ return false;
+ }
+
+ // <summary>Emits the code for an IndexOf call based on the node.</summary>
+ // <param name="node">The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in.</param>
+ // <param name="useLast">true to use LastIndexOf variants; false to use IndexOf variants.</param>
+ // <param name="negate">true to search for characters that do not match the node rather than ones that do; must be false for a multi.</param>
+ void EmitIndexOf(RegexNode node, bool useLast, bool negate)
+ {
+ if (node.Kind == RegexNodeKind.Multi)
+ {
+ // IndexOf(span)
+ Debug.Assert(!negate, "Negation isn't appropriate for a multi");
+ Ldstr(node.Str!);
+ Call(s_stringAsSpanMethod);
+ Call(useLast ? s_spanLastIndexOfSpan : s_spanIndexOfSpan);
+ return;
+ }
+
+ if (node.IsOneFamily || node.IsNotoneFamily)
+ {
+ // IndexOf{AnyExcept}(char)
+
+ if (node.IsNotoneFamily)
+ {
+ negate = !negate;
+ }
+
+ Ldc(node.Ch);
+ Call((useLast, negate) switch
+ {
+ (false, false) => s_spanIndexOfChar,
+ (false, true) => s_spanIndexOfAnyExceptChar,
+ (true, false) => s_spanLastIndexOfChar,
+ (true, true) => s_spanLastIndexOfAnyExceptChar,
+ });
+ return;
+ }
+
+ if (node.IsSetFamily)
+ {
+ bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
+
+ // IndexOfAny{Except}(ch1, ...)
+ Span<char> setChars = stackalloc char[5]; // current max that's vectorized
+ int setCharsCount;
+ if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
+ {
+ setChars = setChars.Slice(0, setCharsCount);
+ switch (setChars.Length)
+ {
+ case 1:
+ Ldc(setChars[0]);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfChar,
+ (false, true) => s_spanIndexOfAnyExceptChar,
+ (true, false) => s_spanLastIndexOfChar,
+ (true, true) => s_spanLastIndexOfAnyExceptChar,
+ });
+ return;
+
+ case 2:
+ Ldc(setChars[0]);
+ Ldc(setChars[1]);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfAnyCharChar,
+ (false, true) => s_spanIndexOfAnyExceptCharChar,
+ (true, false) => s_spanLastIndexOfAnyCharChar,
+ (true, true) => s_spanLastIndexOfAnyExceptCharChar,
+ });
+ return;
+
+ case 3:
+ Ldc(setChars[0]);
+ Ldc(setChars[1]);
+ Ldc(setChars[2]);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfAnyCharCharChar,
+ (false, true) => s_spanIndexOfAnyExceptCharCharChar,
+ (true, false) => s_spanLastIndexOfAnyCharCharChar,
+ (true, true) => s_spanLastIndexOfAnyExceptCharCharChar,
+ });
+ return;
+
+ default:
+ Ldstr(setChars.ToString());
+ Call(s_stringAsSpanMethod);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfAnySpan,
+ (false, true) => s_spanIndexOfAnyExceptSpan,
+ (true, false) => s_spanLastIndexOfAnySpan,
+ (true, true) => s_spanLastIndexOfAnyExceptSpan,
+ });
+ return;
+ }
+ }
+
+ // IndexOfAny{Except}InRange
+ if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+ {
+ if (lowInclusive == highInclusive)
+ {
+ Ldc(lowInclusive);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfChar,
+ (false, true) => s_spanIndexOfAnyExceptChar,
+ (true, false) => s_spanLastIndexOfChar,
+ (true, true) => s_spanLastIndexOfAnyExceptChar,
+ });
+ return;
+ }
+
+ Ldc(lowInclusive);
+ Ldc(highInclusive);
+ Call((useLast, negated) switch
+ {
+ (false, false) => s_spanIndexOfAnyInRange,
+ (false, true) => s_spanIndexOfAnyExceptInRange,
+ (true, false) => s_spanLastIndexOfAnyInRange,
+ (true, true) => s_spanLastIndexOfAnyExceptInRange,
+ });
+ return;
+ }
+ }
+
+ Debug.Fail("We should never get here. This method should only be called if CanEmitIndexOf returned true, and all of the same cases should be covered.");
+ }
+
// <summary>
// If the expression contains captures, pops a crawl position from the stack and uncaptures
// until that position is reached.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index d2eef1c622f..80ec75bda88 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -1381,10 +1381,8 @@ namespace System.Text.RegularExpressions
/// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
/// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
/// </returns>
- public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
+ public RegexNode? FindStartingLiteralNode()
{
- Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
-
RegexNode? node = this;
while (true)
{
@@ -1394,31 +1392,12 @@ namespace System.Text.RegularExpressions
{
case RegexNodeKind.One:
case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when node.M > 0:
- return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false);
-
case RegexNodeKind.Notone:
case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when node.M > 0:
- return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true);
-
- case RegexNodeKind.Multi:
- return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false);
-
case RegexNodeKind.Set:
case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when node.M > 0:
- Span<char> setChars = stackalloc char[maxSetCharacters];
- int numChars;
- if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
- {
- setChars = setChars.Slice(0, numChars);
- return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!));
- }
-
- if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive))
- {
- Debug.Assert(lowInclusive < highInclusive);
- return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!));
- }
- break;
+ case RegexNodeKind.Multi:
+ return node;
case RegexNodeKind.Atomic:
case RegexNodeKind.Concatenate:
@@ -1435,6 +1414,49 @@ namespace System.Text.RegularExpressions
}
}
+ /// <summary>Finds the guaranteed beginning literal(s) of the node, or null if none exists.</summary>
+ /// <returns>
+ /// A tuple of data about the literal: only one of the Range/String/SetChars values is relevant.
+ /// The Negated value indicates whether the Range/SetChars should be considered exclusionary.
+ /// </returns>
+ public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
+ {
+ Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
+
+ if (FindStartingLiteralNode() is RegexNode node)
+ {
+ switch (node.Kind)
+ {
+ case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy:
+ return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false);
+
+ case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy:
+ return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true);
+
+ case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy:
+ Span<char> setChars = stackalloc char[maxSetCharacters];
+ int numChars;
+ if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
+ {
+ setChars = setChars.Slice(0, numChars);
+ return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!));
+ }
+
+ if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive))
+ {
+ Debug.Assert(lowInclusive < highInclusive);
+ return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!));
+ }
+ break;
+
+ case RegexNodeKind.Multi:
+ return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false);
+ }
+ }
+
+ return null;
+ }
+
/// <summary>Data about a starting literal as returned by <see cref="FindStartingLiteral"/>.</summary>
public readonly struct StartingLiteralData
{
@@ -2767,6 +2789,7 @@ namespace System.Text.RegularExpressions
}
/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
+ [MemberNotNullWhen(true, nameof(Str))]
public bool IsSetFamily => Kind is RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy;
/// <summary>Gets whether the node is a One/Oneloop/Oneloopatomic/Onelazy node.</summary>