Welcome to mirror list, hosted at ThFree Co, Russian Federation.

RegexGenerator.cs « gen « System.Text.RegularExpressions « libraries « src - github.com/dotnet/runtime.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 15645d72fd7a3fbe6f584952a225b354c2c3bb33 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.CodeDom.Compiler;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;

[assembly: System.Resources.NeutralResourcesLanguage("en-us")]

namespace System.Text.RegularExpressions.Generator
{
    /// <summary>Generates C# source code to implement regular expressions.</summary>
    [Generator(LanguageNames.CSharp)]
    public partial class RegexGenerator : IIncrementalGenerator
    {
        /// <summary>Name of the generated static class that holds helper methods requested by generated regex implementations.</summary>
        private const string HelpersTypeName = "Utilities";
        /// <summary>Namespace that wraps the generated Regex-derived types and the helpers class.</summary>
        private const string GeneratedNamespace = "System.Text.RegularExpressions.Generated";
        /// <summary>Text of the [GeneratedCode] attribute (without the surrounding brackets), built from this generator assembly's name and version.</summary>
        private static readonly string s_generatedCodeAttribute =
            "GeneratedCodeAttribute(\"" + typeof(RegexGenerator).Assembly.GetName().Name + "\", \"" + typeof(RegexGenerator).Assembly.GetName().Version + "\")";
        /// <summary>Lines written verbatim at the very top of the generated file: the auto-generated marker, nullable enablement, and warning suppressions.</summary>
        private static readonly string[] s_headers =
        {
            "// <auto-generated/>",
            "#nullable enable",
            "#pragma warning disable CS0162 // Unreachable code",
            "#pragma warning disable CS0164 // Unreferenced label",
            "#pragma warning disable CS0219 // Variable assigned but never used",
        };

        /// <summary>
        /// Wires up the incremental pipeline: locates methods carrying the generated-regex attribute, produces
        /// per-regex generated code or diagnostics, and registers the step that writes everything into a single
        /// "RegexGenerator.g.cs" source file.
        /// </summary>
        /// <param name="context">The initialization context used to build the pipeline and register the source output.</param>
        public void Initialize(IncrementalGeneratorInitializationContext context)
        {
            // Produces one entry per generated regex.  This may be:
            // - Diagnostic in the case of a failure that should end the compilation
            // - (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers) in the case of valid regex
            // - (RegexMethod regexMethod, string reason, Diagnostic diagnostic) in the case of a limited-support regex
            IncrementalValueProvider<ImmutableArray<object>> codeOrDiagnostics =
                context.SyntaxProvider

                // Find all MethodDeclarationSyntax nodes attributed with GeneratedRegex and gather the required information.
                .ForAttributeWithMetadataName(
                    GeneratedRegexAttributeName,
                    static (node, _) => node is MethodDeclarationSyntax,
                    GetSemanticTargetForGeneration)
                .Where(static m => m is not null)

                // Generate the RunnerFactory for each regex, if possible.  This is where the bulk of the implementation occurs.
                .Select((state, _) =>
                {
                    // Anything that isn't a RegexMethod is a Diagnostic produced while gathering; pass it through untouched.
                    if (state is not RegexMethod regexMethod)
                    {
                        Debug.Assert(state is Diagnostic);
                        return state;
                    }

                    // If we're unable to generate a full implementation for this regex, report a diagnostic.
                    // We'll still output a limited implementation that just caches a new Regex(...).
                    if (!SupportsCodeGeneration(regexMethod, out string? reason))
                    {
                        return (regexMethod, reason, Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, regexMethod.MethodSyntax.GetLocation()));
                    }

                    // Generate the core logic for the regex.  The text is written two indent levels deep
                    // and stored as a string alongside the helpers it asked for.
                    Dictionary<string, string[]> requiredHelpers = new();
                    var sw = new StringWriter();
                    var writer = new IndentedTextWriter(sw);
                    writer.Indent += 2;
                    writer.WriteLine();
                    EmitRegexDerivedTypeRunnerFactory(writer, regexMethod, requiredHelpers);
                    writer.Indent -= 2;
                    return (regexMethod, sw.ToString(), requiredHelpers);
                })
                .Collect();

            // To avoid invalidating every regex's output when anything from the compilation changes,
            // we extract from it the only thing the output step reads: whether unsafe code is allowed.
            // Only that flag is then fed into RegisterSourceOutput along with all of the cached
            // generated data from each regex.
            IncrementalValueProvider<bool> allowUnsafeProvider =
                context.CompilationProvider
                .Select(static (compilation, _) => compilation.Options is CSharpCompilationOptions { AllowUnsafe: true });

            // When there is something to output, take all the generated strings and concatenate them to output,
            // and raise all of the created diagnostics.
            context.RegisterSourceOutput(codeOrDiagnostics.Combine(allowUnsafeProvider), static (context, resultsAndAllowUnsafe) =>
            {
                ImmutableArray<object> results = resultsAndAllowUnsafe.Left;
                bool allowUnsafe = resultsAndAllowUnsafe.Right;

                // Report any top-level diagnostics.  If every entry was a diagnostic (or there were
                // no entries at all), there is no code to emit and no source file is added.
                bool allFailures = true;
                foreach (object result in results)
                {
                    if (result is Diagnostic d)
                    {
                        context.ReportDiagnostic(d);
                    }
                    else
                    {
                        allFailures = false;
                    }
                }
                if (allFailures)
                {
                    return;
                }

                // At this point we'll be emitting code.  Create a writer to hold it all.
                var sw = new StringWriter();
                IndentedTextWriter writer = new(sw);

                // Add the file headers.
                foreach (string header in s_headers)
                {
                    writer.WriteLine(header);
                }
                writer.WriteLine();

                // For every generated type, we give it an incrementally increasing ID, in order to create
                // unique type names even in situations where method names were the same, while also keeping
                // the type names short.  Note that this is why we only generate the RunnerFactory implementations
                // earlier in the pipeline... we want to avoid generating code that relies on the class names
                // until we're able to iterate through them linearly keeping track of a deterministic ID
                // used to name them.  The boilerplate code generation that happens here is minimal when compared to
                // the work required to generate the actual matching code for the regex.
                int id = 0;

                // To minimize generated code in the event of duplicated regexes, we only emit one derived Regex type per unique
                // expression/options/timeout.  A Dictionary<(expression, options, timeout), RegexMethod> is used to deduplicate, where the value of the
                // pair is the implementation used for the key.
                var emittedExpressions = new Dictionary<(string Pattern, RegexOptions Options, int? Timeout), RegexMethod>();

                // If we have any (RegexMethod regexMethod, string reason, Diagnostic diagnostic), these are regexes for which we have
                // limited support and need to simply output boilerplate.  We need to emit their diagnostics.
                // If we have any (RegexMethod regexMethod, string runnerFactoryImplementation, Dictionary<string, string[]> requiredHelpers),
                // those are generated implementations to be emitted.  We need to gather up their required helpers.
                Dictionary<string, string[]> requiredHelpers = new();
                foreach (object? result in results)
                {
                    RegexMethod? regexMethod = null;
                    if (result is ValueTuple<RegexMethod, string, Diagnostic> limitedSupportResult)
                    {
                        context.ReportDiagnostic(limitedSupportResult.Item3);
                        regexMethod = limitedSupportResult.Item1;
                    }
                    else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
                    {
                        // Merge this regex's helpers into the shared set; the first definition seen for a given key wins.
                        foreach (KeyValuePair<string, string[]> helper in regexImpl.Item3)
                        {
                            if (!requiredHelpers.ContainsKey(helper.Key))
                            {
                                requiredHelpers.Add(helper.Key, helper.Value);
                            }
                        }

                        regexMethod = regexImpl.Item1;
                    }

                    if (regexMethod is not null)
                    {
                        // A regex whose pattern/options/timeout were already seen reuses the earlier
                        // generated name; otherwise it gets a fresh name and becomes the implementation for that key.
                        var key = (regexMethod.Pattern, regexMethod.Options, regexMethod.MatchTimeout);
                        if (emittedExpressions.TryGetValue(key, out RegexMethod? implementation))
                        {
                            regexMethod.IsDuplicate = true;
                            regexMethod.GeneratedName = implementation.GeneratedName;
                        }
                        else
                        {
                            regexMethod.IsDuplicate = false;
                            regexMethod.GeneratedName = $"{regexMethod.MethodName}_{id++}";
                            emittedExpressions.Add(key, regexMethod);
                        }

                        EmitRegexPartialMethod(regexMethod, writer);
                        writer.WriteLine();
                    }
                }

                // At this point we've emitted all the partial method definitions, but we still need to emit the actual regex-derived implementations.
                // These are all emitted inside of our generated namespace.

                writer.WriteLine($"namespace {GeneratedNamespace}");
                writer.WriteLine($"{{");

                // We emit usings here now that we're inside of a namespace block and are no longer emitting code into
                // a user's partial type.  We can now rely on binding rules mapping to these usings and don't need to
                // use global-qualified names for the rest of the implementation.
                writer.WriteLine($"    using System;");
                writer.WriteLine($"    using System.CodeDom.Compiler;");
                writer.WriteLine($"    using System.Collections;");
                writer.WriteLine($"    using System.ComponentModel;");
                writer.WriteLine($"    using System.Globalization;");
                writer.WriteLine($"    using System.Runtime.CompilerServices;");
                writer.WriteLine($"    using System.Text.RegularExpressions;");
                writer.WriteLine($"    using System.Threading;");
                writer.WriteLine($"");

                // Emit each Regex-derived type, skipping entries marked as duplicates above.
                writer.Indent++;
                foreach (object? result in results)
                {
                    if (result is ValueTuple<RegexMethod, string, Diagnostic> limitedSupportResult)
                    {
                        if (!limitedSupportResult.Item1.IsDuplicate)
                        {
                            EmitRegexLimitedBoilerplate(writer, limitedSupportResult.Item1, limitedSupportResult.Item2);
                            writer.WriteLine();
                        }
                    }
                    else if (result is ValueTuple<RegexMethod, string, Dictionary<string, string[]>> regexImpl)
                    {
                        if (!regexImpl.Item1.IsDuplicate)
                        {
                            EmitRegexDerivedImplementation(writer, regexImpl.Item1, regexImpl.Item2, allowUnsafe);
                            writer.WriteLine();
                        }
                    }
                }
                writer.Indent--;

                // If any of the Regex-derived types asked for helper methods, emit those now.
                if (requiredHelpers.Count != 0)
                {
                    writer.Indent++;
                    writer.WriteLine($"/// <summary>Helper methods used by generated <see cref=\"Regex\"/>-derived implementations.</summary>");
                    writer.WriteLine($"[{s_generatedCodeAttribute}]");
                    writer.WriteLine($"file static class {HelpersTypeName}");
                    writer.WriteLine($"{{");
                    writer.Indent++;
                    bool sawFirst = false;
                    foreach (KeyValuePair<string, string[]> helper in requiredHelpers)
                    {
                        // Separate consecutive helpers with a blank line.
                        if (sawFirst)
                        {
                            writer.WriteLine();
                        }
                        sawFirst = true;

                        foreach (string value in helper.Value)
                        {
                            writer.WriteLine(value);
                        }
                    }
                    writer.Indent--;
                    writer.WriteLine($"}}");
                    writer.Indent--;
                }

                // Close the generated namespace.
                writer.WriteLine($"}}");

                // Save out the source
                context.AddSource("RegexGenerator.g.cs", sw.ToString());
            });
        }

        /// <summary>Checks whether a full C# implementation can be generated for the supplied regex method.</summary>
        /// <remarks>
        /// When generation is not supported, <paramref name="reason"/> receives a human-readable explanation.
        /// That text is intended to be emitted by the source generator as a comment in the generated C# code,
        /// so it is not localized.
        /// </remarks>
        /// <param name="method">The regex method whose syntax tree options and parsed regex tree are inspected.</param>
        /// <param name="reason">Set to the explanation when the method returns <see langword="false"/>; <see langword="null"/> when it returns <see langword="true"/>.</param>
        /// <returns><see langword="true"/> if code generation is supported; otherwise, <see langword="false"/>.</returns>
        private static bool SupportsCodeGeneration(RegexMethod method, [NotNullWhen(false)] out string? reason)
        {
            // Syntax trees parsed as C# 10 or earlier are rejected up front.
            if (method.MethodSyntax.SyntaxTree.Options is CSharpParseOptions parseOptions &&
                parseOptions.LanguageVersion <= LanguageVersion.CSharp10)
            {
                reason = "the language version must be C# 11 or higher.";
                return false;
            }

            RegexNode root = method.Tree.Root;

            // A pattern that doesn't support Compilation can't be source generated either;
            // in that case SupportsCompilation has already filled in the reason.
            bool compilable = root.SupportsCompilation(out reason);
            if (!compilable)
            {
                return false;
            }

            // For case-insensitive patterns, the internal Regex case equivalence table is used when comparing characters.
            // Most uses of that table happen at Regex construction time, where characters involved in case conversions
            // are replaced by sets containing every character that could match. Backreferences are the one place where
            // case-insensitive comparisons still happen at match time. Because the table is internal and can't be called
            // from the source-generated type, such patterns don't get a source generated Regex-derived type.
            if (ContainsIgnoreCaseBackreference(root))
            {
                reason = "the expression contains case-insensitive backreferences which are not supported by the source generator";
                return false;
            }

            // Compilation is supported and there are no case-insensitive backreferences: generation is supported.
            reason = null;
            return true;

            // Walks the tree depth-first looking for a Backreference node that has IgnoreCase set.
            static bool ContainsIgnoreCaseBackreference(RegexNode current)
            {
                bool isBackreference = current.Kind is RegexNodeKind.Backreference;
                if (isBackreference && (current.Options & RegexOptions.IgnoreCase) != RegexOptions.None)
                {
                    return true;
                }

                // This recursion isn't expected to hit stack depth issues: it only runs after
                // SupportsCompilation has accepted the tree, which (per the existing note here)
                // ensures that the max depth is not greater than 40.
                for (int index = 0, count = current.ChildCount(); index < count; index++)
                {
                    RegexNode child = current.Child(index);
                    if (ContainsIgnoreCaseBackreference(child))
                    {
                        return true;
                    }
                }

                return false;
            }
        }
    }
}