Welcome to mirror list, hosted at ThFree Co, Russian Federation.

arch.cs « System.Text.RegularExpressions « System « class « mcs - github.com/mono/mono.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 274d9add1a98a347f855bfbdcfe6709de54e5828 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
//
// assembly:	System
// namespace:	System.Text.RegularExpressions
// file:	arch.cs
//
// author:	Dan Lewis (dlewis@gmx.co.uk)
// 		(c) 2002

using System;

namespace System.Text.RegularExpressions {

	// Operation codes. The operand layout of each instruction is described
	// in the comment blocks later in this file. The numeric values are
	// written out explicitly and are identical to the sequential numbering
	// the compiler would otherwise assign.
	enum OpCode : ushort {

		// constant results

		False		= 0,	// always fails
		True		= 1,	// always succeeds

		// matching

		Position	= 2,	// zero-width position assertion
		String		= 3,	// match string literal
		Reference	= 4,	// back reference

		// character matching

		Character	= 5,	// match character exactly
		Category	= 6,	// match character from category
		Range		= 7,	// match character from range
		Set		= 8,	// match character from set
		In		= 9,	// match character from group of tests

		// capturing

		Open		= 10,	// open group
		Close		= 11,	// close group
		Balance		= 12,	// balance groups

		// control flow

		IfDefined	= 13,	// conditional on capture
		Sub		= 14,	// non-backtracking subexpression
		Test		= 15,	// non-backtracking lookahead/behind
		Branch		= 16,	// alternative expression
		Jump		= 17,	// unconditional goto
		Repeat		= 18,	// new repeat context
		Until		= 19,	// repeat subexpression within context
		FastRepeat	= 20,	// repeat simple subexpression
		Anchor		= 21,	// anchoring expression

		// miscellaneous

		Info		= 22	// pattern information
	}

	// Modifier bits that can be combined with one another. Each flag is a
	// single bit, starting at bit 8 (0x100).
	[Flags]
	enum OpFlags : ushort {
		None		= 0,
		Negate		= 1 << 8,	// 0x100: succeed on mismatch
		IgnoreCase	= 1 << 9,	// 0x200: case insensitive matching
		RightToLeft	= 1 << 10,	// 0x400: right-to-left matching
		Lazy		= 1 << 11	// 0x800: minimizing repeat
	}

	// Kinds of position that a Position instruction can assert. The
	// trailing column shows the corresponding pattern syntax where there
	// is one. Values are written out explicitly and equal the implicit
	// sequential numbering.
	//
	// NOTE(review): Start and StartOfString carry the same description
	// (start of string, \A); what distinguishes them is not visible in
	// this file -- confirm against the code that consumes them.
	enum Position : ushort {
		Any		= 0,	// anywhere
		Start		= 1,	// start of string			\A
		StartOfString	= 2,	// start of string			\A
		StartOfLine	= 3,	// start of line			^
		StartOfScan	= 4,	// start of scan			\G
		End		= 5,	// end or before newline at end		\Z
		EndOfString	= 6,	// end of string			\z
		EndOfLine	= 7,	// end of line				$
		Boundary	= 8,	// word boundary			\b
		NonBoundary	= 9	// not word boundary			\B
	}
	
	// see category.cs for Category enum

	// An object that can scan a piece of text on behalf of a Regex.
	//
	// Scan receives the Regex, the input text and two integer bounds, and
	// returns a Match.
	// NOTE(review): start and end are presumably the index at which the
	// scan begins and the limit of the region to examine; how a failed
	// scan is reported is not visible here -- confirm with implementers.
	interface IMachine {
		Match Scan (Regex regex, string text, int start, int end);
	}

	// Source of IMachine objects: NewInstance takes no arguments and
	// returns an IMachine.
	// NOTE(review): the name suggests every call yields a fresh machine,
	// but that is up to the implementation -- confirm before relying on it.
	interface IMachineFactory {
		IMachine NewInstance ();
	}

	// Anchor SKIP OFFSET
	//
	// Flags:	[RightToLeft] ??
	// SKIP:	relative address of tail expression
	// OFFSET:	offset of anchor from start of pattern
	//
	// Usage:
	//
	// 	Anchor :1 OFFSET
	//		<expr>
	//		True
	// 1:	<tail>
	//
	// Notes:
	//
	// In practice, the anchoring expression is only going to be
	// Position (StartOfString, StartOfLine, StartOfScan) or String.
	// This is because the optimizer looks for position anchors at the
	// start of the expression, and if that fails it looks for the
	// longest substring. If an expression has neither a position
	// anchor nor a longest substring anchor, then the anchoring expression
	// is left empty. Since an empty expression will anchor at any
	// position in any string, the entire input string will be scanned.

	// String LEN STR...
	//
	// Flags:	[RightToLeft, IgnoreCase]
	// LEN:		length of string
	// STR:		string characters

	// Branch SKIP
	//
	// SKIP:	relative address of next branch
	//
	//	Branch :1
	//		<alt expr 1>
	//		Jump :4
	// 1:	Branch :2
	//		<alt expr 2>
	//		Jump :4
	// 2:	Branch :3
	//		<alt expr 3>
	//		Jump :4
	// 3:	False
	// 4:	<tail>

	// Repeat SKIP MIN MAX
	//
	// Flags:	[Lazy]
	// SKIP:	relative address of Until instruction
	// MIN:		minimum iterations
	// MAX:		maximum iterations (0xffff is infinity)
	//
	//	Repeat :1 MIN MAX
	//		<expr>
	//		Until
	// 1:	<tail>

	// FastRepeat SKIP MIN MAX
	//
	// Flags:	[Lazy]
	// SKIP:	relative address of tail expression
	// MIN:		minimum iterations
	// MAX:		maximum iterations (0xffff is infinity)
	//
	//	FastRepeat :1 MIN MAX
	//		<expr>
	//		True
	// 1:	<tail>
	//
	// Notes:
	//
	// The subexpression of a FastRepeat construct must not contain any
	// complex operators. These include: Open, Close, Balance, Repeat,
	// FastRepeat, Sub, Test. In addition, the subexpression must have
	// been determined to have a fixed width.
	
	// Sub SKIP
	//
	// SKIP:	relative address of tail expression
	//
	//	Sub :1
	//		<expr>
	// 1:	<tail>
	//
	// Notes:
	//
	// The Sub operator invokes an independent subexpression. This means
	// that the subexpression will match only once and so will not
	// participate in any backtracking.

	// Test TSKIP FSKIP
	//
	// TSKIP:	relative address of true expression
	// FSKIP:	relative address of false expression
	//
	// Usage:	(?(?=test)true|false)
	//
	//	Test :1 :2
	//		<test expr>
	// 1:		<true expr>
	//		Jump :3
	// 2:		<false expr>
	// 3:	<tail>
	//
	// Usage:	(?(?=test)true)
	//
	//	Test :1 :2
	//		<test expr>
	// 1:		<true expr>
	// 2:	<tail>
	//
	// Usage:	(?=test)
	//
	//	Test :1 :2
	//		<test expr>
	// 1:		<true expr>
	//		Jump :3
	// 2:		False
	// 3:		<tail>
	//
	// Notes:
	//
	// For negative lookaheads, just swap the values of TSKIP and
	// FSKIP. For lookbehinds, the test expression must be compiled
	// in reverse. The test expression is always executed as an
	// independent subexpression, so its behaviour is non-backtracking
	// (like a Sub clause.)

	// IfDefined SKIP GID
	//
	// SKIP:	relative address of else expression
	// GID:		number of group to check
	//
	// Usage:	(?(gid)true)
	//
	//	IfDefined :1 GID
	//		<true expr>
	// 1:	<tail>
	//
	// Usage:	(?(gid)true|false)
	//
	//	IfDefined :1 GID
	//		<true expr>
	//		Jump :2
	// 1:		<false expr>
	// 2:	<tail>

	// Jump SKIP
	//
	// SKIP:	relative address of target expression
	//
	//	Jump :1
	//	...
	// :1	<target expr>

	// Character CHAR
	//
	// Flags:	[Negate, IgnoreCase, RightToLeft]
	// CHAR:	exact character to match

	// Category CAT
	//
	// Flags:	[Negate, RightToLeft]
	// CAT:		category to match (see Category enum)

	// Range LO HI
	//
	// Flags:	[Negate, IgnoreCase, RightToLeft]
	// LO:		lowest character in range
	// HI:		highest character in range

	// Set LO LEN SET...
	//
	// Flags:	[Negate, IgnoreCase, RightToLeft]
	// LO:		lowest character in set
	// LEN:		number of words in set
	// SET:		bit array representing characters in set
	//
	// Notes:
	//
	// Each word in the set represents 16 characters, so the first word
	// defines membership for characters LO to LO + 15, the second for
	// LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
	// up to the compiler to provide a compact representation for sparse
	// Unicode sets. The simple way is to use Set 0 4096. Other methods
	// involve partitioning the set and placing the components into an
	// In block.

	// In SKIP
	//
	// SKIP:	relative address of tail expression
	//
	// Usage:	[expr]
	//
	//	In :1
	//		<expr>
	//		True
	// :1	<tail>
	//
	// Usage:	[^expr]
	//
	//	In :1
	//		<expr>
	//		False
	// :1	<tail>
	//
	// Notes:
	//
	// The In instruction consumes a single character, using the flags
	// of the first instruction in the subexpression to determine its
	// IgnoreCase and RightToLeft properties. The subexpression is then
	// applied to the single character as a disjunction. If any instruction
	// in the subexpression succeeds, the entire In construct succeeds
	// and matching continues with the tail.

	// Position POS
	//
	// POS:		position to match (see Position enum)

	// Open GID
	//
	// GID:		number of group to open

	// Close GID
	//
	// GID:		number of group to close
	
	// Balance GID BAL
	//
	// GID:		number of capturing group (0 if none)
	// BAL:		number of group to undefine

	// Info GROUPS MIN MAX
	//
	// GROUPS:	number of capturing groups
	// MIN:		minimum width of pattern
	// MAX:		maximum width of pattern (0xffff means undefined)

	// False

	// True

	// Reference GID
	//
	// Flags:	[IgnoreCase, RightToLeft]
	// GID:		number of group to reference
}