// Provenance: far2l/src/mix/RegExp.hpp from the far2l repository
// (github.com/elfmz/far2l.git), blob f5dacb9bb0758e0d07f8a07df2943cf03e6b0542,
// retrieved from a repository mirror hosted at ThFree Co, Russian Federation.
#ifndef REGEXP_HPP_18B41BD7_69F8_461A_8A81_069B447D5554
#define REGEXP_HPP_18B41BD7_69F8_461A_8A81_069B447D5554
#pragma once

/*
RegExp.hpp

Regular expressions
Syntax and semantics are very close to perl
*/
/*
Copyright © 2000 Konstantin Stupnik
Copyright © 2008 Far Group
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
3. The name of the authors may not be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <farplug-wide.h>
#include <string.h>
#include <wchar.h>
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "FARString.hpp"

//----------------------------------------------------------------------------

//#define RE_DEBUG

/*! Non-owning view over a run of wchar_t: a data pointer plus a length.

    The view neither copies nor frees the characters it points to, so the
    underlying buffer must stay alive for as long as the view is in use.
    The viewed range is not required to be NUL-terminated.
*/
struct ReStringView // TODO: replace with std::wstring_view once the project adopts C++17
{
	using value_type = wchar_t;

	ReStringView(const ReStringView&) = default;
	ReStringView(ReStringView&&) = default;

	ReStringView& operator=(const ReStringView&) = default;

	//! View over the whole FARString: CPtr() .. CPtr() + GetLength().
	inline ReStringView(const FARString &str)
		: _pw(str.CPtr()), _sz(str.GetLength())
	{
	}

	//! Empty view: null data pointer, zero length.
	inline ReStringView()
		: _pw(nullptr), _sz(0)
	{
	}

	/*! View over sz characters starting at pw.

	    \param pw - first character of the range.
	    \param sz - number of characters; when omitted (npos) the length is
	    taken with wcslen(pw). A null pw with an omitted length yields an
	    empty view instead of being handed to wcslen.
	*/
	inline ReStringView(const wchar_t *pw, size_t sz = std::string::npos)
		: _pw(pw), _sz((sz != std::string::npos) ? sz : (pw ? wcslen(pw) : 0))
	{
	}

	/*! Sub-view starting at pos and spanning at most sz characters.

	    Both arguments are clamped to the view: pos past the end yields an
	    empty view positioned at the end, and sz is cut to what remains
	    after pos.
	*/
	inline ReStringView substr(size_t pos, size_t sz = std::string::npos) const
	{
		// Clamp first so that _sz - start can never wrap around.
		const size_t start = std::min(pos, _sz);
		return ReStringView(_pw + start, std::min(_sz - start, sz));
	}

	//! Two views are equal when they have the same length and the same characters.
	inline bool operator ==(const ReStringView &other) const
	{
		// Empty views may carry null pointers; do not pass those to wmemcmp.
		return _sz == other._sz && (_sz == 0 || wmemcmp(_pw, other._pw, _sz) == 0);
	}

	inline bool operator !=(const ReStringView &other) const
	{
		return !(*this == other);
	}

	//! First character, or 0 when the view is empty.
	inline wchar_t front() const { return _sz ? _pw[0] : 0; }
	//! Last character, or 0 when the view is empty.
	inline wchar_t back() const { return _sz ? _pw[_sz - 1] : 0; }
	//! Pointer to the first character (null for a default-constructed view).
	inline const wchar_t *data() const { return _pw; }
	//! Number of characters in the view.
	inline size_t size() const { return _sz; }
	inline bool empty() const { return _sz == 0; }

	//! Unchecked element access; i must be less than size().
	inline const wchar_t operator[](size_t i) const { return _pw[i]; }

	//! Index of the last occurrence of c, or std::string::npos when c is absent.
	size_t rfind(wchar_t c) const
	{
		for (size_t i = _sz; i;) {
			--i;
			if (_pw[i] == c) {
				return i;
			}
		}

		return std::string::npos;
	}

	inline const wchar_t *cbegin() const { return _pw; }

	inline const wchar_t *cend() const { return _pw + _sz; }

private:
	const wchar_t *_pw;
	size_t _sz;
};


/*! Compile-time and run-time error codes, as returned by LastError.

    Values are written out explicitly; they equal the values the
    enumerators would otherwise receive implicitly.
*/
enum REError
{
	errNone                             = 0,  //!< No errors
	errNotCompiled                      = 1,  //!< No attempt has been made to compile the RegExp
	errSyntax                           = 2,  //!< The expression contains a syntax error
	errBrackets                         = 3,  //!< Unbalanced brackets
	errMaxDepth                         = 4,  //!< Max recursive brackets level reached. Controlled in compile time
	errOptions                          = 5,  //!< Invalid options combination
	errInvalidBackRef                   = 6,  //!< Reference to nonexistent bracket
	errInvalidEscape                    = 7,  //!< Invalid escape char
	errInvalidRange                     = 8,  //!< Invalid range value
	errInvalidQuantifiersCombination    = 9,  //!< Quantifier applied to invalid object, e.g. a lookahead assertion
	errNotEnoughMatches                 = 10, //!< The match array is not large enough
	errNoStorageForNB                   = 11, //!< RegExp has named brackets but no storage class was provided
	errReferenceToUndefinedNamedBracket = 12, //!< Reference to undefined named bracket
	errVariableLengthLookBehind         = 13, //!< Only fixed length look behind assertions are supported

	errCancelled                        = 14  //!< NOTE(review): presumably "operation was cancelled" - confirm in the implementation
};

//! Compile option flags; combine them with bitwise OR.
enum
{
	OP_IGNORECASE    = 1 << 0, //!< Match in a case insensitive manner
	OP_SINGLELINE    = 1 << 1, //!< Single line mode: the dot meta-character also matches the newline symbol
	OP_MULTILINE     = 1 << 2, //!< MultiLine mode: ^ and $ can match line start and line end
	OP_XTENDEDSYNTAX = 1 << 3, //!< Extended syntax: space symbols are ignored unless escaped
	OP_PERLSTYLE     = 1 << 4, //!< Perl style RegExp provided, i.e. /expression/imsx
	OP_OPTIMIZE      = 1 << 5, //!< Optimize after compile
	OP_STRICT        = 1 << 6, //!< Strict escapes: an unrecognized escape produces errInvalidEscape
	OP_CPPMODE       = 1 << 7, //!< Replace backslash with slash; used when the RegExp source is embedded in C++ sources
};

//! Hash table with match info.
//! Passed to the Match/Search family through the `hmatch` parameter as the
//! storage for named brackets; maps a std::wstring key to a RegExpMatch.
//! NOTE(review): the key is presumably the bracket name - confirm in the implementation.
struct MatchHash
{
	std::unordered_map<std::wstring, RegExpMatch> Matches;
};

/*! Regular expressions support class.

An expression must be compiled with Compile first; after that a string can
be matched against it (Match/MatchEx) or searched for a matching fragment
(Search/SearchEx).

The class declares a move constructor and deletes move assignment.
*/
class RegExp
{
public:
	// Implementation types: only declared here, not defined in this header.
	struct REOpCode;
	class UniSet;
	struct StateStackItem;

private:
		// code
		// Sequence of op-codes (see Compile, which generates the internal op-codes).
		std::vector<REOpCode> code;
		// NOTE(review): presumably chosen according to OP_CPPMODE - confirm in the implementation.
		char slashChar;
		char backslashChar;

		// `first` is a reference member declared after the owning pointer;
		// members are initialized in declaration order, so keep this order.
		// NOTE(review): presumably `first` is bound to *firstptr by the constructors - confirm.
		std::unique_ptr<UniSet> firstptr;
		UniSet& first;

		int havefirst{};
		int havelookahead{};

		int minlength{};

		// error info
		// errorcode/errorpos are mutable because SetError is a const member
		// function and writes both.
		mutable int errorcode;
		mutable int errorpos{};
		// Added to errorpos by ErrorPosition.
		int srcstart{};

		// options
		int bracketscount{};
		int maxbackref{};
		int havenamedbrackets{};

		bool ignorecase{};

#ifdef RE_DEBUG
		std::wstring resrc;
#endif

		int CalcLength(ReStringView src);
		bool InnerCompile(const wchar_t* start, const wchar_t* src, int srclength, int options);

		bool InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t* strend, RegExpMatch* match, int& matchcount, MatchHash* hmatch, std::vector<StateStackItem>& stack) const;

		void TrimTail(const wchar_t* start, const wchar_t*& strend) const;

		// Stores the error code and position, then returns false.
		// BUGBUG not thread safe!
		// TODO: split to compile errors (stateful) and match errors (stateless)
		bool SetError(int _code, int pos) const { errorcode = _code; errorpos = pos; return false; }

		int StrCmp(const wchar_t*& str,const wchar_t* start,const wchar_t* end) const;

	public:
		//! Default constructor.
		RegExp();
		~RegExp();

		//! Move construction is supported; move assignment is explicitly deleted.
		RegExp(RegExp&&) noexcept;
		RegExp& operator=(RegExp&&) = delete;

		/*! Compile regular expression
		    Generate internal op-codes of expression.

		    \param src - source of expression
		    \param options - compile options (OP_* flags)

		    If compilation fails, the error code can be obtained with the LastError function,
		    and the position of the error in the expression with the ErrorPosition function.
		    See error codes in REError enumeration.
		    \sa LastError
		    \sa REError
		    \sa ErrorPosition
		    \sa options
		*/
		bool Compile(ReStringView src, int options=OP_PERLSTYLE|OP_OPTIMIZE);

		/*! Try to optimize regular expression
		    Significantly speeds up Search mode in some cases.
		*/
		bool Optimize();

		/*! Try to match string with regular expression
		    \param text - string to match
		    \param match - array of RegExpMatch structures that receive brackets positions.
		    \param matchcount - in/out parameter that indicates the number of items in
		    the match array on input, and the number of brackets on output.
		    \param hmatch - storage of named brackets.
		    \sa RegExpMatch
		*/
		bool Match(ReStringView text, RegExpMatch* match, int& matchcount, MatchHash* hmatch = nullptr) const;
		/*! Advanced version of match. Can be used for multiple matches
		    on one string (to imitate the /g modifier of perl regexp).
		*/
		bool MatchEx(ReStringView text, size_t From, RegExpMatch* match, int& matchcount, MatchHash* hmatch = nullptr) const;
		/*! Try to find substring that will match regexp.
		    Parameters and return value are the same as for Match.
		    It is highly recommended to call Optimize before Search.
		*/
		bool Search(ReStringView text, RegExpMatch* match, int& matchcount, MatchHash* hmatch = nullptr) const;
		/*! Advanced version of search. Can be used for multiple searches
		    on one string (to imitate the /g modifier of perl regexp).
		*/
		bool SearchEx(ReStringView text, size_t From, RegExpMatch* match, int& matchcount, MatchHash* hmatch = nullptr) const;

		/*! Overload of Search that takes only the text; no match array or
		    named-bracket storage is passed by the caller.
		    NOTE(review): presumably reports only whether a match exists - confirm in the implementation.
		*/
		bool Search(ReStringView Str) const;

		/*! Get last error
		    \return code of the last error
		    Check REError for explanation
		    \sa REError
		    \sa ErrorPosition
		*/
		int LastError() const {return errorcode;}
		/*! Get last error position.
		    \return position of the last error in the regexp source
		    (computed as srcstart + errorpos).
		    \sa LastError
		*/
		int ErrorPosition() const { return srcstart + errorpos; }
		/*! Get number of brackets in expression
		    \return number of brackets, excluding brackets of type (:expr)
		    and named brackets.
		*/
		int GetBracketsCount() const {return bracketscount;}
};

#endif // REGEXP_HPP_18B41BD7_69F8_461A_8A81_069B447D5554