diff options
Diffstat (limited to 'winsup/cygwin/regex')
-rw-r--r-- | winsup/cygwin/regex/COPYRIGHT | 52 | ||||
-rw-r--r-- | winsup/cygwin/regex/cname.h | 138 | ||||
-rw-r--r-- | winsup/cygwin/regex/engine.c | 1197 | ||||
-rw-r--r-- | winsup/cygwin/regex/regcomp.c | 1885 | ||||
-rw-r--r-- | winsup/cygwin/regex/regerror.c | 190 | ||||
-rw-r--r-- | winsup/cygwin/regex/regex.3 | 727 | ||||
-rw-r--r-- | winsup/cygwin/regex/regex.7 | 480 | ||||
-rw-r--r-- | winsup/cygwin/regex/regex2.h | 196 | ||||
-rw-r--r-- | winsup/cygwin/regex/regexec.c | 256 | ||||
-rw-r--r-- | winsup/cygwin/regex/regfree.c | 89 | ||||
-rw-r--r-- | winsup/cygwin/regex/utils.h | 54 |
11 files changed, 0 insertions, 5264 deletions
diff --git a/winsup/cygwin/regex/COPYRIGHT b/winsup/cygwin/regex/COPYRIGHT deleted file mode 100644 index dc823b124..000000000 --- a/winsup/cygwin/regex/COPYRIGHT +++ /dev/null @@ -1,52 +0,0 @@ -Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved. -This software is not subject to any license of the American Telephone -and Telegraph Company or of the Regents of the University of California. - -Permission is granted to anyone to use this software for any purpose on -any computer system, and to alter it and redistribute it, subject -to the following restrictions: - -1. The author is not responsible for the consequences of use of this - software, no matter how awful, even if they arise from flaws in it. - -2. The origin of this software must not be misrepresented, either by - explicit claim or by omission. Since few users ever read sources, - credits must appear in the documentation. - -3. Altered versions must be plainly marked as such, and must not be - misrepresented as being the original software. Since few users - ever read sources, credits must appear in the documentation. - -4. This notice may not be removed or altered. - -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -/*- - * Copyright (c) 1994 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94 - */ diff --git a/winsup/cygwin/regex/cname.h b/winsup/cygwin/regex/cname.h deleted file mode 100644 index 4ad1ea93b..000000000 --- a/winsup/cygwin/regex/cname.h +++ /dev/null @@ -1,138 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)cname.h 8.3 (Berkeley) 3/20/94 - * $FreeBSD: src/lib/libc/regex/cname.h,v 1.4 2007/01/09 00:28:04 imp Exp $ - */ - -/* character-name table */ -static struct cname { - const char *name; - char code; -} cnames[] = { - {"NUL", '\0'}, - {"SOH", '\001'}, - {"STX", '\002'}, - {"ETX", '\003'}, - {"EOT", '\004'}, - {"ENQ", '\005'}, - {"ACK", '\006'}, - {"BEL", '\007'}, - {"alert", '\007'}, - {"BS", '\010'}, - {"backspace", '\b'}, - {"HT", '\011'}, - {"tab", '\t'}, - {"LF", '\012'}, - {"newline", '\n'}, - {"VT", '\013'}, - {"vertical-tab", '\v'}, - {"FF", '\014'}, - {"form-feed", '\f'}, - {"CR", '\015'}, - {"carriage-return", '\r'}, - {"SO", '\016'}, - {"SI", '\017'}, - {"DLE", '\020'}, - {"DC1", '\021'}, - {"DC2", '\022'}, - {"DC3", '\023'}, - {"DC4", '\024'}, - {"NAK", '\025'}, - {"SYN", '\026'}, - {"ETB", '\027'}, - {"CAN", '\030'}, - {"EM", '\031'}, - {"SUB", '\032'}, - {"ESC", '\033'}, - {"IS4", '\034'}, - {"FS", '\034'}, - {"IS3", '\035'}, - {"GS", '\035'}, - {"IS2", '\036'}, - {"RS", '\036'}, - {"IS1", '\037'}, - {"US", '\037'}, - {"space", ' '}, - {"exclamation-mark", '!'}, - {"quotation-mark", '"'}, - {"number-sign", '#'}, - {"dollar-sign", '$'}, - {"percent-sign", '%'}, - {"ampersand", '&'}, - {"apostrophe", '\''}, - {"left-parenthesis", '('}, - {"right-parenthesis", ')'}, - {"asterisk", '*'}, - {"plus-sign", '+'}, - {"comma", ','}, - {"hyphen", '-'}, - {"hyphen-minus", '-'}, - {"period", '.'}, - {"full-stop", '.'}, - {"slash", '/'}, - {"solidus", '/'}, - {"zero", '0'}, - {"one", '1'}, - {"two", '2'}, - {"three", '3'}, - {"four", '4'}, - {"five", '5'}, - {"six", '6'}, - {"seven", '7'}, - {"eight", '8'}, - {"nine", '9'}, - {"colon", ':'}, - {"semicolon", ';'}, - {"less-than-sign", '<'}, - {"equals-sign", '='}, - {"greater-than-sign", '>'}, - {"question-mark", '?'}, - {"commercial-at", '@'}, - {"left-square-bracket", '['}, - {"backslash", '\\'}, - {"reverse-solidus", '\\'}, - {"right-square-bracket",']'}, - {"circumflex", '^'}, - {"circumflex-accent", '^'}, - {"underscore", '_'}, - {"low-line", '_'}, - {"grave-accent", '`'}, - {"left-brace", '{'}, - {"left-curly-bracket", '{'}, - {"vertical-line", '|'}, - {"right-brace", '}'}, - {"right-curly-bracket", '}'}, - {"tilde", '~'}, - {"DEL", '\177'}, - {NULL, 0} -}; diff --git a/winsup/cygwin/regex/engine.c b/winsup/cygwin/regex/engine.c deleted file mode 100644 index 4afaf8d9a..000000000 --- a/winsup/cygwin/regex/engine.c +++ /dev/null @@ -1,1197 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)engine.c 8.5 (Berkeley) 3/20/94 - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/lib/libc/regex/engine.c,v 1.23 2009/09/16 06:32:23 dds Exp $"); - -/* - * The matching engine and friends. This file is #included by regexec.c - * after suitable #defines of a variety of macros used herein, so that - * different state representations can be used without duplicating masses - * of code. - */ - -#ifdef SNAMES -#define matcher smatcher -#define fast sfast -#define slow sslow -#define dissect sdissect -#define backref sbackref -#define step sstep -#define print sprint -#define at sat -#define match smat -#endif -#ifdef LNAMES -#define matcher lmatcher -#define fast lfast -#define slow lslow -#define dissect ldissect -#define backref lbackref -#define step lstep -#define print lprint -#define at lat -#define match lmat -#endif -#ifdef MNAMES -#define matcher mmatcher -#define fast mfast -#define slow mslow -#define dissect mdissect -#define backref mbackref -#define step mstep -#define print mprint -#define at mat -#define match mmat -#endif - -/* another structure passed up and down to avoid zillions of parameters */ -struct match { - struct re_guts *g; - int eflags; - regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ - const char *offp; /* offsets work from here */ - const char *beginp; /* start of string -- virtual NUL precedes */ - const char *endp; /* end of string -- virtual NUL here */ - const char *coldp; /* can be no match starting before here */ - const char **lastpos; /* [nplus+1] */ - STATEVARS; - states st; /* current states */ - states fresh; /* states for a fresh start */ - states tmp; /* temporary */ - states empty; /* empty set of states */ - mbstate_t mbs; /* multibyte conversion state */ -}; - -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === engine.c === */ -static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); -static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); -static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); -static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); -static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); -static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); -#define MAX_RECURSION 100 -#define BOL (OUT-1) -#define EOL (BOL-1) -#define BOLEOL (BOL-2) -#define NOTHING (BOL-3) -#define BOW (BOL-4) -#define EOW (BOL-5) -#define BADCHAR (BOL-6) -/* When using wint_t, which is defined as unsigned int on BSD, - as well as on Cygwin or Linux, the NONCHAR test is broken without - the below cast. I'm wondering how this is supposed to work at all... */ -#define NONCHAR(c) ((int)(c) <= OUT) -#ifdef REDEBUG -static void print(struct match *m, const char *caption, states st, int ch, FILE *d); -#endif -#ifdef REDEBUG -static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); -#endif -#ifdef REDEBUG -static const char *pchar(int ch); -#endif - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ - -#ifdef REDEBUG -#define SP(t, s, c) print(m, t, s, c, stdout) -#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) -#define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } -#else -#define SP(t, s, c) /* nothing */ -#define AT(t, p1, p2, s1, s2) /* nothing */ -#define NOTE(s) /* nothing */ -#endif - -/* - - matcher - the actual matching engine - == static int matcher(struct re_guts *g, const char *string, \ - == size_t nmatch, regmatch_t pmatch[], int eflags); - */ -static int /* 0 success, REG_NOMATCH failure */ -matcher(struct re_guts *g, - const char *string, - size_t nmatch, - regmatch_t pmatch[], - int eflags) -{ - const char *endp; - int i; - struct match mv; - struct match *m = &mv; - const char *dp; - const sopno gf = g->firststate+1; /* +1 for OEND */ - const sopno gl = g->laststate; - const char *start; - const char *stop; - /* Boyer-Moore algorithms variables */ - const char *pp; - int cj, mj; - const char *mustfirst; - const char *mustlast; - int *matchjump; - int *charjump; - - /* simplify the situation where possible */ - if (g->cflags®_NOSUB) - nmatch = 0; - if (eflags®_STARTEND) { - start = string + pmatch[0].rm_so; - stop = string + pmatch[0].rm_eo; - } else { - start = string; - stop = start + strlen(start); - } - if (stop < start) - return(REG_INVARG); - - /* prescreening; this does wonders for this rather slow code */ - if (g->must != NULL) { - if (g->charjump != NULL && g->matchjump != NULL) { - mustfirst = g->must; - mustlast = g->must + g->mlen - 1; - charjump = g->charjump; - matchjump = g->matchjump; - pp = mustlast; - for (dp = start+g->mlen-1; dp < stop;) { - /* Fast skip non-matches */ - while (dp < stop && charjump[(int)*dp]) - dp += charjump[(int)*dp]; - - if (dp >= stop) - break; - - /* Greedy matcher */ - /* We depend on not being used for - * for strings of length 1 - */ - while (*--dp == *--pp && pp != mustfirst); - - if (*dp == *pp) - break; - - /* Jump to next possible match */ - mj = matchjump[pp - mustfirst]; - cj = charjump[(int)*dp]; - dp += (cj < mj ? mj : cj); - pp = mustlast; - } - if (pp != mustfirst) - return(REG_NOMATCH); - } else { - for (dp = start; dp < stop; dp++) - if (*dp == g->must[0] && - stop - dp >= g->mlen && - memcmp(dp, g->must, (size_t)g->mlen) == 0) - break; - if (dp == stop) /* we didn't find g->must */ - return(REG_NOMATCH); - } - } - - /* match struct setup */ - m->g = g; - m->eflags = eflags; - m->pmatch = NULL; - m->lastpos = NULL; - m->offp = string; - m->beginp = start; - m->endp = stop; - STATESETUP(m, 4); - SETUP(m->st); - SETUP(m->fresh); - SETUP(m->tmp); - SETUP(m->empty); - CLEAR(m->empty); - ZAPSTATE(&m->mbs); - - /* Adjust start according to moffset, to speed things up */ -#ifndef MNAMES - /* The code evaluating moffset doesn't seem to work right - in the multibyte case. */ - if (g->moffset > -1) - start = ((dp - g->moffset) < start) ? start : dp - g->moffset; -#endif - SP("mloop", m->st, *start); - - /* this loop does only one repetition except for backrefs */ - for (;;) { - endp = fast(m, start, stop, gf, gl); - if (endp == NULL) { /* a miss */ - if (m->pmatch != NULL) - free((char *)m->pmatch); - if (m->lastpos != NULL) - free((char *)m->lastpos); - STATETEARDOWN(m); - return(REG_NOMATCH); - } - if (nmatch == 0 && !g->backrefs) - break; /* no further info needed */ - - /* where? */ - assert(m->coldp != NULL); - for (;;) { - NOTE("finding start"); - endp = slow(m, m->coldp, stop, gf, gl); - if (endp != NULL) - break; - assert(m->coldp < m->endp); - m->coldp += XMBRTOWC(NULL, m->coldp, - m->endp - m->coldp, &m->mbs, 0); - } - if (nmatch == 1 && !g->backrefs) - break; /* no further info needed */ - - /* oh my, he wants the subexpressions... */ - if (m->pmatch == NULL) - m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * - sizeof(regmatch_t)); - if (m->pmatch == NULL) { - STATETEARDOWN(m); - return(REG_ESPACE); - } - for (i = 1; i <= m->g->nsub; i++) - m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; - if (!g->backrefs && !(m->eflags®_BACKR)) { - NOTE("dissecting"); - dp = dissect(m, m->coldp, endp, gf, gl); - } else { - if (g->nplus > 0 && m->lastpos == NULL) - m->lastpos = malloc((g->nplus+1) * - sizeof(const char *)); - if (g->nplus > 0 && m->lastpos == NULL) { - free(m->pmatch); - STATETEARDOWN(m); - return(REG_ESPACE); - } - NOTE("backref dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); - } - if (dp != NULL) - break; - - /* uh-oh... we couldn't find a subexpression-level match */ - assert(g->backrefs); /* must be back references doing it */ - assert(g->nplus == 0 || m->lastpos != NULL); - for (;;) { - if (dp != NULL || endp <= m->coldp) - break; /* defeat */ - NOTE("backoff"); - endp = slow(m, m->coldp, endp-1, gf, gl); - if (endp == NULL) - break; /* defeat */ - /* try it on a shorter possibility */ -#ifndef NDEBUG - for (i = 1; i <= m->g->nsub; i++) { - assert(m->pmatch[i].rm_so == -1); - assert(m->pmatch[i].rm_eo == -1); - } -#endif - NOTE("backoff dissect"); - dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); - } - assert(dp == NULL || dp == endp); - if (dp != NULL) /* found a shorter one */ - break; - - /* despite initial appearances, there is no match here */ - NOTE("false alarm"); - /* recycle starting later */ - start = m->coldp + XMBRTOWC(NULL, m->coldp, - stop - m->coldp, &m->mbs, 0); - assert(start <= stop); - } - - /* fill in the details if requested */ - if (nmatch > 0) { - pmatch[0].rm_so = m->coldp - m->offp; - pmatch[0].rm_eo = endp - m->offp; - } - if (nmatch > 1) { - assert(m->pmatch != NULL); - for (i = 1; i < nmatch; i++) - if (i <= m->g->nsub) - pmatch[i] = m->pmatch[i]; - else { - pmatch[i].rm_so = -1; - pmatch[i].rm_eo = -1; - } - } - - if (m->pmatch != NULL) - free((char *)m->pmatch); - if (m->lastpos != NULL) - free((char *)m->lastpos); - STATETEARDOWN(m); - return(0); -} - -/* - - dissect - figure out what matched what, no back references - == static const char *dissect(struct match *m, const char *start, \ - == const char *stop, sopno startst, sopno stopst); - */ -static const char * /* == stop (success) always */ -dissect(struct match *m, - const char *start, - const char *stop, - sopno startst, - sopno stopst) -{ - int i; - sopno ss; /* start sop of current subRE */ - sopno es; /* end sop of current subRE */ - const char *sp; /* start of string matched by it */ - const char *stp; /* string matched by it cannot pass here */ - const char *rest; /* start of rest of string */ - const char *tail; /* string unmatched by rest of RE */ - sopno ssub; /* start sop of subsubRE */ - sopno esub; /* end sop of subsubRE */ - const char *ssp; /* start of string matched by subsubRE */ - const char *sep; /* end of string matched by subsubRE */ - const char *oldssp; /* previous ssp */ - const char *dp; - - AT("diss", start, stop, startst, stopst); - sp = start; - for (ss = startst; ss < stopst; ss = es) { - /* identify end of subRE */ - es = ss; - switch (OP(m->g->strip[es])) { - case OPLUS_: - case OQUEST_: - es += OPND(m->g->strip[es]); - break; - case OCH_: - while (OP(m->g->strip[es]) != O_CH) - es += OPND(m->g->strip[es]); - break; - } - es++; - - /* figure out what it matched */ - switch (OP(m->g->strip[ss])) { - case OEND: - assert(nope); - break; - case OCHAR: - sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); - break; - case OBOL: - case OEOL: - case OBOW: - case OEOW: - break; - case OANY: - case OANYOF: - sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); - break; - case OBACK_: - case O_BACK: - assert(nope); - break; - /* cases where length of match is hard to find */ - case OQUEST_: - stp = stop; - for (;;) { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - assert(rest != NULL); /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! */ - /* no -- try a shorter match for this one */ - stp = rest - 1; - assert(stp >= sp); /* it did work */ - } - ssub = ss + 1; - esub = es - 1; - /* did innards match? */ - if (slow(m, sp, rest, ssub, esub) != NULL) { - dp = dissect(m, sp, rest, ssub, esub); - assert(dp == rest); - } else /* no */ - assert(sp == rest); - sp = rest; - break; - case OPLUS_: - stp = stop; - for (;;) { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - assert(rest != NULL); /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! */ - /* no -- try a shorter match for this one */ - stp = rest - 1; - assert(stp >= sp); /* it did work */ - } - ssub = ss + 1; - esub = es - 1; - ssp = sp; - oldssp = ssp; - for (;;) { /* find last match of innards */ - sep = slow(m, ssp, rest, ssub, esub); - if (sep == NULL || sep == ssp) - break; /* failed or matched null */ - oldssp = ssp; /* on to next try */ - ssp = sep; - } - if (sep == NULL) { - /* last successful match */ - sep = ssp; - ssp = oldssp; - } - assert(sep == rest); /* must exhaust substring */ - assert(slow(m, ssp, sep, ssub, esub) == rest); - dp = dissect(m, ssp, sep, ssub, esub); - assert(dp == sep); - sp = rest; - break; - case OCH_: - stp = stop; - for (;;) { - /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); - assert(rest != NULL); /* it did match */ - /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); - if (tail == stop) - break; /* yes! */ - /* no -- try a shorter match for this one */ - stp = rest - 1; - assert(stp >= sp); /* it did work */ - } - ssub = ss + 1; - esub = ss + OPND(m->g->strip[ss]) - 1; - assert(OP(m->g->strip[esub]) == OOR1); - for (;;) { /* find first matching branch */ - if (slow(m, sp, rest, ssub, esub) == rest) - break; /* it matched all of it */ - /* that one missed, try next one */ - assert(OP(m->g->strip[esub]) == OOR1); - esub++; - assert(OP(m->g->strip[esub]) == OOR2); - ssub = esub + 1; - esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) - esub--; - else - assert(OP(m->g->strip[esub]) == O_CH); - } - dp = dissect(m, sp, rest, ssub, esub); - assert(dp == rest); - sp = rest; - break; - case O_PLUS: - case O_QUEST: - case OOR1: - case OOR2: - case O_CH: - assert(nope); - break; - case OLPAREN: - i = OPND(m->g->strip[ss]); - assert(0 < i && i <= m->g->nsub); - m->pmatch[i].rm_so = sp - m->offp; - break; - case ORPAREN: - i = OPND(m->g->strip[ss]); - assert(0 < i && i <= m->g->nsub); - m->pmatch[i].rm_eo = sp - m->offp; - break; - default: /* uh oh */ - assert(nope); - break; - } - } - - assert(sp == stop); - return(sp); -} - -/* - - backref - figure out what matched what, figuring in back references - == static const char *backref(struct match *m, const char *start, \ - == const char *stop, sopno startst, sopno stopst, sopno lev); - */ -static const char * /* == stop (success) or NULL (failure) */ -backref(struct match *m, - const char *start, - const char *stop, - sopno startst, - sopno stopst, - sopno lev, /* PLUS nesting level */ - int rec) -{ - int i; - sopno ss; /* start sop of current subRE */ - const char *sp; /* start of string matched by it */ - sopno ssub; /* start sop of subsubRE */ - sopno esub; /* end sop of subsubRE */ - const char *ssp; /* start of string matched by subsubRE */ - const char *dp; - size_t len; - int hard; - sop s; - regoff_t offsave; - cset *cs; - wint_t wc; - - AT("back", start, stop, startst, stopst); - sp = start; - - /* get as far as we can with easy stuff */ - hard = 0; - for (ss = startst; !hard && ss < stopst; ss++) - switch (OP(s = m->g->strip[ss])) { - case OCHAR: - if (sp == stop) - return(NULL); - sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); - if (wc != OPND(s)) - return(NULL); - break; - case OANY: - if (sp == stop) - return(NULL); - sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); - if (wc == BADCHAR) - return (NULL); - break; - case OANYOF: - if (sp == stop) - return (NULL); - cs = &m->g->sets[OPND(s)]; - sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); - if (wc == BADCHAR || !CHIN(cs, wc)) - return(NULL); - break; - case OBOL: - if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || - (sp < m->endp && *(sp-1) == '\n' && - (m->g->cflags®_NEWLINE)) ) - { /* yes */ } - else - return(NULL); - break; - case OEOL: - if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || - (sp < m->endp && *sp == '\n' && - (m->g->cflags®_NEWLINE)) ) - { /* yes */ } - else - return(NULL); - break; - case OBOW: - if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || - (sp < m->endp && *(sp-1) == '\n' && - (m->g->cflags®_NEWLINE)) || - (sp > m->beginp && - !ISWORD(*(sp-1))) ) && - (sp < m->endp && ISWORD(*sp)) ) - { /* yes */ } - else - return(NULL); - break; - case OEOW: - if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || - (sp < m->endp && *sp == '\n' && - (m->g->cflags®_NEWLINE)) || - (sp < m->endp && !ISWORD(*sp)) ) && - (sp > m->beginp && ISWORD(*(sp-1))) ) - { /* yes */ } - else - return(NULL); - break; - case O_QUEST: - break; - case OOR1: /* matches null but needs to skip */ - ss++; - s = m->g->strip[ss]; - do { - assert(OP(s) == OOR2); - ss += OPND(s); - } while (OP(s = m->g->strip[ss]) != O_CH); - /* note that the ss++ gets us past the O_CH */ - break; - default: /* have to make a choice */ - hard = 1; - break; - } - if (!hard) { /* that was it! */ - if (sp != stop) - return(NULL); - return(sp); - } - ss--; /* adjust for the for's final increment */ - - /* the hard stuff */ - AT("hard", sp, stop, ss, stopst); - s = m->g->strip[ss]; - switch (OP(s)) { - case OBACK_: /* the vilest depths */ - i = OPND(s); - assert(0 < i && i <= m->g->nsub); - if (m->pmatch[i].rm_eo == -1) - return(NULL); - assert(m->pmatch[i].rm_so != -1); - len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; - if (len == 0 && rec++ > MAX_RECURSION) - return(NULL); - assert(stop - m->beginp >= len); - if (sp > stop - len) - return(NULL); /* not enough left to match */ - ssp = m->offp + m->pmatch[i].rm_so; - if (memcmp(sp, ssp, len) != 0) - return(NULL); - while (m->g->strip[ss] != SOP(O_BACK, i)) - ss++; - return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); - break; - case OQUEST_: /* to null or not */ - dp = backref(m, sp, stop, ss+1, stopst, lev, rec); - if (dp != NULL) - return(dp); /* not */ - return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); - break; - case OPLUS_: - assert(m->lastpos != NULL); - assert(lev+1 <= m->g->nplus); - m->lastpos[lev+1] = sp; - return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); - break; - case O_PLUS: - if (sp == m->lastpos[lev]) /* last pass matched null */ - return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); - /* try another pass */ - m->lastpos[lev] = sp; - dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); - if (dp == NULL) - return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); - else - return(dp); - break; - case OCH_: /* find the right one, if any */ - ssub = ss + 1; - esub = ss + OPND(s) - 1; - assert(OP(m->g->strip[esub]) == OOR1); - for (;;) { /* find first matching branch */ - dp = backref(m, sp, stop, ssub, esub, lev, rec); - if (dp != NULL) - return(dp); - /* that one missed, try next one */ - if (OP(m->g->strip[esub]) == O_CH) - return(NULL); /* there is none */ - esub++; - assert(OP(m->g->strip[esub]) == OOR2); - ssub = esub + 1; - esub += OPND(m->g->strip[esub]); - if (OP(m->g->strip[esub]) == OOR2) - esub--; - else - assert(OP(m->g->strip[esub]) == O_CH); - } - break; - case OLPAREN: /* must undo assignment if rest fails */ - i = OPND(s); - assert(0 < i && i <= m->g->nsub); - offsave = m->pmatch[i].rm_so; - m->pmatch[i].rm_so = sp - m->offp; - dp = backref(m, sp, stop, ss+1, stopst, lev, rec); - if (dp != NULL) - return(dp); - m->pmatch[i].rm_so = offsave; - return(NULL); - break; - case ORPAREN: /* must undo assignment if rest fails */ - i = OPND(s); - assert(0 < i && i <= m->g->nsub); - offsave = m->pmatch[i].rm_eo; - m->pmatch[i].rm_eo = sp - m->offp; - dp = backref(m, sp, stop, ss+1, stopst, lev, rec); - if (dp != NULL) - return(dp); - m->pmatch[i].rm_eo = offsave; - return(NULL); - break; - default: /* uh oh */ - assert(nope); - break; - } - - /* "can't happen" */ - assert(nope); - /* NOTREACHED */ - return "shut up gcc"; -} - -/* - - fast - step through the string at top speed - == static const char *fast(struct match *m, const char *start, \ - == const char *stop, sopno startst, sopno stopst); - */ -static const char * /* where tentative match ended, or NULL */ -fast( struct match *m, - const char *start, - const char *stop, - sopno startst, - sopno stopst) -{ - states st = m->st; - states fresh = m->fresh; - states tmp = m->tmp; - const char *p = start; - wint_t c; - wint_t lastc; /* previous c */ - wint_t flagch; - int i; - const char *coldp; /* last p after which no match was underway */ - size_t clen; - - CLEAR(st); - SET1(st, startst); - SP("fast", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); - ASSIGN(fresh, st); - SP("start", st, *p); - coldp = NULL; - if (start == m->beginp) - c = OUT; - else { - /* - * XXX Wrong if the previous character was multi-byte. - * Newline never is (in encodings supported by FreeBSD), - * so this only breaks the ISWORD tests below. - */ - c = (uch)*(start - 1); - } - for (;;) { - /* next character */ - lastc = c; - if (p == m->endp) { - clen = 0; - c = OUT; - } else - clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); - if (EQ(st, fresh)) - coldp = p; - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || - (lastc == OUT && !(m->eflags®_NOTBOL)) ) { - flagch = BOL; - i = m->g->nbol; - } - if ( (c == '\n' && m->g->cflags®_NEWLINE) || - (c == OUT && !(m->eflags®_NOTEOL)) ) { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); - SP("boleol", st, c); - } - - /* how about a word boundary? */ - if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c)) ) { - flagch = BOW; - } - if ( (lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c))) ) { - flagch = EOW; - } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst) || p == stop || clen > stop - p) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, fresh); - assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); - SP("aft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p += clen; - } - - assert(coldp != NULL); - m->coldp = coldp; - if (ISSET(st, stopst)) - return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); - else - return(NULL); -} - -/* - - slow - step through the string more deliberately - == static const char *slow(struct match *m, const char *start, \ - == const char *stop, sopno startst, sopno stopst); - */ -static const char * /* where it ended */ -slow( struct match *m, - const char *start, - const char *stop, - sopno startst, - sopno stopst) -{ - states st = m->st; - states empty = m->empty; - states tmp = m->tmp; - const char *p = start; - wint_t c; - wint_t lastc; /* previous c */ - wint_t flagch; - int i; - const char *matchp; /* last p at which a match ended */ - size_t clen; - - AT("slow", start, stop, startst, stopst); - CLEAR(st); - SET1(st, startst); - SP("sstart", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); - matchp = NULL; - if (start == m->beginp) - c = OUT; - else { - /* - * XXX Wrong if the previous character was multi-byte. - * Newline never is (in encodings supported by FreeBSD), - * so this only breaks the ISWORD tests below. - */ - c = (uch)*(start - 1); - } - for (;;) { - /* next character */ - lastc = c; - if (p == m->endp) { - c = OUT; - clen = 0; - } else - clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || - (lastc == OUT && !(m->eflags®_NOTBOL)) ) { - flagch = BOL; - i = m->g->nbol; - } - if ( (c == '\n' && m->g->cflags®_NEWLINE) || - (c == OUT && !(m->eflags®_NOTEOL)) ) { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboleol", st, c); - } - - /* how about a word boundary? */ - if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c)) ) { - flagch = BOW; - } - if ( (lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c))) ) { - flagch = EOW; - } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst)) - matchp = p; - if (EQ(st, empty) || p == stop || clen > stop - p) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, empty); - assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); - SP("saft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p += clen; - } - - return(matchp); -} - - -/* - - step - map set of states reachable before char to set reachable after - == static states step(struct re_guts *g, sopno start, sopno stop, \ - == states bef, int ch, states aft); - == #define BOL (OUT-1) - == #define EOL (BOL-1) - == #define BOLEOL (BOL-2) - == #define NOTHING (BOL-3) - == #define BOW (BOL-4) - == #define EOW (BOL-5) - == #define BADCHAR (BOL-6) - == #define NONCHAR(c) ((c) <= OUT) - */ -static states -step(struct re_guts *g, - sopno start, /* start state within strip */ - sopno stop, /* state after stop state within strip */ - states bef, /* states reachable before */ - wint_t ch, /* character or NONCHAR code */ - states aft) /* states already known reachable after */ -{ - cset *cs; - sop s; - sopno pc; - onestate here; /* note, macros know this name */ - sopno look; - int i; - - for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { - s = g->strip[pc]; - switch (OP(s)) { - case OEND: - assert(pc == stop-1); - break; - case OCHAR: - /* only characters can match */ - assert(!NONCHAR(ch) || ch != OPND(s)); - if (ch == OPND(s)) - FWD(aft, bef, 1); - break; - case OBOL: - if (ch == BOL || ch == BOLEOL) - FWD(aft, bef, 1); - break; - case OEOL: - if (ch == EOL || ch == BOLEOL) - FWD(aft, bef, 1); - break; - case OBOW: - if (ch == BOW) - FWD(aft, bef, 1); - break; - case OEOW: - if (ch == EOW) - FWD(aft, bef, 1); - break; - case OANY: - if (!NONCHAR(ch)) - FWD(aft, bef, 1); - break; - case OANYOF: - cs = &g->sets[OPND(s)]; - if (!NONCHAR(ch) && CHIN(cs, ch)) - FWD(aft, bef, 1); - break; - case OBACK_: /* ignored here */ - case O_BACK: - FWD(aft, aft, 1); - break; - case OPLUS_: /* forward, this is just an empty */ - FWD(aft, aft, 1); - break; - case O_PLUS: /* both forward and back */ - FWD(aft, aft, 1); - i = ISSETBACK(aft, OPND(s)); - BACK(aft, aft, OPND(s)); - if (!i && ISSETBACK(aft, OPND(s))) { - /* oho, must reconsider loop body */ - pc -= OPND(s) + 1; - INIT(here, pc); - } - break; - case OQUEST_: /* two branches, both forward */ - FWD(aft, aft, 1); - FWD(aft, aft, OPND(s)); - break; - case O_QUEST: /* just an empty */ - FWD(aft, aft, 1); - break; - case OLPAREN: /* not significant here */ - case ORPAREN: - FWD(aft, aft, 1); - break; - case OCH_: /* mark the first two branches */ - FWD(aft, aft, 1); - assert(OP(g->strip[pc+OPND(s)]) == OOR2); - FWD(aft, aft, OPND(s)); - break; - case OOR1: /* done a branch, find the O_CH */ - if (ISSTATEIN(aft, here)) { - for (look = 1; - OP(s = g->strip[pc+look]) != O_CH; - look += OPND(s)) - assert(OP(s) == OOR2); - FWD(aft, aft, look + 1); - } - break; - case OOR2: /* propagate OCH_'s marking */ - FWD(aft, aft, 1); - if (OP(g->strip[pc+OPND(s)]) != O_CH) { - assert(OP(g->strip[pc+OPND(s)]) == OOR2); - FWD(aft, aft, OPND(s)); - } - break; - case O_CH: /* just empty */ - FWD(aft, aft, 1); - break; - default: /* ooooops... */ - assert(nope); - break; - } - } - - return(aft); -} - -#ifdef REDEBUG -/* - - print - print a set of states - == #ifdef REDEBUG - == static void print(struct match *m, const char *caption, states st, \ - == int ch, FILE *d); - == #endif - */ -static void -print(struct match *m, - const char *caption, - states st, - int ch, - FILE *d) -{ - struct re_guts *g = m->g; - int i; - int first = 1; - - if (!(m->eflags®_TRACE)) - return; - - fprintf(d, "%s", caption); - if (ch != '\0') - fprintf(d, " %s", pchar(ch)); - for (i = 0; i < g->nstates; i++) - if (ISSET(st, i)) { - fprintf(d, "%s%d", (first) ? "\t" : ", ", i); - first = 0; - } - fprintf(d, "\n"); -} - -/* - - at - print current situation - == #ifdef REDEBUG - == static void at(struct match *m, const char *title, const char *start, \ - == const char *stop, sopno startst, sopno stopst); - == #endif - */ -static void -at( struct match *m, - const char *title, - const char *start, - const char *stop, - sopno startst, - sopno stopst) -{ - if (!(m->eflags®_TRACE)) - return; - - printf("%s %s-", title, pchar(*start)); - printf("%s ", pchar(*stop)); - printf("%ld-%ld\n", (long)startst, (long)stopst); -} - -#ifndef PCHARDONE -#define PCHARDONE /* never again */ -/* - - pchar - make a character printable - == #ifdef REDEBUG - == static const char *pchar(int ch); - == #endif - * - * Is this identical to regchar() over in debug.c? Well, yes. But a - * duplicate here avoids having a debugging-capable regexec.o tied to - * a matching debug.o, and this is convenient. It all disappears in - * the non-debug compilation anyway, so it doesn't matter much. - */ -static const char * /* -> representation */ -pchar(int ch) -{ - static char pbuf[10]; - - if (isprint((uch)ch) || ch == ' ') - sprintf(pbuf, "%c", ch); - else - sprintf(pbuf, "\\%o", ch); - return(pbuf); -} -#endif -#endif - -#undef matcher -#undef fast -#undef slow -#undef dissect -#undef backref -#undef step -#undef print -#undef at -#undef match diff --git a/winsup/cygwin/regex/regcomp.c b/winsup/cygwin/regex/regcomp.c deleted file mode 100644 index 5aee63948..000000000 --- a/winsup/cygwin/regex/regcomp.c +++ /dev/null @@ -1,1885 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; -#endif /* LIBC_SCCS and not lint */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.36 2007/06/11 03:05:54 delphij Exp $"); - -#ifdef __CYGWIN__ -#include "winsup.h" -#endif -#include <sys/types.h> -#include <stdio.h> -#include <string.h> -#include <ctype.h> -#include <limits.h> -#include <stdlib.h> -#include <regex.h> -#ifndef __CYGWIN__ -#include <runetype.h> -#endif -#include <wchar.h> -#include <wctype.h> - -#ifndef __CYGWIN__ -#include "collate.h" -#endif - -#include "utils.h" -#include "regex2.h" - -#include "cname.h" - -#ifdef __CYGWIN__ -/* Don't pull in windows headers just for LCID. */ -typedef unsigned long LCID; -/* These are defined in nlsfuncs.cc. */ -extern LCID collate_lcid; -extern char collate_charset[]; -#endif - -/* - * parse structure, passed up and down to avoid global variables and - * other clumsinesses - */ -struct parse { - char *next; /* next character in RE */ - char *end; /* end of string (-> NUL normally) */ - int error; /* has an error been seen? */ - sop *strip; /* malloced strip */ - sopno ssize; /* malloced strip size (allocated) */ - sopno slen; /* malloced strip length (used) */ - int ncsalloc; /* number of csets allocated */ - struct re_guts *g; -# define NPAREN 10 /* we need to remember () 1-9 for back refs */ - sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ - sopno pend[NPAREN]; /* -> ) ([0] unused) */ -}; - -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === regcomp.c === */ -#ifdef __CYGWIN__ /* Defined below `int stop'. Our gcc chokes on that. */ -static void p_ere(struct parse *p, int stop); -#else -static void p_ere(struct parse *p, wint_t stop); -#endif -static void p_ere_exp(struct parse *p); -static void p_str(struct parse *p); -#ifdef __CYGWIN__ /* Defined below `int end1/end2'. Our gcc chokes on that. */ -static void p_bre(struct parse *p, int end1, int end2); -#else -static void p_bre(struct parse *p, wint_t end1, wint_t end2); -#endif -static int p_simp_re(struct parse *p, int starordinary); -static int p_count(struct parse *p); -static void p_bracket(struct parse *p); -static void p_b_term(struct parse *p, cset *cs); -static void p_b_cclass(struct parse *p, cset *cs); -static void p_b_eclass(struct parse *p, cset *cs); -static wint_t p_b_symbol(struct parse *p); -static wint_t p_b_coll_elem(struct parse *p, wint_t endc); -static wint_t othercase(wint_t ch); -static void bothcases(struct parse *p, wint_t ch); -static void ordinary(struct parse *p, wint_t ch); -static void nonnewline(struct parse *p); -static void repeat(struct parse *p, sopno start, int from, int to); -static int seterr(struct parse *p, int e); -static cset *allocset(struct parse *p); -static void freeset(struct parse *p, cset *cs); -static void CHadd(struct parse *p, cset *cs, wint_t ch); -static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); -static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); -static wint_t singleton(cset *cs); -static sopno dupl(struct parse *p, sopno start, sopno finish); -static void doemit(struct parse *p, sop op, size_t opnd); -static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); -static void dofwd(struct parse *p, sopno pos, sop value); -static void enlarge(struct parse *p, sopno size); -static void stripsnug(struct parse *p, struct re_guts *g); -static void findmust(struct parse *p, struct re_guts *g); -static int altoffset(sop *scan, int offset); -static void computejumps(struct parse *p, struct re_guts *g); -static void computematchjumps(struct parse *p, struct re_guts *g); -static sopno pluscount(struct parse *p, struct re_guts *g); -static wint_t wgetnext(struct parse *p); -static size_t xwcrtomb (char *s, wint_t wc, mbstate_t *ps); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ - -static char nuls[10]; /* place to point scanner in event of error */ - -/* - * macros for use with parse structure - * BEWARE: these know that the parse structure is named `p' !!! - */ -#define PEEK() (*p->next) -#define PEEK2() (*(p->next+1)) -#define MORE() (p->next < p->end) -#define MORE2() (p->next+1 < p->end) -#define SEE(c) (MORE() && PEEK() == (c)) -#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) -#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) -#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) -#define NEXT() (p->next++) -#define NEXT2() (p->next += 2) -#define NEXTn(n) (p->next += (n)) -#define GETNEXT() (*p->next++) -#define WGETNEXT() wgetnext(p) -#define SETERROR(e) seterr(p, (e)) -#define REQUIRE(co, e) ((co) || SETERROR(e)) -#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) -#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) -#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) -#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) -#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) -#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) -#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) -#define HERE() (p->slen) -#define THERE() (p->slen - 1) -#define THERETHERE() (p->slen - 2) -#define DROP(n) (p->slen -= (n)) - -#ifndef NDEBUG -static int never = 0; /* for use in asserts; shuts lint up */ -#else -#define never 0 /* some <assert.h>s have bugs too */ -#endif - -/* Macro used by computejump()/computematchjump() */ -#define MIN(a,b) ((a)<(b)?(a):(b)) - -/* - - regcomp - interface for parser and compilation - = extern int regcomp(regex_t *, const char *, int); - = #define REG_BASIC 0000 - = #define REG_EXTENDED 0001 - = #define REG_ICASE 0002 - = #define REG_NOSUB 0004 - = #define REG_NEWLINE 0010 - = #define REG_NOSPEC 0020 - = #define REG_PEND 0040 - = #define REG_DUMP 0200 - */ -int /* 0 success, otherwise REG_something */ -regcomp(regex_t * __restrict preg, - const char * __restrict pattern, - int cflags) -{ - struct parse pa; - struct re_guts *g; - struct parse *p = &pa; - int i; - size_t len; -#ifdef REDEBUG -# define GOODFLAGS(f) (f) -#else -# define GOODFLAGS(f) ((f)&~REG_DUMP) -#endif - - cflags = GOODFLAGS(cflags); - if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) - return(REG_INVARG); - - if (cflags®_PEND) { - if (preg->re_endp < pattern) - return(REG_INVARG); - len = preg->re_endp - pattern; - } else - len = strlen((char *)pattern); - - /* do the mallocs early so failure handling is easy */ - g = (struct re_guts *)malloc(sizeof(struct re_guts)); - if (g == NULL) - return(REG_ESPACE); - p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ - p->strip = (sop *)malloc(p->ssize * sizeof(sop)); - p->slen = 0; - if (p->strip == NULL) { - free((char *)g); - return(REG_ESPACE); - } - - /* set things up */ - p->g = g; - p->next = (char *)pattern; /* convenience; we do not modify it */ - p->end = p->next + len; - p->error = 0; - p->ncsalloc = 0; - for (i = 0; i < NPAREN; i++) { - p->pbegin[i] = 0; - p->pend[i] = 0; - } - g->sets = NULL; - g->ncsets = 0; - g->cflags = cflags; - g->iflags = 0; - g->nbol = 0; - g->neol = 0; - g->must = NULL; - g->moffset = -1; - g->charjump = NULL; - g->matchjump = NULL; - g->mlen = 0; - g->nsub = 0; - g->backrefs = 0; - - /* do it */ - EMIT(OEND, 0); - g->firststate = THERE(); - if (cflags®_EXTENDED) - p_ere(p, OUT); - else if (cflags®_NOSPEC) - p_str(p); - else - p_bre(p, OUT, OUT); - EMIT(OEND, 0); - g->laststate = THERE(); - - /* tidy up loose ends and fill things in */ - stripsnug(p, g); - findmust(p, g); - /* only use Boyer-Moore algorithm if the pattern is bigger - * than three characters - */ - if(g->mlen > 3) { - computejumps(p, g); - computematchjumps(p, g); - if(g->matchjump == NULL && g->charjump != NULL) { - free(g->charjump); - g->charjump = NULL; - } - } - g->nplus = pluscount(p, g); - g->magic = MAGIC2; - preg->re_nsub = g->nsub; - preg->re_g = g; - preg->re_magic = MAGIC1; -#ifndef REDEBUG - /* not debugging, so can't rely on the assert() in regexec() */ - if (g->iflags&BAD) - SETERROR(REG_ASSERT); -#endif - - /* win or lose, we're done */ - if (p->error != 0) /* lose */ - regfree(preg); - return(p->error); -} - -/* - - p_ere - ERE parser top level, concatenation and alternation - == static void p_ere(struct parse *p, int stop); - */ -static void -p_ere(struct parse *p, - int stop) /* character this ERE should end at */ -{ - char c; - sopno prevback = 0; - sopno prevfwd = 0; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - == static void p_ere_exp(struct parse *p); - */ -static void -p_ere_exp(struct parse *p) -{ - char c; - wint_t wc; - sopno pos; - int count; - int count2; - sopno subno; - int wascaret = 0; - - assert(MORE()); /* caller should have ensured this */ - c = GETNEXT(); - - pos = HERE(); - switch (c) { - case '(': - (void)REQUIRE(MORE(), REG_EPAREN); - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - if (!SEE(')')) - p_ere(p, ')'); - if (subno < NPAREN) { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); - } - EMIT(ORPAREN, subno); - (void)MUSTEAT(')', REG_EPAREN); - break; -#ifndef POSIX_MISTAKE - case ')': /* happens only if no current unmatched ( */ - /* - * You may ask, why the ifndef? Because I didn't notice - * this until slightly too late for 1003.2, and none of the - * other 1003.2 regular-expression reviewers noticed it at - * all. So an unmatched ) is legal POSIX, at least until - * we can get it fixed. - */ - SETERROR(REG_EPAREN); - break; -#endif - case '^': - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - wascaret = 1; - break; - case '$': - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - break; - case '|': - SETERROR(REG_EMPTY); - break; - case '*': - case '+': - case '?': - SETERROR(REG_BADRPT); - break; - case '.': - if (p->g->cflags®_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; - case '\\': - (void)REQUIRE(MORE(), REG_EESCAPE); - wc = WGETNEXT(); -#ifdef __CYGWIN__ - /* \< and \> are the GNU equivalents to [[:<:]] and [[:>:]] */ - switch (wc) - { - case L'<': - EMIT(OBOW, 0); - break; - case L'>': - EMIT(OEOW, 0); - break; - default: - ordinary(p, wc); - break; - } -#else - ordinary(p, wc); -#endif - break; - case '{': /* okay as ordinary except if digit follows */ - (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); - /* FALLTHROUGH */ - default: - p->next--; - wc = WGETNEXT(); - ordinary(p, wc); - break; - } - - if (!MORE()) - return; - c = PEEK(); - /* we call { a repetition if followed by a digit */ - if (!( c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) - return; /* no repetition, we're done */ - NEXT(); - - (void)REQUIRE(!wascaret, REG_BADRPT); - switch (c) { - case '*': /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - break; - case '+': - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - break; - case '?': - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, pos); /* offset slightly wrong */ - ASTERN(OOR1, pos); /* this one's right */ - AHEAD(pos); /* fix the OCH_ */ - EMIT(OOR2, 0); /* offset very wrong... */ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - break; - case '{': - count = p_count(p); - if (EAT(',')) { - if (isdigit((uch)PEEK())) { - count2 = p_count(p); - (void)REQUIRE(count <= count2, REG_BADBR); - } else /* single number with comma */ - count2 = INFINITY; - } else /* just a single number */ - count2 = count; - repeat(p, pos, count, count2); - if (!EAT('}')) { /* error heuristics */ - while (MORE() && PEEK() != '}') - NEXT(); - (void)REQUIRE(MORE(), REG_EBRACE); - SETERROR(REG_BADBR); - } - break; - } - - if (!MORE()) - return; - c = PEEK(); - if (!( c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) - return; - SETERROR(REG_BADRPT); -} - -/* - - p_str - string (no metacharacters) "parser" - == static void p_str(struct parse *p); - */ -static void -p_str(struct parse *p) -{ - (void)REQUIRE(MORE(), REG_EMPTY); - while (MORE()) - ordinary(p, WGETNEXT()); -} - -/* - - p_bre - BRE parser top level, anchoring and concatenation - == static void p_bre(struct parse *p, int end1, \ - == int end2); - * Giving end1 as OUT essentially eliminates the end1/end2 check. - * - * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. - * The amount of lookahead needed to avoid this kludge is excessive. - */ -static void -p_bre(struct parse *p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ -{ - sopno start = HERE(); - int first = 1; /* first subexpression? */ - int wasdollar = 0; - - if (EAT('^')) { - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) { /* oops, that was a trailing anchor */ - DROP(1); - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - } - - (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ -} - -/* - - p_simp_re - parse a simple RE, an atom possibly followed by a repetition - == static int p_simp_re(struct parse *p, int starordinary); - */ -static int /* was the simple RE an unbackslashed $? */ -p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ -{ - int c; - int count; - int count2; - sopno pos; - int i; - wint_t wc; - sopno subno; -# define BACKSL (1<<CHAR_BIT) - - pos = HERE(); /* repetion op, if any, covers from here */ - - assert(MORE()); /* caller should have ensured this */ - c = GETNEXT(); - if (c == '\\') { - (void)REQUIRE(MORE(), REG_EESCAPE); - c = BACKSL | GETNEXT(); - } - switch (c) { - case '.': - if (p->g->cflags®_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; -#ifdef __CYGWIN__ - case BACKSL|'<': - /* \< is the GNU equivalents to [[:<:]] */ - EMIT(OBOW, 0); - break; - case BACKSL|'>': - /* \> is the GNU equivalents to [[:>:]] */ - EMIT(OEOW, 0); - break; -#endif - case BACKSL|'{': - SETERROR(REG_BADRPT); - break; - case BACKSL|'(': - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - /* the MORE here is an error heuristic */ - if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); - if (subno < NPAREN) { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); - } - EMIT(ORPAREN, subno); - (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); - break; - case BACKSL|')': /* should not get here -- must be user */ - case BACKSL|'}': - SETERROR(REG_EPAREN); - break; - case BACKSL|'1': - case BACKSL|'2': - case BACKSL|'3': - case BACKSL|'4': - case BACKSL|'5': - case BACKSL|'6': - case BACKSL|'7': - case BACKSL|'8': - case BACKSL|'9': - i = (c&~BACKSL) - '0'; - assert(i < NPAREN); - if (p->pend[i] != 0) { - assert(i <= p->g->nsub); - EMIT(OBACK_, i); - assert(p->pbegin[i] != 0); - assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); - assert(OP(p->strip[p->pend[i]]) == ORPAREN); - (void) dupl(p, p->pbegin[i]+1, p->pend[i]); - EMIT(O_BACK, i); - } else - SETERROR(REG_ESUBREG); - p->g->backrefs = 1; - break; - case '*': - (void)REQUIRE(starordinary, REG_BADRPT); - /* FALLTHROUGH */ - default: - p->next--; - wc = WGETNEXT(); - ordinary(p, wc); - break; - } - - if (EAT('*')) { /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - } else if (EATTWO('\\', '{')) { - count = p_count(p); - if (EAT(',')) { - if (MORE() && isdigit((uch)PEEK())) { - count2 = p_count(p); - (void)REQUIRE(count <= count2, REG_BADBR); - } else /* single number with comma */ - count2 = INFINITY; - } else /* just a single number */ - count2 = count; - repeat(p, pos, count, count2); - if (!EATTWO('\\', '}')) { /* error heuristics */ - while (MORE() && !SEETWO('\\', '}')) - NEXT(); - (void)REQUIRE(MORE(), REG_EBRACE); - SETERROR(REG_BADBR); - } - } else if (c == '$') /* $ (but not \$) ends it */ - return(1); - - return(0); -} - -/* - - p_count - parse a repetition count - == static int p_count(struct parse *p); - */ -static int /* the value */ -p_count(struct parse *p) -{ - int count = 0; - int ndigits = 0; - - while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { - count = count*10 + (GETNEXT() - '0'); - ndigits++; - } - - (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); - return(count); -} - -/* - - p_bracket - parse a bracketed character list - == static void p_bracket(struct parse *p); - */ -static void -p_bracket(struct parse *p) -{ - cset *cs; - wint_t ch; - - /* Dept of Truly Sickening Special-Case Kludges */ - if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { - EMIT(OBOW, 0); - NEXTn(6); - return; - } - if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { - EMIT(OEOW, 0); - NEXTn(6); - return; - } - - if ((cs = allocset(p)) == NULL) - return; - - if (p->g->cflags®_ICASE) - cs->icase = 1; - if (EAT('^')) - cs->invert = 1; - if (EAT(']')) - CHadd(p, cs, ']'); - else if (EAT('-')) - CHadd(p, cs, '-'); - while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) - p_b_term(p, cs); - if (EAT('-')) - CHadd(p, cs, '-'); - (void)MUSTEAT(']', REG_EBRACK); - - if (p->error != 0) /* don't mess things up further */ - return; - - if (cs->invert && p->g->cflags®_NEWLINE) - cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); - - if ((ch = singleton(cs)) != OUT /* optimize singleton sets */ - && cs->invert == 0) { /* But not in invert case. */ - ordinary(p, ch); - freeset(p, cs); - } else - EMIT(OANYOF, (int)(cs - p->g->sets)); -} - -#ifdef __CYGWIN__ -/* This function is usually part of FreeBSD's libc. */ -int -__collate_range_cmp(int c1, int c2) -{ - char s1[2] = { c1, '\0' }; - char s2[2] = { c2, '\0' }; - return strcoll (s1, s2); -} -#endif - -/* - - p_b_term - parse one term of a bracketed character list - == static void p_b_term(struct parse *p, cset *cs); - */ -static void -p_b_term(struct parse *p, cset *cs) -{ - char c; - wint_t start, finish; - wint_t i; - - /* classify what we've got */ - switch ((MORE()) ? PEEK() : '\0') { - case '[': - c = (MORE2()) ? PEEK2() : '\0'; - break; - case '-': - SETERROR(REG_ERANGE); - return; /* NOTE RETURN */ - break; - default: - c = '\0'; - break; - } - - switch (c) { - case ':': /* character class */ - NEXT2(); - (void)REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); - p_b_cclass(p, cs); - (void)REQUIRE(MORE(), REG_EBRACK); - (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); - break; - case '=': /* equivalence class */ - NEXT2(); - (void)REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); - p_b_eclass(p, cs); - (void)REQUIRE(MORE(), REG_EBRACK); - (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); - break; - default: /* symbol, ordinary character, or range */ - start = p_b_symbol(p); - if (SEE('-') && MORE2() && PEEK2() != ']') { - /* range */ - NEXT(); - if (EAT('-')) - finish = '-'; - else - finish = p_b_symbol(p); - } else if (SEE('-') && !MORE2()) { - SETERROR(REG_EBRACK); - return; - } else - finish = start; - if (start == finish) - CHadd(p, cs, start); - else { -#ifdef __CYGWIN__ - if (!collate_lcid) { -#else - if (__collate_load_error) { -#endif - (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); - CHaddrange(p, cs, start, finish); - } else { - (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); - for (i = 0; i <= UCHAR_MAX; i++) { - if ( __collate_range_cmp(start, i) <= 0 - && __collate_range_cmp(i, finish) <= 0 - ) - CHadd(p, cs, i); - } - } - } - break; - } -} - -/* - - p_b_cclass - parse a character-class name and deal with it - == static void p_b_cclass(struct parse *p, cset *cs); - */ -static void -p_b_cclass(struct parse *p, cset *cs) -{ - char *sp = p->next; - size_t len; - wctype_t wct; - char clname[16]; - - while (MORE() && isalpha((uch)PEEK())) - NEXT(); - len = p->next - sp; - if (len >= sizeof(clname) - 1) { - SETERROR(REG_ECTYPE); - return; - } - memcpy(clname, sp, len); - clname[len] = '\0'; - if ((wct = wctype(clname)) == 0) { - SETERROR(REG_ECTYPE); - return; - } - CHaddtype(p, cs, wct); -} - -/* - - p_b_eclass - parse an equivalence-class name and deal with it - == static void p_b_eclass(struct parse *p, cset *cs); - * - * This implementation is incomplete. xxx - */ -static void -p_b_eclass(struct parse *p, cset *cs) -{ - wint_t c; - - c = p_b_coll_elem(p, '='); - CHadd(p, cs, c); -} - -/* - - p_b_symbol - parse a character or [..]ed multicharacter collating symbol - == static char p_b_symbol(struct parse *p); - */ -static wint_t /* value of symbol */ -p_b_symbol(struct parse *p) -{ - wint_t value; - - (void)REQUIRE(MORE(), REG_EBRACK); - if (!EATTWO('[', '.')) - return(WGETNEXT()); - - /* collating symbol */ - value = p_b_coll_elem(p, '.'); - (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); - return(value); -} - -/* - - p_b_coll_elem - parse a collating-element name and look it up - == static char p_b_coll_elem(struct parse *p, int endc); - */ -static wint_t /* value of collating element */ -p_b_coll_elem(struct parse *p, - wint_t endc) /* name ended by endc,']' */ -{ - char *sp = p->next; - struct cname *cp; - int len; - mbstate_t mbs; - wchar_t wc; - size_t clen; - - while (MORE() && !SEETWO(endc, ']')) - NEXT(); - if (!MORE()) { - SETERROR(REG_EBRACK); - return(0); - } - len = p->next - sp; - for (cp = cnames; cp->name != NULL; cp++) - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') - return(cp->code); /* known name */ - memset(&mbs, 0, sizeof(mbs)); - if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) - return (wc); /* single character */ - else if (clen == (size_t)-1 || clen == (size_t)-2) - SETERROR(REG_ILLSEQ); - else - SETERROR(REG_ECOLLATE); /* neither */ - return(0); -} - -/* - - othercase - return the case counterpart of an alphabetic - == static char othercase(int ch); - */ -static wint_t /* if no counterpart, return ch */ -othercase(wint_t ch) -{ - assert(iswalpha(ch)); - if (iswupper(ch)) - return(towlower(ch)); - else if (iswlower(ch)) - return(towupper(ch)); - else /* peculiar, but could happen */ - return(ch); -} - -/* - - bothcases - emit a dualcase version of a two-case character - == static void bothcases(struct parse *p, int ch); - * - * Boy, is this implementation ever a kludge... - */ -static void -bothcases(struct parse *p, wint_t ch) -{ - char *oldnext = p->next; - char *oldend = p->end; - char bracket[3 + MB_LEN_MAX]; - size_t n; - mbstate_t mbs; - - assert(othercase(ch) != ch); /* p_bracket() would recurse */ - p->next = bracket; - memset(&mbs, 0, sizeof(mbs)); - n = xwcrtomb(bracket, ch, &mbs); - assert(n != (size_t)-1); - bracket[n] = ']'; - bracket[n + 1] = '\0'; - p->end = bracket+n+1; - p_bracket(p); - assert(p->next == p->end); - p->next = oldnext; - p->end = oldend; -} - -/* - - ordinary - emit an ordinary character - == static void ordinary(struct parse *p, int ch); - */ -static void -ordinary(struct parse *p, wint_t ch) -{ - cset *cs; - - if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) - bothcases(p, ch); - else if ((ch & OPDMASK) == ch) - EMIT(OCHAR, ch); - else { - /* - * Kludge: character is too big to fit into an OCHAR operand. - * Emit a singleton set. - */ - if ((cs = allocset(p)) == NULL) - return; - CHadd(p, cs, ch); - EMIT(OANYOF, (int)(cs - p->g->sets)); - } -} - -/* - - nonnewline - emit REG_NEWLINE version of OANY - == static void nonnewline(struct parse *p); - * - * Boy, is this implementation ever a kludge... - */ -static void -nonnewline(struct parse *p) -{ - char *oldnext = p->next; - char *oldend = p->end; - char bracket[4]; - - p->next = bracket; - p->end = bracket+3; - bracket[0] = '^'; - bracket[1] = '\n'; - bracket[2] = ']'; - bracket[3] = '\0'; - p_bracket(p); - assert(p->next == bracket+3); - p->next = oldnext; - p->end = oldend; -} - -/* - - repeat - generate code for a bounded repetition, recursively if needed - == static void repeat(struct parse *p, sopno start, int from, int to); - */ -static void -repeat(struct parse *p, - sopno start, /* operand from here to end of strip */ - int from, /* repeated from this number */ - int to) /* to this number of times (maybe INFINITY) */ -{ - sopno finish = HERE(); -# define N 2 -# define INF 3 -# define REP(f, t) ((f)*8 + (t)) -# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) - sopno copy; - - if (p->error != 0) /* head off possible runaway recursion */ - return; - - assert(from <= to); - - switch (REP(MAP(from), MAP(to))) { - case REP(0, 0): /* must be user doing this */ - DROP(finish-start); /* drop the operand */ - break; - case REP(0, 1): /* as x{1,1}? */ - case REP(0, N): /* as x{1,n}? */ - case REP(0, INF): /* as x{1,}? */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); /* offset is wrong... */ - repeat(p, start+1, 1, to); - ASTERN(OOR1, start); - AHEAD(start); /* ... fix it */ - EMIT(OOR2, 0); - AHEAD(THERE()); - ASTERN(O_CH, THERETHERE()); - break; - case REP(1, 1): /* trivial case */ - /* done */ - break; - case REP(1, N): /* as x?x{1,n-1} */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); - ASTERN(OOR1, start); - AHEAD(start); - EMIT(OOR2, 0); /* offset very wrong... */ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - copy = dupl(p, start+1, finish+1); - assert(copy == finish+4); - repeat(p, copy, 1, to-1); - break; - case REP(1, INF): /* as x+ */ - INSERT(OPLUS_, start); - ASTERN(O_PLUS, start); - break; - case REP(N, N): /* as xx{m-1,n-1} */ - copy = dupl(p, start, finish); - repeat(p, copy, from-1, to-1); - break; - case REP(N, INF): /* as xx{n-1,INF} */ - copy = dupl(p, start, finish); - repeat(p, copy, from-1, to); - break; - default: /* "can't happen" */ - SETERROR(REG_ASSERT); /* just in case */ - break; - } -} - -/* - - wgetnext - helper function for WGETNEXT() macro. Gets the next wide - - character from the parse struct, signals a REG_ILLSEQ error if the - - character can't be converted. Returns the number of bytes consumed. - */ -static wint_t -wgetnext(struct parse *p) -{ - mbstate_t mbs; - wchar_t wc; - wint_t ret; - size_t n; - - memset(&mbs, 0, sizeof(mbs)); - n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); - if (n == (size_t)-1 || n == (size_t)-2) { - SETERROR(REG_ILLSEQ); - return (0); - } - ret = wc; - if (n == 0) - n = 1; - else if (sizeof (wchar_t) == 2 && wc >= 0xd800 && wc <= 0xdbff) { - /* UTF-16 surrogate pair. Fetch second half and - compute UTF-32 value */ - size_t n2 = mbrtowc(&wc, p->next + n, - p->end - p->next - n, &mbs); - if (n2 == 0 || n2 == (size_t)-1 || n2 == (size_t)-2) { - SETERROR(REG_ILLSEQ); - return (0); - } - ret = (((ret & 0x3ff) << 10) | (wc & 0x3ff)) - + 0x10000; - n += n2; - } - p->next += n; - return (ret); -} - -static size_t -xwcrtomb (char *s, wint_t wc, mbstate_t *ps) -{ - if (sizeof (wchar_t) == 2 && wc >= 0x10000) - { - /* UTF-16 wcrtomb can't handle these values directly. The rest of the - code isn't surrogate pair aware, so we handle this here. Convert - value to UTF-16 surrogate and call wcsrtombs to convert the "string" - to the correct multibyte representation, if any. */ - wchar_t ws[2]; - const wchar_t *wsp = ws; - - wc -= 0x10000; - ws[0] = 0xd800 | (wc >> 10); - ws[1] = 0xdc00 | (wc & 0x3ff); - return wcsnrtombs (s, &wsp, 2, MB_CUR_MAX, ps); - } - return wcrtomb (s, wc, ps); -} - - -/* - - seterr - set an error condition - == static int seterr(struct parse *p, int e); - */ -static int /* useless but makes type checking happy */ -seterr(struct parse *p, int e) -{ - if (p->error == 0) /* keep earliest error condition */ - p->error = e; - p->next = nuls; /* try to bring things to a halt */ - p->end = nuls; - return(0); /* make the return value well-defined */ -} - -/* - - allocset - allocate a set of characters for [] - == static cset *allocset(struct parse *p); - */ -static cset * -allocset(struct parse *p) -{ - cset *cs, *ncs; - - ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); - if (ncs == NULL) { - SETERROR(REG_ESPACE); - return (NULL); - } - p->g->sets = ncs; - cs = &p->g->sets[p->g->ncsets++]; - memset(cs, 0, sizeof(*cs)); - - return(cs); -} - -/* - - freeset - free a now-unused set - == static void freeset(struct parse *p, cset *cs); - */ -static void -freeset(struct parse *p, cset *cs) -{ - cset *top = &p->g->sets[p->g->ncsets]; - - free(cs->wides); - free(cs->ranges); - free(cs->types); - memset(cs, 0, sizeof(*cs)); - if (cs == top-1) /* recover only the easy case */ - p->g->ncsets--; -} - -/* - - singleton - Determine whether a set contains only one character, - - returning it if so, otherwise returning OUT. - */ -static wint_t -singleton(cset *cs) -{ - wint_t i, s, n; - - for (i = n = 0; i < NC; i++) - if (CHIN(cs, i)) { - n++; - s = i; - } - if (n == 1 && cs->nwides == 0) - return (s); - if (n == 0 && cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && - cs->icase == 0) - return (cs->wides[0]); - /* Don't bother handling the other cases. */ - return (OUT); -} - -/* - - CHadd - add character to character set. - */ -static void -CHadd(struct parse *p, cset *cs, wint_t ch) -{ - wint_t nch, *newwides; - assert(ch >= 0); - if (ch < NC) - cs->bmp[ch >> 3] |= 1 << (ch & 7); - else { - newwides = realloc(cs->wides, (cs->nwides + 1) * - sizeof(*cs->wides)); - if (newwides == NULL) { - SETERROR(REG_ESPACE); - return; - } - cs->wides = newwides; - cs->wides[cs->nwides++] = ch; - } - if (cs->icase) { - if ((nch = towlower(ch)) < NC) - cs->bmp[nch >> 3] |= 1 << (nch & 7); - if ((nch = towupper(ch)) < NC) - cs->bmp[nch >> 3] |= 1 << (nch & 7); - } -} - -/* - - CHaddrange - add all characters in the range [min,max] to a character set. - */ -static void -CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) -{ - crange *newranges; - - for (; min < NC && min <= max; min++) - CHadd(p, cs, min); - if (min >= max) - return; - newranges = realloc(cs->ranges, (cs->nranges + 1) * - sizeof(*cs->ranges)); - if (newranges == NULL) { - SETERROR(REG_ESPACE); - return; - } - cs->ranges = newranges; - cs->ranges[cs->nranges].min = min; - cs->ranges[cs->nranges].min = max; - cs->nranges++; -} - -/* - - CHaddtype - add all characters of a certain type to a character set. - */ -static void -CHaddtype(struct parse *p, cset *cs, wctype_t wct) -{ - wint_t i; - wctype_t *newtypes; - - for (i = 0; i < NC; i++) - if (iswctype(i, wct)) - CHadd(p, cs, i); - newtypes = realloc(cs->types, (cs->ntypes + 1) * - sizeof(*cs->types)); - if (newtypes == NULL) { - SETERROR(REG_ESPACE); - return; - } - cs->types = newtypes; - cs->types[cs->ntypes++] = wct; -} - -/* - - dupl - emit a duplicate of a bunch of sops - == static sopno dupl(struct parse *p, sopno start, sopno finish); - */ -static sopno /* start of duplicate */ -dupl(struct parse *p, - sopno start, /* from here */ - sopno finish) /* to this less one */ -{ - sopno ret = HERE(); - sopno len = finish - start; - - assert(finish >= start); - if (len == 0) - return(ret); - enlarge(p, p->ssize + len); /* this many unexpected additions */ - assert(p->ssize >= p->slen + len); - (void) memcpy((char *)(p->strip + p->slen), - (char *)(p->strip + start), (size_t)len*sizeof(sop)); - p->slen += len; - return(ret); -} - -/* - - doemit - emit a strip operator - == static void doemit(struct parse *p, sop op, size_t opnd); - * - * It might seem better to implement this as a macro with a function as - * hard-case backup, but it's just too big and messy unless there are - * some changes to the data structures. Maybe later. - */ -static void -doemit(struct parse *p, sop op, size_t opnd) -{ - /* avoid making error situations worse */ - if (p->error != 0) - return; - - /* deal with oversize operands ("can't happen", more or less) */ - assert(opnd < 1<<OPSHIFT); - - /* deal with undersized strip */ - if (p->slen >= p->ssize) - enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ - assert(p->slen < p->ssize); - - /* finally, it's all reduced to the easy case */ - p->strip[p->slen++] = SOP(op, opnd); -} - -/* - - doinsert - insert a sop into the strip - == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); - */ -static void -doinsert(struct parse *p, sop op, size_t opnd, sopno pos) -{ - sopno sn; - sop s; - int i; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - sn = HERE(); - EMIT(op, opnd); /* do checks, ensure space */ - assert(HERE() == sn+1); - s = p->strip[sn]; - - /* adjust paren pointers */ - assert(pos > 0); - for (i = 1; i < NPAREN; i++) { - if (p->pbegin[i] >= pos) { - p->pbegin[i]++; - } - if (p->pend[i] >= pos) { - p->pend[i]++; - } - } - - memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], - (HERE()-pos-1)*sizeof(sop)); - p->strip[pos] = s; -} - -/* - - dofwd - complete a forward reference - == static void dofwd(struct parse *p, sopno pos, sop value); - */ -static void -dofwd(struct parse *p, sopno pos, sop value) -{ - /* avoid making error situations worse */ - if (p->error != 0) - return; - - assert(value < 1<<OPSHIFT); - p->strip[pos] = OP(p->strip[pos]) | value; -} - -/* - - enlarge - enlarge the strip - == static void enlarge(struct parse *p, sopno size); - */ -static void -enlarge(struct parse *p, sopno size) -{ - sop *sp; - - if (p->ssize >= size) - return; - - sp = (sop *)realloc(p->strip, size*sizeof(sop)); - if (sp == NULL) { - SETERROR(REG_ESPACE); - return; - } - p->strip = sp; - p->ssize = size; -} - -/* - - stripsnug - compact the strip - == static void stripsnug(struct parse *p, struct re_guts *g); - */ -static void -stripsnug(struct parse *p, struct re_guts *g) -{ - g->nstates = p->slen; - g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); - if (g->strip == NULL) { - SETERROR(REG_ESPACE); - g->strip = p->strip; - } -} - -/* - - findmust - fill in must and mlen with longest mandatory literal string - == static void findmust(struct parse *p, struct re_guts *g); - * - * This algorithm could do fancy things like analyzing the operands of | - * for common subsequences. Someday. This code is simple and finds most - * of the interesting cases. - * - * Note that must and mlen got initialized during setup. - */ -static void -findmust(struct parse *p, struct re_guts *g) -{ - sop *scan; - sop *start; - sop *newstart; - sopno newlen; - sop s; - char *cp; - int offset; - char buf[MB_LEN_MAX]; - size_t clen; - mbstate_t mbs; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - /* - * It's not generally safe to do a ``char'' substring search on - * multibyte character strings, but it's safe for at least - * UTF-8 (see RFC 3629). - */ - if (MB_CUR_MAX > 1 && -#ifdef __CYGWIN__ - strcmp(__locale_charset (), "UTF-8") != 0) -#else - strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) -#endif - return; - - /* find the longest OCHAR sequence in strip */ - newlen = 0; - offset = 0; - g->moffset = 0; - scan = g->strip + 1; - do { - s = *scan++; - switch (OP(s)) { - case OCHAR: /* sequence member */ - if (newlen == 0) { /* new sequence */ - memset(&mbs, 0, sizeof(mbs)); - newstart = scan - 1; - } - clen = xwcrtomb(buf, OPND(s), &mbs); - if (clen == (size_t)-1) - goto toohard; - newlen += clen; - break; - case OPLUS_: /* things that don't break one */ - case OLPAREN: - case ORPAREN: - break; - case OQUEST_: /* things that must be skipped */ - case OCH_: - offset = altoffset(scan, offset); - scan--; - do { - scan += OPND(s); - s = *scan; - /* assert() interferes w debug printouts */ - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) { - g->iflags |= BAD; - return; - } - } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* FALLTHROUGH */ - case OBOW: /* things that break a sequence */ - case OEOW: - case OBOL: - case OEOL: - case O_QUEST: - case O_CH: - case OEND: - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) { - g->moffset += offset; - offset = newlen; - } else - g->moffset = offset; - } else { - if (offset > -1) - offset += newlen; - } - newlen = 0; - break; - case OANY: - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) { - g->moffset += offset; - offset = newlen; - } else - g->moffset = offset; - } else { - if (offset > -1) - offset += newlen; - } - if (offset > -1) - offset++; - newlen = 0; - break; - case OANYOF: /* may or may not invalidate offset */ - /* First, everything as OANY */ - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) { - g->moffset += offset; - offset = newlen; - } else - g->moffset = offset; - } else { - if (offset > -1) - offset += newlen; - } - if (offset > -1) - offset++; - newlen = 0; - break; - toohard: - default: - /* Anything here makes it impossible or too hard - * to calculate the offset -- so we give up; - * save the last known good offset, in case the - * must sequence doesn't occur later. - */ - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) - g->moffset += offset; - else - g->moffset = offset; - } - offset = -1; - newlen = 0; - break; - } - } while (OP(s) != OEND); - - if (g->mlen == 0) { /* there isn't one */ - g->moffset = -1; - return; - } - - /* turn it into a character string */ - g->must = malloc((size_t)g->mlen + 1); - if (g->must == NULL) { /* argh; just forget it */ - g->mlen = 0; - g->moffset = -1; - return; - } - cp = g->must; - scan = start; - memset(&mbs, 0, sizeof(mbs)); - while (cp < g->must + g->mlen) { - while (OP(s = *scan++) != OCHAR) - continue; - clen = xwcrtomb(cp, OPND(s), &mbs); - assert(clen != (size_t)-1); - cp += clen; - } - assert(cp == g->must + g->mlen); - *cp++ = '\0'; /* just on general principles */ -} - -/* - - altoffset - choose biggest offset among multiple choices - == static int altoffset(sop *scan, int offset); - * - * Compute, recursively if necessary, the largest offset among multiple - * re paths. - */ -static int -altoffset(sop *scan, int offset) -{ - int largest; - int try; - sop s; - - /* If we gave up already on offsets, return */ - if (offset == -1) - return -1; - - largest = 0; - try = 0; - s = *scan++; - while (OP(s) != O_QUEST && OP(s) != O_CH) { - switch (OP(s)) { - case OOR1: - if (try > largest) - largest = try; - try = 0; - break; - case OQUEST_: - case OCH_: - try = altoffset(scan, try); - if (try == -1) - return -1; - scan--; - do { - scan += OPND(s); - s = *scan; - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) - return -1; - } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* We must skip to the next position, or we'll - * leave altoffset() too early. - */ - scan++; - break; - case OANYOF: - case OCHAR: - case OANY: - try++; - case OBOW: - case OEOW: - case OLPAREN: - case ORPAREN: - case OOR2: - break; - default: - try = -1; - break; - } - if (try == -1) - return -1; - s = *scan++; - } - - if (try > largest) - largest = try; - - return largest+offset; -} - -/* - - computejumps - compute char jumps for BM scan - == static void computejumps(struct parse *p, struct re_guts *g); - * - * This algorithm assumes g->must exists and is has size greater than - * zero. It's based on the algorithm found on Computer Algorithms by - * Sara Baase. - * - * A char jump is the number of characters one needs to jump based on - * the value of the character from the text that was mismatched. - */ -static void -computejumps(struct parse *p, struct re_guts *g) -{ - int ch; - int mindex; - - /* Avoid making errors worse */ - if (p->error != 0) - return; - - g->charjump = (int*) malloc((NC + 1) * sizeof(int)); - if (g->charjump == NULL) /* Not a fatal error */ - return; - /* Adjust for signed chars, if necessary */ - g->charjump = &g->charjump[-(CHAR_MIN)]; - - /* If the character does not exist in the pattern, the jump - * is equal to the number of characters in the pattern. - */ - for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) - g->charjump[ch] = g->mlen; - - /* If the character does exist, compute the jump that would - * take us to the last character in the pattern equal to it - * (notice that we match right to left, so that last character - * is the first one that would be matched). - */ - for (mindex = 0; mindex < g->mlen; mindex++) - g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; -} - -/* - - computematchjumps - compute match jumps for BM scan - == static void computematchjumps(struct parse *p, struct re_guts *g); - * - * This algorithm assumes g->must exists and is has size greater than - * zero. It's based on the algorithm found on Computer Algorithms by - * Sara Baase. - * - * A match jump is the number of characters one needs to advance based - * on the already-matched suffix. - * Notice that all values here are minus (g->mlen-1), because of the way - * the search algorithm works. - */ -static void -computematchjumps(struct parse *p, struct re_guts *g) -{ - int mindex; /* General "must" iterator */ - int suffix; /* Keeps track of matching suffix */ - int ssuffix; /* Keeps track of suffixes' suffix */ - int* pmatches; /* pmatches[k] points to the next i - * such that i+1...mlen is a substring - * of k+1...k+mlen-i-1 - */ - - /* Avoid making errors worse */ - if (p->error != 0) - return; - - pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); - if (pmatches == NULL) { - g->matchjump = NULL; - return; - } - - g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); - if (g->matchjump == NULL) /* Not a fatal error */ - return; - - /* Set maximum possible jump for each character in the pattern */ - for (mindex = 0; mindex < g->mlen; mindex++) - g->matchjump[mindex] = 2*g->mlen - mindex - 1; - - /* Compute pmatches[] */ - for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; - mindex--, suffix--) { - pmatches[mindex] = suffix; - - /* If a mismatch is found, interrupting the substring, - * compute the matchjump for that position. If no - * mismatch is found, then a text substring mismatched - * against the suffix will also mismatch against the - * substring. - */ - while (suffix < g->mlen - && g->must[mindex] != g->must[suffix]) { - g->matchjump[suffix] = MIN(g->matchjump[suffix], - g->mlen - mindex - 1); - suffix = pmatches[suffix]; - } - } - - /* Compute the matchjump up to the last substring found to jump - * to the beginning of the largest must pattern prefix matching - * it's own suffix. - */ - for (mindex = 0; mindex <= suffix; mindex++) - g->matchjump[mindex] = MIN(g->matchjump[mindex], - g->mlen + suffix - mindex); - - ssuffix = pmatches[suffix]; - while (suffix < g->mlen) { - while (suffix <= ssuffix && suffix < g->mlen) { - g->matchjump[suffix] = MIN(g->matchjump[suffix], - g->mlen + ssuffix - suffix); - suffix++; - } - if (suffix < g->mlen) - ssuffix = pmatches[ssuffix]; - } - - free(pmatches); -} - -/* - - pluscount - count + nesting - == static sopno pluscount(struct parse *p, struct re_guts *g); - */ -static sopno /* nesting depth */ -pluscount(struct parse *p, struct re_guts *g) -{ - sop *scan; - sop s; - sopno plusnest = 0; - sopno maxnest = 0; - - if (p->error != 0) - return(0); /* there may not be an OEND */ - - scan = g->strip + 1; - do { - s = *scan++; - switch (OP(s)) { - case OPLUS_: - plusnest++; - break; - case O_PLUS: - if (plusnest > maxnest) - maxnest = plusnest; - plusnest--; - break; - } - } while (OP(s) != OEND); - if (plusnest != 0) - g->iflags |= BAD; - return(maxnest); -} diff --git a/winsup/cygwin/regex/regerror.c b/winsup/cygwin/regex/regerror.c deleted file mode 100644 index 1bba3e4a6..000000000 --- a/winsup/cygwin/regex/regerror.c +++ /dev/null @@ -1,190 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regerror.c 8.4 (Berkeley) 3/20/94 - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)regerror.c 8.4 (Berkeley) 3/20/94"; -#endif /* LIBC_SCCS and not lint */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/lib/libc/regex/regerror.c,v 1.11 2007/06/11 03:05:54 delphij Exp $"); - -#include <sys/types.h> -#include <stdio.h> -#include <string.h> -#include <limits.h> -#include <stdlib.h> -#include <regex.h> - -#include "utils.h" - -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === regerror.c === */ -static char *regatoi(const regex_t *preg, char *localbuf); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ -/* - = #define REG_NOMATCH 1 - = #define REG_BADPAT 2 - = #define REG_ECOLLATE 3 - = #define REG_ECTYPE 4 - = #define REG_EESCAPE 5 - = #define REG_ESUBREG 6 - = #define REG_EBRACK 7 - = #define REG_EPAREN 8 - = #define REG_EBRACE 9 - = #define REG_BADBR 10 - = #define REG_ERANGE 11 - = #define REG_ESPACE 12 - = #define REG_BADRPT 13 - = #define REG_EMPTY 14 - = #define REG_ASSERT 15 - = #define REG_INVARG 16 - = #define REG_ILLSEQ 17 - = #define REG_ATOI 255 // convert name to number (!) - = #define REG_ITOA 0400 // convert number to name (!) - */ -static struct rerr { - int code; -#ifdef __CYGWIN__ /* Avoid whining compiler */ - const char *name; - const char *explain; -#else - char *name; - char *explain; -#endif -} rerrs[] = { - {REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match"}, - {REG_BADPAT, "REG_BADPAT", "invalid regular expression"}, - {REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"}, - {REG_ECTYPE, "REG_ECTYPE", "invalid character class"}, - {REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)"}, - {REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"}, - {REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced"}, - {REG_EPAREN, "REG_EPAREN", "parentheses not balanced"}, - {REG_EBRACE, "REG_EBRACE", "braces not balanced"}, - {REG_BADBR, "REG_BADBR", "invalid repetition count(s)"}, - {REG_ERANGE, "REG_ERANGE", "invalid character range"}, - {REG_ESPACE, "REG_ESPACE", "out of memory"}, - {REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid"}, - {REG_EMPTY, "REG_EMPTY", "empty (sub)expression"}, - {REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"}, - {REG_INVARG, "REG_INVARG", "invalid argument to regex routine"}, - {REG_ILLSEQ, "REG_ILLSEQ", "illegal byte sequence"}, - {0, "", "*** unknown regexp error code ***"} -}; - -/* - - regerror - the interface to error numbers - = extern size_t regerror(int, const regex_t *, char *, size_t); - */ -/* ARGSUSED */ -size_t -regerror(int errcode, - const regex_t * __restrict preg, - char * __restrict errbuf, - size_t errbuf_size) -{ - struct rerr *r; - size_t len; - int target = errcode &~ REG_ITOA; -#ifdef __CYGWIN__ /* Avoid whining compiler */ - const char *s; -#else - char *s; -#endif - char convbuf[50]; - - if (errcode == REG_ATOI) - s = regatoi(preg, convbuf); - else { - for (r = rerrs; r->code != 0; r++) - if (r->code == target) - break; - - if (errcode®_ITOA) { - if (r->code != 0) - (void) strcpy(convbuf, r->name); - else - sprintf(convbuf, "REG_0x%x", target); - assert(strlen(convbuf) < sizeof(convbuf)); - s = convbuf; - } else - s = r->explain; - } - - len = strlen(s) + 1; - if (errbuf_size > 0) { - if (errbuf_size > len) - (void) strcpy(errbuf, s); - else { - (void) strncpy(errbuf, s, errbuf_size-1); - errbuf[errbuf_size-1] = '\0'; - } - } - - return(len); -} - -/* - - regatoi - internal routine to implement REG_ATOI - == static char *regatoi(const regex_t *preg, char *localbuf); - */ -static char * -regatoi(const regex_t *preg, char *localbuf) -{ - struct rerr *r; - - for (r = rerrs; r->code != 0; r++) - if (strcmp(r->name, preg->re_endp) == 0) - break; - if (r->code == 0) -#ifdef __CYGWIN__ /* Avoid whining compiler */ - { - static char null[] = "0"; - return null; - } -#else - return("0"); -#endif - - sprintf(localbuf, "%d", r->code); - return(localbuf); -} diff --git a/winsup/cygwin/regex/regex.3 b/winsup/cygwin/regex/regex.3 deleted file mode 100644 index f848d66c3..000000000 --- a/winsup/cygwin/regex/regex.3 +++ /dev/null @@ -1,727 +0,0 @@ -.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. -.\" Copyright (c) 1992, 1993, 1994 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" This code is derived from software contributed to Berkeley by -.\" Henry Spencer. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)regex.3 8.4 (Berkeley) 3/20/94 -.\" $FreeBSD: src/lib/libc/regex/regex.3,v 1.21 2007/01/09 00:28:04 imp Exp $ -.\" -.Dd August 17, 2005 -.Dt REGEX 3 -.Os -.Sh NAME -.Nm regcomp , -.Nm regexec , -.Nm regerror , -.Nm regfree -.Nd regular-expression library -.Sh LIBRARY -.Lb libc -.Sh SYNOPSIS -.In regex.h -.Ft int -.Fo regcomp -.Fa "regex_t * restrict preg" "const char * restrict pattern" "int cflags" -.Fc -.Ft int -.Fo regexec -.Fa "const regex_t * restrict preg" "const char * restrict string" -.Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags" -.Fc -.Ft size_t -.Fo regerror -.Fa "int errcode" "const regex_t * restrict preg" -.Fa "char * restrict errbuf" "size_t errbuf_size" -.Fc -.Ft void -.Fn regfree "regex_t *preg" -.Sh DESCRIPTION -These routines implement -.St -p1003.2 -regular expressions -.Pq Do RE Dc Ns s ; -see -.Xr re_format 7 . -The -.Fn regcomp -function -compiles an RE written as a string into an internal form, -.Fn regexec -matches that internal form against a string and reports results, -.Fn regerror -transforms error codes from either into human-readable messages, -and -.Fn regfree -frees any dynamically-allocated storage used by the internal form -of an RE. -.Pp -The header -.In regex.h -declares two structure types, -.Ft regex_t -and -.Ft regmatch_t , -the former for compiled internal forms and the latter for match reporting. -It also declares the four functions, -a type -.Ft regoff_t , -and a number of constants with names starting with -.Dq Dv REG_ . -.Pp -The -.Fn regcomp -function -compiles the regular expression contained in the -.Fa pattern -string, -subject to the flags in -.Fa cflags , -and places the results in the -.Ft regex_t -structure pointed to by -.Fa preg . -The -.Fa cflags -argument -is the bitwise OR of zero or more of the following flags: -.Bl -tag -width REG_EXTENDED -.It Dv REG_EXTENDED -Compile modern -.Pq Dq extended -REs, -rather than the obsolete -.Pq Dq basic -REs that -are the default. -.It Dv REG_BASIC -This is a synonym for 0, -provided as a counterpart to -.Dv REG_EXTENDED -to improve readability. -.It Dv REG_NOSPEC -Compile with recognition of all special characters turned off. -All characters are thus considered ordinary, -so the -.Dq RE -is a literal string. -This is an extension, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -.Dv REG_EXTENDED -and -.Dv REG_NOSPEC -may not be used -in the same call to -.Fn regcomp . -.It Dv REG_ICASE -Compile for matching that ignores upper/lower case distinctions. -See -.Xr re_format 7 . -.It Dv REG_NOSUB -Compile for matching that need only report success or failure, -not what was matched. -.It Dv REG_NEWLINE -Compile for newline-sensitive matching. -By default, newline is a completely ordinary character with no special -meaning in either REs or strings. -With this flag, -.Ql [^ -bracket expressions and -.Ql .\& -never match newline, -a -.Ql ^\& -anchor matches the null string after any newline in the string -in addition to its normal function, -and the -.Ql $\& -anchor matches the null string before any newline in the -string in addition to its normal function. -.It Dv REG_PEND -The regular expression ends, -not at the first NUL, -but just before the character pointed to by the -.Va re_endp -member of the structure pointed to by -.Fa preg . -The -.Va re_endp -member is of type -.Ft "const char *" . -This flag permits inclusion of NULs in the RE; -they are considered ordinary characters. -This is an extension, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -.El -.Pp -When successful, -.Fn regcomp -returns 0 and fills in the structure pointed to by -.Fa preg . -One member of that structure -(other than -.Va re_endp ) -is publicized: -.Va re_nsub , -of type -.Ft size_t , -contains the number of parenthesized subexpressions within the RE -(except that the value of this member is undefined if the -.Dv REG_NOSUB -flag was used). -If -.Fn regcomp -fails, it returns a non-zero error code; -see -.Sx DIAGNOSTICS . -.Pp -The -.Fn regexec -function -matches the compiled RE pointed to by -.Fa preg -against the -.Fa string , -subject to the flags in -.Fa eflags , -and reports results using -.Fa nmatch , -.Fa pmatch , -and the returned value. -The RE must have been compiled by a previous invocation of -.Fn regcomp . -The compiled form is not altered during execution of -.Fn regexec , -so a single compiled RE can be used simultaneously by multiple threads. -.Pp -By default, -the NUL-terminated string pointed to by -.Fa string -is considered to be the text of an entire line, minus any terminating -newline. -The -.Fa eflags -argument is the bitwise OR of zero or more of the following flags: -.Bl -tag -width REG_STARTEND -.It Dv REG_NOTBOL -The first character of -the string -is not the beginning of a line, so the -.Ql ^\& -anchor should not match before it. -This does not affect the behavior of newlines under -.Dv REG_NEWLINE . -.It Dv REG_NOTEOL -The NUL terminating -the string -does not end a line, so the -.Ql $\& -anchor should not match before it. -This does not affect the behavior of newlines under -.Dv REG_NEWLINE . -.It Dv REG_STARTEND -The string is considered to start at -.Fa string -+ -.Fa pmatch Ns [0]. Ns Va rm_so -and to have a terminating NUL located at -.Fa string -+ -.Fa pmatch Ns [0]. Ns Va rm_eo -(there need not actually be a NUL at that location), -regardless of the value of -.Fa nmatch . -See below for the definition of -.Fa pmatch -and -.Fa nmatch . -This is an extension, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -Note that a non-zero -.Va rm_so -does not imply -.Dv REG_NOTBOL ; -.Dv REG_STARTEND -affects only the location of the string, -not how it is matched. -.El -.Pp -See -.Xr re_format 7 -for a discussion of what is matched in situations where an RE or a -portion thereof could match any of several substrings of -.Fa string . -.Pp -Normally, -.Fn regexec -returns 0 for success and the non-zero code -.Dv REG_NOMATCH -for failure. -Other non-zero error codes may be returned in exceptional situations; -see -.Sx DIAGNOSTICS . -.Pp -If -.Dv REG_NOSUB -was specified in the compilation of the RE, -or if -.Fa nmatch -is 0, -.Fn regexec -ignores the -.Fa pmatch -argument (but see below for the case where -.Dv REG_STARTEND -is specified). -Otherwise, -.Fa pmatch -points to an array of -.Fa nmatch -structures of type -.Ft regmatch_t . -Such a structure has at least the members -.Va rm_so -and -.Va rm_eo , -both of type -.Ft regoff_t -(a signed arithmetic type at least as large as an -.Ft off_t -and a -.Ft ssize_t ) , -containing respectively the offset of the first character of a substring -and the offset of the first character after the end of the substring. -Offsets are measured from the beginning of the -.Fa string -argument given to -.Fn regexec . -An empty substring is denoted by equal offsets, -both indicating the character following the empty substring. -.Pp -The 0th member of the -.Fa pmatch -array is filled in to indicate what substring of -.Fa string -was matched by the entire RE. -Remaining members report what substring was matched by parenthesized -subexpressions within the RE; -member -.Va i -reports subexpression -.Va i , -with subexpressions counted (starting at 1) by the order of their opening -parentheses in the RE, left to right. -Unused entries in the array (corresponding either to subexpressions that -did not participate in the match at all, or to subexpressions that do not -exist in the RE (that is, -.Va i -> -.Fa preg Ns -> Ns Va re_nsub ) ) -have both -.Va rm_so -and -.Va rm_eo -set to -1. -If a subexpression participated in the match several times, -the reported substring is the last one it matched. -(Note, as an example in particular, that when the RE -.Ql "(b*)+" -matches -.Ql bbb , -the parenthesized subexpression matches each of the three -.So Li b Sc Ns s -and then -an infinite number of empty strings following the last -.Ql b , -so the reported substring is one of the empties.) -.Pp -If -.Dv REG_STARTEND -is specified, -.Fa pmatch -must point to at least one -.Ft regmatch_t -(even if -.Fa nmatch -is 0 or -.Dv REG_NOSUB -was specified), -to hold the input offsets for -.Dv REG_STARTEND . -Use for output is still entirely controlled by -.Fa nmatch ; -if -.Fa nmatch -is 0 or -.Dv REG_NOSUB -was specified, -the value of -.Fa pmatch Ns [0] -will not be changed by a successful -.Fn regexec . -.Pp -The -.Fn regerror -function -maps a non-zero -.Fa errcode -from either -.Fn regcomp -or -.Fn regexec -to a human-readable, printable message. -If -.Fa preg -is -.No non\- Ns Dv NULL , -the error code should have arisen from use of -the -.Ft regex_t -pointed to by -.Fa preg , -and if the error code came from -.Fn regcomp , -it should have been the result from the most recent -.Fn regcomp -using that -.Ft regex_t . -The -.Fn ( regerror -may be able to supply a more detailed message using information -from the -.Ft regex_t . ) -The -.Fn regerror -function -places the NUL-terminated message into the buffer pointed to by -.Fa errbuf , -limiting the length (including the NUL) to at most -.Fa errbuf_size -bytes. -If the whole message will not fit, -as much of it as will fit before the terminating NUL is supplied. -In any case, -the returned value is the size of buffer needed to hold the whole -message (including terminating NUL). -If -.Fa errbuf_size -is 0, -.Fa errbuf -is ignored but the return value is still correct. -.Pp -If the -.Fa errcode -given to -.Fn regerror -is first ORed with -.Dv REG_ITOA , -the -.Dq message -that results is the printable name of the error code, -e.g.\& -.Dq Dv REG_NOMATCH , -rather than an explanation thereof. -If -.Fa errcode -is -.Dv REG_ATOI , -then -.Fa preg -shall be -.No non\- Ns Dv NULL -and the -.Va re_endp -member of the structure it points to -must point to the printable name of an error code; -in this case, the result in -.Fa errbuf -is the decimal digits of -the numeric value of the error code -(0 if the name is not recognized). -.Dv REG_ITOA -and -.Dv REG_ATOI -are intended primarily as debugging facilities; -they are extensions, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -Be warned also that they are considered experimental and changes are possible. -.Pp -The -.Fn regfree -function -frees any dynamically-allocated storage associated with the compiled RE -pointed to by -.Fa preg . -The remaining -.Ft regex_t -is no longer a valid compiled RE -and the effect of supplying it to -.Fn regexec -or -.Fn regerror -is undefined. -.Pp -None of these functions references global variables except for tables -of constants; -all are safe for use from multiple threads if the arguments are safe. -.Sh IMPLEMENTATION CHOICES -There are a number of decisions that -.St -p1003.2 -leaves up to the implementor, -either by explicitly saying -.Dq undefined -or by virtue of them being -forbidden by the RE grammar. -This implementation treats them as follows. -.Pp -See -.Xr re_format 7 -for a discussion of the definition of case-independent matching. -.Pp -There is no particular limit on the length of REs, -except insofar as memory is limited. -Memory usage is approximately linear in RE size, and largely insensitive -to RE complexity, except for bounded repetitions. -See -.Sx BUGS -for one short RE using them -that will run almost any system out of memory. -.Pp -A backslashed character other than one specifically given a magic meaning -by -.St -p1003.2 -(such magic meanings occur only in obsolete -.Bq Dq basic -REs) -is taken as an ordinary character. -.Pp -Any unmatched -.Ql [\& -is a -.Dv REG_EBRACK -error. -.Pp -Equivalence classes cannot begin or end bracket-expression ranges. -The endpoint of one range cannot begin another. -.Pp -.Dv RE_DUP_MAX , -the limit on repetition counts in bounded repetitions, is 255. -.Pp -A repetition operator -.Ql ( ?\& , -.Ql *\& , -.Ql +\& , -or bounds) -cannot follow another -repetition operator. -A repetition operator cannot begin an expression or subexpression -or follow -.Ql ^\& -or -.Ql |\& . -.Pp -.Ql |\& -cannot appear first or last in a (sub)expression or after another -.Ql |\& , -i.e., an operand of -.Ql |\& -cannot be an empty subexpression. -An empty parenthesized subexpression, -.Ql "()" , -is legal and matches an -empty (sub)string. -An empty string is not a legal RE. -.Pp -A -.Ql {\& -followed by a digit is considered the beginning of bounds for a -bounded repetition, which must then follow the syntax for bounds. -A -.Ql {\& -.Em not -followed by a digit is considered an ordinary character. -.Pp -.Ql ^\& -and -.Ql $\& -beginning and ending subexpressions in obsolete -.Pq Dq basic -REs are anchors, not ordinary characters. -.Sh DIAGNOSTICS -Non-zero error codes from -.Fn regcomp -and -.Fn regexec -include the following: -.Pp -.Bl -tag -width REG_ECOLLATE -compact -.It Dv REG_NOMATCH -The -.Fn regexec -function -failed to match -.It Dv REG_BADPAT -invalid regular expression -.It Dv REG_ECOLLATE -invalid collating element -.It Dv REG_ECTYPE -invalid character class -.It Dv REG_EESCAPE -.Ql \e -applied to unescapable character -.It Dv REG_ESUBREG -invalid backreference number -.It Dv REG_EBRACK -brackets -.Ql "[ ]" -not balanced -.It Dv REG_EPAREN -parentheses -.Ql "( )" -not balanced -.It Dv REG_EBRACE -braces -.Ql "{ }" -not balanced -.It Dv REG_BADBR -invalid repetition count(s) in -.Ql "{ }" -.It Dv REG_ERANGE -invalid character range in -.Ql "[ ]" -.It Dv REG_ESPACE -ran out of memory -.It Dv REG_BADRPT -.Ql ?\& , -.Ql *\& , -or -.Ql +\& -operand invalid -.It Dv REG_EMPTY -empty (sub)expression -.It Dv REG_ASSERT -cannot happen - you found a bug -.It Dv REG_INVARG -invalid argument, e.g.\& negative-length string -.It Dv REG_ILLSEQ -illegal byte sequence (bad multibyte character) -.El -.Sh SEE ALSO -.Xr grep 1 , -.Xr re_format 7 -.Pp -.St -p1003.2 , -sections 2.8 (Regular Expression Notation) -and -B.5 (C Binding for Regular Expression Matching). -.Sh HISTORY -Originally written by -.An Henry Spencer . -Altered for inclusion in the -.Bx 4.4 -distribution. -.Sh BUGS -This is an alpha release with known defects. -Please report problems. -.Pp -The back-reference code is subtle and doubts linger about its correctness -in complex cases. -.Pp -The -.Fn regexec -function -performance is poor. -This will improve with later releases. -The -.Fa nmatch -argument -exceeding 0 is expensive; -.Fa nmatch -exceeding 1 is worse. -The -.Fn regexec -function -is largely insensitive to RE complexity -.Em except -that back -references are massively expensive. -RE length does matter; in particular, there is a strong speed bonus -for keeping RE length under about 30 characters, -with most special characters counting roughly double. -.Pp -The -.Fn regcomp -function -implements bounded repetitions by macro expansion, -which is costly in time and space if counts are large -or bounded repetitions are nested. -An RE like, say, -.Ql "((((a{1,100}){1,100}){1,100}){1,100}){1,100}" -will (eventually) run almost any existing machine out of swap space. -.Pp -There are suspected problems with response to obscure error conditions. -Notably, -certain kinds of internal overflow, -produced only by truly enormous REs or by multiply nested bounded repetitions, -are probably not handled well. -.Pp -Due to a mistake in -.St -p1003.2 , -things like -.Ql "a)b" -are legal REs because -.Ql )\& -is -a special character only in the presence of a previous unmatched -.Ql (\& . -This cannot be fixed until the spec is fixed. -.Pp -The standard's definition of back references is vague. -For example, does -.Ql "a\e(\e(b\e)*\e2\e)*d" -match -.Ql "abbbd" ? -Until the standard is clarified, -behavior in such cases should not be relied on. -.Pp -The implementation of word-boundary matching is a bit of a kludge, -and bugs may lurk in combinations of word-boundary matching and anchoring. -.Pp -Word-boundary matching does not work properly in multibyte locales. diff --git a/winsup/cygwin/regex/regex.7 b/winsup/cygwin/regex/regex.7 deleted file mode 100644 index 79fecc197..000000000 --- a/winsup/cygwin/regex/regex.7 +++ /dev/null @@ -1,480 +0,0 @@ -.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. -.\" Copyright (c) 1992, 1993, 1994 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" This code is derived from software contributed to Berkeley by -.\" Henry Spencer. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)re_format.7 8.3 (Berkeley) 3/20/94 -.\" $FreeBSD: src/lib/libc/regex/re_format.7,v 1.12 2008/09/05 17:41:20 keramida Exp $ -.\" -.Dd March 20, 1994 -.Dt RE_FORMAT 7 -.Os -.Sh NAME -.Nm re_format -.Nd POSIX 1003.2 regular expressions -.Sh DESCRIPTION -Regular expressions -.Pq Dq RE Ns s , -as defined in -.St -p1003.2 , -come in two forms: -modern REs (roughly those of -.Xr egrep 1 ; -1003.2 calls these -.Dq extended -REs) -and obsolete REs (roughly those of -.Xr ed 1 ; -1003.2 -.Dq basic -REs). -Obsolete REs mostly exist for backward compatibility in some old programs; -they will be discussed at the end. -.St -p1003.2 -leaves some aspects of RE syntax and semantics open; -`\(dd' marks decisions on these aspects that -may not be fully portable to other -.St -p1003.2 -implementations. -.Pp -A (modern) RE is one\(dd or more non-empty\(dd -.Em branches , -separated by -.Ql \&| . -It matches anything that matches one of the branches. -.Pp -A branch is one\(dd or more -.Em pieces , -concatenated. -It matches a match for the first, followed by a match for the second, etc. -.Pp -A piece is an -.Em atom -possibly followed -by a single\(dd -.Ql \&* , -.Ql \&+ , -.Ql \&? , -or -.Em bound . -An atom followed by -.Ql \&* -matches a sequence of 0 or more matches of the atom. -An atom followed by -.Ql \&+ -matches a sequence of 1 or more matches of the atom. -An atom followed by -.Ql ?\& -matches a sequence of 0 or 1 matches of the atom. -.Pp -A -.Em bound -is -.Ql \&{ -followed by an unsigned decimal integer, -possibly followed by -.Ql \&, -possibly followed by another unsigned decimal integer, -always followed by -.Ql \&} . -The integers must lie between 0 and -.Dv RE_DUP_MAX -(255\(dd) inclusive, -and if there are two of them, the first may not exceed the second. -An atom followed by a bound containing one integer -.Em i -and no comma matches -a sequence of exactly -.Em i -matches of the atom. -An atom followed by a bound -containing one integer -.Em i -and a comma matches -a sequence of -.Em i -or more matches of the atom. -An atom followed by a bound -containing two integers -.Em i -and -.Em j -matches -a sequence of -.Em i -through -.Em j -(inclusive) matches of the atom. -.Pp -An atom is a regular expression enclosed in -.Ql () -(matching a match for the -regular expression), -an empty set of -.Ql () -(matching the null string)\(dd, -a -.Em bracket expression -(see below), -.Ql .\& -(matching any single character), -.Ql \&^ -(matching the null string at the beginning of a line), -.Ql \&$ -(matching the null string at the end of a line), a -.Ql \e -followed by one of the characters -.Ql ^.[$()|*+?{\e -(matching that character taken as an ordinary character), -a -.Ql \e -followed by any other character\(dd -(matching that character taken as an ordinary character, -as if the -.Ql \e -had not been present\(dd), -or a single character with no other significance (matching that character). -A -.Ql \&{ -followed by a character other than a digit is an ordinary -character, not the beginning of a bound\(dd. -It is illegal to end an RE with -.Ql \e . -.Pp -A -.Em bracket expression -is a list of characters enclosed in -.Ql [] . -It normally matches any single character from the list (but see below). -If the list begins with -.Ql \&^ , -it matches any single character -(but see below) -.Em not -from the rest of the list. -If two characters in the list are separated by -.Ql \&- , -this is shorthand -for the full -.Em range -of characters between those two (inclusive) in the -collating sequence, -.No e.g. Ql [0-9] -in ASCII matches any decimal digit. -It is illegal\(dd for two ranges to share an -endpoint, -.No e.g. Ql a-c-e . -Ranges are very collating-sequence-dependent, -and portable programs should avoid relying on them. -.Pp -To include a literal -.Ql \&] -in the list, make it the first character -(following a possible -.Ql \&^ ) . -To include a literal -.Ql \&- , -make it the first or last character, -or the second endpoint of a range. -To use a literal -.Ql \&- -as the first endpoint of a range, -enclose it in -.Ql [.\& -and -.Ql .]\& -to make it a collating element (see below). -With the exception of these and some combinations using -.Ql \&[ -(see next paragraphs), all other special characters, including -.Ql \e , -lose their special significance within a bracket expression. -.Pp -Within a bracket expression, a collating element (a character, -a multi-character sequence that collates as if it were a single character, -or a collating-sequence name for either) -enclosed in -.Ql [.\& -and -.Ql .]\& -stands for the -sequence of characters of that collating element. -The sequence is a single element of the bracket expression's list. -A bracket expression containing a multi-character collating element -can thus match more than one character, -e.g.\& if the collating sequence includes a -.Ql ch -collating element, -then the RE -.Ql [[.ch.]]*c -matches the first five characters -of -.Ql chchcc . -.Pp -Within a bracket expression, a collating element enclosed in -.Ql [= -and -.Ql =] -is an equivalence class, standing for the sequences of characters -of all collating elements equivalent to that one, including itself. -(If there are no other equivalent collating elements, -the treatment is as if the enclosing delimiters were -.Ql [.\& -and -.Ql .] . ) -For example, if -.Ql x -and -.Ql y -are the members of an equivalence class, -then -.Ql [[=x=]] , -.Ql [[=y=]] , -and -.Ql [xy] -are all synonymous. -An equivalence class may not\(dd be an endpoint -of a range. -.Pp -Within a bracket expression, the name of a -.Em character class -enclosed in -.Ql [: -and -.Ql :] -stands for the list of all characters belonging to that -class. -Standard character class names are: -.Pp -.Bl -column "alnum" "digit" "xdigit" -offset indent -.It Em "alnum digit punct" -.It Em "alpha graph space" -.It Em "blank lower upper" -.It Em "cntrl print xdigit" -.El -.Pp -These stand for the character classes defined in -.Xr ctype 3 . -A locale may provide others. -A character class may not be used as an endpoint of a range. -.Pp -A bracketed expression like -.Ql [[:class:]] -can be used to match a single character that belongs to a character -class. -The reverse, matching any character that does not belong to a specific -class, the negation operator of bracket expressions may be used: -.Ql [^[:class:]] . -.Pp -There are two special cases\(dd of bracket expressions: -the bracket expressions -.Ql [[:<:]] -and -.Ql [[:>:]] -match the null string at the beginning and end of a word respectively. -A word is defined as a sequence of word characters -which is neither preceded nor followed by -word characters. -A word character is an -.Em alnum -character (as defined by -.Xr ctype 3 ) -or an underscore. -This is an extension, -compatible with but not specified by -.St -p1003.2 , -and should be used with -caution in software intended to be portable to other systems. -.Pp -In the event that an RE could match more than one substring of a given -string, -the RE matches the one starting earliest in the string. -If the RE could match more than one substring starting at that point, -it matches the longest. -Subexpressions also match the longest possible substrings, subject to -the constraint that the whole match be as long as possible, -with subexpressions starting earlier in the RE taking priority over -ones starting later. -Note that higher-level subexpressions thus take priority over -their lower-level component subexpressions. -.Pp -Match lengths are measured in characters, not collating elements. -A null string is considered longer than no match at all. -For example, -.Ql bb* -matches the three middle characters of -.Ql abbbc , -.Ql (wee|week)(knights|nights) -matches all ten characters of -.Ql weeknights , -when -.Ql (.*).*\& -is matched against -.Ql abc -the parenthesized subexpression -matches all three characters, and -when -.Ql (a*)* -is matched against -.Ql bc -both the whole RE and the parenthesized -subexpression match the null string. -.Pp -If case-independent matching is specified, -the effect is much as if all case distinctions had vanished from the -alphabet. -When an alphabetic that exists in multiple cases appears as an -ordinary character outside a bracket expression, it is effectively -transformed into a bracket expression containing both cases, -.No e.g. Ql x -becomes -.Ql [xX] . -When it appears inside a bracket expression, all case counterparts -of it are added to the bracket expression, so that (e.g.) -.Ql [x] -becomes -.Ql [xX] -and -.Ql [^x] -becomes -.Ql [^xX] . -.Pp -No particular limit is imposed on the length of REs\(dd. -Programs intended to be portable should not employ REs longer -than 256 bytes, -as an implementation can refuse to accept such REs and remain -POSIX-compliant. -.Pp -Obsolete -.Pq Dq basic -regular expressions differ in several respects. -.Ql \&| -is an ordinary character and there is no equivalent -for its functionality. -.Ql \&+ -and -.Ql ?\& -are ordinary characters, and their functionality -can be expressed using bounds -.No ( Ql {1,} -or -.Ql {0,1} -respectively). -Also note that -.Ql x+ -in modern REs is equivalent to -.Ql xx* . -The delimiters for bounds are -.Ql \e{ -and -.Ql \e} , -with -.Ql \&{ -and -.Ql \&} -by themselves ordinary characters. -The parentheses for nested subexpressions are -.Ql \e( -and -.Ql \e) , -with -.Ql \&( -and -.Ql \&) -by themselves ordinary characters. -.Ql \&^ -is an ordinary character except at the beginning of the -RE or\(dd the beginning of a parenthesized subexpression, -.Ql \&$ -is an ordinary character except at the end of the -RE or\(dd the end of a parenthesized subexpression, -and -.Ql \&* -is an ordinary character if it appears at the beginning of the -RE or the beginning of a parenthesized subexpression -(after a possible leading -.Ql \&^ ) . -Finally, there is one new type of atom, a -.Em back reference : -.Ql \e -followed by a non-zero decimal digit -.Em d -matches the same sequence of characters -matched by the -.Em d Ns th -parenthesized subexpression -(numbering subexpressions by the positions of their opening parentheses, -left to right), -so that (e.g.) -.Ql \e([bc]\e)\e1 -matches -.Ql bb -or -.Ql cc -but not -.Ql bc . -.Sh SEE ALSO -.Xr regex 3 -.Rs -.%T Regular Expression Notation -.%R IEEE Std -.%N 1003.2 -.%P section 2.8 -.Re -.Sh BUGS -Having two kinds of REs is a botch. -.Pp -The current -.St -p1003.2 -spec says that -.Ql \&) -is an ordinary character in -the absence of an unmatched -.Ql \&( ; -this was an unintentional result of a wording error, -and change is likely. -Avoid relying on it. -.Pp -Back references are a dreadful botch, -posing major problems for efficient implementations. -They are also somewhat vaguely defined -(does -.Ql a\e(\e(b\e)*\e2\e)*d -match -.Ql abbbd ? ) . -Avoid using them. -.Pp -.St -p1003.2 -specification of case-independent matching is vague. -The -.Dq one case implies all cases -definition given above -is current consensus among implementors as to the right interpretation. -.Pp -The syntax for word boundaries is incredibly ugly. diff --git a/winsup/cygwin/regex/regex2.h b/winsup/cygwin/regex/regex2.h deleted file mode 100644 index 53f687bf6..000000000 --- a/winsup/cygwin/regex/regex2.h +++ /dev/null @@ -1,196 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regex2.h 8.4 (Berkeley) 3/20/94 - * $FreeBSD: src/lib/libc/regex/regex2.h,v 1.11 2007/01/09 00:28:04 imp Exp $ - */ - -/* - * First, the stuff that ends up in the outside-world include file - = typedef off_t regoff_t; - = typedef struct { - = int re_magic; - = size_t re_nsub; // number of parenthesized subexpressions - = const char *re_endp; // end pointer for REG_PEND - = struct re_guts *re_g; // none of your business :-) - = } regex_t; - = typedef struct { - = regoff_t rm_so; // start of match - = regoff_t rm_eo; // end of match - = } regmatch_t; - */ -/* - * internals of regex_t - */ -#define MAGIC1 ((('r'^0200)<<8) | 'e') - -/* - * The internal representation is a *strip*, a sequence of - * operators ending with an endmarker. (Some terminology etc. is a - * historical relic of earlier versions which used multiple strips.) - * Certain oddities in the representation are there to permit running - * the machinery backwards; in particular, any deviation from sequential - * flow must be marked at both its source and its destination. Some - * fine points: - * - * - OPLUS_ and O_PLUS are *inside* the loop they create. - * - OQUEST_ and O_QUEST are *outside* the bypass they create. - * - OCH_ and O_CH are *outside* the multi-way branch they create, while - * OOR1 and OOR2 are respectively the end and the beginning of one of - * the branches. Note that there is an implicit OOR2 following OCH_ - * and an implicit OOR1 preceding O_CH. - * - * In state representations, an operator's bit is on to signify a state - * immediately *preceding* "execution" of that operator. - */ -typedef unsigned long sop; /* strip operator */ -typedef long sopno; -#define OPRMASK 0xf8000000L -#define OPDMASK 0x07ffffffL -#define OPSHIFT ((unsigned)27) -#define OP(n) ((n)&OPRMASK) -#define OPND(n) ((n)&OPDMASK) -#define SOP(op, opnd) ((op)|(opnd)) -/* operators meaning operand */ -/* (back, fwd are offsets) */ -#define OEND (1L<<OPSHIFT) /* endmarker - */ -#define OCHAR (2L<<OPSHIFT) /* character wide character */ -#define OBOL (3L<<OPSHIFT) /* left anchor - */ -#define OEOL (4L<<OPSHIFT) /* right anchor - */ -#define OANY (5L<<OPSHIFT) /* . - */ -#define OANYOF (6L<<OPSHIFT) /* [...] set number */ -#define OBACK_ (7L<<OPSHIFT) /* begin \d paren number */ -#define O_BACK (8L<<OPSHIFT) /* end \d paren number */ -#define OPLUS_ (9L<<OPSHIFT) /* + prefix fwd to suffix */ -#define O_PLUS (10L<<OPSHIFT) /* + suffix back to prefix */ -#define OQUEST_ (11L<<OPSHIFT) /* ? prefix fwd to suffix */ -#define O_QUEST (12L<<OPSHIFT) /* ? suffix back to prefix */ -#define OLPAREN (13L<<OPSHIFT) /* ( fwd to ) */ -#define ORPAREN (14L<<OPSHIFT) /* ) back to ( */ -#define OCH_ (15L<<OPSHIFT) /* begin choice fwd to OOR2 */ -#define OOR1 (16L<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */ -#define OOR2 (17L<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */ -#define O_CH (18L<<OPSHIFT) /* end choice back to OOR1 */ -#define OBOW (19L<<OPSHIFT) /* begin word - */ -#define OEOW (20L<<OPSHIFT) /* end word - */ - -/* - * Structures for [] character-set representation. - */ -typedef struct { - wint_t min; - wint_t max; -} crange; -typedef struct { - unsigned char bmp[NC / 8]; - wctype_t *types; - int ntypes; - wint_t *wides; - int nwides; - crange *ranges; - int nranges; - int invert; - int icase; -} cset; - -static int -CHIN1(cset *cs, wint_t ch) -{ - int i; - - assert(ch >= 0); - if (ch < NC) - return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ - cs->invert); - for (i = 0; i < cs->nwides; i++) - if (ch == cs->wides[i]) - return (!cs->invert); - for (i = 0; i < cs->nranges; i++) - if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max) - return (!cs->invert); - for (i = 0; i < cs->ntypes; i++) - if (iswctype(ch, cs->types[i])) - return (!cs->invert); - return (cs->invert); -} - -static __inline int -CHIN(cset *cs, wint_t ch) -{ - - assert(ch >= 0); - if (ch < NC) - return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ - cs->invert); - else if (cs->icase) { - if (cs->invert) - return (CHIN1(cs, ch) && CHIN1(cs, towlower(ch)) && - CHIN1(cs, towupper(ch))); - else - return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || - CHIN1(cs, towupper(ch))); - } else - return (CHIN1(cs, ch)); -} - -/* - * main compiled-expression structure - */ -struct re_guts { - int magic; -# define MAGIC2 ((('R'^0200)<<8)|'E') - sop *strip; /* malloced area for strip */ - int ncsets; /* number of csets in use */ - cset *sets; /* -> cset [ncsets] */ - int cflags; /* copy of regcomp() cflags argument */ - sopno nstates; /* = number of sops */ - sopno firststate; /* the initial OEND (normally 0) */ - sopno laststate; /* the final OEND */ - int iflags; /* internal flags */ -# define USEBOL 01 /* used ^ */ -# define USEEOL 02 /* used $ */ -# define BAD 04 /* something wrong */ - int nbol; /* number of ^ used */ - int neol; /* number of $ used */ - char *must; /* match must contain this string */ - int moffset; /* latest point at which must may be located */ - int *charjump; /* Boyer-Moore char jump table */ - int *matchjump; /* Boyer-Moore match jump table */ - int mlen; /* length of must */ - size_t nsub; /* copy of re_nsub */ - int backrefs; /* does it use back references? */ - sopno nplus; /* how deep does it nest +s? */ -}; - -/* misc utilities */ -#define OUT (CHAR_MIN - 1) /* a non-character value */ -#define ISWORD(c) (iswalnum((wint_t)(c)) || (c) == '_') diff --git a/winsup/cygwin/regex/regexec.c b/winsup/cygwin/regex/regexec.c deleted file mode 100644 index ad12ada41..000000000 --- a/winsup/cygwin/regex/regexec.c +++ /dev/null @@ -1,256 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regexec.c 8.3 (Berkeley) 3/20/94 - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)regexec.c 8.3 (Berkeley) 3/20/94"; -#endif /* LIBC_SCCS and not lint */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/lib/libc/regex/regexec.c,v 1.8 2007/06/11 03:05:54 delphij Exp $"); - -/* - * the outer shell of regexec() - * - * This file includes engine.c three times, after muchos fiddling with the - * macros that code uses. This lets the same code operate on two different - * representations for state sets and characters. - */ -#ifdef __CYGWIN__ -#include "winsup.h" -#endif -#include <sys/types.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <limits.h> -#include <ctype.h> -#include <regex.h> -#include <wchar.h> -#include <wctype.h> - -#include "utils.h" -#include "regex2.h" - -#ifdef __CYGWIN__ -#define __unused __attribute__ ((unused)) -#endif - -static int nope __unused = 0; /* for use in asserts; shuts lint up */ - -static __inline size_t -xmbrtowc(wint_t *wi, const char *s, size_t n, mbstate_t *mbs, wint_t dummy) -{ - size_t nr; - wchar_t wc; - - nr = mbrtowc(&wc, s, n, mbs); - if (wi != NULL) - *wi = wc; - if (nr == 0) - return (1); - else if (nr == (size_t)-1 || nr == (size_t)-2) { - memset(mbs, 0, sizeof(*mbs)); - if (wi != NULL) - *wi = dummy; - return (1); - } else { - if (sizeof (wchar_t) == 2 && wc >= 0xd800 && wc <= 0xdbff) { - /* UTF-16 surrogate pair. Fetch second half and - compute UTF-32 value */ - size_t n2 = mbrtowc(&wc, s + nr, n - nr, mbs); - if (n2 == 0 || n2 == (size_t)-1 || n2 == (size_t)-2) { - memset(mbs, 0, sizeof(*mbs)); - if (wi != NULL) - *wi = dummy; - return (1); - } - if (wi != NULL) - *wi = (((*wi & 0x3ff) << 10) | (wc & 0x3ff)) - + 0x10000; - nr += n2; - } - return (nr); - } -} - -static __inline size_t -xmbrtowc_dummy(wint_t *wi, - const char *s, - size_t n __unused, - mbstate_t *mbs __unused, - wint_t dummy __unused) -{ - - if (wi != NULL) - *wi = (unsigned char)*s; - return (1); -} - -/* macros for manipulating states, small version */ -#define states long -#define states1 states /* for later use in regexec() decision */ -#define CLEAR(v) ((v) = 0) -#define SET0(v, n) ((v) &= ~((unsigned long)1 << (n))) -#define SET1(v, n) ((v) |= (unsigned long)1 << (n)) -#define ISSET(v, n) (((v) & ((unsigned long)1 << (n))) != 0) -#define ASSIGN(d, s) ((d) = (s)) -#define EQ(a, b) ((a) == (b)) -#define STATEVARS long dummy /* dummy version */ -#define STATESETUP(m, n) /* nothing */ -#define STATETEARDOWN(m) /* nothing */ -#define SETUP(v) ((v) = 0) -#define onestate long -#define INIT(o, n) ((o) = (unsigned long)1 << (n)) -#define INC(o) ((o) <<= 1) -#define ISSTATEIN(v, o) (((v) & (o)) != 0) -/* some abbreviations; note that some of these know variable names! */ -/* do "if I'm here, I can also be there" etc without branches */ -#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n)) -#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n)) -#define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0) -/* no multibyte support */ -#define XMBRTOWC xmbrtowc_dummy -#define ZAPSTATE(mbs) ((void)(mbs)) -/* function names */ -#define SNAMES /* engine.c looks after details */ - -#include "engine.c" - -/* now undo things */ -#undef states -#undef CLEAR -#undef SET0 -#undef SET1 -#undef ISSET -#undef ASSIGN -#undef EQ -#undef STATEVARS -#undef STATESETUP -#undef STATETEARDOWN -#undef SETUP -#undef onestate -#undef INIT -#undef INC -#undef ISSTATEIN -#undef FWD -#undef BACK -#undef ISSETBACK -#undef SNAMES -#undef XMBRTOWC -#undef ZAPSTATE - -/* macros for manipulating states, large version */ -#define states char * -#define CLEAR(v) memset(v, 0, m->g->nstates) -#define SET0(v, n) ((v)[n] = 0) -#define SET1(v, n) ((v)[n] = 1) -#define ISSET(v, n) ((v)[n]) -#define ASSIGN(d, s) memcpy(d, s, m->g->nstates) -#define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0) -#define STATEVARS long vn; char *space -#define STATESETUP(m, nv) { (m)->space = malloc((nv)*(m)->g->nstates); \ - if ((m)->space == NULL) return(REG_ESPACE); \ - (m)->vn = 0; } -#define STATETEARDOWN(m) { free((m)->space); } -#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates]) -#define onestate long -#define INIT(o, n) ((o) = (n)) -#define INC(o) ((o)++) -#define ISSTATEIN(v, o) ((v)[o]) -/* some abbreviations; note that some of these know variable names! */ -/* do "if I'm here, I can also be there" etc without branches */ -#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here]) -#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here]) -#define ISSETBACK(v, n) ((v)[here - (n)]) -/* no multibyte support */ -#define XMBRTOWC xmbrtowc_dummy -#define ZAPSTATE(mbs) ((void)(mbs)) -/* function names */ -#define LNAMES /* flag */ - -#include "engine.c" - -/* multibyte character & large states version */ -#undef LNAMES -#undef XMBRTOWC -#undef ZAPSTATE -#define XMBRTOWC xmbrtowc -#define ZAPSTATE(mbs) memset((mbs), 0, sizeof(*(mbs))) -#define MNAMES - -#include "engine.c" - -/* - - regexec - interface for matching - = extern int regexec(const regex_t *, const char *, size_t, \ - = regmatch_t [], int); - = #define REG_NOTBOL 00001 - = #define REG_NOTEOL 00002 - = #define REG_STARTEND 00004 - = #define REG_TRACE 00400 // tracing of execution - = #define REG_LARGE 01000 // force large representation - = #define REG_BACKR 02000 // force use of backref code - * - * We put this here so we can exploit knowledge of the state representation - * when choosing which matcher to call. Also, by this point the matchers - * have been prototyped. - */ -int /* 0 success, REG_NOMATCH failure */ -regexec(const regex_t * __restrict preg, - const char * __restrict string, - size_t nmatch, - regmatch_t pmatch[__restrict], - int eflags) -{ - struct re_guts *g = preg->re_g; -#ifdef REDEBUG -# define GOODFLAGS(f) (f) -#else -# define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND)) -#endif - - if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) - return(REG_BADPAT); - assert(!(g->iflags&BAD)); - if (g->iflags&BAD) /* backstop for no-debug case */ - return(REG_BADPAT); - eflags = GOODFLAGS(eflags); - - if (MB_CUR_MAX > 1) - return(mmatcher(g, (char *)string, nmatch, pmatch, eflags)); - else if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags®_LARGE)) - return(smatcher(g, (char *)string, nmatch, pmatch, eflags)); - else - return(lmatcher(g, (char *)string, nmatch, pmatch, eflags)); -} diff --git a/winsup/cygwin/regex/regfree.c b/winsup/cygwin/regex/regfree.c deleted file mode 100644 index aa795fa78..000000000 --- a/winsup/cygwin/regex/regfree.c +++ /dev/null @@ -1,89 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regfree.c 8.3 (Berkeley) 3/20/94 - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)regfree.c 8.3 (Berkeley) 3/20/94"; -#endif /* LIBC_SCCS and not lint */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/lib/libc/regex/regfree.c,v 1.8 2007/06/11 03:05:54 delphij Exp $"); - -#include <sys/types.h> -#include <stdio.h> -#include <stdlib.h> -#include <limits.h> -#include <regex.h> -#include <wchar.h> -#include <wctype.h> - -#include "utils.h" -#include "regex2.h" - -/* - - regfree - free everything - = extern void regfree(regex_t *); - */ -void -regfree(regex_t *preg) -{ - struct re_guts *g; - int i; - - if (preg->re_magic != MAGIC1) /* oops */ - return; /* nice to complain, but hard */ - - g = preg->re_g; - if (g == NULL || g->magic != MAGIC2) /* oops again */ - return; - preg->re_magic = 0; /* mark it invalid */ - g->magic = 0; /* mark it invalid */ - - if (g->strip != NULL) - free((char *)g->strip); - if (g->sets != NULL) { - for (i = 0; i < g->ncsets; i++) { - free(g->sets[i].ranges); - free(g->sets[i].wides); - free(g->sets[i].types); - } - free((char *)g->sets); - } - if (g->must != NULL) - free(g->must); - if (g->charjump != NULL) - free(&g->charjump[CHAR_MIN]); - if (g->matchjump != NULL) - free(g->matchjump); - free((char *)g); -} diff --git a/winsup/cygwin/regex/utils.h b/winsup/cygwin/regex/utils.h deleted file mode 100644 index 2a2ed9694..000000000 --- a/winsup/cygwin/regex/utils.h +++ /dev/null @@ -1,54 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)utils.h 8.3 (Berkeley) 3/20/94 - * $FreeBSD: src/lib/libc/regex/utils.h,v 1.3 2007/01/09 00:28:04 imp Exp $ - */ - -/* utility definitions */ -#define DUPMAX _POSIX2_RE_DUP_MAX /* xxx is this right? */ -#define INFINITY (DUPMAX + 1) -#define NC (CHAR_MAX - CHAR_MIN + 1) -typedef unsigned char uch; - -/* switch off assertions (if not already off) if no REDEBUG */ -#ifndef REDEBUG -#ifndef NDEBUG -#define NDEBUG /* no assertions please */ -#endif -#endif -#include <assert.h> - -/* for old systems with bcopy() but no memmove() */ -#ifdef USEBCOPY -#define memmove(d, s, c) bcopy(s, d, c) -#endif |