diff options
Diffstat (limited to 'winsup/cygwin/regex/regcomp.c')
-rw-r--r-- | winsup/cygwin/regex/regcomp.c | 1885 |
1 files changed, 0 insertions, 1885 deletions
diff --git a/winsup/cygwin/regex/regcomp.c b/winsup/cygwin/regex/regcomp.c deleted file mode 100644 index 5aee63948..000000000 --- a/winsup/cygwin/regex/regcomp.c +++ /dev/null @@ -1,1885 +0,0 @@ -/*- - * Copyright (c) 1992, 1993, 1994 Henry Spencer. - * Copyright (c) 1992, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Henry Spencer. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; -#endif /* LIBC_SCCS and not lint */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.36 2007/06/11 03:05:54 delphij Exp $"); - -#ifdef __CYGWIN__ -#include "winsup.h" -#endif -#include <sys/types.h> -#include <stdio.h> -#include <string.h> -#include <ctype.h> -#include <limits.h> -#include <stdlib.h> -#include <regex.h> -#ifndef __CYGWIN__ -#include <runetype.h> -#endif -#include <wchar.h> -#include <wctype.h> - -#ifndef __CYGWIN__ -#include "collate.h" -#endif - -#include "utils.h" -#include "regex2.h" - -#include "cname.h" - -#ifdef __CYGWIN__ -/* Don't pull in windows headers just for LCID. */ -typedef unsigned long LCID; -/* These are defined in nlsfuncs.cc. */ -extern LCID collate_lcid; -extern char collate_charset[]; -#endif - -/* - * parse structure, passed up and down to avoid global variables and - * other clumsinesses - */ -struct parse { - char *next; /* next character in RE */ - char *end; /* end of string (-> NUL normally) */ - int error; /* has an error been seen? */ - sop *strip; /* malloced strip */ - sopno ssize; /* malloced strip size (allocated) */ - sopno slen; /* malloced strip length (used) */ - int ncsalloc; /* number of csets allocated */ - struct re_guts *g; -# define NPAREN 10 /* we need to remember () 1-9 for back refs */ - sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ - sopno pend[NPAREN]; /* -> ) ([0] unused) */ -}; - -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === regcomp.c === */ -#ifdef __CYGWIN__ /* Defined below `int stop'. Our gcc chokes on that. */ -static void p_ere(struct parse *p, int stop); -#else -static void p_ere(struct parse *p, wint_t stop); -#endif -static void p_ere_exp(struct parse *p); -static void p_str(struct parse *p); -#ifdef __CYGWIN__ /* Defined below `int end1/end2'. Our gcc chokes on that. */ -static void p_bre(struct parse *p, int end1, int end2); -#else -static void p_bre(struct parse *p, wint_t end1, wint_t end2); -#endif -static int p_simp_re(struct parse *p, int starordinary); -static int p_count(struct parse *p); -static void p_bracket(struct parse *p); -static void p_b_term(struct parse *p, cset *cs); -static void p_b_cclass(struct parse *p, cset *cs); -static void p_b_eclass(struct parse *p, cset *cs); -static wint_t p_b_symbol(struct parse *p); -static wint_t p_b_coll_elem(struct parse *p, wint_t endc); -static wint_t othercase(wint_t ch); -static void bothcases(struct parse *p, wint_t ch); -static void ordinary(struct parse *p, wint_t ch); -static void nonnewline(struct parse *p); -static void repeat(struct parse *p, sopno start, int from, int to); -static int seterr(struct parse *p, int e); -static cset *allocset(struct parse *p); -static void freeset(struct parse *p, cset *cs); -static void CHadd(struct parse *p, cset *cs, wint_t ch); -static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); -static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); -static wint_t singleton(cset *cs); -static sopno dupl(struct parse *p, sopno start, sopno finish); -static void doemit(struct parse *p, sop op, size_t opnd); -static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); -static void dofwd(struct parse *p, sopno pos, sop value); -static void enlarge(struct parse *p, sopno size); -static void stripsnug(struct parse *p, struct re_guts *g); -static void findmust(struct parse *p, struct re_guts *g); -static int altoffset(sop *scan, int offset); -static void computejumps(struct parse *p, struct re_guts *g); -static void computematchjumps(struct parse *p, struct re_guts *g); -static sopno pluscount(struct parse *p, struct re_guts *g); -static wint_t wgetnext(struct parse *p); -static size_t xwcrtomb (char *s, wint_t wc, mbstate_t *ps); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ - -static char nuls[10]; /* place to point scanner in event of error */ - -/* - * macros for use with parse structure - * BEWARE: these know that the parse structure is named `p' !!! - */ -#define PEEK() (*p->next) -#define PEEK2() (*(p->next+1)) -#define MORE() (p->next < p->end) -#define MORE2() (p->next+1 < p->end) -#define SEE(c) (MORE() && PEEK() == (c)) -#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) -#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) -#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) -#define NEXT() (p->next++) -#define NEXT2() (p->next += 2) -#define NEXTn(n) (p->next += (n)) -#define GETNEXT() (*p->next++) -#define WGETNEXT() wgetnext(p) -#define SETERROR(e) seterr(p, (e)) -#define REQUIRE(co, e) ((co) || SETERROR(e)) -#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) -#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) -#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) -#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) -#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) -#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) -#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) -#define HERE() (p->slen) -#define THERE() (p->slen - 1) -#define THERETHERE() (p->slen - 2) -#define DROP(n) (p->slen -= (n)) - -#ifndef NDEBUG -static int never = 0; /* for use in asserts; shuts lint up */ -#else -#define never 0 /* some <assert.h>s have bugs too */ -#endif - -/* Macro used by computejump()/computematchjump() */ -#define MIN(a,b) ((a)<(b)?(a):(b)) - -/* - - regcomp - interface for parser and compilation - = extern int regcomp(regex_t *, const char *, int); - = #define REG_BASIC 0000 - = #define REG_EXTENDED 0001 - = #define REG_ICASE 0002 - = #define REG_NOSUB 0004 - = #define REG_NEWLINE 0010 - = #define REG_NOSPEC 0020 - = #define REG_PEND 0040 - = #define REG_DUMP 0200 - */ -int /* 0 success, otherwise REG_something */ -regcomp(regex_t * __restrict preg, - const char * __restrict pattern, - int cflags) -{ - struct parse pa; - struct re_guts *g; - struct parse *p = &pa; - int i; - size_t len; -#ifdef REDEBUG -# define GOODFLAGS(f) (f) -#else -# define GOODFLAGS(f) ((f)&~REG_DUMP) -#endif - - cflags = GOODFLAGS(cflags); - if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) - return(REG_INVARG); - - if (cflags®_PEND) { - if (preg->re_endp < pattern) - return(REG_INVARG); - len = preg->re_endp - pattern; - } else - len = strlen((char *)pattern); - - /* do the mallocs early so failure handling is easy */ - g = (struct re_guts *)malloc(sizeof(struct re_guts)); - if (g == NULL) - return(REG_ESPACE); - p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ - p->strip = (sop *)malloc(p->ssize * sizeof(sop)); - p->slen = 0; - if (p->strip == NULL) { - free((char *)g); - return(REG_ESPACE); - } - - /* set things up */ - p->g = g; - p->next = (char *)pattern; /* convenience; we do not modify it */ - p->end = p->next + len; - p->error = 0; - p->ncsalloc = 0; - for (i = 0; i < NPAREN; i++) { - p->pbegin[i] = 0; - p->pend[i] = 0; - } - g->sets = NULL; - g->ncsets = 0; - g->cflags = cflags; - g->iflags = 0; - g->nbol = 0; - g->neol = 0; - g->must = NULL; - g->moffset = -1; - g->charjump = NULL; - g->matchjump = NULL; - g->mlen = 0; - g->nsub = 0; - g->backrefs = 0; - - /* do it */ - EMIT(OEND, 0); - g->firststate = THERE(); - if (cflags®_EXTENDED) - p_ere(p, OUT); - else if (cflags®_NOSPEC) - p_str(p); - else - p_bre(p, OUT, OUT); - EMIT(OEND, 0); - g->laststate = THERE(); - - /* tidy up loose ends and fill things in */ - stripsnug(p, g); - findmust(p, g); - /* only use Boyer-Moore algorithm if the pattern is bigger - * than three characters - */ - if(g->mlen > 3) { - computejumps(p, g); - computematchjumps(p, g); - if(g->matchjump == NULL && g->charjump != NULL) { - free(g->charjump); - g->charjump = NULL; - } - } - g->nplus = pluscount(p, g); - g->magic = MAGIC2; - preg->re_nsub = g->nsub; - preg->re_g = g; - preg->re_magic = MAGIC1; -#ifndef REDEBUG - /* not debugging, so can't rely on the assert() in regexec() */ - if (g->iflags&BAD) - SETERROR(REG_ASSERT); -#endif - - /* win or lose, we're done */ - if (p->error != 0) /* lose */ - regfree(preg); - return(p->error); -} - -/* - - p_ere - ERE parser top level, concatenation and alternation - == static void p_ere(struct parse *p, int stop); - */ -static void -p_ere(struct parse *p, - int stop) /* character this ERE should end at */ -{ - char c; - sopno prevback = 0; - sopno prevfwd = 0; - sopno conc; - int first = 1; /* is this the first alternative? */ - - for (;;) { - /* do a bunch of concatenated expressions */ - conc = HERE(); - while (MORE() && (c = PEEK()) != '|' && c != stop) - p_ere_exp(p); - (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ - - if (!EAT('|')) - break; /* NOTE BREAK OUT */ - - if (first) { - INSERT(OCH_, conc); /* offset is wrong */ - prevfwd = conc; - prevback = conc; - first = 0; - } - ASTERN(OOR1, prevback); - prevback = THERE(); - AHEAD(prevfwd); /* fix previous offset */ - prevfwd = HERE(); - EMIT(OOR2, 0); /* offset is very wrong */ - } - - if (!first) { /* tail-end fixups */ - AHEAD(prevfwd); - ASTERN(O_CH, prevback); - } - - assert(!MORE() || SEE(stop)); -} - -/* - - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op - == static void p_ere_exp(struct parse *p); - */ -static void -p_ere_exp(struct parse *p) -{ - char c; - wint_t wc; - sopno pos; - int count; - int count2; - sopno subno; - int wascaret = 0; - - assert(MORE()); /* caller should have ensured this */ - c = GETNEXT(); - - pos = HERE(); - switch (c) { - case '(': - (void)REQUIRE(MORE(), REG_EPAREN); - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - if (!SEE(')')) - p_ere(p, ')'); - if (subno < NPAREN) { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); - } - EMIT(ORPAREN, subno); - (void)MUSTEAT(')', REG_EPAREN); - break; -#ifndef POSIX_MISTAKE - case ')': /* happens only if no current unmatched ( */ - /* - * You may ask, why the ifndef? Because I didn't notice - * this until slightly too late for 1003.2, and none of the - * other 1003.2 regular-expression reviewers noticed it at - * all. So an unmatched ) is legal POSIX, at least until - * we can get it fixed. - */ - SETERROR(REG_EPAREN); - break; -#endif - case '^': - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - wascaret = 1; - break; - case '$': - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - break; - case '|': - SETERROR(REG_EMPTY); - break; - case '*': - case '+': - case '?': - SETERROR(REG_BADRPT); - break; - case '.': - if (p->g->cflags®_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; - case '\\': - (void)REQUIRE(MORE(), REG_EESCAPE); - wc = WGETNEXT(); -#ifdef __CYGWIN__ - /* \< and \> are the GNU equivalents to [[:<:]] and [[:>:]] */ - switch (wc) - { - case L'<': - EMIT(OBOW, 0); - break; - case L'>': - EMIT(OEOW, 0); - break; - default: - ordinary(p, wc); - break; - } -#else - ordinary(p, wc); -#endif - break; - case '{': /* okay as ordinary except if digit follows */ - (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); - /* FALLTHROUGH */ - default: - p->next--; - wc = WGETNEXT(); - ordinary(p, wc); - break; - } - - if (!MORE()) - return; - c = PEEK(); - /* we call { a repetition if followed by a digit */ - if (!( c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) - return; /* no repetition, we're done */ - NEXT(); - - (void)REQUIRE(!wascaret, REG_BADRPT); - switch (c) { - case '*': /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - break; - case '+': - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - break; - case '?': - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, pos); /* offset slightly wrong */ - ASTERN(OOR1, pos); /* this one's right */ - AHEAD(pos); /* fix the OCH_ */ - EMIT(OOR2, 0); /* offset very wrong... */ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - break; - case '{': - count = p_count(p); - if (EAT(',')) { - if (isdigit((uch)PEEK())) { - count2 = p_count(p); - (void)REQUIRE(count <= count2, REG_BADBR); - } else /* single number with comma */ - count2 = INFINITY; - } else /* just a single number */ - count2 = count; - repeat(p, pos, count, count2); - if (!EAT('}')) { /* error heuristics */ - while (MORE() && PEEK() != '}') - NEXT(); - (void)REQUIRE(MORE(), REG_EBRACE); - SETERROR(REG_BADBR); - } - break; - } - - if (!MORE()) - return; - c = PEEK(); - if (!( c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) - return; - SETERROR(REG_BADRPT); -} - -/* - - p_str - string (no metacharacters) "parser" - == static void p_str(struct parse *p); - */ -static void -p_str(struct parse *p) -{ - (void)REQUIRE(MORE(), REG_EMPTY); - while (MORE()) - ordinary(p, WGETNEXT()); -} - -/* - - p_bre - BRE parser top level, anchoring and concatenation - == static void p_bre(struct parse *p, int end1, \ - == int end2); - * Giving end1 as OUT essentially eliminates the end1/end2 check. - * - * This implementation is a bit of a kludge, in that a trailing $ is first - * taken as an ordinary character and then revised to be an anchor. - * The amount of lookahead needed to avoid this kludge is excessive. - */ -static void -p_bre(struct parse *p, - int end1, /* first terminating character */ - int end2) /* second terminating character */ -{ - sopno start = HERE(); - int first = 1; /* first subexpression? */ - int wasdollar = 0; - - if (EAT('^')) { - EMIT(OBOL, 0); - p->g->iflags |= USEBOL; - p->g->nbol++; - } - while (MORE() && !SEETWO(end1, end2)) { - wasdollar = p_simp_re(p, first); - first = 0; - } - if (wasdollar) { /* oops, that was a trailing anchor */ - DROP(1); - EMIT(OEOL, 0); - p->g->iflags |= USEEOL; - p->g->neol++; - } - - (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ -} - -/* - - p_simp_re - parse a simple RE, an atom possibly followed by a repetition - == static int p_simp_re(struct parse *p, int starordinary); - */ -static int /* was the simple RE an unbackslashed $? */ -p_simp_re(struct parse *p, - int starordinary) /* is a leading * an ordinary character? */ -{ - int c; - int count; - int count2; - sopno pos; - int i; - wint_t wc; - sopno subno; -# define BACKSL (1<<CHAR_BIT) - - pos = HERE(); /* repetion op, if any, covers from here */ - - assert(MORE()); /* caller should have ensured this */ - c = GETNEXT(); - if (c == '\\') { - (void)REQUIRE(MORE(), REG_EESCAPE); - c = BACKSL | GETNEXT(); - } - switch (c) { - case '.': - if (p->g->cflags®_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; -#ifdef __CYGWIN__ - case BACKSL|'<': - /* \< is the GNU equivalents to [[:<:]] */ - EMIT(OBOW, 0); - break; - case BACKSL|'>': - /* \> is the GNU equivalents to [[:>:]] */ - EMIT(OEOW, 0); - break; -#endif - case BACKSL|'{': - SETERROR(REG_BADRPT); - break; - case BACKSL|'(': - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - /* the MORE here is an error heuristic */ - if (MORE() && !SEETWO('\\', ')')) - p_bre(p, '\\', ')'); - if (subno < NPAREN) { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); - } - EMIT(ORPAREN, subno); - (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); - break; - case BACKSL|')': /* should not get here -- must be user */ - case BACKSL|'}': - SETERROR(REG_EPAREN); - break; - case BACKSL|'1': - case BACKSL|'2': - case BACKSL|'3': - case BACKSL|'4': - case BACKSL|'5': - case BACKSL|'6': - case BACKSL|'7': - case BACKSL|'8': - case BACKSL|'9': - i = (c&~BACKSL) - '0'; - assert(i < NPAREN); - if (p->pend[i] != 0) { - assert(i <= p->g->nsub); - EMIT(OBACK_, i); - assert(p->pbegin[i] != 0); - assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); - assert(OP(p->strip[p->pend[i]]) == ORPAREN); - (void) dupl(p, p->pbegin[i]+1, p->pend[i]); - EMIT(O_BACK, i); - } else - SETERROR(REG_ESUBREG); - p->g->backrefs = 1; - break; - case '*': - (void)REQUIRE(starordinary, REG_BADRPT); - /* FALLTHROUGH */ - default: - p->next--; - wc = WGETNEXT(); - ordinary(p, wc); - break; - } - - if (EAT('*')) { /* implemented as +? */ - /* this case does not require the (y|) trick, noKLUDGE */ - INSERT(OPLUS_, pos); - ASTERN(O_PLUS, pos); - INSERT(OQUEST_, pos); - ASTERN(O_QUEST, pos); - } else if (EATTWO('\\', '{')) { - count = p_count(p); - if (EAT(',')) { - if (MORE() && isdigit((uch)PEEK())) { - count2 = p_count(p); - (void)REQUIRE(count <= count2, REG_BADBR); - } else /* single number with comma */ - count2 = INFINITY; - } else /* just a single number */ - count2 = count; - repeat(p, pos, count, count2); - if (!EATTWO('\\', '}')) { /* error heuristics */ - while (MORE() && !SEETWO('\\', '}')) - NEXT(); - (void)REQUIRE(MORE(), REG_EBRACE); - SETERROR(REG_BADBR); - } - } else if (c == '$') /* $ (but not \$) ends it */ - return(1); - - return(0); -} - -/* - - p_count - parse a repetition count - == static int p_count(struct parse *p); - */ -static int /* the value */ -p_count(struct parse *p) -{ - int count = 0; - int ndigits = 0; - - while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { - count = count*10 + (GETNEXT() - '0'); - ndigits++; - } - - (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); - return(count); -} - -/* - - p_bracket - parse a bracketed character list - == static void p_bracket(struct parse *p); - */ -static void -p_bracket(struct parse *p) -{ - cset *cs; - wint_t ch; - - /* Dept of Truly Sickening Special-Case Kludges */ - if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { - EMIT(OBOW, 0); - NEXTn(6); - return; - } - if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { - EMIT(OEOW, 0); - NEXTn(6); - return; - } - - if ((cs = allocset(p)) == NULL) - return; - - if (p->g->cflags®_ICASE) - cs->icase = 1; - if (EAT('^')) - cs->invert = 1; - if (EAT(']')) - CHadd(p, cs, ']'); - else if (EAT('-')) - CHadd(p, cs, '-'); - while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) - p_b_term(p, cs); - if (EAT('-')) - CHadd(p, cs, '-'); - (void)MUSTEAT(']', REG_EBRACK); - - if (p->error != 0) /* don't mess things up further */ - return; - - if (cs->invert && p->g->cflags®_NEWLINE) - cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); - - if ((ch = singleton(cs)) != OUT /* optimize singleton sets */ - && cs->invert == 0) { /* But not in invert case. */ - ordinary(p, ch); - freeset(p, cs); - } else - EMIT(OANYOF, (int)(cs - p->g->sets)); -} - -#ifdef __CYGWIN__ -/* This function is usually part of FreeBSD's libc. */ -int -__collate_range_cmp(int c1, int c2) -{ - char s1[2] = { c1, '\0' }; - char s2[2] = { c2, '\0' }; - return strcoll (s1, s2); -} -#endif - -/* - - p_b_term - parse one term of a bracketed character list - == static void p_b_term(struct parse *p, cset *cs); - */ -static void -p_b_term(struct parse *p, cset *cs) -{ - char c; - wint_t start, finish; - wint_t i; - - /* classify what we've got */ - switch ((MORE()) ? PEEK() : '\0') { - case '[': - c = (MORE2()) ? PEEK2() : '\0'; - break; - case '-': - SETERROR(REG_ERANGE); - return; /* NOTE RETURN */ - break; - default: - c = '\0'; - break; - } - - switch (c) { - case ':': /* character class */ - NEXT2(); - (void)REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); - p_b_cclass(p, cs); - (void)REQUIRE(MORE(), REG_EBRACK); - (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); - break; - case '=': /* equivalence class */ - NEXT2(); - (void)REQUIRE(MORE(), REG_EBRACK); - c = PEEK(); - (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); - p_b_eclass(p, cs); - (void)REQUIRE(MORE(), REG_EBRACK); - (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); - break; - default: /* symbol, ordinary character, or range */ - start = p_b_symbol(p); - if (SEE('-') && MORE2() && PEEK2() != ']') { - /* range */ - NEXT(); - if (EAT('-')) - finish = '-'; - else - finish = p_b_symbol(p); - } else if (SEE('-') && !MORE2()) { - SETERROR(REG_EBRACK); - return; - } else - finish = start; - if (start == finish) - CHadd(p, cs, start); - else { -#ifdef __CYGWIN__ - if (!collate_lcid) { -#else - if (__collate_load_error) { -#endif - (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); - CHaddrange(p, cs, start, finish); - } else { - (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); - for (i = 0; i <= UCHAR_MAX; i++) { - if ( __collate_range_cmp(start, i) <= 0 - && __collate_range_cmp(i, finish) <= 0 - ) - CHadd(p, cs, i); - } - } - } - break; - } -} - -/* - - p_b_cclass - parse a character-class name and deal with it - == static void p_b_cclass(struct parse *p, cset *cs); - */ -static void -p_b_cclass(struct parse *p, cset *cs) -{ - char *sp = p->next; - size_t len; - wctype_t wct; - char clname[16]; - - while (MORE() && isalpha((uch)PEEK())) - NEXT(); - len = p->next - sp; - if (len >= sizeof(clname) - 1) { - SETERROR(REG_ECTYPE); - return; - } - memcpy(clname, sp, len); - clname[len] = '\0'; - if ((wct = wctype(clname)) == 0) { - SETERROR(REG_ECTYPE); - return; - } - CHaddtype(p, cs, wct); -} - -/* - - p_b_eclass - parse an equivalence-class name and deal with it - == static void p_b_eclass(struct parse *p, cset *cs); - * - * This implementation is incomplete. xxx - */ -static void -p_b_eclass(struct parse *p, cset *cs) -{ - wint_t c; - - c = p_b_coll_elem(p, '='); - CHadd(p, cs, c); -} - -/* - - p_b_symbol - parse a character or [..]ed multicharacter collating symbol - == static char p_b_symbol(struct parse *p); - */ -static wint_t /* value of symbol */ -p_b_symbol(struct parse *p) -{ - wint_t value; - - (void)REQUIRE(MORE(), REG_EBRACK); - if (!EATTWO('[', '.')) - return(WGETNEXT()); - - /* collating symbol */ - value = p_b_coll_elem(p, '.'); - (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); - return(value); -} - -/* - - p_b_coll_elem - parse a collating-element name and look it up - == static char p_b_coll_elem(struct parse *p, int endc); - */ -static wint_t /* value of collating element */ -p_b_coll_elem(struct parse *p, - wint_t endc) /* name ended by endc,']' */ -{ - char *sp = p->next; - struct cname *cp; - int len; - mbstate_t mbs; - wchar_t wc; - size_t clen; - - while (MORE() && !SEETWO(endc, ']')) - NEXT(); - if (!MORE()) { - SETERROR(REG_EBRACK); - return(0); - } - len = p->next - sp; - for (cp = cnames; cp->name != NULL; cp++) - if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') - return(cp->code); /* known name */ - memset(&mbs, 0, sizeof(mbs)); - if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) - return (wc); /* single character */ - else if (clen == (size_t)-1 || clen == (size_t)-2) - SETERROR(REG_ILLSEQ); - else - SETERROR(REG_ECOLLATE); /* neither */ - return(0); -} - -/* - - othercase - return the case counterpart of an alphabetic - == static char othercase(int ch); - */ -static wint_t /* if no counterpart, return ch */ -othercase(wint_t ch) -{ - assert(iswalpha(ch)); - if (iswupper(ch)) - return(towlower(ch)); - else if (iswlower(ch)) - return(towupper(ch)); - else /* peculiar, but could happen */ - return(ch); -} - -/* - - bothcases - emit a dualcase version of a two-case character - == static void bothcases(struct parse *p, int ch); - * - * Boy, is this implementation ever a kludge... - */ -static void -bothcases(struct parse *p, wint_t ch) -{ - char *oldnext = p->next; - char *oldend = p->end; - char bracket[3 + MB_LEN_MAX]; - size_t n; - mbstate_t mbs; - - assert(othercase(ch) != ch); /* p_bracket() would recurse */ - p->next = bracket; - memset(&mbs, 0, sizeof(mbs)); - n = xwcrtomb(bracket, ch, &mbs); - assert(n != (size_t)-1); - bracket[n] = ']'; - bracket[n + 1] = '\0'; - p->end = bracket+n+1; - p_bracket(p); - assert(p->next == p->end); - p->next = oldnext; - p->end = oldend; -} - -/* - - ordinary - emit an ordinary character - == static void ordinary(struct parse *p, int ch); - */ -static void -ordinary(struct parse *p, wint_t ch) -{ - cset *cs; - - if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) - bothcases(p, ch); - else if ((ch & OPDMASK) == ch) - EMIT(OCHAR, ch); - else { - /* - * Kludge: character is too big to fit into an OCHAR operand. - * Emit a singleton set. - */ - if ((cs = allocset(p)) == NULL) - return; - CHadd(p, cs, ch); - EMIT(OANYOF, (int)(cs - p->g->sets)); - } -} - -/* - - nonnewline - emit REG_NEWLINE version of OANY - == static void nonnewline(struct parse *p); - * - * Boy, is this implementation ever a kludge... - */ -static void -nonnewline(struct parse *p) -{ - char *oldnext = p->next; - char *oldend = p->end; - char bracket[4]; - - p->next = bracket; - p->end = bracket+3; - bracket[0] = '^'; - bracket[1] = '\n'; - bracket[2] = ']'; - bracket[3] = '\0'; - p_bracket(p); - assert(p->next == bracket+3); - p->next = oldnext; - p->end = oldend; -} - -/* - - repeat - generate code for a bounded repetition, recursively if needed - == static void repeat(struct parse *p, sopno start, int from, int to); - */ -static void -repeat(struct parse *p, - sopno start, /* operand from here to end of strip */ - int from, /* repeated from this number */ - int to) /* to this number of times (maybe INFINITY) */ -{ - sopno finish = HERE(); -# define N 2 -# define INF 3 -# define REP(f, t) ((f)*8 + (t)) -# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) - sopno copy; - - if (p->error != 0) /* head off possible runaway recursion */ - return; - - assert(from <= to); - - switch (REP(MAP(from), MAP(to))) { - case REP(0, 0): /* must be user doing this */ - DROP(finish-start); /* drop the operand */ - break; - case REP(0, 1): /* as x{1,1}? */ - case REP(0, N): /* as x{1,n}? */ - case REP(0, INF): /* as x{1,}? */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); /* offset is wrong... */ - repeat(p, start+1, 1, to); - ASTERN(OOR1, start); - AHEAD(start); /* ... fix it */ - EMIT(OOR2, 0); - AHEAD(THERE()); - ASTERN(O_CH, THERETHERE()); - break; - case REP(1, 1): /* trivial case */ - /* done */ - break; - case REP(1, N): /* as x?x{1,n-1} */ - /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ - INSERT(OCH_, start); - ASTERN(OOR1, start); - AHEAD(start); - EMIT(OOR2, 0); /* offset very wrong... */ - AHEAD(THERE()); /* ...so fix it */ - ASTERN(O_CH, THERETHERE()); - copy = dupl(p, start+1, finish+1); - assert(copy == finish+4); - repeat(p, copy, 1, to-1); - break; - case REP(1, INF): /* as x+ */ - INSERT(OPLUS_, start); - ASTERN(O_PLUS, start); - break; - case REP(N, N): /* as xx{m-1,n-1} */ - copy = dupl(p, start, finish); - repeat(p, copy, from-1, to-1); - break; - case REP(N, INF): /* as xx{n-1,INF} */ - copy = dupl(p, start, finish); - repeat(p, copy, from-1, to); - break; - default: /* "can't happen" */ - SETERROR(REG_ASSERT); /* just in case */ - break; - } -} - -/* - - wgetnext - helper function for WGETNEXT() macro. Gets the next wide - - character from the parse struct, signals a REG_ILLSEQ error if the - - character can't be converted. Returns the number of bytes consumed. - */ -static wint_t -wgetnext(struct parse *p) -{ - mbstate_t mbs; - wchar_t wc; - wint_t ret; - size_t n; - - memset(&mbs, 0, sizeof(mbs)); - n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); - if (n == (size_t)-1 || n == (size_t)-2) { - SETERROR(REG_ILLSEQ); - return (0); - } - ret = wc; - if (n == 0) - n = 1; - else if (sizeof (wchar_t) == 2 && wc >= 0xd800 && wc <= 0xdbff) { - /* UTF-16 surrogate pair. Fetch second half and - compute UTF-32 value */ - size_t n2 = mbrtowc(&wc, p->next + n, - p->end - p->next - n, &mbs); - if (n2 == 0 || n2 == (size_t)-1 || n2 == (size_t)-2) { - SETERROR(REG_ILLSEQ); - return (0); - } - ret = (((ret & 0x3ff) << 10) | (wc & 0x3ff)) - + 0x10000; - n += n2; - } - p->next += n; - return (ret); -} - -static size_t -xwcrtomb (char *s, wint_t wc, mbstate_t *ps) -{ - if (sizeof (wchar_t) == 2 && wc >= 0x10000) - { - /* UTF-16 wcrtomb can't handle these values directly. The rest of the - code isn't surrogate pair aware, so we handle this here. Convert - value to UTF-16 surrogate and call wcsrtombs to convert the "string" - to the correct multibyte representation, if any. */ - wchar_t ws[2]; - const wchar_t *wsp = ws; - - wc -= 0x10000; - ws[0] = 0xd800 | (wc >> 10); - ws[1] = 0xdc00 | (wc & 0x3ff); - return wcsnrtombs (s, &wsp, 2, MB_CUR_MAX, ps); - } - return wcrtomb (s, wc, ps); -} - - -/* - - seterr - set an error condition - == static int seterr(struct parse *p, int e); - */ -static int /* useless but makes type checking happy */ -seterr(struct parse *p, int e) -{ - if (p->error == 0) /* keep earliest error condition */ - p->error = e; - p->next = nuls; /* try to bring things to a halt */ - p->end = nuls; - return(0); /* make the return value well-defined */ -} - -/* - - allocset - allocate a set of characters for [] - == static cset *allocset(struct parse *p); - */ -static cset * -allocset(struct parse *p) -{ - cset *cs, *ncs; - - ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); - if (ncs == NULL) { - SETERROR(REG_ESPACE); - return (NULL); - } - p->g->sets = ncs; - cs = &p->g->sets[p->g->ncsets++]; - memset(cs, 0, sizeof(*cs)); - - return(cs); -} - -/* - - freeset - free a now-unused set - == static void freeset(struct parse *p, cset *cs); - */ -static void -freeset(struct parse *p, cset *cs) -{ - cset *top = &p->g->sets[p->g->ncsets]; - - free(cs->wides); - free(cs->ranges); - free(cs->types); - memset(cs, 0, sizeof(*cs)); - if (cs == top-1) /* recover only the easy case */ - p->g->ncsets--; -} - -/* - - singleton - Determine whether a set contains only one character, - - returning it if so, otherwise returning OUT. - */ -static wint_t -singleton(cset *cs) -{ - wint_t i, s, n; - - for (i = n = 0; i < NC; i++) - if (CHIN(cs, i)) { - n++; - s = i; - } - if (n == 1 && cs->nwides == 0) - return (s); - if (n == 0 && cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && - cs->icase == 0) - return (cs->wides[0]); - /* Don't bother handling the other cases. */ - return (OUT); -} - -/* - - CHadd - add character to character set. - */ -static void -CHadd(struct parse *p, cset *cs, wint_t ch) -{ - wint_t nch, *newwides; - assert(ch >= 0); - if (ch < NC) - cs->bmp[ch >> 3] |= 1 << (ch & 7); - else { - newwides = realloc(cs->wides, (cs->nwides + 1) * - sizeof(*cs->wides)); - if (newwides == NULL) { - SETERROR(REG_ESPACE); - return; - } - cs->wides = newwides; - cs->wides[cs->nwides++] = ch; - } - if (cs->icase) { - if ((nch = towlower(ch)) < NC) - cs->bmp[nch >> 3] |= 1 << (nch & 7); - if ((nch = towupper(ch)) < NC) - cs->bmp[nch >> 3] |= 1 << (nch & 7); - } -} - -/* - - CHaddrange - add all characters in the range [min,max] to a character set. - */ -static void -CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) -{ - crange *newranges; - - for (; min < NC && min <= max; min++) - CHadd(p, cs, min); - if (min >= max) - return; - newranges = realloc(cs->ranges, (cs->nranges + 1) * - sizeof(*cs->ranges)); - if (newranges == NULL) { - SETERROR(REG_ESPACE); - return; - } - cs->ranges = newranges; - cs->ranges[cs->nranges].min = min; - cs->ranges[cs->nranges].min = max; - cs->nranges++; -} - -/* - - CHaddtype - add all characters of a certain type to a character set. - */ -static void -CHaddtype(struct parse *p, cset *cs, wctype_t wct) -{ - wint_t i; - wctype_t *newtypes; - - for (i = 0; i < NC; i++) - if (iswctype(i, wct)) - CHadd(p, cs, i); - newtypes = realloc(cs->types, (cs->ntypes + 1) * - sizeof(*cs->types)); - if (newtypes == NULL) { - SETERROR(REG_ESPACE); - return; - } - cs->types = newtypes; - cs->types[cs->ntypes++] = wct; -} - -/* - - dupl - emit a duplicate of a bunch of sops - == static sopno dupl(struct parse *p, sopno start, sopno finish); - */ -static sopno /* start of duplicate */ -dupl(struct parse *p, - sopno start, /* from here */ - sopno finish) /* to this less one */ -{ - sopno ret = HERE(); - sopno len = finish - start; - - assert(finish >= start); - if (len == 0) - return(ret); - enlarge(p, p->ssize + len); /* this many unexpected additions */ - assert(p->ssize >= p->slen + len); - (void) memcpy((char *)(p->strip + p->slen), - (char *)(p->strip + start), (size_t)len*sizeof(sop)); - p->slen += len; - return(ret); -} - -/* - - doemit - emit a strip operator - == static void doemit(struct parse *p, sop op, size_t opnd); - * - * It might seem better to implement this as a macro with a function as - * hard-case backup, but it's just too big and messy unless there are - * some changes to the data structures. Maybe later. - */ -static void -doemit(struct parse *p, sop op, size_t opnd) -{ - /* avoid making error situations worse */ - if (p->error != 0) - return; - - /* deal with oversize operands ("can't happen", more or less) */ - assert(opnd < 1<<OPSHIFT); - - /* deal with undersized strip */ - if (p->slen >= p->ssize) - enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ - assert(p->slen < p->ssize); - - /* finally, it's all reduced to the easy case */ - p->strip[p->slen++] = SOP(op, opnd); -} - -/* - - doinsert - insert a sop into the strip - == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); - */ -static void -doinsert(struct parse *p, sop op, size_t opnd, sopno pos) -{ - sopno sn; - sop s; - int i; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - sn = HERE(); - EMIT(op, opnd); /* do checks, ensure space */ - assert(HERE() == sn+1); - s = p->strip[sn]; - - /* adjust paren pointers */ - assert(pos > 0); - for (i = 1; i < NPAREN; i++) { - if (p->pbegin[i] >= pos) { - p->pbegin[i]++; - } - if (p->pend[i] >= pos) { - p->pend[i]++; - } - } - - memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], - (HERE()-pos-1)*sizeof(sop)); - p->strip[pos] = s; -} - -/* - - dofwd - complete a forward reference - == static void dofwd(struct parse *p, sopno pos, sop value); - */ -static void -dofwd(struct parse *p, sopno pos, sop value) -{ - /* avoid making error situations worse */ - if (p->error != 0) - return; - - assert(value < 1<<OPSHIFT); - p->strip[pos] = OP(p->strip[pos]) | value; -} - -/* - - enlarge - enlarge the strip - == static void enlarge(struct parse *p, sopno size); - */ -static void -enlarge(struct parse *p, sopno size) -{ - sop *sp; - - if (p->ssize >= size) - return; - - sp = (sop *)realloc(p->strip, size*sizeof(sop)); - if (sp == NULL) { - SETERROR(REG_ESPACE); - return; - } - p->strip = sp; - p->ssize = size; -} - -/* - - stripsnug - compact the strip - == static void stripsnug(struct parse *p, struct re_guts *g); - */ -static void -stripsnug(struct parse *p, struct re_guts *g) -{ - g->nstates = p->slen; - g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); - if (g->strip == NULL) { - SETERROR(REG_ESPACE); - g->strip = p->strip; - } -} - -/* - - findmust - fill in must and mlen with longest mandatory literal string - == static void findmust(struct parse *p, struct re_guts *g); - * - * This algorithm could do fancy things like analyzing the operands of | - * for common subsequences. Someday. This code is simple and finds most - * of the interesting cases. - * - * Note that must and mlen got initialized during setup. - */ -static void -findmust(struct parse *p, struct re_guts *g) -{ - sop *scan; - sop *start; - sop *newstart; - sopno newlen; - sop s; - char *cp; - int offset; - char buf[MB_LEN_MAX]; - size_t clen; - mbstate_t mbs; - - /* avoid making error situations worse */ - if (p->error != 0) - return; - - /* - * It's not generally safe to do a ``char'' substring search on - * multibyte character strings, but it's safe for at least - * UTF-8 (see RFC 3629). - */ - if (MB_CUR_MAX > 1 && -#ifdef __CYGWIN__ - strcmp(__locale_charset (), "UTF-8") != 0) -#else - strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) -#endif - return; - - /* find the longest OCHAR sequence in strip */ - newlen = 0; - offset = 0; - g->moffset = 0; - scan = g->strip + 1; - do { - s = *scan++; - switch (OP(s)) { - case OCHAR: /* sequence member */ - if (newlen == 0) { /* new sequence */ - memset(&mbs, 0, sizeof(mbs)); - newstart = scan - 1; - } - clen = xwcrtomb(buf, OPND(s), &mbs); - if (clen == (size_t)-1) - goto toohard; - newlen += clen; - break; - case OPLUS_: /* things that don't break one */ - case OLPAREN: - case ORPAREN: - break; - case OQUEST_: /* things that must be skipped */ - case OCH_: - offset = altoffset(scan, offset); - scan--; - do { - scan += OPND(s); - s = *scan; - /* assert() interferes w debug printouts */ - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) { - g->iflags |= BAD; - return; - } - } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* FALLTHROUGH */ - case OBOW: /* things that break a sequence */ - case OEOW: - case OBOL: - case OEOL: - case O_QUEST: - case O_CH: - case OEND: - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) { - g->moffset += offset; - offset = newlen; - } else - g->moffset = offset; - } else { - if (offset > -1) - offset += newlen; - } - newlen = 0; - break; - case OANY: - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) { - g->moffset += offset; - offset = newlen; - } else - g->moffset = offset; - } else { - if (offset > -1) - offset += newlen; - } - if (offset > -1) - offset++; - newlen = 0; - break; - case OANYOF: /* may or may not invalidate offset */ - /* First, everything as OANY */ - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) { - g->moffset += offset; - offset = newlen; - } else - g->moffset = offset; - } else { - if (offset > -1) - offset += newlen; - } - if (offset > -1) - offset++; - newlen = 0; - break; - toohard: - default: - /* Anything here makes it impossible or too hard - * to calculate the offset -- so we give up; - * save the last known good offset, in case the - * must sequence doesn't occur later. - */ - if (newlen > g->mlen) { /* ends one */ - start = newstart; - g->mlen = newlen; - if (offset > -1) - g->moffset += offset; - else - g->moffset = offset; - } - offset = -1; - newlen = 0; - break; - } - } while (OP(s) != OEND); - - if (g->mlen == 0) { /* there isn't one */ - g->moffset = -1; - return; - } - - /* turn it into a character string */ - g->must = malloc((size_t)g->mlen + 1); - if (g->must == NULL) { /* argh; just forget it */ - g->mlen = 0; - g->moffset = -1; - return; - } - cp = g->must; - scan = start; - memset(&mbs, 0, sizeof(mbs)); - while (cp < g->must + g->mlen) { - while (OP(s = *scan++) != OCHAR) - continue; - clen = xwcrtomb(cp, OPND(s), &mbs); - assert(clen != (size_t)-1); - cp += clen; - } - assert(cp == g->must + g->mlen); - *cp++ = '\0'; /* just on general principles */ -} - -/* - - altoffset - choose biggest offset among multiple choices - == static int altoffset(sop *scan, int offset); - * - * Compute, recursively if necessary, the largest offset among multiple - * re paths. - */ -static int -altoffset(sop *scan, int offset) -{ - int largest; - int try; - sop s; - - /* If we gave up already on offsets, return */ - if (offset == -1) - return -1; - - largest = 0; - try = 0; - s = *scan++; - while (OP(s) != O_QUEST && OP(s) != O_CH) { - switch (OP(s)) { - case OOR1: - if (try > largest) - largest = try; - try = 0; - break; - case OQUEST_: - case OCH_: - try = altoffset(scan, try); - if (try == -1) - return -1; - scan--; - do { - scan += OPND(s); - s = *scan; - if (OP(s) != O_QUEST && OP(s) != O_CH && - OP(s) != OOR2) - return -1; - } while (OP(s) != O_QUEST && OP(s) != O_CH); - /* We must skip to the next position, or we'll - * leave altoffset() too early. - */ - scan++; - break; - case OANYOF: - case OCHAR: - case OANY: - try++; - case OBOW: - case OEOW: - case OLPAREN: - case ORPAREN: - case OOR2: - break; - default: - try = -1; - break; - } - if (try == -1) - return -1; - s = *scan++; - } - - if (try > largest) - largest = try; - - return largest+offset; -} - -/* - - computejumps - compute char jumps for BM scan - == static void computejumps(struct parse *p, struct re_guts *g); - * - * This algorithm assumes g->must exists and is has size greater than - * zero. It's based on the algorithm found on Computer Algorithms by - * Sara Baase. - * - * A char jump is the number of characters one needs to jump based on - * the value of the character from the text that was mismatched. - */ -static void -computejumps(struct parse *p, struct re_guts *g) -{ - int ch; - int mindex; - - /* Avoid making errors worse */ - if (p->error != 0) - return; - - g->charjump = (int*) malloc((NC + 1) * sizeof(int)); - if (g->charjump == NULL) /* Not a fatal error */ - return; - /* Adjust for signed chars, if necessary */ - g->charjump = &g->charjump[-(CHAR_MIN)]; - - /* If the character does not exist in the pattern, the jump - * is equal to the number of characters in the pattern. - */ - for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) - g->charjump[ch] = g->mlen; - - /* If the character does exist, compute the jump that would - * take us to the last character in the pattern equal to it - * (notice that we match right to left, so that last character - * is the first one that would be matched). - */ - for (mindex = 0; mindex < g->mlen; mindex++) - g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; -} - -/* - - computematchjumps - compute match jumps for BM scan - == static void computematchjumps(struct parse *p, struct re_guts *g); - * - * This algorithm assumes g->must exists and is has size greater than - * zero. It's based on the algorithm found on Computer Algorithms by - * Sara Baase. - * - * A match jump is the number of characters one needs to advance based - * on the already-matched suffix. - * Notice that all values here are minus (g->mlen-1), because of the way - * the search algorithm works. - */ -static void -computematchjumps(struct parse *p, struct re_guts *g) -{ - int mindex; /* General "must" iterator */ - int suffix; /* Keeps track of matching suffix */ - int ssuffix; /* Keeps track of suffixes' suffix */ - int* pmatches; /* pmatches[k] points to the next i - * such that i+1...mlen is a substring - * of k+1...k+mlen-i-1 - */ - - /* Avoid making errors worse */ - if (p->error != 0) - return; - - pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); - if (pmatches == NULL) { - g->matchjump = NULL; - return; - } - - g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); - if (g->matchjump == NULL) /* Not a fatal error */ - return; - - /* Set maximum possible jump for each character in the pattern */ - for (mindex = 0; mindex < g->mlen; mindex++) - g->matchjump[mindex] = 2*g->mlen - mindex - 1; - - /* Compute pmatches[] */ - for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; - mindex--, suffix--) { - pmatches[mindex] = suffix; - - /* If a mismatch is found, interrupting the substring, - * compute the matchjump for that position. If no - * mismatch is found, then a text substring mismatched - * against the suffix will also mismatch against the - * substring. - */ - while (suffix < g->mlen - && g->must[mindex] != g->must[suffix]) { - g->matchjump[suffix] = MIN(g->matchjump[suffix], - g->mlen - mindex - 1); - suffix = pmatches[suffix]; - } - } - - /* Compute the matchjump up to the last substring found to jump - * to the beginning of the largest must pattern prefix matching - * it's own suffix. - */ - for (mindex = 0; mindex <= suffix; mindex++) - g->matchjump[mindex] = MIN(g->matchjump[mindex], - g->mlen + suffix - mindex); - - ssuffix = pmatches[suffix]; - while (suffix < g->mlen) { - while (suffix <= ssuffix && suffix < g->mlen) { - g->matchjump[suffix] = MIN(g->matchjump[suffix], - g->mlen + ssuffix - suffix); - suffix++; - } - if (suffix < g->mlen) - ssuffix = pmatches[ssuffix]; - } - - free(pmatches); -} - -/* - - pluscount - count + nesting - == static sopno pluscount(struct parse *p, struct re_guts *g); - */ -static sopno /* nesting depth */ -pluscount(struct parse *p, struct re_guts *g) -{ - sop *scan; - sop s; - sopno plusnest = 0; - sopno maxnest = 0; - - if (p->error != 0) - return(0); /* there may not be an OEND */ - - scan = g->strip + 1; - do { - s = *scan++; - switch (OP(s)) { - case OPLUS_: - plusnest++; - break; - case O_PLUS: - if (plusnest > maxnest) - maxnest = plusnest; - plusnest--; - break; - } - } while (OP(s) != OEND); - if (plusnest != 0) - g->iflags |= BAD; - return(maxnest); -} |