Welcome to mirror list, hosted at ThFree Co, Russian Federation.

cygwin.com/git/newlib-cygwin.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCorinna Vinschen <corinna@vinschen.de>2010-02-04 15:35:49 +0300
committerCorinna Vinschen <corinna@vinschen.de>2010-02-04 15:35:49 +0300
commite1e595a649d55a304416c1e8e23f99e0df58a452 (patch)
treecb83b739abde54551467f2b187416820bd25d54f /winsup/cygwin/regex/regcomp.c
parentc8f7d3cb48efbcdeed082c16a6d5b90ae3c75791 (diff)
Replace regex files with multibyte-aware version from FreeBSD.
* Makefile.in (install-headers): Remove extra command to install regex.h. (uninstall-headers): Remove extra command to uninstall regex.h. * nlsfuncs.cc (collate_lcid): Make externally available to allow access to collation internals from regex functions. (collate_charset): Ditto. * wchar.h: Add __cplusplus guards to make C-clean. * include/regex.h: New file, replacing regex/regex.h. Remove UCB advertising clause. * regex/COPYRIGHT: Accommodate BSD license. Remove UCB advertising clause. * regex/cclass.h: Remove. * regex/cname.h: New file from FreeBSD. * regex/engine.c: Ditto. (NONCHAR): Tweak for Cygwin. * regex/engine.ih: Remove. * regex/mkh: Remove. * regex/regcomp.c: New file from FreeBSD. Tweak slightly for Cygwin. Import required collate internals from nlsfunc.cc. (p_ere_exp): Add GNU-specific \< and \> handling for word boundaries. (p_simp_re): Ditto. (__collate_range_cmp): Define. (p_b_term): Use Cygwin-specific collate internals. (findmust): Ditto. * regex/regcomp.ih: Remove. * regex/regerror.c: New file from FreeBSD. Fix a few compiler warnings. * regex/regerror.ih: Remove. * regex/regex.7: New file from FreeBSD. Remove UCB advertising clause. * regex/regex.h: Remove. Replaced by include/regex.h. * regex/regexec.c: New file from FreeBSD. Fix a few compiler warnings. * regex/regfree.c: New file from FreeBSD. * regex/tests: Remove. * regex/utils.h: New file from FreeBSD.
Diffstat (limited to 'winsup/cygwin/regex/regcomp.c')
-rw-r--r--winsup/cygwin/regex/regcomp.c1386
1 files changed, 842 insertions, 544 deletions
diff --git a/winsup/cygwin/regex/regcomp.c b/winsup/cygwin/regex/regcomp.c
index f771a1ac4..5bcaa98c3 100644
--- a/winsup/cygwin/regex/regcomp.c
+++ b/winsup/cygwin/regex/regcomp.c
@@ -1,18 +1,77 @@
+/*-
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
+#endif /* LIBC_SCCS and not lint */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.36 2007/06/11 03:05:54 delphij Exp $");
+
+#ifdef __CYGWIN__
#include "winsup.h"
+#endif
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
-#include "regex.h"
+#include <regex.h>
+#ifndef __CYGWIN__
+#include <runetype.h>
+#endif
+#include <wchar.h>
+#include <wctype.h>
+
+#ifndef __CYGWIN__
+#include "collate.h"
+#endif
#include "utils.h"
#include "regex2.h"
-#include "cclass.h"
#include "cname.h"
+#ifdef __CYGWIN__
+/* Don't pull in windows headers just for LCID. */
+typedef unsigned long LCID;
+/* These are defined in nlsfuncs.cc. */
+extern LCID collate_lcid;
+extern char collate_charset[];
+#endif
+
/*
* parse structure, passed up and down to avoid global variables and
* other clumsinesses
@@ -31,7 +90,61 @@ struct parse {
sopno pend[NPAREN]; /* -> ) ([0] unused) */
};
-#include "regcomp.ih"
+/* ========= begin header generated by ./mkh ========= */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* === regcomp.c === */
+#ifdef __CYGWIN__ /* Defined below `int stop'. Our gcc chokes on that. */
+static void p_ere(struct parse *p, int stop);
+#else
+static void p_ere(struct parse *p, wint_t stop);
+#endif
+static void p_ere_exp(struct parse *p);
+static void p_str(struct parse *p);
+#ifdef __CYGWIN__ /* Defined below `int end1/end2'. Our gcc chokes on that. */
+static void p_bre(struct parse *p, int end1, int end2);
+#else
+static void p_bre(struct parse *p, wint_t end1, wint_t end2);
+#endif
+static int p_simp_re(struct parse *p, int starordinary);
+static int p_count(struct parse *p);
+static void p_bracket(struct parse *p);
+static void p_b_term(struct parse *p, cset *cs);
+static void p_b_cclass(struct parse *p, cset *cs);
+static void p_b_eclass(struct parse *p, cset *cs);
+static wint_t p_b_symbol(struct parse *p);
+static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
+static wint_t othercase(wint_t ch);
+static void bothcases(struct parse *p, wint_t ch);
+static void ordinary(struct parse *p, wint_t ch);
+static void nonnewline(struct parse *p);
+static void repeat(struct parse *p, sopno start, int from, int to);
+static int seterr(struct parse *p, int e);
+static cset *allocset(struct parse *p);
+static void freeset(struct parse *p, cset *cs);
+static void CHadd(struct parse *p, cset *cs, wint_t ch);
+static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
+static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
+static wint_t singleton(cset *cs);
+static sopno dupl(struct parse *p, sopno start, sopno finish);
+static void doemit(struct parse *p, sop op, size_t opnd);
+static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
+static void dofwd(struct parse *p, sopno pos, sop value);
+static void enlarge(struct parse *p, sopno size);
+static void stripsnug(struct parse *p, struct re_guts *g);
+static void findmust(struct parse *p, struct re_guts *g);
+static int altoffset(sop *scan, int offset);
+static void computejumps(struct parse *p, struct re_guts *g);
+static void computematchjumps(struct parse *p, struct re_guts *g);
+static sopno pluscount(struct parse *p, struct re_guts *g);
+static wint_t wgetnext(struct parse *p);
+
+#ifdef __cplusplus
+}
+#endif
+/* ========= end header generated by ./mkh ========= */
static char nuls[10]; /* place to point scanner in event of error */
@@ -51,8 +164,9 @@ static char nuls[10]; /* place to point scanner in event of error */
#define NEXT2() (p->next += 2)
#define NEXTn(n) (p->next += (n))
#define GETNEXT() (*p->next++)
+#define WGETNEXT() wgetnext(p)
#define SETERROR(e) seterr(p, (e))
-#define REQUIRE(co, e) (void) ((co) || SETERROR(e))
+#define REQUIRE(co, e) ((co) || SETERROR(e))
#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
@@ -71,6 +185,9 @@ static int never = 0; /* for use in asserts; shuts lint up */
#define never 0 /* some <assert.h>s have bugs too */
#endif
+/* Macro used by computejump()/computematchjump() */
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
/*
- regcomp - interface for parser and compilation
= extern int regcomp(regex_t *, const char *, int);
@@ -84,16 +201,15 @@ static int never = 0; /* for use in asserts; shuts lint up */
= #define REG_DUMP 0200
*/
int /* 0 success, otherwise REG_something */
-regcomp(preg, pattern, cflags)
-regex_t *preg;
-const char *pattern;
-int cflags;
+regcomp(regex_t * __restrict preg,
+ const char * __restrict pattern,
+ int cflags)
{
struct parse pa;
- register struct re_guts *g;
- register struct parse *p = &pa;
- register int i;
- register size_t len;
+ struct re_guts *g;
+ struct parse *p = &pa;
+ int i;
+ size_t len;
#ifdef REDEBUG
# define GOODFLAGS(f) (f)
#else
@@ -112,8 +228,7 @@ int cflags;
len = strlen((char *)pattern);
/* do the mallocs early so failure handling is easy */
- g = (struct re_guts *)malloc(sizeof(struct re_guts) +
- (NC-1)*sizeof(cat_t));
+ g = (struct re_guts *)malloc(sizeof(struct re_guts));
if (g == NULL)
return(REG_ESPACE);
p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
@@ -134,20 +249,18 @@ int cflags;
p->pbegin[i] = 0;
p->pend[i] = 0;
}
- g->csetsize = NC;
g->sets = NULL;
- g->setbits = NULL;
g->ncsets = 0;
g->cflags = cflags;
g->iflags = 0;
g->nbol = 0;
g->neol = 0;
g->must = NULL;
+ g->moffset = -1;
+ g->charjump = NULL;
+ g->matchjump = NULL;
g->mlen = 0;
g->nsub = 0;
- g->ncategories = 1; /* category 0 is "everything else" */
- g->categories = &g->catspace[-(CHAR_MIN)];
- (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
g->backrefs = 0;
/* do it */
@@ -163,9 +276,19 @@ int cflags;
g->laststate = THERE();
/* tidy up loose ends and fill things in */
- categorize(p, g);
stripsnug(p, g);
findmust(p, g);
+ /* only use Boyer-Moore algorithm if the pattern is bigger
+ * than three characters
+ */
+ if(g->mlen > 3) {
+ computejumps(p, g);
+ computematchjumps(p, g);
+ if(g->matchjump == NULL && g->charjump != NULL) {
+ free(g->charjump);
+ g->charjump = NULL;
+ }
+ }
g->nplus = pluscount(p, g);
g->magic = MAGIC2;
preg->re_nsub = g->nsub;
@@ -185,25 +308,24 @@ int cflags;
/*
- p_ere - ERE parser top level, concatenation and alternation
- == static void p_ere(register struct parse *p, int stop);
+ == static void p_ere(struct parse *p, int stop);
*/
static void
-p_ere(p, stop)
-register struct parse *p;
-int stop; /* character this ERE should end at */
+p_ere(struct parse *p,
+ int stop) /* character this ERE should end at */
{
- register char c;
- register sopno prevback = 0;
- register sopno prevfwd = 0;
- register sopno conc = 0;
- register int first = 1; /* is this the first alternative? */
+ char c;
+ sopno prevback;
+ sopno prevfwd;
+ sopno conc;
+ int first = 1; /* is this the first alternative? */
for (;;) {
/* do a bunch of concatenated expressions */
conc = HERE();
while (MORE() && (c = PEEK()) != '|' && c != stop)
p_ere_exp(p);
- REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
+ (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
if (!EAT('|'))
break; /* NOTE BREAK OUT */
@@ -231,17 +353,17 @@ int stop; /* character this ERE should end at */
/*
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
- == static void p_ere_exp(register struct parse *p);
+ == static void p_ere_exp(struct parse *p);
*/
static void
-p_ere_exp(p)
-register struct parse *p;
+p_ere_exp(struct parse *p)
{
- register char c;
- register sopno pos;
- register int count;
- register int count2;
- register sopno subno;
+ char c;
+ wint_t wc;
+ sopno pos;
+ int count;
+ int count2;
+ sopno subno;
int wascaret = 0;
assert(MORE()); /* caller should have ensured this */
@@ -250,7 +372,7 @@ register struct parse *p;
pos = HERE();
switch (c) {
case '(':
- REQUIRE(MORE(), REG_EPAREN);
+ (void)REQUIRE(MORE(), REG_EPAREN);
p->g->nsub++;
subno = p->g->nsub;
if (subno < NPAREN)
@@ -263,7 +385,7 @@ register struct parse *p;
assert(p->pend[subno] != 0);
}
EMIT(ORPAREN, subno);
- MUSTEAT(')', REG_EPAREN);
+ (void)MUSTEAT(')', REG_EPAREN);
break;
#ifndef POSIX_MISTAKE
case ')': /* happens only if no current unmatched ( */
@@ -306,15 +428,33 @@ register struct parse *p;
p_bracket(p);
break;
case '\\':
- REQUIRE(MORE(), REG_EESCAPE);
- c = GETNEXT();
- ordinary(p, c);
+ (void)REQUIRE(MORE(), REG_EESCAPE);
+ wc = WGETNEXT();
+#ifdef __CYGWIN__
+ /* \< and \> are the GNU equivalents to [[:<:]] and [[:>:]] */
+ switch (wc)
+ {
+ case L'<':
+ EMIT(OBOW, 0);
+ break;
+ case L'>':
+ EMIT(OEOW, 0);
+ break;
+ default:
+ ordinary(p, wc);
+ break;
+ }
+#else
+ ordinary(p, wc);
+#endif
break;
case '{': /* okay as ordinary except if digit follows */
- REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT);
+ (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
/* FALLTHROUGH */
default:
- ordinary(p, c);
+ p->next--;
+ wc = WGETNEXT();
+ ordinary(p, wc);
break;
}
@@ -323,11 +463,11 @@ register struct parse *p;
c = PEEK();
/* we call { a repetition if followed by a digit */
if (!( c == '*' || c == '+' || c == '?' ||
- (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ))
+ (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
return; /* no repetition, we're done */
NEXT();
- REQUIRE(!wascaret, REG_BADRPT);
+ (void)REQUIRE(!wascaret, REG_BADRPT);
switch (c) {
case '*': /* implemented as +? */
/* this case does not require the (y|) trick, noKLUDGE */
@@ -352,9 +492,9 @@ register struct parse *p;
case '{':
count = p_count(p);
if (EAT(',')) {
- if (isdigit((unsigned char)PEEK())) {
+ if (isdigit((uch)PEEK())) {
count2 = p_count(p);
- REQUIRE(count <= count2, REG_BADBR);
+ (void)REQUIRE(count <= count2, REG_BADBR);
} else /* single number with comma */
count2 = INFINITY;
} else /* just a single number */
@@ -363,7 +503,7 @@ register struct parse *p;
if (!EAT('}')) { /* error heuristics */
while (MORE() && PEEK() != '}')
NEXT();
- REQUIRE(MORE(), REG_EBRACE);
+ (void)REQUIRE(MORE(), REG_EBRACE);
SETERROR(REG_BADBR);
}
break;
@@ -373,45 +513,41 @@ register struct parse *p;
return;
c = PEEK();
if (!( c == '*' || c == '+' || c == '?' ||
- (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) )
+ (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
return;
SETERROR(REG_BADRPT);
}
/*
- p_str - string (no metacharacters) "parser"
- == static void p_str(register struct parse *p);
+ == static void p_str(struct parse *p);
*/
static void
-p_str(p)
-register struct parse *p;
+p_str(struct parse *p)
{
- REQUIRE(MORE(), REG_EMPTY);
+ (void)REQUIRE(MORE(), REG_EMPTY);
while (MORE())
- ordinary(p, GETNEXT());
+ ordinary(p, WGETNEXT());
}
/*
- p_bre - BRE parser top level, anchoring and concatenation
- == static void p_bre(register struct parse *p, register int end1, \
- == register int end2);
+ == static void p_bre(struct parse *p, int end1, \
+ == int end2);
* Giving end1 as OUT essentially eliminates the end1/end2 check.
*
* This implementation is a bit of a kludge, in that a trailing $ is first
- * taken as an ordinary character and then revised to be an anchor. The
- * only undesirable side effect is that '$' gets included as a character
- * category in such cases. This is fairly harmless; not worth fixing.
+ * taken as an ordinary character and then revised to be an anchor.
* The amount of lookahead needed to avoid this kludge is excessive.
*/
static void
-p_bre(p, end1, end2)
-register struct parse *p;
-register int end1; /* first terminating character */
-register int end2; /* second terminating character */
+p_bre(struct parse *p,
+ int end1, /* first terminating character */
+ int end2) /* second terminating character */
{
- register sopno start = HERE();
- register int first = 1; /* first subexpression? */
- register int wasdollar = 0;
+ sopno start = HERE();
+ int first = 1; /* first subexpression? */
+ int wasdollar = 0;
if (EAT('^')) {
EMIT(OBOL, 0);
@@ -429,24 +565,24 @@ register int end2; /* second terminating character */
p->g->neol++;
}
- REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
+ (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
}
/*
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
- == static int p_simp_re(register struct parse *p, int starordinary);
+ == static int p_simp_re(struct parse *p, int starordinary);
*/
static int /* was the simple RE an unbackslashed $? */
-p_simp_re(p, starordinary)
-register struct parse *p;
-int starordinary; /* is a leading * an ordinary character? */
+p_simp_re(struct parse *p,
+ int starordinary) /* is a leading * an ordinary character? */
{
- register int c;
- register int count;
- register int count2;
- register sopno pos;
- register int i;
- register sopno subno;
+ int c;
+ int count;
+ int count2;
+ sopno pos;
+ int i;
+ wint_t wc;
+ sopno subno;
# define BACKSL (1<<CHAR_BIT)
pos = HERE(); /* repetion op, if any, covers from here */
@@ -454,8 +590,8 @@ int starordinary; /* is a leading * an ordinary character? */
assert(MORE()); /* caller should have ensured this */
c = GETNEXT();
if (c == '\\') {
- REQUIRE(MORE(), REG_EESCAPE);
- c = BACKSL | (unsigned char)GETNEXT();
+ (void)REQUIRE(MORE(), REG_EESCAPE);
+ c = BACKSL | GETNEXT();
}
switch (c) {
case '.':
@@ -467,6 +603,16 @@ int starordinary; /* is a leading * an ordinary character? */
case '[':
p_bracket(p);
break;
+#ifdef __CYGWIN__
+ case BACKSL|'<':
+ /* \< is the GNU equivalents to [[:<:]] */
+ EMIT(OBOW, 0);
+ break;
+ case BACKSL|'>':
+ /* \> is the GNU equivalents to [[:>:]] */
+ EMIT(OEOW, 0);
+ break;
+#endif
case BACKSL|'{':
SETERROR(REG_BADRPT);
break;
@@ -484,7 +630,7 @@ int starordinary; /* is a leading * an ordinary character? */
assert(p->pend[subno] != 0);
}
EMIT(ORPAREN, subno);
- REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
+ (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
break;
case BACKSL|')': /* should not get here -- must be user */
case BACKSL|'}':
@@ -514,10 +660,12 @@ int starordinary; /* is a leading * an ordinary character? */
p->g->backrefs = 1;
break;
case '*':
- REQUIRE(starordinary, REG_BADRPT);
+ (void)REQUIRE(starordinary, REG_BADRPT);
/* FALLTHROUGH */
default:
- ordinary(p, (char)c); /* takes off BACKSL, if any */
+ p->next--;
+ wc = WGETNEXT();
+ ordinary(p, wc);
break;
}
@@ -530,9 +678,9 @@ int starordinary; /* is a leading * an ordinary character? */
} else if (EATTWO('\\', '{')) {
count = p_count(p);
if (EAT(',')) {
- if (MORE() && isdigit((unsigned char)PEEK())) {
+ if (MORE() && isdigit((uch)PEEK())) {
count2 = p_count(p);
- REQUIRE(count <= count2, REG_BADBR);
+ (void)REQUIRE(count <= count2, REG_BADBR);
} else /* single number with comma */
count2 = INFINITY;
} else /* just a single number */
@@ -541,10 +689,10 @@ int starordinary; /* is a leading * an ordinary character? */
if (!EATTWO('\\', '}')) { /* error heuristics */
while (MORE() && !SEETWO('\\', '}'))
NEXT();
- REQUIRE(MORE(), REG_EBRACE);
+ (void)REQUIRE(MORE(), REG_EBRACE);
SETERROR(REG_BADBR);
}
- } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */
+ } else if (c == '$') /* $ (but not \$) ends it */
return(1);
return(0);
@@ -552,37 +700,32 @@ int starordinary; /* is a leading * an ordinary character? */
/*
- p_count - parse a repetition count
- == static int p_count(register struct parse *p);
+ == static int p_count(struct parse *p);
*/
static int /* the value */
-p_count(p)
-register struct parse *p;
+p_count(struct parse *p)
{
- register int count = 0;
- register int ndigits = 0;
+ int count = 0;
+ int ndigits = 0;
- while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) {
+ while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
count = count*10 + (GETNEXT() - '0');
ndigits++;
}
- REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
+ (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
return(count);
}
/*
- p_bracket - parse a bracketed character list
- == static void p_bracket(register struct parse *p);
- *
- * Note a significant property of this code: if the allocset() did SETERROR,
- * no set operations are done.
+ == static void p_bracket(struct parse *p);
*/
static void
-p_bracket(p)
-register struct parse *p;
+p_bracket(struct parse *p)
{
- register cset *cs = allocset(p);
- register int invert = 0;
+ cset *cs;
+ wint_t ch;
/* Dept of Truly Sickening Special-Case Kludges */
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
@@ -596,69 +739,57 @@ register struct parse *p;
return;
}
+ if ((cs = allocset(p)) == NULL)
+ return;
+
+ if (p->g->cflags&REG_ICASE)
+ cs->icase = 1;
if (EAT('^'))
- invert++; /* make note to invert set at end */
+ cs->invert = 1;
if (EAT(']'))
- CHadd(cs, ']');
+ CHadd(p, cs, ']');
else if (EAT('-'))
- CHadd(cs, '-');
+ CHadd(p, cs, '-');
while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
p_b_term(p, cs);
if (EAT('-'))
- CHadd(cs, '-');
- MUSTEAT(']', REG_EBRACK);
+ CHadd(p, cs, '-');
+ (void)MUSTEAT(']', REG_EBRACK);
if (p->error != 0) /* don't mess things up further */
return;
- if (p->g->cflags&REG_ICASE) {
- register int i;
- register int ci;
-
- for (i = p->g->csetsize - 1; i >= 0; i--)
- if (CHIN(cs, i) && isalpha(i)) {
- ci = othercase(i);
- if (ci != i)
- CHadd(cs, ci);
- }
- if (cs->multis != NULL)
- mccase(p, cs);
- }
- if (invert) {
- register int i;
-
- for (i = p->g->csetsize - 1; i >= 0; i--)
- if (CHIN(cs, i))
- CHsub(cs, i);
- else
- CHadd(cs, i);
- if (p->g->cflags&REG_NEWLINE)
- CHsub(cs, '\n');
- if (cs->multis != NULL)
- mcinvert(p, cs);
- }
+ if (cs->invert && p->g->cflags&REG_NEWLINE)
+ cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
- assert(cs->multis == NULL); /* xxx */
-
- if (nch(p, cs) == 1) { /* optimize singleton sets */
- ordinary(p, firstch(p, cs));
+ if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
+ ordinary(p, ch);
freeset(p, cs);
} else
- EMIT(OANYOF, freezeset(p, cs));
+ EMIT(OANYOF, (int)(cs - p->g->sets));
}
+#ifdef __CYGWIN__
+/* This function is usually part of FreeBSD's libc. */
+int
+__collate_range_cmp(int c1, int c2)
+{
+ char s1[2] = { c1, '\0' };
+ char s2[2] = { c2, '\0' };
+ return strcoll (s1, s2);
+}
+#endif
+
/*
- p_b_term - parse one term of a bracketed character list
- == static void p_b_term(register struct parse *p, register cset *cs);
+ == static void p_b_term(struct parse *p, cset *cs);
*/
static void
-p_b_term(p, cs)
-register struct parse *p;
-register cset *cs;
+p_b_term(struct parse *p, cset *cs)
{
- register char c;
- register char start, finish;
- register int i;
+ char c;
+ wint_t start, finish;
+ wint_t i;
/* classify what we've got */
switch ((MORE()) ? PEEK() : '\0') {
@@ -677,24 +808,23 @@ register cset *cs;
switch (c) {
case ':': /* character class */
NEXT2();
- REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(MORE(), REG_EBRACK);
c = PEEK();
- REQUIRE(c != '-' && c != ']', REG_ECTYPE);
+ (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
p_b_cclass(p, cs);
- REQUIRE(MORE(), REG_EBRACK);
- REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
+ (void)REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
break;
case '=': /* equivalence class */
NEXT2();
- REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(MORE(), REG_EBRACK);
c = PEEK();
- REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
+ (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
p_b_eclass(p, cs);
- REQUIRE(MORE(), REG_EBRACK);
- REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
+ (void)REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
break;
default: /* symbol, ordinary character, or range */
-/* xxx revision needed for multichar stuff */
start = p_b_symbol(p);
if (SEE('-') && MORE2() && PEEK2() != ']') {
/* range */
@@ -705,97 +835,106 @@ register cset *cs;
finish = p_b_symbol(p);
} else
finish = start;
-/* xxx what about signed chars here... */
- REQUIRE(start <= finish, REG_ERANGE);
- for (i = start; i <= finish; i++)
- CHadd(cs, i);
+ if (start == finish)
+ CHadd(p, cs, start);
+ else {
+#ifdef __CYGWIN__
+ if (!collate_lcid) {
+#else
+ if (__collate_load_error) {
+#endif
+ (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
+ CHaddrange(p, cs, start, finish);
+ } else {
+ (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
+ for (i = 0; i <= UCHAR_MAX; i++) {
+ if ( __collate_range_cmp(start, i) <= 0
+ && __collate_range_cmp(i, finish) <= 0
+ )
+ CHadd(p, cs, i);
+ }
+ }
+ }
break;
}
}
/*
- p_b_cclass - parse a character-class name and deal with it
- == static void p_b_cclass(register struct parse *p, register cset *cs);
+ == static void p_b_cclass(struct parse *p, cset *cs);
*/
static void
-p_b_cclass(p, cs)
-register struct parse *p;
-register cset *cs;
+p_b_cclass(struct parse *p, cset *cs)
{
- register char *sp = p->next;
- register struct cclass *cp;
- register size_t len;
- register const char *u;
- register char c;
+ char *sp = p->next;
+ size_t len;
+ wctype_t wct;
+ char clname[16];
- while (MORE() && isalpha((unsigned char)PEEK()))
+ while (MORE() && isalpha((uch)PEEK()))
NEXT();
len = p->next - sp;
- for (cp = cclasses; cp->name != NULL; cp++)
- if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
- break;
- if (cp->name == NULL) {
- /* oops, didn't find it */
+ if (len >= sizeof(clname) - 1) {
SETERROR(REG_ECTYPE);
return;
}
-
- u = cp->chars;
- while ((c = *u++) != '\0')
- CHadd(cs, c);
- for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
- MCadd(p, cs, u);
+ memcpy(clname, sp, len);
+ clname[len] = '\0';
+ if ((wct = wctype(clname)) == 0) {
+ SETERROR(REG_ECTYPE);
+ return;
+ }
+ CHaddtype(p, cs, wct);
}
/*
- p_b_eclass - parse an equivalence-class name and deal with it
- == static void p_b_eclass(register struct parse *p, register cset *cs);
+ == static void p_b_eclass(struct parse *p, cset *cs);
*
* This implementation is incomplete. xxx
*/
static void
-p_b_eclass(p, cs)
-register struct parse *p;
-register cset *cs;
+p_b_eclass(struct parse *p, cset *cs)
{
- register char c;
+ wint_t c;
c = p_b_coll_elem(p, '=');
- CHadd(cs, c);
+ CHadd(p, cs, c);
}
/*
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
- == static char p_b_symbol(register struct parse *p);
+ == static char p_b_symbol(struct parse *p);
*/
-static char /* value of symbol */
-p_b_symbol(p)
-register struct parse *p;
+static wint_t /* value of symbol */
+p_b_symbol(struct parse *p)
{
- register char value;
+ wint_t value;
- REQUIRE(MORE(), REG_EBRACK);
+ (void)REQUIRE(MORE(), REG_EBRACK);
if (!EATTWO('[', '.'))
- return(GETNEXT());
+ return(WGETNEXT());
/* collating symbol */
value = p_b_coll_elem(p, '.');
- REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
+ (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
return(value);
}
/*
- p_b_coll_elem - parse a collating-element name and look it up
- == static char p_b_coll_elem(register struct parse *p, int endc);
+ == static char p_b_coll_elem(struct parse *p, int endc);
*/
-static char /* value of collating element */
-p_b_coll_elem(p, endc)
-register struct parse *p;
-int endc; /* name ended by endc,']' */
+static wint_t /* value of collating element */
+p_b_coll_elem(struct parse *p,
+ wint_t endc) /* name ended by endc,']' */
{
- register char *sp = p->next;
- register struct cname *cp;
- register int len;
+ char *sp = p->next;
+ struct cname *cp;
+ int len;
+ mbstate_t mbs;
+ wchar_t wc;
+ size_t clen;
while (MORE() && !SEETWO(endc, ']'))
NEXT();
@@ -807,9 +946,13 @@ int endc; /* name ended by endc,']' */
for (cp = cnames; cp->name != NULL; cp++)
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
return(cp->code); /* known name */
- if (len == 1)
- return(*sp); /* single character */
- SETERROR(REG_ECOLLATE); /* neither */
+ memset(&mbs, 0, sizeof(mbs));
+ if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
+ return (wc); /* single character */
+ else if (clen == (size_t)-1 || clen == (size_t)-2)
+ SETERROR(REG_ILLSEQ);
+ else
+ SETERROR(REG_ECOLLATE); /* neither */
return(0);
}
@@ -817,78 +960,83 @@ int endc; /* name ended by endc,']' */
- othercase - return the case counterpart of an alphabetic
== static char othercase(int ch);
*/
-static char /* if no counterpart, return ch */
-othercase(ch)
-int ch;
+static wint_t /* if no counterpart, return ch */
+othercase(wint_t ch)
{
- assert(isalpha(ch));
- if (isupper(ch))
- return(tolower(ch));
- else if (islower(ch))
- return(toupper(ch));
+ assert(iswalpha(ch));
+ if (iswupper(ch))
+ return(towlower(ch));
+ else if (iswlower(ch))
+ return(towupper(ch));
else /* peculiar, but could happen */
return(ch);
}
/*
- bothcases - emit a dualcase version of a two-case character
- == static void bothcases(register struct parse *p, int ch);
+ == static void bothcases(struct parse *p, int ch);
*
* Boy, is this implementation ever a kludge...
*/
static void
-bothcases(p, ch)
-register struct parse *p;
-int ch;
+bothcases(struct parse *p, wint_t ch)
{
- register char *oldnext = p->next;
- register char *oldend = p->end;
- char bracket[3];
+ char *oldnext = p->next;
+ char *oldend = p->end;
+ char bracket[3 + MB_LEN_MAX];
+ size_t n;
+ mbstate_t mbs;
assert(othercase(ch) != ch); /* p_bracket() would recurse */
p->next = bracket;
- p->end = bracket+2;
- bracket[0] = ch;
- bracket[1] = ']';
- bracket[2] = '\0';
+ memset(&mbs, 0, sizeof(mbs));
+ n = wcrtomb(bracket, ch, &mbs);
+ assert(n != (size_t)-1);
+ bracket[n] = ']';
+ bracket[n + 1] = '\0';
+ p->end = bracket+n+1;
p_bracket(p);
- assert(p->next == bracket+2);
+ assert(p->next == p->end);
p->next = oldnext;
p->end = oldend;
}
/*
- ordinary - emit an ordinary character
- == static void ordinary(register struct parse *p, register int ch);
+ == static void ordinary(struct parse *p, int ch);
*/
static void
-ordinary(p, ch)
-register struct parse *p;
-register int ch;
+ordinary(struct parse *p, wint_t ch)
{
- register cat_t *cap = p->g->categories;
+ cset *cs;
- if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
+ if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
bothcases(p, ch);
+ else if ((ch & OPDMASK) == ch)
+ EMIT(OCHAR, ch);
else {
- EMIT(OCHAR, (unsigned char)ch);
- if (cap[ch] == 0)
- cap[ch] = p->g->ncategories++;
+ /*
+ * Kludge: character is too big to fit into an OCHAR operand.
+ * Emit a singleton set.
+ */
+ if ((cs = allocset(p)) == NULL)
+ return;
+ CHadd(p, cs, ch);
+ EMIT(OANYOF, (int)(cs - p->g->sets));
}
}
/*
- nonnewline - emit REG_NEWLINE version of OANY
- == static void nonnewline(register struct parse *p);
+ == static void nonnewline(struct parse *p);
*
* Boy, is this implementation ever a kludge...
*/
static void
-nonnewline(p)
-register struct parse *p;
+nonnewline(struct parse *p)
{
- register char *oldnext = p->next;
- register char *oldend = p->end;
+ char *oldnext = p->next;
+ char *oldend = p->end;
char bracket[4];
p->next = bracket;
@@ -905,21 +1053,20 @@ register struct parse *p;
/*
- repeat - generate code for a bounded repetition, recursively if needed
- == static void repeat(register struct parse *p, sopno start, int from, int to);
+ == static void repeat(struct parse *p, sopno start, int from, int to);
*/
static void
-repeat(p, start, from, to)
-register struct parse *p;
-sopno start; /* operand from here to end of strip */
-int from; /* repeated from this number */
-int to; /* to this number of times (maybe INFINITY) */
+repeat(struct parse *p,
+ sopno start, /* operand from here to end of strip */
+ int from, /* repeated from this number */
+ int to) /* to this number of times (maybe INFINITY) */
{
- register sopno finish = HERE();
+ sopno finish = HERE();
# define N 2
# define INF 3
# define REP(f, t) ((f)*8 + (t))
# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
- register sopno copy;
+ sopno copy;
if (p->error != 0) /* head off possible runaway recursion */
return;
@@ -976,13 +1123,35 @@ int to; /* to this number of times (maybe INFINITY) */
}
/*
+ - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
+ - character from the parse struct, signals a REG_ILLSEQ error if the
+ - character can't be converted. Returns the number of bytes consumed.
+ */
+static wint_t
+wgetnext(struct parse *p)
+{
+ mbstate_t mbs;
+ wchar_t wc;
+ size_t n;
+
+ memset(&mbs, 0, sizeof(mbs));
+ n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
+ if (n == (size_t)-1 || n == (size_t)-2) {
+ SETERROR(REG_ILLSEQ);
+ return (0);
+ }
+ if (n == 0)
+ n = 1;
+ p->next += n;
+ return (wc);
+}
+
+/*
- seterr - set an error condition
- == static int seterr(register struct parse *p, int e);
+ == static int seterr(struct parse *p, int e);
*/
static int /* useless but makes type checking happy */
-seterr(p, e)
-register struct parse *p;
-int e;
+seterr(struct parse *p, int e)
{
if (p->error == 0) /* keep earliest error condition */
p->error = e;
@@ -993,295 +1162,150 @@ int e;
/*
- allocset - allocate a set of characters for []
- == static cset *allocset(register struct parse *p);
+ == static cset *allocset(struct parse *p);
*/
static cset *
-allocset(p)
-register struct parse *p;
+allocset(struct parse *p)
{
- register int no = p->g->ncsets++;
- register size_t nc;
- register size_t nbytes;
- register cset *cs;
- register size_t css = (size_t)p->g->csetsize;
- register int i;
-
- if (no >= p->ncsalloc) { /* need another column of space */
- p->ncsalloc += CHAR_BIT;
- nc = p->ncsalloc;
- assert(nc % CHAR_BIT == 0);
- nbytes = nc / CHAR_BIT * css;
- if (p->g->sets == NULL)
- p->g->sets = (cset *)malloc(nc * sizeof(cset));
- else
- p->g->sets = (cset *)realloc((char *)p->g->sets,
- nc * sizeof(cset));
- if (p->g->setbits == NULL)
- p->g->setbits = (uch *)malloc(nbytes);
- else {
- p->g->setbits = (uch *)realloc((char *)p->g->setbits,
- nbytes);
- /* xxx this isn't right if setbits is now NULL */
- for (i = 0; i < no; i++)
- p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
- }
- if (p->g->sets != NULL && p->g->setbits != NULL)
- (void) memset((char *)p->g->setbits + (nbytes - css),
- 0, css);
- else {
- no = 0;
- SETERROR(REG_ESPACE);
- /* caller's responsibility not to do set ops */
- }
- }
+ cset *cs, *ncs;
- assert(p->g->sets != NULL); /* xxx */
- cs = &p->g->sets[no];
- cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
- cs->mask = 1 << ((no) % CHAR_BIT);
- cs->hash = 0;
- cs->smultis = 0;
- cs->multis = NULL;
+ ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
+ if (ncs == NULL) {
+ SETERROR(REG_ESPACE);
+ return (NULL);
+ }
+ p->g->sets = ncs;
+ cs = &p->g->sets[p->g->ncsets++];
+ memset(cs, 0, sizeof(*cs));
return(cs);
}
/*
- freeset - free a now-unused set
- == static void freeset(register struct parse *p, register cset *cs);
+ == static void freeset(struct parse *p, cset *cs);
*/
static void
-freeset(p, cs)
-register struct parse *p;
-register cset *cs;
+freeset(struct parse *p, cset *cs)
{
- register size_t i;
- register cset *top = &p->g->sets[p->g->ncsets];
- register size_t css = (size_t)p->g->csetsize;
+ cset *top = &p->g->sets[p->g->ncsets];
- for (i = 0; i < css; i++)
- CHsub(cs, i);
+ free(cs->wides);
+ free(cs->ranges);
+ free(cs->types);
+ memset(cs, 0, sizeof(*cs));
if (cs == top-1) /* recover only the easy case */
p->g->ncsets--;
}
/*
- - freezeset - final processing on a set of characters
- == static int freezeset(register struct parse *p, register cset *cs);
- *
- * The main task here is merging identical sets. This is usually a waste
- * of time (although the hash code minimizes the overhead), but can win
- * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
- * is done using addition rather than xor -- all ASCII [aA] sets xor to
- * the same value!
- */
-static int /* set number */
-freezeset(p, cs)
-register struct parse *p;
-register cset *cs;
-{
- register uch h = cs->hash;
- register size_t i;
- register cset *top = &p->g->sets[p->g->ncsets];
- register cset *cs2;
- register size_t css = (size_t)p->g->csetsize;
-
- /* look for an earlier one which is the same */
- for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
- if (cs2->hash == h && cs2 != cs) {
- /* maybe */
- for (i = 0; i < css; i++)
- if (!!CHIN(cs2, i) != !!CHIN(cs, i))
- break; /* no */
- if (i == css)
- break; /* yes */
- }
-
- if (cs2 < top) { /* found one */
- freeset(p, cs);
- cs = cs2;
- }
-
- return((int)(cs - p->g->sets));
-}
-
-/*
- - firstch - return first character in a set (which must have at least one)
- == static int firstch(register struct parse *p, register cset *cs);
- */
-static int /* character; there is no "none" value */
-firstch(p, cs)
-register struct parse *p;
-register cset *cs;
-{
- register size_t i;
- register size_t css = (size_t)p->g->csetsize;
-
- for (i = 0; i < css; i++)
- if (CHIN(cs, i))
- return((char)i);
- assert(never);
- return(0); /* arbitrary */
-}
-
-/*
- - nch - number of characters in a set
- == static int nch(register struct parse *p, register cset *cs);
+ - singleton - Determine whether a set contains only one character,
+ - returning it if so, otherwise returning OUT.
*/
-static int
-nch(p, cs)
-register struct parse *p;
-register cset *cs;
+static wint_t
+singleton(cset *cs)
{
- register size_t i;
- register size_t css = (size_t)p->g->csetsize;
- register int n = 0;
+ wint_t i, s, n;
- for (i = 0; i < css; i++)
- if (CHIN(cs, i))
+ for (i = n = 0; i < NC; i++)
+ if (CHIN(cs, i)) {
n++;
- return(n);
+ s = i;
+ }
+ if (n == 1)
+ return (s);
+ if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
+ cs->icase == 0)
+ return (cs->wides[0]);
+ /* Don't bother handling the other cases. */
+ return (OUT);
}
/*
- - mcadd - add a collating element to a cset
- == static void mcadd(register struct parse *p, register cset *cs, \
- == register char *cp);
+ - CHadd - add character to character set.
*/
static void
-mcadd(p, cs, cp)
-register struct parse *p;
-register cset *cs;
-register const char *cp;
+CHadd(struct parse *p, cset *cs, wint_t ch)
{
- register size_t oldend = cs->smultis;
-
- cs->smultis += strlen(cp) + 1;
- if (cs->multis == NULL)
- cs->multis = malloc(cs->smultis);
- else
- cs->multis = realloc(cs->multis, cs->smultis);
- if (cs->multis == NULL) {
- SETERROR(REG_ESPACE);
- return;
+ wint_t nch, *newwides;
+ assert(ch >= 0);
+ if (ch < NC)
+ cs->bmp[ch >> 3] |= 1 << (ch & 7);
+ else {
+ newwides = realloc(cs->wides, (cs->nwides + 1) *
+ sizeof(*cs->wides));
+ if (newwides == NULL) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ cs->wides = newwides;
+ cs->wides[cs->nwides++] = ch;
+ }
+ if (cs->icase) {
+ if ((nch = towlower(ch)) < NC)
+ cs->bmp[nch >> 3] |= 1 << (nch & 7);
+ if ((nch = towupper(ch)) < NC)
+ cs->bmp[nch >> 3] |= 1 << (nch & 7);
}
-
- (void) strcpy(cs->multis + oldend - 1, cp);
- cs->multis[cs->smultis - 1] = '\0';
-}
-
-/*
- - mcinvert - invert the list of collating elements in a cset
- == static void mcinvert(register struct parse *p, register cset *cs);
- *
- * This would have to know the set of possibilities. Implementation
- * is deferred.
- */
-static void
-mcinvert(p, cs)
-register struct parse *p;
-register cset *cs;
-{
- assert(cs->multis == NULL); /* xxx */
}
/*
- - mccase - add case counterparts of the list of collating elements in a cset
- == static void mccase(register struct parse *p, register cset *cs);
- *
- * This would have to know the set of possibilities. Implementation
- * is deferred.
+ - CHaddrange - add all characters in the range [min,max] to a character set.
*/
static void
-mccase(p, cs)
-register struct parse *p;
-register cset *cs;
-{
- assert(cs->multis == NULL); /* xxx */
-}
-
-/*
- - isinsets - is this character in any sets?
- == static int isinsets(register struct re_guts *g, int c);
- */
-static int /* predicate */
-isinsets(g, c)
-register struct re_guts *g;
-int c;
+CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
{
- register uch *col;
- register int i;
- register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
- register unsigned uc = (unsigned char)c;
-
- for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
- if (col[uc] != 0)
- return(1);
- return(0);
-}
+ crange *newranges;
-/*
- - samesets - are these two characters in exactly the same sets?
- == static int samesets(register struct re_guts *g, int c1, int c2);
- */
-static int /* predicate */
-samesets(g, c1, c2)
-register struct re_guts *g;
-int c1;
-int c2;
-{
- register uch *col;
- register int i;
- register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
- register unsigned uc1 = (unsigned char)c1;
- register unsigned uc2 = (unsigned char)c2;
-
- for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
- if (col[uc1] != col[uc2])
- return(0);
- return(1);
+ for (; min < NC && min <= max; min++)
+ CHadd(p, cs, min);
+ if (min >= max)
+ return;
+ newranges = realloc(cs->ranges, (cs->nranges + 1) *
+ sizeof(*cs->ranges));
+ if (newranges == NULL) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ cs->ranges = newranges;
+ cs->ranges[cs->nranges].min = min;
+ cs->ranges[cs->nranges].min = max;
+ cs->nranges++;
}
/*
- - categorize - sort out character categories
- == static void categorize(struct parse *p, register struct re_guts *g);
+ - CHaddtype - add all characters of a certain type to a character set.
*/
static void
-categorize(p, g)
-struct parse *p;
-register struct re_guts *g;
+CHaddtype(struct parse *p, cset *cs, wctype_t wct)
{
- register cat_t *cats = g->categories;
- register int c;
- register int c2;
- register cat_t cat;
-
- /* avoid making error situations worse */
- if (p->error != 0)
+ wint_t i;
+ wctype_t *newtypes;
+
+ for (i = 0; i < NC; i++)
+ if (iswctype(i, wct))
+ CHadd(p, cs, i);
+ newtypes = realloc(cs->types, (cs->ntypes + 1) *
+ sizeof(*cs->types));
+ if (newtypes == NULL) {
+ SETERROR(REG_ESPACE);
return;
-
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (cats[c] == 0 && isinsets(g, c)) {
- cat = g->ncategories++;
- cats[c] = cat;
- for (c2 = c+1; c2 <= CHAR_MAX; c2++)
- if (cats[c2] == 0 && samesets(g, c, c2))
- cats[c2] = cat;
- }
+ }
+ cs->types = newtypes;
+ cs->types[cs->ntypes++] = wct;
}
/*
- dupl - emit a duplicate of a bunch of sops
- == static sopno dupl(register struct parse *p, sopno start, sopno finish);
+ == static sopno dupl(struct parse *p, sopno start, sopno finish);
*/
static sopno /* start of duplicate */
-dupl(p, start, finish)
-register struct parse *p;
-sopno start; /* from here */
-sopno finish; /* to this less one */
+dupl(struct parse *p,
+ sopno start, /* from here */
+ sopno finish) /* to this less one */
{
- register sopno ret = HERE();
- register sopno len = finish - start;
+ sopno ret = HERE();
+ sopno len = finish - start;
assert(finish >= start);
if (len == 0)
@@ -1296,17 +1320,14 @@ sopno finish; /* to this less one */
/*
- doemit - emit a strip operator
- == static void doemit(register struct parse *p, sop op, size_t opnd);
+ == static void doemit(struct parse *p, sop op, size_t opnd);
*
* It might seem better to implement this as a macro with a function as
* hard-case backup, but it's just too big and messy unless there are
* some changes to the data structures. Maybe later.
*/
static void
-doemit(p, op, opnd)
-register struct parse *p;
-sop op;
-size_t opnd;
+doemit(struct parse *p, sop op, size_t opnd)
{
/* avoid making error situations worse */
if (p->error != 0)
@@ -1326,18 +1347,14 @@ size_t opnd;
/*
- doinsert - insert a sop into the strip
- == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
+ == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
*/
static void
-doinsert(p, op, opnd, pos)
-register struct parse *p;
-sop op;
-size_t opnd;
-sopno pos;
+doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
{
- register sopno sn;
- register sop s;
- register int i;
+ sopno sn;
+ sop s;
+ int i;
/* avoid making error situations worse */
if (p->error != 0)
@@ -1366,13 +1383,10 @@ sopno pos;
/*
- dofwd - complete a forward reference
- == static void dofwd(register struct parse *p, sopno pos, sop value);
+ == static void dofwd(struct parse *p, sopno pos, sop value);
*/
static void
-dofwd(p, pos, value)
-register struct parse *p;
-register sopno pos;
-sop value;
+dofwd(struct parse *p, sopno pos, sop value)
{
/* avoid making error situations worse */
if (p->error != 0)
@@ -1384,14 +1398,12 @@ sop value;
/*
- enlarge - enlarge the strip
- == static void enlarge(register struct parse *p, sopno size);
+ == static void enlarge(struct parse *p, sopno size);
*/
static void
-enlarge(p, size)
-register struct parse *p;
-register sopno size;
+enlarge(struct parse *p, sopno size)
{
- register sop *sp;
+ sop *sp;
if (p->ssize >= size)
return;
@@ -1407,12 +1419,10 @@ register sopno size;
/*
- stripsnug - compact the strip
- == static void stripsnug(register struct parse *p, register struct re_guts *g);
+ == static void stripsnug(struct parse *p, struct re_guts *g);
*/
static void
-stripsnug(p, g)
-register struct parse *p;
-register struct re_guts *g;
+stripsnug(struct parse *p, struct re_guts *g)
{
g->nstates = p->slen;
g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
@@ -1424,7 +1434,7 @@ register struct re_guts *g;
/*
- findmust - fill in must and mlen with longest mandatory literal string
- == static void findmust(register struct parse *p, register struct re_guts *g);
+ == static void findmust(struct parse *p, struct re_guts *g);
*
* This algorithm could do fancy things like analyzing the operands of |
* for common subsequences. Someday. This code is simple and finds most
@@ -1433,32 +1443,53 @@ register struct re_guts *g;
* Note that must and mlen got initialized during setup.
*/
static void
-findmust(p, g)
-struct parse *p;
-register struct re_guts *g;
+findmust(struct parse *p, struct re_guts *g)
{
- register sop *scan;
- sop *start = NULL;
- register sop *newstart = NULL;
- register sopno newlen;
- register sop s;
- register char *cp;
- register sopno i;
+ sop *scan;
+ sop *start;
+ sop *newstart;
+ sopno newlen;
+ sop s;
+ char *cp;
+ int offset;
+ char buf[MB_LEN_MAX];
+ size_t clen;
+ mbstate_t mbs;
/* avoid making error situations worse */
if (p->error != 0)
return;
+ /*
+ * It's not generally safe to do a ``char'' substring search on
+ * multibyte character strings, but it's safe for at least
+ * UTF-8 (see RFC 3629).
+ */
+ if (MB_CUR_MAX > 1 &&
+#ifdef __CYGWIN__
+ strcmp(collate_charset, "UTF-8") != 0)
+#else
+ strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
+#endif
+ return;
+
/* find the longest OCHAR sequence in strip */
newlen = 0;
+ offset = 0;
+ g->moffset = 0;
scan = g->strip + 1;
do {
s = *scan++;
switch (OP(s)) {
case OCHAR: /* sequence member */
- if (newlen == 0) /* new sequence */
+ if (newlen == 0) { /* new sequence */
+ memset(&mbs, 0, sizeof(mbs));
newstart = scan - 1;
- newlen++;
+ }
+ clen = wcrtomb(buf, OPND(s), &mbs);
+ if (clen == (size_t)-1)
+ goto toohard;
+ newlen += clen;
break;
case OPLUS_: /* things that don't break one */
case OLPAREN:
@@ -1466,6 +1497,7 @@ register struct re_guts *g;
break;
case OQUEST_: /* things that must be skipped */
case OCH_:
+ offset = altoffset(scan, offset);
scan--;
do {
scan += OPND(s);
@@ -1477,51 +1509,317 @@ register struct re_guts *g;
return;
}
} while (OP(s) != O_QUEST && OP(s) != O_CH);
- /* fallthrough */
- default: /* things that break a sequence */
+ /* FALLTHROUGH */
+ case OBOW: /* things that break a sequence */
+ case OEOW:
+ case OBOL:
+ case OEOL:
+ case O_QUEST:
+ case O_CH:
+ case OEND:
+ if (newlen > g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
+ }
+ newlen = 0;
+ break;
+ case OANY:
if (newlen > g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
}
+ if (offset > -1)
+ offset++;
+ newlen = 0;
+ break;
+ case OANYOF: /* may or may not invalidate offset */
+ /* First, everything as OANY */
+ if (newlen > g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1) {
+ g->moffset += offset;
+ offset = newlen;
+ } else
+ g->moffset = offset;
+ } else {
+ if (offset > -1)
+ offset += newlen;
+ }
+ if (offset > -1)
+ offset++;
+ newlen = 0;
+ break;
+ toohard:
+ default:
+ /* Anything here makes it impossible or too hard
+ * to calculate the offset -- so we give up;
+ * save the last known good offset, in case the
+ * must sequence doesn't occur later.
+ */
+ if (newlen > g->mlen) { /* ends one */
+ start = newstart;
+ g->mlen = newlen;
+ if (offset > -1)
+ g->moffset += offset;
+ else
+ g->moffset = offset;
+ }
+ offset = -1;
newlen = 0;
break;
}
} while (OP(s) != OEND);
- if (g->mlen == 0) /* there isn't one */
+ if (g->mlen == 0) { /* there isn't one */
+ g->moffset = -1;
return;
+ }
/* turn it into a character string */
g->must = malloc((size_t)g->mlen + 1);
if (g->must == NULL) { /* argh; just forget it */
g->mlen = 0;
+ g->moffset = -1;
return;
}
cp = g->must;
scan = start;
- for (i = g->mlen; i > 0; i--) {
+ memset(&mbs, 0, sizeof(mbs));
+ while (cp < g->must + g->mlen) {
while (OP(s = *scan++) != OCHAR)
continue;
- assert(cp < g->must + g->mlen);
- *cp++ = (char)OPND(s);
+ clen = wcrtomb(cp, OPND(s), &mbs);
+ assert(clen != (size_t)-1);
+ cp += clen;
}
assert(cp == g->must + g->mlen);
*cp++ = '\0'; /* just on general principles */
}
/*
+ - altoffset - choose biggest offset among multiple choices
+ == static int altoffset(sop *scan, int offset);
+ *
+ * Compute, recursively if necessary, the largest offset among multiple
+ * re paths.
+ */
+static int
+altoffset(sop *scan, int offset)
+{
+ int largest;
+ int try;
+ sop s;
+
+ /* If we gave up already on offsets, return */
+ if (offset == -1)
+ return -1;
+
+ largest = 0;
+ try = 0;
+ s = *scan++;
+ while (OP(s) != O_QUEST && OP(s) != O_CH) {
+ switch (OP(s)) {
+ case OOR1:
+ if (try > largest)
+ largest = try;
+ try = 0;
+ break;
+ case OQUEST_:
+ case OCH_:
+ try = altoffset(scan, try);
+ if (try == -1)
+ return -1;
+ scan--;
+ do {
+ scan += OPND(s);
+ s = *scan;
+ if (OP(s) != O_QUEST && OP(s) != O_CH &&
+ OP(s) != OOR2)
+ return -1;
+ } while (OP(s) != O_QUEST && OP(s) != O_CH);
+ /* We must skip to the next position, or we'll
+ * leave altoffset() too early.
+ */
+ scan++;
+ break;
+ case OANYOF:
+ case OCHAR:
+ case OANY:
+ try++;
+ case OBOW:
+ case OEOW:
+ case OLPAREN:
+ case ORPAREN:
+ case OOR2:
+ break;
+ default:
+ try = -1;
+ break;
+ }
+ if (try == -1)
+ return -1;
+ s = *scan++;
+ }
+
+ if (try > largest)
+ largest = try;
+
+ return largest+offset;
+}
+
+/*
+ - computejumps - compute char jumps for BM scan
+ == static void computejumps(struct parse *p, struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and is has size greater than
+ * zero. It's based on the algorithm found on Computer Algorithms by
+ * Sara Baase.
+ *
+ * A char jump is the number of characters one needs to jump based on
+ * the value of the character from the text that was mismatched.
+ */
+static void
+computejumps(struct parse *p, struct re_guts *g)
+{
+ int ch;
+ int mindex;
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ g->charjump = (int*) malloc((NC + 1) * sizeof(int));
+ if (g->charjump == NULL) /* Not a fatal error */
+ return;
+ /* Adjust for signed chars, if necessary */
+ g->charjump = &g->charjump[-(CHAR_MIN)];
+
+ /* If the character does not exist in the pattern, the jump
+ * is equal to the number of characters in the pattern.
+ */
+ for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
+ g->charjump[ch] = g->mlen;
+
+ /* If the character does exist, compute the jump that would
+ * take us to the last character in the pattern equal to it
+ * (notice that we match right to left, so that last character
+ * is the first one that would be matched).
+ */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
+}
+
+/*
+ - computematchjumps - compute match jumps for BM scan
+ == static void computematchjumps(struct parse *p, struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and is has size greater than
+ * zero. It's based on the algorithm found on Computer Algorithms by
+ * Sara Baase.
+ *
+ * A match jump is the number of characters one needs to advance based
+ * on the already-matched suffix.
+ * Notice that all values here are minus (g->mlen-1), because of the way
+ * the search algorithm works.
+ */
+static void
+computematchjumps(struct parse *p, struct re_guts *g)
+{
+ int mindex; /* General "must" iterator */
+ int suffix; /* Keeps track of matching suffix */
+ int ssuffix; /* Keeps track of suffixes' suffix */
+ int* pmatches; /* pmatches[k] points to the next i
+ * such that i+1...mlen is a substring
+ * of k+1...k+mlen-i-1
+ */
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
+ if (pmatches == NULL) {
+ g->matchjump = NULL;
+ return;
+ }
+
+ g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
+ if (g->matchjump == NULL) /* Not a fatal error */
+ return;
+
+ /* Set maximum possible jump for each character in the pattern */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->matchjump[mindex] = 2*g->mlen - mindex - 1;
+
+ /* Compute pmatches[] */
+ for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
+ mindex--, suffix--) {
+ pmatches[mindex] = suffix;
+
+ /* If a mismatch is found, interrupting the substring,
+ * compute the matchjump for that position. If no
+ * mismatch is found, then a text substring mismatched
+ * against the suffix will also mismatch against the
+ * substring.
+ */
+ while (suffix < g->mlen
+ && g->must[mindex] != g->must[suffix]) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen - mindex - 1);
+ suffix = pmatches[suffix];
+ }
+ }
+
+ /* Compute the matchjump up to the last substring found to jump
+ * to the beginning of the largest must pattern prefix matching
+ * it's own suffix.
+ */
+ for (mindex = 0; mindex <= suffix; mindex++)
+ g->matchjump[mindex] = MIN(g->matchjump[mindex],
+ g->mlen + suffix - mindex);
+
+ ssuffix = pmatches[suffix];
+ while (suffix < g->mlen) {
+ while (suffix <= ssuffix && suffix < g->mlen) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen + ssuffix - suffix);
+ suffix++;
+ }
+ if (suffix < g->mlen)
+ ssuffix = pmatches[ssuffix];
+ }
+
+ free(pmatches);
+}
+
+/*
- pluscount - count + nesting
- == static sopno pluscount(register struct parse *p, register struct re_guts *g);
+ == static sopno pluscount(struct parse *p, struct re_guts *g);
*/
static sopno /* nesting depth */
-pluscount(p, g)
-struct parse *p;
-register struct re_guts *g;
+pluscount(struct parse *p, struct re_guts *g)
{
- register sop *scan;
- register sop s;
- register sopno plusnest = 0;
- register sopno maxnest = 0;
+ sop *scan;
+ sop s;
+ sopno plusnest = 0;
+ sopno maxnest = 0;
if (p->error != 0)
return(0); /* there may not be an OEND */