From 44caccfca243364ea0282a8711ad788e3bc703dc Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Thu, 11 Feb 2010 21:19:19 +0000 Subject: * regex/engine.c (step): Drop Cygwin-specific definition. (NONCHAR): Better cast here to make the test work. Move comment from step here. (matcher): Disable skipping initial string in multibyte case. * regex/regcomp.c (p_bracket): Don't simplify singleton in the invert case. (p_b_term): Handle early end of pattern after dash in bracket expression. (singleton): Don't ignore the wides just because there's already a singleton in the single byte chars. Fix condition for a singleton wide accordingly. (findmust): Check for LC_CTYPE charset, rather than LC_COLLATE charset. * regex2.h (CHIN): Fix condition in the icase & invert case. (ISWORD): Fix wrong cast to unsigned char. --- winsup/cygwin/ChangeLog | 17 +++++++++++++++++ winsup/cygwin/regex/engine.c | 21 ++++++++------------- winsup/cygwin/regex/regcomp.c | 12 ++++++++---- winsup/cygwin/regex/regex2.h | 14 +++++++++----- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog index 39e72d4a7..7f67f4181 100644 --- a/winsup/cygwin/ChangeLog +++ b/winsup/cygwin/ChangeLog @@ -1,3 +1,20 @@ +2010-02-11 Corinna Vinschen + + * regex/engine.c (step): Drop Cygwin-specific definition. + (NONCHAR): Better cast here to make the test work. Move comment + from step here. + (matcher): Disable skipping initial string in multibyte case. + * regex/regcomp.c (p_bracket): Don't simplify singleton in the invert + case. + (p_b_term): Handle early end of pattern after dash in bracket + expression. + (singleton): Don't ignore the wides just because there's already a + singleton in the single byte chars. Fix condition for a singleton + wide accordingly. + (findmust): Check for LC_CTYPE charset, rather than LC_COLLATE charset. + * regex2.h (CHIN): Fix condition in the icase & invert case. + (ISWORD): Fix wrong cast to unsigned char. 
+ 2010-02-11 Andy Koppe * nlsfuncs.cc (initial_setlocale): Move check whether charset has diff --git a/winsup/cygwin/regex/engine.c b/winsup/cygwin/regex/engine.c index a517a67ee..4afaf8d9a 100644 --- a/winsup/cygwin/regex/engine.c +++ b/winsup/cygwin/regex/engine.c @@ -106,11 +106,7 @@ static const char *dissect(struct match *m, const char *start, const char *stop, static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); -#ifdef __CYGWIN__ -static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); -#else static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); -#endif #define MAX_RECURSION 100 #define BOL (OUT-1) #define EOL (BOL-1) @@ -119,7 +115,10 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_ #define BOW (BOL-4) #define EOW (BOL-5) #define BADCHAR (BOL-6) -#define NONCHAR(c) ((c) <= OUT) +/* When using wint_t, which is defined as unsigned int on BSD, + as well as on Cygwin or Linux, the NONCHAR test is broken without + the below cast. I'm wondering how this is supposed to work at all... */ +#define NONCHAR(c) ((int)(c) <= OUT) #ifdef REDEBUG static void print(struct match *m, const char *caption, states st, int ch, FILE *d); #endif @@ -248,9 +247,12 @@ matcher(struct re_guts *g, ZAPSTATE(&m->mbs); /* Adjust start according to moffset, to speed things up */ +#ifndef MNAMES + /* The code evaluating moffset doesn't seem to work right + in the multibyte case. */ if (g->moffset > -1) start = ((dp - g->moffset) < start) ? 
start : dp - g->moffset; - +#endif SP("mloop", m->st, *start); /* this loop does only one repetition except for backrefs */ @@ -993,14 +995,7 @@ step(struct re_guts *g, sopno start, /* start state within strip */ sopno stop, /* state after stop state within strip */ states bef, /* states reachable before */ -#ifdef __CYGWIN__ - /* When using wint_t, which is defined as unsigned int on BSD, - as well as on Cygwin or Linux, the NONCHAR test is broken. - I'm wondering how this is supposed to work at all... */ - int ch, /* character or NONCHAR code */ -#else wint_t ch, /* character or NONCHAR code */ -#endif states aft) /* states already known reachable after */ { cset *cs; diff --git a/winsup/cygwin/regex/regcomp.c b/winsup/cygwin/regex/regcomp.c index 721982ab1..a7a48e023 100644 --- a/winsup/cygwin/regex/regcomp.c +++ b/winsup/cygwin/regex/regcomp.c @@ -762,7 +762,8 @@ p_bracket(struct parse *p) if (cs->invert && p->g->cflags&REG_NEWLINE) cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); - if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ + if ((ch = singleton(cs)) != OUT /* optimize singleton sets */ + && cs->invert == 0) { /* But not in invert case. */ ordinary(p, ch); freeset(p, cs); } else @@ -833,6 +834,9 @@ p_b_term(struct parse *p, cset *cs) finish = '-'; else finish = p_b_symbol(p); + } else if (SEE('-') && !MORE2()) { + SETERROR(REG_EBRACK); + return; } else finish = start; if (start == finish) @@ -1212,9 +1216,9 @@ singleton(cset *cs) n++; s = i; } - if (n == 1) + if (n == 1 && cs->nwides == 0) return (s); - if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && + if (n == 0 && cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && cs->icase == 0) return (cs->wides[0]); /* Don't bother handling the other cases. 
*/ @@ -1467,7 +1471,7 @@ findmust(struct parse *p, struct re_guts *g) */ if (MB_CUR_MAX > 1 && #ifdef __CYGWIN__ - strcmp(collate_charset, "UTF-8") != 0) + strcmp(__locale_charset (), "UTF-8") != 0) #else strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) #endif diff --git a/winsup/cygwin/regex/regex2.h b/winsup/cygwin/regex/regex2.h index 13bbf64a7..53f687bf6 100644 --- a/winsup/cygwin/regex/regex2.h +++ b/winsup/cygwin/regex/regex2.h @@ -151,10 +151,14 @@ CHIN(cset *cs, wint_t ch) if (ch < NC) return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ cs->invert); - else if (cs->icase) - return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || - CHIN1(cs, towupper(ch))); - else + else if (cs->icase) { + if (cs->invert) + return (CHIN1(cs, ch) && CHIN1(cs, towlower(ch)) && + CHIN1(cs, towupper(ch))); + else + return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || + CHIN1(cs, towupper(ch))); + } else return (CHIN1(cs, ch)); } @@ -189,4 +193,4 @@ struct re_guts { /* misc utilities */ #define OUT (CHAR_MIN - 1) /* a non-character value */ -#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_') +#define ISWORD(c) (iswalnum((wint_t)(c)) || (c) == '_') -- cgit v1.2.3