From 7bd2296c8384f80585ace60b9d67cff4cab2cc6f Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Fri, 12 Feb 2010 17:46:39 +0000 Subject: * regex/regcomp.c (xwcrtomb): New function to convert wide chars outside of the base plane to UTF-8. Call throughout instead of wcrtomb. (wgetnext): Handle surrogate pairs on UTF-16 systems. * regex/regexec.c (xmbrtowc): Ditto. --- winsup/cygwin/ChangeLog | 8 ++++++++ winsup/cygwin/regex/regcomp.c | 41 +++++++++++++++++++++++++++++++++++++---- winsup/cygwin/regex/regexec.c | 18 +++++++++++++++++- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog index 955e256b3..3a841c876 100644 --- a/winsup/cygwin/ChangeLog +++ b/winsup/cygwin/ChangeLog @@ -1,3 +1,11 @@ +2010-02-12 Corinna Vinschen + + * regex/regcomp.c (xwcrtomb): New function to convert wide chars + outside of the base plane to UTF-8. Call throughout instead of + wcrtomb. + (wgetnext): Handle surrogate pairs on UTF-16 systems. + * regex/regexec.c (xmbrtowc): Ditto. + 2010-02-12 Corinna Vinschen * sec_auth.cc (get_user_local_groups): Retrieve name of well known diff --git a/winsup/cygwin/regex/regcomp.c b/winsup/cygwin/regex/regcomp.c index a7a48e023..c5e68a2b5 100644 --- a/winsup/cygwin/regex/regcomp.c +++ b/winsup/cygwin/regex/regcomp.c @@ -140,6 +140,7 @@ static void computejumps(struct parse *p, struct re_guts *g); static void computematchjumps(struct parse *p, struct re_guts *g); static sopno pluscount(struct parse *p, struct re_guts *g); static wint_t wgetnext(struct parse *p); +static size_t xwcrtomb (char *s, wint_t wc, mbstate_t *ps); #ifdef __cplusplus } @@ -994,7 +995,7 @@ bothcases(struct parse *p, wint_t ch) assert(othercase(ch) != ch); /* p_bracket() would recurse */ p->next = bracket; memset(&mbs, 0, sizeof(mbs)); - n = wcrtomb(bracket, ch, &mbs); + n = xwcrtomb(bracket, ch, &mbs); assert(n != (size_t)-1); bracket[n] = ']'; bracket[n + 1] = '\0'; @@ -1136,6 +1137,7 @@ wgetnext(struct parse *p) { mbstate_t mbs; wchar_t wc; + wint_t ret; size_t n; memset(&mbs, 0, sizeof(mbs)); @@ -1144,12 +1146,43 @@ wgetnext(struct parse *p) SETERROR(REG_ILLSEQ); return (0); } + ret = wc; if (n == 0) n = 1; + else if (sizeof (wchar_t) == 2 && wc >= 0xd800 && wc <= 0xdbff) { + /* UTF-16 surrogate pair. Fetch second half and + compute UTF-32 value */ + int n2 = mbrtowc(&wc, p->next + n, p->end - p->next - n, &mbs); + if (n2 == 0 || n2 == (size_t)-1 || n2 == (size_t)-2) { + SETERROR(REG_ILLSEQ); + return (0); + } + ret = (((ret & 0x3ff) << 10) | (wc & 0x3ff)) + + 0x10000; + n += n2; + } p->next += n; - return (wc); + return (ret); } +static size_t +xwcrtomb (char *s, wint_t wc, mbstate_t *ps) +{ + if (sizeof (wchar_t) == 2 && wc >= 0x10000) + { + /* UTF-16 systems can't handle these values directly. Since the + rest of the code isn't surrogate pair aware, we handle this here, + invisible for the rest of the code. */ + *s++ = 0xf0 | ((wc & 0x1c0000) >> 18); + *s++ = 0x80 | ((wc & 0x3f000) >> 12); + *s++ = 0x80 | ((wc & 0xfc0) >> 6); + *s = 0x80 | (wc & 0x3f); + return 4; + } + return wcrtomb (s, wc, ps); +} + + /* - seterr - set an error condition == static int seterr(struct parse *p, int e); @@ -1490,7 +1523,7 @@ findmust(struct parse *p, struct re_guts *g) memset(&mbs, 0, sizeof(mbs)); newstart = scan - 1; } - clen = wcrtomb(buf, OPND(s), &mbs); + clen = xwcrtomb(buf, OPND(s), &mbs); if (clen == (size_t)-1) goto toohard; newlen += clen; @@ -1609,7 +1642,7 @@ findmust(struct parse *p, struct re_guts *g) while (cp < g->must + g->mlen) { while (OP(s = *scan++) != OCHAR) continue; - clen = wcrtomb(cp, OPND(s), &mbs); + clen = xwcrtomb(cp, OPND(s), &mbs); assert(clen != (size_t)-1); cp += clen; } diff --git a/winsup/cygwin/regex/regexec.c b/winsup/cygwin/regex/regexec.c index 6195e508c..788ef5eeb 100644 --- a/winsup/cygwin/regex/regexec.c +++ b/winsup/cygwin/regex/regexec.c @@ -84,8 +84,24 @@ xmbrtowc(wint_t *wi, const char *s, size_t n, mbstate_t *mbs, wint_t dummy) if (wi != NULL) *wi = dummy; return (1); - } else + } else { + if (sizeof (wchar_t) == 2 && wc >= 0xd800 && wc <= 0xdbff) { + /* UTF-16 surrogate pair. Fetch second half and + compute UTF-32 value */ + int n2 = mbrtowc(&wc, s + nr, n - nr, mbs); + if (n2 == 0 || n2 == (size_t)-1 || n2 == (size_t)-2) { + memset(mbs, 0, sizeof(*mbs)); + if (wi != NULL) + *wi = dummy; + return (1); + } + if (wi != NULL) + *wi = (((*wi & 0x3ff) << 10) | (wc & 0x3ff)) + + 0x10000; + nr += n2; + } return (nr); + } } static __inline size_t -- cgit v1.2.3