diff options
author | Corinna Vinschen <corinna@vinschen.de> | 2023-02-14 14:20:20 +0300 |
---|---|---|
committer | Corinna Vinschen <corinna@vinschen.de> | 2023-02-14 14:20:20 +0300 |
commit | 60c25da90d015f27c5697c6db7ab0557585d09aa (patch) | |
tree | 4e07d2789ee4c7424951e382a40620fb8ee082aa /winsup/cygwin/strfuncs.cc | |
parent | 210eca1b31090d4c93c22a3152f1faa795dfd775 (diff) |
Cygwin: mbrtowi: define replacement for mbrtowc, returning UTF-32 value
Given that UTF-16 cannot hold every Unicode character in a single
wchar_t, we need a function returning a wint_t value representing
a UTF-32 value for comparison functions. Fortunately the important
wide character functions like towupper/towlower, isw<class>, iswctype,
etc, already take wint_t values and newlib handles them as UTF-32.
If only we had switched wchar_t to 32 bit way back when... sigh.
Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
Diffstat (limited to 'winsup/cygwin/strfuncs.cc')
-rw-r--r-- | winsup/cygwin/strfuncs.cc | 32 |
1 files changed, 32 insertions, 0 deletions
/* Origin: hunk "@@ -112,6 +112,38 @@" of the diff against
   winsup/cygwin/strfuncs.cc (index 0ab229053..0b9d8ac1f, mode 100644).
   The hunk places this function directly after transform_chars_af_unix ()
   and ahead of the comment describing the SJIS, JIS and eucJP
   conversion.  */

/* Replacement function for mbrtowc, returning a wint_t representing
   a UTF-32 value.

   PWI  receives the decoded value.  May be NULL, in which case nothing
	is stored (mbrtowc accepts a NULL output pointer as well).
   S    points to the multibyte input, N is the number of bytes available
	and PS is the conversion state; all three are handed to mbrtowc
	unchanged.

   The first mbrtowc result is returned as is when it is (size_t) -1,
   (size_t) -2 or 0, and also when the converted unit is not a high
   surrogate (0xd800-0xdbff).  For a high surrogate a second unit is
   converted from the bytes that follow:

   - if it is a low surrogate (0xdc00-0xdfff) both units are combined
     into a single value >= 0x10000 and the byte counts of both
     conversions are added up;
   - otherwise errno is set to EILSEQ and (size_t) -1 is returned.  In
     that case *PWI still holds the lone high surrogate.

   NOTE(review): the result is a size_t style byte count / error code
   carried in a wint_t.  Where wint_t is narrower than size_t the values
   (size_t) -1 and (size_t) -2 get truncated, so callers comparing the
   result against them may not match.  The return type is left alone
   because the matching declaration is not visible from here - confirm
   and consider switching both to size_t.  */
extern "C" wint_t
mbrtowi (wint_t *pwi, const char *s, size_t n, mbstate_t *ps)
{
  /* Pre-set so a call that stores nothing (e.g. S == NULL) never makes
     us read an indeterminate value.  */
  wchar_t w1 = 0;
  size_t len = mbrtowc (&w1, s, n, ps);

  if (len == (size_t) -1 || len == (size_t) -2)
    return len;
  if (pwi)
    *pwi = w1;

  /* Anything other than a high surrogate is already complete.  */
  if (len == 0 || w1 < 0xd800 || w1 > 0xdbff)
    return len;

  /* Convert surrogate pair to wint_t value.  The error results are
     tested explicitly: as unsigned values they are "> 0" too, and w2
     must not be trusted when the conversion failed.  */
  wchar_t w2 = 0;
  size_t len2 = mbrtowc (&w2, s + len, n - len, ps);

  if (len2 == (size_t) -1 || len2 == (size_t) -2 || len2 == 0
      || w2 < 0xdc00 || w2 > 0xdfff)
    {
      len = (size_t) -1;
      errno = EILSEQ;
      return len;
    }

  if (pwi)
    *pwi = (((w1 & 0x3ff) << 10) | (w2 & 0x3ff)) + 0x10000;
  return len + len2;
}