Welcome to mirror list, hosted at ThFree Co, Russian Federation.

cygwin.com/git/newlib-cygwin.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCorinna Vinschen <corinna@vinschen.de>2008-02-06 21:24:50 +0300
committerCorinna Vinschen <corinna@vinschen.de>2008-02-06 21:24:50 +0300
commit4b65f190450f70bd5819bb5c18e3370d75ffebde (patch)
tree7ee585a605c7225086373821f94312485b406c8a
parenta7197550f31c8db32c671bc43401c1b90c78ed74 (diff)
* fhandler.h (fhandler_console::trunc_buf): Add to use as cache for
truncated multibyte characters on input.
(fhandler_console::write_replacement_char): Declare new method.
* fhandler_console.cc (CONVERT_LIMIT): Raise to 64K.
(fhandler_console::fhandler_console): Initialize trunc_buf.
(ERR): Define as independent value again.
(fhandler_console::write_replacement_char): New method to print
replacement chars.
(fhandler_console::write_normal): Add handling for truncated multibyte
sequences. Call next_char instead of pathetic CharNextExA function.
Don't change src, rather just work with found later on.
* miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc.
Don't call Windows function, restrict to well-known ANSI/OEM codepages
and UTF-8.
(next_char): Call CharNextExA only for doublebyte codepages.
Implement for UTF-8 here.
* strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc.
* winsup.h (next_char): Declare.
* include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX
as defined by newlib for now.
-rw-r--r--winsup/cygwin/ChangeLog23
-rw-r--r--winsup/cygwin/fhandler.h8
-rw-r--r--winsup/cygwin/fhandler_console.cc99
-rw-r--r--winsup/cygwin/include/limits.h4
-rw-r--r--winsup/cygwin/miscfuncs.cc115
-rw-r--r--winsup/cygwin/strfuncs.cc8
-rw-r--r--winsup/cygwin/winsup.h3
7 files changed, 226 insertions, 34 deletions
diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog
index 98b5076ce..a5213a705 100644
--- a/winsup/cygwin/ChangeLog
+++ b/winsup/cygwin/ChangeLog
@@ -1,3 +1,26 @@
+2008-02-06 Corinna Vinschen <corinna@vinschen.de>
+
+ * fhandler.h (fhandler_console::trunc_buf): Add to use as cache for
+ truncated multibyte characters on input.
+ (fhandler_console::write_replacement_char): Declare new method.
+ * fhandler_console.cc (CONVERT_LIMIT): Raise to 64K.
+ (fhandler_console::fhandler_console): Initialize trunc_buf.
+ (ERR): Define as independent value again.
+ (fhandler_console::write_replacement_char): New method to print
+ replacement chars.
+ (fhandler_console::write_normal): Add handling for truncated multibyte
+ sequences. Call next_char instead of pathetic CharNextExA function.
+ Don't change src, rather just work with found later on.
+ * miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc.
+ Don't call Windows function, restrict to well-known ANSI/OEM codepages
+ and UTF-8.
+ (next_char): Call CharNextExA only for doublebyte codepages.
+ Implement for UTF-8 here.
+ * strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc.
+ * winsup.h (next_char): Declare.
+ * include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX
+ as defined by newlib for now.
+
2008-02-05 Corinna Vinschen <corinna@vinschen.de>
* autoload.cc (CharToOemA): Remove.
diff --git a/winsup/cygwin/fhandler.h b/winsup/cygwin/fhandler.h
index 35d942f6d..28f34df45 100644
--- a/winsup/cygwin/fhandler.h
+++ b/winsup/cygwin/fhandler.h
@@ -896,6 +896,13 @@ class fhandler_console: public fhandler_termios
static dev_console *dev_state;
static bool invisible_console;
+ /* Used when we encounter a truncated multi-byte sequence. The
+ lead bytes are stored here and revisited in the next write call. */
+ struct {
+ int len;
+ unsigned char buf[4]; /* Max len of valid UTF-8 sequence. */
+ } trunc_buf;
+
/* Output calls */
void set_default_attr ();
@@ -904,6 +911,7 @@ class fhandler_console: public fhandler_termios
void cursor_set (bool, int, int);
void cursor_get (int *, int *);
void cursor_rel (int, int);
+ void write_replacement_char (const unsigned char *);
const unsigned char *write_normal (unsigned const char*, unsigned const char *);
void char_command (char);
bool set_raw_win32_keyboard_mode (bool);
diff --git a/winsup/cygwin/fhandler_console.cc b/winsup/cygwin/fhandler_console.cc
index f23bacb3e..cc972e10a 100644
--- a/winsup/cygwin/fhandler_console.cc
+++ b/winsup/cygwin/fhandler_console.cc
@@ -33,7 +33,7 @@ details. */
#include "cygtls.h"
#include "registry.h"
-#define CONVERT_LIMIT 16384
+#define CONVERT_LIMIT 65536
/*
* Scroll the screen context.
@@ -895,7 +895,9 @@ fhandler_console::tcgetattr (struct termios *t)
fhandler_console::fhandler_console () :
fhandler_termios ()
{
+ trunc_buf.len = 0;
}
+
void
dev_console::set_color (HANDLE h)
{
@@ -1037,7 +1039,7 @@ fhandler_console::cursor_get (int *x, int *y)
#define ESC 2
#define NOR 0
#define IGN 4
-#if 0
+#if 1
#define ERR 5
#else
#define ERR NOR
@@ -1425,41 +1427,86 @@ beep ()
MessageBeep (MB_OK);
}
+/* Called when an invalid UTF-8 character has been found.  The single
+   byte at CHAR_P is converted using the default ANSI codepage and the
+   result is written to the console.  If the conversion yields nothing,
+   a question mark is written instead.  Looks ugly, but it is a neat and
+   almost sane fallback for many languages. */
+void
+fhandler_console::write_replacement_char (const unsigned char *char_p)
+{
+  WCHAR converted[2];
+  DWORD written;
+  const WCHAR *out = converted;
+
+  /* Convert exactly one source byte; room for up to two WCHARs. */
+  int count = MultiByteToWideChar (GetACP (), 0, (const CHAR *) char_p, 1,
+                                   converted, 2);
+  if (!count)
+    {
+      /* Conversion failed, fall back to a plain question mark. */
+      out = L"?";
+      count = 1;
+    }
+  WriteConsoleW (get_output_handle (), out, count, &written, 0);
+}
+
const unsigned char *
fhandler_console::write_normal (const unsigned char *src,
const unsigned char *end)
{
/* Scan forward to see what a char which needs special treatment */
DWORD done;
- unsigned char *found = (unsigned char *) src;
+ DWORD buf_len;
+ const unsigned char *found = src;
+ const unsigned char *nfound;
UINT cp = dev_state->get_console_cp ();
- bool mb = is_cp_multibyte (cp);
+
+ /* First check if we have cached lead bytes of a former try to write
+ a truncated multibyte sequence. If so, process it. */
+ if (trunc_buf.len)
+ {
+ int cp_len = min (end - src, 4 - trunc_buf.len);
+ memcpy (trunc_buf.buf + trunc_buf.len, src, cp_len);
+ nfound = next_char (cp, trunc_buf.buf,
+ trunc_buf.buf + trunc_buf.len + cp_len);
+ if (!nfound) /* Invalid multibyte sequence. */
+ { /* Give up and print replacement chars. */
+ for (int i = 0; i < trunc_buf.len; ++i)
+ write_replacement_char (trunc_buf.buf + i);
+ }
+ else if (nfound == trunc_buf.buf)
+ { /* Still truncated multibyte sequence. */
+ trunc_buf.len += cp_len;
+ return end;
+ }
+ else
+ {
+ /* Valid multibyte sequence. Process. */
+ WCHAR buf[2];
+ buf_len = dev_state->str_to_con (buf, (const char *) trunc_buf.buf,
+ nfound - trunc_buf.buf);
+ WriteConsoleW (get_output_handle (), buf, buf_len, &done, 0);
+ found = src + (nfound - trunc_buf.buf - trunc_buf.len);
+ }
+ /* Mark trunc_buf as unused. */
+ trunc_buf.len = 0;
+ }
while (found < end
&& found - src < CONVERT_LIMIT
&& base_chars[*found] == NOR)
{
- if (mb && *found && *found >= 0x80)
- {
- unsigned char *nfound = (unsigned char *)
- CharNextExA (cp, (const CHAR *) found, 0);
- /* Sanity check for UTF-8 to workaround the problem in
- MultiByteToWideChar, that it's not capable of using replacement
- characters for invalid source chars in the given codepage. */
- if (nfound == found + 1 && cp == CP_UTF8)
- *found++ = '?';
- else
- found = nfound;
+ nfound = next_char (cp, found, end);
+ if (!nfound) /* Invalid multibyte sequence. */
+ break;
+ if (nfound == found) /* Truncated multibyte sequence. */
+ { /* Stick to it until the next write. */
+ trunc_buf.len = end - found;
+ memcpy (trunc_buf.buf, found, trunc_buf.len);
+ return end;
}
- else
- ++found;
+ found = nfound;
}
/* Print all the base ones out */
if (found != src)
{
DWORD len = found - src;
- DWORD buf_len;
PWCHAR buf = (PWCHAR) alloca (CONVERT_LIMIT * sizeof (WCHAR));
buf_len = dev_state->str_to_con (buf, (const char *) src, len);
@@ -1490,13 +1537,14 @@ fhandler_console::write_normal (const unsigned char *src,
buf += done;
}
while (buf_len > 0);
- src = found;
+ if (len >= CONVERT_LIMIT)
+ return found;
}
- if (src < end)
+ if (found < end)
{
int x, y;
- switch (base_chars[*src])
+ switch (base_chars[*found])
{
case BEL:
beep ();
@@ -1529,16 +1577,19 @@ fhandler_console::write_normal (const unsigned char *src,
cursor_set (false, 0, y);
break;
case ERR:
- WriteFile (get_output_handle (), src, 1, &done, 0);
+ WriteFile (get_output_handle (), found, 1, &done, 0);
break;
case TAB:
cursor_get (&x, &y);
cursor_set (false, 8 * (x / 8 + 1), y);
break;
+ case NOR:
+ write_replacement_char (found);
+ break;
}
- src ++;
+ found++;
}
- return src;
+ return found;
}
int
diff --git a/winsup/cygwin/include/limits.h b/winsup/cygwin/include/limits.h
index 7f43cfb5c..e6e089da9 100644
--- a/winsup/cygwin/include/limits.h
+++ b/winsup/cygwin/include/limits.h
@@ -28,7 +28,9 @@ details. */
/* Maximum length of a multibyte character. */
#ifndef MB_LEN_MAX
-#define MB_LEN_MAX 1
+/* TODO: This is newlib's max value. We should probably rather define our
+ own _mbtowc_r and _wctomb_r functions which are only codepage dependent. */
+#define MB_LEN_MAX 8
#endif
/* Minimum and maximum values a `signed char' can hold. */
diff --git a/winsup/cygwin/miscfuncs.cc b/winsup/cygwin/miscfuncs.cc
index 0ec0b4873..4edfbab94 100644
--- a/winsup/cygwin/miscfuncs.cc
+++ b/winsup/cygwin/miscfuncs.cc
@@ -17,7 +17,8 @@ details. */
#include <alloca.h>
#include <limits.h>
#include <wchar.h>
-#include <winbase.h>
+#include <wingdi.h>
+#include <winuser.h>
#include <winnls.h>
#include "cygthread.h"
#include "cygtls.h"
@@ -192,6 +193,118 @@ cygwin_strupr (char *string)
return string;
}
+/* FIXME? Only the standard ANSI/OEM codepages according to
+   http://www.microsoft.com/globaldev/reference/cphome.mspx are
+   supported, plus UTF-8 and codepage 1361, which MSDN also mentions as
+   a valid doublebyte codepage (e.g. in the IsDBCSLeadByteEx man page).
+   Everything else will be hosed. */
+
+/* Return true if CP is one of the known doublebyte codepages (932, 936,
+   949, 950, 1361) or UTF-8 (65001), false for any other value. */
+bool
+is_cp_multibyte (UINT cp)
+{
+  return cp == 932 || cp == 936 || cp == 949 || cp == 950
+         || cp == 1361 || cp == 65001;
+}
+
+/* OMYGOD!  CharNextExA is not UTF-8 aware!  It only works fine with
+   double byte charsets.  So we have to do it ourselves for UTF-8.
+
+   While being at it, we do more.  If a double-byte or multibyte
+   sequence is truncated due to an early end, we need a way to recognize
+   it.  The reason is that multiple buffered write statements might
+   accidentally stop and start in the middle of a single character byte
+   sequence.  If we have to interpret the byte sequences (as in
+   fhandler_console), we would print wrong output in these cases.
+
+   CP is the codepage, STR points to the first byte of the character to
+   examine, END points one past the last available byte.
+
+   So we have four possible return values here:
+
+   ret = end      if str >= end
+   ret = NULL     if we encounter an invalid byte sequence
+   ret = str      if we encounter the start byte of a truncated byte sequence
+   ret = str + n  if we encounter a valid byte sequence
+
+   For UTF-8 only well-formed sequences are accepted: continuation bytes
+   must be in the range 0x80..0xbf, overlong encodings (lead bytes 0xc0
+   and 0xc1, 0xe0 followed by less than 0xa0, 0xf0 followed by less than
+   0x90), UTF-16 surrogates (0xed followed by more than 0x9f) and values
+   beyond U+10FFFF (0xf4 followed by more than 0x8f, lead bytes 0xf5 and
+   up) are invalid.
+
+   Codepages other than the doublebyte ones listed below and UTF-8 are
+   treated as single byte codepages. */
+
+const unsigned char *
+next_char (UINT cp, const unsigned char *str, const unsigned char *end)
+{
+  const unsigned char *ret;
+
+  if (str >= end)
+    return end;
+
+  switch (cp)
+    {
+    case 932:
+    case 936:
+    case 949:
+    case 950:
+    case 1361:
+      if (*str <= 0x7f)
+        ret = str + 1;
+      else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str))
+        ret = str;              /* Lead byte without its trail byte. */
+      else
+        ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0);
+      break;
+    case CP_UTF8:
+      switch (str[0] >> 4)
+        {
+        case 0x0 ... 0x7:       /* One byte character. */
+          ret = str + 1;
+          break;
+        case 0x8 ... 0xb:       /* Followup byte.  Invalid as first byte. */
+          ret = NULL;
+          break;
+        case 0xc ... 0xd:       /* Two byte character. */
+          if (str[0] < 0xc2)    /* 0xc0 and 0xc1 are always overlong. */
+            ret = NULL;
+          else if (str >= end - 1)
+            ret = str;
+          /* The followup byte must be 10xxxxxx.  Checking the upper
+             bound alone would let plain ASCII bytes through. */
+          else if ((str[1] & 0xc0) == 0x80)
+            ret = str + 2;
+          else
+            ret = NULL;
+          break;
+        case 0xe:               /* Three byte character. */
+          if (str >= end - 2)
+            ret = str;
+          else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
+                   && (str[0] != 0xe0 || str[1] >= 0xa0)
+                   && (str[0] != 0xed || str[1] <= 0x9f))
+            ret = str + 3;
+          else
+            ret = NULL;
+          break;
+        case 0xf:               /* Four byte character. */
+          if (str[0] > 0xf4)    /* Would encode a value > U+10FFFF. */
+            ret = NULL;
+          else if (str >= end - 3)
+            ret = str;
+          /* The range restrictions on the second byte only apply to the
+             lead bytes 0xf0 (overlong) and 0xf4 (> U+10FFFF).  All other
+             lead bytes allow the full 0x80..0xbf range. */
+          else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80
+                   && (str[3] & 0xc0) == 0x80
+                   && (str[0] != 0xf0 || str[1] >= 0x90)
+                   && (str[0] != 0xf4 || str[1] <= 0x8f))
+            ret = str + 4;
+          else
+            ret = NULL;
+          break;
+        }
+      break;
+    default:                    /* Treat as single byte codepage. */
+      ret = str + 1;
+      break;
+    }
+  return ret;
+}
+
int __stdcall
check_invalid_virtual_addr (const void *s, unsigned sz)
{
diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc
index 130be76f1..23471d258 100644
--- a/winsup/cygwin/strfuncs.cc
+++ b/winsup/cygwin/strfuncs.cc
@@ -36,14 +36,6 @@ get_cp ()
return active_codepage;
}
-bool
-is_cp_multibyte (UINT cp)
-{
- CPINFO cpi;
- GetCPInfo (cp, &cpi);
- return cpi.MaxCharSize > 1;
-}
-
/* tlen is always treated as the maximum buffer size, including the '\0'
character. sys_wcstombs will always return a 0-terminated result, no
matter what. */
diff --git a/winsup/cygwin/winsup.h b/winsup/cygwin/winsup.h
index f88f23fab..952292862 100644
--- a/winsup/cygwin/winsup.h
+++ b/winsup/cygwin/winsup.h
@@ -110,6 +110,7 @@ extern const char case_folded_upper[];
/* The one function we use from winuser.h most of the time */
extern "C" DWORD WINAPI GetLastError (void);
+/* Codepage and multibyte string specific stuff. */
enum codepage_type {ansi_cp, oem_cp, utf8_cp};
extern codepage_type current_codepage;
extern UINT active_codepage;
@@ -117,6 +118,8 @@ extern UINT active_codepage;
void codepage_init (const char *buf);
UINT get_cp ();
bool is_cp_multibyte (UINT cp);
+const unsigned char *next_char (UINT cp, const unsigned char *str,
+ const unsigned char *end);
/* Used as type by sys_wcstombs_alloc and sys_mbstowcs_alloc. For a
description see there. */