Cygwin: locale(1): drop using LCID, use Windows locale names

LCIDs are deprecated since Windows Vista. Worse, lots of new locales have been added in the meantime which have no LCID attached. They are only available by locale name. As first step, rearrange the locale(1) tool to use Windows locale names, rather than LCIDs, so we can now enumerate *all* locales available in more recent Windows versions. Signed-off-by: Corinna Vinschen <corinna@vinschen.de>
author: Corinna Vinschen <corinna@vinschen.de> 2023-02-23 02:22:56 +0300
committer: Corinna Vinschen <corinna@vinschen.de> 2023-02-24 18:40:58 +0300
commit: ac405ab9bc07789d5576fd823cbaa7eef472b11b (patch)
tree: 61a1d89d4e15b2e27bca21d8c8af49e82e7ad98a /winsup/utils
parent: 17ac400c11bab30ac2c0bef12cbf7788f0b6f954 (diff)
1 files changed, 190 insertions, 130 deletions
diff --git a/winsup/utils/locale.cc b/winsup/utils/locale.cc
index 3f7e5dcca..1d90550f3 100644
--- a/winsup/utils/locale.cc
+++ b/winsup/utils/locale.cc
@@ -30,6 +30,7 @@
 #include <getopt.h>
 #include <string.h>
 #include <wchar.h>
+#include <wctype.h>
 #include <locale.h>
 #include <langinfo.h>
 #include <limits.h>
@@ -107,17 +108,42 @@ struct option longopts[] = {
 const char *opts = "acfhikmnsuUvV";
 
 int
-getlocale (LCID lcid, char *name)
+getlocale (PWCHAR loc_name, wchar_t *iso639, wchar_t *iso3166,
+	   wchar_t *iso15924 = NULL)
 {
-  char iso639[10];
-  char iso3166[10];
+  wchar_t *cp;
 
-  iso3166[0] = '\0';
-  if (!GetLocaleInfo (lcid, LOCALE_SISO639LANGNAME, iso639, 10))
+  /* Skip language-only locales, e. g. "en" */
+  if (!(cp = wcschr (loc_name, L'-')))
     return 0;
-  GetLocaleInfo (lcid, LOCALE_SISO3166CTRYNAME, iso3166, 10);
-  sprintf (name, "%s%s%s", iso639, lcid > 0x3ff ? "_" : "",
-			   lcid > 0x3ff ? iso3166 : "");
+  ++cp;
+  /* Script inside?  Scripts are Upper/Lower, e. g. "Latn" */
+  if (iswupper (cp[0]) && iswlower (cp[1]))
+    {
+      wchar_t *cp2;
+
+      /* Skip language-Script locales, missing country  */
+      if (!(cp2 = wcschr (cp + 2, L'-')))
+	return 0;
+      /* Otherwise, store in iso15924 */
+      if (iso15924)
+	wcpcpy (wcpncpy (iso15924, cp, cp2 - cp), L";");
+    }
+  cp = wcsrchr (loc_name, L'-');
+  if (cp)
+    {
+      /* Skip numeric iso3166 country name. */
+      if (iswdigit (cp[1]))
+	return 0;
+      /* Special case postfix after iso3166 country name: ca-ES-valencia.
+	 Use the postfix thingy as script so it will become a @modifier */
+      if (iswlower (cp[1]))
+	wcpcpy (iso15924, cp + 1);
+    }
+
+  if (!GetLocaleInfoEx (loc_name, LOCALE_SISO639LANGNAME, iso639, 10))
+    return 0;
+  GetLocaleInfoEx (loc_name, LOCALE_SISO3166CTRYNAME, iso3166, 10);
   return 1;
 }
 
@@ -133,14 +159,6 @@ size_t loc_max;
 size_t loc_num;
 
 void
-print_codeset (const char *codeset)
-{
-  for (; *codeset; ++codeset)
-    if (*codeset != '-')
-      putc (tolower ((int)(unsigned char) *codeset), stdout);
-}
-
-void
 print_locale_with_codeset (int verbose, loc_t *locale, bool utf8,
 			   const char *modifier)
 {
@@ -158,7 +176,7 @@ print_locale_with_codeset (int verbose, loc_t *locale, bool utf8,
 	sysroot = (const char *) cygwin_create_path (CCP_WIN_W_TO_POSIX,
 						     sysbuf);
       if (!sysroot)
-      	sysroot = "kernel32.dll";
+	sysroot = "kernel32.dll";
     }
   snprintf (locname, 32, "%s%s%s%s", locale->name, utf8 ? ".utf8" : "",
 				     modifier ? "@" : "", modifier ?: "");
@@ -187,7 +205,7 @@ print_locale (int verbose, loc_t *locale)
     {
       if (!modifier)
 	print_locale_with_codeset (verbose, locale, true, NULL);
-      else if (!strcmp (modifier, "@cjknarrow"))
+      else if (strcmp (modifier, "@euro"))
 	{
 	  *modifier++ = '\0';
 	  print_locale_with_codeset (verbose, locale, true, modifier);
@@ -213,7 +231,7 @@ add_locale (const char *name, const wchar_t *language, const wchar_t *territory,
     {
       loc_t *tmp = (loc_t *) realloc (locale, (loc_max + 32) * sizeof (loc_t));
       if (!tmp)
-      	{
+	{
 	  fprintf (stderr, "Out of memory!\n");
 	  exit (1);
 	}
@@ -266,110 +284,147 @@ add_locale_alias_locales ()
       c = strchr (replace, '.');
       if (c)
 	*c = '\0';
+      /* Ignore "ja_JP" and "ko_KR" locales from here, they are in the Windows
+	 DB anyway. */
+      if (!strcmp (alias, "ja_JP") || !strcmp (alias, "ko_KR"))
+	continue;
       search.name = replace;
       loc = (loc_t *) bsearch (&search, locale, orig_loc_num, sizeof (loc_t),
 			       compare_locales);
+
       add_locale (alias, loc ? loc->language : L"", loc ? loc->territory : L"",
 		  true);
     }
   fclose (fp);
 }
 
-void
-print_all_locales (int verbose)
+BOOL
+print_all_locales_proc (LPWSTR loc_name, DWORD info, LPARAM param)
 {
-  LCID lcid = 0;
-  char name[32];
+  wchar_t iso639[32] = { 0 };
+  wchar_t iso3166[32] = { 0 };
+  wchar_t iso15924[32] = { 0 };
   DWORD cp;
 
-  unsigned lang, sublang;
+#if 0
+  struct {
+    wchar_t language[256];
+    wchar_t country[256];
+    char loc[32];
+  } loc_list[32];
+  int lcnt = 0;
+#endif
 
-  add_locale ("C", L"C", L"POSIX");
-  add_locale ("POSIX", L"C", L"POSIX", true);
-  for (lang = 1; lang <= 0xff; ++lang)
+  if (getlocale (loc_name, iso639, iso3166, iso15924))
     {
-      struct {
-	wchar_t language[256];
-	wchar_t country[256];
-	char loc[32];
-      } loc_list[32];
-      int lcnt = 0;
-
-      for (sublang = 1; sublang <= 0x3f; ++sublang)
+      char *c, posix_loc[32];
+      wchar_t language[256];
+      wchar_t country[256];
+      wchar_t currency[9];
+
+      c = posix_loc + snprintf (posix_loc, sizeof posix_loc, "%ls_%ls",
+				iso639, iso3166);
+      /* Inuktitut: equivalent @latin due to lack of info on Linux */
+      if (!wcscmp (iso639, L"iu"))
 	{
-	  lcid = (sublang << 10) | lang;
-	  if (getlocale (lcid, name))
+	  if (wcscmp (iso15924, L"Latn;"))
+	    return TRUE;
+	}
+      /* Javanese: only use @latin locale. */
+      else if (!wcscmp (iso639, L"jv"))
+	{
+	  if (wcscmp (iso15924, L"Latn;"))
+	    return TRUE;
+	}
+      /* Mongolian: only use @mongolian locale. */
+      else if (!wcscmp (iso639, L"mn"))
+	{
+	  if (wcscmp (iso15924, L"Mong;"))
+	    return TRUE;
+	}
+      /* Serbian: Windows default is Latin, Linux default is Cyrillic.
+	 We want the Linux default and attach @latin otherwise */
+      else if (!wcscmp (iso639, L"sr")  && !wcscmp (iso15924, L"Latn;"))
+	stpcpy (c, "@latin");
+      /* Tamazight: no modifier, iso639 is "ber" on Linux.
+	 "zgh-Tfng-MA" is equivalent to "ber_MA". */
+      else if (!wcscmp (iso639, L"zgh"))
+	snprintf (posix_loc, sizeof posix_loc, "ber_%.27ls", iso3166);
+      /* Tamazight: "tzm-Latn-DZ" is equivalent to "ber_DZ",
+		    skip everything else. */
+      else if (!wcscmp (iso639, L"tzm"))
+	{
+	  if (!wcscmp (iso3166, L"DZ") && !wcscmp (iso15924, L"Latn;"))
+	    snprintf (posix_loc, sizeof posix_loc, "ber_%.27ls", iso3166);
+	  else
+	    return TRUE;
+	}
+      /* In all other cases, we check if the script from the Windows
+	 locale is the default locale in that language.  If not, we
+	 add it as modifier if possible, or skip it */
+      else if (iso15924[0])
+	{
+	  wchar_t scriptless_win_locale[32];
+	  wchar_t default_iso15924[32];
+
+	  wcpcpy (wcpcpy (wcpcpy (scriptless_win_locale, iso639), L"_"),
+		  iso3166);
+	  if ((GetLocaleInfoEx (scriptless_win_locale, LOCALE_SSCRIPTS,
+				default_iso15924, 32)
+	       || GetLocaleInfoEx (iso639, LOCALE_SSCRIPTS,
+				   default_iso15924, 32))
+	      && !wcsstr (default_iso15924, iso15924))
 	    {
-	      wchar_t language[256];
-	      wchar_t country[256];
-	      int i;
-	      char *c, loc[32];
-	      wchar_t wbuf[9];
-
-	      /* Go figure.  Even the English name of a language or
-		 locale might contain native characters. */
-	      GetLocaleInfoW (lcid, LOCALE_SENGLANGUAGE, language, 256);
-	      GetLocaleInfoW (lcid, LOCALE_SENGCOUNTRY, country, 256);
-	      /* Avoid dups */
-	      for (i = 0; i < lcnt; ++ i)
-		if (!wcscmp (loc_list[i].language, language)
-		    && !wcscmp (loc_list[i].country, country))
-		  break;
-	      if (i < lcnt)
-		continue;
-	      if (lcnt < 32)
-		{
-		  wcscpy (loc_list[lcnt].language, language);
-		  wcscpy (loc_list[lcnt].country, country);
-		}
-	      c = stpcpy (loc, name);
-	      /* Now check certain conditions to figure out if that
-		 locale requires a modifier. */
-	      if (lang == LANG_SERBIAN && !strncmp (loc, "sr_", 3)
-		  && wcsstr (language, L"(Latin)"))
+	      if (!wcscmp (iso15924, L"Latn;"))
 		stpcpy (c, "@latin");
-	      else if (lang == LANG_UZBEK
-		       && sublang == SUBLANG_UZBEK_CYRILLIC)
+	      else if (!wcscmp (iso15924, L"Cyrl;"))
 		stpcpy (c, "@cyrillic");
-	      /* Avoid more dups */
-	      for (i = 0; i < lcnt; ++ i)
-		if (!strcmp (loc_list[i].loc, loc))
-		  {
-		    lcnt++;
-		    break;
-		  }
-	      if (i < lcnt)
-		continue;
-	      if (lcnt < 32)
-		strcpy (loc_list[lcnt++].loc, loc);
-	      /* Print */
-	      add_locale (loc, language, country);
-	      /* Check for locales which sport a modifier for
-		 changing the codeset and other stuff. */
-	      if (lang == LANG_BELARUSIAN
-		  && sublang == SUBLANG_BELARUSIAN_BELARUS)
-		stpcpy (c, "@latin");
-	      else if (lang == LANG_TATAR
-		       && sublang == SUBLANG_TATAR_RUSSIA)
-		stpcpy (c, "@iqtelif");
-	      else if (GetLocaleInfoW (lcid,
-				       LOCALE_IDEFAULTANSICODEPAGE
-				       | LOCALE_RETURN_NUMBER,
-				       (PWCHAR) &cp, sizeof cp)
-		       && cp == 1252 /* Latin1*/
-		       && GetLocaleInfoW (lcid, LOCALE_SINTLSYMBOL, wbuf, 9)
-		       && !wcsncmp (wbuf, L"EUR", 3))
-		stpcpy (c, "@euro");
-	      else if (lang == LANG_JAPANESE
-		       || lang == LANG_KOREAN
-		       || lang == LANG_CHINESE)
-		stpcpy (c, "@cjknarrow");
+	      else if (!wcscmp (iso15924, L"Deva;"))
+		stpcpy (c, "@devanagari");
+	      else if (!wcscmp (iso15924, L"Adlm;"))
+		stpcpy (c, "@adlam");
 	      else
-		continue;
-	      add_locale (loc, language, country);
+		return TRUE;
 	    }
 	}
+
+      /* Print */
+      GetLocaleInfoEx (loc_name, LOCALE_SENGLISHLANGUAGENAME, language, 256);
+      GetLocaleInfoEx (loc_name, LOCALE_SENGLISHCOUNTRYNAME, country, 256);
+      add_locale (posix_loc, language, country);
+      /* Check for locales sporting an additional modifier for
+	 changing the codeset and other stuff. */
+      if (!wcscmp (iso639, L"be") && !wcscmp (iso3166, L"BY"))
+	stpcpy (c, "@latin");
+      if (!wcscmp (iso639, L"tt") && !wcscmp (iso3166, L"RU"))
+	stpcpy (c, "@iqtelif");
+      else if (GetLocaleInfoEx (loc_name,
+				LOCALE_IDEFAULTANSICODEPAGE
+				| LOCALE_RETURN_NUMBER,
+				(PWCHAR) &cp, sizeof cp)
+	       && cp == 1252 /* Latin1*/
+	       && GetLocaleInfoEx (loc_name, LOCALE_SINTLSYMBOL, currency, 9)
+	       && !wcsncmp (currency, L"EUR", 3))
+	stpcpy (c, "@euro");
+      else if (!wcscmp (iso639, L"ja")
+	       || !wcscmp (iso639, L"ko")
+	       || !wcscmp (iso639, L"zh"))
+	stpcpy (c, "@cjknarrow");
+      else
+	return TRUE;
+      add_locale (posix_loc, language, country);
     }
+  return TRUE;
+}
+
+void
+print_all_locales (int verbose)
+{
+  add_locale ("C", L"C", L"POSIX");
+  add_locale ("POSIX", L"C", L"POSIX", true);
+  EnumSystemLocalesEx (print_all_locales_proc,
+		       LOCALE_WINDOWS | LOCALE_SUPPLEMENTAL,
+		       0, NULL);
   /* First sort allows add_locale_alias_locales to bsearch in locales. */
   qsort (locale, loc_num, sizeof (loc_t), compare_locales);
   add_locale_alias_locales ();
@@ -544,8 +599,8 @@ const char *fake_string[] = {
 
 lc_names_t lc_ctype_names[] =
 {
-  { "ctype-class-names", 	 is_string_fake, 0,			 0 },
-  { "ctype-map-names",   	 is_string_fake, 2,			 0 },
+  { "ctype-class-names",	 is_string_fake, 0,			 0 },
+  { "ctype-map-names",		 is_string_fake, 2,			 0 },
   { "ctype-outdigit0_mb",	 is_string,	_NL_CTYPE_OUTDIGITS0_MB, 0 },
   { "ctype-outdigit1_mb",	 is_string,	_NL_CTYPE_OUTDIGITS1_MB, 0 },
   { "ctype-outdigit2_mb",	 is_string,	_NL_CTYPE_OUTDIGITS2_MB, 0 },
@@ -556,30 +611,30 @@ lc_names_t lc_ctype_names[] =
   { "ctype-outdigit7_mb",	 is_string,	_NL_CTYPE_OUTDIGITS7_MB, 0 },
   { "ctype-outdigit8_mb",	 is_string,	_NL_CTYPE_OUTDIGITS8_MB, 0 },
   { "ctype-outdigit9_mb",	 is_string,	_NL_CTYPE_OUTDIGITS9_MB, 0 },
-  { "ctype-outdigit0_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS0_WC, 0 },
-  { "ctype-outdigit1_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS1_WC, 0 },
-  { "ctype-outdigit2_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS2_WC, 0 },
-  { "ctype-outdigit3_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS3_WC, 0 },
-  { "ctype-outdigit4_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS4_WC, 0 },
-  { "ctype-outdigit5_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS5_WC, 0 },
-  { "ctype-outdigit6_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS6_WC, 0 },
-  { "ctype-outdigit7_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS7_WC, 0 },
-  { "ctype-outdigit8_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS8_WC, 0 },
-  { "ctype-outdigit9_wc",	 is_wchar, 	_NL_CTYPE_OUTDIGITS9_WC, 0 },
+  { "ctype-outdigit0_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS0_WC, 0 },
+  { "ctype-outdigit1_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS1_WC, 0 },
+  { "ctype-outdigit2_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS2_WC, 0 },
+  { "ctype-outdigit3_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS3_WC, 0 },
+  { "ctype-outdigit4_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS4_WC, 0 },
+  { "ctype-outdigit5_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS5_WC, 0 },
+  { "ctype-outdigit6_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS6_WC, 0 },
+  { "ctype-outdigit7_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS7_WC, 0 },
+  { "ctype-outdigit8_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS8_WC, 0 },
+  { "ctype-outdigit9_wc",	 is_wchar,	_NL_CTYPE_OUTDIGITS9_WC, 0 },
   { "charmap",			 is_string,	CODESET,		 0 },
   { "ctype-mb-cur-max",		 is_int,	_NL_CTYPE_MB_CUR_MAX,	 0 },
-  { NULL, 			 is_end,	0,		 	 0 }
+  { NULL,			 is_end,	0,			 0 }
 };
 
 lc_names_t lc_numeric_names[] =
 {
   { "decimal_point",		 is_string,	RADIXCHAR,		 0 },
-  { "thousands_sep",		 is_string,	THOUSEP, 		 0 },
+  { "thousands_sep",		 is_string,	THOUSEP,		 0 },
   { "grouping",			 is_grouping,	_NL_NUMERIC_GROUPING,	 0 },
   { "numeric-decimal-point-wc",	 is_wchar,	_NL_NUMERIC_DECIMAL_POINT_WC, 0 },
   { "numeric-thousands-sep-wc",	 is_wchar,	_NL_NUMERIC_THOUSANDS_SEP_WC, 0 },
   { "numeric-codeset",		 is_string,	_NL_NUMERIC_CODESET,	 0 },
-  { NULL, 			 is_end,	0,			 0 }
+  { NULL,			 is_end,	0,			 0 }
 };
 
 lc_names_t lc_time_names[] =
@@ -600,13 +655,13 @@ lc_names_t lc_time_names[] =
   { "era_t_fmt",		 is_string,	ERA_T_FMT,		0 },
   { "date_fmt",			 is_string,	_DATE_FMT,		0 },
   { "time-codeset",		 is_string,	_NL_TIME_CODESET,	0 },
-  { NULL, 			 is_end,	0,			0 }
+  { NULL,			 is_end,	0,			0 }
 };
 
 lc_names_t lc_collate_names[] =
 {
   { "collate-codeset",		 is_string,	_NL_COLLATE_CODESET,	0 },
-  { NULL, 			 is_end,	0,			0 }
+  { NULL,			 is_end,	0,			0 }
 };
 
 lc_names_t lc_monetary_names[] =
@@ -635,7 +690,7 @@ lc_names_t lc_monetary_names[] =
   { "monetary-decimal-point-wc", is_wchar,	_NL_MONETARY_WMON_DECIMAL_POINT, 0 },
   { "monetary-thousands-sep-wc", is_wchar,	_NL_MONETARY_WMON_THOUSANDS_SEP, 0 },
   { "monetary-codeset",		 is_string,	_NL_MONETARY_CODESET,	   0 },
-  { NULL, 			 is_end,	0,			   0 }
+  { NULL,			 is_end,	0,			   0 }
 };
 
 lc_names_t lc_messages_names[] =
@@ -645,7 +700,7 @@ lc_names_t lc_messages_names[] =
   { "yesstr",			 is_string,	YESSTR,			0 },
   { "nostr",			 is_string,	NOSTR,			0 },
   { "messages-codeset",		 is_string,	_NL_MESSAGES_CODESET,	0 },
-  { NULL, 			 is_end,	0,			0 }
+  { NULL,			 is_end,	0,			0 }
 };
 
 void
@@ -739,14 +794,13 @@ int
 main (int argc, char **argv)
 {
   int opt;
-  LCID lcid = 0;
+  wchar_t loc_name[256] = { 0 };
   int all = 0;
   int cat = 0;
   int key = 0;
   int maps = 0;
   int verbose = 0;
   const char *utf = "";
-  char name[32];
 
   setlocale (LC_ALL, "");
   while ((opt = getopt_long (argc, argv, opts, longopts, NULL)) != -1)
@@ -765,19 +819,22 @@ main (int argc, char **argv)
 	maps = 1;
 	break;
       case 'i':
-	lcid = (UINT_PTR) GetKeyboardLayout (0) & 0xffff;
+	GetLocaleInfoW ((UINT_PTR) GetKeyboardLayout (0) & 0xffff, LOCALE_SNAME,
+			loc_name, 256);
 	break;
       case 's':
-	lcid = GetSystemDefaultUILanguage ();
+	GetLocaleInfoW (GetSystemDefaultUILanguage (), LOCALE_SNAME,
+			loc_name, 256);
 	break;
       case 'u':
-	lcid = GetUserDefaultUILanguage ();
+	GetLocaleInfoW (GetUserDefaultUILanguage (), LOCALE_SNAME,
+			loc_name, 256);
 	break;
       case 'f':
-	lcid = GetUserDefaultLCID ();
+	GetUserDefaultLocaleName (loc_name, 256);
 	break;
       case 'n':
-	lcid = GetSystemDefaultLCID ();
+	GetSystemDefaultLocaleName (loc_name, 256);
 	break;
       case 'U':
 	utf = ".UTF-8";
@@ -799,10 +856,13 @@ main (int argc, char **argv)
     print_all_locales (verbose);
   else if (maps)
     print_charmaps ();
-  else if (lcid)
+  else if (loc_name[0])
     {
-      if (getlocale (lcid, name))
-	printf ("%s%s\n", name, utf);
+      wchar_t iso639[10];
+      wchar_t iso3166[10];
+
+      if (getlocale (loc_name, iso639, iso3166, NULL))
+	printf ("%ls_%ls%s", iso639, iso3166, utf);
     }
   else if (optind < argc)
     while (optind < argc)
author	Corinna Vinschen <corinna@vinschen.de>	2023-02-23 02:22:56 +0300
committer	Corinna Vinschen <corinna@vinschen.de>	2023-02-24 18:40:58 +0300
commit	ac405ab9bc07789d5576fd823cbaa7eef472b11b (patch)
tree	61a1d89d4e15b2e27bca21d8c8af49e82e7ad98a /winsup/utils
parent	17ac400c11bab30ac2c0bef12cbf7788f0b6f954 (diff)