diff options
Diffstat (limited to 'src/vrcommon/strtools_public.cpp')
-rw-r--r-- | src/vrcommon/strtools_public.cpp | 194 |
1 files changed, 109 insertions, 85 deletions
diff --git a/src/vrcommon/strtools_public.cpp b/src/vrcommon/strtools_public.cpp index 61cf324..661325a 100644 --- a/src/vrcommon/strtools_public.cpp +++ b/src/vrcommon/strtools_public.cpp @@ -4,7 +4,11 @@ #include <stdio.h> #include <stdlib.h> #include <sstream> +#include <codecvt> #include <iostream> +#include <functional> +#include <locale> +#include <codecvt> //----------------------------------------------------------------------------- // Purpose: @@ -49,96 +53,36 @@ bool StringHasSuffixCaseSensitive( const std::string &sString, const std::string //----------------------------------------------------------------------------- // Purpose: //----------------------------------------------------------------------------- + std::string UTF16to8(const wchar_t * in) { - std::string out; - unsigned int codepoint = 0; - for ( ; in && *in != 0; ++in ) + try { - if (*in >= 0xd800 && *in <= 0xdbff) - codepoint = ((*in - 0xd800) << 10) + 0x10000; - else - { - if (*in >= 0xdc00 && *in <= 0xdfff) - codepoint |= *in - 0xdc00; - else - codepoint = *in; + typedef std::codecvt_utf8< wchar_t > convert_type; + std::wstring_convert< convert_type, wchar_t > converter; - if (codepoint <= 0x7f) - out.append(1, static_cast<char>(codepoint)); - else if (codepoint <= 0x7ff) - { - out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f))); - out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f))); - } - else if (codepoint <= 0xffff) - { - out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f))); - out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f))); - out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f))); - } - else - { - out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07))); - out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f))); - out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f))); - out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f))); - } - codepoint = 0; - } + return converter.to_bytes( in ); + } + catch ( ... ) + { + return std::string(); } - return out; } + std::wstring UTF8to16(const char * in) { - std::wstring out; - unsigned int codepoint = 0; - int following = 0; - for ( ; in && *in != 0; ++in ) + try { - unsigned char ch = *in; - if (ch <= 0x7f) - { - codepoint = ch; - following = 0; - } - else if (ch <= 0xbf) - { - if (following > 0) - { - codepoint = (codepoint << 6) | (ch & 0x3f); - --following; - } - } - else if (ch <= 0xdf) - { - codepoint = ch & 0x1f; - following = 1; - } - else if (ch <= 0xef) - { - codepoint = ch & 0x0f; - following = 2; - } - else - { - codepoint = ch & 0x07; - following = 3; - } - if (following == 0) - { - if (codepoint > 0xffff) - { - out.append(1, static_cast<wchar_t>(0xd800 + (codepoint >> 10))); - out.append(1, static_cast<wchar_t>(0xdc00 + (codepoint & 0x03ff))); - } - else - out.append(1, static_cast<wchar_t>(codepoint)); - codepoint = 0; - } + typedef std::codecvt_utf8< wchar_t > convert_type; + std::wstring_convert< convert_type, wchar_t > converter; + + return converter.from_bytes( in ); + } + catch ( ... ) + { + return std::wstring(); } - return out; } @@ -245,11 +189,30 @@ int iHexCharToInt( char cValue ) return -1; } + +//----------------------------------------------------------------------------- +// Purpose: These define the set of characters to filter for components (which +// need all the escaping we can muster) vs. paths (which don't want +// / and : escaped so we don't break less compliant URL handling code. +//----------------------------------------------------------------------------- +static bool CharNeedsEscape_Component( const char c ) +{ + return (!(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9') + && c != '-' && c != '_' && c != '.'); +} +static bool CharNeedsEscape_FullPath( const char c ) +{ + return (!(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9') + && c != '-' && c != '_' && c != '.' && c != '/' && c != ':' ); +} + + //----------------------------------------------------------------------------- // Purpose: Internal implementation of encode, works in the strict RFC manner, or // with spaces turned to + like HTML form encoding. //----------------------------------------------------------------------------- -void V_URLEncodeInternal( char *pchDest, int nDestLen, const char *pchSource, int nSourceLen, bool bUsePlusForSpace ) +void V_URLEncodeInternal( char *pchDest, int nDestLen, const char *pchSource, int nSourceLen, + bool bUsePlusForSpace, std::function< bool(const char)> fnNeedsEscape ) { //AssertMsg( nDestLen > 3*nSourceLen, "Target buffer for V_URLEncode should be 3x source length, plus one for terminating null\n" ); @@ -267,9 +230,7 @@ void V_URLEncodeInternal( char *pchDest, int nDestLen, const char *pchSource, in // We allow only a-z, A-Z, 0-9, period, underscore, and hyphen to pass through unescaped. // These are the characters allowed by both the original RFC 1738 and the latest RFC 3986. // Current specs also allow '~', but that is forbidden under original RFC 1738. - if ( !( pchSource[i] >= 'a' && pchSource[i] <= 'z' ) && !( pchSource[i] >= 'A' && pchSource[i] <= 'Z' ) && !(pchSource[i] >= '0' && pchSource[i] <= '9' ) - && pchSource[i] != '-' && pchSource[i] != '_' && pchSource[i] != '.' - ) + if ( fnNeedsEscape( pchSource[i] ) ) { if ( bUsePlusForSpace && pchSource[i] == ' ' ) { @@ -397,15 +358,19 @@ size_t V_URLDecodeInternal( char *pchDecodeDest, int nDecodeDestLen, const char //----------------------------------------------------------------------------- void V_URLEncode( char *pchDest, int nDestLen, const char *pchSource, int nSourceLen ) { - return V_URLEncodeInternal( pchDest, nDestLen, pchSource, nSourceLen, true ); + return V_URLEncodeInternal( pchDest, nDestLen, pchSource, nSourceLen, true, CharNeedsEscape_Component ); } void V_URLEncodeNoPlusForSpace( char *pchDest, int nDestLen, const char *pchSource, int nSourceLen ) { - return V_URLEncodeInternal( pchDest, nDestLen, pchSource, nSourceLen, false ); + return V_URLEncodeInternal( pchDest, nDestLen, pchSource, nSourceLen, false, CharNeedsEscape_Component ); } +void V_URLEncodeFullPath( char *pchDest, int nDestLen, const char *pchSource, int nSourceLen ) +{ + return V_URLEncodeInternal( pchDest, nDestLen, pchSource, nSourceLen, false, CharNeedsEscape_FullPath ); +} //----------------------------------------------------------------------------- // Purpose: Decodes a string (or binary data) from URL encoding format, see rfc1738 section 2.2. @@ -458,3 +423,62 @@ std::vector<std::string> TokenizeString( const std::string & sString, char cToke return vecStrings; } +//----------------------------------------------------------------------------- +// Purpose: Repairs a should-be-UTF-8 string to a for-sure-is-UTF-8 string, plus return boolean if we subbed in '?' somewhere +//----------------------------------------------------------------------------- +bool RepairUTF8( const char *pbegin, const char *pend, std::string & sOutputUtf8 ) +{ + typedef std::codecvt_utf8<char32_t> facet_type; + facet_type myfacet; + + std::mbstate_t mystate = std::mbstate_t(); + + sOutputUtf8.clear(); + sOutputUtf8.reserve( pend - pbegin ); + bool bSqueakyClean = true; + + const char *pmid = pbegin; + + while ( pmid != pend ) + { + char32_t out = 0xdeadbeef, *pout; + pbegin = pmid; + switch ( myfacet.in( mystate, pbegin, pend, pmid, &out, &out + 1, pout ) ) + { + case facet_type::ok: + // could convert back, but no need + for ( const char *p = pbegin; p != pmid; ++p ) + { + sOutputUtf8 += *p; + } + break; + + case facet_type::noconv: + // unexpected! always converting type + bSqueakyClean = false; + break; + + case facet_type::partial: + sOutputUtf8 += '?'; + pmid++; // partial consumes 0, so make progress + bSqueakyClean = false; + break; + + case facet_type::error: + sOutputUtf8 += '?'; + // error consumes some bytes + bSqueakyClean = false; + break; + } + } + + return bSqueakyClean; +} + +//----------------------------------------------------------------------------- +// Purpose: Repairs a should-be-UTF-8 string to a for-sure-is-UTF-8 string, plus return boolean if we subbed in '?' somewhere +//----------------------------------------------------------------------------- +bool RepairUTF8( const std::string & sInputUtf8, std::string & sOutputUtf8 ) +{ + return RepairUTF8( sInputUtf8.data(), sInputUtf8.data() + sInputUtf8.size(), sOutputUtf8 ); +} |