Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Zolotarev <deathbaba@gmail.com>2011-10-04 17:16:13 +0400
committerAlex Zolotarev <alex@maps.me>2015-09-23 01:25:07 +0300
commit1da1ba760229acc9df092ec8e7e21ef22cfaa903 (patch)
treec9199e82d7cb9b7f5da33525646c531fc04588a5 /3party/utfcpp
parentc824cfa2d4b73cc399a9b350406f54e0dc25f757 (diff)
Updated utf8cpp to 2.3.1
Diffstat (limited to '3party/utfcpp')
-rwxr-xr-x[-rw-r--r--]3party/utfcpp/doc/ReleaseNotes11
-rwxr-xr-x[-rw-r--r--]3party/utfcpp/doc/utf8cpp.html188
-rw-r--r--3party/utfcpp/source/utf8.h68
-rw-r--r--3party/utfcpp/source/utf8/checked.h28
-rwxr-xr-x[-rw-r--r--]3party/utfcpp/source/utf8/core.h16
-rwxr-xr-x[-rw-r--r--]3party/utfcpp/source/utf8/unchecked.h2
6 files changed, 251 insertions, 62 deletions
diff --git a/3party/utfcpp/doc/ReleaseNotes b/3party/utfcpp/doc/ReleaseNotes
index 857a72b756..c0c01a3c61 100644..100755
--- a/3party/utfcpp/doc/ReleaseNotes
+++ b/3party/utfcpp/doc/ReleaseNotes
@@ -1,9 +1,12 @@
utf8 cpp library
-Release 2.2.4
+Release 2.3.1
-This is a minor bug fix release that improves converting from utf-16 to utf-8 error detection.
+A bug fix release. Special thanks to dalle and Ivan Sorokin who reported the bugs.
-Changes from version 2.2.3
-- Bug fix [2857454] dereference invalid iterator when lead surrogate was last element of the string.
+Changes from version 2.3
+- Bug fix [3025042]: is_bom documentation issue.
+- Bug fix [3083640]: is_code_point_valid incorrectly returns false.
+- Bug fix [3167987]: prior moves it before start.
+- Bug fix [3185087]: utf8::prior and utf8::previous documentation issue.
Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes
diff --git a/3party/utfcpp/doc/utf8cpp.html b/3party/utfcpp/doc/utf8cpp.html
index 069c2be521..c0a89a27de 100644..100755
--- a/3party/utfcpp/doc/utf8cpp.html
+++ b/3party/utfcpp/doc/utf8cpp.html
@@ -67,7 +67,7 @@
<li>
<a href=#fixinvalid>Ensure that a string contains valid UTF-8 text</a>
</li>
- </li>
+ </ul>
<li>
<a href="#reference">Reference</a>
<ul class="toc">
@@ -275,11 +275,12 @@ octet_iterator append(uint32_t cp, octet_iterator result);
</pre>
<p>
- <code>cp</code>: A 32 bit integer representing a code point to append to the
+ <code>octet_iterator</code>: an output iterator.<br>
+ <code>cp</code>: a 32 bit integer representing a code point to append to the
sequence.<br>
- <code>result</code>: An output iterator to the place in the sequence where to
+ <code>result</code>: an output iterator to the place in the sequence where to
append the code point.<br>
- <span class="return_value">Return value</span>: An iterator pointing to the place
+ <span class="return_value">Return value</span>: an iterator pointing to the place
after the newly appended sequence.
</p>
<p>
@@ -326,6 +327,7 @@ uint32_t next(octet_iterator&amp; it, octet_iterator end);
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
beginning of the next code point.<br>
@@ -370,6 +372,7 @@ uint32_t peek_next(octet_iterator it, octet_iterator end);
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: an iterator pointing to the beginning of an UTF-8
encoded code point.<br>
<code>end</code>: end of the UTF-8 sequence to be processed. If <code>it</code>
@@ -400,7 +403,7 @@ assert (w == twochars);
Available in version 1.02 and later.
</p>
<p>
- Given a reference to an iterator pointing to an octet in a UTF-8 seqence, it
+ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it
decreases the iterator until it hits the beginning of the previous UTF-8 encoded
code point and returns the 32 bits representation of the code point.
</p>
@@ -411,6 +414,7 @@ uint32_t prior(octet_iterator&amp; it, octet_iterator start);
</pre>
<p>
+ <code>octet_iterator</code>: a bidirectional iterator.<br>
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.<br>
@@ -437,7 +441,9 @@ assert (w == twochars);
This function has two purposes: one is two iterate backwards through a UTF-8
encoded string. Note that it is usually a better idea to iterate forward instead,
since <code>utf8::next</code> is faster. The second purpose is to find a beginning
- of a UTF-8 sequence if we have a random position within a string.
+ of a UTF-8 sequence if we have a random position within a string. Note that in that
+ case <code>utf8::prior</code> may not detect an invalid UTF-8 sequence in some scenarios:
+ for instance if there are superfluous trail octets, it will just skip them.
</p>
<p>
<code>it</code> will typically point to the beginning of
@@ -447,10 +453,12 @@ assert (w == twochars);
beginning with that octet is decoded to a 32 bit representation and returned.
</p>
<p>
- In case <code>pass_end</code> is reached before a UTF-8 lead octet is hit, or if an
+ In case <code>start</code> is reached before a UTF-8 lead octet is hit, or if an
invalid UTF-8 sequence is started by the lead octet, an <code>invalid_utf8</code>
exception is thrown.
</p>
+ <p>In case <code>start</code> equals <code>it</code>, a <code>not_enough_room</code>
+ exception is thrown.</p>
<h4>
utf8::previous
</h4>
@@ -469,6 +477,7 @@ uint32_t previous(octet_iterator&amp; it, octet_iterator pass_start);
</pre>
<p>
+ <code>octet_iterator</code>: a random access iterator.<br>
<code>it</code>: a reference pointing to an octet within a UTF-8 encoded string.
After the function returns, it is decremented to point to the beginning of the
previous code point.<br>
@@ -507,7 +516,7 @@ assert (w == twochars);
beginning with that octet is decoded to a 32 bit representation and returned.
</p>
<p>
- In case <code>pass_end</code> is reached before a UTF-8 lead octet is hit, or if an
+ In case <code>pass_start</code> is reached before a UTF-8 lead octet is hit, or if an
invalid UTF-8 sequence is started by the lead octet, an <code>invalid_utf8</code>
exception is thrown
</p>
@@ -529,6 +538,8 @@ assert (w == twochars);
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
+ <code>distance_type</code>: an integral type convertible to <code>octet_iterator</code>'s difference type.<br>
<code>it</code>: a reference to an iterator pointing to the beginning of an UTF-8
encoded code point. After the function returns, it is incremented to point to the
nth following code point.<br>
@@ -574,8 +585,9 @@ assert (w == twochars + <span class="literal">5</span>);
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
<code>first</code>: an iterator to a beginning of a UTF-8 encoded code point.<br>
- <code>last</code>: an iterator to a "post-end" of the last UTF-8 encoded code
+ <code>last</code>: an iterator to a "post-end" of the last UTF-8 encoded code
point in the sequence we are trying to determine the length. It can be the
beginning of a new code point, or not.<br>
<span class="return_value">Return value</span> the distance between the iterators,
@@ -619,6 +631,8 @@ octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_itera
</pre>
<p>
+ <code>u16bit_iterator</code>: an input iterator.<br>
+ <code>octet_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-16 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-16 encoded
@@ -661,6 +675,8 @@ u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_itera
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
+ <code>u16bit_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
string to convert. &lt; br /&gt; <code>end</code>: an iterator pointing to
pass-the-end of the UTF-8 encoded string to convert.<br>
@@ -705,6 +721,8 @@ octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_itera
</pre>
<p>
+ <code>octet_iterator</code>: an output iterator.<br>
+ <code>u32bit_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-32 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-32 encoded
@@ -747,6 +765,8 @@ u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_itera
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
+ <code>u32bit_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 encoded
string to convert.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 encoded string
@@ -787,6 +807,7 @@ assert (utf32result.size() == <span class="literal">2</span>);
octet_iterator find_invalid(octet_iterator start, octet_iterator end);
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
test for validity.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to test
@@ -827,6 +848,7 @@ assert (invalid == utf_invalid + <span class="literal">5</span>);
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
test for validity.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to test
@@ -868,6 +890,8 @@ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
+ <code>output_iterator</code>: an output iterator.<br>
<code>start</code>: an iterator pointing to the beginning of the UTF-8 string to
look for invalid UTF-8 sequences.<br>
<code>end</code>: an iterator pointing to pass-the-end of the UTF-8 string to look
@@ -905,10 +929,47 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<code>utf8::not_enough_room</code> exception is thrown.
</p>
<h4>
+ utf8::starts_with_bom
+ </h4>
+ <p class="version">
+ Available in version 2.3 and later. Replaces deprecated <code>is_bom()</code> function.
+ </p>
+ <p>
+ Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM)
+ </p>
+<pre>
+<span class="keyword">template</span> &lt;<span class=
+"keyword">typename</span> octet_iterator&gt;
+<span class="keyword">bool</span> starts_with_bom (octet_iterator it, octet_iterator end);
+</pre>
+ <p>
+ <code>octet_iterator</code>: an input iterator.<br>
+ <code>it</code>: beginning of the octet sequence to check<br>
+ <code>end</code>: pass-end of the sequence to check<br>
+ <span class="return_value">Return value</span>: <code>true</code> if the sequence
+ starts with a UTF-8 byte order mark; <code>false</code> if not.
+ </p>
+ <p>
+ Example of use:
+ </p>
+<pre>
+<span class="keyword">unsigned char</span> byte_order_mark[] = {<span class=
+"literal">0xef</span>, <span class="literal">0xbb</span>, <span class=
+"literal">0xbf</span>};
+<span class="keyword">bool</span> bbom = starts_with_bom(byte_order_mark, byte_order_mark + <span class="keyword">sizeof</span>(byte_order_mark));
+assert (bbom == <span class="literal">true</span>);
+</pre>
+ <p>
+ The typical use of this function is to check the first three bytes of a file. If
+ they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
+ encoded text.
+ </p>
+ <h4>
utf8::is_bom
</h4>
<p class="version">
- Available in version 1.0 and later.
+ Available in version 1.0 and later. Deprecated in version 2.3. <code>starts_with_bom()</code> should be used
+ instead.
</p>
<p>
Checks whether a sequence of three octets is a UTF-8 byte order mark (BOM)
@@ -916,9 +977,10 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(),
<pre>
<span class="keyword">template</span> &lt;<span class=
"keyword">typename</span> octet_iterator&gt;
-<span class="keyword">bool</span> is_bom (octet_iterator it);
+<span class="keyword">bool</span> is_bom (octet_iterator it); <span class="comment"> // Deprecated</span>
</pre>
<p>
+ <code>octet_iterator</code>: an input iterator.<br>
<code>it</code>: beginning of the 3-octet sequence to check<br>
<span class="return_value">Return value</span>: <code>true</code> if the sequence
is UTF-8 byte order mark; <code>false</code> if not.
@@ -938,9 +1000,113 @@ assert (bbom == <span class="literal">true</span>);
they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8
encoded text.
</p>
+ <p>
+ If a sequence is
+ shorter than three bytes, an invalid iterator will be dereferenced. Therefore, this function is deprecated
+ in favor of <code>starts_with_bom()</code> that takes the end of sequence as an argument.
+ </p>
<h3 id="typesutf8">
Types From utf8 Namespace
</h3>
+ <h4>utf8::exception
+ </h4>
+ <p class="version">
+ Available in version 2.3 and later.
+ </p>
+ <p>
+ Base class for the exceptions thrown by UTF CPP library functions.
+ </p>
+<pre>
+<span class="keyword">class</span> exception : <span class="keyword">public</span> std::exception {};
+</pre>
+ <p>
+ Example of use:
+ </p>
+<pre>
+<span class="keyword">try</span> {
+ code_that_uses_utf_cpp_library();
+}
+<span class="keyword">catch</span>(<span class="keyword">const</span> utf8::exception&amp; utfcpp_ex) {
+ cerr &lt;&lt; utfcpp_ex.what();
+}
+</pre>
+
+ <h4>utf8::invalid_code_point
+ </h4>
+ <p class="version">
+ Available in version 1.0 and later.
+ </p>
+ <p>
+ Thrown by UTF8 CPP functions such as <code>advance</code> and <code>next</code> if an UTF-8 sequence represents an invalid code point.
+ </p>
+
+<pre>
+<span class="keyword">class</span> invalid_code_point : <span class="keyword">public</span> exception {
+<span class="keyword">public</span>:
+ uint32_t code_point() <span class="keyword">const</span>;
+};
+
+</pre>
+ <p>
+ Member function <code>code_point()</code> can be used to determine the invalid code point that
+ caused the exception to be thrown.
+ </p>
+ <h4>utf8::invalid_utf8
+ </h4>
+ <p class="version">
+ Available in version 1.0 and later.
+ </p>
+ <p>
+ Thrown by UTF8 CPP functions such as <code>next</code> and <code>prior</code> if an invalid UTF-8 sequence
+ is detected during decoding.
+ </p>
+
+<pre>
+<span class="keyword">class</span> invalid_utf8 : <span class="keyword">public</span> exception {
+<span class="keyword">public</span>:
+ uint8_t utf8_octet() <span class="keyword">const</span>;
+};
+</pre>
+
+ <p>
+ Member function <code>utf8_octet()</code> can be used to determine the beginning of the byte
+ sequence that caused the exception to be thrown.
+ </p>
+</pre>
+ <h4>utf8::invalid_utf16
+ </h4>
+ <p class="version">
+ Available in version 1.0 and later.
+ </p>
+ <p>
+ Thrown by UTF8 CPP function <code>utf16to8</code> if an invalid UTF-16 sequence
+ is detected during decoding.
+ </p>
+
+<pre>
+<span class="keyword">class</span> invalid_utf16 : <span class="keyword">public</span> exception {
+<span class="keyword">public</span>:
+ uint16_t utf16_word() <span class="keyword">const</span>;
+};
+</pre>
+
+ <p>
+ Member function <code>utf16_word()</code> can be used to determine the UTF-16 code unit
+ that caused the exception to be thrown.
+ </p>
+ <h4>utf8::not_enough_room
+ </h4>
+ <p class="version">
+ Available in version 1.0 and later.
+ </p>
+ <p>
+ Thrown by UTF8 CPP functions such as <code>next</code> if the end of the decoded UTF-8 sequence
+ was reached before the code point was decoded.
+ </p>
+
+<pre>
+<span class="keyword">class</span> not_enough_room : <span class="keyword">public</span> exception {};
+</pre>
<h4>
utf8::iterator
</h4>
diff --git a/3party/utfcpp/source/utf8.h b/3party/utfcpp/source/utf8.h
index 82b13f59f9..4e4451403f 100644
--- a/3party/utfcpp/source/utf8.h
+++ b/3party/utfcpp/source/utf8.h
@@ -1,34 +1,34 @@
-// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include "utf8/checked.h"
-#include "utf8/unchecked.h"
-
-#endif // header guard
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "utf8/checked.h"
+#include "utf8/unchecked.h"
+
+#endif // header guard
diff --git a/3party/utfcpp/source/utf8/checked.h b/3party/utfcpp/source/utf8/checked.h
index ff75eb7892..9cb8d2c7f2 100644
--- a/3party/utfcpp/source/utf8/checked.h
+++ b/3party/utfcpp/source/utf8/checked.h
@@ -33,8 +33,12 @@ DEALINGS IN THE SOFTWARE.
namespace utf8
{
+ // Base for the exceptions that may be thrown from the library
+ class exception : public std::exception {
+ };
+
// Exceptions that may be thrown from the library functions.
- class invalid_code_point : public std::exception {
+ class invalid_code_point : public exception {
uint32_t cp;
public:
invalid_code_point(uint32_t cp) : cp(cp) {}
@@ -42,7 +46,7 @@ namespace utf8
uint32_t code_point() const {return cp;}
};
- class invalid_utf8 : public std::exception {
+ class invalid_utf8 : public exception {
uint8_t u8;
public:
invalid_utf8 (uint8_t u) : u8(u) {}
@@ -50,7 +54,7 @@ namespace utf8
uint8_t utf8_octet() const {return u8;}
};
- class invalid_utf16 : public std::exception {
+ class invalid_utf16 : public exception {
uint16_t u16;
public:
invalid_utf16 (uint16_t u) : u16(u) {}
@@ -58,7 +62,7 @@ namespace utf8
uint16_t utf16_word() const {return u16;}
};
- class not_enough_room : public std::exception {
+ class not_enough_room : public exception {
public:
virtual const char* what() const throw() { return "Not enough space"; }
};
@@ -157,13 +161,17 @@ namespace utf8
template <typename octet_iterator>
uint32_t prior(octet_iterator& it, octet_iterator start)
- {
- octet_iterator end = it;
+ {
+ // can't do much if it == start
+ if (it == start)
+ throw not_enough_room();
+
+ octet_iterator end = it;
+ // Go back until we hit either a lead octet or start
while (internal::is_trail(*(--it)))
- if (it < start)
+ if (it == start)
throw invalid_utf8(*it); // error - no lead byte in the sequence
- octet_iterator temp = it;
- return next(temp, end);
+ return peek_next(it, end);
}
/// Deprecated in versions that include "prior"
@@ -249,7 +257,7 @@ namespace utf8
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
- while (start < end)
+ while (start != end)
(*result++) = next(start, end);
return result;
diff --git a/3party/utfcpp/source/utf8/core.h b/3party/utfcpp/source/utf8/core.h
index 5a55f0655c..268cf7cd48 100644..100755
--- a/3party/utfcpp/source/utf8/core.h
+++ b/3party/utfcpp/source/utf8/core.h
@@ -92,7 +92,7 @@ namespace internal
template <typename u32>
inline bool is_code_point_valid(u32 cp)
{
- return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
+ return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
}
template <typename octet_iterator>
@@ -112,7 +112,8 @@ namespace internal
return 0;
}
- inline bool is_overlong_sequence(uint32_t cp, int length)
+ template <typename octet_difference_type>
+ inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
if (cp < 0x80) {
if (length != 1)
@@ -331,6 +332,17 @@ namespace internal
}
template <typename octet_iterator>
+ inline bool starts_with_bom (octet_iterator it, octet_iterator end)
+ {
+ return (
+ ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
+ ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
+ ((it != end) && (internal::mask8(*it)) == bom[2])
+ );
+ }
+
+ //Deprecated in release 2.3
+ template <typename octet_iterator>
inline bool is_bom (octet_iterator it)
{
return (
diff --git a/3party/utfcpp/source/utf8/unchecked.h b/3party/utfcpp/source/utf8/unchecked.h
index d3110cb88e..2f3eb4d1d0 100644..100755
--- a/3party/utfcpp/source/utf8/unchecked.h
+++ b/3party/utfcpp/source/utf8/unchecked.h
@@ -144,7 +144,7 @@ namespace utf8
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
- while (start != end) {
+ while (start < end) {
uint32_t cp = next(start);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);