External subtitles: add support for UTF-8 without BOM.

Change the default subtitle encoding to UTF-8 and fall back to ANSI if invalid UTF-8 characters are found while reading the file. The file Utf8.h is borrowed from the Notepad++ source code. Thanks to Francois-R. Boyer. This commit fixes ticket #439.
author: Underground78 <underground78@users.sourceforge.net> 2012-10-07 23:36:00 +0400
committer: Underground78 <underground78@users.sourceforge.net> 2012-10-14 21:40:36 +0400
commit: 7cb86ad153e6c6409a49cd1f9546aaa7411c514a (patch)
tree: 7fa46d18b702361faa6cae400c47aa976a5d77a6 /include
parent: 6c0b8b6774334272c16106f9d0919d15bf2b8b3a (diff)
1 files changed, 57 insertions, 0 deletions
diff --git a/include/Utf8.h b/include/Utf8.h
new file mode 100644
index 000000000..eed48df37
--- /dev/null
+++ b/include/Utf8.h
@@ -0,0 +1,57 @@
+// Simple functions to test UTF-8 characters.
+// Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca
+// First version 2010-08
+//
+// Written for notepad++, and distributed under same license:
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either
+// version 2 of the License, or (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+#pragma once
+
+namespace Utf8 { // could be a static class, instead of a namespace, if it needs private members
+	// basic classification of UTF-8 bytes; each predicate looks at a single byte, out of context
+	inline static bool isSingleByte(unsigned char c)       { return c < 0x80; }
+	inline static bool isPartOfMultibyte(unsigned char c)  { return c >= 0x80; }
+	inline static bool isFirstOfMultibyte(unsigned char c) { return c >= 0xC2 && c < 0xF5; } // 0xC0/0xC1 would be overlong; 0xF5 and above would exceed U+10FFFF
+	inline static bool isContinuation(unsigned char c)     { return (c & 0xC0) == 0x80; }
+	inline static bool isValid(unsigned char c)            { return c < 0xC0 || isFirstOfMultibyte(c); }	// rejects only 0xC0, 0xC1 and 0xF5..0xFF
+
+	// number of continuation bytes for a given valid first character (0 for any byte below 0xC0)
+	inline static int  continuationBytes(unsigned char c)  {
+		return (c < 0xC0) ? 0 : (c < 0xE0) ? 1 : (c < 0xF0) ? 2 : 3;
+	}
+
+	// validates the character starting at buf[0], given buflen readable bytes; returns false for an empty buffer, a truncated or
+	// malformed sequence, an overlong form, a surrogate or a value above U+10FFFF (extra continuation bytes belong to the next character)
+	inline static bool isValid(const unsigned char* buf, int buflen) {
+		if(!buf || buflen < 1) return false; // nothing readable: never touch buf[0]
+		if(isSingleByte(buf[0])) return true; // single byte is valid
+		if(!isFirstOfMultibyte(buf[0])) return false; // stray continuation byte or invalid first byte
+		const int trail = continuationBytes(buf[0]);
+		if(buflen < trail+1) return false; // character does not fit in buffer
+		for(int i = 2; i <= trail; ++i)
+			if(!isContinuation(buf[i])) return false; // third and fourth bytes only need to be 10xxxxxx
+		// second byte: E0/F0 raise the minimum (else overlong), ED/F4 lower the maximum (surrogates, beyond U+10FFFF)
+		const unsigned char lo = (buf[0] == 0xE0) ? 0xA0 : (buf[0] == 0xF0) ? 0x90 : 0x80;
+		const unsigned char hi = (buf[0] == 0xED) ? 0x9F : (buf[0] == 0xF4) ? 0x8F : 0xBF;
+		return buf[1] >= lo && buf[1] <= hi; // lo..hi is always inside 0x80..0xBF, so this is also the continuation check
+	}
+
+	// rewinds from startingIndex over at most 5 continuation bytes and returns the index reached; never reads below buf[1] nor rewinds past index 0
+	inline static int characterStart(const unsigned char* buf, int startingIndex) {
+		int back = 0; // continuation bytes stepped over so far
+		while(back < startingIndex && back < 5 && isContinuation(buf[startingIndex-back])) ++back;
+		return startingIndex-back;
+	}
+};
author	Underground78 <underground78@users.sourceforge.net>	2012-10-07 23:36:00 +0400
committer	Underground78 <underground78@users.sourceforge.net>	2012-10-14 21:40:36 +0400
commit	7cb86ad153e6c6409a49cd1f9546aaa7411c514a (patch)
tree	7fa46d18b702361faa6cae400c47aa976a5d77a6 /include
parent	6c0b8b6774334272c16106f9d0919d15bf2b8b3a (diff)