From 3e8426170ce005c111dfcae7982e18b647b7383f Mon Sep 17 00:00:00 2001 From: wm4 Date: Tue, 2 Sep 2014 20:48:45 +0200 Subject: avformat/assdec: UTF-16 support Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as a converting wrapper around AVIOContext. It can also work on a static buffer, which is needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. Fix Ticket #3496. --- libavformat/assdec.c | 19 +++++++++------ libavformat/subtitles.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ libavformat/subtitles.h | 56 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 7 deletions(-) (limited to 'libavformat') diff --git a/libavformat/assdec.c b/libavformat/assdec.c index bb953c7276..a5f792aee4 100644 --- a/libavformat/assdec.c +++ b/libavformat/assdec.c @@ -33,10 +33,13 @@ typedef struct ASSContext { static int ass_probe(AVProbeData *p) { - const char *header = "[Script Info]"; + char buf[13]; + FFTextReader tr; + ff_text_init_buf(&tr, p->buf, p->buf_size); - if (!memcmp(p->buf, header, strlen(header)) || - !memcmp(p->buf + 3, header, strlen(header))) + ff_text_read(&tr, buf, sizeof(buf)); + + if (!memcmp(buf, "[Script Info]", 13)) return AVPROBE_SCORE_MAX; return 0; @@ -66,13 +69,13 @@ static int read_ts(const uint8_t *p, int64_t *start, int *duration) return -1; } -static int64_t get_line(AVBPrint *buf, AVIOContext *pb) +static int64_t get_line(AVBPrint *buf, FFTextReader *tr) { - int64_t pos = avio_tell(pb); + int64_t pos = ff_text_pos(tr); av_bprint_clear(buf); for (;;) { - char c = avio_r8(pb); + char c = ff_text_r8(tr); if (!c) break; av_bprint_chars(buf, c, 1); @@ -88,6 +91,8 @@ static int ass_read_header(AVFormatContext *s) AVBPrint header, line; int header_remaining, res = 0; AVStream *st; + FFTextReader tr; + ff_text_init_avio(&tr, s->pb); st = avformat_new_stream(s, NULL); if (!st) @@ -102,7 +107,7 @@ static int 
ass_read_header(AVFormatContext *s) av_bprint_init(&line, 0, AV_BPRINT_SIZE_UNLIMITED); for (;;) { - int64_t pos = get_line(&line, s->pb); + int64_t pos = get_line(&line, &tr); if (!line.str[0]) // EOF break; diff --git a/libavformat/subtitles.c b/libavformat/subtitles.c index fce2bf190b..cebd453965 100644 --- a/libavformat/subtitles.c +++ b/libavformat/subtitles.c @@ -20,9 +20,72 @@ #include "avformat.h" #include "subtitles.h" +#include "avio_internal.h" #include "libavutil/avassert.h" #include "libavutil/avstring.h" +void ff_text_init_avio(FFTextReader *r, AVIOContext *pb) +{ + int i; + r->pb = pb; + r->buf_pos = r->buf_len = 0; + r->type = FF_UTF_8; + for (i = 0; i < 2; i++) + r->buf[r->buf_len++] = avio_r8(r->pb); + if (strncmp("\xFF\xFE", r->buf, 2) == 0) { + r->type = FF_UTF16LE; + r->buf_pos += 2; + } else if (strncmp("\xFE\xFF", r->buf, 2) == 0) { + r->type = FF_UTF16BE; + r->buf_pos += 2; + } else { + r->buf[r->buf_len++] = avio_r8(r->pb); + if (strncmp("\xEF\xBB\xBF", r->buf, 3) == 0) { + // UTF8 + r->buf_pos += 3; + } + } +} + +void ff_text_init_buf(FFTextReader *r, void *buf, size_t size) +{ + memset(&r->buf_pb, 0, sizeof(r->buf_pb)); + ffio_init_context(&r->buf_pb, buf, size, 0, NULL, NULL, NULL, NULL); + ff_text_init_avio(r, &r->buf_pb); +} + +int64_t ff_text_pos(FFTextReader *r) +{ + return avio_tell(r->pb) - r->buf_len + r->buf_pos; +} + +int ff_text_r8(FFTextReader *r) +{ + uint32_t val; + uint8_t tmp; + if (r->buf_pos < r->buf_len) + return r->buf[r->buf_pos++]; + if (r->type == FF_UTF16LE) { + GET_UTF16(val, avio_rl16(r->pb), return 0;) + } else if (r->type == FF_UTF16BE) { + GET_UTF16(val, avio_rb16(r->pb), return 0;) + } else { + return avio_r8(r->pb); + } + if (!val) + return 0; + r->buf_pos = 0; + r->buf_len = 0; + PUT_UTF8(val, tmp, r->buf[r->buf_len++] = tmp;) + return r->buf[r->buf_pos++]; // buf_len is at least 1 +} + +void ff_text_read(FFTextReader *r, char *buf, size_t size) +{ + for ( ; size > 0; size--) + *buf++ = ff_text_r8(r); +} + 
AVPacket *ff_subtitles_queue_insert(FFDemuxSubtitlesQueue *q, const uint8_t *event, int len, int merge) { diff --git a/libavformat/subtitles.h b/libavformat/subtitles.h index b5a96ec08c..38502d6f86 100644 --- a/libavformat/subtitles.h +++ b/libavformat/subtitles.h @@ -30,6 +30,62 @@ enum sub_sort { SUB_SORT_POS_TS, ///< sort by position, then timestamps }; +enum ff_utf_type { + FF_UTF_8, // or other 8 bit encodings + FF_UTF16LE, + FF_UTF16BE, +}; + +typedef struct { + int type; + AVIOContext *pb; + unsigned char buf[8]; + int buf_pos, buf_len; + AVIOContext buf_pb; +} FFTextReader; + +/** + * Initialize the FFTextReader from the given AVIOContext. This function will + * read some bytes from pb, and test for UTF-8 or UTF-16 BOMs. Further accesses + * to FFTextReader will read more data from pb. + * + * The purpose of FFTextReader is to transparently convert read data to UTF-8 + * if the stream had a UTF-16 BOM. + * + * @param r object which will be initialized + * @param pb stream to read from (referenced as long as FFTextReader is in use) + */ +void ff_text_init_avio(FFTextReader *r, AVIOContext *pb); + +/** + * Similar to ff_text_init_avio(), but sets it up to read from a bounded buffer. + * + * @param r object which will be initialized + * @param buf buffer to read from (referenced as long as FFTextReader is in use) + * @param size size of buf + */ +void ff_text_init_buf(FFTextReader *r, void *buf, size_t size); + +/** + * Return the byte position of the next byte returned by ff_text_r8(). For + * UTF-16 source streams, this will return the original position, but it will + * be incorrect if a codepoint was only partially read with ff_text_r8(). + */ +int64_t ff_text_pos(FFTextReader *r); + +/** + * Return the next byte. The return value is always 0 - 255. Returns 0 on EOF. + * If the source stream is UTF-16, this reads from the stream converted to + * UTF-8. On invalid UTF-16, 0 is returned. 
+ */ +int ff_text_r8(FFTextReader *r); + +/** + * Read the given number of bytes (in UTF-8). On error or EOF, \0 bytes are + * written. + */ +void ff_text_read(FFTextReader *r, char *buf, size_t size); + typedef struct { AVPacket *subs; ///< array of subtitles packets int nb_subs; ///< number of subtitles packets -- cgit v1.2.3