#include "stdafx.h"
#include "RealTextParser.h"

namespace
{
	// Milliseconds per timecode field, counted from the right-hand side of a
	// "dd:hh:mm:ss.xyz" timecode: fraction, seconds, minutes, hours, days.
	const int g_aiTimecodeUnitInMillisecs[] = { 1, 1000, 60 * 1000, 60 * 60 * 1000, 24 * 60 * 60 * 1000 };
	const size_t g_iLastTimecodeUnit = sizeof(g_aiTimecodeUnitInMillisecs) / sizeof(g_aiTimecodeUnitInMillisecs[0]) - 1;

	// Converts one timecode field to milliseconds.
	// p_szField is taken by value because the fractional field is normalised in place.
	int TimecodeFieldToMillisecs(wstring p_szField, size_t p_iUnit)
	{
		if (p_iUnit == 0) {
			// Fraction of a second: ".5" means 500 ms and ".1234" means 123 ms,
			// so pad or truncate to exactly three digits before converting.
			p_szField.resize(3, L'0');
		}
		return g_aiTimecodeUnitInMillisecs[p_iUnit] * ::_wtoi(p_szField.c_str());
	}
}

// Creates a parser with no font filtering, font sizes limited to 14..25,
// a default subtitle duration of 4000 ms and lenient error handling.
CRealTextParser::CRealTextParser(void):
	m_bIgnoreFont(false),
	m_bIgnoreFontSize(false),
	m_bIgnoreFontColor(false),
	m_bIgnoreFontWeight(false),
	m_bIgnoreFontFace(false),
	m_iMinFontSize(14),
	m_iMaxFontSize(25),
	m_iDefaultSubtitleDurationInMillisecs(4000),
	m_bTryToIgnoreErrors(true)
{
}

CRealTextParser::~CRealTextParser(void)
{
}

// Parses a complete RealText document and stores the rendered lines in
// m_RealText.m_mapLines, keyed by (start, end) timecodes in milliseconds.
//
// p_szFile is taken by value because it is consumed from the front while parsing.
// Returns false as soon as a tag or text run cannot be extracted.
bool CRealTextParser::ParseRealText(wstring p_szFile)
{
	vector<int> vStartTimecodes;
	vector<int> vEndTimecodes;
	bool bPrevEndTimeMissing(false);
	list<Tag> listTags;
	list<Tag> listPreviousOpenTags;

	while (p_szFile.length() > 0) {
		if (p_szFile.at(0) != '<') {
			// Plain text up to the next tag
			Tag oTextTag;
			if (!ExtractTextTag(p_szFile, oTextTag))
				return false;

			listTags.push_back(oTextTag);
			continue;
		}

		Tag oTag;
		if (!ExtractTag(p_szFile, oTag))
			return false;

		if (oTag.m_bComment)
			continue;

		if (oTag.m_szName == L"time") {
			int iStartTimecode = GetTimecode(oTag.m_mapAttributes[L"begin"]);
			int iEndTimecode = GetTimecode(oTag.m_mapAttributes[L"end"]);

			// A new <time> tag terminates the line collected so far
			// FilterReduntantTags(listTags);
			wstring szLine = RenderTags(listTags);

			if (bPrevEndTimeMissing) {
				pair<int, int> pairTimecodes(vStartTimecodes.back(), iStartTimecode);

				// Fix issues where the next time code isn't valid end time code for the previous subtitle
				if (pairTimecodes.first >= pairTimecodes.second) {
					pairTimecodes.second = pairTimecodes.first + m_iDefaultSubtitleDurationInMillisecs;
				}

				if (szLine.length() > 0)
					m_RealText.m_mapLines[pairTimecodes] = szLine;

				bPrevEndTimeMissing = false;
			} else if (vStartTimecodes.size() > 0 && vEndTimecodes.size() > 0) {
				pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back());

				if (szLine.length() > 0)
					m_RealText.m_mapLines[pairTimecodes] = szLine;
			}

			vStartTimecodes.push_back(iStartTimecode);
			if (iEndTimecode <= 0) {
				// No usable end time: it is derived from the next <time> tag (or the default duration)
				bPrevEndTimeMissing = true;
			} else {
				vEndTimecodes.push_back(iEndTimecode);
			}
		} else if (oTag.m_szName == L"b" || oTag.m_szName == L"i" || oTag.m_szName == L"font") {
			// Track which formatting tags are open so they survive a <clear/>
			if (oTag.m_bOpen)
				listPreviousOpenTags.push_back(oTag);

			if (oTag.m_bClose)
				PopTag(listPreviousOpenTags, oTag.m_szName);

			listTags.push_back(oTag);
		} else if (oTag.m_szName == L"clear") {
			listTags.clear();

			// set existing tags
			listTags.insert(listTags.end(), listPreviousOpenTags.begin(), listPreviousOpenTags.end());
		} else if (oTag.m_szName == L"window") {
			if (oTag.m_bOpen)
				m_RealText.m_WindowTag = oTag;

			// Ignore close
		} else if (oTag.m_szName == L"center") {
			m_RealText.m_bCenter = true;
		} else if (oTag.m_szName == L"required") {
			// Ignore
		} else if (oTag.m_szName == L"") {
			// Ignore
		} else {
			// assume formating tag (handled later)
			listTags.push_back(oTag);
		}
	}

	// Handle final line
	// FilterReduntantTags(listTags);
	wstring szLine = RenderTags(listTags);

	if (bPrevEndTimeMissing) {
		pair<int, int> pairTimecodes(vStartTimecodes.back(), vStartTimecodes.back() + m_iDefaultSubtitleDurationInMillisecs);

		if (szLine.length() > 0)
			m_RealText.m_mapLines[pairTimecodes] = szLine;

		bPrevEndTimeMissing = false;
	} else if (vStartTimecodes.size() > 0 && vEndTimecodes.size() > 0) {
		pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back());

		if (szLine.length() > 0)
			m_RealText.m_mapLines[pairTimecodes] = szLine;
	}

	return true;
}

// Returns the subtitles collected by ParseRealText().
const CRealTextParser::Subtitles& CRealTextParser::GetParsedSubtitles()
{
	return m_RealText;
}

// Extracts the tag at the front of p_rszLine into p_rTag and removes it
// (including the closing '>') from p_rszLine.
//
// Comments ("<!...>") set m_bComment and store the text between '<' and the
// final '>' in m_szName. For ordinary tags the lower-cased name, the
// open/close flags and the attributes are filled in.
//
// Returns false if no complete tag could be extracted. When
// m_bTryToIgnoreErrors is set, leading garbage, unterminated comments and
// junk before the closing '>' are skipped instead of failing.
bool CRealTextParser::ExtractTag(wstring& p_rszLine, Tag& p_rTag)
{
	if (p_rszLine.length() < 2 || p_rszLine.at(0) != '<') {
		if (!m_bTryToIgnoreErrors)
			return false;

		// Resynchronise on the next '<'; without one there is nothing to parse
		const size_t iTempPos = p_rszLine.find_first_of('<');
		if (iTempPos == wstring::npos)
			return false;

		p_rszLine = p_rszLine.substr(iTempPos);
		if (p_rszLine.length() < 2)
			return false;
	}

	unsigned int iPos = 1;

	// skip comments
	if (p_rszLine.at(iPos) == '!') {
		p_rTag.m_bComment = true;

		// "<!-- ... -->" may contain '>' (e.g. commented-out markup), so look
		// for the full terminator first and fall back to the first '>'.
		size_t iEndPos = wstring::npos;
		if (p_rszLine.compare(0, 4, L"<!--") == 0) {
			const size_t iTerminatorPos = p_rszLine.find(L"-->", 2);
			if (iTerminatorPos != wstring::npos)
				iEndPos = iTerminatorPos + 2;
		}
		if (iEndPos == wstring::npos)
			iEndPos = p_rszLine.find(L'>', iPos);

		if (iEndPos == wstring::npos) {
			// Unterminated comment: in lenient mode it swallows the rest of the input
			if (!m_bTryToIgnoreErrors)
				return false;

			p_rTag.m_szName = p_rszLine.substr(iPos);
			p_rszLine.clear();
			return true;
		}

		p_rTag.m_szName = p_rszLine.substr(iPos, iEndPos - iPos);
		p_rszLine = p_rszLine.substr(iEndPos + 1); // Skip >
		return true;
	}

	p_rTag.m_bComment = false;

	if (!SkipSpaces(p_rszLine, iPos))
		return false;

	if (p_rszLine.at(iPos) == '/') {
		p_rTag.m_bOpen = false;
		p_rTag.m_bClose = true;
		++iPos;
	} else {
		p_rTag.m_bOpen = true;
		p_rTag.m_bClose = false;
	}

	if (!GetString(p_rszLine, iPos, p_rTag.m_szName, L"\r\n\t />"))
		return false;

	p_rTag.m_szName = StringToLower(p_rTag.m_szName);

	if (!GetAttributes(p_rszLine, iPos, p_rTag.m_mapAttributes))
		return false;

	// Self-closing tag ("<br/>"): both open and close
	if (iPos < p_rszLine.length() && p_rszLine.at(iPos) == '/') {
		++iPos;
		p_rTag.m_bClose = true;
	}

	if (iPos < p_rszLine.length() && p_rszLine.at(iPos) == '>') {
		p_rszLine = p_rszLine.substr(iPos + 1);
		return true;
	}

	if (!m_bTryToIgnoreErrors)
		return false;

	// Unexpected characters before the end of the tag: skip to the closing '>'
	const size_t iClosePos = p_rszLine.find_first_of('>');
	if (iClosePos == wstring::npos)
		return false;

	p_rszLine = p_rszLine.substr(iClosePos + 1);
	return true;
}

// Extracts the text run at the front of p_rszLine as a text tag
// (m_bText set, the text itself stored in m_szName).
bool CRealTextParser::ExtractTextTag(wstring& p_rszLine, Tag& p_rTag)
{
	p_rTag.m_bText = true;
	return ExtractString(p_rszLine, p_rTag.m_szName);
}

// Moves the text in front of the next '<' from p_rszLine to the end of
// p_rszString; leading whitespace is dropped.
//
// A remainder consisting only of whitespace is consumed and accepted. Text
// that runs to the end of the input without a following tag is accepted only
// when m_bTryToIgnoreErrors is set; otherwise false is returned.
bool CRealTextParser::ExtractString(wstring& p_rszLine, wstring& p_rszString)
{
	if (p_rszLine.length() == 0 || p_rszLine.at(0) == '<') {
		if (m_bTryToIgnoreErrors) {
			p_rszString = L"";
			return true;
		}
		return false;
	}

	unsigned int iPos = 0;

	if (!SkipSpaces(p_rszLine, iPos)) {
		// Only trailing whitespace is left: nothing to extract, but not an error
		p_rszLine.clear();
		return true;
	}

	if (!GetString(p_rszLine, iPos, p_rszString, L"<")) {
		// The text runs to the end of the input (no tag follows)
		if (!m_bTryToIgnoreErrors)
			return false;

		p_rszLine.clear();
		return true;
	}

	p_rszLine = p_rszLine.substr(iPos);
	return true;
}

// Advances p_riPos past any whitespace.
// Returns false if the end of p_rszLine was reached.
bool CRealTextParser::SkipSpaces(wstring& p_rszLine, unsigned int& p_riPos)
{
	const size_t iLength = p_rszLine.length();

	while (p_riPos < iLength && iswspace(p_rszLine.at(p_riPos))) {
		++p_riPos;
	}

	return p_riPos < iLength;
}

// Appends characters from p_rszLine to p_rszString, starting at p_riPos,
// until one of p_crszEndChars is found. p_riPos is left on that character.
// Returns false if the end of p_rszLine was reached first.
bool CRealTextParser::GetString(wstring& p_rszLine, unsigned int& p_riPos, wstring& p_rszString, const wstring& p_crszEndChars)
{
	const size_t iLength = p_rszLine.length();

	while (p_riPos < iLength && p_crszEndChars.find(p_rszLine.at(p_riPos)) == wstring::npos) {
		p_rszString += p_rszLine.at(p_riPos);
		++p_riPos;
	}

	return p_riPos < iLength;
}

// Parses name=value pairs starting at p_riPos into p_rmapAttributes
// (names are lower-cased). Values may be unquoted or quoted with ' or ".
// On success p_riPos is left on the '/' or '>' that ends the tag.
//
// Returns false if the input ends inside the tag. An attribute without
// "=value" is stored with an empty value when m_bTryToIgnoreErrors is set
// and is an error otherwise.
bool CRealTextParser::GetAttributes(wstring& p_rszLine, unsigned int& p_riPos, map<wstring, wstring>& p_rmapAttributes)
{
	if (!SkipSpaces(p_rszLine, p_riPos))
		return false;

	while (p_riPos < p_rszLine.length() && p_rszLine.at(p_riPos) != '/' && p_rszLine.at(p_riPos) != '>') {
		// The name also ends at '/' and '>' so that it can never run past the end of the tag
		wstring szName;
		if (!GetString(p_rszLine, p_riPos, szName, L"\r\n\t =/>"))
			return false;

		if (!SkipSpaces(p_rszLine, p_riPos))
			return false;

		if (p_rszLine.at(p_riPos) != '=') {
			if (!m_bTryToIgnoreErrors)
				return false;

			// Attribute without a value
			p_rmapAttributes[StringToLower(szName)] = L"";
			continue;
		}

		++p_riPos;

		if (!SkipSpaces(p_rszLine, p_riPos))
			return false;

		bool bUsesQuotes(false);
		const wchar_t cQuote = p_rszLine.at(p_riPos);
		if (cQuote == '\'' || cQuote == '\"') {
			++p_riPos;
			bUsesQuotes = true;
		}

		if (!SkipSpaces(p_rszLine, p_riPos))
			return false;

		wstring szValue;
		if (bUsesQuotes) {
			// Prefer the matching closing quote so that values such as "a/b" stay intact;
			// if it is missing before the end of the tag, stop at any quote, '/' or '>'.
			const size_t iQuoteEnd = p_rszLine.find(cQuote, p_riPos);
			const size_t iTagEnd = p_rszLine.find(L'>', p_riPos);
			if (iQuoteEnd != wstring::npos && (iTagEnd == wstring::npos || iQuoteEnd < iTagEnd)) {
				szValue = p_rszLine.substr(p_riPos, iQuoteEnd - p_riPos);
				p_riPos = static_cast<unsigned int>(iQuoteEnd); // the quote itself is consumed below
			} else if (!GetString(p_rszLine, p_riPos, szValue, L"\"\'/>")) {
				return false;
			}
		} else {
			if (!GetString(p_rszLine, p_riPos, szValue, L" \t\r\n/>"))
				return false;
		}

		p_rmapAttributes[StringToLower(szName)] = szValue;

		if (!SkipSpaces(p_rszLine, p_riPos))
			return false;

		// Closing quote
		if (p_rszLine.at(p_riPos) == '\'' || p_rszLine.at(p_riPos) == '\"')
			++p_riPos;

		if (!SkipSpaces(p_rszLine, p_riPos))
			return false;
	}

	return p_rszLine.length() > p_riPos;
}

// Converts a timecode of the form [[[dd:]hh:]mm:]ss[.xyz] to milliseconds.
//
// Fields are read from right to left. The right-most field is a fraction of
// a second only if the timecode contains a '.'; otherwise it is whole
// seconds ("90" is 90000, "1:02" is 62000). An empty string yields 0.
int CRealTextParser::GetTimecode(const wstring& p_crszTimecode)
{
	int iTimecode(0);
	size_t iUnit = (p_crszTimecode.find(L'.') == wstring::npos) ? 1 : 0;

	wstring szCurrentPart;

	for (size_t i = p_crszTimecode.length(); i-- > 0;) {
		const wchar_t cCurrent = p_crszTimecode.at(i);

		if (cCurrent == L'.' || cCurrent == L':') {
			iTimecode += TimecodeFieldToMillisecs(szCurrentPart, iUnit);

			// Extra fields beyond days are all counted as days
			if (iUnit < g_iLastTimecodeUnit)
				++iUnit;

			szCurrentPart.clear();
		} else {
			szCurrentPart.insert(szCurrentPart.begin(), cCurrent);
		}
	}

	iTimecode += TimecodeFieldToMillisecs(szCurrentPart, iUnit);

	return iTimecode;
}

// Formats a timecode given in milliseconds as hours, minutes, seconds and a
// fraction of a second.
//
// iMillisecondPrecision is the number of fractional digits (clamped to 0..3);
// with 0 the fraction and its separator are omitted. When p_bPadZeroes is set,
// hours, minutes and seconds are padded to two digits and the fraction to
// iMillisecondPrecision digits (e.g. 1005 -> "00:00:01.005").
wstring CRealTextParser::FormatTimecode(int iTimecode, int iMillisecondPrecision/* = 3*/, bool p_bPadZeroes/* = true*/, const wstring& p_crszSeparator/* = ":"*/, const wstring& p_crszMillisecondSeparator/* = "."*/)
{
	const int iHours = iTimecode / 1000 / 60 / 60;
	const int iMinutes = (iTimecode / 1000 / 60) % 60;
	const int iSeconds = (iTimecode / 1000) % 60;

	int iPrecision = iMillisecondPrecision;
	if (iPrecision < 0)
		iPrecision = 0;
	if (iPrecision > 3)
		iPrecision = 3;

	// Drop one decimal digit for every digit of precision that was not requested
	int iFraction = iTimecode % 1000;
	for (int i = iPrecision; i < 3; ++i)
		iFraction /= 10;

	// The field width is reset by every insertion, so it is set before each number
	const int iFieldWidth = p_bPadZeroes ? 2 : 0;

	wostringstream ossTimecode;
	ossTimecode.fill(L'0');

	ossTimecode.width(iFieldWidth);
	ossTimecode << iHours;

	ossTimecode << p_crszSeparator;
	ossTimecode.width(iFieldWidth);
	ossTimecode << iMinutes;

	ossTimecode << p_crszSeparator;
	ossTimecode.width(iFieldWidth);
	ossTimecode << iSeconds;

	if (iPrecision > 0) {
		ossTimecode << p_crszMillisecondSeparator;
		ossTimecode.width(p_bPadZeroes ? iPrecision : 0);
		ossTimecode << iFraction;
	}

	return ossTimecode.str();
}

// Returns a copy of p_crszString with every character converted by towlower().
wstring CRealTextParser::StringToLower(const wstring& p_crszString)
{
	wstring szLowercaseString;
	szLowercaseString.reserve(p_crszString.length());

	for (wstring::const_iterator iter = p_crszString.begin(); iter != p_crszString.end(); ++iter) {
		szLowercaseString += static_cast<wchar_t>(towlower(*iter));
	}

	return szLowercaseString;
}

// Renders a list of tags as one subtitle line with <b>, <i> and <font> markup.
//
// The m_bIgnoreFont* options suppress the corresponding markup, and font sizes
// outside m_iMinFontSize..m_iMaxFontSize are dropped. Tags with other names
// are ignored. Returns an empty string if the tags contain no text at all.
wstring CRealTextParser::RenderTags(const list<Tag>& p_crlTags)
{
	bool bEmpty(true);
	wstring szString;

	for (list<Tag>::const_iterator iter = p_crlTags.begin(); iter != p_crlTags.end(); ++iter) {
		const Tag& oTag = *iter;

		// Text is tested first: for text tags m_szName holds the text itself,
		// which must not be mistaken for a tag name such as "b" or "br".
		if (oTag.m_bText) {
			szString += oTag.m_szName;

			if (!oTag.m_szName.empty())
				bEmpty = false;
		} else if (oTag.m_szName == L"br") {
			szString += L"\n";
		} else if (oTag.m_szName == L"b") {
			if (!m_bIgnoreFontWeight) {
				if (oTag.m_bOpen) {
					szString += L"<b>";
				} else if (oTag.m_bClose) {
					szString += L"</b>";
				}
			}
		} else if (oTag.m_szName == L"i") {
			if (!m_bIgnoreFontWeight) {
				if (oTag.m_bOpen) {
					szString += L"<i>";
				} else if (oTag.m_bClose) {
					szString += L"</i>";
				}
			}
		} else if (oTag.m_szName == L"font") {
			if (!m_bIgnoreFont) {
				if (oTag.m_bOpen) {
					szString += L"<font";
					for (map<wstring, wstring>::const_iterator i = oTag.m_mapAttributes.begin(); i != oTag.m_mapAttributes.end(); ++i) {
						if (m_bIgnoreFontSize && i->first == L"size")
							continue;

						if (m_bIgnoreFontColor && i->first == L"color")
							continue;

						if (m_bIgnoreFontFace && i->first == L"face")
							continue;

						// Numeric sizes outside the allowed range are dropped
						if (i->first == L"size" && i->second.length() > 0 && ::iswdigit(i->second.at(0))) {
							int iSize = ::_wtoi(i->second.c_str());

							if (iSize > 0 && iSize < m_iMinFontSize)
								continue;

							if (iSize > m_iMaxFontSize)
								continue;
						}

						szString += L" ";
						szString += i->first;
						szString += L"=\"";
						szString += i->second;
						szString += L"\"";
					}
					szString += L">";
				}

				if (oTag.m_bClose) {
					szString += L"</font>";
				}
			}
		} else {
			// AfxMessageBox(CString(_T("Unknown RealText-tag: ")) + oTag.m_szName.c_str());
		}
	}

	if (bEmpty)
		return L"";
	else
		return szString;
}

// Writes the parsed subtitles to p_rOutput in SubRip (.srt) layout:
// counter, "start --> end" with ',' before the milliseconds, text, blank line.
// Returns false if the stream reports a failure.
bool CRealTextParser::OutputSRT(wostream& p_rOutput)
{
	int iCounter(1);

	for (map<pair<int, int>, wstring>::const_iterator i = m_RealText.m_mapLines.begin(); i != m_RealText.m_mapLines.end(); ++i) {
		p_rOutput << iCounter++ << L'\n';

		p_rOutput << FormatTimecode(i->first.first, 3, true, L":", L",");
		p_rOutput << L" --> ";
		p_rOutput << FormatTimecode(i->first.second, 3, true, L":", L",");
		p_rOutput << L'\n';

		p_rOutput << i->second << L'\n';
		p_rOutput << L'\n';
	}

	return !p_rOutput.fail();
}

// Removes the last tag named p_crszTagName from p_rlistTags, if there is one.
void CRealTextParser::PopTag(list<Tag>& p_rlistTags, const wstring& p_crszTagName)
{
	list<Tag>::iterator iter = p_rlistTags.end();

	while (iter != p_rlistTags.begin()) {
		--iter;

		if (iter->m_szName == p_crszTagName) {
			p_rlistTags.erase(iter);
			return;
		}
	}
}

// Removes an opening <font> tag that is immediately followed by another
// opening <font> tag.
void CRealTextParser::FilterReduntantTags(list<Tag>& p_rlistTags)
{
	list<Tag>::iterator iterCurrent = p_rlistTags.begin();
	if (iterCurrent == p_rlistTags.end())
		return;

	list<Tag>::iterator iterPrev = iterCurrent;

	for (++iterCurrent; iterCurrent != p_rlistTags.end(); ++iterCurrent) {
		if (iterPrev->m_szName == L"font" && iterCurrent->m_szName == L"font" &&
			iterPrev->m_bOpen && iterCurrent->m_bOpen) {
			// Erasing a std::list element leaves iterCurrent valid
			p_rlistTags.erase(iterPrev);
		}

		iterPrev = iterCurrent;
	}
}