// Written by Sergey Sharybin
// Added support for contexts
//
// Based on Python script msgfmt.py from Python source
// code tree, which was written by
// Martin v. Löwis
//
// Generate binary message catalog from textual translation description.
//
// This program converts a textual Uniforum-style message catalog (.po file) into
// a binary GNU catalog (.mo file). This is essentially the same function as the
// GNU msgfmt program, however, it is a simpler implementation.
//
// Usage: msgfmt input.po output.mo

#include <algorithm>
#include <cctype>
#include <cstdio>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <stdlib.h>
#include <string>
#include <vector>

namespace {

// All non-fuzzy translations collected so far. The key is the msgid, or
// "msgctxt\x04msgid" when the entry has a context; the value is the msgstr.
std::map<std::string, std::string> MESSAGES;

// Return true when `string` begins with `prefix`.
bool starts_with(const std::string &string, const std::string &prefix)
{
  return prefix.size() <= string.size() && string.compare(0, prefix.size(), prefix) == 0;
}

// Predicate for the trimming helpers.
// The cast to unsigned char is required: passing a negative char value (any
// non-ASCII byte of an UTF-8 catalog) to std::isspace() is undefined behavior.
bool is_not_space(char c)
{
  return !std::isspace(static_cast<unsigned char>(c));
}

// Return a copy of `s` with leading whitespace removed.
std::string ltrim(const std::string &s)
{
  std::string result = s;
  result.erase(result.begin(), std::find_if(result.begin(), result.end(), is_not_space));
  return result;
}

// Return a copy of `s` with trailing whitespace removed.
std::string rtrim(const std::string &s)
{
  std::string result = s;
  result.erase(std::find_if(result.rbegin(), result.rend(), is_not_space).base(), result.end());
  return result;
}

// Return a copy of `s` with both leading and trailing whitespace removed.
std::string trim(const std::string &s)
{
  return ltrim(rtrim(s));
}

// Resolve backslash escapes in `s` and strip one pair of enclosing double
// quotes, if present.
//
// Recognized escapes are \\, \n and \t; for any other escaped character the
// backslash is dropped and the character is kept. A trailing lone backslash
// is kept as-is.
std::string unescape(const std::string &s)
{
  std::string result;
  result.reserve(s.size());
  std::string::const_iterator it = s.begin();
  while (it != s.end()) {
    char current_char = *it++;
    if (current_char == '\\' && it != s.end()) {
      const char next_char = *it++;
      if (next_char == 'n') {
        current_char = '\n';
      }
      else if (next_char == 't') {
        current_char = '\t';
      }
      else {
        // Covers the escaped backslash as well as any unknown escape.
        current_char = next_char;
      }
    }
    result += current_char;
  }
  // Guard against the empty string before looking at the last character.
  const std::string::size_type size = result.size();
  if (size > 0 && result[0] == '"' && result[size - 1] == '"') {
    result = (size >= 2) ? result.substr(1, size - 2) : std::string();
  }
  return result;
}

// Add a non-fuzzy translation to the dictionary.
// Fuzzy entries and entries with an empty translation are ignored.
void add(const std::string &msgctxt,
         const std::string &msgid,
         const std::string &msgstr,
         bool fuzzy)
{
  if (fuzzy || msgstr.empty()) {
    return;
  }
  if (msgctxt.empty()) {
    MESSAGES[msgid] = msgstr;
  }
  else {
    // Context and id are joined with the EOT (0x04) byte.
    MESSAGES[msgctxt + static_cast<char>(0x04) + msgid] = msgstr;
  }
}

// Serialize the low 32 bits of `value` as 4 bytes, least significant first.
std::string intToBytes(unsigned int value)
{
  std::string result;
  for (unsigned int i = 0; i < 4; i++) {
    result += static_cast<char>((value >> (i * 8)) & 0xff);
  }
  return result;
}

// Section of a catalog entry the parser is currently reading.
typedef enum {
  SECTION_NONE = 0,
  SECTION_CTX = 1,
  SECTION_ID = 2,
  SECTION_STR = 3
} eSectionType;

// Location of one key/value pair inside the key and value blobs:
// o1/l1 are offset and length of the key, o2/l2 those of the value.
struct Offset {
  unsigned int o1, l1, o2, l2;
};

// Return the generated output: the binary catalog built from MESSAGES.
std::string generate(void)
{
  // The keys are sorted in the .mo file. std::map already iterates in
  // ascending key order, so no separate sorting pass is needed.
  std::vector<Offset> offsets;
  offsets.reserve(MESSAGES.size());
  std::string ids, strs;
  for (std::map<std::string, std::string>::const_iterator it = MESSAGES.begin();
       it != MESSAGES.end();
       ++it)
  {
    const std::string &id = it->first;
    const std::string &str = it->second;
    // For each string, we need size and file offset. Each string is NUL
    // terminated; the NUL does not count into the size.
    const Offset offset = {static_cast<unsigned int>(ids.size()),
                           static_cast<unsigned int>(id.size()),
                           static_cast<unsigned int>(strs.size()),
                           static_cast<unsigned int>(str.size())};
    offsets.push_back(offset);
    ids += id;
    ids += '\0';
    strs += str;
    strs += '\0';
  }

  const unsigned int num_entries = static_cast<unsigned int>(offsets.size());
  // The header is 7 32-bit unsigned integers. We don't use hash tables, so
  // the keys start right after the index tables, which take 16 bytes per
  // entry (length and offset of the key, length and offset of the value).
  const unsigned int keystart = 7 * 4 + 16 * num_entries;
  // And the values start after the keys.
  const unsigned int valuestart = keystart + static_cast<unsigned int>(ids.size());

  std::string output;
  output += intToBytes(0x950412de);               // Magic
  output += intToBytes(0x0);                      // Version
  output += intToBytes(num_entries);              // # of entries
  output += intToBytes(7 * 4);                    // start of key index
  output += intToBytes(7 * 4 + num_entries * 8);  // start of value index
  output += intToBytes(0);                        // Size of hash table
  output += intToBytes(0);                        // Offset of hash table

  // The string table first has the list of keys, then the list of values.
  // Each entry has first the size of the string, then the file offset.
  for (std::vector<Offset>::const_iterator it = offsets.begin(); it != offsets.end(); ++it) {
    output += intToBytes(it->l1);
    output += intToBytes(it->o1 + keystart);
  }
  for (std::vector<Offset>::const_iterator it = offsets.begin(); it != offsets.end(); ++it) {
    output += intToBytes(it->l2);
    output += intToBytes(it->o2 + valuestart);
  }

  output += ids;
  output += strs;
  return output;
}

// Parse the textual catalog `input_file_name` and write the binary catalog to
// `output_file_name`.
//
// Returns false when either file can not be opened or the output can not be
// written. Syntax errors in the input are reported on stderr and abort the
// program.
bool make(const char *input_file_name, const char *output_file_name)
{
  // TODO(sergey): Support encoding. Strings are currently passed through
  // byte-for-byte.

  eSectionType section = SECTION_NONE;
  bool fuzzy = false;
  bool is_plural = false;
  std::string msgctxt, msgid, msgstr;

  std::ifstream input_file_stream(input_file_name);
  if (!input_file_stream.is_open()) {
    std::cerr << "Unable to open input file " << input_file_name << std::endl;
    return false;
  }

  // Parse the catalog.
  int lno = 0;
  for (std::string l; std::getline(input_file_stream, l);) {
    lno++;
    const bool is_comment = !l.empty() && l[0] == '#';
    // If we get a comment line after a msgstr, this is a new entry.
    if (is_comment && section == SECTION_STR) {
      add(msgctxt, msgid, msgstr, fuzzy);
      section = SECTION_NONE;
      msgctxt = "";
      fuzzy = false;
    }
    // Record a fuzzy mark.
    if (starts_with(l, "#,") && l.find("fuzzy") != std::string::npos) {
      fuzzy = true;
    }
    // Skip comments.
    if (is_comment) {
      continue;
    }
    if (starts_with(l, "msgctxt")) {
      // Now we are in a msgctxt section, output previous section.
      if (section == SECTION_STR) {
        add(msgctxt, msgid, msgstr, fuzzy);
      }
      section = SECTION_CTX;
      l = l.substr(7);
      msgctxt = msgid = msgstr = "";
    }
    else if (starts_with(l, "msgid") && !starts_with(l, "msgid_plural")) {
      // Now we are in a msgid section, output previous section.
      if (section == SECTION_STR) {
        add(msgctxt, msgid, msgstr, fuzzy);
        msgctxt = "";
        if (msgid == "") {
          // The entry with the empty msgid is the catalog header, which may
          // carry an encoding declaration. Parsing it is not ported to C++
          // yet.
          // NOTE(review): this only triggers when the header is directly
          // followed by a msgid line without a comment in between; confirm
          // whether aborting is still the desired behavior.
          std::cerr << "Encoding declarations are not supported yet.\n" << std::endl;
          abort();
        }
      }
      section = SECTION_ID;
      l = l.substr(5);
      msgid = msgstr = "";
      is_plural = false;
    }
    else if (starts_with(l, "msgid_plural")) {
      // This is a message with plural forms.
      if (section != SECTION_ID) {
        std::cerr << "msgid_plural not preceded by msgid on " << input_file_name << ":" << lno
                  << std::endl;
        abort();
      }
      l = l.substr(12);
      msgid += '\0';  // Separator of singular and plural.
      is_plural = true;
    }
    else if (starts_with(l, "msgstr")) {
      // Now we are in a msgstr section.
      section = SECTION_STR;
      if (starts_with(l, "msgstr[")) {
        if (!is_plural) {
          std::cerr << "plural without msgid_plural on " << input_file_name << ":" << lno
                    << std::endl;
          abort();
        }
        const std::string::size_type bracket_position = l.find(']');
        if (bracket_position == std::string::npos) {
          std::cerr << "Syntax error on " << input_file_name << ":" << lno << std::endl;
          abort();
        }
        // Continue after the closing bracket: keeping the bracket would
        // prevent the quotes of the string from being stripped.
        l = l.substr(bracket_position + 1);
        if (!msgstr.empty()) {
          msgstr += '\0';  // Separator of the various plural forms.
        }
      }
      else {
        if (is_plural) {
          std::cerr << "indexed msgstr required for plural on " << input_file_name << ":" << lno
                    << std::endl;
          abort();
        }
        l = l.substr(6);
      }
    }
    // Skip empty lines.
    l = trim(l);
    if (l.empty()) {
      continue;
    }
    l = unescape(l);
    if (section == SECTION_CTX) {
      msgctxt += l;
    }
    else if (section == SECTION_ID) {
      msgid += l;
    }
    else if (section == SECTION_STR) {
      msgstr += l;
    }
    else {
      std::cerr << "Syntax error on " << input_file_name << ":" << lno << std::endl;
      abort();
    }
  }
  // Add last entry.
  if (section == SECTION_STR) {
    add(msgctxt, msgid, msgstr, fuzzy);
  }

  // Compute output.
  const std::string output = generate();

  std::ofstream output_file_stream(output_file_name, std::ios::out | std::ios::binary);
  if (!output_file_stream.is_open()) {
    std::cerr << "Unable to open output file " << output_file_name << std::endl;
    return false;
  }
  output_file_stream.write(output.data(), static_cast<std::streamsize>(output.size()));
  output_file_stream.flush();
  if (!output_file_stream.good()) {
    std::cerr << "Error writing output file " << output_file_name << std::endl;
    return false;
  }
  return true;
}

}  // namespace

int main(int argc, char **argv)
{
  if (argc != 3) {
    printf("Usage: %s <input.po> <output.mo>\n", argv[0]);
    return EXIT_FAILURE;
  }
  const char *input_file = argv[1];
  const char *output_file = argv[2];

  return make(input_file, output_file) ? EXIT_SUCCESS : EXIT_FAILURE;
}