// Written by Sergey Sharybin
// Added support for contexts
//
// Based on Python script msgfmt.py from Python source
// code tree, which was written by
// Martin v. Löwis
//
// Generate binary message catalog from textual translation description.
//
// This program converts a textual Uniforum-style message catalog (.po file) into
// a binary GNU catalog (.mo file). This is essentially the same function as the
// GNU msgfmt program, however, it is a simpler implementation.
//
// Usage: msgfmt input.po output.mo

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <vector>

namespace {

// All translations collected while parsing the catalog, keyed by message id
// (prefixed with the context and a 0x04 byte when a context is present).
std::map<std::string, std::string> MESSAGES;

// Return true when `str` begins with `prefix`. An empty prefix always matches.
bool starts_with(const std::string &str, const std::string &prefix)
{
  // compare() clamps the compared range to the end of `str`, so a string which
  // is shorter than the prefix simply compares unequal.
  return str.compare(0, prefix.length(), prefix) == 0;
}

// Return a copy of `str` without leading and trailing spaces, tabs, carriage
// returns and line feeds. A string made only of those characters gives "".
std::string trim(const std::string &str)
{
  static const char *const whitespace = " \t\r\n";
  const size_t first = str.find_first_not_of(whitespace);
  if (first == std::string::npos) {
    return std::string();
  }
  const size_t last = str.find_last_not_of(whitespace);
  return str.substr(first, last - first + 1);
}

// Decode one string token of a .po file.
//
// When the text is enclosed in double quotes they are removed, then backslash
// escapes are decoded: the C single-character escapes (\a \b \f \n \r \t \v)
// map to their control character, and any other escaped character (such as
// \\ or \") stands for itself. A lone backslash at the very end is kept as is.
std::string unescape(const std::string &str)
{
  size_t begin = 0;
  size_t end = str.length();
  // Strip the delimiting quotes first, so they never take part in an escape
  // sequence and no index arithmetic can wrap around on short input.
  if (end >= 2 && str[0] == '"' && str[end - 1] == '"') {
    begin = 1;
    end -= 1;
  }

  std::string result;
  result.reserve(end - begin);
  for (size_t i = begin; i < end; i++) {
    char current_char = str[i];
    if (current_char == '\\' && i + 1 < end) {
      const char next_char = str[++i];
      switch (next_char) {
        case 'a':
          current_char = '\a';
          break;
        case 'b':
          current_char = '\b';
          break;
        case 'f':
          current_char = '\f';
          break;
        case 'n':
          current_char = '\n';
          break;
        case 'r':
          current_char = '\r';
          break;
        case 't':
          current_char = '\t';
          break;
        case 'v':
          current_char = '\v';
          break;
        default:
          // Covers \\ and \" as well as any unknown escape.
          current_char = next_char;
          break;
      }
    }
    result += current_char;
  }
  return result;
}

}  // namespace
void add(const std::string &msgctxt, const std::string &msgid, const std::string &msgstr, bool fuzzy) { if (fuzzy == false && msgstr.empty() == false) { if (msgctxt.empty()) { MESSAGES[msgid] = msgstr; } else { MESSAGES[msgctxt + (char)0x04 + msgid] = msgstr; } } } template void get_keys(std::map map, std::vector *keys) { keys->reserve(map.size()); for (typename std::map::iterator it = map.begin(); it != map.end(); it++) { keys->push_back(it->first); } } std::string intToBytes(int value) { std::string result; for (unsigned int i = 0; i < sizeof(value); i++) { result += (unsigned char) ((value >> (i * 8)) & 0xff); } return result; } typedef enum { SECTION_NONE = 0, SECTION_CTX = 1, SECTION_ID = 2, SECTION_STR = 3 } eSectionType; struct Offset { unsigned int o1, l1, o2, l2; }; // Return the generated output. std::string generate(void) { // The keys are sorted in the .mo file std::vector keys; // Get list of sorted keys. get_keys(MESSAGES, &keys); std::sort(keys.begin(), keys.end()); std::vector offsets; offsets.reserve(keys.size()); std::string ids = "", strs = ""; for (std::vector::iterator it = keys.begin(); it != keys.end(); it++) { std::string &id = *it; // For each string, we need size and file offset. Each string is NUL // terminated; the NUL does not count into the size. Offset offset = {(unsigned int) ids.size(), (unsigned int) id.size(), (unsigned int) strs.size(), (unsigned int) MESSAGES[id].size()}; offsets.push_back(offset); ids += id + '\0'; strs += MESSAGES[id] + '\0'; } // The header is 7 32-bit unsigned integers. We don't use hash tables, so // the keys start right after the index tables. // translated string. int keystart = 7 * 4 + 16 * keys.size(); // and the values start after the keys int valuestart = keystart + ids.size(); std::vector koffsets; std::vector voffsets; koffsets.reserve(offsets.size() * 2); voffsets.reserve(offsets.size() * 2); // The string table first has the list of keys, then the list of values. 
// Each entry has first the size of the string, then the file offset. for (std::vector::iterator it = offsets.begin(); it != offsets.end(); it++) { Offset &offset = *it; koffsets.push_back(offset.l1); koffsets.push_back(offset.o1 + keystart); voffsets.push_back(offset.l2); voffsets.push_back(offset.o2 + valuestart); } std::vector all_offsets; all_offsets.reserve(koffsets.size() + voffsets.size()); all_offsets.insert(all_offsets.end(), koffsets.begin(), koffsets.end()); all_offsets.insert(all_offsets.end(), voffsets.begin(), voffsets.end()); std::string output = ""; output += intToBytes(0x950412de); // Magic output += intToBytes(0x0); // Version output += intToBytes(keys.size()); // # of entries output += intToBytes(7 * 4); // start of key index output += intToBytes(7 * 4 + keys.size() * 8); // start of value index output += intToBytes(0); // Size of hash table output += intToBytes(0); // Offset of hash table for (std::vector::iterator it = all_offsets.begin(); it != all_offsets.end(); it++) { int offset = *it; output += intToBytes(offset); } output += ids; output += strs; return output; } void make(const char *input_file_name, const char *output_file_name) { std::map messages; // Start off assuming Latin-1, so everything decodes without failure, // until we know the exact encoding. // TODO(sergey): Support encoding. // const char *encoding = "latin-1"; eSectionType section = SECTION_NONE; bool fuzzy = false; bool is_plural = false; std::string msgctxt, msgid, msgstr; std::ifstream input_file_stream(input_file_name); // Parse the catalog. int lno = 0; for (std::string l; getline(input_file_stream, l); ) { lno++; // If we get a comment line after a msgstr, this is a new entry. if (l[0] == '#' && section == SECTION_STR) { add(msgctxt, msgid, msgstr, fuzzy); section = SECTION_NONE; msgctxt = ""; fuzzy = false; } // Record a fuzzy mark. 
if (starts_with(l, "#,") && l.find("fuzzy") != std::string::npos) { fuzzy = true; } // Skip comments if (l[0] == '#') { continue; } // Now we are in a msgid section, output previous section. if (starts_with(l, "msgctxt")) { if (section == SECTION_STR) { add(msgctxt, msgid, msgstr, fuzzy); } section = SECTION_CTX; l = l.substr(7, l.size() - 7); msgctxt = msgid = msgstr = ""; } else if (starts_with(l, "msgid") && !starts_with(l, "msgid_plural")) { if (section == SECTION_STR) { add(msgctxt, msgid, msgstr, fuzzy); msgctxt = ""; if (msgid == "") { #if 0 // See whether there is an encoding declaration. p = HeaderParser(); charset = p.parsestr(msgstr.decode(encoding)).get_content_charset(); if (charset) { encoding = charset; } #else // Not ported to C++ yet. std::cerr << "Encoding declarations are not supported yet.\n" << std::endl; abort(); #endif } } section = SECTION_ID; l = l.substr(5, l.size() - 5); msgid = msgstr = ""; is_plural = false; } else if (starts_with(l, "msgid_plural")) { // This is a message with plural forms. 
if (section != SECTION_ID) { std::cerr << "msgid_plural not preceeded by msgid on" << input_file_name << ":" << lno << std::endl; abort(); } l = l.substr(12, l.size() - 12); msgid += '\0'; // separator of singular and plural is_plural = true; } else if (starts_with(l, "msgstr")) { // Now we are in a msgstr section section = SECTION_STR; if (starts_with(l, "msgstr[")) { if (is_plural == false) { std::cerr << "plural without msgid_plural on " << input_file_name << ":" << lno << std::endl; abort(); } int bracket_position = l.find(']'); if (bracket_position == std::string::npos) { std::cerr << "Syntax error on " << input_file_name << ":" << lno << std::endl; abort(); } l = l.substr(bracket_position, l.size() - bracket_position); if (msgstr != "") { msgstr += '\0'; // Separator of the various plural forms; } } else { if (is_plural) { std::cerr << "indexed msgstr required for plural on " << input_file_name << ":" << lno << std::endl; abort(); } l = l.substr(6, l.size() - 6); } } // Skip empty lines. l = trim(l); if (l.empty()) { if (section == SECTION_STR) { add(msgctxt, msgid, msgstr, fuzzy); msgctxt = msgid = msgstr = ""; section = SECTION_NONE; fuzzy = false; } continue; } l = unescape(l); if (section == SECTION_CTX) { // TODO(sergey): Support encoding. // msgid += l.encode(encoding); msgctxt += l; } else if (section == SECTION_ID) { // TODO(sergey): Support encoding. // msgid += l.encode(encoding); msgid += l; } else if (section == SECTION_STR) { // TODO(sergey): Support encoding. 
// msgstr += l.encode(encoding) msgstr += l; } else { std::cerr << "Syntax error on " << input_file_name << ":" << lno << std::endl; abort(); } // Add last entry if (section == SECTION_STR) { add(msgctxt, msgid, msgstr, fuzzy); } } // Compute output std::string output = generate(); std::ofstream output_file_stream(output_file_name, std::ios::out | std::ios::binary); output_file_stream << output; } } // namespace int main(int argc, char **argv) { if (argc != 3) { printf("Usage: %s \n", argv[0]); return EXIT_FAILURE; } const char *input_file = argv[1]; const char *output_file = argv[2]; make(input_file, output_file); return EXIT_SUCCESS; }