diff options
Diffstat (limited to 'intern/locale/msgfmt.cc')
-rw-r--r-- | intern/locale/msgfmt.cc | 366 |
1 files changed, 366 insertions, 0 deletions
// Written by Sergey Sharybin <sergey.vfx@gmail.com>
// Added support for contexts
//
// Based on Python script msgfmt.py from Python source
// code tree, which was written by
// Martin v. Löwis <loewis@informatik.hu-berlin.de>
//
// Generate binary message catalog from textual translation description.
//
// This program converts a textual Uniforum-style message catalog (.po file) into
// a binary GNU catalog (.mo file). This is essentially the same function as the
// GNU msgfmt program, however, it is a simpler implementation.
//
// Usage: msgfmt input.po output.mo

#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <map>
#include <stdlib.h>
#include <string>
#include <vector>

namespace {

// Translations collected while parsing. The key is the msgid, prefixed with
// "<msgctxt>\x04" when the entry has a context; the value is the msgstr.
// std::map keeps the keys ordered, which generate() relies on.
std::map<std::string, std::string> MESSAGES;

// Return true when `string` begins with `prefix`.
bool starts_with(const std::string &string,
                 const std::string &prefix) {
  return prefix.size() <= string.size() &&
         string.compare(0, prefix.size(), prefix) == 0;
}

// Predicate for the trim helpers. The cast to unsigned char is required:
// passing a negative char value to isspace() is undefined behavior.
bool is_not_space(char c) {
  return std::isspace(static_cast<unsigned char>(c)) == 0;
}

// Return a copy of `s` without leading whitespace.
std::string ltrim(const std::string &s) {
  return std::string(std::find_if(s.begin(), s.end(), is_not_space),
                     s.end());
}

// Return a copy of `s` without trailing whitespace.
std::string rtrim(const std::string &s) {
  return std::string(s.begin(),
                     std::find_if(s.rbegin(), s.rend(), is_not_space).base());
}

// Return a copy of `s` without leading and trailing whitespace.
std::string trim(const std::string &s) {
  return ltrim(rtrim(s));
}

// Convert one quoted .po string fragment into its raw value.
//
// A pair of surrounding double quotes is removed first, then backslash
// escapes are decoded. Removing the quotes before decoding keeps an escaped
// quote (\") from being mistaken for a delimiter. Unknown escapes decode to
// the escaped character itself; a trailing lone backslash is kept as is.
// Octal and hexadecimal escapes are not interpreted.
std::string unescape(const std::string &s) {
  std::string::size_type begin = 0;
  std::string::size_type end = s.size();
  if (end >= 2 && s[0] == '"' && s[end - 1] == '"') {
    ++begin;
    --end;
  }

  std::string result;
  result.reserve(end - begin);
  for (std::string::size_type i = begin; i < end; ++i) {
    char current_char = s[i];
    if (current_char == '\\' && i + 1 < end) {
      const char next_char = s[++i];
      switch (next_char) {
        case 'n': current_char = '\n'; break;
        case 't': current_char = '\t'; break;
        case 'r': current_char = '\r'; break;
        case 'a': current_char = '\a'; break;
        case 'b': current_char = '\b'; break;
        case 'f': current_char = '\f'; break;
        case 'v': current_char = '\v'; break;
        default:  current_char = next_char; break;  // Covers \\ and \".
      }
    }
    result += current_char;
  }
  return result;
}

// Add a non-fuzzy translation to the dictionary.
//
// Fuzzy entries and entries with an empty msgstr are ignored. When a context
// is given, the key is "<msgctxt>\x04<msgid>".
void add(const std::string &msgctxt,
         const std::string &msgid,
         const std::string &msgstr,
         bool fuzzy) {
  if (fuzzy || msgstr.empty()) {
    return;
  }
  if (msgctxt.empty()) {
    MESSAGES[msgid] = msgstr;
  } else {
    MESSAGES[msgctxt + '\x04' + msgid] = msgstr;
  }
}

// Serialize `value` as 4 bytes, least significant byte first.
std::string intToBytes(unsigned int value) {
  std::string result;
  for (unsigned int i = 0; i < 4; i++) {
    result += static_cast<char>(
        static_cast<unsigned char>((value >> (i * 8)) & 0xff));
  }
  return result;
}

typedef enum {
  SECTION_NONE = 0,
  SECTION_CTX = 1,
  SECTION_ID = 2,
  SECTION_STR = 3
} eSectionType;

// Position and length of one key (o1, l1) and of its translation (o2, l2),
// relative to the start of the key and value string blocks respectively.
struct Offset {
  unsigned int o1, l1, o2, l2;
};

// Return the generated output: the binary catalog built from MESSAGES.
std::string generate(void) {
  const unsigned int num_entries = static_cast<unsigned int>(MESSAGES.size());

  // The keys are sorted in the .mo file; iterating the map visits them in
  // ascending key order already.
  std::vector<Offset> offsets;
  offsets.reserve(MESSAGES.size());
  std::string ids, strs;
  for (std::map<std::string, std::string>::const_iterator it = MESSAGES.begin();
       it != MESSAGES.end();
       ++it) {
    const std::string &id = it->first;
    const std::string &str = it->second;
    // For each string, we need size and file offset. Each string is NUL
    // terminated; the NUL does not count into the size.
    Offset offset = {static_cast<unsigned int>(ids.size()),
                     static_cast<unsigned int>(id.size()),
                     static_cast<unsigned int>(strs.size()),
                     static_cast<unsigned int>(str.size())};
    offsets.push_back(offset);
    ids += id;
    ids += '\0';
    strs += str;
    strs += '\0';
  }

  // The header is 7 32-bit unsigned integers. We don't use hash tables, so
  // the keys start right after the two index tables (8 bytes per entry each).
  const unsigned int header_size = 7 * 4;
  const unsigned int keystart = header_size + 16 * num_entries;
  // And the values start after the keys.
  const unsigned int valuestart =
      keystart + static_cast<unsigned int>(ids.size());

  std::string output;
  output += intToBytes(0x950412de);                     // Magic
  output += intToBytes(0x0);                            // Version
  output += intToBytes(num_entries);                    // # of entries
  output += intToBytes(header_size);                    // start of key index
  output += intToBytes(header_size + num_entries * 8);  // start of value index
  output += intToBytes(0);                              // Size of hash table
  output += intToBytes(0);                              // Offset of hash table

  // The string table first has the list of keys, then the list of values.
  // Each entry has first the size of the string, then the file offset.
  for (std::vector<Offset>::const_iterator it = offsets.begin();
       it != offsets.end();
       ++it) {
    output += intToBytes(it->l1);
    output += intToBytes(it->o1 + keystart);
  }
  for (std::vector<Offset>::const_iterator it = offsets.begin();
       it != offsets.end();
       ++it) {
    output += intToBytes(it->l2);
    output += intToBytes(it->o2 + valuestart);
  }

  output += ids;
  output += strs;

  return output;
}

// Report a parse error as "<message><file>:<line>" and abort the program.
void syntax_error(const char *message,
                  const char *input_file_name,
                  int lno) {
  std::cerr << message << input_file_name << ":" << lno << std::endl;
  abort();
}

// Parse the catalog `input_file_name` and write the binary catalog to
// `output_file_name`.
//
// Syntax errors abort the program; files that can not be opened or written
// terminate it with EXIT_FAILURE.
//
// TODO(sergey): Support encoding. Strings are copied byte for byte, no
// charset conversion is done, so the charset declared in the header entry
// is passed through to the output untouched.
void make(const char *input_file_name,
          const char *output_file_name) {
  MESSAGES.clear();

  eSectionType section = SECTION_NONE;
  bool fuzzy = false;
  bool is_plural = false;
  std::string msgctxt, msgid, msgstr;

  std::ifstream input_file_stream(input_file_name);
  if (!input_file_stream) {
    std::cerr << "Can not open input file " << input_file_name << std::endl;
    exit(EXIT_FAILURE);
  }

  // Parse the catalog.
  int lno = 0;
  for (std::string l; std::getline(input_file_stream, l); ) {
    lno++;
    if (!l.empty() && l[0] == '#') {
      // If we get a comment line after a msgstr, this is a new entry.
      if (section == SECTION_STR) {
        add(msgctxt, msgid, msgstr, fuzzy);
        section = SECTION_NONE;
        fuzzy = false;
      }
      // Record a fuzzy mark.
      if (starts_with(l, "#,") && l.find("fuzzy") != std::string::npos) {
        fuzzy = true;
      }
      // Skip comments.
      continue;
    }
    if (starts_with(l, "msgctxt")) {
      // A new entry starts here, output the previous one.
      if (section == SECTION_STR) {
        add(msgctxt, msgid, msgstr, fuzzy);
        // No comment separated the entries, so a fuzzy mark seen earlier
        // belonged to the entry that was just stored.
        fuzzy = false;
      }
      section = SECTION_CTX;
      l = l.substr(7);
      msgctxt = msgid = msgstr = "";
    } else if (starts_with(l, "msgid") && !starts_with(l, "msgid_plural")) {
      // Now we are in a msgid section, output previous section.
      if (section == SECTION_STR) {
        add(msgctxt, msgid, msgstr, fuzzy);
        fuzzy = false;
        // The context only survives when it directly precedes this msgid.
        msgctxt = "";
      }
      section = SECTION_ID;
      l = l.substr(5);
      msgid = msgstr = "";
      is_plural = false;
    } else if (starts_with(l, "msgid_plural")) {
      // This is a message with plural forms.
      if (section != SECTION_ID) {
        syntax_error("msgid_plural not preceded by msgid on ",
                     input_file_name, lno);
      }
      l = l.substr(12);
      msgid += '\0';  // Separator of singular and plural.
      is_plural = true;
    } else if (starts_with(l, "msgstr")) {
      // Now we are in a msgstr section.
      section = SECTION_STR;
      if (starts_with(l, "msgstr[")) {
        if (!is_plural) {
          syntax_error("plural without msgid_plural on ",
                       input_file_name, lno);
        }
        const std::string::size_type bracket_position = l.find(']');
        if (bracket_position == std::string::npos) {
          syntax_error("Syntax error on ", input_file_name, lno);
        }
        // Keep only what follows the closing bracket.
        l = l.substr(bracket_position + 1);
        if (!msgstr.empty()) {
          msgstr += '\0';  // Separator of the various plural forms.
        }
      } else {
        if (is_plural) {
          syntax_error("indexed msgstr required for plural on ",
                       input_file_name, lno);
        }
        l = l.substr(6);
      }
    }
    // Skip empty lines.
    l = trim(l);
    if (l.empty()) {
      continue;
    }
    l = unescape(l);
    if (section == SECTION_CTX) {
      msgctxt += l;
    } else if (section == SECTION_ID) {
      msgid += l;
    } else if (section == SECTION_STR) {
      msgstr += l;
    } else {
      syntax_error("Syntax error on ", input_file_name, lno);
    }
  }
  // Add last entry.
  if (section == SECTION_STR) {
    add(msgctxt, msgid, msgstr, fuzzy);
  }

  // Compute output.
  const std::string output = generate();

  std::ofstream output_file_stream(output_file_name,
                                   std::ios::out | std::ios::binary);
  if (!output_file_stream) {
    std::cerr << "Can not open output file " << output_file_name << std::endl;
    exit(EXIT_FAILURE);
  }
  output_file_stream.write(output.data(),
                           static_cast<std::streamsize>(output.size()));
  output_file_stream.flush();
  if (!output_file_stream) {
    std::cerr << "Error writing output file " << output_file_name << std::endl;
    exit(EXIT_FAILURE);
  }
}

}  // namespace

// Entry point: expects the input .po path and the output .mo path.
int main(int argc, char **argv) {
  if (argc != 3) {
    const char *program_name = (argc > 0 && argv[0] != NULL) ? argv[0]
                                                             : "msgfmt";
    std::cerr << "Usage: " << program_name << " <input.po> <output.mo>\n";
    return EXIT_FAILURE;
  }
  const char *input_file = argv[1];
  const char *output_file = argv[2];

  make(input_file, output_file);

  return EXIT_SUCCESS;
}