Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author vng <viktor.govako@gmail.com> 2011-11-18 22:22:44 +0400
committer Alex Zolotarev <alex@maps.me> 2015-09-23 01:28:20 +0300
commit ad54bdeecc904dd1a984561a8ef013e2803e8c84 (patch)
tree e056365968b356d26387257a3e09c1386daafd9f /lang_getter
parent 53322f4a860c7374ed5c5d7419e1e292ecbbc6ec (diff)
Add languages getter utility. It uses wikipedia as source.
Diffstat (limited to 'lang_getter')
-rwxr-xr-x lang_getter/debug/lang_getter bin 0 -> 185228 bytes
-rw-r--r-- lang_getter/lang_getter.pro 28
-rw-r--r-- lang_getter/logging.cpp 30
-rw-r--r-- lang_getter/logging.h 16
-rw-r--r-- lang_getter/main.cpp 13
-rw-r--r-- lang_getter/mainmanager.cpp 304
-rw-r--r-- lang_getter/mainmanager.h 64
-rw-r--r-- lang_getter/pagedownloader.cpp 62
-rw-r--r-- lang_getter/pagedownloader.h 37
-rwxr-xr-x lang_getter/release/lang_getter bin 0 -> 60852 bytes
-rw-r--r-- lang_getter/stringparser.cpp 36
-rw-r--r-- lang_getter/stringparser.h 39
12 files changed, 629 insertions, 0 deletions
diff --git a/lang_getter/debug/lang_getter b/lang_getter/debug/lang_getter
new file mode 100755
index 0000000000..95df683fa4
--- /dev/null
+++ b/lang_getter/debug/lang_getter
Binary files differ
diff --git a/lang_getter/lang_getter.pro b/lang_getter/lang_getter.pro
new file mode 100644
index 0000000000..e3dc4cc0af
--- /dev/null
+++ b/lang_getter/lang_getter.pro
@@ -0,0 +1,28 @@
+#-------------------------------------------------
+#
+# Project created by QtCreator 2011-11-18T08:50:14
+#
+#-------------------------------------------------
+
+QT += core network xml
+
+QT -= gui
+
+TARGET = lang_getter
+CONFIG += console
+CONFIG -= app_bundle
+
+TEMPLATE = app
+
+
+SOURCES += main.cpp \
+ pagedownloader.cpp \
+ logging.cpp \
+ mainmanager.cpp \
+ stringparser.cpp
+
+HEADERS += \
+ pagedownloader.h \
+ logging.h \
+ mainmanager.h \
+ stringparser.h
diff --git a/lang_getter/logging.cpp b/lang_getter/logging.cpp
new file mode 100644
index 0000000000..22250fca96
--- /dev/null
+++ b/lang_getter/logging.cpp
@@ -0,0 +1,30 @@
+#include "logging.h"
+
+#include <iostream>
+
+
+using namespace std;
+
+Logging::Logging() // Nothing to set up: the class declares no data members.
+{
+}
+
+// Writes "<status label> <message>" as a single line to standard output
+// and flushes the stream.
+void Logging::Print(STATUS s, QString const & msg)
+{
+  std::string const label = StatusToString(s).toStdString();
+  std::string const text = msg.toStdString();
+
+  cout << label << " " << text << endl;
+}
+
+void Logging::Percent(qint64 curr, qint64 total) // Progress hook; currently a no-op that ignores both arguments.
+{
+}
+
+// Returns the upper-case label for a status value.
+// Any value outside the STATUS enumerators yields "NONE".
+QString Logging::StatusToString(STATUS s) const
+{
+  char const * label = "NONE";
+
+  if (s == INFO)
+    label = "INFO";
+  else if (s == WARNING)
+    label = "WARNING";
+  else if (s == ERROR)
+    label = "ERROR";
+
+  return label;
+}
diff --git a/lang_getter/logging.h b/lang_getter/logging.h
new file mode 100644
index 0000000000..86c1fcfc9e
--- /dev/null
+++ b/lang_getter/logging.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <QString>
+
+// Minimal console logger shared by the downloader, the parser and the manager.
+class Logging
+{
+public:
+  // Severity attached to every printed message.
+  enum STATUS { INFO, WARNING, ERROR };
+
+  Logging();
+
+  // Prints the label of s followed by msg on one line.
+  void Print(STATUS s, QString const & msg);
+
+  // Download progress callback (curr of total).
+  void Percent(qint64 curr, qint64 total);
+
+  // Converts a status value to its textual label.
+  QString StatusToString(STATUS s) const;
+};
diff --git a/lang_getter/main.cpp b/lang_getter/main.cpp
new file mode 100644
index 0000000000..7aa0e9dfa8
--- /dev/null
+++ b/lang_getter/main.cpp
@@ -0,0 +1,13 @@
+#include <QtCore/QCoreApplication>
+
+#include "mainmanager.h"
+
+// Entry point.
+//
+// Usage: lang_getter [<output dir> [<country list file>]]
+//
+// Both arguments are optional; when omitted the original hard-coded
+// developer paths are used, so existing invocations keep working.
+int main(int argc, char *argv[])
+{
+  QCoreApplication a(argc, argv);
+
+  QString outDir("/Users/alena/omim/omim/data/metainfo/");
+  QString countryList("/Users/alena/omim/omim/data/polygons.lst");
+
+  if (argc > 1)
+  {
+    outDir = QString::fromLocal8Bit(argv[1]);
+
+    // Result file names are built by plain concatenation with the output
+    // directory, so make sure a non-empty directory ends with a separator.
+    if (!outDir.isEmpty() && !outDir.endsWith('/'))
+      outDir += '/';
+  }
+
+  if (argc > 2)
+    countryList = QString::fromLocal8Bit(argv[2]);
+
+  MainManager manager(outDir);
+  manager.ProcessCountryList(countryList);
+
+  // Downloads are asynchronous; the event loop keeps the process alive
+  // until the manager terminates it.
+  return a.exec();
+}
diff --git a/lang_getter/mainmanager.cpp b/lang_getter/mainmanager.cpp
new file mode 100644
index 0000000000..5f8df4bbb9
--- /dev/null
+++ b/lang_getter/mainmanager.cpp
@@ -0,0 +1,304 @@
+#include "mainmanager.h"
+
+#include <QFile>
+
+#include <algorithm>
+#include <fstream>
+#include <string>
+
+
+using namespace std;
+
+// Remembers a language code for this country; duplicates are ignored.
+void MainManager::Country::AddCode(QString const & code)
+{
+  bool const known = (find(m_codes.begin(), m_codes.end(), code) != m_codes.end());
+  if (!known)
+    m_codes.push_back(code);
+}
+
+// Remembers the index of a language page for this country; an index that
+// is already stored is not added again.
+void MainManager::Country::AddUrl(size_t url)
+{
+  for (size_t i = 0; i < m_langUrls.size(); ++i)
+    if (m_langUrls[i] == url)
+      return;
+
+  m_langUrls.push_back(url);
+}
+
+namespace
+{
+  // Appends s to res, putting a '|' between consecutive entries.
+  void append(QString & res, QString const & s)
+  {
+    if (!res.isEmpty())
+      res += "|";
+    res += s;
+  }
+}
+
+// Builds the '|'-separated list of language codes for this country.
+//
+// res receives the list (it is cleared first); m supplies the table that the
+// stored language indices refer to. Returns false when no code is available.
+bool MainManager::Country::GetResult(QString & res, MainManager const & m) const
+{
+  res.clear();
+
+  // Codes recognised directly on the country page come first.
+  for (vector<QString>::const_iterator it = m_codes.begin(); it != m_codes.end(); ++it)
+    append(res, *it);
+
+  // Then the entries looked up through the manager's language table;
+  // empty entries are skipped.
+  for (vector<size_t>::const_iterator it = m_langUrls.begin(); it != m_langUrls.end(); ++it)
+  {
+    QString const code = m.m_langUrls[*it];
+    if (code.isEmpty())
+      continue;
+
+    append(res, code);
+  }
+
+  return !res.isEmpty();
+}
+
+
+// Creates a manager that writes its result files with the outDir prefix.
+// The downloader and the parser both report through this manager's logger.
+MainManager::MainManager(QString const & outDir)
+  : m_downloader(m_log), m_parser(m_log), m_outDir(outDir),
+    m_index(0)  // was left uninitialised until ProcessCountryList() ran
+{
+}
+
+// Maps a language name to a two-letter code for the few languages that are
+// recognised without visiting their own page. The match is a
+// case-insensitive substring test, checked in table order.
+// Returns 0 when the name is not recognised.
+char const * MainManager::LangNameToCode(QString const & name)
+{
+  static char const * const known[][2] =
+  {
+    { "English", "en" },
+    { "Spanish", "es" },
+    { "French", "fr" },
+    { "Mandarin", "zh" }
+  };
+
+  for (size_t i = 0; i < sizeof(known) / sizeof(known[0]); ++i)
+    if (name.contains(known[i][0], Qt::CaseInsensitive))
+      return known[i][1];
+
+  return 0;
+}
+
+// Reads the country list (one name per line, empty lines ignored) and starts
+// downloading the country pages one after another.
+//
+// file - path of the list file; an error is logged and nothing is started
+//        when it cannot be opened.
+void MainManager::ProcessCountryList(QString const & file)
+{
+  ifstream s(file.toStdString().c_str());
+  if (!s.is_open() || !s.good())
+  {
+    m_log.Print(Logging::ERROR, QString("Can't open file: ") + file);
+    return;
+  }
+
+  // std::getline on a std::string has no line length limit. The previous
+  // fixed 256-byte buffer set failbit on a longer line, which silently
+  // dropped the rest of the list.
+  string line;
+  while (getline(s, line))
+  {
+    // Tolerate list files saved with CRLF line endings.
+    if (!line.empty() && line[line.size() - 1] == '\r')
+      line.erase(line.size() - 1);
+
+    if (!line.empty())
+      m_countries.push_back(Country(line.c_str()));
+  }
+
+  m_downloader.ConnectFinished(this, SLOT(countryDownloaded(QString const &)));
+
+  m_index = 0;
+  ProcessNextCountry();
+}
+
+namespace
+{
+  // Turns a list entry into the page name used for the wiki url:
+  // everything from the first '_' on is dropped (a region maps to its
+  // country) and spaces become underscores.
+  void get_country_url(QString & name)
+  {
+    int const underscore = name.indexOf('_');
+    if (underscore != -1)
+      name = name.left(underscore);
+
+    name.replace(' ', '_');
+  }
+}
+
+// Requests the page of the country at m_index. Once every country has been
+// handled, re-routes the downloader to the language slot and starts the
+// language pass.
+void MainManager::ProcessNextCountry()
+{
+  if (m_index < m_countries.size())
+  {
+    QString page = m_countries[m_index].m_name;
+    get_country_url(page);
+
+    m_downloader.Download(QString("http://en.wikipedia.org/wiki/") + page);
+    return;
+  }
+
+  // No countries left: switch to resolving the collected language urls.
+  m_downloader.ConnectFinished(this, SLOT(languageDownloaded(QString const &)));
+
+  m_index = 0;
+  ProcessNextLanguage();
+}
+
+namespace
+{
+  // Functor for TokenizeString: converts each token to a language code and,
+  // when one is recognised, stores it in the manager.
+  class append_result
+  {
+    MainManager & m_manager;
+
+  public:
+    append_result(MainManager & m) : m_manager(m) {}
+
+    void operator() (QString const & s)
+    {
+      if (char const * code = m_manager.LangNameToCode(s))
+        m_manager.AppendResult(code);
+    }
+  };
+
+  // Walks the language links below a root element. Two layouts are handled:
+  // direct <a> children, or a <ul> whose <li> items are visited instead.
+  class nodes_iterator
+  {
+    QDomElement m_node;
+    bool m_isList;
+
+  public:
+    nodes_iterator(QDomElement const & root)
+      : m_node(root.firstChildElement("a")), m_isList(false)
+    {
+      if (!m_node.isNull())
+        return;  // plain links found
+
+      // No direct link: fall back to the items of a compound list.
+      QDomElement const list = root.firstChildElement("ul");
+      if (!list.isNull())
+      {
+        m_node = list.firstChildElement("li");
+        m_isList = true;
+      }
+    }
+
+    bool valid() const { return !m_node.isNull(); }
+
+    // Current element: the node itself, or the first <a> inside the
+    // current list item when iterating a list.
+    QDomElement get() const
+    {
+      if (m_isList)
+        return m_node.firstChildElement("a");
+      return m_node;
+    }
+
+    // Advances to the next sibling of the kind being iterated.
+    void next()
+    {
+      char const * const tag = m_isList ? "li" : "a";
+      m_node = m_node.nextSiblingElement(tag);
+    }
+  };
+}
+
+// Extracts languages from the <td> cell that follows the given entry label
+// in a downloaded country page.
+//
+// Recognised language names are stored as codes immediately; other links are
+// remembered by url so that their pages can be visited later.
+void MainManager::ProcessLangEntry(QString const & xml, QString const & entry)
+{
+  if (!m_parser.InitSubDOM(xml, entry, "td"))
+    return;
+
+  QDomElement const cell = m_parser.Root();
+  nodes_iterator it(cell);
+
+  if (!it.valid())
+  {
+    // No child links at all: try the plain text of the cell itself.
+    TokenizeString(cell.text(), ", ", append_result(*this));
+    return;
+  }
+
+  for (; it.valid(); it.next())
+  {
+    QDomElement const e = it.get();
+    QString const title = e.text();
+
+    if (char const * code = LangNameToCode(title))
+    {
+      AppendResult(code);
+      continue;
+    }
+
+    // Unknown name: keep the link target for the language pass.
+    QString const url = e.attribute("href");
+    if (url.isEmpty())
+      m_log.Print(Logging::WARNING, QString("Undefined language without url: ") + title);
+    else
+      AppendLangUrl(url);
+  }
+}
+
+// Slot: a country page has arrived. Scans it for the language rows and
+// moves on to the next country.
+void MainManager::countryDownloaded(QString const & s)
+{
+  static char const * const entries[] = { "Official language(s)", "National language" };
+
+  for (size_t i = 0; i < sizeof(entries) / sizeof(entries[0]); ++i)
+    ProcessLangEntry(s, entries[i]);
+
+  ++m_index;
+  ProcessNextCountry();
+}
+
+// Stores a language code for the country that is currently being processed.
+void MainManager::AppendResult(QString const & code)
+{
+  Country & current = m_countries[m_index];
+  current.AddCode(code);
+}
+
+// Registers a language page for the current country.
+//
+// Only the part of url after the last '/' is kept. Each distinct page name
+// is stored once in m_langUrls; the country records its index.
+void MainManager::AppendLangUrl(QString url)
+{
+  int const slash = url.lastIndexOf("/");
+  if (slash != -1)
+    url = url.mid(slash + 1);
+
+  // Linear search for an existing entry; index ends at size() when absent.
+  size_t index = 0;
+  while (index < m_langUrls.size() && !(m_langUrls[index] == url))
+    ++index;
+
+  if (index == m_langUrls.size())
+    m_langUrls.push_back(url);
+
+  m_countries[m_index].AddUrl(index);
+}
+
+// Requests the page of the language at m_index. When every language page
+// has been handled, writes the result files and terminates the process.
+void MainManager::ProcessNextLanguage()
+{
+  if (m_index < m_langUrls.size())
+  {
+    m_downloader.Download(QString("http://en.wikipedia.org/wiki/") + m_langUrls[m_index]);
+    return;
+  }
+
+  CreateResultFiles();
+  m_log.Print(Logging::INFO, "Done!");
+
+  exit(0);
+}
+
+// Looks for the <tt> value in the <td> cell that follows the given entry
+// label of a language page. On success the current m_langUrls slot is
+// overwritten with that value and true is returned.
+bool MainManager::ProcessCodeEntry(QString const & xml, QString const & entry)
+{
+  if (!m_parser.InitSubDOM(xml, entry, "td"))
+    return false;
+
+  QDomElement const e = m_parser.Root().firstChildElement("tt");
+  if (e.isNull())
+    return false;
+
+  QString const name = e.text();
+  if (name.isEmpty())
+    return false;
+
+  m_langUrls[m_index] = name;
+  return true;
+}
+
+// Slot: a language page has arrived. Tries the ISO 639-1, 639-2 and 639-3
+// rows in that order; when none yields a code the slot is blanked so the
+// language is skipped in the results. Then continues with the next language.
+void MainManager::languageDownloaded(QString const & s)
+{
+  // Short-circuit evaluation stops at the first row that provides a code.
+  bool const found = ProcessCodeEntry(s, "ISO 639-1") ||
+                     ProcessCodeEntry(s, "ISO 639-2") ||
+                     ProcessCodeEntry(s, "ISO 639-3");
+
+  if (!found)
+  {
+    m_log.Print(Logging::WARNING, QString("Can't find code for url: ") + m_langUrls[m_index]);
+    m_langUrls[m_index] = QString();
+  }
+
+  ++m_index;
+  ProcessNextLanguage();
+}
+
+// Writes "<outDir><country name>.meta" for every country that has at least
+// one language code; the file holds the '|'-separated codes.
+// Countries without languages, and files that cannot be created or written,
+// are reported through the log and skipped.
+void MainManager::CreateResultFiles()
+{
+  m_log.Print(Logging::INFO, "Results:");
+
+  for (size_t i = 0; i < m_countries.size(); ++i)
+  {
+    Country const & country = m_countries[i];
+
+    QString s;
+    if (!country.GetResult(s, *this))
+    {
+      m_log.Print(Logging::WARNING, QString("No languages for country: ") + country.m_name);
+      continue;
+    }
+
+    QString const path = m_outDir + country.m_name + QString(".meta");
+
+    QFile f(path);
+    // The open result used to be ignored, so a bad output directory
+    // produced no files and no diagnostics.
+    if (!f.open(QFile::WriteOnly))
+    {
+      m_log.Print(Logging::ERROR, QString("Can't create file: ") + path);
+      continue;
+    }
+
+    QByteArray const data = s.toUtf8();
+    if (f.write(data) != data.size())
+      m_log.Print(Logging::ERROR, QString("Can't write file: ") + path);
+  }
+}
diff --git a/lang_getter/mainmanager.h b/lang_getter/mainmanager.h
new file mode 100644
index 0000000000..5c80403ff1
--- /dev/null
+++ b/lang_getter/mainmanager.h
@@ -0,0 +1,64 @@
+#pragma once
+#include "logging.h"
+#include "pagedownloader.h"
+#include "stringparser.h"
+
+#include <QObject>
+#include <vector>
+
+
+// Drives the whole job: reads the country list, downloads every country
+// page to collect language codes and language page urls, resolves those
+// urls to codes and finally writes one ".meta" file per country.
+class MainManager : public QObject
+{
+  Q_OBJECT
+
+  // Declared first: the downloader and the parser are constructed with a
+  // reference to it.
+  Logging m_log;
+  PageDownloader m_downloader;
+  ContentParser m_parser;
+
+  // Prefix prepended to every result file name.
+  QString m_outDir;
+
+  // Language information collected for one entry of the country list.
+  class Country
+  {
+    std::vector<QString> m_codes;    // codes recognised directly
+    std::vector<size_t> m_langUrls;  // indices into MainManager::m_langUrls
+
+  public:
+    QString m_name;
+
+    // Intentionally implicit: the list reader pushes raw C strings.
+    Country(char const * name) : m_name(name) {}
+
+    // Both adders ignore values that are already stored.
+    void AddCode(QString const & code);
+    void AddUrl(size_t url);
+
+    // Fills res with the '|'-separated codes; false when there are none.
+    bool GetResult(QString & res, MainManager const & m) const;
+  };
+
+  std::vector<Country> m_countries;
+  // Language page names; each slot is later replaced by the code found on
+  // that page, or by an empty string when no code was found.
+  std::vector<QString> m_langUrls;
+
+  // Position in m_countries during the country pass and in m_langUrls
+  // during the language pass.
+  size_t m_index;
+
+public:
+  // explicit: a QString must not silently convert into a manager.
+  explicit MainManager(QString const & outDir);
+
+  // Loads the country list from file and starts the download chain.
+  void ProcessCountryList(QString const & file);
+
+protected:
+  void ProcessNextCountry();
+  void ProcessLangEntry(QString const & xml, QString const & entry);
+
+public: // need for functor
+  // Returns a language code for a recognised language name, 0 otherwise.
+  char const * LangNameToCode(QString const & name);
+  // Adds a code to the country that is currently being processed.
+  void AppendResult(QString const & code);
+protected:
+  void AppendLangUrl(QString url);
+
+  void ProcessNextLanguage();
+  bool ProcessCodeEntry(QString const & xml, QString const & entry);
+
+  void CreateResultFiles();
+
+private slots:
+  // Receive the text of a downloaded country / language page.
+  void countryDownloaded(QString const & s);
+  void languageDownloaded(QString const & s);
+};
diff --git a/lang_getter/pagedownloader.cpp b/lang_getter/pagedownloader.cpp
new file mode 100644
index 0000000000..87e089e13d
--- /dev/null
+++ b/lang_getter/pagedownloader.cpp
@@ -0,0 +1,62 @@
+#include "pagedownloader.h"
+#include "logging.h"
+
+#include <QUrl>
+#include <QNetworkReply>
+
+
+void PageDownloader::ConnectFinished(QObject * obj, char const * slot) // Makes obj's slot the only receiver of finished(QString const &).
+{
+ disconnect(SIGNAL(finished(QString const &))); // drop whatever was connected before
+ connect(this, SIGNAL(finished(QString const &)), obj, slot); // then attach the new receiver
+}
+
+void PageDownloader::Download(QUrl const & url) // Starts a GET request for url; the page text is delivered through finished(QString const &).
+{
+ m_res.clear(); // discard the bytes accumulated for the previous request
+
+ m_reply = m_manager.get(QNetworkRequest(url));
+ connect(m_reply, SIGNAL(finished()), this, SLOT(httpFinished())); // completion: decode the buffer and emit finished()
+ connect(m_reply, SIGNAL(readyRead()), this, SLOT(httpReadyRead())); // append each arriving chunk to m_res
+ connect(m_reply, SIGNAL(downloadProgress(qint64,qint64)), this,
+ SLOT(updateDataReadProgress(qint64,qint64))); // forward progress to the logger
+}
+
+void PageDownloader::Download(QString const & url) // Convenience overload: wraps the string in a QUrl and delegates.
+{
+ Download(QUrl(url));
+}
+
+// Slot: the current reply has completed. Decodes the accumulated bytes as
+// UTF-8, logs the outcome (an empty text counts as a failure), releases the
+// reply and hands the text to the connected receiver.
+void PageDownloader::httpFinished()
+{
+  QString const s = QString::fromUtf8(m_res.constData());
+  QString const url = m_reply->url().toString();
+
+  bool const failed = s.isEmpty();
+  m_log.Print(failed ? Logging::WARNING : Logging::INFO,
+              QString("Downloading of ") +
+              url +
+              QString(failed ? " failed." : " finished successfully."));
+
+  // The reply is cleared before emitting so that a receiver may start the
+  // next download from inside its slot.
+  m_reply->deleteLater();
+  m_reply = 0;
+
+  emit finished(s);
+}
+
+void PageDownloader::httpReadyRead() // Slot: appends everything currently readable from the reply to m_res.
+{
+ m_res += m_reply->readAll();
+}
+
+void PageDownloader::updateDataReadProgress(qint64 read, qint64 total) // Slot: forwards download progress to the logger.
+{
+ m_log.Percent(read, total);
+}
diff --git a/lang_getter/pagedownloader.h b/lang_getter/pagedownloader.h
new file mode 100644
index 0000000000..8884818c57
--- /dev/null
+++ b/lang_getter/pagedownloader.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <QObject>
+#include <QNetworkAccessManager>
+
+
+class Logging;
+class QUrl;
+class QNetworkReply;
+
+// Downloads one page at a time and delivers its text through the
+// finished(QString const &) signal.
+class PageDownloader : public QObject
+{
+  Q_OBJECT
+
+  QNetworkAccessManager m_manager;
+  // Reply of the request in flight; 0 when no request is active.
+  QNetworkReply * m_reply;
+
+  Logging & m_log;
+
+  // Bytes received so far for the current request.
+  QByteArray m_res;
+
+public:
+  // m_reply is now initialised: it used to hold an indeterminate value
+  // until the first Download() call.
+  explicit PageDownloader(Logging & log) : m_reply(0), m_log(log) {}
+
+  // Routes finished(QString const &) to obj's slot, replacing any previous
+  // receiver.
+  void ConnectFinished(QObject * obj, char const * slot);
+
+  // Start a GET request for url.
+  void Download(QUrl const & url);
+  void Download(QString const & url);
+
+signals:
+  // Emitted with the decoded page text (empty when nothing was received).
+  void finished(QString const &);
+
+private slots:
+  void httpFinished();
+  void httpReadyRead();
+  void updateDataReadProgress(qint64 read, qint64 total);
+};
diff --git a/lang_getter/release/lang_getter b/lang_getter/release/lang_getter
new file mode 100755
index 0000000000..147f9c40a1
--- /dev/null
+++ b/lang_getter/release/lang_getter
Binary files differ
diff --git a/lang_getter/stringparser.cpp b/lang_getter/stringparser.cpp
new file mode 100644
index 0000000000..e74530dcdc
--- /dev/null
+++ b/lang_getter/stringparser.cpp
@@ -0,0 +1,36 @@
+#include "stringparser.h"
+#include "logging.h"
+
+
+// Parses the first <tag ...> ... /tag> fragment that follows the first
+// occurrence of entry in xml and makes its root element available via Root().
+//
+// xml   - full page text
+// entry - label text that must precede the wanted element
+// tag   - element name without angle brackets
+//
+// Returns false, after logging the reason, when the label, the opening tag
+// or the closing tag is missing or the fragment is not well-formed.
+bool ContentParser::InitSubDOM(QString const & xml, QString const & entry, QString const & tag)
+{
+  int const i = xml.indexOf(entry);
+  if (i == -1)
+  {
+    m_log.Print(Logging::INFO, QString("Can't find entry: ") + entry);
+    return false;
+  }
+
+  // The search starts at i, so a hit can never lie before it.
+  int const beg = xml.indexOf(QString("<") + tag, i);
+  if (beg == -1)
+  {
+    m_log.Print(Logging::INFO, QString("Can't find tag: ") + tag);
+    return false;
+  }
+
+  QString const last = QString("/") + tag + QString(">");
+  int const end = xml.indexOf(last, beg);
+  if (end == -1)
+  {
+    // Downloaded pages are external input. A truncated page must not abort
+    // a debug build or feed a negative length to mid() in a release build.
+    m_log.Print(Logging::WARNING, QString("Can't find closing tag: ") + tag);
+    return false;
+  }
+
+  if (!m_doc.setContent(xml.mid(beg, end - beg + last.length())))
+  {
+    m_log.Print(Logging::ERROR, QString("QDomDocument::setContent error"));
+    return false;
+  }
+
+  m_node = m_doc.documentElement();
+  Q_ASSERT ( !m_node.isNull() );
+  Q_ASSERT ( m_node.tagName() == tag );
+
+  return true;
+}
diff --git a/lang_getter/stringparser.h b/lang_getter/stringparser.h
new file mode 100644
index 0000000000..adbbb991de
--- /dev/null
+++ b/lang_getter/stringparser.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <QDomDocument>
+
+
+class Logging;
+
+// Extracts a single element from page text and exposes it as a DOM node.
+class ContentParser
+{
+  Logging & m_log;
+
+  // Document built by the last successful InitSubDOM() and its root element.
+  QDomDocument m_doc;
+  QDomElement m_node;
+
+public:
+  // explicit: a logger must not silently convert into a parser.
+  explicit ContentParser(Logging & log) : m_log(log) {}
+
+  // Parses the first <tag> element found after entry in xml.
+  // Returns true on success; Root() then yields that element.
+  bool InitSubDOM(QString const & xml, QString const & entry, QString const & tag);
+
+  // Root element set by the last successful InitSubDOM() call.
+  QDomElement Root() const { return m_node; }
+};
+
+// Splits s at every character contained in delim and calls toDo with each
+// non-empty piece, from left to right.
+template <class ToDo>
+void TokenizeString(QString const & s, QString const & delim, ToDo toDo)
+{
+  int const len = s.length();
+  int tokenStart = 0;
+
+  // pos == len acts as a virtual delimiter that flushes the trailing token.
+  for (int pos = 0; pos <= len; ++pos)
+  {
+    bool const atBoundary = (pos == len) || (delim.indexOf(s[pos]) != -1);
+    if (!atBoundary)
+      continue;
+
+    if (pos > tokenStart)
+      toDo(s.mid(tokenStart, pos - tokenStart));
+
+    tokenStart = pos + 1;
+  }
+}