diff options
author | Yury Melnichek <melnichek@gmail.com> | 2011-09-01 23:24:12 +0400 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2015-09-23 01:22:56 +0300 |
commit | 0c79ed900011be611c952221bc18d1aae5d77007 (patch) | |
tree | 4340617d7f2fb3c71d7e29ec2423c05b35d6b6c7 /generator/dumper.cpp | |
parent | dc2033017d2104ddb7d1e76c5babe516a0a30763 (diff) |
[generator] Normalize names in --dump_prefixes the same way as we do in search.
Diffstat (limited to 'generator/dumper.cpp')
-rw-r--r-- | generator/dumper.cpp | 33 |
1 files changed, 13 insertions, 20 deletions
diff --git a/generator/dumper.cpp b/generator/dumper.cpp index ab16f28e8f..980f6332e7 100644 --- a/generator/dumper.cpp +++ b/generator/dumper.cpp @@ -1,5 +1,8 @@ #include "dumper.hpp" +#include "../indexer/search_delimiters.hpp" +#include "../indexer/search_string_utils.hpp" + #include "../coding/multilang_utf8_string.hpp" #include "../indexer/classificator.hpp" @@ -74,7 +77,7 @@ namespace feature /////////////////////////////////////////////////////////////////// - typedef map<int8_t, map<string, pair<unsigned int, string> > > TokensContainerT; + typedef map<int8_t, map<strings::UniString, pair<unsigned int, string> > > TokensContainerT; class PrefixesCollector { public: @@ -84,31 +87,20 @@ namespace feature { CHECK(!name.empty(), ("Feature name is empty")); - vector<string> tokens; - strings::SimpleTokenizer tok(name, " "); - while (tok) - { - tokens.push_back(*tok); - ++tok; - } + vector<strings::UniString> tokens; + search::SplitUniString(search::NormalizeAndSimplifyString(name), + MakeBackInsertFunctor(tokens), search::Delimiters()); if (tokens.empty()) return true; - /* - // ignore token if it's first letter is an uppercase letter - strings::UniString const s1 = strings::MakeUniString(tokens[0]); - strings::UniString const s2 = strings::MakeLowerCase(s1); - if (s1[0] != s2[0]) - return true; - */ for (size_t i = 1; i < tokens.size(); ++i) { - string s; + strings::UniString s; for (size_t numTokens = 0; numTokens < i; ++numTokens) { - s += tokens[numTokens]; - s += " "; + s.append(tokens[numTokens].begin(), tokens[numTokens].end()); + s.push_back(' '); } pair<TokensContainerT::mapped_type::iterator, bool> found = m_stats[langCode].insert(make_pair(s, make_pair(1U, name))); @@ -128,7 +120,7 @@ namespace feature void Print(int8_t langCode, TokensContainerT::mapped_type const & container) { - typedef pair<string, pair<unsigned int, string> > NameElemT; + typedef pair<strings::UniString, pair<unsigned int, string> > NameElemT; typedef vector<NameElemT> 
VecToSortT; VecToSortT v(container.begin(), container.end()); sort(v.begin(), v.end(), &SortFunc<NameElemT>); @@ -142,7 +134,8 @@ namespace feature { if (it->second.first <= MIN_OCCURRENCE) break; - cout << it->second.first << " " << it->first << " \"" << it->second.second << "\"" << endl; + wcout << it->second.first << " " << std::wstring(it->first.begin(), it->first.end()); + cout << " \"" << it->second.second << "\"" << endl; } } } |