From 0c79ed900011be611c952221bc18d1aae5d77007 Mon Sep 17 00:00:00 2001 From: Yury Melnichek Date: Thu, 1 Sep 2011 21:24:12 +0200 Subject: [generator] Normalize names in --dump_prefixes the same way, as we do it in search. --- generator/dumper.cpp | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'generator/dumper.cpp') diff --git a/generator/dumper.cpp b/generator/dumper.cpp index ab16f28e8f..980f6332e7 100644 --- a/generator/dumper.cpp +++ b/generator/dumper.cpp @@ -1,5 +1,8 @@ #include "dumper.hpp" +#include "../indexer/search_delimiters.hpp" +#include "../indexer/search_string_utils.hpp" + #include "../coding/multilang_utf8_string.hpp" #include "../indexer/classificator.hpp" @@ -74,7 +77,7 @@ namespace feature /////////////////////////////////////////////////////////////////// - typedef map > > TokensContainerT; + typedef map > > TokensContainerT; class PrefixesCollector { public: @@ -84,31 +87,20 @@ namespace feature { CHECK(!name.empty(), ("Feature name is empty")); - vector tokens; - strings::SimpleTokenizer tok(name, " "); - while (tok) - { - tokens.push_back(*tok); - ++tok; - } + vector tokens; + search::SplitUniString(search::NormalizeAndSimplifyString(name), + MakeBackInsertFunctor(tokens), search::Delimiters()); if (tokens.empty()) return true; - /* - // ignore token if it's first letter is an uppercase letter - strings::UniString const s1 = strings::MakeUniString(tokens[0]); - strings::UniString const s2 = strings::MakeLowerCase(s1); - if (s1[0] != s2[0]) - return true; - */ for (size_t i = 1; i < tokens.size(); ++i) { - string s; + strings::UniString s; for (size_t numTokens = 0; numTokens < i; ++numTokens) { - s += tokens[numTokens]; - s += " "; + s.append(tokens[numTokens].begin(), tokens[numTokens].end()); + s.push_back(' '); } pair found = m_stats[langCode].insert(make_pair(s, make_pair(1U, name))); @@ -128,7 +120,7 @@ namespace feature void Print(int8_t langCode, TokensContainerT::mapped_type const & container) { - typedef pair > NameElemT; + typedef pair > NameElemT; typedef vector VecToSortT; VecToSortT v(container.begin(), container.end()); sort(v.begin(), v.end(), &SortFunc); @@ -142,7 +134,8 @@ namespace feature { if (it->second.first <= MIN_OCCURRENCE) break; - cout << it->second.first << " " << it->first << " \"" << it->second.second << "\"" << endl; + wcout << it->second.first << " " << std::wstring(it->first.begin(), it->first.end()); + cout << " \"" << it->second.second << "\"" << endl; } } } -- cgit v1.2.3