#include "dumper.hpp"

#include "../indexer/search_delimiters.hpp"
#include "../indexer/search_string_utils.hpp"
#include "../indexer/classificator.hpp"
#include "../indexer/feature_processor.hpp"
#include "../indexer/search_trie.hpp"

#include "../coding/multilang_utf8_string.hpp"

#include "../base/logging.hpp"

#include "../std/algorithm.hpp"
#include "../std/bind.hpp"
#include "../std/iostream.hpp"
#include "../std/map.hpp"
#include "../std/queue.hpp"
#include "../std/vector.hpp"

namespace feature
{
  /// Functor for feature::ForEachFromDat: counts how many features share each
  /// exact list of types, how many features were visited in total, and how
  /// many of them have a non-empty first preferred name.
  class TypesCollector
  {
    /// Scratch buffer with the types of the feature currently being processed.
    vector<uint32_t> m_currFeatureTypes;

  public:
    /// Types list -> number of features carrying exactly that list.
    typedef map<vector<uint32_t>, size_t> value_type;
    value_type m_stats;
    size_t m_namesCount;
    size_t m_totalCount;

    TypesCollector() : m_namesCount(0), m_totalCount(0) {}

    /// Called once per feature; the second argument is ignored.
    void operator()(FeatureType & f, uint32_t)
    {
      ++m_totalCount;

      string s1, s2;
      f.GetPreferredNames(s1, s2);
      if (!s1.empty())
        ++m_namesCount;

      // ForEachTypeRef calls operator()(uint32_t) below for every type.
      m_currFeatureTypes.clear();
      f.ForEachTypeRef(*this);
      CHECK(!m_currFeatureTypes.empty(), ("Feature without any type???"));

      // operator[] value-initializes a missing counter to 0 before the increment.
      ++m_stats[m_currFeatureTypes];
    }

    /// Called by FeatureType::ForEachTypeRef for every type of the current feature.
    void operator()(uint32_t type)
    {
      m_currFeatureTypes.push_back(type);
    }
  };

  /// Orders pair-like elements by their .second member, biggest first.
  template <class T>
  static bool SortFunc(T const & first, T const & second)
  {
    return first.second > second.second;
  }

  /// Prints every distinct combination of feature types found in fPath,
  /// most frequent first, followed by the total and named feature counts.
  void DumpTypes(string const & fPath)
  {
    TypesCollector doClass;
    feature::ForEachFromDat(fPath, doClass);

    typedef pair<vector<uint32_t>, size_t> stats_elem_type;
    typedef vector<stats_elem_type> vec_to_sort;
    vec_to_sort vecToSort(doClass.m_stats.begin(), doClass.m_stats.end());
    sort(vecToSort.begin(), vecToSort.end(), &SortFunc<stats_elem_type>);

    for (vec_to_sort::const_iterator it = vecToSort.begin(); it != vecToSort.end(); ++it)
    {
      cout << it->second << " ";
      for (size_t i = 0; i < it->first.size(); ++i)
        cout << classif().GetFullObjectName(it->first[i]) << " ";
      cout << endl;
    }

    cout << "Total features: " << doClass.m_totalCount << endl;
    cout << "Features with names: " << doClass.m_namesCount << endl;
  }

  ///////////////////////////////////////////////////////////////////

  /// Language code -> (name prefix -> (occurrences, first name seen with that prefix)).
  typedef map<int8_t, map<strings::UniString, pair<unsigned int, string> > > TokensContainerT;

  /// Functor for feature::ForEachFromDat: for every feature name, counts each
  /// proper prefix made of whole tokens (the first 1 .. N-1 tokens of an
  /// N-token name), separately per language code.
  class PrefixesCollector
  {
  public:
    TokensContainerT m_stats;

    /// Called by FeatureType::ForEachNameRef for every name of a feature.
    /// Always returns true.
    bool operator()(int8_t langCode, string const & name)
    {
      CHECK(!name.empty(), ("Feature name is empty"));

      vector<strings::UniString> tokens;
      search::SplitUniString(search::NormalizeAndSimplifyString(name),
                             MakeBackInsertFunctor(tokens), search::Delimiters());

      // Names with fewer than two tokens have no proper prefix. Returning
      // here also avoids creating an empty per-language map entry.
      if (tokens.size() < 2)
        return true;

      TokensContainerT::mapped_type & langStats = m_stats[langCode];

      // Grow the prefix by one token (plus a trailing space) per iteration
      // instead of rebuilding it from scratch every time.
      strings::UniString prefix;
      for (size_t i = 1; i < tokens.size(); ++i)
      {
        prefix.append(tokens[i - 1].begin(), tokens[i - 1].end());
        prefix.push_back(' ');

        pair<TokensContainerT::mapped_type::iterator, bool> found =
            langStats.insert(make_pair(prefix, make_pair(1U, name)));
        if (!found.second)
          ++found.first->second.first;
      }
      return true;
    }

    /// Called once per feature; the second argument is ignored.
    void operator()(FeatureType & f, uint32_t)
    {
      f.ForEachNameRef(*this);
    }
  };

  /// Prefixes must occur more often than this to be printed.
  static size_t const MIN_OCCURRENCE = 3;

  /// Prints the prefixes of one language that occur more than MIN_OCCURRENCE
  /// times, most frequent first, each with a sample name. Prints nothing
  /// (not even the language header) when no prefix is frequent enough.
  void Print(int8_t langCode, TokensContainerT::mapped_type const & container)
  {
    typedef pair<strings::UniString, pair<unsigned int, string> > NameElemT;
    typedef vector<NameElemT> VecToSortT;

    VecToSortT v(container.begin(), container.end());
    sort(v.begin(), v.end(), &SortFunc<NameElemT>);

    // do not display prefixes with low occurrences
    // (the emptiness check protects the v[0] access below)
    if (v.empty() || v[0].second.first <= MIN_OCCURRENCE)
      return;

    cout << "Language code: " << StringUtf8Multilang::GetLangByCode(langCode) << endl;

    for (VecToSortT::const_iterator it = v.begin(); it != v.end(); ++it)
    {
      // v is sorted in descending order, so nothing after this point qualifies.
      if (it->second.first <= MIN_OCCURRENCE)
        break;

      cout << it->second.first << " " << strings::ToUtf8(it->first);
      cout << " \"" << it->second.second << "\"" << endl;
    }
  }

  /// Prints frequent name prefixes of the features in fPath, language by language.
  void DumpPrefixes(string const & fPath)
  {
    PrefixesCollector doClass;
    feature::ForEachFromDat(fPath, doClass);

    for (TokensContainerT::const_iterator it = doClass.m_stats.begin();
         it != doClass.m_stats.end(); ++it)
    {
      Print(it->first, it->second);
    }
  }

  /// Upper bound for the number of tokens reported by DumpSearchTokens.
  static size_t const MAX_TOKENS_TO_DUMP = 100;

  /// Functor for ::trie::ForEachRef: counts runs of equal consecutive tokens
  /// and keeps the MAX_TOKENS_TO_DUMP tokens with the highest counts.
  /// Finish() must be called after the traversal to account for the last run.
  /// NOTE(review): relies on equal tokens being reported consecutively by the
  /// trie traversal - confirm against ::trie::ForEachRef.
  struct SearchTokensCollector
  {
    typedef pair<uint32_t, strings::UniString> TokenT;

    /// Makes the priority queue a min-heap, so top() is the least frequent
    /// kept token and pop() evicts it. The default max-heap evicted the most
    /// frequent token, leaving only the rarest ones.
    struct LeastFrequentOnTop
    {
      bool operator()(TokenT const & l, TokenT const & r) const { return l > r; }
    };

    priority_queue<TokenT, vector<TokenT>, LeastFrequentOnTop> tokens;
    strings::UniString m_currentS;
    uint32_t m_currentCount;

    SearchTokensCollector() : m_currentS(), m_currentCount(0) {}

    /// Called for every (token, value) pair of the trie; the value is ignored.
    void operator()(strings::UniString const & s, search::trie::ValueReader::ValueType const &)
    {
      if (!(m_currentS == s))
      {
        FlushCurrent();
        m_currentS = s;
        m_currentCount = 0;
      }
      // Count this occurrence too, including the first one of a new token.
      ++m_currentCount;
    }

    /// Stores the run that was still being counted when the traversal ended.
    void Finish()
    {
      FlushCurrent();
    }

  private:
    /// Moves the current run (if any) into the heap and trims the heap to
    /// MAX_TOKENS_TO_DUMP entries. A zero count means nothing was seen yet.
    void FlushCurrent()
    {
      if (m_currentCount == 0)
        return;

      tokens.push(make_pair(m_currentCount, m_currentS));
      if (tokens.size() > MAX_TOKENS_TO_DUMP)
        tokens.pop();
    }
  };

  /// Prints the most frequent tokens of the search index stored in fPath,
  /// one "<count> '<token>'" line per token, most frequent first.
  void DumpSearchTokens(string const & fPath)
  {
    FilesContainerR container(new FileReader(fPath));

    feature::DataHeader header;
    header.Load(container.GetReader(HEADER_FILE_TAG));

    serial::CodingParams cp(search::GetCPForTrie(header.GetDefCodingParams()));

    // NOTE(review): the scoped_ptr element type is assumed to be
    // search::TrieIterator - confirm against search_trie.hpp.
    scoped_ptr<search::TrieIterator> pTrieRoot(
        ::trie::reader::ReadTrie(container.GetReader(SEARCH_INDEX_FILE_TAG),
                                 search::trie::ValueReader(cp),
                                 search::trie::EdgeValueReader()));

    SearchTokensCollector f;
    ::trie::ForEachRef(*pTrieRoot, f, strings::UniString());
    f.Finish();

    // The min-heap yields tokens in ascending order; collect them first and
    // print backwards to get the most frequent token on the first line.
    vector<SearchTokensCollector::TokenT> sorted;
    sorted.reserve(f.tokens.size());
    while (!f.tokens.empty())
    {
      sorted.push_back(f.tokens.top());
      f.tokens.pop();
    }

    for (size_t i = sorted.size(); i > 0; --i)
    {
      SearchTokensCollector::TokenT const & token = sorted[i - 1];
      cout << token.first << " '" << strings::ToUtf8(token.second) << "'" << endl;
    }
  }
}  // namespace feature