Welcome to mirror list, hosted at ThFree Co, Russian Federation.

vocabulary.h « src - github.com/moses-smt/nplm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: a9875229396a253a4b943adff497459b4956528f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#ifndef VOCABULARY_H
#define VOCABULARY_H

#include <vector>
#include <string>
#include <queue>
#include <boost/unordered_map.hpp>

namespace nplm
{

// Comparison functor for pair-like values: compares two objects by their
// `second` member only; `first` is never inspected.
template <typename T>
struct compare_second
{
  // Returns true when lhs.second compares less than rhs.second.
  bool operator()(const T &lhs, const T &rhs) const
  {
    const bool lhs_is_smaller = lhs.second < rhs.second;
    return lhs_is_smaller;
  }
};

class vocabulary {
    std::vector<std::string> m_words;
    boost::unordered_map<std::string, int> m_index;
    int unk;

public:
    vocabulary() 
    { 
        unk = insert_word("<unk>");
    }

    vocabulary(const std::vector<std::string> &words)
      :
      m_words(words)
    {
        for (int i=0; i<words.size(); i++)
            m_index[words[i]] = i;
	unk = m_index["<unk>"];
    }

    int lookup_word(const std::string &word) const
    {
        boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
	if (pos != m_index.end())
	    return pos->second;
	else
	    return unk;
    }

    // lookup word using custom unknown-word id
    int lookup_word(const std::string &word, int unk) const
    {
        boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
	if (pos != m_index.end())
	    return pos->second;
	else
	    return unk;
    }

    int insert_word(const std::string &word)
    {
        int i = size();
        bool inserted = m_index.insert(make_pair(word, i)).second;
	if (inserted)
	{
	    m_words.push_back(word);
	}
	return i;
    }

    int size() const { return m_words.size(); }

    // Inserts the most-frequent words from counts until vocab_size words are reached.
    // counts is a collection of pair<string,int>
    template <typename Map>
    int insert_most_frequent(const Map &counts, int vocab_size)
    {
        typedef std::pair<std::string,int> stringint;

	std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> > 
	  q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end()));

	int inserted = 0;
	while (size() < vocab_size && !q.empty())
	{
	    insert_word(q.top().first);
	    q.pop();
	    inserted++;
	}
	return inserted;
    }

    const std::vector<std::string> &words() const { return m_words; }

    const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; }
};

} // namespace nplm

#endif