#!/usr/bin/env python
import gzip


# Class to store word frequencies. Word frequencies are read from a tab-separated file containing two fields:
# frequencies first and words second. Words must be lowercased. The file must be gzipped. Such files can be easily
# produced from monolingual text by running a command like this:
# cat monolingual.txt | tokenizer.sh | tr ' ' '\n' | tr '[:upper:]' '[:lower:]' | sort | uniq -c > wordfreq.txt
class WordFreqList(object):
    """Relative word frequencies loaded from a gzipped frequency list.

    Each usable line of the input holds an occurrence count followed by a
    word, separated by whitespace. After construction the instance exposes:

    * ``word_freqs``: dict mapping each word to count / ``total_words``.
    * ``total_words``: sum of the counts of all stored words.
    * ``min_freq``: ``1.0 / total_words``, returned for unknown words.
    """

    # Constructor
    def __init__(self, file_with_freq):
        """Load the frequency list.

        Args:
            file_with_freq: path to a gzipped file, or any object with a
                ``name`` attribute holding that path (for example an
                already opened file); only the name is used and the file is
                reopened with ``gzip``.

        Raises:
            ValueError: if the first field of a line is not an integer.
            ZeroDivisionError: if the file yields no entries, or the counts
                sum to zero.
        """
        # Fall back to the argument itself when it carries no ``name``.
        fname = getattr(file_with_freq, 'name', file_with_freq)
        word_occs = dict()
        with gzip.open(fname, "r") as reader:
            for line in reader:
                parts = line.decode("utf-8").split()
                # Blank lines and count-only lines (what ``uniq -c`` prints
                # for empty tokens) carry no word. Without this guard a
                # blank line raises IndexError and a count-only line is
                # stored as if the count itself were a word.
                if len(parts) < 2:
                    continue
                # The first field is the count and the last one the word; a
                # repeated word keeps the count of its last line.
                word_occs[parts[-1]] = int(parts[0])
        self.total_words = sum(word_occs.values())
        total = float(self.total_words)
        self.word_freqs = {word: occs / total for word, occs in word_occs.items()}
        self.min_freq = 1.0 / total

    def get_word_freq(self, word):
        """Return the relative frequency of ``word``.

        The lookup is done on the lowercased word; words missing from the
        list get ``min_freq``.
        """
        return self.word_freqs.get(word.lower(), self.min_freq)