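"""
Utilities for converting Thai word segmentation datasets (orchid, BEST,
lst20, etc) into the files stanza uses to train its tokenizer: a raw
text file, a character-level .toklabels file, a placeholder MWT file,
and a .conllu file with a faked dependency structure.
"""
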
import os
import random

# pythainlp is optional: it is only needed by reprocess_lines, so a
# missing install is not reported until that function is called
try:
    from pythainlp import sent_tokenize
except ImportError:
    pass

def write_section(output_dir, dataset_name, section, documents):
    """
    Writes a list of documents for tokenization, including a file in conll format

    The Thai datasets generally have no MWT (apparently not relevant for Thai)

    output_dir: the destination directory for the output files
    dataset_name: orchid, BEST, lst20, etc
    section: train/dev/test
    documents: a nested list of documents > paragraphs > sentences > words,
      where each word is a (text, space_follows) pair
    """
    with open(os.path.join(output_dir, 'th_%s-ud-%s-mwt.json' % (dataset_name, section)), 'w', encoding='utf-8') as fout:
        fout.write("[]\n")

    # Thai text must be written as UTF-8 regardless of the platform default
    text_out = open(os.path.join(output_dir, 'th_%s.%s.txt' % (dataset_name, section)), 'w', encoding='utf-8')
    label_out = open(os.path.join(output_dir, 'th_%s-ud-%s.toklabels' % (dataset_name, section)), 'w', encoding='utf-8')
    for document in documents:
        for paragraph in document:
            for sentence_idx, sentence in enumerate(paragraph):
                for word_idx, word in enumerate(sentence):
                    # TODO: split with newlines to make it more readable?
                    text_out.write(word[0])
                    for i in range(len(word[0]) - 1):
                        label_out.write("0")
                    if word_idx == len(sentence) - 1:
                        label_out.write("2")
                    else:
                        label_out.write("1")
                    if word[1] and (sentence_idx != len(paragraph) - 1 or word_idx != len(sentence) - 1):
                        text_out.write(' ')
                        label_out.write('0')
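                    # Worked example with hypothetical words: the sentence
                    # [("ab", False), ("cd", True)], when it is not the last
                    # sentence of its paragraph, writes the text "abcd " and
                    # the labels "01020", where 0 marks a non-final character
                    # (or a space), 1 the end of a token, and 2 the end of a
                    # sentence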

            text_out.write("\n\n")
            label_out.write("\n\n")

    text_out.close()
    label_out.close()

    with open(os.path.join(output_dir, 'th_%s.%s.gold.conllu' % (dataset_name, section)), 'w', encoding='utf-8') as fout:
        for document in documents:
            for paragraph in document:
                new_par = True
                for sentence in paragraph:
                    for word_idx, word in enumerate(sentence):
                        # MISC is '_' when there is a space after the word;
                        # otherwise it gets SpaceAfter=No, with NewPar=Yes
                        # added on the first word of a paragraph
                        if word[1] and new_par:
                            space = 'NewPar=Yes'
                        elif word[1]:
                            space = '_'
                        elif new_par:
                            space = 'SpaceAfter=No|NewPar=Yes'
                        else:
                            space = 'SpaceAfter=No'
                        new_par = False

                        # Note the faked dependency structure: the conll reading code
                        # needs it even if it isn't being used in any way
                        fake_dep = 'root' if word_idx == 0 else 'dep'
                        fout.write('{}\t{}\t_\t_\t_\t_\t{}\t{}\t{}:{}\t{}\n'.format(word_idx+1, word[0], word_idx, fake_dep, word_idx, fake_dep, space))
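                        # For example, a hypothetical first word "ab" followed
                        # by a space in mid-paragraph produces the line:
                        # 1	ab	_	_	_	_	0	root	0:root	_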
                    fout.write('\n')

def write_dataset(documents, output_dir, dataset_name):
    """
    Shuffle a list of documents, write three sections
    """
    random.shuffle(documents)
    num_train = int(len(documents) * 0.8)
    num_dev = int(len(documents) * 0.1)
    os.makedirs(output_dir, exist_ok=True)
    write_section(output_dir, dataset_name, 'train', documents[:num_train])
    write_section(output_dir, dataset_name, 'dev', documents[num_train:num_train+num_dev])
    write_section(output_dir, dataset_name, 'test', documents[num_train+num_dev:])
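
# A hypothetical end-to-end use of these helpers, assuming raw_lines is a
# list of word lists (with spaces included as words) from one of the datasets:
#   processed_lines = reprocess_lines(raw_lines)
#   documents = convert_processed_lines(processed_lines)
#   write_dataset(documents, "data/tokenize", "orchid")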

def write_dataset_best(documents, test_documents, output_dir, dataset_name):
    """
    Shuffle a list of documents, write three sections
    """
    random.shuffle(documents)
    num_train = int(len(documents) * 0.85)
    num_dev = int(len(documents) * 0.15)
    os.makedirs(output_dir, exist_ok=True)
    write_section(output_dir, dataset_name, 'train', documents[:num_train])
    write_section(output_dir, dataset_name, 'dev', documents[num_train:num_train+num_dev])
    write_section(output_dir, dataset_name, 'test', test_documents)


def reprocess_lines(processed_lines):
    """
    Reprocesses lines using pythainlp to cut up sentences into shorter sentences.

    Many of the lines in BEST seem to be multiple Thai sentences concatenated, according to native Thai speakers.

    Input: a list of lines, where each line is a list of words.  Space characters can be included as words
    Output: a new list of lines, resplit using pythainlp
    """
    reprocessed_lines = []
    for line in processed_lines:
        text = "".join(line)
        try:
            chunks = sent_tokenize(text)
        except NameError as e:
            raise NameError("Sentences cannot be reprocessed without first installing pythainlp") from e
        # check that the chunks add back up to the original text
        if sum(len(x) for x in chunks) != len(text):
            raise ValueError("Got unexpected text length: \n{}\nvs\n{}".format(text, chunks))

        chunk_lengths = [len(x) for x in chunks]

        current_length = 0
        new_line = []
        for word in line:
            if len(word) + current_length < chunk_lengths[0]:
                new_line.append(word)
                current_length = current_length + len(word)
            elif len(word) + current_length == chunk_lengths[0]:
                new_line.append(word)
                reprocessed_lines.append(new_line)
                new_line = []
                chunk_lengths = chunk_lengths[1:]
                current_length = 0
            else:
                remaining_len = chunk_lengths[0] - current_length
                new_line.append(word[:remaining_len])
                reprocessed_lines.append(new_line)
                word = word[remaining_len:]
                chunk_lengths = chunk_lengths[1:]
                while len(word) > chunk_lengths[0]:
                    new_line = [word[:chunk_lengths[0]]]
                    reprocessed_lines.append(new_line)
                    word = word[chunk_lengths[0]:]
                    chunk_lengths = chunk_lengths[1:]
                if len(word) == chunk_lengths[0]:
                    # the leftover piece exactly fills the next chunk; close
                    # out this line so the next word starts a fresh one
                    # (otherwise the next word would be cut at offset 0,
                    # leaving an empty string in the output)
                    reprocessed_lines.append([word])
                    chunk_lengths = chunk_lengths[1:]
                    new_line = []
                    current_length = 0
                else:
                    new_line = [word]
                    current_length = len(word)
        # skip the empty leftover when the line ended exactly at a chunk boundary
        if new_line:
            reprocessed_lines.append(new_line)
    return reprocessed_lines

def convert_processed_lines(processed_lines):
    """
    Convert a list of sentences into documents suitable for the output methods in this module.

    Input: a list of lines, including space words
    Output: a list of documents, where each document contains a single
            paragraph, which is a list of sentences.
            Each sentence is a list of words: (text, space_follows)
            Space words are eliminated.
    """
    documents = []
    sentences = []
    for words in processed_lines:
        # turn the words into a sentence
        if len(words) > 1 and " " == words[0]:
            words = words[1:]
        elif len(words) == 1 and " " == words[0]:
            words = []

        sentence = []
        for word in words:
            word = word.strip()
            if not word:
                if len(sentence) == 0:
                    raise ValueError("Unexpected space at the start of a sentence")
                # a space word means the previous word is followed by whitespace
                sentence[-1] = (sentence[-1][0], True)
            else:
                sentence.append((word, False))
        # blank lines are very rare in BEST, but why not treat them as a
        # break: each one starts a new single-paragraph document
        if len(sentence) == 0:
            documents.append([sentences])
            sentences = []
            continue
        sentence[-1] = (sentence[-1][0], True)
        sentences.append(sentence)
    documents.append([sentences])
    return documents