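"""Convert the VLSP 2013 Vietnamese word segmentation dataset to CoNLL-U files for tokenizer training.

Reads the VLSP2013-WS data (and optionally the VLSP2013-POS data) from
extern_dir and writes vi_vlsp.{train,dev,test}.gold.conllu files for the
tokenizer to tokenizer_dir.
"""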

import os

punctuation_set = (',', '.', '!', '?', ')', ':', ';', '”', '…', '...')

def find_spaces(sentence):
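    """Decide, for each word in sentence, whether a space should follow it.

    Returns a list of booleans, one per word.  For example, for the
    (illustrative) tokenized sentence ['"', 'Xin', 'chào', '"', '.'],
    the result is [False, True, False, False, True], which reconstructs
    the text '"Xin chào".'
    """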
    # TODO: there are some sentences where there is only one quote,
    # and some of them should be attached to the previous word instead
    # of the next word.  Training should work this way, though
    odd_quotes = False

    spaces = []
    for word_idx, word in enumerate(sentence):
        space = True
        # A quote and period at the end of a sentence need to be attached
        # to the rest of the text.  Some sentences have `"... text`
        # in the middle, though, so look for that
        if word_idx < len(sentence) - 2 and sentence[word_idx+1] == '"':
            if sentence[word_idx+2] == '.':
                space = False
            elif word_idx == len(sentence) - 3 and sentence[word_idx+2] == '...':
                space = False
        if word_idx < len(sentence) - 1:
            if sentence[word_idx+1] in punctuation_set + ('/', '%'):
                space = False
        if word in ('(', '“', '/'):
            space = False
        if word == '"':
            if odd_quotes:
                # already saw one quote.  put this one at the end of the PREVIOUS word
                # note that we know there must be at least one word already
                odd_quotes = False
                spaces[word_idx-1] = False
            else:
                odd_quotes = True
                space = False
        spaces.append(space)
    return spaces

def add_vlsp_args(parser):
    parser.add_argument('--include_pos_data', action='store_true', default=False, help='Whether to include the POS training dataset when building the tokenizer training data.  The POS dataset is expected to be in the same directory as the WS dataset, for example, extern_dir/vietnamese/VLSP2013-POS-data')
    parser.add_argument('--vlsp_include_spaces', action='store_true', default=False, help='When processing vi_vlsp tokenization, include all of the spaces.  Otherwise, we try to turn the text back into standard text')

def write_file(vlsp_include_spaces, output_filename, sentences, shard):
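    """Write sentences to output_filename as a fake dependency CoNLL-U file.

    Each word becomes a 10 column row: the word is used for both form and
    lemma, the first word is attached to a fake root and every later word
    to the previous word, and the MISC column carries SpaceAfter=No where
    find_spaces reports no space.  For example (columns tab separated):

      1   Xin   Xin   _   _   _   0   root   _   _
    """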
    with open(output_filename, "w") as fout:
        check_headlines = False
        for sent_idx, sentence in enumerate(sentences):
            fout.write("# sent_id = %s.%d\n" % (shard, sent_idx))
            orig_text = " ".join(sentence)
            # if the previous sentence was a headline (no ending punctuation),
            # start a new paragraph at this sentence
            if check_headlines:
                fout.write("# newpar id = %s.%d.1\n" % (shard, sent_idx))
                check_headlines = False
            if sentence[-1] not in punctuation_set:
                check_headlines = True

            if vlsp_include_spaces:
                fout.write("# text = %s\n" % orig_text)
            else:
                spaces = find_spaces(sentence)
                full_text = ""
                for word, space in zip(sentence, spaces):
                    # could be made more efficient, but shouldn't matter
                    full_text = full_text + word
                    if space:
                        full_text = full_text + " "
                fout.write("# text = %s\n" % full_text)
                fout.write("# orig_text = %s\n" % orig_text)
            for word_idx, word in enumerate(sentence):
                fake_dep = "root" if word_idx == 0 else "dep"
                fout.write("%d\t%s\t%s" % ((word_idx+1), word, word))
                fout.write("\t_\t_\t_")
                fout.write("\t%d\t%s" % (word_idx, fake_dep))
                fout.write("\t_\t")
                if vlsp_include_spaces or spaces[word_idx]:
                    fout.write("_")
                else:
                    fout.write("SpaceAfter=No")
                fout.write("\n")
            fout.write("\n")

def convert_pos_dataset(file_path):
    """Read the POS dataset and return its sentences.

    The file has one word per line, tab separated columns with the word in
    the first column (syllables joined by underscores), and blank lines
    between sentences.  Returns the unique sentences, each a list of words
    with the underscores turned back into spaces; sentences with a single
    word are skipped.
    """
    with open(file_path) as fin:
        document = fin.readlines()
    sentences = []
    sent = []
    for line in document:
        if line == "\n" and len(sent) > 1:
            if sent not in sentences:
                sentences.append(sent)
            sent = []
        elif line != "\n":
            sent.append(line.split("\t")[0].replace("_", " ").strip())
    return sentences

def convert_file(vlsp_include_spaces, input_filename, output_filename, shard, split_filename=None, split_shard=None, pos_data=None):
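    """Convert one VLSP word segmentation file to a CoNLL-U file.

    Each input line is one sentence, with the syllables of a word joined
    by underscores.  Duplicate sentences are dropped.  If split_filename
    is given, the last 5% of the sentences are held out as a dev split
    and written there, and any pos_data sentences which do not overlap
    the WS sentences are added to the main output.
    """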
    with open(input_filename) as fin:
        lines = fin.readlines()

    sentences = []
    set_sentences = set()
    for line in lines:
        # lines with only one syllable are eliminated
        if len(line.replace("_", " ").split()) > 1:
            words = [w.replace("_", " ") for w in line.split()]
            # only add sentences that haven't been added before
            if words not in sentences:
                sentences.append(words)
                set_sentences.add(' '.join(words))

    if split_filename is not None:
        # hold out the last 5% of the sentences as a dev set
        split_point = int(len(sentences) * 0.95)
        # only add POS sentences which don't overlap the VLSP WS sentences
        sentences_pos = [] if pos_data is None else [sent for sent in pos_data if ' '.join(sent) not in set_sentences]
        print("Added %d sentences from the POS dataset." % len(sentences_pos))
        write_file(vlsp_include_spaces, output_filename, sentences[:split_point] + sentences_pos, shard)
        write_file(vlsp_include_spaces, split_filename, sentences[split_point:], split_shard)
    else:
        write_file(vlsp_include_spaces, output_filename, sentences, shard)

def convert_vi_vlsp(extern_dir, tokenizer_dir, args):
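    """Convert the VLSP word segmentation data to tokenizer training files.

    Expects extern_dir/vietnamese/VLSP2013-WS-data (and, if
    args.include_pos_data is set, extern_dir/vietnamese/VLSP2013-POS-data)
    and writes vi_vlsp.{train,dev,test}.gold.conllu to tokenizer_dir.
    """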
    input_path = os.path.join(extern_dir, "vietnamese", "VLSP2013-WS-data")
    input_pos_path = os.path.join(extern_dir, "vietnamese", "VLSP2013-POS-data")
    input_train_filename = os.path.join(input_path, "VLSP2013_WS_train_gold.txt")
    input_test_filename = os.path.join(input_path, "VLSP2013_WS_test_gold.txt")
    
    input_pos_filename = os.path.join(input_pos_path, "VLSP2013_POS_train_BI_POS_Column.txt.goldSeg")
    if not os.path.exists(input_train_filename):
        raise FileNotFoundError("Cannot find train set for VLSP at %s" % input_train_filename)
    if not os.path.exists(input_test_filename):
        raise FileNotFoundError("Cannot find test set for VLSP at %s" % input_test_filename)
    pos_data = None
    if args.include_pos_data:
        if not os.path.exists(input_pos_filename):
            raise FileNotFoundError("Cannot find POS dataset for VLSP at %s" % input_pos_filename)
        pos_data = convert_pos_dataset(input_pos_filename)

    output_train_filename = os.path.join(tokenizer_dir, "vi_vlsp.train.gold.conllu")
    output_dev_filename = os.path.join(tokenizer_dir,   "vi_vlsp.dev.gold.conllu")
    output_test_filename = os.path.join(tokenizer_dir,  "vi_vlsp.test.gold.conllu")

    convert_file(args.vlsp_include_spaces, input_train_filename, output_train_filename, "train", output_dev_filename, "dev", pos_data)
    convert_file(args.vlsp_include_spaces, input_test_filename, output_test_filename, "test")
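
if __name__ == '__main__':
    # Minimal driver sketch, not part of the original module: in stanza this
    # conversion is normally run through the dataset preparation scripts.
    # The default paths "extern_data" and "data/tokenize" are assumptions
    # for illustration only.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--extern_dir', default='extern_data', help='Directory containing the vietnamese/VLSP2013-* datasets')
    parser.add_argument('--tokenizer_dir', default='data/tokenize', help='Directory to write the .conllu files to')
    add_vlsp_args(parser)
    args = parser.parse_args()
    convert_vi_vlsp(args.extern_dir, args.tokenizer_dir, args)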