Diffstat (limited to 'stanza/utils/datasets/tokenization/convert_vi_vlsp.py')
-rw-r--r-- | stanza/utils/datasets/tokenization/convert_vi_vlsp.py | 153 |
1 file changed, 153 insertions(+), 0 deletions(-)
diff --git a/stanza/utils/datasets/tokenization/convert_vi_vlsp.py b/stanza/utils/datasets/tokenization/convert_vi_vlsp.py
new file mode 100644
index 00000000..947fe17f
--- /dev/null
+++ b/stanza/utils/datasets/tokenization/convert_vi_vlsp.py
@@ -0,0 +1,153 @@

import os

punctuation_set = (',', '.', '!', '?', ')', ':', ';', '”', '…', '...')

def find_spaces(sentence):
    # TODO: there are some sentences where there is only one quote,
    # and some of them should be attached to the previous word instead
    # of the next word. Training should work this way, though
    odd_quotes = False

    spaces = []
    for word_idx, word in enumerate(sentence):
        space = True
        # Quote period at the end of a sentence needs to be attached
        # to the rest of the text. Some sentences have `"... text`
        # in the middle, though, so look for that
        if word_idx < len(sentence) - 2 and sentence[word_idx+1] == '"':
            if sentence[word_idx+2] == '.':
                space = False
            elif word_idx == len(sentence) - 3 and sentence[word_idx+2] == '...':
                space = False
        if word_idx < len(sentence) - 1:
            if sentence[word_idx+1] in (',', '.', '!', '?', ')', ':', ';', '”', '…', '...', '/', '%'):
                space = False
        if word in ('(', '“', '/'):
            space = False
        if word == '"':
            if odd_quotes:
                # already saw one quote. put this one at the end of the PREVIOUS word
                # note that we know there must be at least one word already
                odd_quotes = False
                spaces[word_idx-1] = False
            else:
                odd_quotes = True
                space = False
        spaces.append(space)
    return spaces
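# A quick illustration (not part of the original commit): for the tokenized
# sentence below, find_spaces marks which words keep a following space
#
#   find_spaces(['Anh', 'nói', ':', '"', 'Xin', 'chào', '!', '"'])
#   => [True, False, True, False, True, False, False, True]
#
# so the reassembled text reads: Anh nói: "Xin chào!"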
def add_vlsp_args(parser):
    parser.add_argument('--include_pos_data', action='store_true', default=False,
                        help='Also include the POS training dataset when building the tokenization training data. The POS data is expected to be in the same directory as the WS data, for example extern_dir/vietnamese/VLSP2013-POS-data')
    parser.add_argument('--vlsp_include_spaces', action='store_true', default=False,
                        help='When processing vi_vlsp tokenization, include all of the spaces. Otherwise, we try to turn the text back into standard text')

def write_file(vlsp_include_spaces, output_filename, sentences, shard):
    with open(output_filename, "w") as fout:
        check_headlines = False
        for sent_idx, sentence in enumerate(sentences):
            fout.write("# sent_id = %s.%d\n" % (shard, sent_idx))
            orig_text = " ".join(sentence)
            # if the previous sentence was a headline (no ending mark at the end),
            # make this sentence start a new paragraph
            if check_headlines:
                fout.write("# newpar id = %s.%d.1\n" % (shard, sent_idx))
                check_headlines = False
            if sentence[-1] not in punctuation_set:
                check_headlines = True

            if vlsp_include_spaces:
                fout.write("# text = %s\n" % orig_text)
            else:
                spaces = find_spaces(sentence)
                full_text = ""
                for word, space in zip(sentence, spaces):
                    # could be made more efficient, but shouldn't matter
                    full_text = full_text + word
                    if space:
                        full_text = full_text + " "
                fout.write("# text = %s\n" % full_text)
                fout.write("# orig_text = %s\n" % orig_text)
            for word_idx, word in enumerate(sentence):
                # attach a fake dependency structure: each word depends on the
                # previous one, with the first word as root
                fake_dep = "root" if word_idx == 0 else "dep"
                fout.write("%d\t%s\t%s" % ((word_idx+1), word, word))
                fout.write("\t_\t_\t_")
                fout.write("\t%d\t%s" % (word_idx, fake_dep))
                fout.write("\t_\t")
                if vlsp_include_spaces or spaces[word_idx]:
                    fout.write("_")
                else:
                    fout.write("SpaceAfter=No")
                fout.write("\n")
            fout.write("\n")

def convert_pos_dataset(file_path):
    """
    Read the POS dataset, returning the deduplicated sentences as lists of words
    """
    with open(file_path) as fin:
        document = fin.readlines()
    sentences = []
    sent = []
    for line in document:
        if line == "\n":
            # sentences of one word or less are dropped
            if len(sent) > 1 and sent not in sentences:
                sentences.append(sent)
            sent = []
        else:
            sent.append(line.split("\t")[0].replace("_", " ").strip())
    return sentences
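# Illustration of the expected input (hypothetical lines, since no sample of
# the VLSP column format is shown here): the first tab-separated column holds
# the word, with underscores joining its syllables, and a blank line ends a
# sentence
#
#   Học_sinh<TAB>N
#   đến<TAB>V
#   trường<TAB>N
#   .<TAB>CH
#
# for which convert_pos_dataset returns [['Học sinh', 'đến', 'trường', '.']]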
"VLSP2013_POS_train_BI_POS_Column.txt.goldSeg") + if not os.path.exists(input_train_filename): + raise FileNotFoundError("Cannot find train set for VLSP at %s" % input_train_filename) + if not os.path.exists(input_test_filename): + raise FileNotFoundError("Cannot find test set for VLSP at %s" % input_test_filename) + pos_data = None + if args.include_pos_data: + if not os.path.exists(input_pos_filename): + raise FileNotFoundError("Cannot find pos dataset for VLSP at %" % input_pos_filename) + else: + pos_data = convert_pos_dataset(input_pos_filename) + + output_train_filename = os.path.join(tokenizer_dir, "vi_vlsp.train.gold.conllu") + output_dev_filename = os.path.join(tokenizer_dir, "vi_vlsp.dev.gold.conllu") + output_test_filename = os.path.join(tokenizer_dir, "vi_vlsp.test.gold.conllu") + + convert_file(args.vlsp_include_spaces, input_train_filename, output_train_filename, "train", output_dev_filename, "dev", pos_data) + convert_file(args.vlsp_include_spaces, input_test_filename, output_test_filename, "test") + |