From 6c146e285e18b82763ba3d47d584680c8fd68123 Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Tue, 6 Sep 2022 22:23:05 -0700
Subject: A script which converts Sindhi tokenization from Isra University

Can also be applied to other similar datasets

Read sentences & use the tokenization module to align the tokens with the original text

Randomly split the sentences

Write out the sentences and prepare their labels
---
 .../datasets/tokenization/convert_text_files.py | 118 +++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 stanza/utils/datasets/tokenization/convert_text_files.py

diff --git a/stanza/utils/datasets/tokenization/convert_text_files.py b/stanza/utils/datasets/tokenization/convert_text_files.py
new file mode 100644
index 00000000..64c130c8
--- /dev/null
+++ b/stanza/utils/datasets/tokenization/convert_text_files.py
@@ -0,0 +1,118 @@
+"""
+Given a text file and a file with one token per line, convert the text into conllu tokenization data
+"""
+
+import argparse
+import random
+
+from stanza.models.tokenization.utils import match_tokens_with_text
+import stanza.utils.datasets.common as common
+
+
+def process_raw_file(text_file, token_file):
+    """
+    Process a text file separated into a list of tokens using match_tokens_with_text from the tokenizer
+
+    The tokens are one per line in the token_file.
+    The tokens in the token_file must add up to the text_file modulo whitespace.
+    Sentence breaks should be represented by blank lines between sentences.
+
+    The return format is a list of lists of conllu lines representing the sentences.
+    The only fields set will be the token index, the token text, and possibly SpaceAfter=No,
+    where SpaceAfter=No means the next token started with no whitespace in the text file.
+    """
+    with open(text_file, encoding="utf-8") as fin:
+        text = fin.read()
+
+    sentences = []
+    current_sentence = []
+    with open(token_file, encoding="utf-8") as fin:
+        for line in fin:
+            line = line.strip()
+            if not line:
+                if current_sentence:
+                    sentences.append(current_sentence)
+                    current_sentence = []
+            else:
+                current_sentence.append(line)
+    if current_sentence:
+        sentences.append(current_sentence)
+
+    doc = match_tokens_with_text(sentences, text)
+
+    sentences = []
+    for sent_idx, sentence in enumerate(doc.sentences):
+        tokens = []
+        tokens.append("# sent_id = %d" % (sent_idx+1))
+        tokens.append("# text = %s" % text[sentence.tokens[0].start_char:sentence.tokens[-1].end_char].replace("\n", " "))
+        for token_idx, token in enumerate(sentence.tokens):
+            # SpaceAfter=No if the next token starts exactly where this one ends
+            if token_idx == len(sentence.tokens) - 1 and sent_idx == len(doc.sentences) - 1:
+                space_after = True
+            elif token_idx == len(sentence.tokens) - 1:
+                space_after = not token.end_char == doc.sentences[sent_idx+1].tokens[0].start_char
+            else:
+                space_after = not token.end_char == sentence.tokens[token_idx+1].start_char
+            token = [str(token_idx+1), token.text] + ["_"] * 7 + ["_" if space_after else "SpaceAfter=No"]
+            assert len(token) == 10, "Token length: %d" % len(token)
+            token = "\t".join(token)
+            tokens.append(token)
+        sentences.append(tokens)
+    return sentences
+
+def extract_sentences(text_files, token_files):
+    sentences = []
+    for text_file, token_file in zip(text_files, token_files):
+        sentences.extend(process_raw_file(text_file, token_file))
+    return sentences
+
+def split_sentences(sentences, train_split=0.8, dev_split=0.1):
+    """
+    Splits randomly without shuffling
+    """
+    generator = random.Random(1234)
+
+    train = []
+    dev = []
+    test = []
+    for sentence in sentences:
+        r = generator.random()
+        if r < train_split:
+            train.append(sentence)
+        elif r < train_split + dev_split:
+            dev.append(sentence)
+        else:
+            test.append(sentence)
+    return (train, dev, test)
+
+SHARDS = ("train", "dev", "test")
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--text_files', type=str, default="extern_data/sindhi/tokenization/FolkStory1.txt,extern_data/sindhi/tokenization/String1.txt", help="Where to find the text files")
+    parser.add_argument('--token_files', type=str, default="extern_data/sindhi/tokenization/tkns_FolkStory1.txt,extern_data/sindhi/tokenization/tkns_String1.txt", help="Where to find the token files")
+    parser.add_argument('--output_path', type=str, default="data/tokenize", help="Where to output the results")
+    parser.add_argument('--dataset', type=str, default="sd_isra", help="What name to give this dataset")
+    args = parser.parse_args()
+
+    text_files = args.text_files.split(",")
+    token_files = args.token_files.split(",")
+
+    tokenizer_dir = args.output_path
+    short_name = args.dataset    # todo: convert a full name?
+
+    if len(text_files) != len(token_files):
+        raise ValueError("Expected same number of text and token files")
+
+    sentences = extract_sentences(text_files, token_files)
+    splits = split_sentences(sentences)
+
+    for dataset, shard in zip(splits, SHARDS):
+        output_conllu = common.tokenizer_conllu_name(tokenizer_dir, short_name, shard)
+        common.write_sentences_to_conllu(output_conllu, dataset)
+
+    common.convert_conllu_to_txt(tokenizer_dir, short_name)
+    common.prepare_tokenizer_treebank_labels(tokenizer_dir, short_name)
+
+if __name__ == '__main__':
+    main()
--
cgit v1.2.3
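
For reference, here is a rough sketch (not part of the patch) of how the new process_raw_file helper behaves on a tiny hand-made input. The sample text, token list, and file names below are invented for illustration; the real inputs are the Isra University files named in the argument defaults, and main() handles the full pipeline of splitting and writing the conllu files.

# Illustrative sketch only: the sample files and sample text are hypothetical.
from stanza.utils.datasets.tokenization.convert_text_files import process_raw_file

# Raw text file: two sentences with ordinary whitespace.
with open("sample_text.txt", "w", encoding="utf-8") as fout:
    fout.write("Good morning. How are you?\n")

# Token file: one token per line, blank line between sentences.  The tokens
# must reproduce the raw text once whitespace is ignored.
with open("sample_tokens.txt", "w", encoding="utf-8") as fout:
    fout.write("\n".join(["Good", "morning", ".", "", "How", "are", "you", "?"]) + "\n")

# Each sentence comes back as a list of conllu lines; "morning" and "you"
# should be marked SpaceAfter=No, since "." and "?" follow with no space.
for sentence in process_raw_file("sample_text.txt", "sample_tokens.txt"):
    print("\n".join(sentence))
    print()

Running main() with the default arguments does the same alignment on the Sindhi data, splits the sentences roughly 80/10/10 with a fixed random seed, and writes the train/dev/test conllu files plus the tokenizer labels.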