diff options
Diffstat (limited to 'stanza/utils/datasets/tokenization/convert_th_best.py')
-rw-r--r-- | stanza/utils/datasets/tokenization/convert_th_best.py | 168 |
1 files changed, 168 insertions, 0 deletions
diff --git a/stanza/utils/datasets/tokenization/convert_th_best.py b/stanza/utils/datasets/tokenization/convert_th_best.py new file mode 100644 index 00000000..778f2dac --- /dev/null +++ b/stanza/utils/datasets/tokenization/convert_th_best.py @@ -0,0 +1,168 @@ +"""Parses the BEST Thai dataset. + +That is to say, the dataset named BEST. We have not yet figured out +which segmentation standard we prefer. + +Note that the version of BEST we used actually had some strange +sentence splits according to a native Thai speaker. Not sure how to +fix that. Options include doing it automatically or finding some +knowledgable annotators to resplit it for us (or just not using BEST) + +This outputs the tokenization results in a conll format similar to +that of the UD treebanks, so we pretend to be a UD treebank for ease +of compatibility with the stanza tools. + +BEST can be downloaded from here: + +https://aiforthai.in.th/corpus.php + +python3 -m stanza.utils.datasets.tokenization.process_best extern_data/thai/best data/tokenize +./scripts/run_tokenize.sh UD_Thai-best --dropout 0.05 --unit_dropout 0.05 --steps 50000 +""" +import glob +import os +import random +import re +import sys + +from pythainlp import sent_tokenize + +from stanza.utils.datasets.tokenization.process_thai_tokenization import reprocess_lines, write_dataset, convert_processed_lines, write_dataset_best, write_dataset + +def clean_line(line): + line = line.replace("html>", "html|>") + # news_00089.txt + line = line.replace("<NER>", "<NE>") + line = line.replace("</NER>", "</NE>") + # specific error that occurs in encyclopedia_00095.txt + line = line.replace("</AB>Penn", "</AB>|Penn>") + # news_00058.txt + line = line.replace("<AB>จม.</AB>เปิดผนึก", "<AB>จม.</AB>|เปิดผนึก") + # news_00015.txt + line = re.sub("<NE><AB>([^|<>]+)</AB>([^|<>]+)</NE>", "\\1|\\2", line) + # news_00024.txt + line = re.sub("<NE><AB>([^|<>]+)</AB></NE>", "\\1", line) + # news_00055.txt + line = re.sub("<NE>([^|<>]+)<AB>([^|<>]+)</AB></NE>", "\\1|\\2", line) + line = re.sub("<NE><AB>([^|<>]+)</AB><AB>([^|<>]+)</AB></NE>", "\\1|\\2", line) + line = re.sub("<NE>([^|<>]+)<AB>([^|<>]+)</AB> <AB>([^|<>]+)</AB></NE>", "\\1|\\2|\\3", line) + # news_00008.txt and other news articles + line = re.sub("</AB>([0-9])", "</AB>|\\1", line) + line = line.replace("</AB> ", "</AB>|") + line = line.replace("<EM>", "<POEM>") + line = line.replace("</EM>", "</POEM>") + line = line.strip() + return line + + +def clean_word(word): + # novel_00078.txt + if word == '<NEพี่มน</NE>': + return 'พี่มน' + if word.startswith("<NE>") and word.endswith("</NE>"): + return word[4:-5] + if word.startswith("<AB>") and word.endswith("</AB>"): + return word[4:-5] + if word.startswith("<POEM>") and word.endswith("</POEM>"): + return word[6:-7] + """ + if word.startswith("<EM>"): + return word[4:] + if word.endswith("</EM>"): + return word[:-5] + """ + if word.startswith("<NE>"): + return word[4:] + if word.endswith("</NE>"): + return word[:-5] + if word.startswith("<POEM>"): + return word[6:] + if word.endswith("</POEM>"): + return word[:-7] + if word == '<': + return word + return word + +def read_data(input_dir): + # data for test sets + test_files = [os.path.join(input_dir, 'TEST_100K_ANS.txt')] + print(test_files) + + # data for train and dev sets + subdirs = [os.path.join(input_dir, 'article'), + os.path.join(input_dir, 'encyclopedia'), + os.path.join(input_dir, 'news'), + os.path.join(input_dir, 'novel')] + files = [] + for subdir in subdirs: + if not os.path.exists(subdir): + raise FileNotFoundError("Expected a directory that did not exist: {}".format(subdir)) + files.extend(glob.glob(os.path.join(subdir, '*.txt'))) + + test_documents = [] + for filename in test_files: + print("File name:", filename) + with open(filename) as fin: + processed_lines = [] + for line in fin.readlines(): + line = clean_line(line) + words = line.split("|") + words = [clean_word(x) for x in words] + for word in words: + if len(word) > 1 and word[0] == '<': + raise ValueError("Unexpected word '{}' in document {}".format(word, filename)) + words = [x for x in words if x] + processed_lines.append(words) + + processed_lines = reprocess_lines(processed_lines) + paragraphs = convert_processed_lines(processed_lines) + + test_documents.extend(paragraphs) + print("Test document finished.") + + documents = [] + + for filename in files: + with open(filename) as fin: + print("File:", filename) + processed_lines = [] + for line in fin.readlines(): + line = clean_line(line) + words = line.split("|") + words = [clean_word(x) for x in words] + for word in words: + if len(word) > 1 and word[0] == '<': + raise ValueError("Unexpected word '{}' in document {}".format(word, filename)) + words = [x for x in words if x] + processed_lines.append(words) + + processed_lines = reprocess_lines(processed_lines) + paragraphs = convert_processed_lines(processed_lines) + + documents.extend(paragraphs) + + print("All documents finished.") + + return documents, test_documents + + +def main(*args): + random.seed(1000) + if not args: + args = sys.argv[1:] + + input_dir = args[0] + full_input_dir = os.path.join(input_dir, "thai", "best") + if os.path.exists(full_input_dir): + # otherwise hopefully the user gave us the full path? + input_dir = full_input_dir + + output_dir = args[1] + documents, test_documents = read_data(input_dir) + print("Finished reading data.") + write_dataset_best(documents, test_documents, output_dir, "best") + + +if __name__ == '__main__': + main() + |