github.com/stanfordnlp/stanza.git
Diffstat (limited to 'stanza/utils/datasets/tokenization/convert_th_best.py')
-rw-r--r--  stanza/utils/datasets/tokenization/convert_th_best.py  168
1 file changed, 168 insertions, 0 deletions
diff --git a/stanza/utils/datasets/tokenization/convert_th_best.py b/stanza/utils/datasets/tokenization/convert_th_best.py
new file mode 100644
index 00000000..778f2dac
--- /dev/null
+++ b/stanza/utils/datasets/tokenization/convert_th_best.py
@@ -0,0 +1,168 @@
+"""Parses the BEST Thai dataset.
+
+That is to say, the dataset named BEST. We have not yet figured out
+which segmentation standard we prefer.
+
+Note that the version of BEST we used had some strange sentence
+splits according to a native Thai speaker. We are not sure how to fix
+that. Options include resplitting it automatically or finding some
+knowledgeable annotators to resplit it for us (or simply not using BEST).
+
+This script outputs the tokenization results in a conll format similar
+to that of the UD treebanks, so we pretend to be a UD treebank for ease
+of compatibility with the stanza tools. A rough sketch of the output
+format is given in the comment after this docstring.
+
+BEST can be downloaded from here:
+
+https://aiforthai.in.th/corpus.php
+
+python3 -m stanza.utils.datasets.tokenization.convert_th_best extern_data/thai/best data/tokenize
+./scripts/run_tokenize.sh UD_Thai-best --dropout 0.05 --unit_dropout 0.05 --steps 50000
+"""
+import glob
+import os
+import random
+import re
+import sys
+
+from stanza.utils.datasets.tokenization.process_thai_tokenization import reprocess_lines, convert_processed_lines, write_dataset_best
+
+def clean_line(line):
+    """Fix up known annotation errors in a raw BEST line before it is split on '|'."""
+    line = line.replace("html>", "html|>")
+    # news_00089.txt
+    line = line.replace("<NER>", "<NE>")
+    line = line.replace("</NER>", "</NE>")
+    # specific error that occurs in encyclopedia_00095.txt
+    line = line.replace("</AB>Penn", "</AB>|Penn>")
+    # news_00058.txt
+    line = line.replace("<AB>จม.</AB>เปิดผนึก", "<AB>จม.</AB>|เปิดผนึก")
+    # news_00015.txt
+    line = re.sub("<NE><AB>([^|<>]+)</AB>([^|<>]+)</NE>", "\\1|\\2", line)
+    # news_00024.txt
+    line = re.sub("<NE><AB>([^|<>]+)</AB></NE>", "\\1", line)
+    # news_00055.txt
+    line = re.sub("<NE>([^|<>]+)<AB>([^|<>]+)</AB></NE>", "\\1|\\2", line)
+    line = re.sub("<NE><AB>([^|<>]+)</AB><AB>([^|<>]+)</AB></NE>", "\\1|\\2", line)
+    line = re.sub("<NE>([^|<>]+)<AB>([^|<>]+)</AB> <AB>([^|<>]+)</AB></NE>", "\\1|\\2|\\3", line)
+    # news_00008.txt and other news articles
+    line = re.sub("</AB>([0-9])", "</AB>|\\1", line)
+    line = line.replace("</AB> ", "</AB>|")
+    line = line.replace("<EM>", "<POEM>")
+    line = line.replace("</EM>", "</POEM>")
+    line = line.strip()
+    return line
+
+
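+# For reference, a raw BEST line separates words with '|' and wraps some
+# spans in markup.  A hypothetical example (not copied from the corpus)
+# based on the tags these cleaning functions handle:
+#
+#   <NE>สมชาย</NE>|ไป|<AB>กทม.</AB>|วันนี้
+#
+# After clean_line, splitting on '|', and clean_word, this becomes the
+# plain tokens: สมชาย, ไป, กทม., วันนี้
+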
+def clean_word(word):
+    """Strip <NE>, <AB>, and <POEM> markup from a single word."""
+    # novel_00078.txt has a malformed tag
+    if word == '<NEพี่มน</NE>':
+        return 'พี่มน'
+    if word.startswith("<NE>") and word.endswith("</NE>"):
+        return word[4:-5]
+    if word.startswith("<AB>") and word.endswith("</AB>"):
+        return word[4:-5]
+    if word.startswith("<POEM>") and word.endswith("</POEM>"):
+        return word[6:-7]
+    # dead code kept for reference: <EM> tags are already rewritten to
+    # <POEM> in clean_line, so they never reach this point
+    # if word.startswith("<EM>"):
+    #     return word[4:]
+    # if word.endswith("</EM>"):
+    #     return word[:-5]
+    if word.startswith("<NE>"):
+        return word[4:]
+    if word.endswith("</NE>"):
+        return word[:-5]
+    if word.startswith("<POEM>"):
+        return word[6:]
+    if word.endswith("</POEM>"):
+        return word[:-7]
+    # a bare '<' (and anything else) is returned unchanged
+    return word
+
+def read_file(filename):
+    """Read one annotated BEST file and return its paragraphs of tokenized sentences."""
+    print("File:", filename)
+    with open(filename) as fin:
+        processed_lines = []
+        for line in fin:
+            line = clean_line(line)
+            words = [clean_word(x) for x in line.split("|")]
+            for word in words:
+                if len(word) > 1 and word[0] == '<':
+                    raise ValueError("Unexpected word '{}' in document {}".format(word, filename))
+            words = [x for x in words if x]
+            processed_lines.append(words)
+
+    processed_lines = reprocess_lines(processed_lines)
+    return convert_processed_lines(processed_lines)
+
+
+def read_data(input_dir):
+    # data for the test set
+    test_files = [os.path.join(input_dir, 'TEST_100K_ANS.txt')]
+
+    # data for the train and dev sets
+    subdirs = [os.path.join(input_dir, 'article'),
+               os.path.join(input_dir, 'encyclopedia'),
+               os.path.join(input_dir, 'news'),
+               os.path.join(input_dir, 'novel')]
+    files = []
+    for subdir in subdirs:
+        if not os.path.exists(subdir):
+            raise FileNotFoundError("Expected directory does not exist: {}".format(subdir))
+        files.extend(glob.glob(os.path.join(subdir, '*.txt')))
+
+    test_documents = []
+    for filename in test_files:
+        test_documents.extend(read_file(filename))
+    print("Test documents finished.")
+
+    documents = []
+    for filename in files:
+        documents.extend(read_file(filename))
+    print("All documents finished.")
+
+    return documents, test_documents
+
+
+def main(*args):
+    random.seed(1000)
+    if not args:
+        args = sys.argv[1:]
+
+    input_dir = args[0]
+    full_input_dir = os.path.join(input_dir, "thai", "best")
+    if os.path.exists(full_input_dir):
+        # otherwise hopefully the user gave us the full path
+        input_dir = full_input_dir
+
+    output_dir = args[1]
+    documents, test_documents = read_data(input_dir)
+    print("Finished reading data.")
+    write_dataset_best(documents, test_documents, output_dir, "best")
+
+
+if __name__ == '__main__':
+    main()
+