Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'stanza/utils/datasets/ner/split_wikiner.py')
-rw-r--r--stanza/utils/datasets/ner/split_wikiner.py80
1 files changed, 80 insertions, 0 deletions
diff --git a/stanza/utils/datasets/ner/split_wikiner.py b/stanza/utils/datasets/ner/split_wikiner.py
new file mode 100644
index 00000000..8c4b3d3d
--- /dev/null
+++ b/stanza/utils/datasets/ner/split_wikiner.py
@@ -0,0 +1,80 @@
+"""
+Preprocess the WikiNER dataset, by
+1) normalizing tags;
+2) splitting into train (70%), dev (15%), test (15%) datasets.
+"""
+
+import os
+import random
+from collections import Counter
+random.seed(1234)
+
def read_sentences(filename, encoding):
    """Read a two-column (word, tag) file into a list of sentences.

    Sentences are separated by blank lines.  Any sentence containing a
    line that does not split into exactly two whitespace-separated
    fields is skipped entirely; the number of skipped sentences is
    reported on stdout.

    Args:
        filename: path of the file to read.
        encoding: text encoding used to open the file.

    Returns:
        A list of sentences, each a list of [word, tag] pairs.
    """
    sents = []
    cache = []
    skipped = 0
    skip = False

    def flush():
        # Commit the sentence accumulated in `cache`, or count it as
        # skipped if it contained a malformed line.
        nonlocal skipped
        if cache:
            if skip:
                skipped += 1
            else:
                sents.append(cache)

    with open(filename, encoding=encoding) as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                # Blank line ends the current sentence.
                flush()
                skip = False
                cache = []
                continue
            array = line.split()
            if len(array) != 2:
                # Malformed line: drop the whole sentence it belongs to.
                skip = True
                continue
            w, t = array
            cache.append([w, t])
        # Handle a final sentence with no trailing blank line.
        flush()
    print("Skipped {} examples due to formatting issues.".format(skipped))
    return sents
+
def write_sentences_to_file(sents, filename):
    """Write sentences (lists of [word, tag] pairs) to a file.

    Each token is written as "word<TAB>tag"; sentences are separated by
    a blank line.

    Args:
        sents: list of sentences, each a list of [word, tag] pairs.
        filename: path of the output file.
    """
    # The original format string was garbled to a literal "(unknown)";
    # interpolate the actual output filename.
    print(f"Writing {len(sents)} sentences to {filename}")
    # utf-8 matches the encoding used when reading; relying on the locale
    # default could fail on non-ASCII tokens.
    with open(filename, 'w', encoding='utf-8') as outfile:
        for sent in sents:
            for pair in sent:
                print(f"{pair[0]}\t{pair[1]}", file=outfile)
            print("", file=outfile)
+
def split_wikiner(directory, *in_filenames, encoding="utf-8", prefix=""):
    """Read one or more WikiNER-style files and split into train/dev/test.

    All sentences from `in_filenames` are pooled, shuffled (using the
    module-level seeded RNG, so the split is reproducible), and written
    to `directory` as train.bio (70%), dev.bio (15%), and test.bio
    (remainder), optionally prefixed with "<prefix>.".

    Args:
        directory: output directory for the .bio files.
        *in_filenames: one or more input files of "word tag" lines.
        encoding: text encoding of the input files.
        prefix: optional filename prefix, e.g. "fr" -> "fr.train.bio".
    """
    sents = []
    for filename in in_filenames:
        new_sents = read_sentences(filename, encoding)
        # The original format string was garbled to a literal "(unknown)";
        # interpolate the actual input filename.
        print(f"{len(new_sents)} sentences read from {filename}.")
        sents.extend(new_sents)

    # Split sizes: 70% train, 15% dev, remainder (~15%) test.
    num = len(sents)
    train_num = int(num*0.7)
    dev_num = int(num*0.15)

    random.shuffle(sents)
    train_sents = sents[:train_num]
    dev_sents = sents[train_num:train_num+dev_num]
    test_sents = sents[train_num+dev_num:]

    batches = [train_sents, dev_sents, test_sents]
    filenames = ['train.bio', 'dev.bio', 'test.bio']
    if prefix:
        filenames = ['%s.%s' % (prefix, f) for f in filenames]
    for batch, filename in zip(batches, filenames):
        write_sentences_to_file(batch, os.path.join(directory, filename))
+
if __name__ == "__main__":
    # Manual invocation: split the raw wp2 file into the current directory.
    split_wikiner(".", "raw/wp2.txt")