Add notes on silver words for the delta embedding

author: John Bauer <horatio@gmail.com> 2022-10-30 09:29:27 +0300
committer: John Bauer <horatio@gmail.com> 2022-10-30 09:20:37 +0300
commit: af9d8698412464e1e54ea1ca86cdf69e3a208427 (patch)
tree: ed636964bd4907128335300595680545d556f7fd
parent: b56c86ed8fabc7f5fdb288443d0a4fa01feb1ee5 (diff)
1 file changed, 3 insertions, 0 deletions
diff --git a/stanza/models/constituency/trainer.py b/stanza/models/constituency/trainer.py
index 1bd676e5..c655632b 100644
--- a/stanza/models/constituency/trainer.py
+++ b/stanza/models/constituency/trainer.py
@@ -423,6 +423,9 @@ def build_trainer(args, train_trees, dev_trees, silver_trees, foundation_cache,
     # expected there will be some UNK words
     words = parse_tree.Tree.get_unique_words(train_trees)
     rare_words = parse_tree.Tree.get_rare_words(train_trees, args['rare_word_threshold'])
+    # the silver words will just get UNK if they are not already known
+    # TODO: add words from the silver dataset?  perhaps just a fraction
+
     # also, it's not actually an error if there is a pattern of
     # compound unary or compound open nodes which doesn't exist in the
     # train set.  it just means we probably won't ever get that right
author	John Bauer <horatio@gmail.com>	2022-10-30 09:29:27 +0300
committer	John Bauer <horatio@gmail.com>	2022-10-30 09:20:37 +0300
commit	af9d8698412464e1e54ea1ca86cdf69e3a208427 (patch)
tree	ed636964bd4907128335300595680545d556f7fd
parent	b56c86ed8fabc7f5fdb288443d0a4fa01feb1ee5 (diff)