Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorJohn Bauer <horatio@gmail.com>2022-10-30 11:27:01 +0300
committerJohn Bauer <horatio@gmail.com>2022-11-01 04:14:11 +0300
commit7ca36963b51acfb8eed59ada4d57db43b98480a2 (patch)
treeea0d376fd9cd949af0e23b5d2c384a181ee44cd2
parentaf5e7d3d3b4726d5503efb027b8efda6e8dbbd3e (diff)
Use some words from the silver dataset (currently |gold| words are added, even if that means some overlaps)
-rw-r--r--stanza/models/constituency/trainer.py7
-rw-r--r--stanza/models/constituency_parser.py2
2 files changed, 7 insertions, 2 deletions
diff --git a/stanza/models/constituency/trainer.py b/stanza/models/constituency/trainer.py
index 436d001b..1b4fea90 100644
--- a/stanza/models/constituency/trainer.py
+++ b/stanza/models/constituency/trainer.py
@@ -423,8 +423,11 @@ def build_trainer(args, train_trees, dev_trees, silver_trees, foundation_cache,
# expected there will be some UNK words
words = parse_tree.Tree.get_unique_words(train_trees)
rare_words = parse_tree.Tree.get_rare_words(train_trees, args['rare_word_threshold'])
- # the silver words will just get UNK if they are not already known
- # TODO: add words from the silver dataset? perhaps just a fraction
+ # rare/unknown silver words will just get UNK if they are not already known
+ if silver_trees and args['use_silver_words']:
+ logger.info("Getting silver words to add to the delta embedding")
+ silver_words = parse_tree.Tree.get_common_words(tqdm(silver_trees, postfix='Silver words'), len(words))
+ words = sorted(set(words + silver_words))
# also, it's not actually an error if there is a pattern of
# compound unary or compound open nodes which doesn't exist in the
diff --git a/stanza/models/constituency_parser.py b/stanza/models/constituency_parser.py
index 1cfef05c..43a402bf 100644
--- a/stanza/models/constituency_parser.py
+++ b/stanza/models/constituency_parser.py
@@ -375,6 +375,8 @@ def parse_args(args=None):
# leaky_relu was not an improvement - a full run on WSJ led to 0.9181 f1 instead of 0.919
parser.add_argument('--nonlinearity', default='relu', choices=NONLINEARITY.keys(), help='Nonlinearity to use in the model. relu is a noticeable improvement over tanh')
+ parser.add_argument('--use_silver_words', default=True, dest='use_silver_words', action='store_true', help="Use/don't use words from the silver dataset")
+ parser.add_argument('--no_use_silver_words', default=True, dest='use_silver_words', action='store_false', help="Use/don't use words from the silver dataset")
parser.add_argument('--rare_word_unknown_frequency', default=0.02, type=float, help='How often to replace a rare word with UNK when training')
parser.add_argument('--rare_word_threshold', default=0.02, type=float, help='How many words to consider as rare words as a fraction of the dataset')
parser.add_argument('--tag_unknown_frequency', default=0.001, type=float, help='How often to replace a tag with UNK when training')