diff options
author | John Bauer <horatio@gmail.com> | 2022-10-30 11:27:01 +0300 |
---|---|---|
committer | John Bauer <horatio@gmail.com> | 2022-11-01 04:14:11 +0300 |
commit | 7ca36963b51acfb8eed59ada4d57db43b98480a2 (patch) | |
tree | ea0d376fd9cd949af0e23b5d2c384a181ee44cd2 | |
parent | af5e7d3d3b4726d5503efb027b8efda6e8dbbd3e (diff) |
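Use some words from the silver dataset when building the vocabulary (currently as many common silver words as there are gold words, i.e. |gold| of them, are requested and merged in, even if some of them overlap with the gold words and so add nothing new)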
-rw-r--r-- | stanza/models/constituency/trainer.py | 7 | ||||
-rw-r--r-- | stanza/models/constituency_parser.py | 2 |
2 files changed, 7 insertions, 2 deletions
diff --git a/stanza/models/constituency/trainer.py b/stanza/models/constituency/trainer.py index 436d001b..1b4fea90 100644 --- a/stanza/models/constituency/trainer.py +++ b/stanza/models/constituency/trainer.py @@ -423,8 +423,11 @@ def build_trainer(args, train_trees, dev_trees, silver_trees, foundation_cache, # expected there will be some UNK words words = parse_tree.Tree.get_unique_words(train_trees) rare_words = parse_tree.Tree.get_rare_words(train_trees, args['rare_word_threshold']) - # the silver words will just get UNK if they are not already known - # TODO: add words from the silver dataset? perhaps just a fraction + # rare/unknown silver words will just get UNK if they are not already known + if silver_trees and args['use_silver_words']: + logger.info("Getting silver words to add to the delta embedding") + silver_words = parse_tree.Tree.get_common_words(tqdm(silver_trees, postfix='Silver words'), len(words)) + words = sorted(set(words + silver_words)) # also, it's not actually an error if there is a pattern of # compound unary or compound open nodes which doesn't exist in the diff --git a/stanza/models/constituency_parser.py b/stanza/models/constituency_parser.py index 1cfef05c..43a402bf 100644 --- a/stanza/models/constituency_parser.py +++ b/stanza/models/constituency_parser.py @@ -375,6 +375,8 @@ def parse_args(args=None): # leaky_relu was not an improvement - a full run on WSJ led to 0.9181 f1 instead of 0.919 parser.add_argument('--nonlinearity', default='relu', choices=NONLINEARITY.keys(), help='Nonlinearity to use in the model. 
relu is a noticeable improvement over tanh') + parser.add_argument('--use_silver_words', default=True, dest='use_silver_words', action='store_true', help="Use/don't use words from the silver dataset") + parser.add_argument('--no_use_silver_words', default=True, dest='use_silver_words', action='store_false', help="Use/don't use words from the silver dataset") parser.add_argument('--rare_word_unknown_frequency', default=0.02, type=float, help='How often to replace a rare word with UNK when training') parser.add_argument('--rare_word_threshold', default=0.02, type=float, help='How many words to consider as rare words as a fraction of the dataset') parser.add_argument('--tag_unknown_frequency', default=0.001, type=float, help='How often to replace a tag with UNK when training') |