Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/stanfordnlp/stanza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorJohn Bauer <horatio@gmail.com>2022-10-30 11:27:01 +0300
committerJohn Bauer <horatio@gmail.com>2022-11-01 04:14:11 +0300
commit7ca36963b51acfb8eed59ada4d57db43b98480a2 (patch)
treeea0d376fd9cd949af0e23b5d2c384a181ee44cd2
parentaf5e7d3d3b4726d5503efb027b8efda6e8dbbd3e (diff)
Use some words from the silver dataset (currently |gold| words are added, even if that means some overlaps)
-rw-r--r--stanza/models/constituency/trainer.py7
-rw-r--r--stanza/models/constituency_parser.py2
2 files changed, 7 insertions, 2 deletions
diff --git a/stanza/models/constituency/trainer.py b/stanza/models/constituency/trainer.py
index 436d001b..1b4fea90 100644
--- a/stanza/models/constituency/trainer.py
+++ b/stanza/models/constituency/trainer.py
@@ -423,8 +423,11 @@ def build_trainer(args, train_trees, dev_trees, silver_trees, foundation_cache,
# expected there will be some UNK words
words = parse_tree.Tree.get_unique_words(train_trees)
rare_words = parse_tree.Tree.get_rare_words(train_trees, args['rare_word_threshold'])
- # the silver words will just get UNK if they are not already known
- # TODO: add words from the silver dataset? perhaps just a fraction
+ # rare/unknown silver words will just get UNK if they are not already known
+ if silver_trees and args['use_silver_words']:
+ logger.info("Getting silver words to add to the delta embedding")
+ silver_words = parse_tree.Tree.get_common_words(tqdm(silver_trees, postfix='Silver words'), len(words))
+ words = sorted(set(words + silver_words))
# also, it's not actually an error if there is a pattern of
# compound unary or compound open nodes which doesn't exist in the
diff --git a/stanza/models/constituency_parser.py b/stanza/models/constituency_parser.py
index 1cfef05c..43a402bf 100644
--- a/stanza/models/constituency_parser.py
+++ b/stanza/models/constituency_parser.py
@@ -375,6 +375,8 @@ def parse_args(args=None):
# leaky_relu was not an improvement - a full run on WSJ led to 0.9181 f1 instead of 0.919
parser.add_argument('--nonlinearity', default='relu', choices=NONLINEARITY.keys(), help='Nonlinearity to use in the model. relu is a noticeable improvement over tanh')
+ parser.add_argument('--use_silver_words', default=True, dest='use_silver_words', action='store_true', help="Use/don't use words from the silver dataset")
+ parser.add_argument('--no_use_silver_words', default=True, dest='use_silver_words', action='store_false', help="Use/don't use words from the silver dataset")
parser.add_argument('--rare_word_unknown_frequency', default=0.02, type=float, help='How often to replace a rare word with UNK when training')
parser.add_argument('--rare_word_threshold', default=0.02, type=float, help='How many words to consider as rare words as a fraction of the dataset')
parser.add_argument('--tag_unknown_frequency', default=0.001, type=float, help='How often to replace a tag with UNK when training')