Diffstat (limited to 'stanza/models/tokenization/data.py')
-rw-r--r-- | stanza/models/tokenization/data.py | 63
1 file changed, 55 insertions(+), 8 deletions(-)
diff --git a/stanza/models/tokenization/data.py b/stanza/models/tokenization/data.py
index ce059b6a..9039f818 100644
--- a/stanza/models/tokenization/data.py
+++ b/stanza/models/tokenization/data.py
@@ -1,14 +1,11 @@
 from bisect import bisect_right
 from copy import copy
-import json
 import numpy as np
 import random
 import logging
 import re
 import torch
-
 from .vocab import Vocab
-
 logger = logging.getLogger('stanza')
 
 def filter_consecutive_whitespaces(para):
@@ -26,11 +23,11 @@
 NEWLINE_WHITESPACE_RE = re.compile(r'\n\s*\n')
 NUMERIC_RE = re.compile(r'^([\d]+[,\.]*)+$')
 WHITESPACE_RE = re.compile(r'\s')
-
 class DataLoader:
-    def __init__(self, args, input_files={'txt': None, 'label': None}, input_text=None, input_data=None, vocab=None, evaluation=False):
+    def __init__(self, args, input_files={'txt': None, 'label': None}, input_text=None, input_data=None, vocab=None, evaluation=False, dictionary=None):
         self.args = args
         self.eval = evaluation
+        self.dictionary = dictionary
 
         # get input files
         txt_file = input_files['txt']
@@ -107,8 +104,6 @@ class DataLoader:
                 func = lambda x: 1 if x.startswith(' ') else 0
             elif feat_func == 'capitalized':
                 func = lambda x: 1 if x[0].isupper() else 0
-            elif feat_func == 'all_caps':
-                func = lambda x: 1 if x.isupper() else 0
             elif feat_func == 'numeric':
                 func = lambda x: 1 if (NUMERIC_RE.match(x) is not None) else 0
             else:
@@ -119,6 +114,40 @@ class DataLoader:
         # stacking all featurize functions
         composite_func = lambda x: [f(x) for f in funcs]
 
+        length = len(para)
+        # extract dictionary features for each character
+        def extract_dict_feat(idx):
+            dict_forward_feats = [0 for i in range(self.args['num_dict_feat'])]
+            dict_backward_feats = [0 for i in range(self.args['num_dict_feat'])]
+            forward_word = para[idx][0]
+            backward_word = para[idx][0]
+            prefix = True
+            suffix = True
+            for window in range(1, self.args['num_dict_feat']+1):
+                # grow the forward word one character at a time and check it against the dictionary;
+                # proceed only while idx+window is in bounds and a prefix match is still possible
+                if (idx + window) <= length-1 and prefix:
+                    forward_word += para[idx+window][0].lower()
+                    # look up the accumulated string in the dictionary word list
+                    feat = 1 if forward_word in self.dictionary["words"] else 0
+                    # record the lookup result for this window size
+                    dict_forward_feats[window-1] = feat
+                    # if no dictionary word starts with this string, stop looking forward
+                    if forward_word not in self.dictionary["prefixes"]:
+                        prefix = False
+                # backward check: symmetric to the forward pass, accumulating characters to the left
+                if (idx - window) >= 0 and suffix:
+                    backward_word = para[idx-window][0].lower() + backward_word
+                    feat = 1 if backward_word in self.dictionary["words"] else 0
+                    dict_backward_feats[window-1] = feat
+                    if backward_word not in self.dictionary["suffixes"]:
+                        suffix = False
+                # if neither a prefix nor a suffix can be extended, exit the loop early
+                if not prefix and not suffix:
+                    break
+
+            return dict_forward_feats + dict_backward_feats
+
         def process_sentence(sent):
             return [self.vocab.unit2id(y[0]) for y in sent], [y[1] for y in sent], [y[2] for y in sent], [y[0] for y in sent]
 
@@ -135,6 +164,12 @@ class DataLoader:
                 if use_start_of_para:
                     f = 1 if i == 0 else 0
                     feats.append(f)
+
+                # if the dictionary feature is enabled, append the dictionary features
+                if self.args['use_dictionary']:
+                    dict_feats = extract_dict_feat(i)
+                    feats = feats + dict_feats
+
                 current += [(unit, label, feats)]
                 if label1 == 2 or label1 == 4: # end of sentence
                     if len(current) <= self.args['max_seqlen']:
@@ -156,7 +191,7 @@
             random.shuffle(para)
         self.init_sent_ids()
 
-    def next(self, eval_offsets=None, unit_dropout=0.0, old_batch=None):
+    def next(self, eval_offsets=None, unit_dropout=0.0, old_batch=None, feat_unit_dropout=0.0):
         ''' Get a batch of converted and padded PyTorch data from preprocessed raw text for training/prediction. '''
         feat_size = len(self.sentences[0][0][2][0])
         unkid = self.vocab.unit2id('<UNK>')
@@ -277,6 +312,18 @@ class DataLoader:
                     if mask[i, j]:
                         raw_units[i][j] = '<UNK>'
 
+        # drop out entire unit feature vectors, in addition to the torch.dropout applied in the model.
+        # experiments showed that torch.dropout alone hurts the model; we believe this is
+        # because the dictionary feature vector is mostly sparse, so it makes more sense to
+        # drop out the whole vector rather than individual elements.
+        if self.args['use_dictionary'] and feat_unit_dropout > 0 and not self.eval:
+            mask_feat = np.random.random_sample(units.shape) < feat_unit_dropout
+            mask_feat[units == padid] = 0
+            for i in range(len(raw_units)):
+                for j in range(len(raw_units[i])):
+                    if mask_feat[i, j]:
+                        features[i, j, :] = 0
+
         units = torch.from_numpy(units)
         labels = torch.from_numpy(labels)
         features = torch.from_numpy(features)
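For readers who want to experiment with the dictionary lookup outside of Stanza, the sketch below mirrors the logic of extract_dict_feat over a plain string. It is a minimal illustration, not the library's API: the dictionary structure (a dict with "words", "prefixes", and "suffixes" sets) is inferred from the lookups in the diff, and dict_features, text, and num_dict_feat are hypothetical names.

# Minimal sketch of the dictionary feature lookup, outside the DataLoader.
# Assumptions (not guaranteed by the diff): the dictionary is a plain dict of
# three sets, and the input is a string rather than (unit, label) pairs.
def dict_features(text, idx, dictionary, num_dict_feat=5):
    forward_feats = [0] * num_dict_feat
    backward_feats = [0] * num_dict_feat
    forward_word = text[idx]
    backward_word = text[idx]
    prefix = suffix = True
    for window in range(1, num_dict_feat + 1):
        # extend to the right while some dictionary word still starts this way
        if idx + window < len(text) and prefix:
            forward_word += text[idx + window].lower()
            forward_feats[window - 1] = 1 if forward_word in dictionary["words"] else 0
            if forward_word not in dictionary["prefixes"]:
                prefix = False
        # extend to the left while some dictionary word still ends this way
        if idx - window >= 0 and suffix:
            backward_word = text[idx - window].lower() + backward_word
            backward_feats[window - 1] = 1 if backward_word in dictionary["words"] else 0
            if backward_word not in dictionary["suffixes"]:
                suffix = False
        if not prefix and not suffix:
            break
    return forward_feats + backward_feats

toy = {"words": {"ab", "abc"},
       "prefixes": {"a", "ab"},
       "suffixes": {"b", "c", "bc"}}
print(dict_features("abcd", 0, toy, num_dict_feat=3))   # [1, 1, 0, 0, 0, 0]

With the toy dictionary, the character at index 0 yields forward features [1, 1, 0] (for "ab", "abc", and the failed "abc"+"d" extension) and all-zero backward features, so the concatenated vector has length 2 * num_dict_feat.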
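The feature-dropout hunk zeroes whole per-character feature vectors rather than individual elements. Below is a hedged, self-contained illustration of the same masking with toy arrays shaped like units (batch, seq_len) and features (batch, seq_len, feat_size); treating 0 as the pad id is an assumption of this sketch.

import numpy as np

# toy stand-ins for the arrays built earlier in next()
padid = 0                                   # assumption: 0 is the pad id
units = np.array([[3, 7, 0], [5, 2, 9]])    # last unit of row 0 is padding
features = np.ones((2, 3, 4), dtype=np.float32)
feat_unit_dropout = 0.5

mask_feat = np.random.random_sample(units.shape) < feat_unit_dropout
mask_feat[units == padid] = 0               # never drop padded positions
features[mask_feat] = 0                     # zero out entire feature vectors

The boolean-index assignment features[mask_feat] = 0 zeroes the same vectors as the explicit double loop in the diff; the loop form there presumably exists because it iterates over raw_units, which is a ragged Python list rather than a rectangular array.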