author | ZJaume <jzaragoza@prompsit.com> | 2022-07-28 17:34:37 +0300
committer | ZJaume <jzaragoza@prompsit.com> | 2022-07-28 17:34:37 +0300
commit | 6f8aa847538ad11a21d4ae021c9f8e2c1f171137 (patch)
tree | 693d8e4a0d4c9d3d35d07d5ace5b9e9d0cc9f45a
parent | c03a6d9ba086899813e26d670978f2ab47a9f463 (diff)
Append noise tag to generated data
-rw-r--r-- | bicleaner_ai/datagen.py | 9 |
-rw-r--r-- | bicleaner_ai/training.py | 30 |
2 files changed, 23 insertions, 16 deletions
```diff
diff --git a/bicleaner_ai/datagen.py b/bicleaner_ai/datagen.py
index a9a0d81..1b94469 100644
--- a/bicleaner_ai/datagen.py
+++ b/bicleaner_ai/datagen.py
@@ -2,6 +2,7 @@ from tensorflow.keras.preprocessing.sequence import pad_sequences
 import sentencepiece as sp
 import tensorflow as tf
 import numpy as np
+import logging
 
 class SentenceEncoder(object):
     '''
@@ -114,7 +115,13 @@ class SentenceGenerator(tf.keras.utils.Sequence):
 
         # Build array of sample weights
         if len(data) >= 4 and data[3]:
-            self.weights = np.array(data[3], dtype=float)
+            try:
+                float(data[3][0])
+            except ValueError:
+                logging.debug("No float detected at 4th field of the data, "
+                              "ignoring data weights")
+            else:
+                self.weights = np.array(data[3], dtype=float)
 
         # Index samples
         self.num_samples = len(data[0])
diff --git a/bicleaner_ai/training.py b/bicleaner_ai/training.py
index 36d0543..d6f428b 100644
--- a/bicleaner_ai/training.py
+++ b/bicleaner_ai/training.py
@@ -73,7 +73,7 @@ def sentence_noise(i, src, trg, args):
 
     # Positive samples
     for j in range(args.pos_ratio):
-        sts.append(src_strip + "\t" + trg_strip+ "\t1")
+        sts.append(src_strip + "\t" + trg_strip+ "\t1\tpos")
 
     # Apply noise
     # Every noise has 50% chance of doing it in target or source
@@ -83,20 +83,20 @@ def sentence_noise(i, src, trg, args):
     # Random misalignment
     for j, k in zip(range(args.rand_ratio), rand_mask(args.rand_ratio)):
         if k:
-            sts.append(src[random.randrange(1,size)].strip() + "\t" + trg_strip + "\t0")
+            sts.append(src[random.randrange(1,size)].strip() + "\t" + trg_strip + "\t0\trand")
         else:
-            sts.append(src_strip + "\t" + trg[random.randrange(1,size)].strip() + "\t0")
+            sts.append(src_strip + "\t" + trg[random.randrange(1,size)].strip() + "\t0\trand")
 
     # Frequence based noise
     for j, k in zip(range(args.freq_ratio), rand_mask(args.freq_ratio)):
         if k:
             replaced = freq_noise(src[i], src_tok, args.sl_word_freqs)
             if replaced is not None:
-                sts.append(replaced + "\t" + trg_strip + "\t0")
+                sts.append(replaced + "\t" + trg_strip + "\t0\tfreq")
         else:
             replaced = freq_noise(trg[i], trg_tok, args.tl_word_freqs)
             if replaced is not None:
-                sts.append(src_strip + "\t" + replaced + "\t0")
+                sts.append(src_strip + "\t" + replaced + "\t0\tfreq")
 
     # Randomly omit words
     for j, k in zip(range(args.womit_ratio), rand_mask(args.womit_ratio)):
@@ -104,12 +104,12 @@
             s_toks = src_tok.tokenize(src[i])
             omitted = omit_words(s_toks)
             if omitted != []:
-                sts.append(src_tok.detokenize(omitted) + "\t" + trg_strip + "\t0")
+                sts.append(src_tok.detokenize(omitted) + "\t" + trg_strip + "\t0\twomit")
         else:
             t_toks = trg_tok.tokenize(trg[i])
             omitted = omit_words(t_toks)
             if omitted != []:
-                sts.append(src_strip + "\t" + trg_tok.detokenize(omitted) + "\t0")
+                sts.append(src_strip + "\t" + trg_tok.detokenize(omitted) + "\t0\twomit")
 
     # Cut sentences, a.k.a. segmentation noise
     # randomly cut at end or begin
@@ -118,21 +118,21 @@
             s_toks = src_tok.tokenize(src[i])
             cut = cut_sent(s_toks, cut_begin=random.getrandbits(1))
             if cut is not None:
-                sts.append(src_tok.detokenize(cut) + "\t" + trg_strip + "\t0")
+                sts.append(src_tok.detokenize(cut) + "\t" + trg_strip + "\t0\tcut")
         else:
             t_toks = trg_tok.tokenize(trg[i])
             cut = cut_sent(t_toks, cut_begin=random.getrandbits(1))
             if cut is not None:
-                sts.append(src_strip + "\t" + trg_tok.detokenize(cut) + "\t0")
+                sts.append(src_strip + "\t" + trg_tok.detokenize(cut) + "\t0\tcut")
 
     # Glued sentences
     for j, k in zip(range(args.glue_ratio), rand_mask(args.glue_ratio)):
         if k:
             glued = glue_sent(src_strip, src, src_tok)
-            sts.append(glued + "\t" + trg_strip + "\t0")
+            sts.append(glued + "\t" + trg_strip + "\t0\tglue")
         else:
             glued = glue_sent(trg_strip, trg, trg_tok)
-            sts.append(src_strip + "\t" + glued + "\t0")
+            sts.append(src_strip + "\t" + glued + "\t0\tglue")
 
     # Misalginment by fuzzy matching
     if args.fuzzy_ratio > 0:
@@ -141,15 +141,15 @@
         for j, k in zip(range(args.fuzzy_ratio), rand_mask(args.fuzzy_ratio)):
             if k:
                 fuzzed = src[src_index[j]].strip()
-                sts.append(src_strip + "\t" + fuzzed + "\t0")
+                sts.append(src_strip + "\t" + fuzzed + "\t0\tfuzzy")
             else:
                 fuzzed = src[src_index[j]].strip()
-                sts.append(fuzzed + "\t" + trg_strip + "\t0")
+                sts.append(fuzzed + "\t" + trg_strip + "\t0\tfuzzy")
 
     # Misalgniment with neighbour sentences
     if args.neighbour_mix and i <size-2 and i > 1:
-        sts.append(src_strip + "\t" + trg[i+1].strip()+ "\t0")
-        sts.append(src_strip + "\t" + trg[i-1].strip()+ "\t0")
+        sts.append(src_strip + "\t" + trg[i+1].strip()+ "\t0\tneighbour")
+        sts.append(src_strip + "\t" + trg[i-1].strip()+ "\t0\tneighbour")
 
     return sts
```
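For context: after this commit, `sentence_noise()` emits four tab-separated fields per row (source, target, label, noise tag such as `pos`, `rand`, `freq`, `womit`, `cut`, `glue`, `fuzzy`, `neighbour`) instead of three. Since the fourth column was previously reserved for optional float sample weights, the patched `SentenceGenerator` now probes the first value of that column and only builds the weight array when it parses as a number. Below is a minimal sketch of that behaviour; the example sentence pairs and the `parse_columns` helper are made up for illustration, and only the `try`/`except` check mirrors the patched `datagen.py`.

```python
import logging

# Illustrative rows in the 4-column format produced by sentence_noise()
# after this commit: source \t target \t label \t noise tag.
rows = [
    "Hello world.\tHola mundo.\t1\tpos",        # positive sample
    "Hello world.\tNo tengo tiempo.\t0\trand",  # random misalignment
    "Hello world.\tHola.\t0\tcut",              # segmentation noise
]

def parse_columns(lines):
    """Split TSV lines into per-column lists (hypothetical helper,
    standing in for whatever feeds SentenceGenerator)."""
    return [list(col) for col in zip(*(line.split("\t") for line in lines))]

data = parse_columns(rows)

# Mirror of the new check in SentenceGenerator: the 4th field may now hold
# a noise tag instead of a float sample weight, so the weight array is only
# built when the field actually parses as a number.
weights = None
if len(data) >= 4 and data[3]:
    try:
        float(data[3][0])
    except ValueError:
        logging.debug("No float detected at 4th field of the data, "
                      "ignoring data weights")
    else:
        weights = [float(w) for w in data[3]]

print(weights)  # None here, because the 4th column holds tags like 'pos'/'rand'
```

With numeric values in the fourth column the same check would still populate the weights, so existing weighted training data keeps working unchanged.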