diff options
author | Ales Tamchyna <tamchyna@ufal.mff.cuni.cz> | 2013-11-25 20:36:49 +0400 |
---|---|---|
committer | Ales Tamchyna <tamchyna@ufal.mff.cuni.cz> | 2013-11-25 20:36:49 +0400 |
commit | 0d108d213f263b5b6442c94f9c4ca97b85050bf5 (patch) | |
tree | acd16a86e235ed61bb3abbf1117af47c88f5461b | |
parent | 0cdee467ca99160009715f1177fef8e0e2f2fbfc (diff) |
filter extract file based on dev/test set (damt_phrase)
-rwxr-xr-x | scripts/training/extract_words_dlm.py | 27 |
1 files changed, 22 insertions, 5 deletions
```diff
diff --git a/scripts/training/extract_words_dlm.py b/scripts/training/extract_words_dlm.py
index 6b122b092..0479bf41f 100755
--- a/scripts/training/extract_words_dlm.py
+++ b/scripts/training/extract_words_dlm.py
@@ -6,7 +6,7 @@ NAME
 
 SYNOPSIS
 
-    extract_words_dlm.py SOURCE TARGET ALIGN > OUT
+    extract_words_dlm.py SOURCE TARGET ALIGN [TESTSOURCE] > OUT
 
 DESCRIPTION
 
@@ -87,9 +87,18 @@ def cept_to_string(source, cept):
 def spans_to_string(spans):
     return " ".join("-".join(map(str, span)) for span in spans)
 
+def confirm_source(source, cept, onlywords):
+    if onlywords:
+        for index in cept:
+            if not source[index] in onlywords:
+                return False
+        return True
+    else:
+        return True
+
 SENTENCE_ID = 1
 
-def extract_from_sentence(source, target, align_pairs):
+def extract_from_sentence(source, target, align_pairs, onlywords):
     global SENTENCE_ID
 
     source_cept = collections.defaultdict(lambda: [])
@@ -105,6 +114,8 @@ def extract_from_sentence(source, target, align_pairs):
         if cept:
             spans = cept_to_spans(cept)
             last_aligned_position = max(cept) + 1
+            if not confirm_source(source, cept, onlywords):
+                continue
         else:
             spans = [[last_aligned_position, last_aligned_position]]
 
@@ -120,7 +131,7 @@ def extract_from_sentence(source, target, align_pairs):
 
     SENTENCE_ID += 1
 
-def extract_all(source_path, target_path, align_path):
+def extract_all(source_path, target_path, align_path, onlywords):
     input_files = itertools.zip_longest(
         fileinput.input(source_path),
         fileinput.input(target_path),
@@ -136,12 +147,18 @@ def extract_all(source_path, target_path, align_path):
         target_sentence = parse_sentence(target_line)
         align_pairs = parse_alignment_pairs(align_line)
 
-        extract_from_sentence(source_sentence, target_sentence, align_pairs)
+        extract_from_sentence(source_sentence, target_sentence, align_pairs, onlywords)
 
 # ---------------------------------------------------------------------- Main --
 
 def main():
-    extract_all(sys.argv[1], sys.argv[2], sys.argv[3])
+    onlywords = []
+    if len(sys.argv) > 4:
+        testsrc = open(sys.argv[4], "r")
+        for line in testsrc:
+            onlywords.append(line.rstrip())
+
+    extract_all(sys.argv[1], sys.argv[2], sys.argv[3], onlywords)
 
 if __name__ == "__main__":
     main()
```