Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Ales Tamchyna <tamchyna@ufal.mff.cuni.cz> 2013-11-25 20:36:49 +0400
committer: Ales Tamchyna <tamchyna@ufal.mff.cuni.cz> 2013-11-25 20:36:49 +0400
commit: 0d108d213f263b5b6442c94f9c4ca97b85050bf5 (patch)
tree: acd16a86e235ed61bb3abbf1117af47c88f5461b
parent: 0cdee467ca99160009715f1177fef8e0e2f2fbfc (diff)
filter extract file based on dev/test set [damt_phrase]
-rwxr-xr-x scripts/training/extract_words_dlm.py 27
1 files changed, 22 insertions, 5 deletions
diff --git a/scripts/training/extract_words_dlm.py b/scripts/training/extract_words_dlm.py
index 6b122b092..0479bf41f 100755
--- a/scripts/training/extract_words_dlm.py
+++ b/scripts/training/extract_words_dlm.py
@@ -6,7 +6,7 @@ NAME
SYNOPSIS
- extract_words_dlm.py SOURCE TARGET ALIGN > OUT
+ extract_words_dlm.py SOURCE TARGET ALIGN [TESTSOURCE] > OUT
DESCRIPTION
@@ -87,9 +87,18 @@ def cept_to_string(source, cept):
def spans_to_string(spans):
return " ".join("-".join(map(str, span)) for span in spans)
+def confirm_source(source, cept, onlywords):
+ if onlywords:
+ for index in cept:
+ if not source[index] in onlywords:
+ return False
+ return True
+ else:
+ return True
+
SENTENCE_ID = 1
-def extract_from_sentence(source, target, align_pairs):
+def extract_from_sentence(source, target, align_pairs, onlywords):
global SENTENCE_ID
source_cept = collections.defaultdict(lambda: [])
@@ -105,6 +114,8 @@ def extract_from_sentence(source, target, align_pairs):
if cept:
spans = cept_to_spans(cept)
last_aligned_position = max(cept) + 1
+ if not confirm_source(source, cept, onlywords):
+ continue
else:
spans = [[last_aligned_position, last_aligned_position]]
@@ -120,7 +131,7 @@ def extract_from_sentence(source, target, align_pairs):
SENTENCE_ID += 1
-def extract_all(source_path, target_path, align_path):
+def extract_all(source_path, target_path, align_path, onlywords):
input_files = itertools.zip_longest(
fileinput.input(source_path),
fileinput.input(target_path),
@@ -136,12 +147,18 @@ def extract_all(source_path, target_path, align_path):
target_sentence = parse_sentence(target_line)
align_pairs = parse_alignment_pairs(align_line)
- extract_from_sentence(source_sentence, target_sentence, align_pairs)
+ extract_from_sentence(source_sentence, target_sentence, align_pairs, onlywords)
# ---------------------------------------------------------------------- Main --
def main():
- extract_all(sys.argv[1], sys.argv[2], sys.argv[3])
+ onlywords = []
+ if len(sys.argv) > 4:
+ testsrc = open(sys.argv[4], "r")
+ for line in testsrc:
+ onlywords.append(line.rstrip())
+
+ extract_all(sys.argv[1], sys.argv[2], sys.argv[3], onlywords)
if __name__ == "__main__":
main()