Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/Moses2TMX/Moses2TMX.py')
-rwxr-xr-xcontrib/Moses2TMX/Moses2TMX.py166
1 files changed, 166 insertions, 0 deletions
diff --git a/contrib/Moses2TMX/Moses2TMX.py b/contrib/Moses2TMX/Moses2TMX.py
new file mode 100755
index 000000000..b032fe9c5
--- /dev/null
+++ b/contrib/Moses2TMX/Moses2TMX.py
@@ -0,0 +1,166 @@
+#! /usr/bin/env python
+# -*- coding: utf_8 -*-
+"""This program is used to prepare TMX files from corpora composed of 2 files for each language pair,
+where the position of a segment in the first language file is exactly the same as in the second
+language file.
+
+The program requires that Pythoncard and wxPython (as well as Python) be previously installed.
+
+Copyright 2009, 2010 João Luís A. C. Rosas
+
+Distributed under GNU GPL v3 licence (see http://www.gnu.org/licenses/)
+
+E-mail: joao.luis.rosas@gmail.com """
+
+__version__ = "$Revision: 1.033$"
+__date__ = "$Date: 2010/02/25$"
+__author__="$João Luís A. C. Rosas$"
+
+from PythonCard import clipboard, dialog, graphic, model
+from PythonCard.components import button, combobox,statictext,checkbox,staticbox
+import wx
+import os, re
+import string
+import sys
+from time import strftime
+import codecs
+
+class Moses2TMX(model.Background):
+
+ def on_initialize(self, event):
+ self.inputdir=''
+ #Get directory where program file is and ...
+ currdir=os.path.abspath(os.path.dirname(os.path.realpath(sys.argv[0])))
+ #... load the file ("LanguageCodes.txt") with the list of languages that the program can process
+ try:
+ self.languages=open(currdir+os.sep+r'LanguageCodes.txt','r+').readlines()
+ except:
+ # If the languages file doesn't exist in the program directory, alert user that it is essential for the good working of the program and exit
+ result = dialog.alertDialog(self, 'The file "LanguageCodes.txt" is missing. The program will now close.', 'Essential file missing')
+ sys.exit()
+ #remove end of line marker from each line in "LanguageCodes.txt"
+ for lang in range(len(self.languages)):
+ self.languages[lang]=self.languages[lang].rstrip()
+ self.lang1code=''
+ self.lang2code=''
+ #Insert list of language names in appropriate program window's combo boxes
+ self.components.cbStartingLanguage.items=self.languages
+ self.components.cbDestinationLanguage.items=self.languages
+
+ def CreateTMX(self, name):
+ print 'Started at '+strftime('%H-%M-%S')
+ #get the startinglanguage name (e.g.: "EN-GB") from the program window
+ self.lang1code=self.components.cbStartingLanguage.text
+ #get the destinationlanguage name from the program window
+ self.lang2code=self.components.cbDestinationLanguage.text
+ print name+'.'+self.lang2code[:2].lower()
+ e=codecs.open(name,'r',"utf-8","strict")
+ f=codecs.open(name+'.'+self.lang2code[:2].lower()+'.moses','r',"utf-8","strict")
+ a=codecs.open(name+'.tmp','w',"utf-8","strict")
+ b=codecs.open(name+'.'+self.lang2code[:2].lower()+'.moses.tmp','w',"utf-8","strict")
+ for line in e:
+ if line.strip():
+ a.write(line)
+ for line in f:
+ if line.strip():
+ b.write(line)
+ a=codecs.open(name+'.tmp','r',"utf-8","strict")
+ b=codecs.open(name+'.'+self.lang2code[:2].lower()+'.moses.tmp','r',"utf-8","strict")
+ g=codecs.open(name+'.tmx','w','utf-16','strict')
+ g.write('<?xml version="1.0" ?>\n<!DOCTYPE tmx SYSTEM "tmx14.dtd">\n<tmx version="version 1.4">\n\n<header\ncreationtool="moses2tmx"\ncreationtoolversion="1.032"\nsegtype="sentence"\ndatatype="PlainText"\nadminlang="EN-US"\nsrclang="'+self.lang1code+'"\n>\n</header>\n\n<body>\n')
+ parar=0
+ while True:
+ self.ling1segm=a.readline().strip()
+ self.ling2segm=b.readline().strip()
+ if not self.ling1segm:
+ break
+ elif not self.ling2segm:
+ break
+ else:
+ try:
+ g.write('<tu creationid="MT!">\n<prop type="Txt::Translator">Moses</prop>\n<tuv xml:lang="'+self.lang1code+'">\n<seg>'+self.ling1segm+'</seg>\n</tuv>\n<tuv xml:lang="'+self.lang2code+ \
+ '">\n<seg>'+self.ling2segm+'</seg>\n</tuv>\n</tu>\n\n')
+ except:
+ pass
+ a.close()
+ b.close()
+ e.close()
+ f.close()
+ g.write('</body>\n</tmx>\n')
+ g.close()
+ #os.remove(name)
+ #os.remove(name+'.'+self.lang2code[:2].lower()+'.moses')
+ os.remove(name+'.tmp')
+ os.remove(name+'.'+self.lang2code[:2].lower()+'.moses.tmp')
+
+ def createTMXs(self):
+ try:
+ # Get a list of all TMX files that need to be processed
+ fileslist=self.locate('*.moses',self.inputdir)
+ except:
+ # if any error up to now, add the name of the TMX file to the output file @errors
+ self.errortypes=self.errortypes+' - Get All Segments: creation of output files error\n'
+ if fileslist:
+ # For each relevant TMX file ...
+ for self.presentfile in fileslist:
+ filename=self.presentfile[:-9]
+ #print filename
+ self.CreateTMX(filename)
+ print 'Finished at '+strftime('%H-%M-%S')
+ result = dialog.alertDialog(self, 'Processing done.', 'Processing Done')
+
+ def on_btnCreateTMX_mouseClick(self, event):
+ self.createTMXs()
+
+ def on_menuFileCreateTMXFiles_select(self, event):
+ self.createTMXs()
+
+ def on_btnSelectLang1File_mouseClick(self, event):
+ self.input1=self.GetInputFileName()
+
+ def on_btnSelectLang2File_mouseClick(self, event):
+ self.input2=self.GetInputFileName()
+
+ def locate(self,pattern, basedir):
+ """Locate all files matching supplied filename pattern in and below
+ supplied root directory.
+
+ @pattern: something like '*.tmx'
+ @basedir:whole directory to be treated
+ """
+ import fnmatch
+ for path, dirs, files in os.walk(os.path.abspath(basedir)):
+ for filename in fnmatch.filter(files, pattern):
+ yield os.path.join(path, filename)
+
+ def SelectDirectory(self):
+ """Select the directory where the files to be processed are
+
+ @result: object returned by the dialog window with attributes accepted (true if user clicked OK button, false otherwise) and
+ path (list of strings containing the full pathnames to all files selected by the user)
+ @self.inputdir: directory where files to be processed are (and where output files will be written)
+ @self.statusBar.text: text displayed in the program window status bar"""
+
+ result= dialog.directoryDialog(self, 'Choose a directory', 'a')
+ if result.accepted:
+ self.inputdir=result.path
+ self.statusBar.text=self.inputdir+' selected.'
+
+ def on_menuFileSelectDirectory_select(self, event):
+ self.SelectDirectory()
+
+ def on_btnSelectDirectory_mouseClick(self, event):
+ self.SelectDirectory()
+
+ def on_menuHelpShowHelp_select(self, event):
+ f = open('_READ_ME_FIRST.txt', "r")
+ msg = f.read()
+ result = dialog.scrolledMessageDialog(self, msg, '_READ_ME_FIRST.txt')
+
+ def on_menuFileExit_select(self, event):
+ sys.exit()
+
+
+if __name__ == '__main__':
+ app = model.Application(Moses2TMX)
+ app.MainLoop()