#!/usr/bin/env python # -*- coding: utf-8 -*- # Python utilities for moses # # This package mostly wraps standard Moses utilities into pipes. # # Written by Ulrich Germann # # This package borrows from scripts written by Christian Buck # # The package assumes that there is a complete moses installation # (including scripts) under one root directory, # e.g., via # bjam --with-xmlrpc-c=... [...] --install-scripts --prefix=${HOME}/moses # By default, this root directory is "${HOME}/moses". import xmlrpclib,datetime,argparse,time,os,sys from subprocess import * from unicodedata import normalize moses_root = os.environ.get('MOSES_ROOT',os.environ.get('HOME')+"/moses") class ProcessWrapper: def __init__(self,cmd=[]): self.process = None self.cmd = cmd return def start(self, stdin=PIPE, stdout=PIPE): if self.process: raise Exception("Process is already running") self.process = Popen(self.cmd, stdin = stdin, stdout = stdout) return def __del__(self): if self.process: self.process.terminate() pass return pass class LineProcessor(ProcessWrapper): def __call__(self,input): if not self.process: self.start() self.process.stdin.write("%s\n"%input.strip()) self.process.stdin.flush() return self.process.stdout.readline().strip() pass class SentenceSplitter(ProcessWrapper): """ Wrapper for standard Moses sentence splitter """ def __init__(self,lang): ssplit_cmd = moses_root+"/scripts/ems/support/split-sentences.perl" self.cmd = [ssplit_cmd, "-b", "-q", "-l",lang] self.process = None return def __call__(self,input): if not self.process: self.start() pass self.process.stdin.write(input.strip() + "\n

\n") self.process.stdin.flush() x = self.process.stdout.readline().strip() ret = [] while x != '

' and x != '': ret.append(x) x = self.process.stdout.readline().strip() pass return ret class Pretokenizer(LineProcessor): """ Pretokenizer wrapper; the pretokenizer fixes known issues with the input. """ def __init__(self,lang): pretok_cmd = moses_root+"/scripts/tokenizer/pre-tokenizer.perl" self.cmd = [pretok_cmd,"-b", "-q", "-l",lang] self.process = None return pass class Tokenizer(LineProcessor): """ Tokenizer wrapper; the pretokenizer fixes known issues with the input. """ def __init__(self,lang,args=["-a","-no-escape"]): tok_cmd = moses_root+"/scripts/tokenizer/tokenizer.perl" self.cmd = [tok_cmd,"-b", "-q", "-l", lang] + args self.process = None return class Truecaser(LineProcessor): """ Truecaser wrapper. """ def __init__(self,model): truecase_cmd = moses_root+"/scripts/recaser/truecase.perl" self.cmd = [truecase_cmd,"-b", "--model",model] self.process = None return pass class LineProcessorPipeline: """ Line processor: one line in, one line out """ def __init__(self,parts=[]): self.chain = [LineProcessor(p.cmd) for p in parts] return def start(self): if len(self.chain) == 0: return if self.chain[0].process: return self.chain[0].start() for i in xrange(1,len(self.chain)): self.chain[i].start(stdin = self.chain[i-1].process.stdout) pass return def __call__(self,input): if len(self.chain) == 0: return input self.start() self.chain[0].process.stdin.write("%s\n"%input.strip()) self.chain[0].process.stdin.flush() return self.chain[0].process.stdout.readline().strip() pass def find_free_port(p): """ Find a free port, starting at /p/. Return the free port, or False if none found. """ ret = p while ret - p < 20: devnull = open(os.devnull,"w") n = Popen(["netstat","-tnp"],stdout=PIPE,stderr=devnull) if n.communicate()[0].find(":%d "%ret) < 0: return p ret += 1 pass return False class MosesServer(ProcessWrapper): def __init__(self,args=["-fd", "\n"]): self.process = None mserver_cmd = moses_root+"/bin/mosesserver" self.cmd = [mserver_cmd] + args self.url = None self.proxy = None return def start(self,config=None,args=[],port=7447,debug=False): self.cmd.extend(args) if config: if "-f" in args: raise Exception("Config file specified twice") else: self.cmd.extend(["-f",config]) pass pass self.port = port # find_free_port(port) if not self.port: raise Excpetion("Cannot find free port for moses server!") self.cmd.extend(["--server-port", "%d"%self.port]) if debug: print >>sys.stderr,self.cmd self.process = Popen(self.cmd,stderr = sys.stderr) else: devnull = open(os.devnull,"w") self.process = Popen(self.cmd, stderr=devnull, stdout=devnull) pass if self.process.poll(): raise Exception("FATAL ERROR: Could not launch moses server!") if debug: print >>sys.stderr,"MOSES port is %d."%self.port print >>sys.stderr,"Moses poll status is", self.process.poll() pass self.url = "http://localhost:%d/RPC2"%self.port self.connect(self.url) return True def connect(self,url): if url[:4] != "http": url = "http://%s"%url if url[-5:] != "/RPC2": url += "/RPC2" self.url = url self.proxy = xmlrpclib.ServerProxy(self.url) return def translate(self,input): attempts = 0 while attempts < 100: try: if type(input) is unicode: # if the server does not expect unicode, provide a # properly encoded string! param = {'text': input.strip().encode('utf8')} return self.proxy.translate(param)['text'].decode('utf8') elif type(input) is str: param = {'text': input.strip()} return self.proxy.translate(param)['text'] elif type(input) is list: return [self.translate(x) for x in input] elif type(input) is dict: return self.proxy.translate(input) else: raise Exception("Can't handle input of this type!") except: attempts += 1 print >>sys.stderr, "WAITING", attempts time.sleep(1) pass pass raise Exception("Translation request failed") pass