Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/misc
diff options
context:
space:
mode:
author: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-11 02:08:10 +0400
committer: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-11 02:08:10 +0400
commit: c8d1576e6a08a9dfff23ff6e857500cdc2018204 (patch)
tree: ba6d72bef2cbb5e11e5ec4e3669fbee44a8a169d /misc
parent: 68ef1413cd7d3b4808d3febae014cd81174b9da5 (diff)
change chunking extraction according to advice from phi
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@628 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'misc')
-rw-r--r-- misc/java-utils/ProcessShallowParse.java 57
1 files changed, 40 insertions, 17 deletions
diff --git a/misc/java-utils/ProcessShallowParse.java b/misc/java-utils/ProcessShallowParse.java
index 05ddebfc3..c18fe1f48 100644
--- a/misc/java-utils/ProcessShallowParse.java
+++ b/misc/java-utils/ProcessShallowParse.java
@@ -41,6 +41,8 @@ public class ProcessShallowParse
class ProcessShallowParse2
{ // factored sentence
+ boolean m_prevART = false;
+
public ProcessShallowParse2(Reader inStream, Writer outStream) throws Exception
{
BufferedReader inFile = new BufferedReader(inStream);
@@ -48,35 +50,56 @@ class ProcessShallowParse2
// tokenise
String inLine;
+ int i = 1;
while ((inLine = inFile.readLine()) != null)
{
+ m_prevART = false;
StringTokenizer st = new StringTokenizer(inLine);
- while (st.hasMoreTokens())
- {
- String factoredWord = st.nextToken();
- Output(factoredWord, outFile);
- }
- outFile.write("\n");
- }
+ String ret = "";
+ while (st.hasMoreTokens())
+ {
+ String factoredWord = st.nextToken();
+ ret += Output(factoredWord);
+ }
+ outFile.write(i++ + " " + ret);
+ if (ret.length() > 0)
+ outFile.write("\n");
+ }
}
- protected void Output(String factoredWord, BufferedWriter outStream) throws Exception
+ protected String Output(String factoredWord) throws Exception
{
StringTokenizer st = new StringTokenizer(factoredWord, "|");
- st.nextToken();
+ String surface = st.nextToken();
String pos = st.nextToken();
String morph = st.nextToken();
+ String ret = "";
int lastPos = pos.lastIndexOf('-');
-
- if (pos.indexOf("ART") == 0
- || pos.indexOf("P") == 0
- || pos.indexOf("V") == 0
- || pos.indexOf("$,") == 0
- || pos.indexOf("$.") == 0
- )
+ if (pos.indexOf("ART-SB") == 0)
+ {
+ ret = pos + "|" + morph + " ";
+ m_prevART = true;
+ }
+ else if (pos.indexOf("NN-NK") == 0 && m_prevART)
{
- outStream.write(pos + "|" + morph + " ");
+ ret = pos + "|" + morph + " ";
+ m_prevART = false;
}
+ else if (pos.indexOf("VAFIN-HD") == 0
+ || pos.indexOf("VVFIN-HD") == 0
+ || pos.indexOf("VMFIN-HD") == 0
+ || pos.indexOf("PPER-SB") == 0
+ || pos.indexOf("PRELS-SB") == 0
+ || pos.indexOf("PDS-SB") == 0
+ || pos.indexOf("PPER-PH") == 0
+ || pos.indexOf("PPER-EP") == 0
+ )
+ {
+ ret = pos + "|" + surface + " ";
+ m_prevART = false;
+ }
+
+ return ret;
}
}