Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/misc
diff options
context:
space:
mode:
author: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-12 07:32:10 +0400
committer: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-12 07:32:10 +0400
commit: e684352c622e5c8b35020046a042a9cf57aebfe6 (patch)
tree: 9d3eb30780ad193e477ff1798b9e3a001c3cd720 /misc
parent: 4d0922afab799590cca1f9e5029470ac939c5a37 (diff)
improved tagging
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@678 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'misc')
-rw-r--r-- misc/java-utils/ProcessShallowParse.java 50
-rw-r--r-- misc/java-utils/ShrinkSentence.java 48
-rw-r--r-- misc/java-utils/TagHierarchy.java 17
3 files changed, 79 insertions, 36 deletions
diff --git a/misc/java-utils/ProcessShallowParse.java b/misc/java-utils/ProcessShallowParse.java
index dd3b2430e..77b789b20 100644
--- a/misc/java-utils/ProcessShallowParse.java
+++ b/misc/java-utils/ProcessShallowParse.java
@@ -5,7 +5,8 @@ import java.io.*;
import java.util.*;
//input is the sentences with all features combined
-//output shrunked sentences with only those words we are interested in
+//output sentences combination of morphology, lopar tags and parsed tags
+// used to create generation table
public class ProcessShallowParse
{
public static void main(String[] args) throws Exception
@@ -21,26 +22,6 @@ public class ProcessShallowParse
System.err.println("End...");
}
-
- public ProcessShallowParse(Reader inStream, Writer outStream) throws Exception
- {
- BufferedReader inFile = new BufferedReader(inStream);
- BufferedWriter outFile = new BufferedWriter(outStream);
-
- // tokenise
- String inLine;
- while ((inLine = inFile.readLine()) != null)
- {
- StringTokenizer st = new StringTokenizer(inLine);
- while (st.hasMoreTokens())
- {
- String token = st.nextToken();
- if (token.substring(0, 2).compareTo("I-") != 0)
- outFile.write(token + " ");
- }
- outFile.write("\n");
- }
- }
}
class ProcessShallowParse2
@@ -63,10 +44,13 @@ class ProcessShallowParse2
String factoredWord = st.nextToken();
ret += Output(factoredWord);
}
- outFile.write(ret);
- if (ret.length() > 0)
- outFile.write("\n");
+ outFile.write(ret + "\n");
+ i++;
}
+ outFile.flush();
+ outFile.close();
+ outFile = null;
+ System.err.print("no of lines = " + i);
}
protected String Output(String factoredWord) throws Exception
@@ -79,20 +63,16 @@ class ProcessShallowParse2
String posImproved = st.nextToken();
String ret = "";
- if (posImproved.indexOf("ART-SB") == 0
- || posImproved.indexOf("NN-NK_NP-SB") == 0)
+ if (posImproved.equals("ART-SB")
+ || posImproved.equals("NN-NK_NP-SB"))
{
ret = posImproved + "_" + morph + " ";
}
- else if (posImproved.indexOf("VAFIN-HD") == 0
- || posImproved.indexOf("VVFIN-HD") == 0
- || posImproved.indexOf("VMFIN-HD") == 0
- || posImproved.indexOf("PPER-SB") == 0
- || posImproved.indexOf("PRELS-SB") == 0
- || posImproved.indexOf("PDS-SB") == 0
- || posImproved.indexOf("PPER-PH") == 0
- || posImproved.indexOf("PPER-EP") == 0
- )
+ else if (posImproved.equals("???"))
+ {
+ ret = "??? ";
+ }
+ else
{
ret = surface + " ";
}
diff --git a/misc/java-utils/ShrinkSentence.java b/misc/java-utils/ShrinkSentence.java
new file mode 100644
index 000000000..e69fc6b1c
--- /dev/null
+++ b/misc/java-utils/ShrinkSentence.java
@@ -0,0 +1,48 @@
+// $Id$
+
+import java.io.*;
+import java.util.*;
+
+//used to create language model
+public class ShrinkSentence
+{
+ public static void main(String[] args) throws Exception
+ {
+ System.err.println("Starting...");
+
+ InputStreamReader inStream = new InputStreamReader(args.length > 0 ? new FileInputStream(args[0]) : System.in
+ , "Latin1");
+ OutputStreamWriter outStream = new OutputStreamWriter(args.length > 1 ? new FileOutputStream(args[1]) : (OutputStream) System.out
+ , "Latin1");
+
+ new ShrinkSentence(inStream, outStream);
+
+ System.err.println("End...");
+ }
+
+ public ShrinkSentence(Reader inStream, Writer outStream) throws Exception
+ {
+ BufferedReader inFile = new BufferedReader(inStream);
+ BufferedWriter outFile = new BufferedWriter(outStream);
+
+ // tokenise
+ String inLine;
+ int i = 1;
+ while ((inLine = inFile.readLine()) != null)
+ {
+ StringTokenizer st = new StringTokenizer(inLine);
+ while (st.hasMoreTokens())
+ {
+ String word = st.nextToken();
+ if (!word.equals("???"))
+ outFile.write(word + " ");
+ }
+ outFile.write("\n");
+ i++;
+ }
+ outFile.flush();
+ outFile.close();
+ outFile = null;
+ System.err.print("no of lines = " + i);
+ }
+}
\ No newline at end of file
diff --git a/misc/java-utils/TagHierarchy.java b/misc/java-utils/TagHierarchy.java
index 61f48871b..cdec14948 100644
--- a/misc/java-utils/TagHierarchy.java
+++ b/misc/java-utils/TagHierarchy.java
@@ -47,6 +47,7 @@ class TagHierarchy
System.err.println(nullLines + " null lines\n");
}
+ // indent parsed tree to make it easier to look at
public void OutputHierarchy(String inLine, BufferedWriter outFile) throws Exception
{
int level = 0;
@@ -104,7 +105,21 @@ class TagHierarchy
int firstBracket = parsed.indexOf(')');
int noBracket = parsed.length() - firstBracket;
String word = parsed.substring(0, firstBracket);
- outFile.write(currTag + " ");
+
+ if (currTag.equals("ART-SB")
+ || currTag.equals("NN-NK_NP-SB")
+ || currTag.equals("VAFIN-HD")
+ || currTag.equals("VVFIN-HD")
+ || currTag.equals("VMFIN-HD")
+ || currTag.equals("PPER-SB")
+ || currTag.equals("PRELS-SB")
+ || currTag.equals("PDS-SB")
+ || currTag.equals("PPER-PH")
+ || currTag.equals("PPER-EP")
+ )
+ outFile.write(currTag + " ");
+ else
+ outFile.write("??? ");
level -= noBracket;