improved tagging

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@678 1f5c12ca-751b-0410-a591-d2e778427230
author: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-12 07:32:10 +0400
committer: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-12 07:32:10 +0400
commit: e684352c622e5c8b35020046a042a9cf57aebfe6 (patch)
tree: 9d3eb30780ad193e477ff1798b9e3a001c3cd720 /misc
parent: 4d0922afab799590cca1f9e5029470ac939c5a37 (diff)
3 files changed, 79 insertions, 36 deletions
diff --git a/misc/java-utils/ProcessShallowParse.java b/misc/java-utils/ProcessShallowParse.java
index dd3b2430e..77b789b20 100644
--- a/misc/java-utils/ProcessShallowParse.java
+++ b/misc/java-utils/ProcessShallowParse.java
@@ -5,7 +5,8 @@ import java.io.*;
 import java.util.*;
 
 //input is the sentences with all features combined 
-//output shrunked sentences with only those words we are interested in
+//output: sentences built from a combination of morphology, lopar tags and parsed tags
+//(used to create the generation table)
 public class ProcessShallowParse
 {
 	public static void main(String[] args) throws Exception
@@ -21,26 +22,6 @@ public class ProcessShallowParse
 		
 		System.err.println("End...");
 	}
-
-	public ProcessShallowParse(Reader inStream, Writer outStream) throws Exception
-	{
-		BufferedReader inFile = new BufferedReader(inStream); 
-		BufferedWriter outFile = new BufferedWriter(outStream); 
-		
-		// tokenise
-		String inLine;
-		while ((inLine = inFile.readLine()) != null)
-		{
-			StringTokenizer st = new StringTokenizer(inLine);
-		     while (st.hasMoreTokens()) 
-		     {
-		    	 String token = st.nextToken();
-		    	 if (token.substring(0, 2).compareTo("I-") != 0)
-		    		 outFile.write(token + " ");
-		     }
-		     outFile.write("\n");
-		}		
-	}
 }
 
 class ProcessShallowParse2
@@ -63,10 +44,13 @@ class ProcessShallowParse2
 				String factoredWord = st.nextToken();
 		    	ret += Output(factoredWord);
 		    }
-			outFile.write(ret);
-			if (ret.length() > 0)
-				outFile.write("\n");
+			outFile.write(ret + "\n");
+			i++;
 		}
+		outFile.flush();
+		outFile.close();
+		outFile = null;
+		System.err.print("no of lines = " + i);
 	}
 	
 	protected String Output(String factoredWord) throws Exception
@@ -79,20 +63,16 @@ class ProcessShallowParse2
     	String posImproved = st.nextToken();
     	String ret = "";
 
-    	if (posImproved.indexOf("ART-SB") == 0
-    		|| posImproved.indexOf("NN-NK_NP-SB") == 0)
+    	if (posImproved.equals("ART-SB")
+    		|| posImproved.equals("NN-NK_NP-SB"))
     	{
     		ret = posImproved + "_" + morph + " ";
     	}
-    	else if (posImproved.indexOf("VAFIN-HD") == 0
-    			|| posImproved.indexOf("VVFIN-HD") == 0
-    			|| posImproved.indexOf("VMFIN-HD") == 0
-        		|| posImproved.indexOf("PPER-SB") == 0
-        		|| posImproved.indexOf("PRELS-SB") == 0
-        		|| posImproved.indexOf("PDS-SB") == 0
-        		|| posImproved.indexOf("PPER-PH") == 0
-        		|| posImproved.indexOf("PPER-EP") == 0
-        	)
+    	else if (posImproved.equals("???"))
+    	{
+    		ret = "??? ";
+    	}
+    	else
     	{
     		ret = surface + " ";
     	}
diff --git a/misc/java-utils/ShrinkSentence.java b/misc/java-utils/ShrinkSentence.java
new file mode 100644
index 000000000..e69fc6b1c
--- /dev/null
+++ b/misc/java-utils/ShrinkSentence.java
@@ -0,0 +1,48 @@
+// $Id$
+
+import java.io.*;
+import java.util.*;
+
+//used to create language model
+/** Copies whitespace-separated tokens line by line from a reader to a writer, leaving out "???" tokens. */
+public class ShrinkSentence
+{
+	/** Usage: ShrinkSentence [inFile [outFile]]; falls back to stdin/stdout, both using the Latin1 charset. */
+	public static void main(String[] args) throws Exception
+	{
+		System.err.println("Starting...");
+		InputStreamReader inStream = new InputStreamReader(args.length > 0 ? new FileInputStream(args[0]) : System.in, "Latin1");
+		OutputStreamWriter outStream = new OutputStreamWriter(args.length > 1 ? new FileOutputStream(args[1]) : (OutputStream) System.out, "Latin1");
+		new ShrinkSentence(inStream, outStream);
+		System.err.println("End...");
+	}
+
+	/** Filters inStream into outStream, closes outStream, then reports the number of lines read on stderr. */
+	public ShrinkSentence(Reader inStream, Writer outStream) throws Exception
+	{
+		BufferedReader inFile = new BufferedReader(inStream);
+		BufferedWriter outFile = new BufferedWriter(outStream);
+		int i = 0; // lines read so far; starts at 0 so the reported total is not one too high
+		try
+		{
+			String inLine;
+			while ((inLine = inFile.readLine()) != null)
+			{
+				StringTokenizer st = new StringTokenizer(inLine);
+				while (st.hasMoreTokens())
+				{
+					String word = st.nextToken();
+					if (!word.equals("???")) // "???" tokens are filtered out of the output
+						outFile.write(word + " ");
+				}
+				outFile.write("\n"); // every input line yields one output line, even if it ends up empty
+				i++;
+			}
+		}
+		finally
+		{
+			outFile.close(); // close() flushes first; also runs when reading or writing fails
+		}
+		System.err.println("no of lines = " + i); // println so the following "End..." starts on its own line
+	}
+}
+\ No newline at end of file
diff --git a/misc/java-utils/TagHierarchy.java b/misc/java-utils/TagHierarchy.java
index 61f48871b..cdec14948 100644
--- a/misc/java-utils/TagHierarchy.java
+++ b/misc/java-utils/TagHierarchy.java
@@ -47,6 +47,7 @@ class TagHierarchy
 		System.err.println(nullLines + " null lines\n");
 	}
 
+	// indent parsed tree to make it easier to look at
 	public void OutputHierarchy(String inLine, BufferedWriter outFile) throws Exception
 	{
 		int level = 0;
@@ -104,7 +105,21 @@ class TagHierarchy
 	    		int firstBracket = parsed.indexOf(')');
 	    		int noBracket = parsed.length() - firstBracket;
 	    		String word = parsed.substring(0, firstBracket);
-	    		outFile.write(currTag + " ");
+
+	    		if (currTag.equals("ART-SB")
+		    			|| currTag.equals("NN-NK_NP-SB")
+		    			|| currTag.equals("VAFIN-HD")
+		    			|| currTag.equals("VVFIN-HD")
+		    			|| currTag.equals("VMFIN-HD")
+		    			|| currTag.equals("PPER-SB")
+		    			|| currTag.equals("PRELS-SB")
+		    			|| currTag.equals("PDS-SB")
+		    			|| currTag.equals("PPER-PH")
+		    			|| currTag.equals("PPER-EP")
+		    			)
+	    			outFile.write(currTag + " ");
+	    		else
+	    			outFile.write("??? ");
 	    		
 	    		level -= noBracket;
author	hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-12 07:32:10 +0400
committer	hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-12 07:32:10 +0400
commit	e684352c622e5c8b35020046a042a9cf57aebfe6 (patch)
tree	9d3eb30780ad193e477ff1798b9e3a001c3cd720 /misc
parent	4d0922afab799590cca1f9e5029470ac939c5a37 (diff)