Code cleanup, language auto-detection (for text) and additional comments.

Language auto-detection implemented. Changed constructor default to "auto" and allowed language to be set to "null" for continuity between the synthesiser and recognizer classes.
author: Skylion007 <dragonsrcool.aaron@gmail.com> 2013-08-25 22:01:05 +0400
committer: Skylion007 <dragonsrcool.aaron@gmail.com> 2013-08-25 22:01:05 +0400
commit: d7d5ec7377311008acd0225ad851f2b62146ef03 (patch)
tree: ce983c62b0eb999fdb552afba033d448ff5c3d85
parent: 68ae920c658ac6937a4ae857197581e3779065b0 (diff)
1 files changed, 254 insertions, 149 deletions
diff --git a/src/com/darkprograms/speech/synthesiser/Synthesiser.java b/src/com/darkprograms/speech/synthesiser/Synthesiser.java
index 8ed83db..8b2a67f 100644
--- a/src/com/darkprograms/speech/synthesiser/Synthesiser.java
+++ b/src/com/darkprograms/speech/synthesiser/Synthesiser.java
@@ -1,165 +1,270 @@
 package com.darkprograms.speech.synthesiser;
 
 import java.io.InputStream;
+import java.io.Reader;
 import java.net.URL;
 import java.net.URLConnection;
 import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.List;
 
+
 /**
- * Synthesiser class that connects to Google's unoffical API to retreive data
+ * Synthesiser class that connects to Google's unoffical API to retrieve data
  *
  * @author Luke Kuza, Aaron Gokaslan (Skylion)
  */
 public class Synthesiser {
 
-    /**
-     * URL to query for Google synthesiser
-     */
-    private final static String GOOGLE_SYNTHESISER_URL = "http://translate.google.com/translate_tts?tl=";
-    
-    private String languageCode; //Specifies the language you want the voice to speak in.
-
-    //Languages
-    public static final String LANG_AU_ENGLISH = "en-AU";
-    public static final String LANG_US_ENGLISH = "en-US";
-    public static final String LANG_UK_ENGLISH = "en-GB";
-    public static final String LANG_ES_SPANISH = "es";
-    public static final String LANG_FR_FRENCH = "fr";
-    public static final String LANG_DE_GERMAN = "de"; 
-    //Please add on more regional languages as you find them. Also try to include the accent code if you can can.
-
-    /**
-     * Constructor
-     */
-    public Synthesiser() {
-        languageCode = "en-US"; //Defaults to English (United States)
-    }
-    
-    /**
-     * Overloaded Constructor that takes Language Code parameter
-     * 
-     */
-    public Synthesiser(String languageCode){
-        this.languageCode = languageCode;
-    }
-
-    /**
-     * Returns the current language code for the Synthesiser. 
-     * @return the current language code
-     */
-    public String getLanguageCode(){
-        return languageCode;
-    }
-   
-    /**
-     * Example: English(Generic) = en, English (US) = en-US, English (UK) = en-GB. and Spanish = es;
-     * @param languageCode The language code you would like to modify languageCode to.
-     */
-    public void setLanguage(String languageCode){
-        this.languageCode = languageCode;
-    }
-
-    /**
-     * Gets an input stream to MP3 data for the returned information from a request
-     *
-     * @param synthText Text you want to be synthesized into MP3 data
-     * @return Returns an input stream of the MP3 data that is returned from Google
-     * @throws Exception Throws exception if it can not complete the request
-     */
-    public InputStream getMP3Data(String synthText) throws Exception {
-        
-    	if(synthText.length()>99){
-    		List<String> fragments = stringParser(synthText);
-    		return getMP3Data(fragments);
-    	}
-    	
-    	String encoded = URLEncoder.encode(synthText, "UTF-8"); //Encode
-
-        URL url = new URL(GOOGLE_SYNTHESISER_URL + languageCode + "&q=" + encoded);  //create url
-
-        // Open New URL connection channel.
-        URLConnection urlConn = url.openConnection(); //Open connection
-
-
-        urlConn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0) Gecko/20100101 Firefox/4.0");  //Adding header for user agent is required
-
-        return urlConn.getInputStream();
-    }
-    
-    /**
-     * Gets an InputStream to MP3Data for the returned information from a request
-     * @param synthText List of Strings you want to be synthesized into MP3 data
-     * @return Returns an input stream of all the MP3 data that is returned from Google
-     * @throws Exception Throws exception if it cannot complete the request
-     */
-    public InputStream getMP3Data(List<String> synthText) throws Exception{
-    	InputStream complete = getMP3Data(synthText.remove(0));
-    	for(String part: synthText){
-    		complete = new java.io.SequenceInputStream(complete, getMP3Data(part));//Concatenate with new MP3 Data
-    	}
-    	return complete;
-    }
-
-    /**
-     * Separates a string into smaller parts so that Google will not reject the request.
-     * @param input The string you want to separate
-     * @return A List<String> of the String fragments from your input..
-     */
-    private List<String> stringParser(String input){
-    	return stringParser(input, new ArrayList<String>());
-    }
-    
-    /**
-     * Separates a string into smaller parts so that Google will not reject the request.
-     * @param input The string you want to break up into smaller parts
-     * @param fragments List<String> that you want to add stuff too. 
-     * If you don't have a List<String> already constructed "new ArrayList<String>()" works well.
-     * @return A list of the fragments of the original String
-     */
-    private List<String> stringParser(String input, List<String> fragments){
-    	if(input.length()<100){//Base Case
-    		fragments.add(input);
-    		return fragments;
-    	}
-    	else{
-    		int space = findLastWord(input);//Checks if a space exists
-    		if(space<0){
-    			fragments.add(input.substring(0,99));//In case you sent gibberish to Google.
-    			return stringParser(input.substring(99), fragments);
-    		}else{
-    			fragments.add(input.substring(0,space));//Otherwise, adds the last word to the list for recursion.
-    			return stringParser(input.substring(space), fragments);
-    		}
-    	}
-    }
-
-    /**
-     * Finds the last word in your String (before the index of 99) by searching for spaces and ending punctuation.
-     * @param input The String you want to search through.
-     * @return The index of where the last word of the String ends before the index of 99.
-     */
-    private int findLastWord(String input){
-    	if(input.length()<100)
-    		return input.length();
-    	for(int i = 99; i>=0; i--){
-    		char tmp = input.charAt(i);
-    		if(isEndingPunctuation(tmp)){
-    			return i;
-    		}
-    	}
-        return -1;
-    }
-    
-    /**
-     * Checks if char is an ending character
-     * Ending punctuation for all languages according to Wikipedia (Except for Sanskrit non-unicode)
-     * @param The char you want check
-     * @return True if it is, false if not.
-     */
-     private boolean isEndingPunctuation(char input){
-         return  input == ' ' || input == '.' || input == '!' || input == '?' || input == ';' || input == ':' 
-            || input == '|';
-     }
+	/**
+	 * URL to query for Google synthesiser
+	 */
+	private final static String GOOGLE_SYNTHESISER_URL = "http://translate.google.com/translate_tts?tl=";
+
+	/**
+	 * URL to query for Google Auto Detection
+	 */
+	private final static String GOOGLE_AUTODETECT_URL = "http://translate.google.com/translate_a/t?client=t&sl=auto&text=";
+
+	/**
+	 * language of the Text you want to translate
+	 */
+	private String languageCode; 
+
+	/**
+	 * LANG_XX_XXXX Variables are language codes. 
+	 */
+	public static final String LANG_AU_ENGLISH = "en-AU";
+	public static final String LANG_US_ENGLISH = "en-US";
+	public static final String LANG_UK_ENGLISH = "en-GB";
+	public static final String LANG_ES_SPANISH = "es";
+	public static final String LANG_FR_FRENCH = "fr";
+	public static final String LANG_DE_GERMAN = "de";
+	//Please add on more regional languages as you find them. Also try to include the accent code if you can can.
+
+	/**
+	 * Constructor
+	 */
+	public Synthesiser() {
+		languageCode = "auto";
+	}
+
+	/**
+	 * Constructor that takes language code parameter. Specify to "auto" for language autoDetection 
+	 */
+	public Synthesiser(String languageCode){
+		this.languageCode = languageCode;
+	}
+
+	/**
+	 * Returns the current language code for the Synthesiser.
+	 * Example: English(Generic) = en, English (US) = en-US, English (UK) = en-GB. and Spanish = es;
+	 * @return the current language code parameter
+	 */
+	public String getLanguage(){
+		return languageCode;
+	}
+
+	/**
+	 * Note: set language to auto to enable automatic language detection.
+	 * Setting to null will also implement Google's automatic language detection
+	 * @param languageCode The language code you would like to modify languageCode to.
+	 */
+	public void setLanguage(String languageCode){
+
+		this.languageCode = languageCode;
+	}
+
+	/**
+	 * Gets an input stream to MP3 data for the returned information from a request
+	 *
+	 * @param synthText Text you want to be synthesized into MP3 data
+	 * @return Returns an input stream of the MP3 data that is returned from Google
+	 * @throws Exception Throws exception if it can not complete the request
+	 */
+	public InputStream getMP3Data(String synthText) throws Exception {
+
+		String languageCode = this.languageCode;//Ensures retention of language settings if set to auto
+
+		if(languageCode == null || languageCode.equals("") || languageCode.equalsIgnoreCase("auto")){
+			try{
+				languageCode = detectLanguage(synthText);//Detects language
+			}
+			catch(Exception ex){
+				ex.printStackTrace();
+				languageCode = "en-us";//Reverts to Default Language if it can't detect it.
+			}
+		}
+
+		if(synthText.length()>100){
+			List<String> fragments = parseString(synthText);//parses String if too long
+			return getMP3Data(fragments);
+		}
+
+		String encoded = URLEncoder.encode(synthText, "UTF-8"); //Encode
+
+		URL url = new URL(GOOGLE_SYNTHESISER_URL + languageCode + "&q=" + encoded); //create url
+
+		// Open New URL connection channel.
+		URLConnection urlConn = url.openConnection(); //Open connection
+
+
+		urlConn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0) Gecko/20100101 Firefox/4.0"); //Adding header for user agent is required
+
+		return urlConn.getInputStream();
+	}
+
+	/**
+	 * Gets an InputStream to MP3Data for the returned information from a request
+	 * @param synthText List of Strings you want to be synthesized into MP3 data
+	 * @return Returns an input stream of all the MP3 data that is returned from Google
+	 * @throws Exception Throws exception if it cannot complete the request
+	 */
+	public InputStream getMP3Data(List<String> synthText) throws Exception{
+		InputStream complete = getMP3Data(synthText.remove(0));
+		for(String part: synthText){
+			complete = new java.io.SequenceInputStream(complete, getMP3Data(part));//Concatenate with new MP3 Data
+		}
+		return complete;
+	}
+
+	/**
+	 * Separates a string into smaller parts so that Google will not reject the request.
+	 * @param input The string you want to separate
+	 * @return A List<String> of the String fragments from your input..
+	 */
+	private List<String> parseString(String input){
+		return parseString (input, new ArrayList<String>());
+	}
+
+	/**
+	 * Separates a string into smaller parts so that Google will not reject the request.
+	 * @param input The string you want to break up into smaller parts
+	 * @param fragments List<String> that you want to add stuff too.
+	 * If you don't have a List<String> already constructed "new ArrayList<String>()" works well.
+	 * @return A list of the fragments of the original String
+	 */
+	private List<String> parseString(String input, List<String> fragments){
+		if(input.length()<=100){//Base Case
+			fragments.add(input);
+			return fragments;
+		}
+		else{
+			int lastWord = findLastWord(input);//Checks if a space exists
+			if(lastWord<0){
+				fragments.add(input.substring(0,100));//In case you sent gibberish to Google.
+				return parseString(input.substring(100), fragments);
+			}else{
+				fragments.add(input.substring(0,lastWord));//Otherwise, adds the last word to the list for recursion.
+				return parseString(input.substring(lastWord), fragments);
+			}
+		}
+	}
+
+	/**
+	 * Finds the last word in your String (before the index of 99) by searching for spaces and ending punctuation.
+	 * Will preferably parse on punctuation to alleviate mid-sentence pausing
+	 * @param input The String you want to search through.
+	 * @return The index of where the last word of the string ends before the index of 99.
+	 */
+	private int findLastWord(String input){
+		if(input.length()<100)
+			return input.length();
+		int space = -1;
+		for(int i = 99; i>=0; i--){
+			char tmp = input.charAt(i);
+			if(isEndingPunctuation(tmp)){
+				return i;
+			}
+			if(space==-1 && tmp == ' '){
+				space = i;
+			}
+		}
+		if(space>=0){
+			return space;
+		}
+		return -1;
+	}
+
+	/**
+	 * Checks if char is an ending character
+	 * Ending punctuation for all languages according to Wikipedia (Except for Sanskrit non-unicode)
+	 * @param The char you want check
+	 * @return True if it is, false if not.
+	 */
+	private boolean isEndingPunctuation(char input){
+		return input == '.' || input == '!' || input == '?' || input == ';' || input == ':' || input == '|';
+	}
+
+	/**
+	 * Automatically determines the language of the original text
+	 * @param text represents the text you want to check the language of
+	 * @return the languageCode
+	 * @throws Exception if it cannot complete the request
+	 */
+	public String detectLanguage(String text) throws Exception{
+
+		//GOOGLE rejects requests that are longer
+		if(text.length()>99){
+			text = text.substring(0,findLastWord(text));//We don't need the whole text to determine language
+		}
+		String encoded = URLEncoder.encode(text, "UTF-8"); //Encode
+		URL url = new URL(GOOGLE_AUTODETECT_URL + encoded); //Generates URL
+		URLConnection urlConn = url.openConnection(); //Open connection
+		urlConn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0) Gecko/20100101 Firefox/4.0"); //Adding header for user agent is required
+		String rawData = urlToText(urlConn);//Gets text from Google
+		return parseRawData(rawData);
+	}
+
+	/**
+	 * Converts a URL Connection to Text
+	 * @param urlConn The Open URLConnection that you want to generate a String from
+	 * @return The generated String
+	 * @throws Exception if it cannot complete the request
+	 */
+	private String urlToText(URLConnection urlConn) throws Exception{
+		Reader r = new java.io.InputStreamReader(urlConn.getInputStream());//Gets Data Converts to string
+		StringBuilder buf = new StringBuilder();
+		while (true) {
+			int ch = r.read();
+			if (ch < 0)
+				break;
+			buf.append((char) ch);
+		}
+		String str = buf.toString();
+		System.out.println(str);
+		return str;
+	}
+
+	/**
+	 * Searches RAWData for Language
+	 * @param RAWData the raw String directly from Google you want to search through
+	 * @return The language parsed from the rawData or en-US (English-United States) if Google cannot determine it.
+	 */
+	private String parseRawData(String rawData){
+		for(int i = 0; i+3<=rawData.length(); i++){
+			if(rawData.charAt(i)=='"' && rawData.charAt(i+3)=='"'){
+				String possible = rawData.substring(i+1,i+3);
+				if(containsLettersOnly(possible)){//Required due to Google's inconsistent formatting.
+					return possible;
+				}
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Checks if all characters in text are letters.  
+	 * @param text The text you want to determine the validity of.
+	 * @return True if all characters are letters, otherwise false.
+	 */
+	private boolean containsLettersOnly(String text){
+		for(int i = 0; i<text.length(); i++){
+			if(!Character.isLetter(text.charAt(i))){
+				return false;
+			}
+		}
+		return true;
+	}
 }
+
author	Skylion007 <dragonsrcool.aaron@gmail.com>	2013-08-25 22:01:05 +0400
committer	Skylion007 <dragonsrcool.aaron@gmail.com>	2013-08-25 22:01:05 +0400
commit	d7d5ec7377311008acd0225ad851f2b62146ef03 (patch)
tree	ce983c62b0eb999fdb552afba033d448ff5c3d85
parent	68ae920c658ac6937a4ae857197581e3779065b0 (diff)