Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/ClusterM/java-speech-api.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Skylion007 <dragonsrcool.aaron@gmail.com> 2013-08-25 22:01:05 +0400
committer Skylion007 <dragonsrcool.aaron@gmail.com> 2013-08-25 22:01:05 +0400
commit d7d5ec7377311008acd0225ad851f2b62146ef03 (patch)
tree ce983c62b0eb999fdb552afba033d448ff5c3d85
parent 68ae920c658ac6937a4ae857197581e3779065b0 (diff)
Code cleanup, language auto-detection (for text) and additional comments.
Language auto-detection implemented. Changed constructor default to "auto" and allowed language to be set to "null" for continuity between the synthesiser and recognizer classes.
-rw-r--r-- src/com/darkprograms/speech/synthesiser/Synthesiser.java 403
1 files changed, 254 insertions, 149 deletions
diff --git a/src/com/darkprograms/speech/synthesiser/Synthesiser.java b/src/com/darkprograms/speech/synthesiser/Synthesiser.java
index 8ed83db..8b2a67f 100644
--- a/src/com/darkprograms/speech/synthesiser/Synthesiser.java
+++ b/src/com/darkprograms/speech/synthesiser/Synthesiser.java
@@ -1,165 +1,270 @@
package com.darkprograms.speech.synthesiser;
import java.io.InputStream;
+import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
+
/**
- * Synthesiser class that connects to Google's unoffical API to retreive data
+ * Synthesiser class that connects to Google's unofficial API to retrieve data
*
* @author Luke Kuza, Aaron Gokaslan (Skylion)
*/
public class Synthesiser {
- /**
- * URL to query for Google synthesiser
- */
- private final static String GOOGLE_SYNTHESISER_URL = "http://translate.google.com/translate_tts?tl=";
-
- private String languageCode; //Specifies the language you want the voice to speak in.
-
- //Languages
- public static final String LANG_AU_ENGLISH = "en-AU";
- public static final String LANG_US_ENGLISH = "en-US";
- public static final String LANG_UK_ENGLISH = "en-GB";
- public static final String LANG_ES_SPANISH = "es";
- public static final String LANG_FR_FRENCH = "fr";
- public static final String LANG_DE_GERMAN = "de";
- //Please add on more regional languages as you find them. Also try to include the accent code if you can can.
-
- /**
- * Constructor
- */
- public Synthesiser() {
- languageCode = "en-US"; //Defaults to English (United States)
- }
-
- /**
- * Overloaded Constructor that takes Language Code parameter
- *
- */
- public Synthesiser(String languageCode){
- this.languageCode = languageCode;
- }
-
- /**
- * Returns the current language code for the Synthesiser.
- * @return the current language code
- */
- public String getLanguageCode(){
- return languageCode;
- }
-
- /**
- * Example: English(Generic) = en, English (US) = en-US, English (UK) = en-GB. and Spanish = es;
- * @param languageCode The language code you would like to modify languageCode to.
- */
- public void setLanguage(String languageCode){
- this.languageCode = languageCode;
- }
-
- /**
- * Gets an input stream to MP3 data for the returned information from a request
- *
- * @param synthText Text you want to be synthesized into MP3 data
- * @return Returns an input stream of the MP3 data that is returned from Google
- * @throws Exception Throws exception if it can not complete the request
- */
- public InputStream getMP3Data(String synthText) throws Exception {
-
- if(synthText.length()>99){
- List<String> fragments = stringParser(synthText);
- return getMP3Data(fragments);
- }
-
- String encoded = URLEncoder.encode(synthText, "UTF-8"); //Encode
-
- URL url = new URL(GOOGLE_SYNTHESISER_URL + languageCode + "&q=" + encoded); //create url
-
- // Open New URL connection channel.
- URLConnection urlConn = url.openConnection(); //Open connection
-
-
- urlConn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0) Gecko/20100101 Firefox/4.0"); //Adding header for user agent is required
-
- return urlConn.getInputStream();
- }
-
- /**
- * Gets an InputStream to MP3Data for the returned information from a request
- * @param synthText List of Strings you want to be synthesized into MP3 data
- * @return Returns an input stream of all the MP3 data that is returned from Google
- * @throws Exception Throws exception if it cannot complete the request
- */
- public InputStream getMP3Data(List<String> synthText) throws Exception{
- InputStream complete = getMP3Data(synthText.remove(0));
- for(String part: synthText){
- complete = new java.io.SequenceInputStream(complete, getMP3Data(part));//Concatenate with new MP3 Data
- }
- return complete;
- }
-
- /**
- * Separates a string into smaller parts so that Google will not reject the request.
- * @param input The string you want to separate
- * @return A List<String> of the String fragments from your input..
- */
- private List<String> stringParser(String input){
- return stringParser(input, new ArrayList<String>());
- }
-
- /**
- * Separates a string into smaller parts so that Google will not reject the request.
- * @param input The string you want to break up into smaller parts
- * @param fragments List<String> that you want to add stuff too.
- * If you don't have a List<String> already constructed "new ArrayList<String>()" works well.
- * @return A list of the fragments of the original String
- */
- private List<String> stringParser(String input, List<String> fragments){
- if(input.length()<100){//Base Case
- fragments.add(input);
- return fragments;
- }
- else{
- int space = findLastWord(input);//Checks if a space exists
- if(space<0){
- fragments.add(input.substring(0,99));//In case you sent gibberish to Google.
- return stringParser(input.substring(99), fragments);
- }else{
- fragments.add(input.substring(0,space));//Otherwise, adds the last word to the list for recursion.
- return stringParser(input.substring(space), fragments);
- }
- }
- }
-
- /**
- * Finds the last word in your String (before the index of 99) by searching for spaces and ending punctuation.
- * @param input The String you want to search through.
- * @return The index of where the last word of the String ends before the index of 99.
- */
- private int findLastWord(String input){
- if(input.length()<100)
- return input.length();
- for(int i = 99; i>=0; i--){
- char tmp = input.charAt(i);
- if(isEndingPunctuation(tmp)){
- return i;
- }
- }
- return -1;
- }
-
- /**
- * Checks if char is an ending character
- * Ending punctuation for all languages according to Wikipedia (Except for Sanskrit non-unicode)
- * @param The char you want check
- * @return True if it is, false if not.
- */
- private boolean isEndingPunctuation(char input){
- return input == ' ' || input == '.' || input == '!' || input == '?' || input == ';' || input == ':'
- || input == '|';
- }
+ /**
+ * URL to query for Google synthesiser. The language code is appended directly after the
+ * trailing "tl=" and the URL-encoded text is then passed as the "q" parameter.
+ */
+ private final static String GOOGLE_SYNTHESISER_URL = "http://translate.google.com/translate_tts?tl=";
+
+ /**
+ * URL to query for Google Auto Detection (source language "auto"). The URL-encoded text is
+ * appended directly after the trailing "text=".
+ */
+ private final static String GOOGLE_AUTODETECT_URL = "http://translate.google.com/translate_a/t?client=t&sl=auto&text=";
+
+ /**
+ * Language code of the text you want to synthesise. A value of null, an empty string or
+ * "auto" (case-insensitive) makes getMP3Data(String) detect the language from the text.
+ */
+ private String languageCode;
+
+ /**
+ * LANG_XX_XXXX constants are ready-made language codes that can be used as the languageCode.
+ */
+ public static final String LANG_AU_ENGLISH = "en-AU";
+ public static final String LANG_US_ENGLISH = "en-US";
+ public static final String LANG_UK_ENGLISH = "en-GB";
+ public static final String LANG_ES_SPANISH = "es";
+ public static final String LANG_FR_FRENCH = "fr";
+ public static final String LANG_DE_GERMAN = "de";
+ //Please add more regional languages as you find them. Also try to include the accent code if you can.
+
+ /**
+ * Constructor. Sets the language code to "auto", so the language is detected from the
+ * text whenever getMP3Data(String) is called.
+ */
+ public Synthesiser() {
+ languageCode = "auto";
+ }
+
+ /**
+ * Constructor that takes a language code parameter.
+ * @param languageCode The language code to use, stored as given. Specify "auto" (or null)
+ * for language auto-detection.
+ */
+ public Synthesiser(String languageCode){
+ this.languageCode = languageCode;
+ }
+
+ /**
+ * Returns the current language code for the Synthesiser.
+ * Example: English (Generic) = en, English (US) = en-US, English (UK) = en-GB and Spanish = es.
+ * The stored value is returned as-is, so it may be "auto" or null when auto-detection is in use.
+ * NOTE(review): this commit replaces the former getLanguageCode() with this name; callers of
+ * the old method must be updated.
+ * @return the current language code
+ */
+ public String getLanguage(){
+ return languageCode;
+ }
+
+ /**
+ * Sets the language code used for synthesis; the value is stored without validation.
+ * Note: set language to "auto" to enable automatic language detection.
+ * Setting it to null or to an empty string also enables Google's automatic language detection.
+ * @param languageCode The language code you would like to modify languageCode to.
+ */
+ public void setLanguage(String languageCode){
+
+ this.languageCode = languageCode;
+ }
+
+ /**
+ * Gets an input stream to MP3 data for the returned information from a request.
+ * If the configured language is null, empty or "auto", the language is first detected from
+ * the text; if detection throws, the stack trace is printed and "en-us" is used instead.
+ * Text longer than 100 characters is split into fragments and handed to getMP3Data(List).
+ * The returned stream is not closed by this method.
+ *
+ * NOTE(review): when the text is split, the language resolved here is only a local variable
+ * and is not passed on; every fragment call re-checks this.languageCode, so in auto mode the
+ * detection request is repeated for each fragment and may yield different languages.
+ * NOTE(review): detectLanguage can return null (no match in the response); the string
+ * concatenation below would then send the literal text "null" as the language.
+ *
+ * @param synthText Text you want to be synthesized into MP3 data
+ * @return Returns an input stream of the MP3 data that is returned from Google
+ * @throws Exception Throws exception if it can not complete the request
+ */
+ public InputStream getMP3Data(String synthText) throws Exception {
+
+ String languageCode = this.languageCode;//Local copy, so the field keeps its "auto" setting after detection
+
+ if(languageCode == null || languageCode.equals("") || languageCode.equalsIgnoreCase("auto")){
+ try{
+ languageCode = detectLanguage(synthText);//Detects language
+ }
+ catch(Exception ex){
+ ex.printStackTrace();
+ languageCode = "en-us";//Reverts to Default Language if it can't detect it.
+ }
+ }
+
+ if(synthText.length()>100){
+ List<String> fragments = parseString(synthText);//Splits the String into fragments if too long
+ return getMP3Data(fragments);
+ }
+
+ String encoded = URLEncoder.encode(synthText, "UTF-8"); //Encode
+
+ URL url = new URL(GOOGLE_SYNTHESISER_URL + languageCode + "&q=" + encoded); //create url
+
+ // Open New URL connection channel.
+ URLConnection urlConn = url.openConnection(); //Open connection
+
+
+ urlConn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0) Gecko/20100101 Firefox/4.0"); //Adding header for user agent is required
+
+ return urlConn.getInputStream();
+ }
+
+ /**
+ * Gets an InputStream to MP3Data for the returned information from a request.
+ * Each string is synthesised with getMP3Data(String) and the resulting streams are chained,
+ * in list order, with java.io.SequenceInputStream.
+ *
+ * NOTE(review): the first element is taken with synthText.remove(0), so the caller's list is
+ * modified, and an empty list makes that call fail (IndexOutOfBoundsException for an ArrayList).
+ * NOTE(review): every fragment is requested inside this method, before the caller has read
+ * any data from the returned stream.
+ *
+ * @param synthText List of Strings you want to be synthesized into MP3 data
+ * @return Returns an input stream of all the MP3 data that is returned from Google
+ * @throws Exception Throws exception if it cannot complete the request
+ */
+ public InputStream getMP3Data(List<String> synthText) throws Exception{
+ InputStream complete = getMP3Data(synthText.remove(0));
+ for(String part: synthText){
+ complete = new java.io.SequenceInputStream(complete, getMP3Data(part));//Concatenate with new MP3 Data
+ }
+ return complete;
+ }
+
+ /**
+ * Separates a string into smaller parts so that Google will not reject the request.
+ * Convenience overload that starts the split with a new, empty ArrayList.
+ * @param input The string you want to separate
+ * @return A List<String> of the String fragments from your input.
+ */
+ private List<String> parseString(String input){
+ return parseString (input, new ArrayList<String>());
+ }
+
+ /**
+ * Separates a string into smaller parts so that Google will not reject the request.
+ * Works recursively: input of at most 100 characters is added as the final fragment;
+ * longer input is cut at the index given by findLastWord, or after 100 characters when
+ * findLastWord reports no break character, and the remainder is processed the same way.
+ *
+ * NOTE(review): the character at the cut index (punctuation or space) is excluded from the
+ * current fragment and becomes the first character of the next one.
+ * NOTE(review): if findLastWord returns 0, an empty fragment is added and the recursion is
+ * called again with the unchanged input, so it never terminates.
+ *
+ * @param input The string you want to break up into smaller parts
+ * @param fragments List<String> that the fragments are added to.
+ * If you don't have a List<String> already constructed "new ArrayList<String>()" works well.
+ * @return A list of the fragments of the original String (the same list passed as fragments)
+ */
+ private List<String> parseString(String input, List<String> fragments){
+ if(input.length()<=100){//Base Case
+ fragments.add(input);
+ return fragments;
+ }
+ else{
+ int lastWord = findLastWord(input);//Index to cut at, or -1 if no break character was found
+ if(lastWord<0){
+ fragments.add(input.substring(0,100));//In case you sent gibberish to Google.
+ return parseString(input.substring(100), fragments);
+ }else{
+ fragments.add(input.substring(0,lastWord));//Cut at the break character; it stays at the start of the remainder.
+ return parseString(input.substring(lastWord), fragments);
+ }
+ }
+ }
+
+ /**
+ * Finds where to cut your String within its first 100 characters (indices 0 to 99).
+ * Scans backwards from index 99 and returns the index of the first ending punctuation it meets;
+ * this is preferred over spaces to alleviate mid-sentence pausing. If there is no ending
+ * punctuation, the index of the last space at or before index 99 is returned instead.
+ * Input shorter than 100 characters is not scanned; its length is returned.
+ *
+ * NOTE(review): any punctuation in the range wins over a later space, so punctuation near
+ * the start (even at index 0) produces a very short or empty leading part.
+ *
+ * @param input The String you want to search through.
+ * @return The index to cut at, input.length() for input shorter than 100 characters,
+ * or -1 if the first 100 characters contain neither ending punctuation nor a space.
+ */
+ private int findLastWord(String input){
+ if(input.length()<100)
+ return input.length();
+ int space = -1;
+ for(int i = 99; i>=0; i--){
+ char tmp = input.charAt(i);
+ if(isEndingPunctuation(tmp)){
+ return i;
+ }
+ if(space==-1 && tmp == ' '){
+ space = i;
+ }
+ }
+ if(space>=0){
+ return space;
+ }
+ return -1;
+ }
+
+ /**
+ * Checks if char is an ending character.
+ * The characters treated as ending punctuation are . ! ? ; : and |
+ * A space is not treated as ending punctuation.
+ * @param input The char you want to check
+ * @return True if it is, false if not.
+ */
+ private boolean isEndingPunctuation(char input){
+ return input == '.' || input == '!' || input == '?' || input == ';' || input == ':' || input == '|';
+ }
+
+ /**
+ * Automatically determines the language of the original text.
+ * Text longer than 99 characters is first shortened to the cut index given by findLastWord.
+ * The text is then URL-encoded, sent to the auto-detection URL with a browser User-Agent
+ * header, and the response is searched for a language code by parseRawData.
+ *
+ * NOTE(review): findLastWord returns -1 when the first 100 characters contain no punctuation
+ * or space, and substring(0, -1) then throws StringIndexOutOfBoundsException; a result of 0
+ * sends an empty text.
+ *
+ * @param text represents the text you want to check the language of
+ * @return the languageCode found in the response, or null if parseRawData finds none
+ * @throws Exception if it cannot complete the request
+ */
+ public String detectLanguage(String text) throws Exception{
+
+ //GOOGLE rejects requests that are longer
+ if(text.length()>99){
+ text = text.substring(0,findLastWord(text));//We don't need the whole text to determine language
+ }
+ String encoded = URLEncoder.encode(text, "UTF-8"); //Encode
+ URL url = new URL(GOOGLE_AUTODETECT_URL + encoded); //Generates URL
+ URLConnection urlConn = url.openConnection(); //Open connection
+ urlConn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0) Gecko/20100101 Firefox/4.0"); //Adding header for user agent is required
+ String rawData = urlToText(urlConn);//Gets text from Google
+ return parseRawData(rawData);
+ }
+
+ /**
+ * Converts a URL Connection to Text.
+ * Reads the connection's input stream one character at a time until the end of the stream,
+ * prints the complete response to System.out and returns it.
+ *
+ * NOTE(review): the InputStreamReader is created without a charset, so the platform default
+ * is used for decoding, and the reader is never closed.
+ * NOTE(review): the System.out.println looks like leftover debug output.
+ *
+ * @param urlConn The Open URLConnection that you want to generate a String from
+ * @return The generated String
+ * @throws Exception if it cannot complete the request
+ */
+ private String urlToText(URLConnection urlConn) throws Exception{
+ Reader r = new java.io.InputStreamReader(urlConn.getInputStream());//Wraps the response stream in a character reader
+ StringBuilder buf = new StringBuilder();
+ while (true) {
+ int ch = r.read();
+ if (ch < 0)//read() returns a negative value at the end of the stream
+ break;
+ buf.append((char) ch);
+ }
+ String str = buf.toString();
+ System.out.println(str);
+ return str;
+ }
+
+ /**
+ * Searches rawData for a language code.
+ * Returns the first two characters that sit between two double quotes exactly three
+ * positions apart and that are both letters. Longer codes such as "en-US" do not match.
+ *
+ * NOTE(review): the loop condition lets i+3 equal rawData.length(); if the character at such
+ * an i is a double quote, charAt(i+3) throws StringIndexOutOfBoundsException.
+ *
+ * @param rawData the raw String directly from Google you want to search through
+ * @return The two-letter language parsed from the rawData, or null if none is found.
+ */
+ private String parseRawData(String rawData){
+ for(int i = 0; i+3<=rawData.length(); i++){
+ if(rawData.charAt(i)=='"' && rawData.charAt(i+3)=='"'){
+ String possible = rawData.substring(i+1,i+3);
+ if(containsLettersOnly(possible)){//Required due to Google's inconsistent formatting.
+ return possible;
+ }
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Checks if all characters in text are letters, as determined by Character.isLetter.
+ * An empty text contains no non-letter and therefore yields true.
+ * @param text The text you want to determine the validity of.
+ * @return True if all characters are letters, otherwise false.
+ */
+ private boolean containsLettersOnly(String text){
+ for(int i = 0; i<text.length(); i++){
+ if(!Character.isLetter(text.charAt(i))){
+ return false;//Stops at the first non-letter
+ }
+ }
+ return true;
+ }
}
+