-rw-r--r-- | TBXTools.py | 3959 |
1 files changed, 3959 insertions, 0 deletions
diff --git a/TBXTools.py b/TBXTools.py new file mode 100644 index 0000000..3005340 --- /dev/null +++ b/TBXTools.py @@ -0,0 +1,3959 @@ +# TBXTools +# version: 2022/05/05 +# Copyright: Antoni Oliver (2022) - Universitat Oberta de Catalunya - aoliverg@uoc.edu +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import os +import codecs +import sqlite3 +import xml.etree.cElementTree as etree + +import nltk +from nltk.util import ngrams +from nltk.probability import FreqDist +from nltk.collocations import * +import re +import pickle +import gzip +import operator +import sys +import math +import csv + +import string + +import importlib + +import gensim +from gensim.models import Word2Vec +from gensim.models import KeyedVectors +import numpy +import collections +import numpy as np + +import time + +try: + import spacy +except: + pass + +try: + import spacy_udpipe +except: + pass +import subprocess +import openpyxl +from openpyxl import load_workbook + +class TBXTools: + '''Class for automatic terminology extraction and terminology management.''' + def __init__(self): + self.maxinserts=10000 #controls the maximum number of inserts in memory + self.sl_lang="" + self.tl_lang="" + self.max_id_corpus=0 + + self.sl_stopwords=[] + self.tl_stopwords=[] + self.sl_inner_stopwords=[] + self.tl_inner_stopwords=[] + self.sl_exclsions_regexps=[] + self.tl_exclusion_regexps=[] + self.sl_morphonorm_rules=[] + self.tl_morphonorm_rules=[] + self.evaluation_terms={} + self.tsr_terms=[] + self.exclusion_terms={} + self.exclusion_no_terms={} + self.ngrams={} + self.tagged_ngrams={} + self.term_candidates={} + self.linguistic_patterns={} + + self.knownterms=[] + self.n_min=1 + self.n_max=5 + + self.n_min_pos_patterns=1000 + self.n_max_pos_patterns=1 + + self.punctuation=string.punctuation + self.sl_stopwords.extend(self.punctuation) + self.tl_stopwords.extend(self.punctuation) + self.sl_inner_stopwords.extend(self.punctuation) + self.tl_inner_stopwords.extend(self.punctuation) + + self.specificSLtokenizer=False + self.specificTLtokenizer=False + + self.SLtokenizer=None + self.TLtokenizer=None + + + + + def create_project(self,project_name,sllang=None, tllang=None,overwrite=False): + '''Opens a project. If the project already exists, it raises an exception. To avoid the exception use overwrite=True. To open existing projects, use the open_project method.''' + #sllang and tllang are not longer used. 
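A minimal usage sketch for creating a project and reopening it later (the import path and the file name terminology.db are illustrative assumptions, not taken from the code):

    from TBXTools import TBXTools

    extractor = TBXTools()
    # Create a fresh project database; overwrite=True removes an existing file first.
    extractor.create_project("terminology.db", overwrite=True)
    # In a later session, reconnect to the same project.
    extractor.open_project("terminology.db")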
+ if os.path.isfile(project_name) and not overwrite: + raise Exception("This file already exists") + + else: + if os.path.isfile(project_name) and overwrite: + os.remove(project_name) + self.conn=sqlite3.connect(project_name) + self.cur = self.conn.cursor() + self.cur2 = self.conn.cursor() + with self.conn: + self.cur = self.conn.cursor() + self.cur.execute("CREATE TABLE sl_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)") + self.cur.execute("CREATE TABLE tl_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)") + self.cur.execute("CREATE TABLE parallel_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, segmentSL, segmentTL TEXT)") + self.cur.execute("CREATE TABLE tagged_parallel_corpus(id INTEGER PRIMARY KEY, tagged_segmentSL, tagged_segmentTL TEXT)") + self.cur.execute("CREATE TABLE sl_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)") + self.cur.execute("CREATE TABLE tl_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)") + self.cur.execute("CREATE TABLE sl_tagged_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)") + self.cur.execute("CREATE TABLE tl_tagged_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)") + self.cur.execute("CREATE TABLE sl_tagged_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)") + self.cur.execute("CREATE TABLE tl_tagged_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)") + self.cur.execute("CREATE TABLE sl_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_stopword TEXT)") + self.cur.execute("CREATE TABLE sl_inner_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_inner_stopword TEXT)") + self.cur.execute("CREATE TABLE tl_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_stopword TEXT)") + self.cur.execute("CREATE TABLE tl_inner_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_inner_stopword TEXT)") + self.cur.execute("CREATE TABLE sl_exclusion_regexps (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_exclusion_regexp TEXT)") + self.cur.execute("CREATE TABLE tl_exclusion_regexps (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_exclusion_regexp TEXT)") + self.cur.execute("CREATE TABLE sl_morphonorm_rules (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_morphonorm_rule TEXT)") + self.cur.execute("CREATE TABLE tl_morphonorm_rules (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_morphonorm_rule TEXT)") + self.cur.execute("CREATE TABLE evaluation_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)") + self.cur.execute("CREATE TABLE reference_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)") + self.cur.execute("CREATE TABLE validated_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)") + self.cur.execute("CREATE TABLE compoundify_terms_sl (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)") + self.cur.execute("CREATE TABLE compoundify_terms_tl (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)") + self.cur.execute("CREATE TABLE tsr_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)") + self.cur.execute("CREATE TABLE tosearch_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)") + self.cur.execute("CREATE TABLE exclusion_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)") + self.cur.execute("CREATE TABLE exclusion_noterms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)") + self.cur.execute("CREATE TABLE tokens (id INTEGER PRIMARY KEY AUTOINCREMENT, token TEXT, frequency INTEGER)") + self.cur.execute("CREATE TABLE ngrams (id INTEGER PRIMARY KEY AUTOINCREMENT, ngram TEXT, n 
INTEGER, frequency INTEGER)") + self.cur.execute("CREATE TABLE tagged_ngrams (id INTEGER PRIMARY KEY AUTOINCREMENT, ngram TEXT, tagged_ngram TEXT, n INTEGER, frequency INTEGER)") + + self.cur.execute("CREATE INDEX indextaggedngram on tagged_ngrams (ngram);") + + self.cur.execute("CREATE TABLE embeddings_sl (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, embedding BLOB)") + self.cur.execute("CREATE INDEX indexembeddings_sl on embeddings_sl (candidate);") + + self.cur.execute("CREATE TABLE embeddings_sl_ref (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, embedding BLOB)") + self.cur.execute("CREATE INDEX indexembeddings_sl_ref on embeddings_sl_ref (candidate);") + + self.cur.execute("CREATE TABLE embeddings_tl (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, embedding BLOB)") + self.cur.execute("CREATE INDEX indexembeddings_tl on embeddings_tl (candidate);") + + self.cur.execute("CREATE TABLE term_candidates (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, n INTEGER, frequency INTEGER, measure TEXT, value FLOAT)") + self.cur.execute("CREATE TABLE index_pt(id INTEGER PRIMARY KEY AUTOINCREMENT, source TEXT, target TEXT, probability FLOAT)") + self.cur.execute("CREATE INDEX index_index_pt on index_pt (source);") + self.cur.execute("CREATE TABLE linguistic_patterns (id INTEGER PRIMARY KEY AUTOINCREMENT, linguistic_pattern TEXT)") + + self.conn.commit() + + def open_project(self,project_name): + '''Opens an existing project. If the project doesn't exist it raises an exception.''' + if not os.path.isfile(project_name): + raise Exception("Project not found") + else: + self.conn=sqlite3.connect(project_name) + self.cur = self.conn.cursor() + self.cur2 = self.conn.cursor() + + + #METODES DELETES + def delete_configuration(self): + '''Deletes the project configuration.''' + with self.conn: + self.cur.execute('DELETE FROM configuration') + self.conn.commit() + + def delete_sl_corpus(self): + '''Deletes de source language corpus.''' + with self.conn: + self.cur.execute('DELETE FROM sl_corpus') + self.conn.commit() + + def delete_tl_corpus(self): + '''Deletes de target language corpus.''' + with self.conn: + self.cur.execute('DELETE FROM tl_corpus') + self.conn.commit() + + def delete_parallel_corpus(self): + '''Deletes de target language corpus.''' + with self.conn: + self.cur.execute('DELETE FROM parallel_corpus') + self.conn.commit() + + def delete_sl_corpus_c(self): + '''Deletes de source language contrast corpus.''' + with self.conn: + self.cur.execute('DELETE FROM sl_corpus_c') + self.conn.commit() + + def delete_tl_corpus_c(self): + '''Deletes de target language contrast corpus.''' + with self.conn: + self.cur.execute('DELETE FROM tl_corpus_c') + self.conn.commit() + + def delete_sl_tagged_corpus(self): + '''Deletes the source language tagged corpus.''' + with self.conn: + self.cur.execute('DELETE FROM sl_tagged_corpus') + self.conn.commit() + + def delete_tl_tagged_corpus(self): + '''Deletes the target language tagged corpus.''' + with self.conn: + self.cur.execute('DELETE FROM tl_tagged_corpus') + self.conn.commit() + + def delete_sl_tagged_corpus_c(self): + '''Deletes the source language contrast tagged corpus.''' + with self.conn: + self.cur.execute('DELETE FROM sl_tagged_corpus_c') + self.conn.commit() + + def delete_tl_tagged_corpus_c(self): + '''Deletes the target language contrast tagged corpus.''' + with self.conn: + self.cur.execute('DELETE FROM tl_tagged_corpus_c') + self.conn.commit() + + def delete_sl_stopwords(self): + '''Deletes the stop-words for the 
source language.''' + #self.sl_stopwords=[] + with self.conn: + self.cur.execute('DELETE FROM sl_stopwords') + self.conn.commit() + + def delete_tl_stopwords(self): + '''Deletes the stop-words fot the target language.''' + #self.tl_stopwords=[] + with self.conn: + self.cur.execute('DELETE FROM tl_stopwords') + self.conn.commit() + + def delete_sl_inner_stopwords(self): + '''Deletes the inner stop-words for the source language.''' + #self.sl_inner_stopwords=[] + with self.conn: + self.cur.execute('DELETE FROM sl_inner_stopwords') + self.conn.commit() + + def delete_tl_inner_stopwords(self): + '''Deletes the innter stop-words for the target language.''' + #self.tl_inner_stopwords=[] + with self.conn: + self.cur.execute('DELETE FROM tl_inner_stopwords') + self.conn.commit() + + def delete_sl_exclusion_regexps(self): + '''Deletes the exclusion regular expressions for the source language.''' + #self.sl_exclusion_regexps=[] + with self.conn: + self.cur.execute('DELETE FROM sl_exclusion_regexps') + self.conn.commit() + + def delete_tl_exclusion_regexps(self): + '''Deletes the exclusion regular expressions for the target language.''' + #self.tl_exclusion_regexps=[] + with self.conn: + self.cur.execute('DELETE FROM tl_exclusion_regexps') + self.conn.commit() + + def delete_sl_morphonorm_rules(self): + '''Deletes the morphological normalisation rules for the source language.''' + #self.sl_morphonorm_rules=[] + with self.conn: + self.cur.execute('DELETE FROM sl_morphonorm_rules') + self.conn.commit() + + def delete_tl_morphonorm_rules(self): + '''Deletes the morphological normalisation rules for the target language.''' + #self.tl_morphonorm_rules=[] + with self.conn: + self.cur.execute('DELETE FROM tl_morphonorm_rules') + self.conn.commit() + + def delete_evaluation_terms(self): + '''Deletes the evaluation terms.''' + #self.evaluation_terms={} + with self.conn: + self.cur.execute('DELETE FROM evaluation_terms') + self.conn.commit() + + def delete_reference_terms(self): + '''Deletes the reference terms.''' + #self.evaluation_terms={} + with self.conn: + self.cur.execute('DELETE FROM reference_terms') + self.conn.commit() + + def delete_validated_terms(self): + '''Deletes the validated terms.''' + #self.evaluation_terms={} + with self.conn: + self.cur.execute('DELETE FROM validated_terms') + self.conn.commit() + + def delete_compoundify_terms_sl(self): + '''Deletes the compoundify terms for the source language.''' + #self.exclusion_terms={} + with self.conn: + self.cur.execute('DELETE FROM compoundify_terms_sl') + self.conn.commit() + + def delete_compoundify_terms_tl(self): + '''Deletes the compoundify terms for the target language.''' + #self.exclusion_terms={} + with self.conn: + self.cur.execute('DELETE FROM compoundify_terms_sl') + self.conn.commit() + + def delete_tsr_terms(self): + '''Deletes the TSR terms.''' + #self.exclusion_terms={} + with self.conn: + self.cur.execute('DELETE FROM tsr_terms') + self.conn.commit() + + def delete_exclusion_terms(self): + '''Deletes the exclusion terms.''' + #self.exclusion_terms={} + with self.conn: + self.cur.execute('DELETE FROM exclusion_terms') + self.conn.commit() + + def delete_exclusion_no_terms(self): + '''Deletes the exclusion no terms.''' + #self.exclusion_terms={} + with self.conn: + self.cur.execute('DELETE FROM exclusion_no_terms') + self.conn.commit() + + def delete_tokens(self): + '''Deletes the tokens.''' + #self.ngrams={} + with self.conn: + self.cur.execute('DELETE FROM tokens') + self.conn.commit() + + def delete_ngrams(self): + '''Deletes 
the ngrams.''' + #self.ngrams={} + with self.conn: + self.cur.execute('DELETE FROM ngrams') + self.conn.commit() + + def delete_tagged_ngrams(self): + '''Deletes the tagged ngrams.''' + #self.tagged_ngrams={} + with self.conn: + self.cur.execute('DELETE FROM tagged_ngrams') + self.conn.commit() + + def delete_embeddings_sl(self): + '''Deletes the embeddings for the source language.''' + #self.tagged_ngrams={} + with self.conn: + self.cur.execute('DELETE FROM embeddings_sl') + self.conn.commit() + + def delete_embeddings_tl(self): + '''Deletes the embeddings for the target language.''' + #self.tagged_ngrams={} + with self.conn: + self.cur.execute('DELETE FROM embeddings_tl') + self.conn.commit() + + def delete_term_candidates(self): + '''Deletes the term candidates.''' + #self.term_candidates={} + with self.conn: + self.cur.execute('DELETE FROM term_candidates') + self.conn.commit() + + def delete_linguistic_patterns(self): + '''Deletes the linguistic patterns for linguistic terminology extraction.''' + #self.exclusion_terms={} + with self.conn: + self.cur.execute('DELETE FROM linguistic_patterns') + self.conn.commit() + + def load_sl_corpus(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"): + '''Loads a monolingual corpus for the source language. It's recommended, but not compulsory, that the corpus is segmented (one segment per line). Use external tools to segment the corpus. A plain text corpus (not segmented), can be aslo used.''' + if compoundify: + compterms=[] + self.cur.execute('SELECT term from compoundify_terms_sl') + data=self.cur.fetchall() + for d in data: + compterms.append(d[0]) + cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore") + data=[] + continserts=0 + for line in cf: + record=[] + line=line.rstrip() + if compoundify: + for compterm in compterms: + if line.find(compterm)>=1: + comptermMOD=compterm.replace(" ",comp_symbol) + line=line.replace(compterm,comptermMOD) + record.append(line) + data.append(record) + continserts+=1 + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data) + self.conn.commit() + + def load_tl_corpus(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"): + '''Loads a monolingual corpus for the target language. It's recommended, but not compulsory, that the corpus is segmented (one segment per line). Use TBXTools external tools to segment the corpus. 
A plain text corpus (not segmented), can be aslo used.''' + + if compoundify: + compterms=[] + self.cur.execute('SELECT term from compoundify_terms_tl') + data=self.cur.fetchall() + for d in data: + compterms.append(d[0]) + cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore") + data=[] + continserts=0 + for line in cf: + record=[] + line=line.rstrip() + if compoundify: + for compterm in compterms: + if line.find(compterm)>=1: + comptermMOD=compterm.replace(" ",comp_symbol) + line=line.replace(compterm,comptermMOD) + record.append(line) + data.append(record) + continserts+=1 + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data) + self.conn.commit() + + def load_sl_corpus_c(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"): + '''Loads a monolingual contrast corpus for the source language. It's recommended, but not compulsory, that the corpus is segmented (one segment per line). Use external tools to segment the corpus. A plain text corpus (not segmented), can be aslo used.''' + if compoundify: + compterms=[] + self.cur.execute('SELECT term from compoundify_terms_sl') + data=self.cur.fetchall() + for d in data: + compterms.append(d[0]) + cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore") + data=[] + continserts=0 + for line in cf: + record=[] + line=line.rstrip() + if compoundify: + for compterm in compterms: + if line.find(compterm)>=1: + comptermMOD=compterm.replace(" ",comp_symbol) + line=line.replace(compterm,comptermMOD) + record.append(line) + data.append(record) + continserts+=1 + if continserts==self.maxinserts: + cur.executemany("INSERT INTO sl_corpus_c (segment) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO sl_corpus_c (segment) VALUES (?)",data) + self.conn.commit() + + def load_tl_corpus_c(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"): + '''Loads a monolingual contrast corpus for the target language. It's recommended, but not compulsory, that the corpus is segmented (one segment per line). Use TBXTools external tools to segment the corpus. A plain text corpus (not segmented), can be aslo used.''' + + if compoundify: + compterms=[] + self.cur.execute('SELECT term from compoundify_terms_tl') + data=self.cur.fetchall() + for d in data: + compterms.append(d[0]) + cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore") + data=[] + continserts=0 + for line in cf: + record=[] + line=line.rstrip() + if compoundify: + for compterm in compterms: + if line.find(compterm)>=1: + comptermMOD=compterm.replace(" ",comp_symbol) + line=line.replace(compterm,comptermMOD) + record.append(line) + data.append(record) + continserts+=1 + if continserts==self.maxinserts: + cur.executemany("INSERT INTO tl_corpus_c (segment) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO tl_corpus_c (segment) VALUES (?)",data) + self.conn.commit() + + def load_parallel_corpus_Moses(self,slcorpusfile, tlcorpusfile, feed_monolingual=True, encoding="utf-8"): + '''Loads a parallel corpus in Moses format (that is, in two independent files). 
It expects one segment per line.''' + slcf=codecs.open(slcorpusfile,"r",encoding=encoding) + tlcf=codecs.open(tlcorpusfile,"r",encoding=encoding) + parallel_data=[] + sl_data=[] + tl_data=[] + parallel_data=[] + continserts=0 + while 1: + sl_segment=slcf.readline() + if not sl_segment: + break + tl_segment=tlcf.readline() + continserts+=1 + sl_record=[] + tl_record=[] + parallel_record=[] + sl_segment=sl_segment.rstrip() + tl_segment=tl_segment.rstrip() + parallel_record.append(sl_segment) + parallel_record.append(tl_segment) + sl_record.append(sl_segment) + tl_record.append(tl_segment) + parallel_data.append(parallel_record) + sl_data.append(sl_record) + tl_data.append(tl_record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data) + if feed_monolingual: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data) + parallel_data=[] + sl_data=[] + tl_data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data) + if feed_monolingual: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data) + self.conn.commit() + + def load_parallel_corpus_tabtxt(self,corpusfile, feed_monolingual=True, reverse=False, encoding="utf-8"): + '''Loads a parallel corpus in tabbed text format (that is, in two independent files). It expects one segment per line.''' + cf=codecs.open(corpusfile,"r",encoding=encoding) + parallel_data=[] + sl_data=[] + tl_data=[] + parallel_data=[] + parallel_data_rev=[] + continserts=0 + for linia in cf: + linia=linia.rstrip() + camps=linia.split("\t") + if len(camps)>=2: + sl_segment=camps[0] + tl_segment=camps[1] + continserts+=1 + sl_record=[] + tl_record=[] + parallel_record=[] + parallel_record_rev=[] + sl_segment=sl_segment.rstrip() + tl_segment=tl_segment.rstrip() + parallel_record.append(sl_segment) + parallel_record.append(tl_segment) + parallel_record_rev.append(tl_segment) + parallel_record_rev.append(sl_segment) + sl_record.append(sl_segment) + tl_record.append(tl_segment) + parallel_data.append(parallel_record) + parallel_data_rev.append(parallel_record_rev) + sl_data.append(sl_record) + tl_data.append(tl_record) + if continserts==self.maxinserts: + if reverse: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data_rev) + else: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data) + if feed_monolingual: + if reverse: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",tl_data) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",sl_data) + else: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data) + parallel_data=[] + parallel_data_rev=[] + sl_data=[] + tl_data=[] + continserts=0 + with self.conn: + if reverse: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data_rev) + else: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data) + if feed_monolingual: + if reverse: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",tl_data) + self.cur.executemany("INSERT INTO tl_corpus 
(segment) VALUES (?)",sl_data) + else: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data) + self.conn.commit() + + def load_parallel_corpus_tmx(self,tmx_file, sl_code="", tl_code="", feed_monolingual=True): + '''Loads a parallel corpus from a TMX file. Source and target language codes should be given. The codes must be the exactly the same as in the TMX file. A list of codes separated by comma is allowed. ''' + continserts=0 + slcodes=[] + for slc in sl_code.split(","): + slcodes.append(slc.strip()) + tlcodes=[] + for tlc in tl_code.split(","): + tlcodes.append(tlc.strip()) + data1=[] + data2=[] + datap=[] + sl_segment="" + tl_segment="" + current_lang="" + for event, elem in etree.iterparse(tmx_file,events=("start","end")): + if event=='start': + if elem.tag=="tu" and not sl_segment=="" and not tl_segment=="": + continserts+=1 + + record1=[] + record2=[] + recordp=[] + record1.append(sl_segment) + data1.append(record1) + record2.append(tl_segment) + data2.append(record2) + recordp.append(sl_segment) + recordp.append(tl_segment) + datap.append(recordp) + sl_segment="" + tl_segment="" + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap) + if feed_monolingual: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2) + data1=[] + data2=[] + datap=[] + continserts=0 + self.conn.commit() + elif elem.tag=="tuv": + current_lang=elem.attrib['{http://www.w3.org/XML/1998/namespace}lang'] + elif elem.tag=="seg": + if elem.text: + segmentext=elem.text + else: + segmentext="" + if current_lang in slcodes: + sl_segment=segmentext + if current_lang in tlcodes: + tl_segment=segmentext + with self.conn: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap) + if feed_monolingual: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2) + self.conn.commit() + + def load_parallel_corpus_sdltm(self,sdltmfile, feed_monolingual=True): + '''Loads a parallel corpus from a SDLTM file.''' + + connSDLTM=sqlite3.connect(sdltmfile) + curSDLTM = connSDLTM.cursor() + curSDLTM.execute('select source_segment,target_segment from translation_units;') + dataSDLTM=curSDLTM.fetchall() + data1=[] + data2=[] + datap=[] + continserts=0 + for d in dataSDLTM: + ssxml=d[0] + tsxml=d[1] + record1=[] + record2=[] + recordp=[] + try: + rootSL = etree.fromstring(ssxml) + for text in rootSL.iter('Value'): + sltext="".join(text.itertext()).replace("\n"," ") + rootTL = etree.fromstring(tsxml) + for text in rootTL.iter('Value'): + tltext="".join(text.itertext()).replace("\n"," ") + if not sltext=="" and not tltext=="": + continserts+=1 + record1.append(sltext) + data1.append(record1) + record2.append(tltext) + data2.append(record2) + recordp.append(sltext) + recordp.append(tltext) + datap.append(recordp) + except: + print("ERROR") + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap) + if feed_monolingual: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2) + data1=[] + data2=[] + datap=[] + continserts=0 + self.conn.commit() + with self.conn: + 
self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap) + if feed_monolingual: + self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1) + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2) + self.conn.commit() + + + + def load_sl_tagged_corpus(self,corpusfile,format="TBXTools",encoding="utf-8"): + '''Loads the source language tagged corpus. 3 formats are allowed: + - TBXTools: The internal format used by TBXTools. One tagged segment per line. + f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn + - Freeling: One token per line and segments separated by blank lines + f1 l1 t1 p1 + f2 l2 t2 p2 + ... + fn ln tn pn + - Conll: One of the output formats guiven by the Standford Core NLP analyzer. On token per line and segments separated by blank lines + id1 f1 l1 t1 ... + id2 f2 l2 t2 ... + ... + idn fn ln tn ... + ''' + validformarts=["TBXTools","freeling","conll"] + #TODO: Raise exception if not a valid format. + cf=codecs.open(corpusfile,"r",encoding=encoding) + if format.lower()=="tbxtools": + data=[] + continserts=0 + for line in cf: + continserts+=1 + record=[] + line=line.rstrip() + record.append(line) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + + with self.conn: + self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="freeling": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[0]+"|"+camps[1]+"|"+camps[2] + + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="conll": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",self.data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[1]+"|"+camps[2]+"|"+camps[3] + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data) + self.conn.commit() + + def load_tl_tagged_corpus(self,corpusfile,format="TBXTools",encoding="utf-8"): + '''Loads the target language tagged corpus. 3 formats are allowed: + - TBXTools: The internal format used by TBXTools. One tagged segment per line. + f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn + - Freeling: One token per line and segments separated by blank lines + f1 l1 t1 p1 + f2 l2 t2 p2 + ... + fn ln tn pn + - Conll: One of the output formats guiven by the Standford Core NLP analyzer. On token per line and segments separated by blank lines + id1 f1 l1 t1 ... + id2 f2 l2 t2 ... + ... + idn fn ln tn ... + ''' + validformarts=["TBXTools","freeling","conll"] + #TODO: Raise exception if not a valid format. 
+ cf=codecs.open(corpusfile,"r",encoding=encoding) + if format.lower()=="tbxtools": + data=[] + continserts=0 + for line in cf: + continserts+=1 + record=[] + line=line.rstrip() + record.append(line) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + + with self.conn: + self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="freeling": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[0]+"|"+camps[1]+"|"+camps[2] + + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="conll": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",self.data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[1]+"|"+camps[2]+"|"+camps[3] + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data) + self.conn.commit() + + def load_sl_tagged_corpus_c(self,corpusfile,format="TBXTools",encoding="utf-8"): + '''Loads the source language tagged corpus. 3 formats are allowed: + - TBXTools: The internal format used by TBXTools. One tagged segment per line. + f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn + - Freeling: One token per line and segments separated by blank lines + f1 l1 t1 p1 + f2 l2 t2 p2 + ... + fn ln tn pn + - Conll: One of the output formats guiven by the Standford Core NLP analyzer. On token per line and segments separated by blank lines + id1 f1 l1 t1 ... + id2 f2 l2 t2 ... + ... + idn fn ln tn ... + ''' + validformarts=["TBXTools","freeling","conll"] + #TODO: Raise exception if not a valid format. 
+ cf=codecs.open(corpusfile,"r",encoding=encoding) + if format.lower()=="tbxtools": + data=[] + continserts=0 + for line in cf: + continserts+=1 + record=[] + line=line.rstrip() + record.append(line) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + + with self.conn: + self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="freeling": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[0]+"|"+camps[1]+"|"+camps[2] + + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="conll": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",self.data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[1]+"|"+camps[2]+"|"+camps[3] + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + self.conn.commit() + + def load_tl_tagged_corpus_c(self,corpusfile,format="TBXTools",encoding="utf-8"): + '''Loads the target language tagged corpus. 3 formats are allowed: + - TBXTools: The internal format used by TBXTools. One tagged segment per line. + f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn + - Freeling: One token per line and segments separated by blank lines + f1 l1 t1 p1 + f2 l2 t2 p2 + ... + fn ln tn pn + - Conll: One of the output formats guiven by the Standford Core NLP analyzer. On token per line and segments separated by blank lines + id1 f1 l1 t1 ... + id2 f2 l2 t2 ... + ... + idn fn ln tn ... + ''' + validformarts=["TBXTools","freeling","conll"] + #TODO: Raise exception if not a valid format. 
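The contrast-corpus loaders follow the same pattern; a brief sketch under the same illustrative assumptions:

    from TBXTools import TBXTools

    extractor = TBXTools()
    extractor.open_project("terminology.db")
    # Contrast corpora for the source and target languages, here in CoNLL format.
    extractor.load_sl_tagged_corpus_c("contrast_sl.conll", format="conll")
    extractor.load_tl_tagged_corpus_c("contrast_tl.conll", format="conll")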
+ cf=codecs.open(corpusfile,"r",encoding=encoding) + if format.lower()=="tbxtools": + data=[] + continserts=0 + for line in cf: + continserts+=1 + record=[] + line=line.rstrip() + record.append(line) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + + with self.conn: + self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="freeling": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[0]+"|"+camps[1]+"|"+camps[2] + + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + self.conn.commit() + elif format.lower()=="conll": + data=[] + continserts=0 + segment=[] + for line in cf: + line=line.rstrip() + if line=="": + continserts+=1 + record=[] + record.append(" ".join(segment)) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",self.data) + data=[] + continserts=0 + data=[] + self.conn.commit() + segment=[] + + else: + camps=line.split() + token=camps[1]+"|"+camps[2]+"|"+camps[3] + segment.append(token) + with self.conn: + self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data) + self.conn.commit() + + + def load_sl_stopwords(self,fitxer,encoding="utf-8"): + '''Loads the stopwords for the source language.''' + fc=codecs.open(fitxer,"r",encoding) + data=[] + record=[] + while 1: + linia=fc.readline() + if not linia: + break + linia=linia.rstrip() + record.append(linia) + data.append(record) + record=[] + + for punct in self.punctuation: + record.append(punct) + data.append(record) + record=[] + with self.conn: + self.cur.executemany("INSERT INTO sl_stopwords (sl_stopword) VALUES (?)",data) + + def load_tl_stopwords(self,fitxer,encoding="utf-8"): + '''Loads the stopwords for the target language.''' + fc=codecs.open(fitxer,"r",encoding) + data=[] + record=[] + while 1: + linia=fc.readline() + if not linia: + break + linia=linia.rstrip() + record.append(linia) + data.append(record) + record=[] + + for punct in self.punctuation: + record.append(punct) + data.append(record) + record=[] + with self.conn: + self.cur.executemany("INSERT INTO tl_stopwords (tl_stopword) VALUES (?)",data) + + def load_sl_inner_stopwords(self,fitxer,encoding="utf-8"): + '''Loads the stopwords for the source language.''' + fc=codecs.open(fitxer,"r",encoding) + data=[] + record=[] + while 1: + linia=fc.readline() + if not linia: + break + linia=linia.rstrip() + record.append(linia) + data.append(record) + record=[] + for punct in self.punctuation: + record.append(punct) + data.append(record) + record=[] + with self.conn: + self.cur.executemany("INSERT INTO sl_inner_stopwords (sl_inner_stopword) VALUES (?)",data) + + def load_tl_inner_stopwords(self,fitxer,encoding="utf-8"): + '''Loads the inner stopwords for the target language.''' + fc=codecs.open(fitxer,"r",encoding) + data=[] + record=[] + while 1: + linia=fc.readline() + if not linia: + break + linia=linia.rstrip() 
+ record.append(linia) + data.append(record) + record=[] + for punct in self.punctuation: + record.append(punct) + data.append(record) + record=[] + with self.conn: + self.cur.executemany("INSERT INTO tl_inner_stopwords (tl_inner_stopword) VALUES (?)",data) + + #evaluation terms + def load_evaluation_terms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000): + '''Loads the evaluation terms from a tabulated text.''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + continserts=0 + for line in cf: + line=line.rstrip() + continserts+=1 + record=[] + line=line.rstrip() + camps=line.split("\t") + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(camps[0]).split() + else: + tokens=camps[0].split() + if len(camps)==1: + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + record.append("_") + data.append(record) + elif len(camps)>1: + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + record.append(camps[1]) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO evaluation_terms (sl_term,tl_term) VALUES (?,?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO evaluation_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + + def load_evaluation_terms_tbx(self,arxiu,sl_code="",tl_code="",encoding="utf-8",nmin=0,nmax=1000): + '''Loads the evaluation terms from a TBX file.''' + slcodes=[] + for slc in sl_code.split(","): + slcodes.append(slc.strip()) + tlcodes=[] + for tlc in tl_code.split(","): + tlcodes.append(tlc.strip()) + data=[] + slterm=[] + tlterm=[] + lang="" + for event, elem in etree.iterparse(arxiu,events=("start", "end")): + tag=elem.tag.replace(self.namespace(elem),"") + if event=="end" and tag in ["conceptEntry","termEntry"]: + if len(slterm)>0 and len(tlterm)>0: + record=[] + for slt in slterm: + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(slt).split() + else: + tokens=slt.split() + if len(tokens)>=nmin and len(tokens)<=nmax: + tlt=", ".join(tlterm) + record.append(slt) + record.append(tlt) + data.append(record) + record=[] + slterm=[] + tlterm=[] + elif event=="start" and tag=="langSec": + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in slcodes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in tlcodes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + elif event=="start" and tag=="term": + if lang in slcodes: slterm.append("".join(elem.itertext()).lstrip().rstrip()) + elif lang in tlcodes: tlterm.append("".join(elem.itertext()).lstrip().rstrip()) + self.cur.executemany("INSERT INTO evaluation_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + # + def load_validated_terms(self,terms): + """Load a list of tuples containig source-target terms).""" + data=[] + for tupleTerms in terms: + record=[] + slterm=tupleTerms[0] + tlterm=tupleTerms[1] + record.append(slterm) + record.append(tlterm) + data.append(record) + self.cur.executemany("INSERT INTO validated_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + def get_validated_terms(self): + self.cur.execute("SELECT sl_term, tl_term FROM validated_terms;") + validatedterms=[] + source_terms=[] + target_terms=[] + for s in self.cur.fetchall(): + record=[] + record.append(s[0]) + record.append(s[1]) + validatedterms.append(record) + return(validatedterms) + + #reference_terms + def 
load_reference_terms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000, reverse=False): + '''Loads the reference terms from a tabulated text.''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + continserts=0 + for line in cf: + line=line.rstrip() + continserts+=1 + record=[] + line=line.rstrip() + camps=line.split("\t") + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(camps[0]).split() + else: + tokens=camps[0].split() + if len(camps)==1: + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + record.append("_") + data.append(record) + elif len(camps)>1: + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + record.append(camps[1]) + data.append(record) + if continserts==self.maxinserts: + if reverse: + self.cur.executemany("INSERT INTO reference_terms (tl_term,sl_term) VALUES (?,?)",data) + else: + self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data) + data=[] + continserts=0 + with self.conn: + if reverse: + self.cur.executemany("INSERT INTO reference_terms (tl_term,sl_term) VALUES (?,?)",data) + else: + self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + + def load_reference_terms_tbx(self,arxiu,sl_code="",tl_code="",encoding="utf-8",nmin=0,nmax=1000): + '''Loads the evaluation terms from a TBX file.''' + slcodes=[] + for slc in sl_code.split(","): + slcodes.append(slc.strip()) + tlcodes=[] + for tlc in tl_code.split(","): + tlcodes.append(tlc.strip()) + data=[] + slterm=[] + tlterm=[] + lang="" + for event, elem in etree.iterparse(arxiu,events=("start", "end")): + tag=elem.tag.replace(self.namespace(elem),"") + if event=="end" and tag in ["conceptEntry","termEntry"]: + if len(slterm)>0 and len(tlterm)>0: + record=[] + for slt in slterm: + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(slt).split() + else: + tokens=slt.split() + if len(tokens)>=nmin and len(tokens)<=nmax: + tlt=", ".join(tlterm) + record.append(slt) + record.append(tlt) + data.append(record) + record=[] + slterm=[] + tlterm=[] + elif event=="start" and tag=="langSec": + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in slcodes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in tlcodes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + elif event=="start" and tag=="term": + if lang in slcodes: slterm.append("".join(elem.itertext()).lstrip().rstrip()) + elif lang in tlcodes: tlterm.append("".join(elem.itertext()).lstrip().rstrip()) + self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + + def load_reference_terms_csv(self,arxiu,encoding="utf-8",nmin=0,nmax=1000,CSVdelimiter=",",CSVquotechar=None,CSVescapechar=None,CSVSLTerm=1,CSVTLTerm=2): + csv_file=codecs.open(arxiu,"r",encoding=encoding) + csv_reader = csv.reader(csv_file, delimiter=",", quotechar=CSVquotechar, escapechar=CSVescapechar) + record=[] + data=[] + for row in csv_reader: + record.append(row[CSVSLTerm-1]) + record.append(row[CSVTLTerm-1]) + data.append(record) + record=[] + self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + def load_reference_terms_excel(self,file,nmin=0,nmax=1000,sheet_name=1,first_row=1,sourceColumn="A",targetColumn="B"): + workbook = load_workbook(filename=file) + data=[] + for sheet_name in workbook.sheetnames: + sheet = 
workbook[sheet_name] + for row in sheet.rows: + source="" + target="" + record=[] + for cell in row: + + if isinstance(cell, openpyxl.cell.cell.MergedCell): + # Skip this cell + continue + if cell.column_letter==sourceColumn: + source=cell.value + elif cell.column_letter==targetColumn: + target=cell.value + if not source=="" and not target=="": + record.append(source) + record.append(target) + data.append(record) + self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + + #compoundify_terms_sl + def load_compoundify_terms_sl_txt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000): + '''Loads the compoundify terms for the source language from a text file (one term per line).''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + continserts=0 + for line in cf: + line=line.rstrip() + continserts+=1 + record=[] + line=line.rstrip() + camps=line.split("\t") + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(camps[0]).split() + else: + tokens=camps[0].split() + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO compoundify_terms_sl (term) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO compoundify_terms_sl (term) VALUES (?)",data) + self.conn.commit() + + def load_compoundify_terms_sl_tbx(self,arxiu,code="",encoding="utf-8",nmin=0,nmax=1000): + '''Loads the compoundify terms for the source language from a TBX file.''' + codes=[] + for slc in code.split(","): + codes.append(slc.strip()) + data=[] + term=[] + lang="" + for event, elem in etree.iterparse(arxiu,events=("start", "end")): + tag=elem.tag.replace(self.namespace(elem),"") + if event=="end" and tag in ["conceptEntry","termEntry"]: + if len(term)>0 and lang in codes: + record=[] + for slt in term: + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(slt).split() + else: + tokens=slt.split() + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(slt) + data.append(record) + record=[] + term=[] + elif event=="start" and tag=="langSec": + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in codes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + else: + lang="" + elif event=="start" and tag=="term": + if lang in codes: + term.append("".join(elem.itertext()).lstrip().rstrip()) + self.cur.executemany("INSERT INTO compoundify_terms_sl (term) VALUES (?)",data) + self.conn.commit() + + #compoundify_terms_tl + def load_compoundify_terms_tl_txt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000): + '''Loads the compoundify terms for the target language from a text file (one term per line).''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + continserts=0 + for line in cf: + line=line.rstrip() + continserts+=1 + record=[] + line=line.rstrip() + camps=line.split("\t") + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(camps[0]).split() + else: + tokens=camps[0].split() + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO compoundify_terms_tl (term) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO compoundify_terms_tl (term) VALUES (?)",data) + self.conn.commit() + + def load_compoundify_terms_tl_tbx(self,arxiu,code="",encoding="utf-8",nmin=0,nmax=1000): + '''Loads the compoundify terms for the 
target language from a TBX file.''' + codes=[] + for slc in code.split(","): + codes.append(slc.strip()) + data=[] + term=[] + lang="" + for event, elem in etree.iterparse(arxiu,events=("start", "end")): + tag=elem.tag.replace(self.namespace(elem),"") + if event=="end" and tag in ["conceptEntry","termEntry"]: + if len(term)>0 and lang in codes: + record=[] + for slt in term: + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(slt).split() + else: + tokens=slt.split() + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(slt) + data.append(record) + record=[] + term=[] + elif event=="start" and tag=="langSec": + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in codes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + else: + lang="" + elif event=="start" and tag=="term": + if lang in codes: + term.append("".join(elem.itertext()).lstrip().rstrip()) + self.cur.executemany("INSERT INTO compoundify_terms_tl (term) VALUES (?)",data) + self.conn.commit() + + #tsr terms + + def load_tsr_terms_txt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000): + '''Loads the TSR terms from a text file (one term per line).''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + continserts=0 + for line in cf: + line=line.rstrip() + continserts+=1 + record=[] + line=line.rstrip() + camps=line.split("\t") + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(camps[0]).split() + else: + tokens=camps[0].split() + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tsr_terms (term) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO tsr_terms (term) VALUES (?)",data) + self.conn.commit() + + def load_tosearch_terms(self,SLterms,encoding="utf-8",nmin=0,nmax=1000): + '''Loads the TSR terms from a string, text file (one term per line) or list.''' + tofind=[] + if isinstance(SLterms, str): + if os.path.exists(SLterms): + entrada=codecs.open(SLterms) + for linia in entrada: + linia=linia.rstrip() + tofind.append(linia) + entrada.close() + else: + tofind.append(SLterms) + elif isinstance(SLterms, list): + tofind.extend(SLterms) + data=[] + continserts=0 + for term in tofind: + continserts+=1 + record=[] + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(term) + else: + tokens=term.split() + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(term) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO tosearch_terms (term) VALUES (?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO tosearch_terms (term) VALUES (?)",data) + self.conn.commit() + + def load_tsr_terms_tbx(self,arxiu,code="",encoding="utf-8",nmin=0,nmax=1000): + '''Loads the TSR terms from a TBX file.''' + codes=[] + for slc in code.split(","): + codes.append(slc.strip()) + data=[] + term=[] + lang="" + for event, elem in etree.iterparse(arxiu,events=("start", "end")): + tag=elem.tag.replace(self.namespace(elem),"") + if event=="end" and tag in ["conceptEntry","termEntry"]: + if len(term)>0 and lang in codes: + record=[] + for slt in term: + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(slt).split() + else: + tokens=slt.split() + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(slt) + data.append(record) + record=[] + term=[] + elif event=="start" and tag=="langSec": + if 
elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in codes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + else: + lang="" + elif event=="start" and tag=="term": + if lang in codes: + term.append("".join(elem.itertext()).lstrip().rstrip()) + self.cur.executemany("INSERT INTO tsr_terms (term) VALUES (?)",data) + self.conn.commit() + + #exclusion_terms + + def load_exclusion_terms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000): + '''Loads the exclusion terms from a tabulated text.''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + continserts=0 + for line in cf: + line=line.rstrip() + continserts+=1 + record=[] + line=line.rstrip() + camps=line.split("\t") + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(camps[0]).split() + else: + tokens=camps[0].split() + if len(camps)==1: + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + record.append("_") + data.append(record) + elif len(camps)>1: + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + record.append(camps[1]) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO exclusion_terms (sl_term,tl_term) VALUES (?,?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO exclusion_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + + def load_exclusion_terms_tbx(self,arxiu,sl_code="",tl_code="",encoding="utf-8",nmin=0,nmax=1000): + '''Loads the exclusion terms from a TBX file.''' + slcodes=[] + for slc in sl_code.split(","): + slcodes.append(slc.strip()) + tlcodes=[] + for tlc in tl_code.split(","): + tlcodes.append(tlc.strip()) + data=[] + slterm=[] + tlterm=[] + lang="" + for event, elem in etree.iterparse(arxiu,events=("start", "end")): + tag=elem.tag.replace(self.namespace(elem),"") + if event=="end" and tag in ["conceptEntry","termEntry"]: + if len(slterm)>0 and len(tlterm)>0: + record=[] + for slt in slterm: + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(slt).split() + else: + tokens=slt.split() + if len(tokens)>=nmin and len(tokens)<=nmax: + tlt=", ".join(tlterm) + record.append(slt) + record.append(tlt) + data.append(record) + record=[] + slterm=[] + tlterm=[] + elif event=="start" and tag=="langSec": + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in slcodes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in tlcodes: + lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] + elif event=="start" and tag=="term": + if lang in slcodes: slterm.append("".join(elem.itertext()).lstrip().rstrip()) + elif lang in tlcodes: tlterm.append("".join(elem.itertext()).lstrip().rstrip()) + self.cur.executemany("INSERT INTO exclusion_terms (sl_term,tl_term) VALUES (?,?)",data) + self.conn.commit() + + #EXCLUSION NO TERMS + def load_exclusion_noterms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000): + '''Loads the exclusion no terms from a tabulated text.''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + continserts=0 + for line in cf: + line=line.rstrip() + continserts+=1 + record=[] + line=line.rstrip() + camps=line.split("\t") + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(camps[0]).split() + else: + tokens=camps[0].split() + if len(camps)==1: + if len(tokens)>=nmin and len(tokens)<=nmax: + record.append(camps[0]) + record.append("_") + data.append(record) + elif len(camps)>1: + if len(tokens)>=nmin and 
len(tokens)<=nmax: + record.append(camps[0]) + record.append(camps[1]) + data.append(record) + if continserts==self.maxinserts: + self.cur.executemany("INSERT INTO exclusion_noterms (sl_term,tl_term) VALUES (?,?)",data) + data=[] + continserts=0 + with self.conn: + self.cur.executemany("INSERT INTO exclusion_noterms (sl_term,tl_term) VALUES (?,?)",data) + + + + def namespace(self,element): + m = re.match(r'\{.*\}', element.tag) + return m.group(0) if m else '' + + def find_translation_reference_terms(self,term): + self.cur.execute("SELECT tl_term FROM reference_terms where sl_term='"+str(term)+"'") + tlterms=[] + for self.s in self.cur.fetchall(): + tlterms.append(self.s[0]) + if len(tlterms)>0: + return(", ".join(tlterms)) + else: + return(None) + + + def load_sl_exclusion_regexps(self,arxiu,encoding="utf-8"): + '''Loads the exclusion regular expressions for the source language.''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + for line in cf: + line=line.rstrip() + record=[] + record.append(line) + data.append(record) + + with self.conn: + self.cur.executemany('INSERT INTO sl_exclusion_regexps (sl_exclusion_regexp) VALUES (?)',data) + + def load_tl_exclusion_regexps(self,arxiu,encoding="utf-8"): + '''Loads the exclusion regular expressions for the target language.''' + cf=codecs.open(arxiu,"r",encoding=encoding) + data=[] + for line in cf: + line=line.rstrip() + record=[] + record.append(line) + data.append(record) + + with self.conn: + self.cur.executemany('INSERT INTO tl_exclusion_regexps (sl_exclusion_regexp) VALUES (?)',data) + + + def show_term_candidates(self,limit=-1,minfreq=2, minmeasure=-1, show_frequency=True, show_measure=False, mark_eval=False, verbose=False): + '''Shows the term candidates in the screen.''' + measure=0 + knownterms=[] + knownoterms=[] + with self.conn: + self.cur.execute("SELECT sl_term FROM exclusion_terms") + for s in self.cur.fetchall(): + knownterms.append(s[0]) + with self.conn: + self.cur.execute("SELECT sl_term FROM exclusion_noterms") + for s in self.cur.fetchall(): + knownnoterms.append(s[0]) + with self.conn: + self.cur.execute("SELECT frequency,value,n,candidate FROM term_candidates order by value desc, frequency desc, random() limit "+str(limit)) + for s in self.cur.fetchall(): + frequency=s[0] + if s[1]==None: + measure==0 + else: + measure=s[1] + n=s[2] + candidate=s[3] + if n>=n_min and n<=n_max and not candidate in knownterms and not candidate in knownoterms: + if mark_eval: + if candidate in evaluation_terms: + candidate="*"+candidate + if show_frequency and not show_measure: + cadena=str(frequency)+"\t"+candidate + if not show_frequency and show_measure: + cadena=str(measure)+"\t"+candidate + if show_measure and show_frequency: + cadena=str(frequency)+"\t"+str(measure)+"\t"+candidate + else: + cadena=candidate + print(cadena) + + def select_unigrams(self,file,position=-1,verbose=True): + sunigrams=codecs.open(file,"w",encoding="utf-8") + unigrams={} + self.cur.execute("SELECT frequency,candidate FROM term_candidates order by value desc, frequency desc, random()") + #self.cur.execute("SELECT frequency,value,n,candidate FROM term_candidates order by n desc limit "+str(limit)) + for s in self.cur.fetchall(): + frequency=s[0] + candidate=s[1].split()[position] + if candidate in unigrams: + unigrams[candidate]+=frequency + else: + unigrams[candidate]=frequency + #for self.candidate in self.unigrams: + # print(self.unigrams[self.candidate],self.candidate) + data=[] + for candidate in sorted(unigrams, key=unigrams.get, reverse=True): + 
+ cadena=str(unigrams[candidate])+"\t"+candidate + #if self.verbose: print(cadena) + record=[] + record.append(candidate) + record.append(1) + record.append(unigrams[candidate]) + record.append("freq") + record.append(unigrams[candidate]) + data.append(record) + sunigrams.write(cadena+"\n") + + with self.conn: + self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data) + self.conn.commit() + + + def save_term_candidates(self,outfile,limit=-1,minfreq=2, minmeasure=-1, show_frequency=True, show_measure=False, mark_eval=False, verbose=False): + '''Saves the term candidates in a file.''' + sortida=codecs.open(outfile,"w",encoding="utf-8") + measure=0 + knownterms=[] + knownnoterms=[] + with self.conn: + self.cur.execute("SELECT sl_term FROM exclusion_terms") + for s in self.cur.fetchall(): + knownterms.append(s[0]) + with self.conn: + self.cur.execute("SELECT sl_term FROM exclusion_noterms") + for s in self.cur.fetchall(): + knownnoterms.append(s[0]) + with self.conn: + self.cur.execute("SELECT frequency,value,n,candidate FROM term_candidates order by value desc, frequency desc, random() limit "+str(limit)) + for s in self.cur.fetchall(): + frequency=s[0] + if s[1]==None: + measure==0 + else: + measure=s[1] + n=s[2] + candidate=s[3] + if not candidate in knownterms and not candidate in knownnoterms: + if mark_eval: + if candidate in evaluation_terms: + candidate="*"+candidate + if show_measure and not show_frequency: + cadena=str(measure)+"\t"+candidate + elif show_frequency and not show_measure: + cadena=str(frequency)+"\t"+candidate + elif show_frequency and show_measure: + cadena=str(frequency)+"\t"+str(measure)+"\t"+candidate + else: + cadena=candidate + if verbose: + print(cadena) + sortida.write(cadena+"\n") + + #STATISTICAL TERM EXTRACTION + + def ngram_calculation (self,nmin,nmax,minfreq=2,corpus="sl_corpus"): + '''Performs the calculation of ngrams.''' + ngramsFD=FreqDist() + tokensFD=FreqDist() + n_nmin=nmin + n_max=nmax + + with self.conn: + if corpus=="sl_corpus": + self.cur.execute('SELECT segment from sl_corpus') + elif corpus=="tl_corpus": + self.cur.execute('SELECT segment from tl_corpus') + for s in self.cur.fetchall(): + segment=s[0] + for n in range(nmin,nmax+1): #we DON'T calculate one order bigger in order to detect nested candidates + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(segment).split() + else: + tokens=segment.split() + ngs=ngrams(tokens, n) + for ng in ngs: + ngramsFD[ng]+=1 + for token in tokens: + tokensFD[token]+=1 + + data=[] + for c in ngramsFD.most_common(): + if c[1]>=minfreq: + record=[] + record.append(" ".join(c[0])) + record.append(len(c[0])) + record.append(c[1]) + data.append(record) + with self.conn: + self.cur.executemany("INSERT INTO ngrams (ngram, n, frequency) VALUES (?,?,?)",data) + self.conn.commit() + + data=[] + for c in tokensFD.most_common(): + record=[] + record.append(c[0]) + record.append(c[1]) + data.append(record) + with self.conn: + self.cur.executemany("INSERT INTO tokens (token, frequency) VALUES (?,?)",data) + self.conn.commit() + + def statistical_term_extraction(self,minfreq=2,corpus="sl_corpus"): + '''Performs an statistical term extraction using the extracted ngrams (ngram_calculation should be executed first). Loading stop-words is advisable. 
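A minimal sketch of the statistical route implemented by ngram_calculation and statistical_term_extraction below, assuming a project has been created and a source-language corpus and stop-word list have already been loaded with the loader methods defined earlier in this file:

    e.ngram_calculation(nmin=1, nmax=3, minfreq=2)
    e.statistical_term_extraction(minfreq=2)
    e.case_normalization()
    e.nest_normalization(percent=10)
    e.save_term_candidates("candidates.txt", show_frequency=True)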
''' + self.cur.execute("DELETE FROM term_candidates") + self.conn.commit() + stopwords=[] + with self.conn: + if corpus=="sl_corpus": + self.cur.execute("SELECT sl_stopword FROM sl_stopwords") + elif corpus=="tl_corpus": + self.cur.execute("SELECT tl_stopword FROM tl_stopwords") + for s in self.cur.fetchall(): + stopwords.append(s[0]) + + inner_stopwords=[] + with self.conn: + if corpus=="sl_corpus": + self.cur.execute("SELECT sl_inner_stopword FROM sl_inner_stopwords") + elif corpus=="tl_corpus": + self.cur.execute("SELECT tl_inner_stopword FROM tl_inner_stopwords") + for s in self.cur.fetchall(): + inner_stopwords.append(s[0]) + + self.cur.execute("SELECT ngram, n, frequency FROM ngrams order by frequency desc") + results=self.cur.fetchall() + data=[] + for a in results: + if corpus=="sl_corpus": + if self.specificSLtokenizer: + ng=self.SLtokenizer.tokenize(a[0]).split() + else: + ng=a[0].split() + if corpus=="tl_corpus": + if self.specificTLtokenizer: + + ng=self.TLtokenizer.tokenize(a[0]).split() + else: + ng=a[0].split() + include=True + if ng[0].lower() in stopwords: include=False + if ng[-1].lower() in stopwords: include=False + for i in range(1,len(ng)): + if ng[i].lower() in inner_stopwords: + include=False + if include: + record=[] + record.append(a[0]) + record.append(a[1]) + record.append(a[2]) + record.append("freq") + record.append(a[2]) + data.append(record) + if a[2]<minfreq: + break + with self.conn: + self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data) + self.conn.commit() + + def loadSLtokenizer(self, tokenizer): + if not tokenizer.endswith(".py"): tokenizer=tokenizer+".py" + spec = importlib.util.spec_from_file_location('', tokenizer) + tokenizermod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(tokenizermod) + self.SLtokenizer=tokenizermod.Tokenizer() + self.specificSLtokenizer=True + + def unloadSLtokenizer(self): + self.SLtokenizer=None + self.specificSLtokenizer=False + + def loadTLtokenizer(self, tokenizer): + if not tokenizer.endswith(".py"): tokenizer=tokenizer+".py" + spec = importlib.util.spec_from_file_location('', tokenizer) + tokenizermod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(tokenizermod) + self.TLtokenizer=tokenizermod.Tokenizer() + self.specificTLtokenizer=True + + def unloadSLtokenizer(self): + self.TLtokenizer=None + self.specificTLtokenizer=False + + def statistical_term_extraction_by_segment(self, segment, minlocalfreq=1, minglobalfreq=2, maxcandidates=2, nmin=1, nmax=4): + '''Performs an statistical term extraction over a single segment using the extracted ngrams (ngram_calculation should be executed first) Loading stop-words is advisable. 
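loadSLtokenizer and loadTLtokenizer above expect the path of a Python file defining a Tokenizer class whose tokenize() returns a space-separated string (and whose detokenize() reverses it, as used by the translation-finding methods below). A minimal illustrative module, with a hypothetical file name:

    # mytokenizer.py (hypothetical example)
    import re

    class Tokenizer:
        def tokenize(self, text):
            # separate common punctuation and collapse whitespace
            text = re.sub(r"([.,;:!?()\[\]])", r" \1 ", text)
            return re.sub(r"\s+", " ", text).strip()

        def detokenize(self, text):
            # naive inverse: reattach punctuation to the preceding token
            return re.sub(r"\s+([.,;:!?)\]])", r"\1", text)

It would then be plugged in with e.loadSLtokenizer("mytokenizer.py").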
''' + ngramsFD=FreqDist() + sl_stopword=[] + with self.conn: + self.cur.execute("SELECT sl_stopword FROM sl_stopwords") + for s in self.cur.fetchall(): + sl_stopword.append(s[0]) + + sl_inner_stopwords=[] + with self.conn: + self.cur.execute("SELECT sl_inner_stopword FROM sl_inner_stopwords") + for s in self.cur.fetchall(): + sl_inner_stopwords.append(s[0]) + + for n in range(nmin,nmax+1): + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(segment).split() + else: + tokens=segment.split() + ngs=ngrams(tokens, n) + for ng in ngs: + include=True + + if ng[0].lower() in self.sl_stopwords: include=False + if ng[-1].lower() in self.sl_stopwords: include=False + for i in range(1,len(ng)): + if ng[i].lower() in self.sl_inner_stopwords: + include=False + if include: ngramsFD[" ".join(ng)]+=1 + + for ng in ngramsFD.most_common(): + print(ng) + + def case_normalization(self,verbose=False): + ''' + Performs case normalization. If a capitalized term exists as non-capitalized, the capitalized one will be deleted and the frequency of the non-capitalized one will be increased by the frequency of the capitalized. + ''' + self.cur.execute("SELECT candidate,frequency FROM term_candidates order by frequency desc") + results=self.cur.fetchall() + auxiliar={} + for r in results: + auxiliar[r[0]]=r[1] + for a in results: + if not a[0]==a[0].lower() and a[0].lower() in auxiliar: + terma=a[0] + termb=a[0].lower() + freqa=a[1] + freqb=auxiliar[termb] + n=len(termb.split()) + freqtotal=freqa+freqb + if verbose: + print(terma,freqa,"-->",termb,freqb,"-->",freqtotal) + self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (terma,)) + self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (termb,)) + self.cur.execute("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",(termb,n,freqtotal,"freq",freqtotal)) + self.conn.commit() + + def nest_normalization(self,percent=10,verbose=False): + ''' + Performs a normalization of nested term candidates. 
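Both normalizations below operate directly on the term_candidates table. An illustrative run (the frequencies in the comments are invented for the example):

    e.case_normalization(verbose=True)
    #   e.g. "Neural network" (3) is merged into "neural network" (12), giving frequency 15
    e.nest_normalization(percent=10, verbose=True)
    #   e.g. "neural network" (50) is removed when the longer candidate
    #   "neural network model" (48) lies within the 10% frequency margin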
If an n-gram candidate A is contained in a n+1 candidate B and freq(A)==freq(B) or they are close values (determined by the percent parameter, A is deleted B remains as it is) + ''' + self.cur.execute("SELECT candidate,frequency,n FROM term_candidates order by frequency desc") + results=self.cur.fetchall() + for a in results: + ta=a[0] + fa=a[1] + na=a[2] + nb=na+1 + fmax=fa+fa*percent/100 + fmin=fa-fa*percent/100 + self.cur.execute("SELECT candidate,frequency FROM term_candidates where frequency <="+str(fmax)+" and frequency>="+str(fmin)+" and n ="+str(nb)) + results2=self.cur.fetchall() + for b in results2: + tb=b[0] + fb=b[1] + if not ta==tb and not tb.find(ta)==-1: + self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (ta,)) + if verbose: + print(str(fa),ta,"-->",str(fb),tb) + self.conn.commit() + + def regexp_exclusion(self,verbose=False): + '''Deletes term candidates matching a set of regular expresions loaded with the load_sl_exclusion_regexps method.''' + self.cur.execute("SELECT sl_exclusion_regexp FROM sl_exclusion_regexps") + results=self.cur.fetchall() + for r in results: + nregexp=len(r[0].split()) + exreg=r[0] + self.cur.execute("SELECT candidate FROM term_candidates") + results=self.cur.fetchall() + cexreg=re.compile(exreg) + for a in results: + candidate=a[0] + ncandidate=len(candidate.split()) + match=re.match(cexreg,candidate) + if not match==None and nregexp==ncandidate: + self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (candidate,)) + if verbose: + print(exreg,"-->",candidate) + self.conn.commit() + + #EVALUATION + + + + def evaluate_pos(self,limit,order="desc",iterations=1000,ignore_case=True): + '''Performs the evaluation of the term candidates using the evaluation_terms loaded with the load_evaluation_terms method.''' + correct=0 + total=0 + evaluation_terms=[] + self.cur.execute("SELECT sl_term FROM evaluation_terms") + results=self.cur.fetchall() + for r in results: + evaluation_terms.append(r[0]) + tsr_terms=[] + self.cur.execute("SELECT term FROM tsr_terms") + results=self.cur.fetchall() + for r in results: + tsr_terms.append(r[0]) + evaluation_terms.extend(self.tsr_terms) + with self.conn: + for i in range(0,iterations): + if order=="desc": + self.cur.execute("SELECT candidate,value from term_candidates where n<="+str(self.n_max)+" order by value desc, frequency desc, random() limit "+str(limit)) + elif order=="asc": + self.cur.execute("SELECT candidate from term_candidates where n<="+str(self.n_max)+" order by value asc, frequency desc, random() limit "+str(limit)) + else: + raise NameError('Order must be desc (decending) or asc (ascending). 
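Once reference terms have been loaded with load_evaluation_terms, precision, recall and F1 can be traced at several cut-off points, for instance:

    for cutoff in (50, 100, 250, 500):
        _, correct, total, precision, recall, f1 = e.evaluate_pos(cutoff, iterations=10)
        print(cutoff, round(precision, 2), round(recall, 2), round(f1, 2))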
Defaulf value: desc') + #self.cur.execute("SELECT candidate from term_candidates order by id limit "+str(limit)) + for s in self.cur.fetchall(): + total+=1 + candidate=s[0] + if ignore_case: + if candidate in evaluation_terms: + correct+=1 + elif candidate.lower() in evaluation_terms: + correct+=1 + else: + if candidate in evaluation_terms: + correct+=1 + correct=correct/iterations + total=total/iterations + + try: + precisio=100*correct/total + recall=100*correct/len(evaluation_terms) + f1=2*precisio*recall/(precisio+recall) + return(limit,correct,total,precisio,recall,f1) + except: + return(limit,0,0,0,0,0) + + def association_measures(self,measure="raw_freq"): + measurename=measure + bigram_measures = myBigramAssocMeasures() + trigram_measures = myTrigramAssocMeasures() + quadgram_measures = myQuadgramAssocMeasures() + + fd_tokens=nltk.FreqDist() + fd_bigrams=nltk.FreqDist() + fd_trigrams=nltk.FreqDist() + fd_quadgrams=nltk.FreqDist() + wildcard_fd=nltk.FreqDist() + self.cur.execute("SELECT token,frequency from tokens") + for s in self.cur.fetchall(): + aux=(s[0]) + fd_tokens[aux]+=s[1] + + textcorpus=[] + self.cur.execute("SELECT segment from sl_corpus") + for segment in self.cur.fetchall(): + textcorpus.extend(segment[0].split()) + + bigram_finder=BigramCollocationFinder.from_words(textcorpus) + trigram_finder=TrigramCollocationFinder.from_words(textcorpus) + quadgram_finder=QuadgramCollocationFinder.from_words(textcorpus) + + self.cur.execute("SELECT ngram,frequency,n from ngrams") + results=self.cur.fetchall() + for r in results: + data=[] + data.append(r[0]) + self.cur2.execute("UPDATE term_candidates SET value=NULL where candidate=?",data) + self.conn.commit() + data=[] + bigram_measure=[] + try: + bigram_measure=eval("bigram_finder.score_ngrams(bigram_measures."+measure+")") + except: + print("WARNING: measure "+measure+ " not implemented for bigrams",sys.exc_info()) + #sys.exit() + + for nose in bigram_measure: + record=[] + term_candidate=" ".join(nose[0]) + mvalue=nose[1] + record.append(measure) + record.append(mvalue) + record.append(term_candidate) + data.append(record) + + trigram_measure=[] + try: + trigram_measure=eval("trigram_finder.score_ngrams(trigram_measures."+measure+")") + except: + print("WARNING: measure "+measure+ " not implemented for trigrams") + #sys.exit() + for nose in trigram_measure: + record=[] + term_candidate=" ".join(nose[0]) + mvalue=nose[1] + record.append(measure) + record.append(mvalue) + record.append(term_candidate) + data.append(record) + quadgram_measure=[] + try: + quadgram_measure=eval("quadgram_finder.score_ngrams(quadgram_measures."+measure+")") + except: + print("WARNING: measure "+measure+ " not implemented for quadgrams") + #sys.exit() + + for nose in quadgram_measure: + record=[] + term_candidate=" ".join(nose[0]) + mvalue=nose[1] + record.append(measure) + record.append(mvalue) + record.append(term_candidate) + data.append(record) + + self.conn.executemany("UPDATE term_candidates SET measure=?,value=? 
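association_measures accepts the name of any scoring function exposed by the association-measure classes: NLTK's own measures (for example raw_freq or pmi) as well as the extra ones added in myBigramAssocMeasures at the end of this file (loglikelihood, MI, MI2, MI3, odds, z_score). A possible run:

    e.association_measures("loglikelihood")
    e.save_term_candidates("candidates-llr.txt", show_measure=True, show_frequency=False)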
where candidate=?",data) + self.conn.commit() + + + + def index_phrase_table(self,phrasetable): + '''Indexes a phrase table from Moses.''' + self.entrada=gzip.open(phrasetable, mode='rt',encoding='utf-8') + + self.pt={} + self.continserts=0 + self.record=[] + self.data=[] + while 1: + self.linia=self.entrada.readline() + if not self.linia: + break + self.linia=self.linia.rstrip() + self.camps=self.linia.split(" ||| ") + self.source=self.camps[0].strip() + self.trad=self.camps[1].strip() + self.probs=self.camps[2].split() + try: + if not self.trad[0] in self.punctuation and not self.source[0] in self.punctuation and not self.trad[-1] in self.punctuation and not self.source[-1] in self.punctuation: + #Currently, four different phrase translation scores are computed: + #0 inverse phrase translation probability φ(f|e) + #1 inverse lexical weighting lex(f|e) + #2 direct phrase translation probability φ(e|f) + #3 direct lexical weighting lex(e|f) + #self.probtrad=float(self.probs[1]) + self.probtrad=(float(self.probs[2])*float(self.probs[3])) + #print(self.source,self.trad,self.probtrad) + self.record=[] + self.record.append(self.source) + self.record.append(self.trad) + self.record.append(self.probtrad) + self.data.append(self.record) + self.continserts+=1 + if self.continserts==self.maxinserts: + self.cur.executemany("INSERT INTO index_pt (source, target, probability) VALUES (?,?,?)",self.data) + self.data=[] + self.continserts=0 + self.conn.commit() + except: + pass + with self.conn: + self.cur.executemany("INSERT INTO index_pt (source, target, probability) VALUES (?,?,?)",self.data) + self.conn.commit() + + + def find_terms_in_parallel_corpus(self,SLterms,maxdec=1,maxinc=2,candidates=5,maxlines=-1): + tofind=[] + result={} + if isinstance(SLterms, str): + if os.path.exists(SLterms): + entrada=codecs.open(SLterms) + for linia in entrada: + linia=linia.rstrip() + tofind.append(linia) + entrada.close() + else: + tofind.append(SLterms) + elif isinstance(SLterms, list): + tofind.extend(SLterms) + tl_stopwords=[] + with self.conn: + self.cur.execute("SELECT tl_stopword FROM tl_stopwords") + for s in self.cur.fetchall(): + tl_stopwords.append(s[0]) + + for SLterm in tofind: + fd=FreqDist() + fd.clear() + result[SLterm]={} + if maxlines==-1: + self.cur.execute("SELECT segmentTL FROM parallel_corpus where INSTR(segmentSL,\""+SLterm+"\")") + else: + self.cur.execute("SELECT segmentTL FROM parallel_corpus where INSTR(segmentSL,\""+SLterm+"\") limit "+str(maxlines)) + TLsegments=self.cur.fetchall() + if self.specificSLtokenizer: + termtok=self.SLtokenizer.tokenize(SLterm) + else: + termtok=SLterm + nSLterm=len(termtok.split()) + nmin=nSLterm-maxdec + if nmin<1: nmin=1 + nmax=nSLterm+maxinc + for TLsegment in TLsegments: + if self.specificTLtokenizer: + TLsegmenttok=self.TLtokenizer.tokenize(TLsegment[0]).split() + else: + TLsegmenttok=TLsegment[0].split() + for n in range(nmin,nmax+1): + ngs=ngrams(TLsegmenttok, n) + for ng in ngs: + include=True + if ng[0] in tl_stopwords: include=False + if len(ng)>1 and ng[1] in tl_stopwords: include=False + if include: + detokcandidate=" ".join(ng) + if self.specificTLtokenizer: + detokcandidate=self.TLtokenizer.detokenize(detokcandidate) + fd[detokcandidate]+=1 + + totalf=fd.N() + for mc in fd.most_common(candidates): + result[SLterm][mc[0]]=mc[1]/totalf + return(result) + + def compoundify_sl_corpus(self,term): + term2=term.replace(" ","▁") + self.cur.execute("SELECT id, segment FROM sl_corpus where INSTR(segment,\""+term+"\")") + trobats=self.cur.fetchall() + for 
trobat in trobats: + ident=trobat[0] + segment=trobat[1] + segment2=segment.replace(term,term2) + self.cur.execute("UPDATE sl_corpus SET segment=? where id=?",(segment2,ident)) + self.conn.commit() + + def compoundify_tl_corpus(self,term): + term2=term.replace(" ","▁") + self.cur.execute("SELECT id, segment FROM tl_corpus where INSTR(segment,\""+term+"\")") + trobats=self.cur.fetchall() + for trobat in trobats: + ident=trobat[0] + segment=trobat[1] + segment2=segment.replace(term,term2) + self.cur.execute("UPDATE tl_corpus SET segment=? where id=?",(segment2,ident)) + self.conn.commit() + + def compoundify_tl_corpus_mod(self,term): + term2=term.replace(" ","▁") + self.cur.execute("SELECT id, segment FROM tl_corpus where INSTR(segment,\""+term+"\")") + trobats=self.cur.fetchall() + data=[] + for trobat in trobats: + ident=trobat[0] + segment=trobat[1] + segment2=segment.replace(term,term2) + data.append([segment2]) + + + self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data) + self.conn.commit() + + def find_translation_comparable_corpus(self,SLterms,tl_stopwords=None,mapping_dictionary="MUSE-en-es.txt",maxdec=1,maxinc=2,candidates=25,compoundifySL=True,compoundifyTL=True,max_term_candidates_compoundify=200): + tofind=[] + result={} + + if isinstance(SLterms, str): + if os.path.exists(SLterms): + entrada=codecs.open(SLterms) + for linia in entrada: + linia=linia.rstrip() + tofind.append(linia) + entrada.close() + else: + tofind.append(SLterms) + elif isinstance(SLterms, list): + tofind.extend(SLterms) + #compoundify SL corpus + slnmin=1000000 + slnmax=0 + for SLterm in tofind: + if self.specificSLtokenizer: + termtok=self.SLtokenizer.tokenize(SLterm) + else: + termtok=SLterm + if len(termtok.split())>1 and compoundifySL: + self.compoundify_sl_corpus(SLterm) + if len(termtok.split())<slnmin:slnmin=len(termtok.split()) + if len(termtok.split())>slnmax:slnmax=len(termtok.split()) + n_min=slnmin-maxdec + if n_min<2: n_min=2 + n_max=slnmax+maxdec + #compoundify TL corpus (basic statistical term extraction) + if compoundifyTL: + self.delete_tokens() + self.delete_ngrams() + self.delete_sl_stopwords() + self.delete_sl_inner_stopwords + self.delete_sl_exclusion_regexps() + self.delete_term_candidates() + self.ngram_calculation (n_min,n_max,minfreq=2,corpus="tl_corpus") + if not tl_stopwords==None: + self.load_tl_stopwords(tl_stopwords) + self.statistical_term_extraction(minfreq=2,corpus="tl_corpus") + self.cur.execute("SELECT candidate FROM term_candidates ORDER BY frequency desc limit "+str(max_term_candidates_compoundify)+";") + trobats=self.cur.fetchall() + for trobat in trobats: + term=trobat[0] + self.compoundify_tl_corpus(term) + print("CALCULATING EMBEDDINGS SL") + self.calculate_embeddings_sl("embeddingsSL.temp",vector_size=300, window=5) + print("CALCULATING EMBEDDINGS TL") + self.calculate_embeddings_tl("embeddingsTL.temp",vector_size=300, window=5) + print("MAPPING EMBEDDINGS") + self.mapEmbeddings("embeddingsSL.temp","embeddingsTL.temp","mappedSL.tmp","mappedTL.tmp",mapping_dictionary) + self.load_SL_embeddings("mappedSL.tmp") + self.load_TL_embeddings("mappedTL.tmp") + stopwords=[] + with self.conn: + self.cur.execute("SELECT tl_stopword FROM tl_stopwords") + for s in self.cur.fetchall(): + stopwords.append(s[0]) + results={} + for SLterm in tofind: + if self.specificSLtokenizer: + termtok=self.SLtokenizer.tokenize(SLterm) + else: + termtok=SLterm + lenterm=len(termtok.split()) + lenmin=lenterm-maxdec + lenmax=lenterm+maxinc + results[SLterm]={} + 
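A sketch of translation search over a comparable corpus with the method defined here, assuming monolingual source and target corpora have already been loaded into the project. Multiword source terms are compoundified (spaces replaced by "▁") before the embeddings are trained; the stop-word file name below is hypothetical, while MUSE-en-es.txt is simply the default mapping dictionary shown above:

    results = e.find_translation_comparable_corpus(
        ["neural network", "machine translation"],
        tl_stopwords="stopwords-es.txt",
        mapping_dictionary="MUSE-en-es.txt",
        candidates=10)
    for slterm, translations in results.items():
        print(slterm, translations)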
translations=self.find_translation_wv(SLterm,ncandidates=1000) + cont=0 + for translation in translations: + if self.specificTLtokenizer: + translationtok=self.TLtokenizer.tokenize(translation) + else: + translationtok=translation + lentranslation=len(translationtok.split()) + try: + if not translation in stopwords and not translation.split()[0] in stopwords and not translation.split()[-1] in stopwords and lentranslation>=lenmin and lentranslation<=lenmax: + results[SLterm][translation]=translations[translation] + cont+=1 + except: + pass + if cont>=candidates: + break + + return(results) + + + def find_translation_ptable(self,sourceterm,maxdec=1,maxinc=1,ncandidates=5,separator=":"): + '''Finds translation equivalents in an indexed phrase table table. Requires an indexed phrase table and a a list of terms separated by ":". + The number of translation candidates can be fixed, as well as the maximum decrement and increment of the number of tokens of the translation candidate''' + #select target from index_pt where source="international conflict"; + self.cur.execute('SELECT target,probability FROM index_pt where source =?',(sourceterm,)) + self.results=self.cur.fetchall() + self.targetcandidates={} + for self.a in self.results: + self.targetterm=self.a[0] + self.probability=float(self.a[1]) + self.tttokens=self.targetterm.split() + + if not self.tttokens[0] in self.tl_stopwords and not self.tttokens[-1] in self.tl_stopwords and len(self.tttokens)>=len(sourceterm.split())-maxdec and len(self.tttokens)<=len(sourceterm.split())+maxinc: + self.targetcandidates[self.targetterm]=self.probability + self.sorted_x = sorted(self.targetcandidates.items(), key=operator.itemgetter(1),reverse=True) + self.results=[] + for self.s in self.sorted_x: + self.results.append(self.s[0].replace(":",";")) + return(separator.join(self.results[0:ncandidates])) + + + + def start_freeling_api(self,freelingpath, LANG): + + if not freelingpath.endswith("/"):freelingpath=freelingpath+"/" + try: + sys.path.append(freelingpath+"APIs/python3/") + import pyfreeling + except: + print("No Freeling API available. Verify Freeling PATH: "+freelingpath+"freeling/APIs/python3/") + + pyfreeling.util_init_locale("default"); + + # create language analyzer + la1=pyfreeling.lang_ident(freelingpath+"common/lang_ident/ident.dat"); + + # create options set for maco analyzer. Default values are Ok, except for data files. + op1= pyfreeling.maco_options(LANG); + op1.set_data_files( "", + freelingpath + "common/punct.dat", + freelingpath+ LANG + "/dicc.src", + freelingpath + LANG + "/afixos.dat", + "", + freelingpath + LANG + "/locucions.dat", + freelingpath + LANG + "/np.dat", + freelingpath + LANG + "/quantities.dat", + freelingpath + LANG + "/probabilitats.dat"); + + # create analyzers + self.tk1=pyfreeling.tokenizer(freelingpath+LANG+"/tokenizer.dat"); + self.sp1=pyfreeling.splitter(freelingpath+LANG+"/splitter.dat"); + self.sid1=self.sp1.open_session(); + self.mf1=pyfreeling.maco(op1); + + # activate mmorpho odules to be used in next call + #(self, umap: "bool", num: "bool", pun: "bool", dat: "bool", + # dic: "bool", aff: "bool", comp: "bool", rtk: "bool", + # mw: "bool", ner: "bool", qt: "bool", prb: "bool") + #deactivate mw + self.mf1.set_active_options(False, True, True, False, # select which among created + True, True, False, True, # submodules are to be used. 
+ False, False, True, True ); # default: all created submodules are used + + # create tagger, sense anotator, and parsers + self.tg1=pyfreeling.hmm_tagger(freelingpath+LANG+"/tagger.dat",True,2); + + def tag_freeling_api(self,corpus="source"): + with self.conn: + data=[] + if corpus=="source": + self.cur.execute('SELECT id,segment from sl_corpus') + elif corpus=="target": + self.cur.execute('SELECT id,segment from tl_corpus') + continserts=0 + for s in self.cur.fetchall(): + id=s[0] + segment=s[1] + continserts+=1 + l1 = self.tk1.tokenize(segment); + ls1 = self.sp1.split(self.sid1,l1,True); + ls1 = self.mf1.analyze(ls1); + ls1 = self.tg1.analyze(ls1); + ttsentence=[] + for s in ls1 : + ws = s.get_words(); + for w in ws : + form=w.get_form() + lemma=w.get_lemma() + tag=w.get_tag() + ttsentence.append(form+"|"+lemma+"|"+tag) + ttsentence=" ".join(ttsentence) + record=[] + record.append(id) + record.append(ttsentence) + data.append(record) + if continserts==self.maxinserts: + if corpus=="source": + self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + if corpus=="target": + self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + data=[] + continserts=0 + with self.conn: + if corpus=="source": + self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + if corpus=="target": + self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + + + #SPACY TAGGER + def load_POS_model_spacy(self, model): + if not spacy.util.is_package(model): + print("Downloading and installing ",model) + try: + subprocess.check_call([sys.executable, "-m", "spacy", "download", model]) + print("Model downloaded. Stopping the program. The program should be run again to load the downloaded model.") + except: + print("Model",model,"not available.") + else: + self.POSmodel_spacy=spacy.load(model) + + def tag_spacy(self,corpus="source",mode="coarse"): + #mode on of coarse or fine + with self.conn: + data=[] + if corpus=="source": + self.cur.execute('SELECT id,segment from sl_corpus') + elif corpus=="target": + self.cur.execute('SELECT id,segment from tl_corpus') + elif corpus=="parallel-source": + self.cur.execute('SELECT id,segmentSL from parallel_corpus') + elif corpus=="parallel-target": + self.cur.execute('SELECT id,segmentTL from parallel_corpus') + continserts=0 + for s in self.cur.fetchall(): + id=s[0] + segment=s[1] + continserts+=1 + taggedtokens = self.POSmodel_spacy(segment) + ttsentence=[] + for token in taggedtokens: + form=token.text + lemma=token.lemma_ + if mode=="fine": + tag=token.tag_ + elif mode=="coarse": + tag=token.pos_ + ttsentence.append(form+"|"+lemma+"|"+tag) + ttsentence=" ".join(ttsentence) + record=[] + record.append(id) + record.append(ttsentence) + data.append(record) + if continserts==self.maxinserts: + if corpus=="source": + self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="target": + self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="parallel-source": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data) + elif corpus=="parallel-target": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) 
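A typical tagging run with the spaCy tagger defined here, storing form|lemma|tag triples into sl_tagged_corpus. en_core_web_sm is a standard spaCy English model; if it is not installed, the loader downloads it and asks for a re-run. The output file name is illustrative:

    e.load_POS_model_spacy("en_core_web_sm")
    e.tag_spacy(corpus="source", mode="coarse")
    e.save_sl_tagged_corpus("corpus-tagged-en.txt")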
ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data) + data=[] + continserts=0 + with self.conn: + if corpus=="source": + self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="target": + self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="parallel-source": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data) + elif corpus=="parallel-target": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data) + + #SPACY_UDPIPE TAGGER + def load_POS_model_spacy_udpipe(self, language): + try: + self.POSmodel = spacy_udpipe.load(language) + except: + print("No model for ",language," available.") + print("Downloading and installing model for ",language) + try: + spacy_udpipe.download(language) + self.POSmodel = spacy_udpipe.load(language) + except: + print("ERROR: not able to load spacy_udepipe model for ",language) + + + def tag_spacy_udpipe(self,corpus="source"): + #mode on of coarse or fine + with self.conn: + data=[] + if corpus=="source": + self.cur.execute('SELECT id,segment from sl_corpus') + elif corpus=="target": + self.cur.execute('SELECT id,segment from tl_corpus') + elif corpus=="parallel-source": + self.cur.execute('SELECT id,segmentSL from parallel_corpus') + elif corpus=="parallel-target": + self.cur.execute('SELECT id,segmentTL from parallel_corpus') + continserts=0 + for s in self.cur.fetchall(): + id=s[0] + segment=s[1] + continserts+=1 + taggedtokens = self.POSmodel(segment) + ttsentence=[] + for token in taggedtokens: + form=token.text + lemma=token.lemma_ + tag=token.tag_ + tag=token.pos_ + ttsentence.append(form+"|"+lemma+"|"+tag) + ttsentence=" ".join(ttsentence) + record=[] + record.append(id) + record.append(ttsentence) + data.append(record) + if continserts==self.maxinserts: + if corpus=="source": + self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="target": + self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="parallel-source": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data) + elif corpus=="parallel-target": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data) + data=[] + continserts=0 + with self.conn: + if corpus=="source": + self.cur.executemany("INSERT sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="target": + self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data) + elif corpus=="parallel-source": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data) + elif corpus=="parallel-target": + self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) 
ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data) + + + def save_sl_tagged_corpus(self,outputfile,encoding="utf-8"): + sortida=codecs.open(outputfile,"w",encoding=encoding) + self.cur.execute('SELECT tagged_segment from sl_tagged_corpus') + for s in self.cur.fetchall(): + tagged_segment=s[0] + sortida.write(tagged_segment+"\n") + + def save_tl_tagged_corpus(self,outputfile,encoding="utf-8"): + sortida=codecs.open(outputfile,"w",encoding=encoding) + self.cur.execute('SELECT tagged_segment from tl_tagged_corpus') + for s in self.cur.fetchall(): + tagged_segment=s[0] + sortida.write(tagged_segment+"\n") + + + + def save_sl_tagged_parallel_corpus(self,outputfile,encoding="utf-8"): + sortida=codecs.open(outputfile,"w",encoding=encoding) + self.cur.execute('SELECT tagged_segmentSL from tagged_parallel_corpus') + for s in self.cur.fetchall(): + tagged_segment=s[0] + sortida.write(tagged_segment+"\n") + + def save_tl_tagged_parallel_corpus(self,outputfile,encoding="utf-8"): + sortida=codecs.open(outputfile,"w",encoding=encoding) + self.cur.execute('SELECT tagged_segmentTL from tagged_parallel_corpus') + for s in self.cur.fetchall(): + tagged_segment=s[0] + sortida.write(tagged_segment+"\n") + + + def tagged_ngram_calculation (self,nmin=2,nmax=3,minfreq=2): + '''Calculates the tagged ngrams.''' + ngramsFD=FreqDist() + n_nmin=nmin + n_max=nmax + data=[] + record=[] + with self.conn: + self.cur.execute('SELECT tagged_segment from sl_tagged_corpus') + for s in self.cur.fetchall(): + segment=s[0] + for n in range(nmin,nmax+1): + ngs=ngrams(segment.split(),n) + for ng in ngs: + ngramsFD[ng]+=1 + for c in ngramsFD.most_common(): + if c[1]>=minfreq: + candidate=[] + for ngt in c[0]: + candidate.append(ngt.split("|")[0]) + candidate=" ".join(candidate) + record=[] + record.append(candidate) + record.append(" ".join(c[0])) + record.append(len(c[0])) + record.append(c[1]) + data.append(record) + with self.conn: + self.cur.executemany("INSERT INTO tagged_ngrams (ngram, tagged_ngram, n, frequency) VALUES (?,?,?,?)",data) + self.conn.commit() + + def translate_linguistic_pattern(self,pattern): + aux=[] + for ptoken in pattern.split(): + auxtoken=[] + ptoken=ptoken.replace(".*","[^\s]+") + for pelement in ptoken.split("|"): + if pelement=="#": + auxtoken.append("([^\s]+?)") + elif pelement=="": + auxtoken.append("[^\s]+?") + else: + if pelement.startswith("#"): + auxtoken.append("("+pelement.replace("#","")+")") + else: + auxtoken.append(pelement) + aux.append("\|".join(auxtoken)) + tp="("+" ".join(aux)+")" + return(tp) + + def load_linguistic_patterns(self,file, encoding="utf-8"): + '''Loads the linguistic patterns to use with linguistic terminology extraction.''' + entrada=codecs.open(file,"r",encoding=encoding) + linguistic_patterns=[] + data=[] + record=[] + for linia in entrada: + linia=linia.rstrip() + npattern=len(linia.split(" ")) + if npattern<self.n_min_pos_patterns: self.n_min_pos_patterns=npattern + if npattern>self.n_max_pos_patterns: self.n_max_pos_patterns=npattern + pattern=self.translate_linguistic_pattern(linia) + record.append(pattern) + data.append(record) + record=[] + with self.conn: + self.cur.executemany("INSERT INTO linguistic_patterns (linguistic_pattern) VALUES (?)",data) + def get_n_min_pos_patterns(self): + return(self.n_min_pos_patterns) + + def get_n_max_pos_patterns(self): + return(self.n_max_pos_patterns) + + def linguistic_term_extraction(self,minfreq=2): + '''Performs an linguistic term extraction using the extracted tagged ngrams 
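Each pattern line holds one token per position in the form form|lemma|tag, where "#" marks the element to extract and an empty field matches any value; translate_linguistic_pattern above turns these lines into regular expressions over the tagged n-grams. An illustrative pattern file for spaCy coarse (UPOS) tags and the corresponding extraction run:

    # patterns-en.txt (illustrative)
    #   #||NOUN
    #   #||ADJ #||NOUN
    #   #||NOUN #||ADP #||NOUN
    e.load_linguistic_patterns("patterns-en.txt")
    e.tagged_ngram_calculation(nmin=1, nmax=3, minfreq=2)
    e.linguistic_term_extraction(minfreq=2)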
(tagged_ngram_calculation should be executed first). ''' + linguistic_patterns=[] + controlpatterns=[] + with self.conn: + self.cur.execute("SELECT linguistic_pattern from linguistic_patterns") + for lp in self.cur.fetchall(): + linguistic_pattern=lp[0] + transformedpattern="^"+linguistic_pattern+"$" + if not transformedpattern in controlpatterns: + linguistic_patterns.append(transformedpattern) + controlpatterns.append(transformedpattern) + self.cur.execute("SELECT tagged_ngram, n, frequency FROM tagged_ngrams order by frequency desc") + results=self.cur.fetchall() + data=[] + for a in results: + include=True + ng=a[0] + n=a[1] + frequency=a[2] + try: + if ng.split()[0].split("|")[1].lower() in sl_stopwords: include=False + except: + pass + try: + if ng.split()[-1].split("|")[1].lower() in sl_stopwords: include=False + except: + pass + if frequency<minfreq: + break + if include: + for pattern in linguistic_patterns: + match=re.search(pattern,ng) + if match: + if match.group(0)==ng: + candidate=" ".join(match.groups()[1:]) + record=[] + record.append(candidate) + record.append(n) + record.append(frequency) + record.append("freq") + record.append(frequency) + data.append(record) + break + with self.conn: + self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data) + self.conn.commit() + + #deleting repeated candidates + self.cur.execute("SELECT candidate, n, frequency FROM term_candidates") + results=self.cur.fetchall() + tcaux={} + for a in results: + if not a[0] in tcaux: + tcaux[a[0]]=a[2] + else: + tcaux[a[0]]+=a[2] + self.cur.execute("DELETE FROM term_candidates") + self.conn.commit() + data=[] + for tc in tcaux: + record=[] + record.append(tc) + record.append(len(tc.split())) + record.append(tcaux[tc]) + record.append("freq") + record.append(tcaux[tc]) + data.append(record) + with self.conn: + self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data) + self.conn.commit() + + def learn_linguistic_patterns(self,outputfile,showfrequencies=False,encoding="utf-8",verbose=True,representativity=100): + learntpatterns={} + sortida=codecs.open(outputfile,"w",encoding=encoding) + acufreq=0 + tags={} + with self.conn: + self.cur.execute("SELECT sl_term FROM evaluation_terms") + for s in self.cur.fetchall(): + self.cur.execute("SELECT tagged_ngram, n, frequency FROM tagged_ngrams WHERE ngram= ?", (s[0],)) + results=self.cur.fetchall() + if len(results)>0: + for a in results: + ng=a[0] + nglist=ng.split() + n=a[1] + frequency=a[2] + candidate=[] + ngtokenstag=ng.split() + for ngt in ngtokenstag: + candidate.append(ngt.split("|")[0]) + candidate=" ".join(candidate) + t2=ng.split() + t1=candidate.split() + patternbrut=[] + for position in range(0,n): + t2f=t2[position].split("|")[0] + t2l=t2[position].split("|")[1] + t2t=t2[position].split("|")[2] + patternpart="" + if t1[position]==t2l: + patternpart="|#|"+t2t + elif t1[position]==t2f: + patternpart="#||"+t2t + patternbrut.append(patternpart) + pattern=" ".join(patternbrut) + if pattern in learntpatterns: + learntpatterns[pattern]+=n + acufreq+=n + else: + learntpatterns[pattern]=n + acufreq+=n + sorted_x = sorted(learntpatterns.items(), key=operator.itemgetter(1),reverse=True) + results=[] + acufreq2=0 + for s in sorted_x: + percent=100*acufreq2/acufreq + if percent>representativity: + break + acufreq2+=s[1] + if showfrequencies: + cadena=str(s[1])+"\t"+s[0] + else: + cadena=s[0] + sortida.write(cadena+"\n") + if verbose: + 
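Conversely, learn_linguistic_patterns defined here induces patterns from the tagged n-grams that match an already loaded evaluation term list, so that they can be reused for extraction on similar corpora, for example:

    e.learn_linguistic_patterns("learnt-patterns.txt", representativity=95)
    e.load_linguistic_patterns("learnt-patterns.txt")
    e.linguistic_term_extraction(minfreq=2)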
print(cadena) + + def find_translation_pcorpus_statistical(self,slterm,maxdec=1,maxinc=1,ncandidates=5,separator=":"): + self.nmin=len(slterm.split())-maxdec + self.nmax=len(slterm.split())+maxinc + self.tlngrams=FreqDist() + with self.conn: + self.cur.execute('SELECT id, segment from sl_corpus') + + for self.s in self.cur.fetchall(): + self.segment=self.s[1] + self.id=self.s[0] + + if self.segment.find(slterm)>-1: + self.cur2.execute('SELECT segment from tl_corpus where id="'+str(self.id)+'"') + for self.s2 in self.cur2.fetchall(): + self.tl_segment=self.s2[0] + for self.n in range(self.nmin,self.nmax+1): + #self.tlngs=ngrams(self.tl_tokenizer.tokenize(self.tl_segment), self.n) + self.tlngs=ngrams(self.tl_segment.split(), self.n) + for self.tlng in self.tlngs: + if not self.tlng[0] in self.tl_stopwords and not self.tlng[-1] in self.tl_stopwords: + self.tlngrams[self.tlng]+=1 + self.resultlist=[] + for self.c in self.tlngrams.most_common(ncandidates): + self.resultlist.append(" ".join(self.c[0])) + + return(separator.join(self.resultlist)) + + def find_translation_pcorpus_linguistics(self,slterm,maxdec=1,maxinc=1,ncandidates=5,separator=":"): + self.nmin=len(slterm.split())-maxdec + self.nmax=len(slterm.split())+maxinc + self.tlngrams=FreqDist() + with self.conn: + self.cur.execute('SELECT id, segment from sl_corpus') + + for self.s in self.cur.fetchall(): + self.segment=self.s[1] + self.id=self.s[0] + + if self.segment.find(slterm)>-1: + self.cur2.execute('SELECT segment from tl_corpus where id="'+str(self.id)+'"') + for self.s2 in self.cur2.fetchall(): + self.tl_segment=self.s2[0] + for self.n in range(self.nmin,self.nmax+1): + #self.tlngs=ngrams(self.tl_tokenizer.tokenize(self.tl_segment), self.n) + self.tlngs=ngrams(self.tl_segment.split(), self.n) + for self.tlng in self.tlngs: + if not self.tlng[0] in self.tl_stopwords and not self.tlng[-1] in self.tl_stopwords: + self.tlngrams[self.tlng]+=1 + self.resultlist=[] + for self.c in self.tlngrams.most_common(ncandidates): + self.resultlist.append(" ".join(self.c[0])) + + return(separator.join(self.resultlist)) + +#EMBEDDINGS + + def calculate_embeddings_sl(self,filename,vector_size=300, window=5, min_count=1, workers=4): + self.cur.execute('SELECT id, segment from sl_corpus') + data = [] + for s in self.cur.fetchall(): + temp=[] + segment=s[1] + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(segment).split() + else: + tokens=segment.split() + data.append(tokens) + model = Word2Vec(sentences=data, vector_size=vector_size, window=window, min_count=min_count, workers=workers) + model.wv.save_word2vec_format(filename, binary=False) + + def calculate_embeddings_sl_ref(self,filename,vector_size=300, window=5, min_count=1, workers=4): + self.cur.execute('SELECT id, segment from tl_corpus') + data = [] + for s in self.cur.fetchall(): + temp=[] + segment=s[1] + if self.specificSLtokenizer: + tokens=self.SLtokenizer.tokenize(segment).split() + else: + tokens=segment.split() + data.append(tokens) + model = Word2Vec(sentences=data, vector_size=vector_size, window=window, min_count=min_count, workers=workers) + model.wv.save_word2vec_format(filename, binary=False) + + def calculate_embeddings_tl(self,filename,vector_size=300, window=5, min_count=1, workers=4): + self.cur.execute('SELECT id, segment from tl_corpus') + data = [] + for s in self.cur.fetchall(): + temp=[] + segment=s[1] + if self.specificTLtokenizer: + tokens=self.TLtokenizer.tokenize(segment).split() + else: + tokens=segment.split() + data.append(tokens) + model = 
Word2Vec(sentences=data, vector_size=vector_size, window=window, min_count=min_count, workers=workers) + model.wv.save_word2vec_format(filename, binary=False) + + def mapEmbeddings(self,src_input,trg_input,src_output,trg_output,init_dictionary): + supervised_mapping(src_input,trg_input,src_output,trg_output,init_dictionary) + + def load_SL_embeddings(self, file, binary=False): + self.wvSL = KeyedVectors.load_word2vec_format(file, binary=False) + + def load_TL_embeddings(self, file, binary=False): + self.wvTL = KeyedVectors.load_word2vec_format(file, binary=False) + + + def find_translation_wv(self, term, ncandidates=50): + + term=term.strip().replace(" ","▁") + try: + vector=self.wvSL[term] + tcandidates = self.wvTL.most_similar([vector], topn=ncandidates) + except: + tcandidates=[] + response={} + + for tc in tcandidates: + tc2=tc[0].replace("▁"," ") + response[tc2]=tc[1] + + return(response) + + + +#TSR + def tsr(self, type="combined",max_iterations=10000000000, verbose=True): + component={} + firstcomponent={} + middlecomponent={} + lastcomponent={} + self.tsr_terms=[] + self.cur.execute("SELECT term FROM tsr_terms") + results=self.cur.fetchall() + for r in results: + self.tsr_terms.append(r[0]) + for term in self.tsr_terms: + camps=term.split() + if len(camps)==1: #UNIGRAMS + firstcomponent[camps[0].lower()]=1 + lastcomponent[camps[0].lower()]=1 + if len(camps)>=2: + firstcomponent[camps[0].lower()]=1 + lastcomponent[camps[-1].lower()]=1 + component[camps[0].lower()]=1 + component[camps[-1].lower()]=1 + if len(camps)>=3: + for i in range(1,len(camps)-1): + middlecomponent[camps[i].lower()]=1 + component[camps[i].lower()]=1 + + new=True + newcandidates={} #candidate-frequency + hashmeasure={} + hashvalue={} + + newcandidatestempstric={} #candidate-frequency + hashmeasuretempstrict={} + hashvaluetempstric={} + + newcandidatestempflexible={} #candidate-frequency + hashmeasuretempflexible={} + hashvaluetempflexible={} + + newcandidatestempcombined={} #candidate-frequency + hashmeasuretempcombined={} + hashvaluetempcombined={} + + iterations=0 + while new: + iterations+=1 + if verbose: print("ITERATION",iterations) + new=False + self.cur.execute("SELECT candidate,n,frequency,measure,value FROM term_candidates ") + results=self.cur.fetchall() + auxiliar={} + value=max_iterations-iterations#r[4] + for r in results: + candidate=r[0] + n=r[1] + frequency=r[2] + measure="tsr"#r[3] + #IMPLEMENTED ONLY FOR BIGRAMS !!! 
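The embedding-based workflow defined above (training monolingual word2vec models, mapping them into a shared space with the supervised mapping code at the end of this file, and querying nearest neighbours) can be sketched as follows; the .vec file names are illustrative and MUSE-en-es.txt stands for any seed dictionary:

    e.calculate_embeddings_sl("embeddingsSL.vec")
    e.calculate_embeddings_tl("embeddingsTL.vec")
    e.mapEmbeddings("embeddingsSL.vec", "embeddingsTL.vec",
                    "mappedSL.vec", "mappedTL.vec", "MUSE-en-es.txt")
    e.load_SL_embeddings("mappedSL.vec")
    e.load_TL_embeddings("mappedTL.vec")
    print(e.find_translation_wv("network", ncandidates=10))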
+ ''' + rcamps=candidate.split() + if type=="strict": + if rcamps[0] in firstcomponent and rcamps[-1] in lastcomponent: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + elif type=="flexible": + if rcamps[0] in firstcomponent or rcamps[-1] in lastcomponent: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + elif type=="combined": + if iterations==1: + if rcamps[0] in firstcomponent and rcamps[-1] in lastcomponent: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + else: + if rcamps[0] in firstcomponent or rcamps[-1] in lastcomponent: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + ''' + first_c=False + middle_c=False + last_c=False + rcamps=candidate.split() + truesfalses=[] + if str(rcamps[0]).lower() in firstcomponent: + first_c=True + truesfalses.append(True) + else: + truesfalses.append(False) + if str(rcamps[-1]).lower() in lastcomponent: + last_c=True + truesfalses.append(True) + else: + truesfalses.append(False) + if n>2: + middle_c=True + for i in range(1,n-1): + if not str(r[i]).lower() in middlecomponent: middle_c=False + if middle_c==True: + truesfalses.append(True) + else: + truesfalses.append(False) + if type=="strict": + if not False in truesfalses: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + elif type=="flexible": + if True in truesfalses: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + elif type=="combined": + if iterations==1: + new=True + if not False in truesfalses: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + if n>2: + for i in range(1,n-1): + middlecomponent[rcamps[i]]=1 + component[rcamps[i]]=1 + else: + if True in truesfalses: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + if n>2: + for i in range(1,n-1): + middlecomponent[rcamps[i]]=1 + component[rcamps[i]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + ''' + if n==2: + if rcamps[0] in firstcomponent: first_c=True + middle_c=True + if rcamps[-1] in lastcomponent: last_c=True + if type=="strict": + if first_c and last_c: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + 
new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + elif type=="flexible": + if first_c or last_c: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + + elif type=="combined": + if iterations==1: + if first_c and last_c: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + else: + if first_c or last_c: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[-1]]=1 + + elif n==3: + if rcamps[0] in firstcomponent: first_c=True + if rcamps[1] in middlecomponent: middle_c=True + if rcamps[-1] in lastcomponent: last_c=True + if type=="strict": + if first_c and middle_c and last_c: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + middlecomponent[rcamps[1]]=1 + lastcomponent[rcamps[-1]]=1 + elif type=="flexible": + condition=False + if first_c and middle_c or last_c: condition=True + #if first_c or middle_c and last_c: condition=True + if last_c and middle_c or first_c: condition=True + if condition: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + middlecomponent[rcamps[1]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[1]]=1 + component[rcamps[-1]]=1 + + elif type=="combined": + if iterations==1: + if first_c and middle_c and last_c: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + middlecomponent[rcamps[1]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[1]]=1 + component[rcamps[-1]]=1 + else: + condition=False + if first_c or middle_c or last_c: condition=True + #if first_c and middle_c or last_c: condition=True + #if first_c or middle_c and last_c: condition=True + #if last_c and middle_c or first_c: condition=True + if condition: + if not candidate in newcandidates: + newcandidates[candidate]=frequency + hashmeasure[candidate]=measure + hashvalue[candidate]=value + new=True + firstcomponent[rcamps[0]]=1 + middlecomponent[rcamps[1]]=1 + lastcomponent[rcamps[-1]]=1 + component[rcamps[0]]=1 + component[rcamps[1]]=1 + component[rcamps[-1]]=1 + ''' + if iterations>=max_iterations: + break + if verbose: print(iterations,new) + with self.conn: + self.cur.execute('DELETE FROM term_candidates') + self.conn.commit() + + + data=[] + for c in newcandidates: + termb=c + n=len(c.split()) + freqtotal=newcandidates[c] + measure=hashmeasure[c] + value=hashvalue[c] + record=[] + record.append(termb) + record.append(n) + record.append(freqtotal) + record.append(measure) + record.append(value) + data.append(record) + with self.conn: + self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data) + + 
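A sketch of re-ranking with the TSR procedure implemented above, assuming seed terms have been loaded into the tsr_terms table and a first candidate list already exists; note that the surviving candidates replace the previous content of term_candidates:

    e.tsr(type="combined", max_iterations=10, verbose=True)
    e.save_term_candidates("candidates-tsr.txt", show_measure=True)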
self.conn.commit() + +def L_LLR(a,b,c): + '''Auxiliar function to calculate Log Likelihood Ratio''' + L=(c**a)*((1-c)**(b-a)) + return(L) + +class myBigramAssocMeasures(nltk.collocations.BigramAssocMeasures): + """ + A collection of bigram association measures. Each association measure + is provided as a function with three arguments:: + + bigram_score_fn(n_ii, (n_ix, n_xi), n_xx) + + The arguments constitute the marginals of a contingency table, counting + the occurrences of particular events in a corpus. The letter i in the + suffix refers to the appearance of the word in question, while x indicates + the appearance of any word. Thus, for example: + + n_ii counts (w1, w2), i.e. the bigram being scored + n_ix counts (w1, *) + n_xi counts (*, w2) + n_xx counts (*, *), i.e. any bigram + + This may be shown with respect to a contingency table:: + + w1 ~w1 + ------ ------ + w2 | n_ii | n_oi | = n_xi + ------ ------ + ~w2 | n_io | n_oo | + ------ ------ + = n_ix TOTAL = n_xx + + Amb la terminologia de Pazienza + + w1 ~w1 + ------ ------ + w2 | n_ii O11| n_oi O12| = n_xi + ------ ------ + ~w2 | n_io O21| n_oo O22| + ------ ------ + = n_ix TOTAL = n_xx + + N=O11+O12+O21+O22= n_xx + R1=O11+O12 = n_xi + R2=O21+O22 + C1=O11+O21=n_ix + C2=O12+O22= n_oi+n_oo + + TENIM: n_ii, n_ix_xi_tuple, n_xx + n_io=n_ix-n_ii + n_oi=n_xi-n_ii + n_oo=n_xx-n_ix-n_xi + """ + #MEASURES NOT IMPLEMENTED IN NLTK + + def loglikelihood(self,n_ii, n_ix_xi_tuple, n_xx): + '''LogLikelihood according to NSP''' + (n_ix, n_xi) = n_ix_xi_tuple + n_oo=n_xx-n_ix-n_xi + n_io=n_ix-n_ii + n_oi=n_xi-n_ii + n_oo=n_xx-n_ix-n_xi + + n11=n_ii + n12=n_io + n21=n_oi + n22=n_oo + n1p=n11+n12 + np1=n11+n21 + n2p=n21+n22 + np2=n12+n22 + npp=n_xx + + m11 = (n1p*np1/npp) + m12 = (n1p*np2/npp) + m21 = (np1*n2p/npp) + m22 = (n2p*np2/npp) + try: + LogLikelihood = 2 * (n11 * math.log((n11/m11),2) + n12 * math.log((n12/m12),2) + n21 * math.log((n21/m21),2) + n22 * math.log((n22/m22),2)) + except: + LogLikelihood=0 + return(LogLikelihood) + + def MI(self,n_ii, n_ix_xi_tuple, n_xx): + '''Church Mutual Information accoding to Pazienza''' + (n_ix, n_xi) = n_ix_xi_tuple + self.E11=n_xi*n_ix/n_xx + self.part=n_ii/self.E11 + self.MI=math.log(self.part,2) + return(self.MI) + + def MI2(self,n_ii, n_ix_xi_tuple, n_xx): + '''Church Mutual Information Variant accoding to Pazienza''' + (n_ix, n_xi) = n_ix_xi_tuple + self.E11=n_xi*n_ix/n_xx + self.part=(n_ii/self.E11)**2 + self.MI2=math.log(self.part,2) + return(self.MI2) + + def MI3(self,n_ii, n_ix_xi_tuple, n_xx): + '''Church Mutual Information Variant accoding to Pazienza''' + (n_ix, n_xi) = n_ix_xi_tuple + self.E11=n_xi*n_ix/n_xx + self.part=(n_ii/self.E11)**3 + self.MI3=math.log(self.part,2) + return(self.MI3) + + def odds(self,n_ii, n_ix_xi_tuple, n_xx): + '''Odds ratio according to NSP''' + (n_ix, n_xi) = n_ix_xi_tuple + n_oo=n_xx-n_ix-n_xi + n_io=n_ix-n_ii + n_oi=n_xi-n_ii + n_oo=n_xx-n_ix-n_xi + + n11=n_ii + n12=n_io + n21=n_oi + n22=n_oo + n1p=n11+n12 + np1=n11+n21 + n2p=n21+n22 + np2=n12+n22 + npp=n_xx + + m11 = (n1p*np1/npp) + m12 = (n1p*np2/npp) + m21 = (np1*n2p/npp) + m22 = (n2p*np2/npp) + + if n21==0:n21=1 + if n12==0:n12=1 + ODDS_RATIO = (n11*n22)/(n21*n12) + return(ODDS_RATIO) + + def z_score(self,n_ii, n_ix_xi_tuple, n_xx): + '''z-score ratio according to NSP''' + (n_ix, n_xi) = n_ix_xi_tuple + n_oo=n_xx-n_ix-n_xi + n_io=n_ix-n_ii + n_oi=n_xi-n_ii + n_oo=n_xx-n_ix-n_xi + + n11=n_ii + n12=n_io + n21=n_oi + n22=n_oo + n1p=n11+n12 + np1=n11+n21 + n2p=n21+n22 + np2=n12+n22 + npp=n_xx + + m11 = 
(n1p*np1/npp) + m12 = (n1p*np2/npp) + m21 = (np1*n2p/npp) + m22 = (n2p*np2/npp) + + zscore = (n11-m11)/(math.sqrt(m11)) + return(zscore) + + +class myTrigramAssocMeasures(nltk.collocations.TrigramAssocMeasures): + pass + +class myQuadgramAssocMeasures(nltk.collocations.QuadgramAssocMeasures): + pass + + + + + +###STUFF FROM MAP EMBEDDINGS MIKEL ARTETXE### +#cupy_utils +import numpy + +try: + import cupy +except ImportError: + cupy = None + + +def supports_cupy(): + return cupy is not None + + +def get_cupy(): + return cupy + + +def get_array_module(x): + if cupy is not None: + return cupy.get_array_module(x) + else: + return numpy + + +def asnumpy(x): + if cupy is not None: + return cupy.asnumpy(x) + else: + return numpy.asarray(x) +#embeddings + +def embeddings_read(file, threshold=0, vocabulary=None, dtype='float'): + header = file.readline().split(' ') + count = int(header[0]) if threshold <= 0 else min(threshold, int(header[0])) + dim = int(header[1]) + words = [] + matrix = np.empty((count, dim), dtype=dtype) if vocabulary is None else [] + for i in range(count): + word, vec = file.readline().split(' ', 1) + if vocabulary is None: + words.append(word) + matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype) + elif word in vocabulary: + words.append(word) + matrix.append(np.fromstring(vec, sep=' ', dtype=dtype)) + return (words, matrix) if vocabulary is None else (words, np.array(matrix, dtype=dtype)) + + +def embeddings_write(words, matrix, file): + m = asnumpy(matrix) + print('%d %d' % m.shape, file=file) + for i in range(len(words)): + print(words[i] + ' ' + ' '.join(['%.6g' % x for x in m[i]]), file=file) + + +def embeddings_length_normalize(matrix): + xp = get_array_module(matrix) + norms = xp.sqrt(xp.sum(matrix**2, axis=1)) + norms[norms == 0] = 1 + matrix /= norms[:, xp.newaxis] + + +def embeddings_mean_center(matrix): + xp = get_array_module(matrix) + avg = xp.mean(matrix, axis=0) + matrix -= avg + + +def embeddings_length_normalize_dimensionwise(matrix): + xp = get_array_module(matrix) + norms = xp.sqrt(xp.sum(matrix**2, axis=0)) + norms[norms == 0] = 1 + matrix /= norms + + +def embeddings_mean_center_embeddingwise(matrix): + xp = get_array_module(matrix) + avg = xp.mean(matrix, axis=1) + matrix -= avg[:, xp.newaxis] + + +def embeddings_normalize(matrix, actions): + for action in actions: + if action == 'unit': + embeddings_length_normalize(matrix) + elif action == 'center': + embeddings_mean_center(matrix) + elif action == 'unitdim': + embeddings_length_normalize_dimensionwise(matrix) + elif action == 'centeremb': + embeddings_mean_center_embeddingwise(matrix) + +#map_embeddings + +def dropout(m, p): + if p <= 0.0: + return m + else: + xp = get_array_module(m) + mask = xp.random.rand(*m.shape) >= p + return m*mask + + +def topk_mean(m, k, inplace=False): # TODO Assuming that axis is 1 + xp = get_array_module(m) + n = m.shape[0] + ans = xp.zeros(n, dtype=m.dtype) + if k <= 0: + return ans + if not inplace: + m = xp.array(m) + ind0 = xp.arange(n) + ind1 = xp.empty(n, dtype=int) + minimum = m.min() + for i in range(k): + m.argmax(axis=1, out=ind1) + ans += m[ind0, ind1] + m[ind0, ind1] = minimum + return ans / k + +def supervised_mapping(src_input,trg_input,src_output,trg_output,init_dictionary,encoding="utf-8",precision="fp32",cuda=False,batch_size=1000,seed=0,unsupervised_vocab=0,src_reweight=0,trg_reweight=0,dim_reduction=0,vocabulary_cutoff=0,direction="union",csls=0,threshold=0.000001,validation=None,stochastic_initial=0.1,stochastic_multiplier=2.0,stochastic_interval=50): + 
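The embedding helpers above can also be used on their own, for instance to read a mapped embedding file in word2vec text format and apply the same unit/center normalization chain used by supervised_mapping below (the file name is hypothetical):

    with open("mappedSL.vec", encoding="utf-8", errors="surrogateescape") as f:
        words, matrix = embeddings_read(f, dtype="float32")
    embeddings_normalize(matrix, ["unit", "center", "unit"])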
+    self_learning=False
+    print("SUPERVISED")
+    #parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
+    # Hard-coded defaults for the supervised mode; they override the corresponding
+    # keyword arguments passed to this function.
+    normalize=['unit', 'center', 'unit']
+    whiten=True
+    src_reweight=0.5
+    trg_reweight=0.5
+    src_dewhiten='src'
+    trg_dewhiten='trg'
+    batch_size=1000
+    cuda=False
+    identical=False
+    unsupervised=False
+    init_identical=False
+    init_numerals=False
+    init_unsupervised=False
+    orthogonal=False
+    unconstrained=False
+    verbose=False
+    csls_neighborhood=csls  # the training loop below refers to csls_neighborhood
+    log=None                # no log file is used in this adaptation
+    if precision == 'fp16':
+        dtype = 'float16'
+    elif precision == 'fp32':
+        dtype = 'float32'
+    elif precision == 'fp64':
+        dtype = 'float64'
+
+    # Read input embeddings
+    print("Read input embeddings")
+    srcfile = open(src_input, encoding=encoding, errors='surrogateescape')
+    trgfile = open(trg_input, encoding=encoding, errors='surrogateescape')
+    src_words, x = embeddings_read(srcfile, dtype=dtype)
+    trg_words, z = embeddings_read(trgfile, dtype=dtype)
+
+    # NumPy/CuPy management
+    if cuda:
+        if not supports_cupy():
+            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
+            sys.exit(-1)
+        xp = get_cupy()
+        x = xp.asarray(x)
+        z = xp.asarray(z)
+    else:
+        xp = np
+    xp.random.seed(seed)
+
+    # Build word to index map
+    print("Build word to index map")
+    src_word2ind = {word: i for i, word in enumerate(src_words)}
+    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
+
+    # STEP 0: Normalization
+    print("STEP 0: Normalization")
+    embeddings_normalize(x, normalize)
+    embeddings_normalize(z, normalize)
+
+    # Build the seed dictionary
+    print("Build the seed dictionary")
+    src_indices = []
+    trg_indices = []
+
+    f = open(init_dictionary, encoding=encoding, errors='surrogateescape')
+    for line in f:
+        src, trg = line.split()
+        try:
+            src_ind = src_word2ind[src]
+            trg_ind = trg_word2ind[trg]
+            src_indices.append(src_ind)
+            trg_indices.append(trg_ind)
+        except KeyError:
+            pass
+            #print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
+
+    # Read validation dictionary
+    if validation is not None:
+        f = open(validation, encoding=encoding, errors='surrogateescape')
+        validation = collections.defaultdict(set)
+        oov = set()
+        vocab = set()
+        for line in f:
+            src, trg = line.split()
+            try:
+                src_ind = src_word2ind[src]
+                trg_ind = trg_word2ind[trg]
+                validation[src_ind].add(trg_ind)
+                vocab.add(src)
+            except KeyError:
+                oov.add(src)
+        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
+        validation_coverage = len(validation) / (len(validation) + len(oov))
+
+
+    # Allocate memory
+    print("Allocate memory")
+    xw = xp.empty_like(x)
+    zw = xp.empty_like(z)
+    src_size = x.shape[0] if vocabulary_cutoff <= 0 else min(x.shape[0], vocabulary_cutoff)
+    trg_size = z.shape[0] if vocabulary_cutoff <= 0 else min(z.shape[0], vocabulary_cutoff)
+    simfwd = xp.empty((batch_size, trg_size), dtype=dtype)
+    simbwd = xp.empty((batch_size, src_size), dtype=dtype)
+    if validation is not None:
+        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)
+
+    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
+    src_indices_forward = xp.arange(src_size)
+    trg_indices_forward = xp.zeros(src_size, dtype=int)
+    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
+    src_indices_backward = xp.zeros(trg_size, dtype=int)
+    trg_indices_backward = xp.arange(trg_size)
+    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
+    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)
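+
+    # Descriptive note (added, not in the original): with self_learning=False the
+    # loop below effectively runs a single pass, because end is True from the start:
+    # the "advanced mapping" branch is executed once and the loop breaks before any
+    # dictionary update. The mapping itself proceeds, as the code reads, in order:
+    #   1. whitening of both sides on the seed-dictionary rows, using
+    #      W = V diag(1/s) V^T from the SVD m = U diag(s) V^T,
+    #   2. an orthogonal map from the SVD of X^T Z over the seed pairs,
+    #   3. re-weighting by the singular values (s**0.5 on each side here),
+    #   4. de-whitening back into the original spaces,
+    #   5. optional dimensionality reduction.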
+
+    # Training loop
+    print("Training loop")
+    best_objective = objective = -100.
+    it = 1
+    last_improvement = 0
+    keep_prob = stochastic_initial
+    t = time.time()
+    end = not self_learning
+    while True:
+
+        # Increase the keep probability if we have not improved in stochastic_interval iterations
+        if it - last_improvement > stochastic_interval:
+            if keep_prob >= 1.0:
+                end = True
+            keep_prob = min(1.0, stochastic_multiplier*keep_prob)
+            last_improvement = it
+
+        # Update the embedding mapping
+        if orthogonal or not end:  # orthogonal mapping
+            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
+            w = vt.T.dot(u.T)
+            x.dot(w, out=xw)
+            zw[:] = z
+        elif unconstrained:  # unconstrained mapping
+            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
+            w = x_pseudoinv.dot(z[trg_indices])
+            x.dot(w, out=xw)
+            zw[:] = z
+        else:  # advanced mapping
+
+            # TODO xw.dot(wx2, out=xw) and alike not working
+            xw[:] = x
+            zw[:] = z
+
+            # STEP 1: Whitening
+            def whitening_transformation(m):
+                u, s, vt = xp.linalg.svd(m, full_matrices=False)
+                return vt.T.dot(xp.diag(1/s)).dot(vt)
+            if whiten:
+                wx1 = whitening_transformation(xw[src_indices])
+                wz1 = whitening_transformation(zw[trg_indices])
+                xw = xw.dot(wx1)
+                zw = zw.dot(wz1)
+
+            # STEP 2: Orthogonal mapping
+            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
+            wz2 = wz2_t.T
+            xw = xw.dot(wx2)
+            zw = zw.dot(wz2)
+
+            # STEP 3: Re-weighting
+            xw *= s**src_reweight
+            zw *= s**trg_reweight
+
+            # STEP 4: De-whitening
+            if src_dewhiten == 'src':
+                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
+            elif src_dewhiten == 'trg':
+                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
+            if trg_dewhiten == 'src':
+                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
+            elif trg_dewhiten == 'trg':
+                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
+
+            # STEP 5: Dimensionality reduction
+            if dim_reduction > 0:
+                xw = xw[:, :dim_reduction]
+                zw = zw[:, :dim_reduction]
+
+        # Self-learning
+        if end:
+            break
+        else:
+            # Update the training dictionary
+            if direction in ('forward', 'union'):
+                if csls_neighborhood > 0:
+                    for i in range(0, trg_size, simbwd.shape[0]):
+                        j = min(i + simbwd.shape[0], trg_size)
+                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
+                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=csls_neighborhood, inplace=True)
+                for i in range(0, src_size, simfwd.shape[0]):
+                    j = min(i + simfwd.shape[0], src_size)
+                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
+                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
+                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
+                    dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
+            if direction in ('backward', 'union'):
+                if csls_neighborhood > 0:
+                    for i in range(0, src_size, simfwd.shape[0]):
+                        j = min(i + simfwd.shape[0], src_size)
+                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
+                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=csls_neighborhood, inplace=True)
+                for i in range(0, trg_size, simbwd.shape[0]):
+                    j = min(i + simbwd.shape[0], trg_size)
+                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
+                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
+                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
+                    dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
+            if direction == 'forward':
+                src_indices = src_indices_forward
+                trg_indices = trg_indices_forward
+            elif direction == 'backward':
+                src_indices = src_indices_backward
+                trg_indices = trg_indices_backward
+            elif direction == 'union':
+                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
+                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
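+
+            # Descriptive note (added): the dictionary update above uses CSLS-style
+            # retrieval: each similarity is penalized by half the mean similarity of
+            # the csls_neighborhood nearest neighbours in the other direction
+            # (knn_sim_fwd / knn_sim_bwd), and dropout() discards a random subset of
+            # candidates while keep_prob < 1. The objective computed below is the mean
+            # best similarity of the induced dictionary, averaged over both directions
+            # when direction == 'union'.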
+
+            # Objective function evaluation
+            if direction == 'forward':
+                objective = xp.mean(best_sim_forward).tolist()
+            elif direction == 'backward':
+                objective = xp.mean(best_sim_backward).tolist()
+            elif direction == 'union':
+                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
+            if objective - best_objective >= threshold:
+                last_improvement = it
+                best_objective = objective
+
+            # Accuracy and similarity evaluation in validation
+            if validation is not None:
+                src = list(validation.keys())
+                xw[src].dot(zw.T, out=simval)
+                nn = asnumpy(simval.argmax(axis=1))
+                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
+                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])
+
+            # Logging
+            duration = time.time() - t
+            if verbose:
+                print(file=sys.stderr)
+                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
+                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
+                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
+                if validation is not None:
+                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
+                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
+                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
+                sys.stderr.flush()
+            if log is not None:
+                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
+                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if validation is not None else ''
+                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
+                log.flush()
+
+            t = time.time()
+            it += 1
+
+    # Write mapped embeddings
+    print("Write mapped embeddings")
+    srcfile = open(src_output, mode='w', encoding=encoding, errors='surrogateescape')
+    trgfile = open(trg_output, mode='w', encoding=encoding, errors='surrogateescape')
+    embeddings_write(src_words, xw, srcfile)
+    embeddings_write(trg_words, zw, trgfile)
+    srcfile.close()
+    trgfile.close()
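+
+# ---------------------------------------------------------------------------
+# Minimal usage sketch (illustrative only, not part of the original module).
+# It assumes two monolingual word2vec-format text embedding files and a plain
+# "src trg" seed dictionary; all file names below are hypothetical placeholders.
+#
+#   supervised_mapping("embeddings.en.vec", "embeddings.es.vec",
+#                      "mapped.en.vec", "mapped.es.vec",
+#                      init_dictionary="seed-dict.en-es.txt")
+#
+# After the call, "mapped.en.vec" and "mapped.es.vec" hold the two vocabularies
+# projected into a shared cross-lingual space; they can be loaded back with
+# embeddings_read() above or with gensim's KeyedVectors.load_word2vec_format().
+# ---------------------------------------------------------------------------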