
github.com/aoliverg/TBXTools.git
-rw-r--r--  TBXTools.py  3959
1 file changed, 3959 insertions, 0 deletions
diff --git a/TBXTools.py b/TBXTools.py
new file mode 100644
index 0000000..3005340
--- /dev/null
+++ b/TBXTools.py
@@ -0,0 +1,3959 @@
+# TBXTools
+# version: 2022/05/05
+# Copyright: Antoni Oliver (2022) - Universitat Oberta de Catalunya - aoliverg@uoc.edu
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import codecs
+import sqlite3
+import xml.etree.ElementTree as etree
+
+import nltk
+from nltk.util import ngrams
+from nltk.probability import FreqDist
+from nltk.collocations import *
+import re
+import pickle
+import gzip
+import operator
+import sys
+import math
+import csv
+
+import string
+
+import importlib
+
+import gensim
+from gensim.models import Word2Vec
+from gensim.models import KeyedVectors
+import numpy
+import collections
+import numpy as np
+
+import time
+
+try:
+    import spacy
+except ImportError:
+    pass
+
+try:
+    import spacy_udpipe
+except ImportError:
+    pass
+import subprocess
+import openpyxl
+from openpyxl import load_workbook
+
+class TBXTools:
+ '''Class for automatic terminology extraction and terminology management.'''
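+    # Minimal usage sketch (hypothetical file names); project creation and corpus
+    # loading use the methods defined below, and the extraction itself is then run
+    # with the extraction methods of this class:
+    #   extractor = TBXTools()
+    #   extractor.create_project("sample.sqlite", overwrite=True)
+    #   extractor.load_sl_corpus("corpus-en.txt")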
+ def __init__(self):
+ self.maxinserts=10000 #controls the maximum number of inserts in memory
+ self.sl_lang=""
+ self.tl_lang=""
+ self.max_id_corpus=0
+
+ self.sl_stopwords=[]
+ self.tl_stopwords=[]
+ self.sl_inner_stopwords=[]
+ self.tl_inner_stopwords=[]
+        self.sl_exclusion_regexps=[]
+ self.tl_exclusion_regexps=[]
+ self.sl_morphonorm_rules=[]
+ self.tl_morphonorm_rules=[]
+ self.evaluation_terms={}
+ self.tsr_terms=[]
+ self.exclusion_terms={}
+ self.exclusion_no_terms={}
+ self.ngrams={}
+ self.tagged_ngrams={}
+ self.term_candidates={}
+ self.linguistic_patterns={}
+
+ self.knownterms=[]
+ self.n_min=1
+ self.n_max=5
+
+ self.n_min_pos_patterns=1000
+ self.n_max_pos_patterns=1
+
+ self.punctuation=string.punctuation
+ self.sl_stopwords.extend(self.punctuation)
+ self.tl_stopwords.extend(self.punctuation)
+ self.sl_inner_stopwords.extend(self.punctuation)
+ self.tl_inner_stopwords.extend(self.punctuation)
+
+ self.specificSLtokenizer=False
+ self.specificTLtokenizer=False
+
+ self.SLtokenizer=None
+ self.TLtokenizer=None
+
+
+
+
+ def create_project(self,project_name,sllang=None, tllang=None,overwrite=False):
+        '''Creates a new project. If a project file with the same name already exists, it raises an exception; use overwrite=True to replace it. To open an existing project, use the open_project method.'''
+        #sllang and tllang are no longer used.
+ if os.path.isfile(project_name) and not overwrite:
+ raise Exception("This file already exists")
+
+ else:
+ if os.path.isfile(project_name) and overwrite:
+ os.remove(project_name)
+ self.conn=sqlite3.connect(project_name)
+ self.cur = self.conn.cursor()
+ self.cur2 = self.conn.cursor()
+ with self.conn:
+ self.cur = self.conn.cursor()
+ self.cur.execute("CREATE TABLE sl_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)")
+ self.cur.execute("CREATE TABLE tl_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)")
+ self.cur.execute("CREATE TABLE parallel_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, segmentSL, segmentTL TEXT)")
+ self.cur.execute("CREATE TABLE tagged_parallel_corpus(id INTEGER PRIMARY KEY, tagged_segmentSL, tagged_segmentTL TEXT)")
+ self.cur.execute("CREATE TABLE sl_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)")
+ self.cur.execute("CREATE TABLE tl_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, segment TEXT)")
+ self.cur.execute("CREATE TABLE sl_tagged_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)")
+ self.cur.execute("CREATE TABLE tl_tagged_corpus(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)")
+ self.cur.execute("CREATE TABLE sl_tagged_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)")
+ self.cur.execute("CREATE TABLE tl_tagged_corpus_c(id INTEGER PRIMARY KEY AUTOINCREMENT, tagged_segment TEXT)")
+ self.cur.execute("CREATE TABLE sl_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_stopword TEXT)")
+ self.cur.execute("CREATE TABLE sl_inner_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_inner_stopword TEXT)")
+ self.cur.execute("CREATE TABLE tl_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_stopword TEXT)")
+ self.cur.execute("CREATE TABLE tl_inner_stopwords (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_inner_stopword TEXT)")
+ self.cur.execute("CREATE TABLE sl_exclusion_regexps (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_exclusion_regexp TEXT)")
+ self.cur.execute("CREATE TABLE tl_exclusion_regexps (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_exclusion_regexp TEXT)")
+ self.cur.execute("CREATE TABLE sl_morphonorm_rules (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_morphonorm_rule TEXT)")
+ self.cur.execute("CREATE TABLE tl_morphonorm_rules (id INTEGER PRIMARY KEY AUTOINCREMENT, tl_morphonorm_rule TEXT)")
+ self.cur.execute("CREATE TABLE evaluation_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)")
+ self.cur.execute("CREATE TABLE reference_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)")
+ self.cur.execute("CREATE TABLE validated_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)")
+ self.cur.execute("CREATE TABLE compoundify_terms_sl (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)")
+ self.cur.execute("CREATE TABLE compoundify_terms_tl (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)")
+ self.cur.execute("CREATE TABLE tsr_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)")
+ self.cur.execute("CREATE TABLE tosearch_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, term TEXT)")
+ self.cur.execute("CREATE TABLE exclusion_terms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)")
+ self.cur.execute("CREATE TABLE exclusion_noterms (id INTEGER PRIMARY KEY AUTOINCREMENT, sl_term TEXT, tl_term TEXT)")
+ self.cur.execute("CREATE TABLE tokens (id INTEGER PRIMARY KEY AUTOINCREMENT, token TEXT, frequency INTEGER)")
+ self.cur.execute("CREATE TABLE ngrams (id INTEGER PRIMARY KEY AUTOINCREMENT, ngram TEXT, n INTEGER, frequency INTEGER)")
+ self.cur.execute("CREATE TABLE tagged_ngrams (id INTEGER PRIMARY KEY AUTOINCREMENT, ngram TEXT, tagged_ngram TEXT, n INTEGER, frequency INTEGER)")
+
+ self.cur.execute("CREATE INDEX indextaggedngram on tagged_ngrams (ngram);")
+
+ self.cur.execute("CREATE TABLE embeddings_sl (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, embedding BLOB)")
+ self.cur.execute("CREATE INDEX indexembeddings_sl on embeddings_sl (candidate);")
+
+ self.cur.execute("CREATE TABLE embeddings_sl_ref (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, embedding BLOB)")
+ self.cur.execute("CREATE INDEX indexembeddings_sl_ref on embeddings_sl_ref (candidate);")
+
+ self.cur.execute("CREATE TABLE embeddings_tl (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, embedding BLOB)")
+ self.cur.execute("CREATE INDEX indexembeddings_tl on embeddings_tl (candidate);")
+
+ self.cur.execute("CREATE TABLE term_candidates (id INTEGER PRIMARY KEY AUTOINCREMENT, candidate TEXT, n INTEGER, frequency INTEGER, measure TEXT, value FLOAT)")
+ self.cur.execute("CREATE TABLE index_pt(id INTEGER PRIMARY KEY AUTOINCREMENT, source TEXT, target TEXT, probability FLOAT)")
+ self.cur.execute("CREATE INDEX index_index_pt on index_pt (source);")
+ self.cur.execute("CREATE TABLE linguistic_patterns (id INTEGER PRIMARY KEY AUTOINCREMENT, linguistic_pattern TEXT)")
+
+ self.conn.commit()
+
+ def open_project(self,project_name):
+ '''Opens an existing project. If the project doesn't exist it raises an exception.'''
+ if not os.path.isfile(project_name):
+ raise Exception("Project not found")
+ else:
+ self.conn=sqlite3.connect(project_name)
+ self.cur = self.conn.cursor()
+ self.cur2 = self.conn.cursor()
+
+
+ #METODES DELETES
+ def delete_configuration(self):
+ '''Deletes the project configuration.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM configuration')
+ self.conn.commit()
+
+ def delete_sl_corpus(self):
+        '''Deletes the source language corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_corpus')
+ self.conn.commit()
+
+ def delete_tl_corpus(self):
+        '''Deletes the target language corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_corpus')
+ self.conn.commit()
+
+ def delete_parallel_corpus(self):
+        '''Deletes the parallel corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM parallel_corpus')
+ self.conn.commit()
+
+ def delete_sl_corpus_c(self):
+        '''Deletes the source language contrast corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_corpus_c')
+ self.conn.commit()
+
+ def delete_tl_corpus_c(self):
+        '''Deletes the target language contrast corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_corpus_c')
+ self.conn.commit()
+
+ def delete_sl_tagged_corpus(self):
+ '''Deletes the source language tagged corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_tagged_corpus')
+ self.conn.commit()
+
+ def delete_tl_tagged_corpus(self):
+ '''Deletes the target language tagged corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_tagged_corpus')
+ self.conn.commit()
+
+ def delete_sl_tagged_corpus_c(self):
+ '''Deletes the source language contrast tagged corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_tagged_corpus_c')
+ self.conn.commit()
+
+ def delete_tl_tagged_corpus_c(self):
+ '''Deletes the target language contrast tagged corpus.'''
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_tagged_corpus_c')
+ self.conn.commit()
+
+ def delete_sl_stopwords(self):
+ '''Deletes the stop-words for the source language.'''
+ #self.sl_stopwords=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_stopwords')
+ self.conn.commit()
+
+ def delete_tl_stopwords(self):
+        '''Deletes the stop-words for the target language.'''
+ #self.tl_stopwords=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_stopwords')
+ self.conn.commit()
+
+ def delete_sl_inner_stopwords(self):
+ '''Deletes the inner stop-words for the source language.'''
+ #self.sl_inner_stopwords=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_inner_stopwords')
+ self.conn.commit()
+
+ def delete_tl_inner_stopwords(self):
+        '''Deletes the inner stop-words for the target language.'''
+ #self.tl_inner_stopwords=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_inner_stopwords')
+ self.conn.commit()
+
+ def delete_sl_exclusion_regexps(self):
+ '''Deletes the exclusion regular expressions for the source language.'''
+ #self.sl_exclusion_regexps=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_exclusion_regexps')
+ self.conn.commit()
+
+ def delete_tl_exclusion_regexps(self):
+ '''Deletes the exclusion regular expressions for the target language.'''
+ #self.tl_exclusion_regexps=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_exclusion_regexps')
+ self.conn.commit()
+
+ def delete_sl_morphonorm_rules(self):
+ '''Deletes the morphological normalisation rules for the source language.'''
+ #self.sl_morphonorm_rules=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM sl_morphonorm_rules')
+ self.conn.commit()
+
+ def delete_tl_morphonorm_rules(self):
+ '''Deletes the morphological normalisation rules for the target language.'''
+ #self.tl_morphonorm_rules=[]
+ with self.conn:
+ self.cur.execute('DELETE FROM tl_morphonorm_rules')
+ self.conn.commit()
+
+ def delete_evaluation_terms(self):
+ '''Deletes the evaluation terms.'''
+ #self.evaluation_terms={}
+ with self.conn:
+ self.cur.execute('DELETE FROM evaluation_terms')
+ self.conn.commit()
+
+ def delete_reference_terms(self):
+ '''Deletes the reference terms.'''
+ #self.evaluation_terms={}
+ with self.conn:
+ self.cur.execute('DELETE FROM reference_terms')
+ self.conn.commit()
+
+ def delete_validated_terms(self):
+ '''Deletes the validated terms.'''
+ #self.evaluation_terms={}
+ with self.conn:
+ self.cur.execute('DELETE FROM validated_terms')
+ self.conn.commit()
+
+ def delete_compoundify_terms_sl(self):
+ '''Deletes the compoundify terms for the source language.'''
+ #self.exclusion_terms={}
+ with self.conn:
+ self.cur.execute('DELETE FROM compoundify_terms_sl')
+ self.conn.commit()
+
+ def delete_compoundify_terms_tl(self):
+ '''Deletes the compoundify terms for the target language.'''
+ #self.exclusion_terms={}
+ with self.conn:
+            self.cur.execute('DELETE FROM compoundify_terms_tl')
+ self.conn.commit()
+
+ def delete_tsr_terms(self):
+ '''Deletes the TSR terms.'''
+ #self.exclusion_terms={}
+ with self.conn:
+ self.cur.execute('DELETE FROM tsr_terms')
+ self.conn.commit()
+
+ def delete_exclusion_terms(self):
+ '''Deletes the exclusion terms.'''
+ #self.exclusion_terms={}
+ with self.conn:
+ self.cur.execute('DELETE FROM exclusion_terms')
+ self.conn.commit()
+
+ def delete_exclusion_no_terms(self):
+ '''Deletes the exclusion no terms.'''
+ #self.exclusion_terms={}
+ with self.conn:
+            self.cur.execute('DELETE FROM exclusion_noterms')
+ self.conn.commit()
+
+ def delete_tokens(self):
+ '''Deletes the tokens.'''
+ #self.ngrams={}
+ with self.conn:
+ self.cur.execute('DELETE FROM tokens')
+ self.conn.commit()
+
+ def delete_ngrams(self):
+ '''Deletes the ngrams.'''
+ #self.ngrams={}
+ with self.conn:
+ self.cur.execute('DELETE FROM ngrams')
+ self.conn.commit()
+
+ def delete_tagged_ngrams(self):
+ '''Deletes the tagged ngrams.'''
+ #self.tagged_ngrams={}
+ with self.conn:
+ self.cur.execute('DELETE FROM tagged_ngrams')
+ self.conn.commit()
+
+ def delete_embeddings_sl(self):
+ '''Deletes the embeddings for the source language.'''
+ #self.tagged_ngrams={}
+ with self.conn:
+ self.cur.execute('DELETE FROM embeddings_sl')
+ self.conn.commit()
+
+ def delete_embeddings_tl(self):
+ '''Deletes the embeddings for the target language.'''
+ #self.tagged_ngrams={}
+ with self.conn:
+ self.cur.execute('DELETE FROM embeddings_tl')
+ self.conn.commit()
+
+ def delete_term_candidates(self):
+ '''Deletes the term candidates.'''
+ #self.term_candidates={}
+ with self.conn:
+ self.cur.execute('DELETE FROM term_candidates')
+ self.conn.commit()
+
+ def delete_linguistic_patterns(self):
+ '''Deletes the linguistic patterns for linguistic terminology extraction.'''
+ #self.exclusion_terms={}
+ with self.conn:
+ self.cur.execute('DELETE FROM linguistic_patterns')
+ self.conn.commit()
+
+ def load_sl_corpus(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"):
+        '''Loads a monolingual corpus for the source language. It is recommended, but not compulsory, that the corpus is segmented (one segment per line). Use external tools to segment the corpus. A plain-text corpus (not segmented) can also be used.'''
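+        # Illustration of the compoundify option (hypothetical term): if the table
+        # compoundify_terms_sl contains "neural network", a segment such as
+        # "a neural network model" is stored as "a neural▁network model", so the
+        # multiword term behaves as a single token in later processing.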
+ if compoundify:
+ compterms=[]
+ self.cur.execute('SELECT term from compoundify_terms_sl')
+ data=self.cur.fetchall()
+ for d in data:
+ compterms.append(d[0])
+ cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore")
+ data=[]
+ continserts=0
+ for line in cf:
+ record=[]
+ line=line.rstrip()
+ if compoundify:
+ for compterm in compterms:
+                    if line.find(compterm)>=0:
+ comptermMOD=compterm.replace(" ",comp_symbol)
+ line=line.replace(compterm,comptermMOD)
+ record.append(line)
+ data.append(record)
+ continserts+=1
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_tl_corpus(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"):
+        '''Loads a monolingual corpus for the target language. It is recommended, but not compulsory, that the corpus is segmented (one segment per line). Use external tools to segment the corpus. A plain-text corpus (not segmented) can also be used.'''
+
+ if compoundify:
+ compterms=[]
+ self.cur.execute('SELECT term from compoundify_terms_tl')
+ data=self.cur.fetchall()
+ for d in data:
+ compterms.append(d[0])
+ cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore")
+ data=[]
+ continserts=0
+ for line in cf:
+ record=[]
+ line=line.rstrip()
+ if compoundify:
+ for compterm in compterms:
+                    if line.find(compterm)>=0:
+ comptermMOD=compterm.replace(" ",comp_symbol)
+ line=line.replace(compterm,comptermMOD)
+ record.append(line)
+ data.append(record)
+ continserts+=1
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_sl_corpus_c(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"):
+        '''Loads a monolingual contrast corpus for the source language. It is recommended, but not compulsory, that the corpus is segmented (one segment per line). Use external tools to segment the corpus. A plain-text corpus (not segmented) can also be used.'''
+ if compoundify:
+ compterms=[]
+ self.cur.execute('SELECT term from compoundify_terms_sl')
+ data=self.cur.fetchall()
+ for d in data:
+ compterms.append(d[0])
+ cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore")
+ data=[]
+ continserts=0
+ for line in cf:
+ record=[]
+ line=line.rstrip()
+ if compoundify:
+ for compterm in compterms:
+                    if line.find(compterm)>=0:
+ comptermMOD=compterm.replace(" ",comp_symbol)
+ line=line.replace(compterm,comptermMOD)
+ record.append(line)
+ data.append(record)
+ continserts+=1
+ if continserts==self.maxinserts:
+ cur.executemany("INSERT INTO sl_corpus_c (segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_corpus_c (segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_tl_corpus_c(self,corpusfile, encoding="utf-8", compoundify=False, comp_symbol="▁"):
+        '''Loads a monolingual contrast corpus for the target language. It is recommended, but not compulsory, that the corpus is segmented (one segment per line). Use external tools to segment the corpus. A plain-text corpus (not segmented) can also be used.'''
+
+ if compoundify:
+ compterms=[]
+ self.cur.execute('SELECT term from compoundify_terms_tl')
+ data=self.cur.fetchall()
+ for d in data:
+ compterms.append(d[0])
+ cf=codecs.open(corpusfile,"r",encoding=encoding,errors="ignore")
+ data=[]
+ continserts=0
+ for line in cf:
+ record=[]
+ line=line.rstrip()
+ if compoundify:
+ for compterm in compterms:
+                    if line.find(compterm)>=0:
+ comptermMOD=compterm.replace(" ",comp_symbol)
+ line=line.replace(compterm,comptermMOD)
+ record.append(line)
+ data.append(record)
+ continserts+=1
+ if continserts==self.maxinserts:
+ cur.executemany("INSERT INTO tl_corpus_c (segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_corpus_c (segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_parallel_corpus_Moses(self,slcorpusfile, tlcorpusfile, feed_monolingual=True, encoding="utf-8"):
+ '''Loads a parallel corpus in Moses format (that is, in two independent files). It expects one segment per line.'''
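+        # Usage sketch (hypothetical file names); line n of the source file must be
+        # aligned with line n of the target file:
+        #   extractor.load_parallel_corpus_Moses("corpus.en", "corpus.es")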
+ slcf=codecs.open(slcorpusfile,"r",encoding=encoding)
+ tlcf=codecs.open(tlcorpusfile,"r",encoding=encoding)
+ parallel_data=[]
+ sl_data=[]
+ tl_data=[]
+ parallel_data=[]
+ continserts=0
+ while 1:
+ sl_segment=slcf.readline()
+ if not sl_segment:
+ break
+ tl_segment=tlcf.readline()
+ continserts+=1
+ sl_record=[]
+ tl_record=[]
+ parallel_record=[]
+ sl_segment=sl_segment.rstrip()
+ tl_segment=tl_segment.rstrip()
+ parallel_record.append(sl_segment)
+ parallel_record.append(tl_segment)
+ sl_record.append(sl_segment)
+ tl_record.append(tl_segment)
+ parallel_data.append(parallel_record)
+ sl_data.append(sl_record)
+ tl_data.append(tl_record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data)
+ if feed_monolingual:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data)
+ parallel_data=[]
+ sl_data=[]
+ tl_data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data)
+ if feed_monolingual:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data)
+ self.conn.commit()
+
+ def load_parallel_corpus_tabtxt(self,corpusfile, feed_monolingual=True, reverse=False, encoding="utf-8"):
+        '''Loads a parallel corpus in tab-separated text format (source and target segments in the same file, separated by a tab). It expects one segment pair per line.'''
+ cf=codecs.open(corpusfile,"r",encoding=encoding)
+ parallel_data=[]
+ sl_data=[]
+ tl_data=[]
+ parallel_data=[]
+ parallel_data_rev=[]
+ continserts=0
+ for linia in cf:
+ linia=linia.rstrip()
+ camps=linia.split("\t")
+ if len(camps)>=2:
+ sl_segment=camps[0]
+ tl_segment=camps[1]
+ continserts+=1
+ sl_record=[]
+ tl_record=[]
+ parallel_record=[]
+ parallel_record_rev=[]
+ sl_segment=sl_segment.rstrip()
+ tl_segment=tl_segment.rstrip()
+ parallel_record.append(sl_segment)
+ parallel_record.append(tl_segment)
+ parallel_record_rev.append(tl_segment)
+ parallel_record_rev.append(sl_segment)
+ sl_record.append(sl_segment)
+ tl_record.append(tl_segment)
+ parallel_data.append(parallel_record)
+ parallel_data_rev.append(parallel_record_rev)
+ sl_data.append(sl_record)
+ tl_data.append(tl_record)
+ if continserts==self.maxinserts:
+ if reverse:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data_rev)
+ else:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data)
+ if feed_monolingual:
+ if reverse:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",tl_data)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",sl_data)
+ else:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data)
+ parallel_data=[]
+ parallel_data_rev=[]
+ sl_data=[]
+ tl_data=[]
+ continserts=0
+ with self.conn:
+ if reverse:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data_rev)
+ else:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",parallel_data)
+ if feed_monolingual:
+ if reverse:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",tl_data)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",sl_data)
+ else:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",sl_data)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",tl_data)
+ self.conn.commit()
+
+ def load_parallel_corpus_tmx(self,tmx_file, sl_code="", tl_code="", feed_monolingual=True):
+        '''Loads a parallel corpus from a TMX file. Source and target language codes should be given. The codes must be exactly the same as in the TMX file. A list of codes separated by commas is allowed.'''
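+        # Usage sketch (hypothetical file name and language codes); several codes
+        # per language can be given, separated by commas:
+        #   extractor.load_parallel_corpus_tmx("memory.tmx", sl_code="en,en-GB", tl_code="es,es-ES")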
+ continserts=0
+ slcodes=[]
+ for slc in sl_code.split(","):
+ slcodes.append(slc.strip())
+ tlcodes=[]
+ for tlc in tl_code.split(","):
+ tlcodes.append(tlc.strip())
+ data1=[]
+ data2=[]
+ datap=[]
+ sl_segment=""
+ tl_segment=""
+ current_lang=""
+ for event, elem in etree.iterparse(tmx_file,events=("start","end")):
+ if event=='start':
+ if elem.tag=="tu" and not sl_segment=="" and not tl_segment=="":
+ continserts+=1
+
+ record1=[]
+ record2=[]
+ recordp=[]
+ record1.append(sl_segment)
+ data1.append(record1)
+ record2.append(tl_segment)
+ data2.append(record2)
+ recordp.append(sl_segment)
+ recordp.append(tl_segment)
+ datap.append(recordp)
+ sl_segment=""
+ tl_segment=""
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap)
+ if feed_monolingual:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2)
+ data1=[]
+ data2=[]
+ datap=[]
+ continserts=0
+ self.conn.commit()
+ elif elem.tag=="tuv":
+ current_lang=elem.attrib['{http://www.w3.org/XML/1998/namespace}lang']
+ elif elem.tag=="seg":
+ if elem.text:
+ segmentext=elem.text
+ else:
+ segmentext=""
+ if current_lang in slcodes:
+ sl_segment=segmentext
+ if current_lang in tlcodes:
+ tl_segment=segmentext
+ with self.conn:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap)
+ if feed_monolingual:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2)
+ self.conn.commit()
+
+ def load_parallel_corpus_sdltm(self,sdltmfile, feed_monolingual=True):
+ '''Loads a parallel corpus from a SDLTM file.'''
+
+ connSDLTM=sqlite3.connect(sdltmfile)
+ curSDLTM = connSDLTM.cursor()
+ curSDLTM.execute('select source_segment,target_segment from translation_units;')
+ dataSDLTM=curSDLTM.fetchall()
+ data1=[]
+ data2=[]
+ datap=[]
+ continserts=0
+ for d in dataSDLTM:
+ ssxml=d[0]
+ tsxml=d[1]
+ record1=[]
+ record2=[]
+ recordp=[]
+ try:
+ rootSL = etree.fromstring(ssxml)
+ for text in rootSL.iter('Value'):
+ sltext="".join(text.itertext()).replace("\n"," ")
+ rootTL = etree.fromstring(tsxml)
+ for text in rootTL.iter('Value'):
+ tltext="".join(text.itertext()).replace("\n"," ")
+ if not sltext=="" and not tltext=="":
+ continserts+=1
+ record1.append(sltext)
+ data1.append(record1)
+ record2.append(tltext)
+ data2.append(record2)
+ recordp.append(sltext)
+ recordp.append(tltext)
+ datap.append(recordp)
+ except:
+ print("ERROR")
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap)
+ if feed_monolingual:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2)
+ data1=[]
+ data2=[]
+ datap=[]
+ continserts=0
+ self.conn.commit()
+ with self.conn:
+ self.cur.executemany("INSERT INTO parallel_corpus (segmentSL, segmentTL) VALUES (?,?)",datap)
+ if feed_monolingual:
+ self.cur.executemany("INSERT INTO sl_corpus (segment) VALUES (?)",data1)
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data2)
+ self.conn.commit()
+
+
+
+ def load_sl_tagged_corpus(self,corpusfile,format="TBXTools",encoding="utf-8"):
+ '''Loads the source language tagged corpus. 3 formats are allowed:
+ - TBXTools: The internal format used by TBXTools. One tagged segment per line.
+ f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn
+ - Freeling: One token per line and segments separated by blank lines
+ f1 l1 t1 p1
+ f2 l2 t2 p2
+ ...
+ fn ln tn pn
+        - CoNLL: One of the output formats given by the Stanford CoreNLP analyzer. One token per line and segments separated by blank lines
+ id1 f1 l1 t1 ...
+ id2 f2 l2 t2 ...
+ ...
+ idn fn ln tn ...
+ '''
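+        # Illustrative line in the TBXTools format (made-up tokens), with form,
+        # lemma, tag and probability separated by "|":
+        #   Terminology|terminology|NN|1 extraction|extraction|NN|1 works|work|VBZ|1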
+        validformats=["TBXTools","freeling","conll"]
+ #TODO: Raise exception if not a valid format.
+ cf=codecs.open(corpusfile,"r",encoding=encoding)
+ if format.lower()=="tbxtools":
+ data=[]
+ continserts=0
+ for line in cf:
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ record.append(line)
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="freeling":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[0]+"|"+camps[1]+"|"+camps[2]
+
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="conll":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",self.data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[1]+"|"+camps[2]+"|"+camps[3]
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_tl_tagged_corpus(self,corpusfile,format="TBXTools",encoding="utf-8"):
+ '''Loads the target language tagged corpus. 3 formats are allowed:
+ - TBXTools: The internal format used by TBXTools. One tagged segment per line.
+ f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn
+ - Freeling: One token per line and segments separated by blank lines
+ f1 l1 t1 p1
+ f2 l2 t2 p2
+ ...
+ fn ln tn pn
+        - CoNLL: One of the output formats given by the Stanford CoreNLP analyzer. One token per line and segments separated by blank lines
+ id1 f1 l1 t1 ...
+ id2 f2 l2 t2 ...
+ ...
+ idn fn ln tn ...
+ '''
+        validformats=["TBXTools","freeling","conll"]
+ #TODO: Raise exception if not a valid format.
+ cf=codecs.open(corpusfile,"r",encoding=encoding)
+ if format.lower()=="tbxtools":
+ data=[]
+ continserts=0
+ for line in cf:
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ record.append(line)
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="freeling":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[0]+"|"+camps[1]+"|"+camps[2]
+
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="conll":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",self.data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[1]+"|"+camps[2]+"|"+camps[3]
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_sl_tagged_corpus_c(self,corpusfile,format="TBXTools",encoding="utf-8"):
+        '''Loads the source language contrast tagged corpus. 3 formats are allowed:
+ - TBXTools: The internal format used by TBXTools. One tagged segment per line.
+ f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn
+ - Freeling: One token per line and segments separated by blank lines
+ f1 l1 t1 p1
+ f2 l2 t2 p2
+ ...
+ fn ln tn pn
+        - CoNLL: One of the output formats given by the Stanford CoreNLP analyzer. One token per line and segments separated by blank lines
+ id1 f1 l1 t1 ...
+ id2 f2 l2 t2 ...
+ ...
+ idn fn ln tn ...
+ '''
+        validformats=["TBXTools","freeling","conll"]
+ #TODO: Raise exception if not a valid format.
+ cf=codecs.open(corpusfile,"r",encoding=encoding)
+ if format.lower()=="tbxtools":
+ data=[]
+ continserts=0
+ for line in cf:
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ record.append(line)
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="freeling":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[0]+"|"+camps[1]+"|"+camps[2]
+
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="conll":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",self.data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[1]+"|"+camps[2]+"|"+camps[3]
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_tl_tagged_corpus_c(self,corpusfile,format="TBXTools",encoding="utf-8"):
+        '''Loads the target language contrast tagged corpus. 3 formats are allowed:
+ - TBXTools: The internal format used by TBXTools. One tagged segment per line.
+ f1|l1|t1|p1 f2|l2|t2|p2 ... fn|ln|tn|pn
+ - Freeling: One token per line and segments separated by blank lines
+ f1 l1 t1 p1
+ f2 l2 t2 p2
+ ...
+ fn ln tn pn
+        - CoNLL: One of the output formats given by the Stanford CoreNLP analyzer. One token per line and segments separated by blank lines
+ id1 f1 l1 t1 ...
+ id2 f2 l2 t2 ...
+ ...
+ idn fn ln tn ...
+ '''
+        validformats=["TBXTools","freeling","conll"]
+ #TODO: Raise exception if not a valid format.
+ cf=codecs.open(corpusfile,"r",encoding=encoding)
+ if format.lower()=="tbxtools":
+ data=[]
+ continserts=0
+ for line in cf:
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ record.append(line)
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="freeling":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[0]+"|"+camps[1]+"|"+camps[2]
+
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+ elif format.lower()=="conll":
+ data=[]
+ continserts=0
+ segment=[]
+ for line in cf:
+ line=line.rstrip()
+ if line=="":
+ continserts+=1
+ record=[]
+ record.append(" ".join(segment))
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",self.data)
+ data=[]
+ continserts=0
+ data=[]
+ self.conn.commit()
+ segment=[]
+
+ else:
+ camps=line.split()
+ token=camps[1]+"|"+camps[2]+"|"+camps[3]
+ segment.append(token)
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_tagged_corpus_c (tagged_segment) VALUES (?)",data)
+ self.conn.commit()
+
+
+ def load_sl_stopwords(self,fitxer,encoding="utf-8"):
+ '''Loads the stopwords for the source language.'''
+ fc=codecs.open(fitxer,"r",encoding)
+ data=[]
+ record=[]
+ while 1:
+ linia=fc.readline()
+ if not linia:
+ break
+ linia=linia.rstrip()
+ record.append(linia)
+ data.append(record)
+ record=[]
+
+ for punct in self.punctuation:
+ record.append(punct)
+ data.append(record)
+ record=[]
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_stopwords (sl_stopword) VALUES (?)",data)
+
+ def load_tl_stopwords(self,fitxer,encoding="utf-8"):
+ '''Loads the stopwords for the target language.'''
+ fc=codecs.open(fitxer,"r",encoding)
+ data=[]
+ record=[]
+ while 1:
+ linia=fc.readline()
+ if not linia:
+ break
+ linia=linia.rstrip()
+ record.append(linia)
+ data.append(record)
+ record=[]
+
+ for punct in self.punctuation:
+ record.append(punct)
+ data.append(record)
+ record=[]
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_stopwords (tl_stopword) VALUES (?)",data)
+
+ def load_sl_inner_stopwords(self,fitxer,encoding="utf-8"):
+        '''Loads the inner stopwords for the source language.'''
+ fc=codecs.open(fitxer,"r",encoding)
+ data=[]
+ record=[]
+ while 1:
+ linia=fc.readline()
+ if not linia:
+ break
+ linia=linia.rstrip()
+ record.append(linia)
+ data.append(record)
+ record=[]
+ for punct in self.punctuation:
+ record.append(punct)
+ data.append(record)
+ record=[]
+ with self.conn:
+ self.cur.executemany("INSERT INTO sl_inner_stopwords (sl_inner_stopword) VALUES (?)",data)
+
+ def load_tl_inner_stopwords(self,fitxer,encoding="utf-8"):
+ '''Loads the inner stopwords for the target language.'''
+ fc=codecs.open(fitxer,"r",encoding)
+ data=[]
+ record=[]
+ while 1:
+ linia=fc.readline()
+ if not linia:
+ break
+ linia=linia.rstrip()
+ record.append(linia)
+ data.append(record)
+ record=[]
+ for punct in self.punctuation:
+ record.append(punct)
+ data.append(record)
+ record=[]
+ with self.conn:
+ self.cur.executemany("INSERT INTO tl_inner_stopwords (tl_inner_stopword) VALUES (?)",data)
+
+ #evaluation terms
+ def load_evaluation_terms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the evaluation terms from a tabulated text.'''
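+        # Expected input (one pair per line, tab-separated); if a line contains only
+        # a source term, "_" is stored as its target:
+        #   source term<TAB>target term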
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ continserts=0
+ for line in cf:
+ line=line.rstrip()
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ camps=line.split("\t")
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(camps[0]).split()
+ else:
+ tokens=camps[0].split()
+ if len(camps)==1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append("_")
+ data.append(record)
+ elif len(camps)>1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append(camps[1])
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO evaluation_terms (sl_term,tl_term) VALUES (?,?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO evaluation_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+
+ def load_evaluation_terms_tbx(self,arxiu,sl_code="",tl_code="",encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the evaluation terms from a TBX file.'''
+ slcodes=[]
+ for slc in sl_code.split(","):
+ slcodes.append(slc.strip())
+ tlcodes=[]
+ for tlc in tl_code.split(","):
+ tlcodes.append(tlc.strip())
+ data=[]
+ slterm=[]
+ tlterm=[]
+ lang=""
+ for event, elem in etree.iterparse(arxiu,events=("start", "end")):
+ tag=elem.tag.replace(self.namespace(elem),"")
+ if event=="end" and tag in ["conceptEntry","termEntry"]:
+ if len(slterm)>0 and len(tlterm)>0:
+ record=[]
+ for slt in slterm:
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(slt).split()
+ else:
+ tokens=slt.split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ tlt=", ".join(tlterm)
+ record.append(slt)
+ record.append(tlt)
+ data.append(record)
+ record=[]
+ slterm=[]
+ tlterm=[]
+ elif event=="start" and tag=="langSec":
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in slcodes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in tlcodes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ elif event=="start" and tag=="term":
+ if lang in slcodes: slterm.append("".join(elem.itertext()).lstrip().rstrip())
+ elif lang in tlcodes: tlterm.append("".join(elem.itertext()).lstrip().rstrip())
+ self.cur.executemany("INSERT INTO evaluation_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+ #
+ def load_validated_terms(self,terms):
+ """Load a list of tuples containig source-target terms)."""
+ data=[]
+ for tupleTerms in terms:
+ record=[]
+ slterm=tupleTerms[0]
+ tlterm=tupleTerms[1]
+ record.append(slterm)
+ record.append(tlterm)
+ data.append(record)
+ self.cur.executemany("INSERT INTO validated_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+    def get_validated_terms(self):
+        '''Returns the validated terms as a list of [source term, target term] pairs.'''
+        self.cur.execute("SELECT sl_term, tl_term FROM validated_terms;")
+        validatedterms=[]
+        for s in self.cur.fetchall():
+            record=[]
+            record.append(s[0])
+            record.append(s[1])
+            validatedterms.append(record)
+        return(validatedterms)
+
+ #reference_terms
+ def load_reference_terms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000, reverse=False):
+ '''Loads the reference terms from a tabulated text.'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ continserts=0
+ for line in cf:
+ line=line.rstrip()
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ camps=line.split("\t")
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(camps[0]).split()
+ else:
+ tokens=camps[0].split()
+ if len(camps)==1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append("_")
+ data.append(record)
+ elif len(camps)>1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append(camps[1])
+ data.append(record)
+ if continserts==self.maxinserts:
+ if reverse:
+ self.cur.executemany("INSERT INTO reference_terms (tl_term,sl_term) VALUES (?,?)",data)
+ else:
+ self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ if reverse:
+ self.cur.executemany("INSERT INTO reference_terms (tl_term,sl_term) VALUES (?,?)",data)
+ else:
+ self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+
+ def load_reference_terms_tbx(self,arxiu,sl_code="",tl_code="",encoding="utf-8",nmin=0,nmax=1000):
+        '''Loads the reference terms from a TBX file.'''
+ slcodes=[]
+ for slc in sl_code.split(","):
+ slcodes.append(slc.strip())
+ tlcodes=[]
+ for tlc in tl_code.split(","):
+ tlcodes.append(tlc.strip())
+ data=[]
+ slterm=[]
+ tlterm=[]
+ lang=""
+ for event, elem in etree.iterparse(arxiu,events=("start", "end")):
+ tag=elem.tag.replace(self.namespace(elem),"")
+ if event=="end" and tag in ["conceptEntry","termEntry"]:
+ if len(slterm)>0 and len(tlterm)>0:
+ record=[]
+ for slt in slterm:
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(slt).split()
+ else:
+ tokens=slt.split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ tlt=", ".join(tlterm)
+ record.append(slt)
+ record.append(tlt)
+ data.append(record)
+ record=[]
+ slterm=[]
+ tlterm=[]
+ elif event=="start" and tag=="langSec":
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in slcodes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in tlcodes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ elif event=="start" and tag=="term":
+ if lang in slcodes: slterm.append("".join(elem.itertext()).lstrip().rstrip())
+ elif lang in tlcodes: tlterm.append("".join(elem.itertext()).lstrip().rstrip())
+ self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+
+    def load_reference_terms_csv(self,arxiu,encoding="utf-8",nmin=0,nmax=1000,CSVdelimiter=",",CSVquotechar='"',CSVescapechar=None,CSVSLTerm=1,CSVTLTerm=2):
+        '''Loads the reference terms from a CSV file. Column positions are 1-based.'''
+        csv_file=codecs.open(arxiu,"r",encoding=encoding)
+        csv_reader = csv.reader(csv_file, delimiter=CSVdelimiter, quotechar=CSVquotechar, escapechar=CSVescapechar)
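+        # Usage sketch (hypothetical file name): column positions are 1-based, so the
+        # call below reads the source term from the first column and the target term
+        # from the second one:
+        #   extractor.load_reference_terms_csv("reference-terms.csv", CSVSLTerm=1, CSVTLTerm=2)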
+ record=[]
+ data=[]
+ for row in csv_reader:
+ record.append(row[CSVSLTerm-1])
+ record.append(row[CSVTLTerm-1])
+ data.append(record)
+ record=[]
+ self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+    def load_reference_terms_excel(self,file,nmin=0,nmax=1000,sheet_name=1,first_row=1,sourceColumn="A",targetColumn="B"):
+        '''Loads the reference terms from an Excel file.'''
+ workbook = load_workbook(filename=file)
+ data=[]
+ for sheet_name in workbook.sheetnames:
+ sheet = workbook[sheet_name]
+ for row in sheet.rows:
+ source=""
+ target=""
+ record=[]
+ for cell in row:
+
+ if isinstance(cell, openpyxl.cell.cell.MergedCell):
+ # Skip this cell
+ continue
+ if cell.column_letter==sourceColumn:
+ source=cell.value
+ elif cell.column_letter==targetColumn:
+ target=cell.value
+ if not source=="" and not target=="":
+ record.append(source)
+ record.append(target)
+ data.append(record)
+ self.cur.executemany("INSERT INTO reference_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+
+ #compoundify_terms_sl
+ def load_compoundify_terms_sl_txt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the compoundify terms for the source language from a text file (one term per line).'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ continserts=0
+ for line in cf:
+ line=line.rstrip()
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ camps=line.split("\t")
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(camps[0]).split()
+ else:
+ tokens=camps[0].split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO compoundify_terms_sl (term) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO compoundify_terms_sl (term) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_compoundify_terms_sl_tbx(self,arxiu,code="",encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the compoundify terms for the source language from a TBX file.'''
+ codes=[]
+ for slc in code.split(","):
+ codes.append(slc.strip())
+ data=[]
+ term=[]
+ lang=""
+ for event, elem in etree.iterparse(arxiu,events=("start", "end")):
+ tag=elem.tag.replace(self.namespace(elem),"")
+ if event=="end" and tag in ["conceptEntry","termEntry"]:
+ if len(term)>0 and lang in codes:
+ record=[]
+ for slt in term:
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(slt).split()
+ else:
+ tokens=slt.split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(slt)
+ data.append(record)
+ record=[]
+ term=[]
+ elif event=="start" and tag=="langSec":
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in codes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ else:
+ lang=""
+ elif event=="start" and tag=="term":
+ if lang in codes:
+ term.append("".join(elem.itertext()).lstrip().rstrip())
+ self.cur.executemany("INSERT INTO compoundify_terms_sl (term) VALUES (?)",data)
+ self.conn.commit()
+
+ #compoundify_terms_tl
+ def load_compoundify_terms_tl_txt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the compoundify terms for the target language from a text file (one term per line).'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ continserts=0
+ for line in cf:
+ line=line.rstrip()
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ camps=line.split("\t")
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(camps[0]).split()
+ else:
+ tokens=camps[0].split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO compoundify_terms_tl (term) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO compoundify_terms_tl (term) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_compoundify_terms_tl_tbx(self,arxiu,code="",encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the compoundify terms for the target language from a TBX file.'''
+ codes=[]
+ for slc in code.split(","):
+ codes.append(slc.strip())
+ data=[]
+ term=[]
+ lang=""
+ for event, elem in etree.iterparse(arxiu,events=("start", "end")):
+ tag=elem.tag.replace(self.namespace(elem),"")
+ if event=="end" and tag in ["conceptEntry","termEntry"]:
+ if len(term)>0 and lang in codes:
+ record=[]
+ for slt in term:
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(slt).split()
+ else:
+ tokens=slt.split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(slt)
+ data.append(record)
+ record=[]
+ term=[]
+ elif event=="start" and tag=="langSec":
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in codes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ else:
+ lang=""
+ elif event=="start" and tag=="term":
+ if lang in codes:
+ term.append("".join(elem.itertext()).lstrip().rstrip())
+ self.cur.executemany("INSERT INTO compoundify_terms_tl (term) VALUES (?)",data)
+ self.conn.commit()
+
+ #tsr terms
+
+ def load_tsr_terms_txt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the TSR terms from a text file (one term per line).'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ continserts=0
+ for line in cf:
+ line=line.rstrip()
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ camps=line.split("\t")
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(camps[0]).split()
+ else:
+ tokens=camps[0].split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tsr_terms (term) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO tsr_terms (term) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_tosearch_terms(self,SLterms,encoding="utf-8",nmin=0,nmax=1000):
+        '''Loads the terms to search from a string, a text file (one term per line) or a list.'''
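+        # The SLterms argument may be a path to an existing file, a single term given
+        # as a string, or a Python list of terms; e.g. (hypothetical terms):
+        #   extractor.load_tosearch_terms(["neural network", "machine translation"])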
+ tofind=[]
+ if isinstance(SLterms, str):
+ if os.path.exists(SLterms):
+ entrada=codecs.open(SLterms)
+ for linia in entrada:
+ linia=linia.rstrip()
+ tofind.append(linia)
+ entrada.close()
+ else:
+ tofind.append(SLterms)
+ elif isinstance(SLterms, list):
+ tofind.extend(SLterms)
+ data=[]
+ continserts=0
+ for term in tofind:
+ continserts+=1
+ record=[]
+ if self.specificSLtokenizer:
+                tokens=self.SLtokenizer.tokenize(term).split()
+ else:
+ tokens=term.split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(term)
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO tosearch_terms (term) VALUES (?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO tosearch_terms (term) VALUES (?)",data)
+ self.conn.commit()
+
+ def load_tsr_terms_tbx(self,arxiu,code="",encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the TSR terms from a TBX file.'''
+ codes=[]
+ for slc in code.split(","):
+ codes.append(slc.strip())
+ data=[]
+ term=[]
+ lang=""
+ for event, elem in etree.iterparse(arxiu,events=("start", "end")):
+ tag=elem.tag.replace(self.namespace(elem),"")
+ if event=="end" and tag in ["conceptEntry","termEntry"]:
+ if len(term)>0 and lang in codes:
+ record=[]
+ for slt in term:
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(slt).split()
+ else:
+ tokens=slt.split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(slt)
+ data.append(record)
+ record=[]
+ term=[]
+ elif event=="start" and tag=="langSec":
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in codes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ else:
+ lang=""
+ elif event=="start" and tag=="term":
+ if lang in codes:
+ term.append("".join(elem.itertext()).lstrip().rstrip())
+ self.cur.executemany("INSERT INTO tsr_terms (term) VALUES (?)",data)
+ self.conn.commit()
+
+ #exclusion_terms
+
+ def load_exclusion_terms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the exclusion terms from a tabulated text.'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ continserts=0
+ for line in cf:
+ line=line.rstrip()
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ camps=line.split("\t")
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(camps[0]).split()
+ else:
+ tokens=camps[0].split()
+ if len(camps)==1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append("_")
+ data.append(record)
+ elif len(camps)>1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append(camps[1])
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO exclusion_terms (sl_term,tl_term) VALUES (?,?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO exclusion_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+
+ def load_exclusion_terms_tbx(self,arxiu,sl_code="",tl_code="",encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the exclusion terms from a TBX file.'''
+ slcodes=[]
+ for slc in sl_code.split(","):
+ slcodes.append(slc.strip())
+ tlcodes=[]
+ for tlc in tl_code.split(","):
+ tlcodes.append(tlc.strip())
+ data=[]
+ slterm=[]
+ tlterm=[]
+ lang=""
+ for event, elem in etree.iterparse(arxiu,events=("start", "end")):
+ tag=elem.tag.replace(self.namespace(elem),"")
+ if event=="end" and tag in ["conceptEntry","termEntry"]:
+ if len(slterm)>0 and len(tlterm)>0:
+ record=[]
+ for slt in slterm:
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(slt).split()
+ else:
+ tokens=slt.split()
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ tlt=", ".join(tlterm)
+ record.append(slt)
+ record.append(tlt)
+ data.append(record)
+ record=[]
+ slterm=[]
+ tlterm=[]
+ elif event=="start" and tag=="langSec":
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in slcodes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ if elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"] in tlcodes:
+ lang=elem.attrib["{http://www.w3.org/XML/1998/namespace}lang"]
+ elif event=="start" and tag=="term":
+ if lang in slcodes: slterm.append("".join(elem.itertext()).lstrip().rstrip())
+ elif lang in tlcodes: tlterm.append("".join(elem.itertext()).lstrip().rstrip())
+ self.cur.executemany("INSERT INTO exclusion_terms (sl_term,tl_term) VALUES (?,?)",data)
+ self.conn.commit()
+
+ #EXCLUSION NO TERMS
+ def load_exclusion_noterms_tabtxt(self,arxiu,encoding="utf-8",nmin=0,nmax=1000):
+ '''Loads the exclusion no terms from a tabulated text.'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ continserts=0
+ for line in cf:
+ line=line.rstrip()
+ continserts+=1
+ record=[]
+ line=line.rstrip()
+ camps=line.split("\t")
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(camps[0]).split()
+ else:
+ tokens=camps[0].split()
+ if len(camps)==1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append("_")
+ data.append(record)
+ elif len(camps)>1:
+ if len(tokens)>=nmin and len(tokens)<=nmax:
+ record.append(camps[0])
+ record.append(camps[1])
+ data.append(record)
+ if continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO exclusion_noterms (sl_term,tl_term) VALUES (?,?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ self.cur.executemany("INSERT INTO exclusion_noterms (sl_term,tl_term) VALUES (?,?)",data)
+
+
+
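+    # namespace() is a small helper returning the Clark-notation namespace prefix of
+    # an element tag, e.g. "{uri}" for a tag written as "{uri}termEntry", or an empty
+    # string when the tag carries no namespace.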
+ def namespace(self,element):
+ m = re.match(r'\{.*\}', element.tag)
+ return m.group(0) if m else ''
+
+ def find_translation_reference_terms(self,term):
+ self.cur.execute("SELECT tl_term FROM reference_terms where sl_term=?",(term,))
+ tlterms=[]
+ for s in self.cur.fetchall():
+ tlterms.append(s[0])
+ if len(tlterms)>0:
+ return(", ".join(tlterms))
+ else:
+ return(None)
+
+
+ def load_sl_exclusion_regexps(self,arxiu,encoding="utf-8"):
+ '''Loads the exclusion regular expressions for the source language.'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ for line in cf:
+ line=line.rstrip()
+ record=[]
+ record.append(line)
+ data.append(record)
+
+ with self.conn:
+ self.cur.executemany('INSERT INTO sl_exclusion_regexps (sl_exclusion_regexp) VALUES (?)',data)
+
+ def load_tl_exclusion_regexps(self,arxiu,encoding="utf-8"):
+ '''Loads the exclusion regular expressions for the target language.'''
+ cf=codecs.open(arxiu,"r",encoding=encoding)
+ data=[]
+ for line in cf:
+ line=line.rstrip()
+ record=[]
+ record.append(line)
+ data.append(record)
+
+ with self.conn:
+ self.cur.executemany('INSERT INTO tl_exclusion_regexps (sl_exclusion_regexp) VALUES (?)',data)
+
+
+ def show_term_candidates(self,limit=-1,minfreq=2, minmeasure=-1, show_frequency=True, show_measure=False, mark_eval=False, verbose=False):
+ '''Shows the term candidates in the screen.'''
+ measure=0
+ knownterms=[]
+ knownnoterms=[]
+ with self.conn:
+ self.cur.execute("SELECT sl_term FROM exclusion_terms")
+ for s in self.cur.fetchall():
+ knownterms.append(s[0])
+ with self.conn:
+ self.cur.execute("SELECT sl_term FROM exclusion_noterms")
+ for s in self.cur.fetchall():
+ knownnoterms.append(s[0])
+ with self.conn:
+ self.cur.execute("SELECT frequency,value,n,candidate FROM term_candidates order by value desc, frequency desc, random() limit "+str(limit))
+ for s in self.cur.fetchall():
+ frequency=s[0]
+ if s[1]==None:
+ measure=0
+ else:
+ measure=s[1]
+ n=s[2]
+ candidate=s[3]
+ if n>=self.n_min and n<=self.n_max and not candidate in knownterms and not candidate in knownnoterms:
+ if mark_eval:
+ if candidate in self.evaluation_terms:
+ candidate="*"+candidate
+ if show_frequency and not show_measure:
+ cadena=str(frequency)+"\t"+candidate
+ elif not show_frequency and show_measure:
+ cadena=str(measure)+"\t"+candidate
+ elif show_measure and show_frequency:
+ cadena=str(frequency)+"\t"+str(measure)+"\t"+candidate
+ else:
+ cadena=candidate
+ print(cadena)
+
+ def select_unigrams(self,file,position=-1,verbose=True):
+ sunigrams=codecs.open(file,"w",encoding="utf-8")
+ unigrams={}
+ self.cur.execute("SELECT frequency,candidate FROM term_candidates order by value desc, frequency desc, random()")
+ #self.cur.execute("SELECT frequency,value,n,candidate FROM term_candidates order by n desc limit "+str(limit))
+ for s in self.cur.fetchall():
+ frequency=s[0]
+ candidate=s[1].split()[position]
+ if candidate in unigrams:
+ unigrams[candidate]+=frequency
+ else:
+ unigrams[candidate]=frequency
+ #for self.candidate in self.unigrams:
+ # print(self.unigrams[self.candidate],self.candidate)
+ data=[]
+ for candidate in sorted(unigrams, key=unigrams.get, reverse=True):
+
+ cadena=str(unigrams[candidate])+"\t"+candidate
+ #if self.verbose: print(cadena)
+ record=[]
+ record.append(candidate)
+ record.append(1)
+ record.append(unigrams[candidate])
+ record.append("freq")
+ record.append(unigrams[candidate])
+ data.append(record)
+ sunigrams.write(cadena+"\n")
+
+ with self.conn:
+ self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data)
+ self.conn.commit()
+
+
+ def save_term_candidates(self,outfile,limit=-1,minfreq=2, minmeasure=-1, show_frequency=True, show_measure=False, mark_eval=False, verbose=False):
+ '''Saves the term candidates in a file.'''
+ sortida=codecs.open(outfile,"w",encoding="utf-8")
+ measure=0
+ knownterms=[]
+ knownnoterms=[]
+ with self.conn:
+ self.cur.execute("SELECT sl_term FROM exclusion_terms")
+ for s in self.cur.fetchall():
+ knownterms.append(s[0])
+ with self.conn:
+ self.cur.execute("SELECT sl_term FROM exclusion_noterms")
+ for s in self.cur.fetchall():
+ knownnoterms.append(s[0])
+ with self.conn:
+ self.cur.execute("SELECT frequency,value,n,candidate FROM term_candidates order by value desc, frequency desc, random() limit "+str(limit))
+ for s in self.cur.fetchall():
+ frequency=s[0]
+ if s[1]==None:
+ measure=0
+ else:
+ measure=s[1]
+ n=s[2]
+ candidate=s[3]
+ if not candidate in knownterms and not candidate in knownnoterms:
+ if mark_eval:
+ if candidate in self.evaluation_terms:
+ candidate="*"+candidate
+ if show_measure and not show_frequency:
+ cadena=str(measure)+"\t"+candidate
+ elif show_frequency and not show_measure:
+ cadena=str(frequency)+"\t"+candidate
+ elif show_frequency and show_measure:
+ cadena=str(frequency)+"\t"+str(measure)+"\t"+candidate
+ else:
+ cadena=candidate
+ if verbose:
+ print(cadena)
+ sortida.write(cadena+"\n")
+
+ #STATISTICAL TERM EXTRACTION
+
+ def ngram_calculation (self,nmin,nmax,minfreq=2,corpus="sl_corpus"):
+ '''Performs the calculation of ngrams.'''
+ ngramsFD=FreqDist()
+ tokensFD=FreqDist()
+ self.n_min=nmin
+ self.n_max=nmax
+
+ with self.conn:
+ if corpus=="sl_corpus":
+ self.cur.execute('SELECT segment from sl_corpus')
+ elif corpus=="tl_corpus":
+ self.cur.execute('SELECT segment from tl_corpus')
+ for s in self.cur.fetchall():
+ segment=s[0]
+ for n in range(nmin,nmax+1): #we DON'T calculate one order bigger in order to detect nested candidates
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(segment).split()
+ else:
+ tokens=segment.split()
+ ngs=ngrams(tokens, n)
+ for ng in ngs:
+ ngramsFD[ng]+=1
+ for token in tokens:
+ tokensFD[token]+=1
+
+ data=[]
+ for c in ngramsFD.most_common():
+ if c[1]>=minfreq:
+ record=[]
+ record.append(" ".join(c[0]))
+ record.append(len(c[0]))
+ record.append(c[1])
+ data.append(record)
+ with self.conn:
+ self.cur.executemany("INSERT INTO ngrams (ngram, n, frequency) VALUES (?,?,?)",data)
+ self.conn.commit()
+
+ data=[]
+ for c in tokensFD.most_common():
+ record=[]
+ record.append(c[0])
+ record.append(c[1])
+ data.append(record)
+ with self.conn:
+ self.cur.executemany("INSERT INTO tokens (token, frequency) VALUES (?,?)",data)
+ self.conn.commit()
+
+ def statistical_term_extraction(self,minfreq=2,corpus="sl_corpus"):
+ '''Performs a statistical term extraction using the extracted ngrams (ngram_calculation should be executed first). Loading stop-words is advisable.'''
+ self.cur.execute("DELETE FROM term_candidates")
+ self.conn.commit()
+ stopwords=[]
+ with self.conn:
+ if corpus=="sl_corpus":
+ self.cur.execute("SELECT sl_stopword FROM sl_stopwords")
+ elif corpus=="tl_corpus":
+ self.cur.execute("SELECT tl_stopword FROM tl_stopwords")
+ for s in self.cur.fetchall():
+ stopwords.append(s[0])
+
+ inner_stopwords=[]
+ with self.conn:
+ if corpus=="sl_corpus":
+ self.cur.execute("SELECT sl_inner_stopword FROM sl_inner_stopwords")
+ elif corpus=="tl_corpus":
+ self.cur.execute("SELECT tl_inner_stopword FROM tl_inner_stopwords")
+ for s in self.cur.fetchall():
+ inner_stopwords.append(s[0])
+
+ self.cur.execute("SELECT ngram, n, frequency FROM ngrams order by frequency desc")
+ results=self.cur.fetchall()
+ data=[]
+ for a in results:
+ if corpus=="sl_corpus":
+ if self.specificSLtokenizer:
+ ng=self.SLtokenizer.tokenize(a[0]).split()
+ else:
+ ng=a[0].split()
+ if corpus=="tl_corpus":
+ if self.specificTLtokenizer:
+
+ ng=self.TLtokenizer.tokenize(a[0]).split()
+ else:
+ ng=a[0].split()
+ include=True
+ if ng[0].lower() in stopwords: include=False
+ if ng[-1].lower() in stopwords: include=False
+ for i in range(1,len(ng)):
+ if ng[i].lower() in inner_stopwords:
+ include=False
+ if include:
+ record=[]
+ record.append(a[0])
+ record.append(a[1])
+ record.append(a[2])
+ record.append("freq")
+ record.append(a[2])
+ data.append(record)
+ if a[2]<minfreq:
+ break
+ with self.conn:
+ self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data)
+ self.conn.commit()
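+
+ # Rough usage sketch for the statistical pipeline (hedged: "extractor" and the
+ # output file name are hypothetical, and a source-language corpus is assumed to
+ # be already loaded into sl_corpus of an open project):
+ #   extractor.ngram_calculation(1,3,minfreq=2)
+ #   extractor.statistical_term_extraction(minfreq=2)
+ #   extractor.save_term_candidates("candidates.txt")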
+
+ def loadSLtokenizer(self, tokenizer):
+ if not tokenizer.endswith(".py"): tokenizer=tokenizer+".py"
+ spec = importlib.util.spec_from_file_location('', tokenizer)
+ tokenizermod = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(tokenizermod)
+ self.SLtokenizer=tokenizermod.Tokenizer()
+ self.specificSLtokenizer=True
+
+ def unloadSLtokenizer(self):
+ self.SLtokenizer=None
+ self.specificSLtokenizer=False
+
+ def loadTLtokenizer(self, tokenizer):
+ if not tokenizer.endswith(".py"): tokenizer=tokenizer+".py"
+ spec = importlib.util.spec_from_file_location('', tokenizer)
+ tokenizermod = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(tokenizermod)
+ self.TLtokenizer=tokenizermod.Tokenizer()
+ self.specificTLtokenizer=True
+
+ def unloadTLtokenizer(self):
+ self.TLtokenizer=None
+ self.specificTLtokenizer=False
+
+ def statistical_term_extraction_by_segment(self, segment, minlocalfreq=1, minglobalfreq=2, maxcandidates=2, nmin=1, nmax=4):
+ '''Performs a statistical term extraction over a single segment. Loading stop-words is advisable.'''
+ ngramsFD=FreqDist()
+ sl_stopwords=[]
+ with self.conn:
+ self.cur.execute("SELECT sl_stopword FROM sl_stopwords")
+ for s in self.cur.fetchall():
+ sl_stopwords.append(s[0])
+
+ sl_inner_stopwords=[]
+ with self.conn:
+ self.cur.execute("SELECT sl_inner_stopword FROM sl_inner_stopwords")
+ for s in self.cur.fetchall():
+ sl_inner_stopwords.append(s[0])
+
+ for n in range(nmin,nmax+1):
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(segment).split()
+ else:
+ tokens=segment.split()
+ ngs=ngrams(tokens, n)
+ for ng in ngs:
+ include=True
+
+ if ng[0].lower() in sl_stopwords: include=False
+ if ng[-1].lower() in sl_stopwords: include=False
+ for i in range(1,len(ng)):
+ if ng[i].lower() in sl_inner_stopwords:
+ include=False
+ if include: ngramsFD[" ".join(ng)]+=1
+
+ for ng in ngramsFD.most_common():
+ print(ng)
+
+ def case_normalization(self,verbose=False):
+ '''
+ Performs case normalization. If a capitalized term candidate also exists in non-capitalized form, the capitalized one is deleted and the frequency of the non-capitalized one is increased by the frequency of the capitalized one.
+ '''
+ self.cur.execute("SELECT candidate,frequency FROM term_candidates order by frequency desc")
+ results=self.cur.fetchall()
+ auxiliar={}
+ for r in results:
+ auxiliar[r[0]]=r[1]
+ for a in results:
+ if not a[0]==a[0].lower() and a[0].lower() in auxiliar:
+ terma=a[0]
+ termb=a[0].lower()
+ freqa=a[1]
+ freqb=auxiliar[termb]
+ n=len(termb.split())
+ freqtotal=freqa+freqb
+ if verbose:
+ print(terma,freqa,"-->",termb,freqb,"-->",freqtotal)
+ self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (terma,))
+ self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (termb,))
+ self.cur.execute("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",(termb,n,freqtotal,"freq",freqtotal))
+ self.conn.commit()
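+
+ # Illustrative example (hypothetical counts): if "Neural Network" appears 3 times
+ # and "neural network" 7 times among the candidates, the capitalized entry is
+ # removed and "neural network" is kept with frequency 3+7=10.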
+
+ def nest_normalization(self,percent=10,verbose=False):
+ '''
+ Performs a normalization of nested term candidates. If an n-gram candidate A is contained in an (n+1)-gram candidate B and freq(A)==freq(B), or the two frequencies are close (within the percentage given by the percent parameter), A is deleted and B is kept as it is.
+ '''
+ self.cur.execute("SELECT candidate,frequency,n FROM term_candidates order by frequency desc")
+ results=self.cur.fetchall()
+ for a in results:
+ ta=a[0]
+ fa=a[1]
+ na=a[2]
+ nb=na+1
+ fmax=fa+fa*percent/100
+ fmin=fa-fa*percent/100
+ self.cur.execute("SELECT candidate,frequency FROM term_candidates where frequency <="+str(fmax)+" and frequency>="+str(fmin)+" and n ="+str(nb))
+ results2=self.cur.fetchall()
+ for b in results2:
+ tb=b[0]
+ fb=b[1]
+ if not ta==tb and not tb.find(ta)==-1:
+ self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (ta,))
+ if verbose:
+ print(str(fa),ta,"-->",str(fb),tb)
+ self.conn.commit()
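+
+ # Illustrative example (hypothetical counts): with percent=10, if the bigram
+ # "neural network" has frequency 50 and the trigram "artificial neural network"
+ # has frequency 48 (within 10% of 50), the nested bigram is deleted and only the
+ # longer candidate is kept.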
+
+ def regexp_exclusion(self,verbose=False):
+ '''Deletes term candidates matching a set of regular expressions loaded with the load_sl_exclusion_regexps method.'''
+ self.cur.execute("SELECT sl_exclusion_regexp FROM sl_exclusion_regexps")
+ results=self.cur.fetchall()
+ for r in results:
+ nregexp=len(r[0].split())
+ exreg=r[0]
+ self.cur.execute("SELECT candidate FROM term_candidates")
+ candidates=self.cur.fetchall()
+ cexreg=re.compile(exreg)
+ for a in candidates:
+ candidate=a[0]
+ ncandidate=len(candidate.split())
+ match=re.match(cexreg,candidate)
+ if not match==None and nregexp==ncandidate:
+ self.cur.execute('DELETE FROM term_candidates WHERE candidate=?', (candidate,))
+ if verbose:
+ print(exreg,"-->",candidate)
+ self.conn.commit()
+
+ #EVALUATION
+
+
+
+ def evaluate_pos(self,limit,order="desc",iterations=1000,ignore_case=True):
+ '''Performs the evaluation of the term candidates using the evaluation_terms loaded with the load_evaluation_terms method.'''
+ correct=0
+ total=0
+ evaluation_terms=[]
+ self.cur.execute("SELECT sl_term FROM evaluation_terms")
+ results=self.cur.fetchall()
+ for r in results:
+ evaluation_terms.append(r[0])
+ tsr_terms=[]
+ self.cur.execute("SELECT term FROM tsr_terms")
+ results=self.cur.fetchall()
+ for r in results:
+ tsr_terms.append(r[0])
+ evaluation_terms.extend(tsr_terms)
+ with self.conn:
+ for i in range(0,iterations):
+ if order=="desc":
+ self.cur.execute("SELECT candidate,value from term_candidates where n<="+str(self.n_max)+" order by value desc, frequency desc, random() limit "+str(limit))
+ elif order=="asc":
+ self.cur.execute("SELECT candidate from term_candidates where n<="+str(self.n_max)+" order by value asc, frequency desc, random() limit "+str(limit))
+ else:
+ raise NameError('Order must be desc (descending) or asc (ascending). Default value: desc')
+ #self.cur.execute("SELECT candidate from term_candidates order by id limit "+str(limit))
+ for s in self.cur.fetchall():
+ total+=1
+ candidate=s[0]
+ if ignore_case:
+ if candidate in evaluation_terms:
+ correct+=1
+ elif candidate.lower() in evaluation_terms:
+ correct+=1
+ else:
+ if candidate in evaluation_terms:
+ correct+=1
+ correct=correct/iterations
+ total=total/iterations
+
+ try:
+ precisio=100*correct/total
+ recall=100*correct/len(evaluation_terms)
+ f1=2*precisio*recall/(precisio+recall)
+ return(limit,correct,total,precisio,recall,f1)
+ except:
+ return(limit,0,0,0,0,0)
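+
+ # Metric sketch (hypothetical numbers): with limit=100, if on average 40 of the
+ # top-100 candidates belong to the evaluation set and that set holds 200 terms,
+ # then precision=100*40/100=40, recall=100*40/200=20 and
+ # F1=2*40*20/(40+20)=26.7 (approx.), which is what the returned tuple reports.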
+
+ def association_measures(self,measure="raw_freq"):
+ measurename=measure
+ bigram_measures = myBigramAssocMeasures()
+ trigram_measures = myTrigramAssocMeasures()
+ quadgram_measures = myQuadgramAssocMeasures()
+
+ fd_tokens=nltk.FreqDist()
+ fd_bigrams=nltk.FreqDist()
+ fd_trigrams=nltk.FreqDist()
+ fd_quadgrams=nltk.FreqDist()
+ wildcard_fd=nltk.FreqDist()
+ self.cur.execute("SELECT token,frequency from tokens")
+ for s in self.cur.fetchall():
+ aux=(s[0])
+ fd_tokens[aux]+=s[1]
+
+ textcorpus=[]
+ self.cur.execute("SELECT segment from sl_corpus")
+ for segment in self.cur.fetchall():
+ textcorpus.extend(segment[0].split())
+
+ bigram_finder=BigramCollocationFinder.from_words(textcorpus)
+ trigram_finder=TrigramCollocationFinder.from_words(textcorpus)
+ quadgram_finder=QuadgramCollocationFinder.from_words(textcorpus)
+
+ self.cur.execute("SELECT ngram,frequency,n from ngrams")
+ results=self.cur.fetchall()
+ for r in results:
+ data=[]
+ data.append(r[0])
+ self.cur2.execute("UPDATE term_candidates SET value=NULL where candidate=?",data)
+ self.conn.commit()
+ data=[]
+ bigram_measure=[]
+ try:
+ bigram_measure=bigram_finder.score_ngrams(getattr(bigram_measures,measure))
+ except:
+ print("WARNING: measure "+measure+ " not implemented for bigrams",sys.exc_info())
+ #sys.exit()
+
+ for nose in bigram_measure:
+ record=[]
+ term_candidate=" ".join(nose[0])
+ mvalue=nose[1]
+ record.append(measure)
+ record.append(mvalue)
+ record.append(term_candidate)
+ data.append(record)
+
+ trigram_measure=[]
+ try:
+ trigram_measure=trigram_finder.score_ngrams(getattr(trigram_measures,measure))
+ except:
+ print("WARNING: measure "+measure+ " not implemented for trigrams")
+ #sys.exit()
+ for nose in trigram_measure:
+ record=[]
+ term_candidate=" ".join(nose[0])
+ mvalue=nose[1]
+ record.append(measure)
+ record.append(mvalue)
+ record.append(term_candidate)
+ data.append(record)
+ quadgram_measure=[]
+ try:
+ quadgram_measure=quadgram_finder.score_ngrams(getattr(quadgram_measures,measure))
+ except:
+ print("WARNING: measure "+measure+ " not implemented for quadgrams")
+ #sys.exit()
+
+ for nose in quadgram_measure:
+ record=[]
+ term_candidate=" ".join(nose[0])
+ mvalue=nose[1]
+ record.append(measure)
+ record.append(mvalue)
+ record.append(term_candidate)
+ data.append(record)
+
+ self.conn.executemany("UPDATE term_candidates SET measure=?,value=? where candidate=?",data)
+ self.conn.commit()
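+
+ # Hedged note: association_measures rescores the stored candidates with an NLTK
+ # collocation measure, e.g. association_measures("likelihood_ratio"), or with one
+ # of the extra measures defined further below such as "loglikelihood" (bigrams
+ # only); afterwards the value column can be used to rank the candidates.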
+
+
+
+ def index_phrase_table(self,phrasetable):
+ '''Indexes a phrase table from Moses.'''
+ self.entrada=gzip.open(phrasetable, mode='rt',encoding='utf-8')
+
+ self.pt={}
+ self.continserts=0
+ self.record=[]
+ self.data=[]
+ while 1:
+ self.linia=self.entrada.readline()
+ if not self.linia:
+ break
+ self.linia=self.linia.rstrip()
+ self.camps=self.linia.split(" ||| ")
+ self.source=self.camps[0].strip()
+ self.trad=self.camps[1].strip()
+ self.probs=self.camps[2].split()
+ try:
+ if not self.trad[0] in self.punctuation and not self.source[0] in self.punctuation and not self.trad[-1] in self.punctuation and not self.source[-1] in self.punctuation:
+ #Currently, four different phrase translation scores are computed:
+ #0 inverse phrase translation probability φ(f|e)
+ #1 inverse lexical weighting lex(f|e)
+ #2 direct phrase translation probability φ(e|f)
+ #3 direct lexical weighting lex(e|f)
+ #self.probtrad=float(self.probs[1])
+ self.probtrad=(float(self.probs[2])*float(self.probs[3]))
+ #print(self.source,self.trad,self.probtrad)
+ self.record=[]
+ self.record.append(self.source)
+ self.record.append(self.trad)
+ self.record.append(self.probtrad)
+ self.data.append(self.record)
+ self.continserts+=1
+ if self.continserts==self.maxinserts:
+ self.cur.executemany("INSERT INTO index_pt (source, target, probability) VALUES (?,?,?)",self.data)
+ self.data=[]
+ self.continserts=0
+ self.conn.commit()
+ except:
+ pass
+ with self.conn:
+ self.cur.executemany("INSERT INTO index_pt (source, target, probability) VALUES (?,?,?)",self.data)
+ self.conn.commit()
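+
+ # The expected input is a gzipped Moses phrase table, one entry per line in the
+ # form (illustrative line, not real data):
+ #   source phrase ||| target phrase ||| 0.5 0.4 0.6 0.3 ...
+ # The indexed probability is the product of the direct phrase translation
+ # probability and the direct lexical weighting (zero-indexed fields 2 and 3 of
+ # the probability column).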
+
+
+ def find_terms_in_parallel_corpus(self,SLterms,maxdec=1,maxinc=2,candidates=5,maxlines=-1):
+ tofind=[]
+ result={}
+ if isinstance(SLterms, str):
+ if os.path.exists(SLterms):
+ entrada=codecs.open(SLterms)
+ for linia in entrada:
+ linia=linia.rstrip()
+ tofind.append(linia)
+ entrada.close()
+ else:
+ tofind.append(SLterms)
+ elif isinstance(SLterms, list):
+ tofind.extend(SLterms)
+ tl_stopwords=[]
+ with self.conn:
+ self.cur.execute("SELECT tl_stopword FROM tl_stopwords")
+ for s in self.cur.fetchall():
+ tl_stopwords.append(s[0])
+
+ for SLterm in tofind:
+ fd=FreqDist()
+ fd.clear()
+ result[SLterm]={}
+ if maxlines==-1:
+ self.cur.execute("SELECT segmentTL FROM parallel_corpus where INSTR(segmentSL,\""+SLterm+"\")")
+ else:
+ self.cur.execute("SELECT segmentTL FROM parallel_corpus where INSTR(segmentSL,\""+SLterm+"\") limit "+str(maxlines))
+ TLsegments=self.cur.fetchall()
+ if self.specificSLtokenizer:
+ termtok=self.SLtokenizer.tokenize(SLterm)
+ else:
+ termtok=SLterm
+ nSLterm=len(termtok.split())
+ nmin=nSLterm-maxdec
+ if nmin<1: nmin=1
+ nmax=nSLterm+maxinc
+ for TLsegment in TLsegments:
+ if self.specificTLtokenizer:
+ TLsegmenttok=self.TLtokenizer.tokenize(TLsegment[0]).split()
+ else:
+ TLsegmenttok=TLsegment[0].split()
+ for n in range(nmin,nmax+1):
+ ngs=ngrams(TLsegmenttok, n)
+ for ng in ngs:
+ include=True
+ if ng[0] in tl_stopwords: include=False
+ if len(ng)>1 and ng[-1] in tl_stopwords: include=False
+ if include:
+ detokcandidate=" ".join(ng)
+ if self.specificTLtokenizer:
+ detokcandidate=self.TLtokenizer.detokenize(detokcandidate)
+ fd[detokcandidate]+=1
+
+ totalf=fd.N()
+ for mc in fd.most_common(candidates):
+ result[SLterm][mc[0]]=mc[1]/totalf
+ return(result)
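+
+ # The returned structure is a nested dictionary mapping each source term to its
+ # target-language n-gram candidates and their relative frequencies, e.g.
+ # (hypothetical values): {"neural network": {"red neuronal": 0.62, ...}}.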
+
+ def compoundify_sl_corpus(self,term):
+ term2=term.replace(" ","▁")
+ self.cur.execute("SELECT id, segment FROM sl_corpus where INSTR(segment,\""+term+"\")")
+ trobats=self.cur.fetchall()
+ for trobat in trobats:
+ ident=trobat[0]
+ segment=trobat[1]
+ segment2=segment.replace(term,term2)
+ self.cur.execute("UPDATE sl_corpus SET segment=? where id=?",(segment2,ident))
+ self.conn.commit()
+
+ def compoundify_tl_corpus(self,term):
+ term2=term.replace(" ","▁")
+ self.cur.execute("SELECT id, segment FROM tl_corpus where INSTR(segment,\""+term+"\")")
+ trobats=self.cur.fetchall()
+ for trobat in trobats:
+ ident=trobat[0]
+ segment=trobat[1]
+ segment2=segment.replace(term,term2)
+ self.cur.execute("UPDATE tl_corpus SET segment=? where id=?",(segment2,ident))
+ self.conn.commit()
+
+ def compoundify_tl_corpus_mod(self,term):
+ term2=term.replace(" ","▁")
+ self.cur.execute("SELECT id, segment FROM tl_corpus where INSTR(segment,\""+term+"\")")
+ trobats=self.cur.fetchall()
+ data=[]
+ for trobat in trobats:
+ ident=trobat[0]
+ segment=trobat[1]
+ segment2=segment.replace(term,term2)
+ data.append([segment2])
+
+
+ self.cur.executemany("INSERT INTO tl_corpus (segment) VALUES (?)",data)
+ self.conn.commit()
+
+ def find_translation_comparable_corpus(self,SLterms,tl_stopwords=None,mapping_dictionary="MUSE-en-es.txt",maxdec=1,maxinc=2,candidates=25,compoundifySL=True,compoundifyTL=True,max_term_candidates_compoundify=200):
+ tofind=[]
+ result={}
+
+ if isinstance(SLterms, str):
+ if os.path.exists(SLterms):
+ entrada=codecs.open(SLterms)
+ for linia in entrada:
+ linia=linia.rstrip()
+ tofind.append(linia)
+ entrada.close()
+ else:
+ tofind.append(SLterms)
+ elif isinstance(SLterms, list):
+ tofind.extend(SLterms)
+ #compoundify SL corpus
+ slnmin=1000000
+ slnmax=0
+ for SLterm in tofind:
+ if self.specificSLtokenizer:
+ termtok=self.SLtokenizer.tokenize(SLterm)
+ else:
+ termtok=SLterm
+ if len(termtok.split())>1 and compoundifySL:
+ self.compoundify_sl_corpus(SLterm)
+ if len(termtok.split())<slnmin:slnmin=len(termtok.split())
+ if len(termtok.split())>slnmax:slnmax=len(termtok.split())
+ n_min=slnmin-maxdec
+ if n_min<2: n_min=2
+ n_max=slnmax+maxinc
+ #compoundify TL corpus (basic statistical term extraction)
+ if compoundifyTL:
+ self.delete_tokens()
+ self.delete_ngrams()
+ self.delete_sl_stopwords()
+ self.delete_sl_inner_stopwords()
+ self.delete_sl_exclusion_regexps()
+ self.delete_term_candidates()
+ self.ngram_calculation (n_min,n_max,minfreq=2,corpus="tl_corpus")
+ if not tl_stopwords==None:
+ self.load_tl_stopwords(tl_stopwords)
+ self.statistical_term_extraction(minfreq=2,corpus="tl_corpus")
+ self.cur.execute("SELECT candidate FROM term_candidates ORDER BY frequency desc limit "+str(max_term_candidates_compoundify)+";")
+ trobats=self.cur.fetchall()
+ for trobat in trobats:
+ term=trobat[0]
+ self.compoundify_tl_corpus(term)
+ print("CALCULATING EMBEDDINGS SL")
+ self.calculate_embeddings_sl("embeddingsSL.temp",vector_size=300, window=5)
+ print("CALCULATING EMBEDDINGS TL")
+ self.calculate_embeddings_tl("embeddingsTL.temp",vector_size=300, window=5)
+ print("MAPPING EMBEDDINGS")
+ self.mapEmbeddings("embeddingsSL.temp","embeddingsTL.temp","mappedSL.tmp","mappedTL.tmp",mapping_dictionary)
+ self.load_SL_embeddings("mappedSL.tmp")
+ self.load_TL_embeddings("mappedTL.tmp")
+ stopwords=[]
+ with self.conn:
+ self.cur.execute("SELECT tl_stopword FROM tl_stopwords")
+ for s in self.cur.fetchall():
+ stopwords.append(s[0])
+ results={}
+ for SLterm in tofind:
+ if self.specificSLtokenizer:
+ termtok=self.SLtokenizer.tokenize(SLterm)
+ else:
+ termtok=SLterm
+ lenterm=len(termtok.split())
+ lenmin=lenterm-maxdec
+ lenmax=lenterm+maxinc
+ results[SLterm]={}
+ translations=self.find_translation_wv(SLterm,ncandidates=1000)
+ cont=0
+ for translation in translations:
+ if self.specificTLtokenizer:
+ translationtok=self.TLtokenizer.tokenize(translation)
+ else:
+ translationtok=translation
+ lentranslation=len(translationtok.split())
+ try:
+ if not translation in stopwords and not translation.split()[0] in stopwords and not translation.split()[-1] in stopwords and lentranslation>=lenmin and lentranslation<=lenmax:
+ results[SLterm][translation]=translations[translation]
+ cont+=1
+ except:
+ pass
+ if cont>=candidates:
+ break
+
+ return(results)
+
+
+ def find_translation_ptable(self,sourceterm,maxdec=1,maxinc=1,ncandidates=5,separator=":"):
+ '''Finds translation equivalents in an indexed phrase table. Requires an indexed phrase table and a list of terms separated by ":".
+ The number of translation candidates can be fixed, as well as the maximum decrement and increment of the number of tokens of the translation candidate.'''
+ #select target from index_pt where source="international conflict";
+ self.cur.execute('SELECT target,probability FROM index_pt where source =?',(sourceterm,))
+ self.results=self.cur.fetchall()
+ self.targetcandidates={}
+ for self.a in self.results:
+ self.targetterm=self.a[0]
+ self.probability=float(self.a[1])
+ self.tttokens=self.targetterm.split()
+
+ if not self.tttokens[0] in self.tl_stopwords and not self.tttokens[-1] in self.tl_stopwords and len(self.tttokens)>=len(sourceterm.split())-maxdec and len(self.tttokens)<=len(sourceterm.split())+maxinc:
+ self.targetcandidates[self.targetterm]=self.probability
+ self.sorted_x = sorted(self.targetcandidates.items(), key=operator.itemgetter(1),reverse=True)
+ self.results=[]
+ for self.s in self.sorted_x:
+ self.results.append(self.s[0].replace(":",";"))
+ return(separator.join(self.results[0:ncandidates]))
+
+
+
+ def start_freeling_api(self,freelingpath, LANG):
+
+ if not freelingpath.endswith("/"):freelingpath=freelingpath+"/"
+ try:
+ sys.path.append(freelingpath+"APIs/python3/")
+ import pyfreeling
+ except:
+ print("No Freeling API available. Verify Freeling PATH: "+freelingpath+"APIs/python3/")
+
+ pyfreeling.util_init_locale("default");
+
+ # create language analyzer
+ la1=pyfreeling.lang_ident(freelingpath+"common/lang_ident/ident.dat");
+
+ # create options set for maco analyzer. Default values are Ok, except for data files.
+ op1= pyfreeling.maco_options(LANG);
+ op1.set_data_files( "",
+ freelingpath + "common/punct.dat",
+ freelingpath+ LANG + "/dicc.src",
+ freelingpath + LANG + "/afixos.dat",
+ "",
+ freelingpath + LANG + "/locucions.dat",
+ freelingpath + LANG + "/np.dat",
+ freelingpath + LANG + "/quantities.dat",
+ freelingpath + LANG + "/probabilitats.dat");
+
+ # create analyzers
+ self.tk1=pyfreeling.tokenizer(freelingpath+LANG+"/tokenizer.dat");
+ self.sp1=pyfreeling.splitter(freelingpath+LANG+"/splitter.dat");
+ self.sid1=self.sp1.open_session();
+ self.mf1=pyfreeling.maco(op1);
+
+ # activate morpho modules to be used in next call
+ #(self, umap: "bool", num: "bool", pun: "bool", dat: "bool",
+ # dic: "bool", aff: "bool", comp: "bool", rtk: "bool",
+ # mw: "bool", ner: "bool", qt: "bool", prb: "bool")
+ #deactivate mw
+ self.mf1.set_active_options(False, True, True, False, # select which among created
+ True, True, False, True, # submodules are to be used.
+ False, False, True, True ); # default: all created submodules are used
+
+ # create tagger, sense anotator, and parsers
+ self.tg1=pyfreeling.hmm_tagger(freelingpath+LANG+"/tagger.dat",True,2);
+
+ def tag_freeling_api(self,corpus="source"):
+ with self.conn:
+ data=[]
+ if corpus=="source":
+ self.cur.execute('SELECT id,segment from sl_corpus')
+ elif corpus=="target":
+ self.cur.execute('SELECT id,segment from tl_corpus')
+ continserts=0
+ for s in self.cur.fetchall():
+ id=s[0]
+ segment=s[1]
+ continserts+=1
+ l1 = self.tk1.tokenize(segment);
+ ls1 = self.sp1.split(self.sid1,l1,True);
+ ls1 = self.mf1.analyze(ls1);
+ ls1 = self.tg1.analyze(ls1);
+ ttsentence=[]
+ for s in ls1 :
+ ws = s.get_words();
+ for w in ws :
+ form=w.get_form()
+ lemma=w.get_lemma()
+ tag=w.get_tag()
+ ttsentence.append(form+"|"+lemma+"|"+tag)
+ ttsentence=" ".join(ttsentence)
+ record=[]
+ record.append(id)
+ record.append(ttsentence)
+ data.append(record)
+ if continserts==self.maxinserts:
+ if corpus=="source":
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ if corpus=="target":
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ if corpus=="source":
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ if corpus=="target":
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+
+
+ #SPACY TAGGER
+ def load_POS_model_spacy(self, model):
+ if not spacy.util.is_package(model):
+ print("Downloading and installing ",model)
+ try:
+ subprocess.check_call([sys.executable, "-m", "spacy", "download", model])
+ print("Model downloaded. Stopping the program. The program should be run again to load the downloaded model.")
+ except:
+ print("Model",model,"not available.")
+ else:
+ self.POSmodel_spacy=spacy.load(model)
+
+ def tag_spacy(self,corpus="source",mode="coarse"):
+ #mode is one of "coarse" or "fine"
+ with self.conn:
+ data=[]
+ if corpus=="source":
+ self.cur.execute('SELECT id,segment from sl_corpus')
+ elif corpus=="target":
+ self.cur.execute('SELECT id,segment from tl_corpus')
+ elif corpus=="parallel-source":
+ self.cur.execute('SELECT id,segmentSL from parallel_corpus')
+ elif corpus=="parallel-target":
+ self.cur.execute('SELECT id,segmentTL from parallel_corpus')
+ continserts=0
+ for s in self.cur.fetchall():
+ id=s[0]
+ segment=s[1]
+ continserts+=1
+ taggedtokens = self.POSmodel_spacy(segment)
+ ttsentence=[]
+ for token in taggedtokens:
+ form=token.text
+ lemma=token.lemma_
+ if mode=="fine":
+ tag=token.tag_
+ elif mode=="coarse":
+ tag=token.pos_
+ ttsentence.append(form+"|"+lemma+"|"+tag)
+ ttsentence=" ".join(ttsentence)
+ record=[]
+ record.append(id)
+ record.append(ttsentence)
+ data.append(record)
+ if continserts==self.maxinserts:
+ if corpus=="source":
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="target":
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="parallel-source":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data)
+ elif corpus=="parallel-target":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ if corpus=="source":
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="target":
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="parallel-source":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data)
+ elif corpus=="parallel-target":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data)
+
+ #SPACY_UDPIPE TAGGER
+ def load_POS_model_spacy_udpipe(self, language):
+ try:
+ self.POSmodel = spacy_udpipe.load(language)
+ except:
+ print("No model for ",language," available.")
+ print("Downloading and installing model for ",language)
+ try:
+ spacy_udpipe.download(language)
+ self.POSmodel = spacy_udpipe.load(language)
+ except:
+ print("ERROR: not able to load spacy_udepipe model for ",language)
+
+
+ def tag_spacy_udpipe(self,corpus="source"):
+ #tags are stored with universal POS tags (token.pos_)
+ with self.conn:
+ data=[]
+ if corpus=="source":
+ self.cur.execute('SELECT id,segment from sl_corpus')
+ elif corpus=="target":
+ self.cur.execute('SELECT id,segment from tl_corpus')
+ elif corpus=="parallel-source":
+ self.cur.execute('SELECT id,segmentSL from parallel_corpus')
+ elif corpus=="parallel-target":
+ self.cur.execute('SELECT id,segmentTL from parallel_corpus')
+ continserts=0
+ for s in self.cur.fetchall():
+ id=s[0]
+ segment=s[1]
+ continserts+=1
+ taggedtokens = self.POSmodel(segment)
+ ttsentence=[]
+ for token in taggedtokens:
+ form=token.text
+ lemma=token.lemma_
+ tag=token.pos_
+ ttsentence.append(form+"|"+lemma+"|"+tag)
+ ttsentence=" ".join(ttsentence)
+ record=[]
+ record.append(id)
+ record.append(ttsentence)
+ data.append(record)
+ if continserts==self.maxinserts:
+ if corpus=="source":
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="target":
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="parallel-source":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data)
+ elif corpus=="parallel-target":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data)
+ data=[]
+ continserts=0
+ with self.conn:
+ if corpus=="source":
+ self.cur.executemany("INSERT INTO sl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="target":
+ self.cur.executemany("INSERT INTO tl_tagged_corpus (id, tagged_segment) VALUES (?,?)",data)
+ elif corpus=="parallel-source":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentSL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentSL=excluded.tagged_segmentSL",data)
+ elif corpus=="parallel-target":
+ self.cur.executemany("INSERT INTO tagged_parallel_corpus (id, tagged_segmentTL) VALUES (?,?) ON CONFLICT (id) DO UPDATE SET tagged_segmentTL=excluded.tagged_segmentTL",data)
+
+
+ def save_sl_tagged_corpus(self,outputfile,encoding="utf-8"):
+ sortida=codecs.open(outputfile,"w",encoding=encoding)
+ self.cur.execute('SELECT tagged_segment from sl_tagged_corpus')
+ for s in self.cur.fetchall():
+ tagged_segment=s[0]
+ sortida.write(tagged_segment+"\n")
+
+ def save_tl_tagged_corpus(self,outputfile,encoding="utf-8"):
+ sortida=codecs.open(outputfile,"w",encoding=encoding)
+ self.cur.execute('SELECT tagged_segment from tl_tagged_corpus')
+ for s in self.cur.fetchall():
+ tagged_segment=s[0]
+ sortida.write(tagged_segment+"\n")
+
+
+
+ def save_sl_tagged_parallel_corpus(self,outputfile,encoding="utf-8"):
+ sortida=codecs.open(outputfile,"w",encoding=encoding)
+ self.cur.execute('SELECT tagged_segmentSL from tagged_parallel_corpus')
+ for s in self.cur.fetchall():
+ tagged_segment=s[0]
+ sortida.write(tagged_segment+"\n")
+
+ def save_tl_tagged_parallel_corpus(self,outputfile,encoding="utf-8"):
+ sortida=codecs.open(outputfile,"w",encoding=encoding)
+ self.cur.execute('SELECT tagged_segmentTL from tagged_parallel_corpus')
+ for s in self.cur.fetchall():
+ tagged_segment=s[0]
+ sortida.write(tagged_segment+"\n")
+
+
+ def tagged_ngram_calculation (self,nmin=2,nmax=3,minfreq=2):
+ '''Calculates the tagged ngrams.'''
+ ngramsFD=FreqDist()
+ self.n_min=nmin
+ self.n_max=nmax
+ data=[]
+ record=[]
+ with self.conn:
+ self.cur.execute('SELECT tagged_segment from sl_tagged_corpus')
+ for s in self.cur.fetchall():
+ segment=s[0]
+ for n in range(nmin,nmax+1):
+ ngs=ngrams(segment.split(),n)
+ for ng in ngs:
+ ngramsFD[ng]+=1
+ for c in ngramsFD.most_common():
+ if c[1]>=minfreq:
+ candidate=[]
+ for ngt in c[0]:
+ candidate.append(ngt.split("|")[0])
+ candidate=" ".join(candidate)
+ record=[]
+ record.append(candidate)
+ record.append(" ".join(c[0]))
+ record.append(len(c[0]))
+ record.append(c[1])
+ data.append(record)
+ with self.conn:
+ self.cur.executemany("INSERT INTO tagged_ngrams (ngram, tagged_ngram, n, frequency) VALUES (?,?,?,?)",data)
+ self.conn.commit()
+
+ def translate_linguistic_pattern(self,pattern):
+ aux=[]
+ for ptoken in pattern.split():
+ auxtoken=[]
+ ptoken=ptoken.replace(".*",r"[^\s]+")
+ for pelement in ptoken.split("|"):
+ if pelement=="#":
+ auxtoken.append(r"([^\s]+?)")
+ elif pelement=="":
+ auxtoken.append(r"[^\s]+?")
+ else:
+ if pelement.startswith("#"):
+ auxtoken.append("("+pelement.replace("#","")+")")
+ else:
+ auxtoken.append(pelement)
+ aux.append(r"\|".join(auxtoken))
+ tp="("+" ".join(aux)+")"
+ return(tp)
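+
+ # Illustrative translation (hypothetical pattern): the pattern token "#||NOUN"
+ # (capture the surface form, any lemma, tag NOUN) becomes the regular expression
+ # ([^\s]+?)\|[^\s]+?\|NOUN, so a two-token pattern such as "#||NOUN #||NOUN"
+ # matches tagged bigrams like "terminology|terminology|NOUN extraction|extraction|NOUN"
+ # and the captured surface forms are later joined into the candidate
+ # "terminology extraction".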
+
+ def load_linguistic_patterns(self,file, encoding="utf-8"):
+ '''Loads the linguistic patterns to use with linguistic terminology extraction.'''
+ entrada=codecs.open(file,"r",encoding=encoding)
+ linguistic_patterns=[]
+ data=[]
+ record=[]
+ for linia in entrada:
+ linia=linia.rstrip()
+ npattern=len(linia.split(" "))
+ if npattern<self.n_min_pos_patterns: self.n_min_pos_patterns=npattern
+ if npattern>self.n_max_pos_patterns: self.n_max_pos_patterns=npattern
+ pattern=self.translate_linguistic_pattern(linia)
+ record.append(pattern)
+ data.append(record)
+ record=[]
+ with self.conn:
+ self.cur.executemany("INSERT INTO linguistic_patterns (linguistic_pattern) VALUES (?)",data)
+
+ def get_n_min_pos_patterns(self):
+ return(self.n_min_pos_patterns)
+
+ def get_n_max_pos_patterns(self):
+ return(self.n_max_pos_patterns)
+
+ def linguistic_term_extraction(self,minfreq=2):
+ '''Performs a linguistic term extraction using the extracted tagged ngrams (tagged_ngram_calculation should be executed first).'''
+ linguistic_patterns=[]
+ controlpatterns=[]
+ sl_stopwords=[]
+ with self.conn:
+ self.cur.execute("SELECT sl_stopword FROM sl_stopwords")
+ for s in self.cur.fetchall():
+ sl_stopwords.append(s[0])
+ with self.conn:
+ self.cur.execute("SELECT linguistic_pattern from linguistic_patterns")
+ for lp in self.cur.fetchall():
+ linguistic_pattern=lp[0]
+ transformedpattern="^"+linguistic_pattern+"$"
+ if not transformedpattern in controlpatterns:
+ linguistic_patterns.append(transformedpattern)
+ controlpatterns.append(transformedpattern)
+ self.cur.execute("SELECT tagged_ngram, n, frequency FROM tagged_ngrams order by frequency desc")
+ results=self.cur.fetchall()
+ data=[]
+ for a in results:
+ include=True
+ ng=a[0]
+ n=a[1]
+ frequency=a[2]
+ try:
+ if ng.split()[0].split("|")[1].lower() in sl_stopwords: include=False
+ except:
+ pass
+ try:
+ if ng.split()[-1].split("|")[1].lower() in sl_stopwords: include=False
+ except:
+ pass
+ if frequency<minfreq:
+ break
+ if include:
+ for pattern in linguistic_patterns:
+ match=re.search(pattern,ng)
+ if match:
+ if match.group(0)==ng:
+ candidate=" ".join(match.groups()[1:])
+ record=[]
+ record.append(candidate)
+ record.append(n)
+ record.append(frequency)
+ record.append("freq")
+ record.append(frequency)
+ data.append(record)
+ break
+ with self.conn:
+ self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data)
+ self.conn.commit()
+
+ #deleting repeated candidates
+ self.cur.execute("SELECT candidate, n, frequency FROM term_candidates")
+ results=self.cur.fetchall()
+ tcaux={}
+ for a in results:
+ if not a[0] in tcaux:
+ tcaux[a[0]]=a[2]
+ else:
+ tcaux[a[0]]+=a[2]
+ self.cur.execute("DELETE FROM term_candidates")
+ self.conn.commit()
+ data=[]
+ for tc in tcaux:
+ record=[]
+ record.append(tc)
+ record.append(len(tc.split()))
+ record.append(tcaux[tc])
+ record.append("freq")
+ record.append(tcaux[tc])
+ data.append(record)
+ with self.conn:
+ self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data)
+ self.conn.commit()
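+
+ # Rough usage sketch for the linguistic pipeline (hedged; the pattern file name
+ # is hypothetical and a tagged source corpus is assumed):
+ #   extractor.tagged_ngram_calculation(nmin=1,nmax=3,minfreq=2)
+ #   extractor.load_linguistic_patterns("patterns-en.txt")
+ #   extractor.linguistic_term_extraction(minfreq=2)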
+
+ def learn_linguistic_patterns(self,outputfile,showfrequencies=False,encoding="utf-8",verbose=True,representativity=100):
+ learntpatterns={}
+ sortida=codecs.open(outputfile,"w",encoding=encoding)
+ acufreq=0
+ tags={}
+ with self.conn:
+ self.cur.execute("SELECT sl_term FROM evaluation_terms")
+ for s in self.cur.fetchall():
+ self.cur.execute("SELECT tagged_ngram, n, frequency FROM tagged_ngrams WHERE ngram= ?", (s[0],))
+ results=self.cur.fetchall()
+ if len(results)>0:
+ for a in results:
+ ng=a[0]
+ nglist=ng.split()
+ n=a[1]
+ frequency=a[2]
+ candidate=[]
+ ngtokenstag=ng.split()
+ for ngt in ngtokenstag:
+ candidate.append(ngt.split("|")[0])
+ candidate=" ".join(candidate)
+ t2=ng.split()
+ t1=candidate.split()
+ patternbrut=[]
+ for position in range(0,n):
+ t2f=t2[position].split("|")[0]
+ t2l=t2[position].split("|")[1]
+ t2t=t2[position].split("|")[2]
+ patternpart=""
+ if t1[position]==t2l:
+ patternpart="|#|"+t2t
+ elif t1[position]==t2f:
+ patternpart="#||"+t2t
+ patternbrut.append(patternpart)
+ pattern=" ".join(patternbrut)
+ if pattern in learntpatterns:
+ learntpatterns[pattern]+=n
+ acufreq+=n
+ else:
+ learntpatterns[pattern]=n
+ acufreq+=n
+ sorted_x = sorted(learntpatterns.items(), key=operator.itemgetter(1),reverse=True)
+ results=[]
+ acufreq2=0
+ for s in sorted_x:
+ percent=100*acufreq2/acufreq
+ if percent>representativity:
+ break
+ acufreq2+=s[1]
+ if showfrequencies:
+ cadena=str(s[1])+"\t"+s[0]
+ else:
+ cadena=s[0]
+ sortida.write(cadena+"\n")
+ if verbose:
+ print(cadena)
+
+ def find_translation_pcorpus_statistical(self,slterm,maxdec=1,maxinc=1,ncandidates=5,separator=":"):
+ self.nmin=len(slterm.split())-maxdec
+ self.nmax=len(slterm.split())+maxinc
+ self.tlngrams=FreqDist()
+ with self.conn:
+ self.cur.execute('SELECT id, segment from sl_corpus')
+
+ for self.s in self.cur.fetchall():
+ self.segment=self.s[1]
+ self.id=self.s[0]
+
+ if self.segment.find(slterm)>-1:
+ self.cur2.execute('SELECT segment from tl_corpus where id="'+str(self.id)+'"')
+ for self.s2 in self.cur2.fetchall():
+ self.tl_segment=self.s2[0]
+ for self.n in range(self.nmin,self.nmax+1):
+ #self.tlngs=ngrams(self.tl_tokenizer.tokenize(self.tl_segment), self.n)
+ self.tlngs=ngrams(self.tl_segment.split(), self.n)
+ for self.tlng in self.tlngs:
+ if not self.tlng[0] in self.tl_stopwords and not self.tlng[-1] in self.tl_stopwords:
+ self.tlngrams[self.tlng]+=1
+ self.resultlist=[]
+ for self.c in self.tlngrams.most_common(ncandidates):
+ self.resultlist.append(" ".join(self.c[0]))
+
+ return(separator.join(self.resultlist))
+
+ def find_translation_pcorpus_linguistics(self,slterm,maxdec=1,maxinc=1,ncandidates=5,separator=":"):
+ self.nmin=len(slterm.split())-maxdec
+ self.nmax=len(slterm.split())+maxinc
+ self.tlngrams=FreqDist()
+ with self.conn:
+ self.cur.execute('SELECT id, segment from sl_corpus')
+
+ for self.s in self.cur.fetchall():
+ self.segment=self.s[1]
+ self.id=self.s[0]
+
+ if self.segment.find(slterm)>-1:
+ self.cur2.execute('SELECT segment from tl_corpus where id="'+str(self.id)+'"')
+ for self.s2 in self.cur2.fetchall():
+ self.tl_segment=self.s2[0]
+ for self.n in range(self.nmin,self.nmax+1):
+ #self.tlngs=ngrams(self.tl_tokenizer.tokenize(self.tl_segment), self.n)
+ self.tlngs=ngrams(self.tl_segment.split(), self.n)
+ for self.tlng in self.tlngs:
+ if not self.tlng[0] in self.tl_stopwords and not self.tlng[-1] in self.tl_stopwords:
+ self.tlngrams[self.tlng]+=1
+ self.resultlist=[]
+ for self.c in self.tlngrams.most_common(ncandidates):
+ self.resultlist.append(" ".join(self.c[0]))
+
+ return(separator.join(self.resultlist))
+
+#EMBEDDINGS
+
+ def calculate_embeddings_sl(self,filename,vector_size=300, window=5, min_count=1, workers=4):
+ self.cur.execute('SELECT id, segment from sl_corpus')
+ data = []
+ for s in self.cur.fetchall():
+ temp=[]
+ segment=s[1]
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(segment).split()
+ else:
+ tokens=segment.split()
+ data.append(tokens)
+ model = Word2Vec(sentences=data, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
+ model.wv.save_word2vec_format(filename, binary=False)
+
+ def calculate_embeddings_sl_ref(self,filename,vector_size=300, window=5, min_count=1, workers=4):
+ self.cur.execute('SELECT id, segment from tl_corpus')
+ data = []
+ for s in self.cur.fetchall():
+ temp=[]
+ segment=s[1]
+ if self.specificSLtokenizer:
+ tokens=self.SLtokenizer.tokenize(segment).split()
+ else:
+ tokens=segment.split()
+ data.append(tokens)
+ model = Word2Vec(sentences=data, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
+ model.wv.save_word2vec_format(filename, binary=False)
+
+ def calculate_embeddings_tl(self,filename,vector_size=300, window=5, min_count=1, workers=4):
+ self.cur.execute('SELECT id, segment from tl_corpus')
+ data = []
+ for s in self.cur.fetchall():
+ temp=[]
+ segment=s[1]
+ if self.specificTLtokenizer:
+ tokens=self.TLtokenizer.tokenize(segment).split()
+ else:
+ tokens=segment.split()
+ data.append(tokens)
+ model = Word2Vec(sentences=data, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
+ model.wv.save_word2vec_format(filename, binary=False)
+
+ def mapEmbeddings(self,src_input,trg_input,src_output,trg_output,init_dictionary):
+ supervised_mapping(src_input,trg_input,src_output,trg_output,init_dictionary)
+
+ def load_SL_embeddings(self, file, binary=False):
+ self.wvSL = KeyedVectors.load_word2vec_format(file, binary=binary)
+
+ def load_TL_embeddings(self, file, binary=False):
+ self.wvTL = KeyedVectors.load_word2vec_format(file, binary=binary)
+
+
+ def find_translation_wv(self, term, ncandidates=50):
+
+ term=term.strip().replace(" ","▁")
+ try:
+ vector=self.wvSL[term]
+ tcandidates = self.wvTL.most_similar([vector], topn=ncandidates)
+ except:
+ tcandidates=[]
+ response={}
+
+ for tc in tcandidates:
+ tc2=tc[0].replace("▁"," ")
+ response[tc2]=tc[1]
+
+ return(response)
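+
+ # Rough usage sketch for embedding-based translation search (hedged; file names
+ # are hypothetical and a bilingual seed dictionary is assumed):
+ #   extractor.calculate_embeddings_sl("embSL.vec")
+ #   extractor.calculate_embeddings_tl("embTL.vec")
+ #   extractor.mapEmbeddings("embSL.vec","embTL.vec","mappedSL.vec","mappedTL.vec","seed-dict.txt")
+ #   extractor.load_SL_embeddings("mappedSL.vec")
+ #   extractor.load_TL_embeddings("mappedTL.vec")
+ #   extractor.find_translation_wv("neural network",ncandidates=10)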
+
+
+
+#TSR
+ def tsr(self, type="combined",max_iterations=10000000000, verbose=True):
+ component={}
+ firstcomponent={}
+ middlecomponent={}
+ lastcomponent={}
+ self.tsr_terms=[]
+ self.cur.execute("SELECT term FROM tsr_terms")
+ results=self.cur.fetchall()
+ for r in results:
+ self.tsr_terms.append(r[0])
+ for term in self.tsr_terms:
+ camps=term.split()
+ if len(camps)==1: #UNIGRAMS
+ firstcomponent[camps[0].lower()]=1
+ lastcomponent[camps[0].lower()]=1
+ if len(camps)>=2:
+ firstcomponent[camps[0].lower()]=1
+ lastcomponent[camps[-1].lower()]=1
+ component[camps[0].lower()]=1
+ component[camps[-1].lower()]=1
+ if len(camps)>=3:
+ for i in range(1,len(camps)-1):
+ middlecomponent[camps[i].lower()]=1
+ component[camps[i].lower()]=1
+
+ new=True
+ newcandidates={} #candidate-frequency
+ hashmeasure={}
+ hashvalue={}
+
+ newcandidatestempstric={} #candidate-frequency
+ hashmeasuretempstrict={}
+ hashvaluetempstric={}
+
+ newcandidatestempflexible={} #candidate-frequency
+ hashmeasuretempflexible={}
+ hashvaluetempflexible={}
+
+ newcandidatestempcombined={} #candidate-frequency
+ hashmeasuretempcombined={}
+ hashvaluetempcombined={}
+
+ iterations=0
+ while new:
+ iterations+=1
+ if verbose: print("ITERATION",iterations)
+ new=False
+ self.cur.execute("SELECT candidate,n,frequency,measure,value FROM term_candidates ")
+ results=self.cur.fetchall()
+ auxiliar={}
+ value=max_iterations-iterations#r[4]
+ for r in results:
+ candidate=r[0]
+ n=r[1]
+ frequency=r[2]
+ measure="tsr"#r[3]
+ #IMPLEMENTED ONLY FOR BIGRAMS !!!
+ '''
+ rcamps=candidate.split()
+ if type=="strict":
+ if rcamps[0] in firstcomponent and rcamps[-1] in lastcomponent:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ elif type=="flexible":
+ if rcamps[0] in firstcomponent or rcamps[-1] in lastcomponent:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+ elif type=="combined":
+ if iterations==1:
+ if rcamps[0] in firstcomponent and rcamps[-1] in lastcomponent:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+ else:
+ if rcamps[0] in firstcomponent or rcamps[-1] in lastcomponent:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+ '''
+ first_c=False
+ middle_c=False
+ last_c=False
+ rcamps=candidate.split()
+ truesfalses=[]
+ if str(rcamps[0]).lower() in firstcomponent:
+ first_c=True
+ truesfalses.append(True)
+ else:
+ truesfalses.append(False)
+ if str(rcamps[-1]).lower() in lastcomponent:
+ last_c=True
+ truesfalses.append(True)
+ else:
+ truesfalses.append(False)
+ if n>2:
+ middle_c=True
+ for i in range(1,n-1):
+ if not str(rcamps[i]).lower() in middlecomponent: middle_c=False
+ if middle_c==True:
+ truesfalses.append(True)
+ else:
+ truesfalses.append(False)
+ if type=="strict":
+ if not False in truesfalses:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ elif type=="flexible":
+ if True in truesfalses:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+ elif type=="combined":
+ if iterations==1:
+ new=True
+ if not False in truesfalses:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ if n>2:
+ for i in range(1,n-1):
+ middlecomponent[rcamps[i]]=1
+ component[rcamps[i]]=1
+ else:
+ if True in truesfalses:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ if n>2:
+ for i in range(1,n-1):
+ middlecomponent[rcamps[i]]=1
+ component[rcamps[i]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+ '''
+ if n==2:
+ if rcamps[0] in firstcomponent: first_c=True
+ middle_c=True
+ if rcamps[-1] in lastcomponent: last_c=True
+ if type=="strict":
+ if first_c and last_c:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ elif type=="flexible":
+ if first_c or last_c:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+
+ elif type=="combined":
+ if iterations==1:
+ if first_c and last_c:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+ else:
+ if first_c or last_c:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[-1]]=1
+
+ elif n==3:
+ if rcamps[0] in firstcomponent: first_c=True
+ if rcamps[1] in middlecomponent: middle_c=True
+ if rcamps[-1] in lastcomponent: last_c=True
+ if type=="strict":
+ if first_c and middle_c and last_c:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ middlecomponent[rcamps[1]]=1
+ lastcomponent[rcamps[-1]]=1
+ elif type=="flexible":
+ condition=False
+ if first_c and middle_c or last_c: condition=True
+ #if first_c or middle_c and last_c: condition=True
+ if last_c and middle_c or first_c: condition=True
+ if condition:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ middlecomponent[rcamps[1]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[1]]=1
+ component[rcamps[-1]]=1
+
+ elif type=="combined":
+ if iterations==1:
+ if first_c and middle_c and last_c:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ middlecomponent[rcamps[1]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[1]]=1
+ component[rcamps[-1]]=1
+ else:
+ condition=False
+ if first_c or middle_c or last_c: condition=True
+ #if first_c and middle_c or last_c: condition=True
+ #if first_c or middle_c and last_c: condition=True
+ #if last_c and middle_c or first_c: condition=True
+ if condition:
+ if not candidate in newcandidates:
+ newcandidates[candidate]=frequency
+ hashmeasure[candidate]=measure
+ hashvalue[candidate]=value
+ new=True
+ firstcomponent[rcamps[0]]=1
+ middlecomponent[rcamps[1]]=1
+ lastcomponent[rcamps[-1]]=1
+ component[rcamps[0]]=1
+ component[rcamps[1]]=1
+ component[rcamps[-1]]=1
+ '''
+ if iterations>=max_iterations:
+ break
+ if verbose: print(iterations,new)
+ with self.conn:
+ self.cur.execute('DELETE FROM term_candidates')
+ self.conn.commit()
+
+
+ data=[]
+ for c in newcandidates:
+ termb=c
+ n=len(c.split())
+ freqtotal=newcandidates[c]
+ measure=hashmeasure[c]
+ value=hashvalue[c]
+ record=[]
+ record.append(termb)
+ record.append(n)
+ record.append(freqtotal)
+ record.append(measure)
+ record.append(value)
+ data.append(record)
+ with self.conn:
+ self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency, measure, value) VALUES (?,?,?,?,?)",data)
+
+ self.conn.commit()
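+
+ # Hedged note on TSR (Token Slot Recognition): assuming the tsr_terms table has
+ # already been populated with seed terms, their first/middle/last tokens seed the
+ # component sets and the stored candidates are then iteratively re-filtered, e.g.
+ #   extractor.tsr(type="combined")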
+
+def L_LLR(a,b,c):
+ '''Auxiliary function to calculate the Log Likelihood Ratio'''
+ L=(c**a)*((1-c)**(b-a))
+ return(L)
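+
+ # Hedged note: L_LLR(a,b,c)=c^a*(1-c)^(b-a) is the binomial likelihood term L(k,n,x)
+ # used in the classical log-likelihood ratio for bigrams, where c1 and c2 are the
+ # unigram counts, c12 the bigram count and N the corpus size:
+ #   logLambda = log L(c12,c1,p) + log L(c2-c12,N-c1,p)
+ #             - log L(c12,c1,p1) - log L(c2-c12,N-c1,p2)
+ # with p=c2/N, p1=c12/c1, p2=(c2-c12)/(N-c1); -2*logLambda is the statistic.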
+
+class myBigramAssocMeasures(nltk.collocations.BigramAssocMeasures):
+ """
+ A collection of bigram association measures. Each association measure
+ is provided as a function with three arguments::
+
+ bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
+
+ The arguments constitute the marginals of a contingency table, counting
+ the occurrences of particular events in a corpus. The letter i in the
+ suffix refers to the appearance of the word in question, while x indicates
+ the appearance of any word. Thus, for example:
+
+ n_ii counts (w1, w2), i.e. the bigram being scored
+ n_ix counts (w1, *)
+ n_xi counts (*, w2)
+ n_xx counts (*, *), i.e. any bigram
+
+ This may be shown with respect to a contingency table::
+
+ w1 ~w1
+ ------ ------
+ w2 | n_ii | n_oi | = n_xi
+ ------ ------
+ ~w2 | n_io | n_oo |
+ ------ ------
+ = n_ix TOTAL = n_xx
+
+ Using Pazienza's terminology:
+
+ w1 ~w1
+ ------ ------
+ w2 | n_ii O11| n_oi O12| = n_xi
+ ------ ------
+ ~w2 | n_io O21| n_oo O22|
+ ------ ------
+ = n_ix TOTAL = n_xx
+
+ N=O11+O12+O21+O22= n_xx
+ R1=O11+O12 = n_xi
+ R2=O21+O22
+ C1=O11+O21=n_ix
+ C2=O12+O22= n_oi+n_oo
+
+ WE HAVE: n_ii, n_ix_xi_tuple, n_xx
+ n_io=n_ix-n_ii
+ n_oi=n_xi-n_ii
+ n_oo=n_xx-n_ix-n_xi
+ """
+ #MEASURES NOT IMPLEMENTED IN NLTK
+
+ def loglikelihood(self,n_ii, n_ix_xi_tuple, n_xx):
+ '''LogLikelihood according to NSP'''
+ (n_ix, n_xi) = n_ix_xi_tuple
+ n_io=n_ix-n_ii
+ n_oi=n_xi-n_ii
+ n_oo=n_xx-n_ix-n_xi
+
+ n11=n_ii
+ n12=n_io
+ n21=n_oi
+ n22=n_oo
+ n1p=n11+n12
+ np1=n11+n21
+ n2p=n21+n22
+ np2=n12+n22
+ npp=n_xx
+
+ m11 = (n1p*np1/npp)
+ m12 = (n1p*np2/npp)
+ m21 = (np1*n2p/npp)
+ m22 = (n2p*np2/npp)
+ try:
+ LogLikelihood = 2 * (n11 * math.log((n11/m11),2) + n12 * math.log((n12/m12),2) + n21 * math.log((n21/m21),2) + n22 * math.log((n22/m22),2))
+ except:
+ LogLikelihood=0
+ return(LogLikelihood)
+
+ def MI(self,n_ii, n_ix_xi_tuple, n_xx):
+ '''Church Mutual Information according to Pazienza'''
+ (n_ix, n_xi) = n_ix_xi_tuple
+ self.E11=n_xi*n_ix/n_xx
+ self.part=n_ii/self.E11
+ self.MI=math.log(self.part,2)
+ return(self.MI)
+
+ def MI2(self,n_ii, n_ix_xi_tuple, n_xx):
+ '''Church Mutual Information variant according to Pazienza'''
+ (n_ix, n_xi) = n_ix_xi_tuple
+ self.E11=n_xi*n_ix/n_xx
+ self.part=(n_ii/self.E11)**2
+ self.MI2=math.log(self.part,2)
+ return(self.MI2)
+
+ def MI3(self,n_ii, n_ix_xi_tuple, n_xx):
+ '''Church Mutual Information variant (MI3) according to Pazienza'''
+ (n_ix, n_xi) = n_ix_xi_tuple
+ E11=n_xi*n_ix/n_xx
+ MI3=math.log((n_ii/E11)**3,2)
+ return(MI3)
+
+ def odds(self,n_ii, n_ix_xi_tuple, n_xx):
+ '''Odds ratio according to NSP'''
+ (n_ix, n_xi) = n_ix_xi_tuple
+ n_io=n_ix-n_ii
+ n_oi=n_xi-n_ii
+ n_oo=n_xx-n_ix-n_xi
+
+ n11=n_ii
+ n12=n_io
+ n21=n_oi
+ n22=n_oo
+ n1p=n11+n12
+ np1=n11+n21
+ n2p=n21+n22
+ np2=n12+n22
+ npp=n_xx
+
+ m11 = (n1p*np1/npp)
+ m12 = (n1p*np2/npp)
+ m21 = (np1*n2p/npp)
+ m22 = (n2p*np2/npp)
+
+ if n21==0:n21=1
+ if n12==0:n12=1
+ ODDS_RATIO = (n11*n22)/(n21*n12)
+ return(ODDS_RATIO)
+
+ def z_score(self,n_ii, n_ix_xi_tuple, n_xx):
+ '''z-score according to NSP'''
+ (n_ix, n_xi) = n_ix_xi_tuple
+ n_io=n_ix-n_ii
+ n_oi=n_xi-n_ii
+ n_oo=n_xx-n_ix-n_xi
+
+ n11=n_ii
+ n12=n_io
+ n21=n_oi
+ n22=n_oo
+ n1p=n11+n12
+ np1=n11+n21
+ n2p=n21+n22
+ np2=n12+n22
+ npp=n_xx
+
+ m11 = (n1p*np1/npp)
+ m12 = (n1p*np2/npp)
+ m21 = (np1*n2p/npp)
+ m22 = (n2p*np2/npp)
+
+ zscore = (n11-m11)/(math.sqrt(m11))
+ return(zscore)
+
+
+class myTrigramAssocMeasures(nltk.collocations.TrigramAssocMeasures):
+ pass
+
+class myQuadgramAssocMeasures(nltk.collocations.QuadgramAssocMeasures):
+ pass
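+ # Hedged usage sketch (the variable "tokens" is illustrative, not defined in TBXTools):
+ # these measure classes plug into NLTK's collocation finders, which call each measure
+ # with (n_ii, (n_ix, n_xi), n_xx), the calling convention assumed by the methods above:
+ # finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
+ # measures = myBigramAssocMeasures()
+ # top_bigrams = finder.nbest(measures.loglikelihood, 10)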
+
+
+
+
+
+### Code adapted from Mikel Artetxe's embedding mapping tool (map_embeddings) ###
+# cupy_utils
+import numpy
+
+try:
+ import cupy
+except ImportError:
+ cupy = None
+
+
+def supports_cupy():
+ return cupy is not None
+
+
+def get_cupy():
+ return cupy
+
+
+def get_array_module(x):
+ if cupy is not None:
+ return cupy.get_array_module(x)
+ else:
+ return numpy
+
+
+def asnumpy(x):
+ if cupy is not None:
+ return cupy.asnumpy(x)
+ else:
+ return numpy.asarray(x)
+#embeddings
+
+def embeddings_read(file, threshold=0, vocabulary=None, dtype='float'):
+ header = file.readline().split(' ')
+ count = int(header[0]) if threshold <= 0 else min(threshold, int(header[0]))
+ dim = int(header[1])
+ words = []
+ matrix = np.empty((count, dim), dtype=dtype) if vocabulary is None else []
+ for i in range(count):
+ word, vec = file.readline().split(' ', 1)
+ if vocabulary is None:
+ words.append(word)
+ matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype)
+ elif word in vocabulary:
+ words.append(word)
+ matrix.append(np.fromstring(vec, sep=' ', dtype=dtype))
+ return (words, matrix) if vocabulary is None else (words, np.array(matrix, dtype=dtype))
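+ # Usage sketch (the file name 'embeddings.sl.vec' is illustrative): embeddings_read()
+ # expects an already opened text file in word2vec text format, i.e. a "count dim"
+ # header line followed by one "word v1 ... v_dim" line per entry:
+ # with open('embeddings.sl.vec', encoding='utf-8', errors='surrogateescape') as f:
+ #     words, matrix = embeddings_read(f, dtype='float32')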
+
+
+def embeddings_write(words, matrix, file):
+ m = asnumpy(matrix)
+ print('%d %d' % m.shape, file=file)
+ for i in range(len(words)):
+ print(words[i] + ' ' + ' '.join(['%.6g' % x for x in m[i]]), file=file)
+
+
+def embeddings_length_normalize(matrix):
+ xp = get_array_module(matrix)
+ norms = xp.sqrt(xp.sum(matrix**2, axis=1))
+ norms[norms == 0] = 1
+ matrix /= norms[:, xp.newaxis]
+
+
+def embeddings_mean_center(matrix):
+ xp = get_array_module(matrix)
+ avg = xp.mean(matrix, axis=0)
+ matrix -= avg
+
+
+def embeddings_length_normalize_dimensionwise(matrix):
+ xp = get_array_module(matrix)
+ norms = xp.sqrt(xp.sum(matrix**2, axis=0))
+ norms[norms == 0] = 1
+ matrix /= norms
+
+
+def embeddings_mean_center_embeddingwise(matrix):
+ xp = get_array_module(matrix)
+ avg = xp.mean(matrix, axis=1)
+ matrix -= avg[:, xp.newaxis]
+
+
+def embeddings_normalize(matrix, actions):
+ for action in actions:
+ if action == 'unit':
+ embeddings_length_normalize(matrix)
+ elif action == 'center':
+ embeddings_mean_center(matrix)
+ elif action == 'unitdim':
+ embeddings_length_normalize_dimensionwise(matrix)
+ elif action == 'centeremb':
+ embeddings_mean_center_embeddingwise(matrix)
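+ # Example: embeddings_normalize(matrix, ['unit', 'center', 'unit']) length-normalizes the
+ # rows, mean-centers them and length-normalizes again, all in place; this is the same
+ # action chain that supervised_mapping() below applies to both embedding spaces.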
+
+#map_embeddings
+
+def dropout(m, p):
+ if p <= 0.0:
+ return m
+ else:
+ xp = get_array_module(m)
+ mask = xp.random.rand(*m.shape) >= p
+ return m*mask
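+ # Note: dropout() zeroes each entry with probability p and leaves the surviving entries
+ # unscaled (no 1/(1-p) rescaling); with p<=0 the matrix is returned unchanged.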
+
+
+def topk_mean(m, k, inplace=False): # TODO Assuming that axis is 1
+ xp = get_array_module(m)
+ n = m.shape[0]
+ ans = xp.zeros(n, dtype=m.dtype)
+ if k <= 0:
+ return ans
+ if not inplace:
+ m = xp.array(m)
+ ind0 = xp.arange(n)
+ ind1 = xp.empty(n, dtype=int)
+ minimum = m.min()
+ for i in range(k):
+ m.argmax(axis=1, out=ind1)
+ ans += m[ind0, ind1]
+ m[ind0, ind1] = minimum
+ return ans / k
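+ # Worked example: topk_mean() averages the k largest values of each row, e.g.
+ # topk_mean(np.array([[1., 5., 3.], [2., 2., 8.]]), k=2) returns approximately [4., 5.].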
+
+def supervised_mapping(src_input,trg_input,src_output,trg_output,init_dictionary,encoding="utf-8",precision="fp32",cuda=False,batch_size=1000,seed=0,unsupervised_vocab=0,src_reweight=0,trg_reweight=0,dim_reduction=0,vocabulary_cutoff=0,direction="union",csls=0,threshold=0.000001,validation=None,stochastic_initial=0.1,stochastic_multiplier=2.0,stochastic_interval=50):
+ '''Maps source and target embeddings into a shared space using a seed dictionary (supervised mode of map_embeddings).'''
+ print("SUPERVISED")
+ # Settings equivalent to the original script's --supervised defaults:
+ # normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5,
+ # src_dewhiten='src', trg_dewhiten='trg', batch_size=1000
+ normalize=['unit', 'center', 'unit']
+ whiten=True
+ src_reweight=0.5
+ trg_reweight=0.5
+ src_dewhiten='src'
+ trg_dewhiten='trg'
+ identical=False
+ unsupervised=False
+ init_identical=False
+ init_numerals=False
+ init_unsupervised=False
+ orthogonal=False
+ unconstrained=False
+ self_learning=False
+ verbose=False
+ csls_neighborhood=csls # only used by the self-learning branch below
+ log=None # optional log file handle, only used by the self-learning branch below
+ if precision == 'fp16':
+ dtype = 'float16'
+ elif precision == 'fp32':
+ dtype = 'float32'
+ elif precision == 'fp64':
+ dtype = 'float64'
+ else:
+ dtype = 'float32'
+
+ # Read input embeddings
+ print("Read input embeddings")
+ srcfile = open(src_input, encoding=encoding, errors='surrogateescape')
+ trgfile = open(trg_input, encoding=encoding, errors='surrogateescape')
+ src_words, x = embeddings_read(srcfile, dtype=dtype)
+ trg_words, z = embeddings_read(trgfile, dtype=dtype)
+
+ # NumPy/CuPy management
+ if cuda:
+ if not supports_cupy():
+ print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
+ sys.exit(-1)
+ xp = get_cupy()
+ x = xp.asarray(x)
+ z = xp.asarray(z)
+ else:
+ xp = np
+ xp.random.seed(seed)
+
+ # Build word to index map
+ print("Build word to index map")
+ src_word2ind = {word: i for i, word in enumerate(src_words)}
+ trg_word2ind = {word: i for i, word in enumerate(trg_words)}
+
+ # STEP 0: Normalization
+ print("STEP 0: Normalization")
+ embeddings_normalize(x, normalize)
+ embeddings_normalize(z, normalize)
+
+ # Build the seed dictionary
+ print("Build the seed dictionary")
+ src_indices = []
+ trg_indices = []
+
+ f = open(init_dictionary, encoding=encoding, errors='surrogateescape')
+ for line in f:
+ src, trg = line.split()
+ try:
+ src_ind = src_word2ind[src]
+ trg_ind = trg_word2ind[trg]
+ src_indices.append(src_ind)
+ trg_indices.append(trg_ind)
+ except KeyError:
+ pass
+ #print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
+
+ # Read validation dictionary
+ if validation is not None:
+ f = open(validation, encoding=encoding, errors='surrogateescape')
+ validation = collections.defaultdict(set)
+ oov = set()
+ vocab = set()
+ for line in f:
+ src, trg = line.split()
+ try:
+ src_ind = src_word2ind[src]
+ trg_ind = trg_word2ind[trg]
+ validation[src_ind].add(trg_ind)
+ vocab.add(src)
+ except KeyError:
+ oov.add(src)
+ oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov
+ validation_coverage = len(validation) / (len(validation) + len(oov))
+
+
+
+ # Allocate memory
+ print("Allocate memory")
+ xw = xp.empty_like(x)
+ zw = xp.empty_like(z)
+ src_size = x.shape[0] if vocabulary_cutoff <= 0 else min(x.shape[0], vocabulary_cutoff)
+ trg_size = z.shape[0] if vocabulary_cutoff <= 0 else min(z.shape[0], vocabulary_cutoff)
+ simfwd = xp.empty((batch_size, trg_size), dtype=dtype)
+ simbwd = xp.empty((batch_size, src_size), dtype=dtype)
+ if validation is not None:
+ simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)
+
+ best_sim_forward = xp.full(src_size, -100, dtype=dtype)
+ src_indices_forward = xp.arange(src_size)
+ trg_indices_forward = xp.zeros(src_size, dtype=int)
+ best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
+ src_indices_backward = xp.zeros(trg_size, dtype=int)
+ trg_indices_backward = xp.arange(trg_size)
+ knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
+ knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)
+
+ # Training loop
+ print("Training loop")
+ best_objective = objective = -100.
+ it = 1
+ last_improvement = 0
+ keep_prob = stochastic_initial
+ t = time.time()
+ end = not self_learning
+ while True:
+
+ # Increase the keep probability if we have not improved in stochastic_interval iterations
+ if it - last_improvement > stochastic_interval:
+ if keep_prob >= 1.0:
+ end = True
+ keep_prob = min(1.0, stochastic_multiplier*keep_prob)
+ last_improvement = it
+
+ # Update the embedding mapping
+ if orthogonal or not end: # orthogonal mapping
+ u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
+ w = vt.T.dot(u.T)
+ x.dot(w, out=xw)
+ zw[:] = z
+ elif unconstrained: # unconstrained mapping
+ x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
+ w = x_pseudoinv.dot(z[trg_indices])
+ x.dot(w, out=xw)
+ zw[:] = z
+ else: # advanced mapping
+
+ # TODO xw.dot(wx2, out=xw) and alike not working
+ xw[:] = x
+ zw[:] = z
+
+ # STEP 1: Whitening
+ def whitening_transformation(m):
+ u, s, vt = xp.linalg.svd(m, full_matrices=False)
+ return vt.T.dot(xp.diag(1/s)).dot(vt)
+ if whiten:
+ wx1 = whitening_transformation(xw[src_indices])
+ wz1 = whitening_transformation(zw[trg_indices])
+ xw = xw.dot(wx1)
+ zw = zw.dot(wz1)
+
+ # STEP 2: Orthogonal mapping
+ wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
+ wz2 = wz2_t.T
+ xw = xw.dot(wx2)
+ zw = zw.dot(wz2)
+
+ # STEP 3: Re-weighting
+ xw *= s**src_reweight
+ zw *= s**trg_reweight
+
+ # STEP 4: De-whitening
+ if src_dewhiten == 'src':
+ xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
+ elif src_dewhiten == 'trg':
+ xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
+ if trg_dewhiten == 'src':
+ zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
+ elif trg_dewhiten == 'trg':
+ zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
+
+ # STEP 5: Dimensionality reduction
+ if dim_reduction > 0:
+ xw = xw[:, :dim_reduction]
+ zw = zw[:, :dim_reduction]
+
+ # Self-learning
+ if end:
+ break
+ else:
+ # Update the training dictionary
+ if direction in ('forward', 'union'):
+ if csls_neighborhood > 0:
+ for i in range(0, trg_size, simbwd.shape[0]):
+ j = min(i + simbwd.shape[0], trg_size)
+ zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
+ knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=csls_neighborhood, inplace=True)
+ for i in range(0, src_size, simfwd.shape[0]):
+ j = min(i + simfwd.shape[0], src_size)
+ xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
+ simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
+ simfwd[:j-i] -= knn_sim_bwd/2 # Equivalent to the real CSLS scores for NN
+ dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
+ if direction in ('backward', 'union'):
+ if csls_neighborhood > 0:
+ for i in range(0, src_size, simfwd.shape[0]):
+ j = min(i + simfwd.shape[0], src_size)
+ xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
+ knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=csls_neighborhood, inplace=True)
+ for i in range(0, trg_size, simbwd.shape[0]):
+ j = min(i + simbwd.shape[0], trg_size)
+ zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
+ simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
+ simbwd[:j-i] -= knn_sim_fwd/2 # Equivalent to the real CSLS scores for NN
+ dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
+ if direction == 'forward':
+ src_indices = src_indices_forward
+ trg_indices = trg_indices_forward
+ elif direction == 'backward':
+ src_indices = src_indices_backward
+ trg_indices = trg_indices_backward
+ elif direction == 'union':
+ src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
+ trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
+
+ # Objective function evaluation
+ if direction == 'forward':
+ objective = xp.mean(best_sim_forward).tolist()
+ elif direction == 'backward':
+ objective = xp.mean(best_sim_backward).tolist()
+ elif direction == 'union':
+ objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
+ if objective - best_objective >= threshold:
+ last_improvement = it
+ best_objective = objective
+
+ # Accuracy and similarity evaluation in validation
+ if validation is not None:
+ src = list(validation.keys())
+ xw[src].dot(zw.T, out=simval)
+ nn = asnumpy(simval.argmax(axis=1))
+ accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
+ similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])
+
+ # Logging
+ duration = time.time() - t
+ if verbose:
+ print(file=sys.stderr)
+ print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
+ print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
+ print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
+ if validation is not None:
+ print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
+ print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
+ print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
+ sys.stderr.flush()
+ if log is not None:
+ val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
+ 100 * similarity, 100 * accuracy, 100 * validation_coverage) if validation is not None else ''
+ print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
+ log.flush()
+
+ t = time.time()
+ it += 1
+
+ # Write mapped embeddings
+ print("Write mapped embeddings")
+ srcfile = open(src_output, mode='w', encoding=encoding, errors='surrogateescape')
+ trgfile = open(trg_output, mode='w', encoding=encoding, errors='surrogateescape')
+ embeddings_write(src_words, xw, srcfile)
+ embeddings_write(trg_words, zw, trgfile)
+ srcfile.close()
+ trgfile.close()
+
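+# Hedged usage sketch (all file names are illustrative, not files shipped with TBXTools):
+# map source and target embeddings in word2vec text format into a shared space, using a
+# seed dictionary with one "source_word target_word" pair per line:
+# supervised_mapping('emb.sl.vec', 'emb.tl.vec', 'emb.sl.mapped.vec', 'emb.tl.mapped.vec',
+#                    init_dictionary='seed_dict.txt')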