diff options
author | Antoni Oliver <aoliverg@uoc.edu> | 2021-05-11 12:41:52 +0300 |
---|---|---|
committer | Antoni Oliver <aoliverg@uoc.edu> | 2021-05-11 12:41:52 +0300 |
commit | 37137b80c1ece6c3be30a3a5220d41d79b686c5e (patch) | |
tree | 1f3faf5a781854d7918b926f30defdea2cc3fb6e | |
parent | d77e589b4ac409f633856842eb4e2b15020465ef (diff) |
Solved a problem with frequencies in linguistic term extraction (TE)
-rwxr-xr-x | TBXTools.py | 22 |
1 file changed, 13 insertions, 9 deletions
diff --git a/TBXTools.py b/TBXTools.py index cc6c6e6..1dc9593 100755 --- a/TBXTools.py +++ b/TBXTools.py @@ -16,8 +16,8 @@ import math import string -#version: 2020/10/06 -#Copyright: Antoni Oliver (2020) - Universitat Oberta de Catalunya - aoliverg@uoc.edu +#version: 2021/05/1 +# Copyright: Antoni Oliver (2021) - Universitat Oberta de Catalunya - aoliverg@uoc.edu # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or @@ -844,7 +844,7 @@ class TBXTools: self.segment=self.s[0] for self.n in range(nmin,nmax+2): #we calculate one order bigger in order to detect nested candidates #self.ngs=ngrams(self.sl_tokenizer.tokenize(self.segment), self.n) - self.ngs=ngrams(self.segment.split(" "), self.n) + self.ngs=ngrams(self.segment.split(), self.n) for self.ng in self.ngs: self.ngrams[self.ng]+=1 #for self.token in self.sl_tokenizer.tokenize(self.segment): @@ -1398,8 +1398,8 @@ class TBXTools: for self.s in self.cur.fetchall(): self.segment=self.s[0] - for self.n in range(nmin,nmax+2): #we calculate one order bigger in order to detect nested candidates - self.ngs=ngrams(self.segment.split(" "), self.n) + for self.n in range(nmin,nmax+1): + self.ngs=ngrams(self.segment.split(), self.n) for self.ng in self.ngs: self.ngrams[self.ng]+=1 @@ -1461,11 +1461,16 @@ class TBXTools: def linguistic_term_extraction(self,minfreq=2): '''Performs an linguistic term extraction using the extracted tagged ngrams (tagged_ngram_calculation should be executed first). 
''' self.linguistics_patterns=[] + self.controlpatterns=[] with self.conn: self.cur.execute("SELECT linguistic_pattern from linguistic_patterns") for self.lp in self.cur.fetchall(): self.linguistic_pattern=self.lp[0] - self.linguistic_patterns.append("^"+self.linguistic_pattern+"$") + self.transformedpattern="^"+self.linguistic_pattern+"$" + if not self.transformedpattern in self.controlpatterns: + self.linguistic_patterns.append(self.transformedpattern) + self.controlpatterns.append(self.transformedpattern) + self.cur.execute("SELECT tagged_ngram, n, frequency FROM tagged_ngrams order by frequency desc") self.results=self.cur.fetchall() @@ -1483,10 +1488,8 @@ class TBXTools: if self.include: for self.pattern in self.linguistic_patterns: self.match=re.search(self.pattern,self.ng) - if self.match: - if self.match.group(0)==self.ng: - + if self.match.group(0)==self.ng: self.candidate=" ".join(self.match.groups()[1:]) self.record=[] self.record.append(self.candidate) @@ -1495,6 +1498,7 @@ class TBXTools: self.record.append("freq") self.record.append(self.frequency) self.data.append(self.record) + break with self.conn: #self.cur.executemany("INSERT INTO term_candidates (candidate, n, frequency) VALUES (?,?,?)",self.data) |