Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/python/modules/TextTools/TextTools.py')
-rw-r--r--intern/python/modules/TextTools/TextTools.py766
1 files changed, 766 insertions, 0 deletions
diff --git a/intern/python/modules/TextTools/TextTools.py b/intern/python/modules/TextTools/TextTools.py
new file mode 100644
index 00000000000..7eae2bcfc39
--- /dev/null
+++ b/intern/python/modules/TextTools/TextTools.py
@@ -0,0 +1,766 @@
+""" mxTextTools - A tools package for fast text processing.
+
+ (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
+ See the documentation for further information on copyrights,
+ or contact the author (mal@lemburg.com).
+"""
+import string,types
+
+#
+# import the C module and the version number
+#
+from mxTextTools import *
+from mxTextTools import __version__
+
+#
+# import the symbols needed to write tag tables
+#
+from Constants.TagTables import *
+
+#
+# import the some handy character sets
+#
+from Constants.Sets import *
+
+#
+# format and print tables, taglists and joinlists:
+#
+def format_entry(table,i,
+
+ TupleType=types.TupleType):
+
+ """ Returns a pp-formatted tag table entry as string
+ """
+ e = table[i]
+ jne = 0
+ je = 1
+ t,c,m = e[:3]
+ if len(e)>3: jne = e[3]
+ if len(e)>4: je = e[4]
+ flags,cmd = divmod(c,256)
+ c = id2cmd[cmd]
+ if type(m) == TupleType and c in ('Table','SubTable'):
+ m = '<table>'
+ elif m == None:
+ m = 'Here/To'
+ else:
+ m = repr(m)
+ if len(m) > 17:
+ m = m[:17]+'...'
+ return '%-15.15s : %-30s : jne=%+i : je=%+i' % \
+ (repr(t),'%-.15s : %s'%(c,m),jne,je)
+
+def format_table(table,i=-1):
+
+ """ Returns a pp-formatted version of the tag table as string """
+
+ l = []
+ for j in range(len(table)):
+ if i == j:
+ l.append('--> '+format_entry(table,j))
+ else:
+ l.append(' '+format_entry(table,j))
+ return string.join(l,'\n')+'\n'
+
+def print_tagtable(table):
+
+ """ Print the tag table
+ """
+ print format_table(table)
+
+def print_tags(text,tags,indent=0):
+
+ """ Print the taglist tags for text using the given indent level
+ """
+ for tag,l,r,subtags in tags:
+ tagname = repr(tag)
+ if len(tagname) > 20:
+ tagname = tagname[:20] + '...'
+ target = repr(text[l:r])
+ if len(target) > 60:
+ target = target[:60] + '...'
+ if subtags == None:
+ print ' '+indent*' |',tagname,': ',target,(l,r)
+ else:
+ print ' '+indent*' |',tagname,': ',target,(l,r)
+ print_tags(text,subtags,indent+1)
+
+def print_joinlist(joins,indent=0,
+
+ StringType=types.StringType):
+
+ """ Print the joinlist joins using the given indent level
+ """
+ for j in joins:
+ if type(j) == StringType:
+ text = repr(j)
+ if len(text) > 40:
+ text = text[:40] + '...'
+ print ' '+indent*' |',text,' (len = %i)' % len(j)
+ else:
+ text = j[0]
+ l,r = j[1:3]
+ text = repr(text[l:r])
+ if len(text) > 40:
+ text = text[:40] + '...'
+ print ' '+indent*' |',text,' (len = %i)' % (r-l),(l,r)
+
+def normlist(jlist,
+
+ StringType=types.StringType):
+
+ """ Return a normalized joinlist.
+
+ All tuples in the joinlist are turned into real strings. The
+ resulting list is a equivalent copy of the joinlist only
+ consisting of strings.
+
+ """
+ l = [''] * len(jlist)
+ for i in range(len(jlist)):
+ entry = jlist[i]
+ if type(entry) == StringType:
+ l[i] = entry
+ else:
+ l[i] = entry[0][entry[1]:entry[2]]
+ return l
+
+#
+# aid for matching from a list of words
+#
+def _lookup_dict(l,index=0):
+
+ d = {}
+ for w in l:
+ c = w[index]
+ if d.has_key(c):
+ d[c].append(w)
+ else:
+ d[c] = [w]
+ return d
+
+def word_in_list(l):
+
+ """ Creates a lookup table that matches the words in l
+ """
+ t = []
+ d = _lookup_dict(l)
+ keys = d.keys()
+ if len(keys) < 18: # somewhat arbitrary bound
+ # fast hint for small sets
+ t.append((None,IsIn,string.join(d.keys(),'')))
+ t.append((None,Skip,-1))
+ # test groups
+ for c, group in d.items():
+ t.append(None) # hint will be filled in later
+ i = len(t)-1
+ for w in group:
+ t.append((None,Word,w[1:],+1,MatchOk))
+ t.append((None,Fail,Here))
+ # add hint
+ t[i] = (None,Is,c,len(t)-i)
+ t.append((None,Fail,Here))
+ return tuple(t)
+
+#
+# Extra stuff useful in combination with the C functions
+#
+
+def replace(text,what,with,start=0,stop=None,
+
+ SearchObject=BMS,join=join,joinlist=joinlist,tag=tag,
+ string_replace=string.replace,type=type,
+ StringType=types.StringType):
+
+ """A fast replacement for string.replace.
+
+ what can be given as string or search object.
+
+ This function is a good example for the AppendTagobj-flag usage
+ (the taglist can be used directly as joinlist).
+
+ """
+ if type(what) == StringType:
+ so = SearchObject(what)
+ else:
+ so = what
+ what = so.match
+ if stop is None:
+ if start == 0 and len(what) < 2:
+ return string_replace(text,what,with)
+ stop = len(text)
+ t = ((text,sWordStart,so,+2),
+ # Found something, replace and continue searching
+ (with,Skip+AppendTagobj,len(what),-1,-1),
+ # Rest of text
+ (text,Move,ToEOF)
+ )
+ found,taglist,last = tag(text,t,start,stop)
+ if not found:
+ return text
+ return join(taglist)
+
+# Alternative (usually slower) versions using different techniques:
+
+def _replace2(text,what,with,start=0,stop=None,
+
+ join=join,joinlist=joinlist,tag=tag,
+ StringType=types.StringType,BMS=BMS):
+
+ """Analogon to string.replace; returns a string with all occurences
+ of what in text[start:stop] replaced by with
+ - uses a one entry tag-table and a Boyer-Moore-Search-object
+ - what can be a string or a BMS/FS search object
+ - it's faster than string.replace in those cases, where
+ the what-string gets long and/or many replacements are found;
+ faster meaning from a few percent up to many times as fast
+ - start and stop define the slice of text to work in
+ - stop defaults to len(text)
+ """
+ if stop is None:
+ stop = len(text)
+ if type(what) == StringType:
+ what=BMS(what)
+ t = ((with,sFindWord,what,+1,+0),)
+ found,taglist,last = tag(text,t,start,stop)
+ if not found:
+ return text
+ return join(joinlist(text,taglist))
+
+def _replace3(text,what,with,
+
+ join=string.join,FS=FS,
+ StringType=types.StringType):
+
+ if type(what) == StringType:
+ what=FS(what)
+ slices = what.findall(text)
+ if not slices:
+ return text
+ l = []
+ x = 0
+ for left,right in slices:
+ l.append(text[x:left] + with)
+ x = right
+ l.append(text[x:])
+ return join(l,'')
+
+def _replace4(text,what,with,
+
+ join=join,joinlist=joinlist,tag=tag,FS=FS,
+ StringType=types.StringType):
+
+ if type(what) == StringType:
+ what=FS(what)
+ slices = what.findall(text)
+ if not slices:
+ return text
+ repl = [None]*len(slices)
+ for i in range(len(slices)):
+ repl[i] = (with,)+slices[i]
+ return join(joinlist(text,repl))
+
+
+def find(text,what,start=0,stop=None,
+
+ SearchObject=FS):
+
+ """ A faster replacement for string.find().
+
+ Uses a search object for the task. Returns the position of the
+ first occurance of what in text[start:stop]. stop defaults to
+ len(text). Returns -1 in case no occurance was found.
+
+ """
+ if stop:
+ return SearchObject(what).find(text,start,stop)
+ else:
+ return SearchObject(what).find(text,start)
+
+def findall(text,what,start=0,stop=None,
+
+ SearchObject=FS):
+
+ """ Find all occurances of what in text.
+
+ Uses a search object for the task. Returns a list of slice
+ tuples (l,r) marking the all occurances in
+ text[start:stop]. stop defaults to len(text). Returns an
+ empty list in case no occurance was found.
+
+ """
+ if stop:
+ return SearchObject(what).findall(text,start,stop)
+ else:
+ return SearchObject(what).findall(text,start)
+
+def split(text,sep,start=0,stop=None,translate=None,
+
+ SearchObject=FS):
+
+ """ A faster replacement for string.split().
+
+ Uses a search object for the task. Returns the result of
+ cutting the text[start:stop] string into snippets at every sep
+ occurance in form of a list of substrings. translate is passed
+ to the search object as translation string.
+
+ XXX convert to a C function... or even better, add as method
+ to search objects.
+
+ """
+ if translate:
+ so = SearchObject(sep,translate)
+ else:
+ so = SearchObject(sep)
+ if stop:
+ cuts = so.findall(text,start,stop)
+ else:
+ cuts = so.findall(text,start)
+ l = 0
+ list = []
+ append = list.append
+ for left,right in cuts:
+ append(text[l:left])
+ l = right
+ append(text[l:])
+ return list
+
+# helper for tagdict
+def _tagdict(text,dict,prefix,taglist):
+
+ for o,l,r,s in taglist:
+ pfx = prefix + str(o)
+ dict[pfx] = text[l:r]
+ if s:
+ _tagdict(text,dict,pfx+'.',s)
+
+def tagdict(text,*args):
+
+ """ Tag a text just like the function tag() and then convert
+ its output into a dictionary where the tagobjects reference
+ their respective strings
+ - this function emulates the interface of tag()
+ - in contrast to tag() this funtion *does* make copies
+ of the found stings
+ - returns a tuple (rc,tagdict,next) with the same meaning
+ of rc and next as tag(); tagdict is the new dictionary -
+ None in case rc is 0
+ """
+ rc,taglist,next = apply(tag,(text,)+args)
+ if not rc:
+ return (rc,None,next)
+ d = {}
+ tagdict = _tagdict
+ for o,l,r,s in taglist:
+ pfx = str(o)
+ d[pfx] = text[l:r]
+ if s:
+ tagdict(text,dict,pfx+'.',s)
+ return (rc,d,next)
+
+def invset(chars):
+
+ """ Return a set with all characters *except* the ones in chars.
+ """
+ return set(chars,0)
+
+def is_whitespace(text,start=0,stop=None,
+
+ nonwhitespace=nonwhitespace_set,setfind=setfind):
+
+ """ Return 1 iff text[start:stop] only contains whitespace
+ characters (as defined in Constants/Sets.py), 0 otherwise.
+ """
+ if stop is None:
+ stop = len(text)
+ i = setfind(text,nonwhitespace,start,stop)
+ return (i < 0)
+
+def collapse(text,seperator=' ',
+
+ join=join,setsplit=setsplit,collapse_set=set(newline+whitespace)):
+
+ """ Eliminates newline characters and compresses whitespace
+ characters into one space.
+
+ The result is a one line text string. Tim Peters will like
+ this function called with '-' seperator ;-)
+
+ """
+ return join(setsplit(text,collapse_set),seperator)
+
+_linesplit_table = (
+ (None,Is,'\r',+1),
+ (None,Is,'\n',+1),
+ ('line',AllInSet+AppendMatch,set('\r\n',0),+1,-2),
+ (None,EOF,Here,+1,MatchOk),
+ ('empty line',Skip+AppendMatch,0,0,-4),
+ )
+
+def splitlines(text,
+
+ tag=tag,linesplit_table=_linesplit_table):
+
+ """ Split text into a list of single lines.
+
+ The following combinations are considered to be line-ends:
+ '\r', '\r\n', '\n'; they may be used in any combination. The
+ line-end indicators are removed from the strings prior to
+ adding them to the list.
+
+ This function allows dealing with text files from Macs, PCs
+ and Unix origins in a portable way.
+
+ """
+ return tag(text,linesplit_table)[1]
+
+_linecount_table = (
+ (None,Is,'\r',+1),
+ (None,Is,'\n',+1),
+ ('line',AllInSet+AppendTagobj,set('\r\n',0),+1,-2),
+ (None,EOF,Here,+1,MatchOk),
+ ('empty line',Skip+AppendTagobj,0,0,-4),
+ )
+
+def countlines(text,
+
+ linecount_table=_linecount_table):
+
+ """ Returns the number of lines in text.
+
+ Line ends are treated just like for splitlines() in a
+ portable way.
+ """
+ return len(tag(text,linecount_table)[1])
+
+_wordsplit_table = (
+ (None,AllInSet,whitespace_set,+1),
+ ('word',AllInSet+AppendMatch,nonwhitespace_set,+1,-1),
+ (None,EOF,Here,+1,MatchOk),
+ )
+
+def splitwords(text,
+
+ setsplit=setsplit,whitespace_set=whitespace_set):
+
+ """ Split text into a list of single words.
+
+ Words are separated by whitespace. The whitespace is stripped
+ before adding the words to the list.
+
+ """
+ return setsplit(text,whitespace_set)
+
+#
+# Testing and benchmarking
+#
+
+# Taken from my hack.py module:
+import time
+class _timer:
+
+ """ timer class with a quite obvious interface
+ - .start() starts a fairly accurate CPU-time timer plus an
+ absolute timer
+ - .stop() stops the timer and returns a tuple: the CPU-time in seconds
+ and the absolute time elapsed since .start() was called
+ """
+
+ utime = 0
+ atime = 0
+
+ def start(self,
+ clock=time.clock,time=time.time):
+ self.atime = time()
+ self.utime = clock()
+
+ def stop(self,
+ clock=time.clock,time=time.time):
+ self.utime = clock() - self.utime
+ self.atime = time() - self.atime
+ return self.utime,self.atime
+
+ def usertime(self,
+ clock=time.clock,time=time.time):
+ self.utime = clock() - self.utime
+ self.atime = time() - self.atime
+ return self.utime
+
+ def abstime(self,
+ clock=time.clock,time=time.time):
+ self.utime = clock() - self.utime
+ self.atime = time() - self.atime
+ return self.utime
+
+ def __str__(self):
+
+ return '%0.2fu %0.2fa sec.' % (self.utime,self.atime)
+
+def _bench(file='mxTextTools/mxTextTools.c'):
+
+ def mismatch(orig,new):
+ print
+ for i in range(len(orig)):
+ if orig[i] != new[i]:
+ break
+ else:
+ print 'Length mismatch: orig=%i new=%i' % (len(orig),len(new))
+ if len(orig) > len(new):
+ print 'Missing chars:'+repr(orig[len(new):])
+ else:
+ print 'Excess chars:'+repr(new[len(orig):])
+ print
+ return
+ print 'Mismatch at offset %i:' % i
+ print (orig[i-100:i]
+ + '<- %s != %s ->' % (repr(orig[i]),repr(new[i]))
+ + orig[i+1:i+100])
+ print
+
+ text = open(file).read()
+ import string
+
+ t = _timer()
+ print 'Working on a %i byte string' % len(text)
+
+ if 0:
+ print
+ print 'Replacing strings'
+ print '-'*72
+ print
+ for what,with in (('m','M'),('mx','MX'),('mxText','MXTEXT'),
+ ('hmm','HMM'),('hmmm','HMM'),('hmhmm','HMM')):
+ print 'Replace "%s" with "%s"' % (what,with)
+ t.start()
+ for i in range(100):
+ rtext = string.replace(text,what,with)
+ print 'with string.replace:',t.stop(),'sec.'
+ t.start()
+ for i in range(100):
+ ttext = replace(text,what,with)
+ print 'with tag.replace:',t.stop(),'sec.'
+ if ttext != rtext:
+ print 'results are NOT ok !'
+ print '-'*72
+ mismatch(rtext,ttext)
+ t.start()
+ for i in range(100):
+ ttext = _replace2(text,what,with)
+ print 'with tag._replace2:',t.stop(),'sec.'
+ if ttext != rtext:
+ print 'results are NOT ok !'
+ print '-'*72
+ print rtext
+ t.start()
+ for i in range(100):
+ ttext = _replace3(text,what,with)
+ print 'with tag._replace3:',t.stop(),'sec.'
+ if ttext != rtext:
+ print 'results are NOT ok !'
+ print '-'*72
+ print rtext
+ t.start()
+ for i in range(100):
+ ttext = _replace4(text,what,with)
+ print 'with tag._replace4:',t.stop(),'sec.'
+ if ttext != rtext:
+ print 'results are NOT ok !'
+ print '-'*72
+ print rtext
+ print
+
+ if 0:
+ print
+ print 'String lower/upper'
+ print '-'*72
+ print
+
+ op = string.lower
+ t.start()
+ for i in range(1000):
+ op(text)
+ t.stop()
+ print ' string.lower:',t
+
+ op = string.upper
+ t.start()
+ for i in range(1000):
+ op(text)
+ t.stop()
+ print ' string.upper:',t
+
+ op = upper
+ t.start()
+ for i in range(1000):
+ op(text)
+ t.stop()
+ print ' TextTools.upper:',t
+
+ op = lower
+ t.start()
+ for i in range(1000):
+ op(text)
+ t.stop()
+ print ' TextTools.lower:',t
+
+ print 'Testing...',
+ ltext = string.lower(text)
+ assert ltext == lower(text)
+ utext = string.upper(text)
+ assert utext == upper(text)
+ print 'ok.'
+
+ if 0:
+ print
+ print 'Joining lists'
+ print '-'*72
+ print
+
+ l = setsplit(text,whitespace_set)
+
+ op = string.join
+ t.start()
+ for i in range(1000):
+ op(l)
+ t.stop()
+ print ' string.join:',t
+
+ op = join
+ t.start()
+ for i in range(1000):
+ op(l)
+ t.stop()
+ print ' TextTools.join:',t
+
+ op = string.join
+ t.start()
+ for i in range(1000):
+ op(l,' ')
+ t.stop()
+ print ' string.join with seperator:',t
+
+ op = join
+ t.start()
+ for i in range(1000):
+ op(l,' ')
+ t.stop()
+ print ' TextTools.join with seperator:',t
+
+ if 0:
+ print
+ print 'Creating join lists'
+ print '-'*72
+ print
+
+ repl = []
+ for i in range(0,len(text),10):
+ repl.append(str(i),i,i+1)
+
+ op = joinlist
+ t.start()
+ for i in range(1000):
+ op(text,repl)
+ t.stop()
+ print ' TextTools.joinlist:',t
+
+ if 0:
+ print
+ print 'Splitting text'
+ print '-'*72
+ print
+
+ op = string.split
+ t.start()
+ for i in range(100):
+ op(text)
+ t.stop()
+ print ' string.split whitespace:',t,'(',len(op(text)),'snippets )'
+
+ op = setsplit
+ ws = whitespace_set
+ t.start()
+ for i in range(100):
+ op(text,ws)
+ t.stop()
+ print ' TextTools.setsplit whitespace:',t,'(',len(op(text,ws)),'snippets )'
+
+ assert string.split(text) == setsplit(text,ws)
+
+ op = string.split
+ sep = 'a'
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' string.split at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+ op = split
+ sep = 'a'
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' TextTools.split at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+ op = charsplit
+ sep = 'a'
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' TextTools.charsplit at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+ op = setsplit
+ sep = set('a')
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' TextTools.setsplit at "a":',t,'(',len(op(text,sep)),'snippets )'
+
+ # Note: string.split and setsplit don't work identically !
+
+ op = string.split
+ sep = 'int'
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' string.split at "int":',t,'(',len(op(text,sep)),'snippets )'
+
+ op = split
+ sep = 'int'
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' TextTools.split at "int":',t,'(',len(op(text,sep)),'snippets )'
+
+ op = setsplit
+ sep = set('int')
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' TextTools.setsplit at "i", "n", "t":',t,'(',len(op(text,sep)),'snippets )'
+
+ op = string.split
+ sep = 'register'
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' string.split at "register":',t,'(',len(op(text,sep)),'snippets )'
+
+ op = split
+ sep = 'register'
+ t.start()
+ for i in range(100):
+ op(text,sep)
+ t.stop()
+ print ' TextTools.split at "register":',t,'(',len(op(text,sep)),'snippets )'
+
+if __name__=='__main__':
+ _bench()
+