diff options
Diffstat (limited to 'intern/python/modules/TextTools')
-rw-r--r-- | intern/python/modules/TextTools/Constants/Sets.py | 39 | ||||
-rw-r--r-- | intern/python/modules/TextTools/Constants/TagTables.py | 348 | ||||
-rw-r--r-- | intern/python/modules/TextTools/Constants/__init__.py | 1 | ||||
-rw-r--r-- | intern/python/modules/TextTools/TextTools.py | 766 | ||||
-rw-r--r-- | intern/python/modules/TextTools/__init__.py | 48 | ||||
-rw-r--r-- | intern/python/modules/TextTools/mxTextTools/__init__.py | 17 |
6 files changed, 1219 insertions, 0 deletions
diff --git a/intern/python/modules/TextTools/Constants/Sets.py b/intern/python/modules/TextTools/Constants/Sets.py new file mode 100644 index 00000000000..bf260aa3e0c --- /dev/null +++ b/intern/python/modules/TextTools/Constants/Sets.py @@ -0,0 +1,39 @@ +""" Constants for sets (of characters) + + (c) Copyright Marc-Andre Lemburg; All Rights Reserved. + See the documentation for further information on copyrights, + or contact the author (mal@lemburg.com). +""" +import string + +# Simple character strings + +a2z = 'abcdefghijklmnopqrstuvwxyz' +A2Z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +umlaute = 'äöüß' +Umlaute = 'ÄÖÜ' +alpha = A2Z + a2z +german_alpha = A2Z + a2z + umlaute + Umlaute +number = '0123456789' +alphanumeric = alpha + number +white = ' \t\v' +newline = '\r\n' +formfeed = '\f' +whitespace = white + newline + formfeed +any = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377' + +# Precompiled as sets, e.g. 
a2z_set = set(a2z) +a2z_set = '\000\000\000\000\000\000\000\000\000\000\000\000\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +A2Z_set = '\000\000\000\000\000\000\000\000\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +alpha_set = '\000\000\000\000\000\000\000\000\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +german_alpha_set = '\000\000\000\000\000\000\000\000\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\020\000@\220\020\000@\020' +number_set = '\000\000\000\000\000\000\377\003\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +alphanumeric_set = '\000\000\000\000\000\000\377\003\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +white_set = '\000\002\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +newline_set = '\000$\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +whitespace_set = '\000&\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' +nonwhitespace_set = '\377\301\377\377\376\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377' +any_set = '\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377' + +# Clean up +del string diff --git a/intern/python/modules/TextTools/Constants/TagTables.py b/intern/python/modules/TextTools/Constants/TagTables.py new file mode 100644 index 00000000000..315d825b94e --- /dev/null +++ b/intern/python/modules/TextTools/Constants/TagTables.py @@ -0,0 +1,348 @@ +""" Constants for writing tag tables + + The documentation in this file is obsoleted by 
the HTML docs in + the Doc/ subdirectory of the package. Constants defined here must + match those in mxTextTools/mxte.h. + + (c) Copyright Marc-Andre Lemburg; All Rights Reserved. + See the documentation for further information on copyrights, + or contact the author (mal@lemburg.com). +""" +######################################################################### +# This file contains the definitions and constants used by the tagging +# engine: +# +# 1. Matching Tables +# 2. Commands & Constants +# 3. Matching Functions +# 4. Callable tagobjects +# 5. Calling the engine & Taglists +# + +######################################################################### +# 1. Matching Tables: +# +# these are tuples of tuples, each entry having the following meaning: +# +# syntax: (tag, cmd, chars|table|fct [,jne] [,je=1]) +# tag = object used to mark this section, if it matches +# cmd = command (see below) +# chars = match one or more of these characters +# table = table to use for matching characters +# fct = function to call (see below) +# jne = if the current character doesn't match, jump this +# many table entries relative to the current entry +# je = if we have a match make a relative jump of this length +# +# * a table matches a string iff the end of the table is reached +# (that is: an index is requested that is beyond the end-of-table) +# * a table is not matched if a tag is not matched and no jne is given; +# if it is matched then processing simply moves on to the next entry +# * marking is done by adding the matching slice in the string +# together with the marking object to the tag list; if the object is +# None, then it will not be appended to the taglist list +# * if the flag CallTag is set in cmd, then instead of appending +# matches to the taglist, the tagobj will be called (see below) +# +# TIP: if you are getting an error 'call of a non-function' while +# writing a table definition, you probably have a missing ',' +# somewhere in the tuple ! 
+# +# For examples see the tag*.py - files that came with this engine. +# + +######################################################################### +# 2. Commands & Constants +# +# + +# +# some useful constants for writing matching tables +# + +To = None # good for cmd=Jump +Here = None # good for cmd=Fail and EOF +MatchOk = 20000 # somewhere beyond the end of the tag table... +MatchFail = -20000 # somewhere beyond the start of the tag table... +ToEOF = -1 # good for cmd=Move + +ThisTable = 999 # to recursively match using the current table; + # can be passed as argument to Table and SubTable + # instead of a tuple + +# +# commands and flags passed in cmd (see below) +# +# note: I might add some further commands to this list, if needed +# (the numbers will then probably change, but not the +# names) +# +# convention: a command "matches", if and only if it moves the +# current position at least one character; a command "reads" +# characters the characters, if they match ok +# +# notations: +# +# x refers to the current position in the string +# len refers to the string length or what the function tag() is told to +# believe it to be (i.e. the engine only looks at the slice text[x:len]) +# text refers to the text string +# jne is the optional relative jump distance in case the command +# did not match, i.e. 
x before and after applying the command +# are the same (if not given the current table is considered +# not to match) +# je is the optional relative jump distance in case the command +# did match (it defaults to +1) +# + +# commands +Fail = 0 # this will always fail (position remains unchanged) +Jump = 0 # jump to jne (position remains unchanged) + +# match & read chars +AllIn = 11 # all chars in match (at least one) +AllNotIn = 12 # all chars not in match (at least one) +Is = 13 # current char must be == match (matches one char) +IsIn = 14 # current char must be in match (matches one char) +IsNot = 15 # current char must be be != match (matches one char) +IsNotIn = 15 # current char must be not be in match (matches one char) + +AllInSet = 31 +IsInSet = 32 + +# match & read for whole words +Word = 21 # the next chars must be those in match +WordStart = 22 # all chars up to the first occ. of match (at least one) +WordEnd = 23 # same as WordStart, accept that the text pointer + # is moved behind the match +NoWord = WordStart # all chars up to the first occ. of match (at least one) + + +# match using search objects BMS or FS +sWordStart = 111 # all chars up to the first occ. of match (may be 0 chars) +sWordEnd = 112 # same as WordStart, accept that the text pointer + # is moved behind the match +sFindWord = 113 # find match and process the found slice only (ignoring + # the chars that lead up to the match); positions + # the text pointer right after the match like WordEnd + +# functions & tables +Call = 201 # call match(text,x,len) as function (see above) +CallArg = 202 # match has to be a 2-tuple (fct,arg), then + # fct(text,x,len,arg) is called; the return value is taken + # as new x; it is considered matching if the new x is + # different than the x before the call -- like always + # (note: arg has to be *one* object, e.g. 
a tuple) +Table = 203 # match using table (given in match) +SubTable = 207 # match using sub table (given in match); the sub table + # uses the same taglist as the calling table +TableInList = 204 # same as Table, but match is a tuple (list,index) + # and the table list[index] is used as matching + # table +SubTableInList = 208 + # same as TableInList, but the sub table + # uses the same taglist as the calling table + +# specials +EOF = 1 # current position must be EOF, e.g. >= len(string) +Skip = 2 # skip match (must be an integer) chars; note: this cmd + # always matches ok, so jne doesn't have any meaning in + # this context +Move = 3 # move the current text position to match (if negative, + # the text length + 1 (!) is added, thus -1 moves to the + # EOF, -2 to the last char and so on); note: this cmd + # always matches ok, so jne doesn't have any meaning in + # this context + +# loops +Loop = 205 # loop-construct + # + # (tagobj,Loop,Count,jne,je) - sets/decrements the + # loop variable for current table according to the + # following rules: + # 1. the first time the engine passes this entry + # sets the loop variable to Count and continues + # without reading any character, but saving the + # current position in text + # 2. 
the next time, it decrements the loop variable + # and checks if it is < 0: + # (a) if it is, then the tagobj is added to the + # taglist with the slice (saved position, current + # position) and processing continues at entry + # current + jne + # (b) else, processing continues at entry current + je + # Note: if you jump out of the loop while the loop + # variable is still > 0, then you *must* + # reset the loop mechanism with + # (None,LoopControl,Reset) + # Note: you can skip the remaining loops by calling + # (None,LoopControl,Break) and jumping back + # to the Loop-entry; this sets the loop + # variable to 0 + # Note: tables cannot have nested loops within their + # context; you can have nested loops in nested + # tables though (there is one loop var per + # tag()-call which takes place every time + # a table match is done) + # +LoopControl = 206 # controls the loop variable (always succeeds, i.e. + # jne has no meaning); + # match may be one of: +Break = 0 # * sets the loop variable to 0, thereby allowing + # to skip the remaining loops +Reset = -1 # * resets the loop mechanism (see note above) + # + # See tagLoop.py for some examples. + +########################################################################## +# +# Flags (to be '+'ed with the above command code) +# +CallTag = 256 # call tagobj(taglist,text,l,r,subtags) + # upon successfully matching the slice [l:r] in text + # * taglist is the current list tags found (may be None) + # * subtags is a sub-list, passed when a subtable was used + # to do the matching -- it is None otherwise !) 
+# +# example entry with CallTag-flag set: +# +# (found_a_tag,CallTag+Table,tagtable) +# -- if tagtable matches the current text position, +# found_a_tag(taglist,text,l,r,newtaglist) is called and +# the match is *not* appended to the taglist by the tagging +# engine (the function would have to do this, in case it is needed) + +AppendToTagobj = 512 # this appends the slice found to the tagobj, assuming + # that it is a Python list: + # does a tagobj.append((None,l,r,subtags)) call +# Alias for b/w comp. +AppendToTag = AppendToTagobj + +AppendTagobj = 1024 # don't append (tagobj,l,r,subtags) to the taglist, + # but only tagobj itself; the information in l,r,subtags + # is lost, yet this can be used to write tag tables + # whose output can be used directly by tag.join() + +AppendMatch = 2048 # append the match to the taglist instead of + # the tag object; this produces non-standard + # taglists ! + +######################################################################### +# 3. Matching Functions +# +# syntax: +# +# fct(s,x,len_s) +# where s = string we are working on +# x = current index in s where we wnat to match something +# len_s = 'length' of s, this is how far the search may be +# conducted in s, not necessarily the true length of s +# +# * the function has to return the index of the char right after +# matched string, e.g. +# +# 'xyzabc' ---> 'xyz' matches ---> return x+3 +# +# * if the string doesn't match simply return x; in other words: +# the function has to return the matching slice's right index +# * you can use this to match e.g. 10 characters of a certain kind, +# or any word out of a given list, etc. +# * note: you cannot give the function additional parameters from within +# the matching table, so it has to know everything it needs to +# know a priori; use dynamic programming ! 
+# +# some examples (not needed, since all are implemented by commands) +# +# +#def matchword(x): +# s = """ +#def a(s,x,len_text): +# y = x+%i +# if s[x:y] == %s: return y +# return x +#""" +# exec s % (len(x),repr(x)) +# return a +# +#def rejectword(x): +# s = """ +#def a(s,x,len_text): +# while x < len(s) and s[x:x+%i] != %s: +# x = x + 1 +# return x +#""" +# exec s % (len(x),repr(x)) +# return a +# +#def HTML_Comment(s,x,len_text): +# while x < len_text and s[x:x+3] != '-->': +# x = x + 1 +# return x +# +# + +######################################################################### +# 4. Callable tagobjects +# +# a sample callable tagobj: +# +# +#def test(taglist,text,l,r,newtaglist): +# +# print 'found',repr(text[l:r])[:40],(l,r) +# +# + +######################################################################### +# 5. Calling the engine & Taglists +# +# The function +# tag(text,table,start=0,len_text=len(text),taglistinit=[]) +# found in mxTextTools: +# +# This function does all the matching according to the above rules. +# You give it a text string and a tag table and it will +# start processing the string starting from 'start' (which defaults to 0) +# and continue working until it reaches the 'EOF', i.e. len_text (which +# defaults to the text length). It thus tags the slice text[start:len_text]. +# +# The function will create a list of found tags in the following +# format (which I call taglist): +# +# (tagobj,l,r,subtaglist) +# +# where: tagobj = specified tag object taken from the table +# [l:r] = slice that matched the tag in text +# subtaglist = if matching was done using a subtable +# this is the taglist it produced; in all other +# cases this will be None +# +# * if you pass None as taglistinit, then no taglist will be created, +# i.e. only CallTag commands will have any effect. 
(This saves +# temporary memory for big files) +# * the function returns a tuple: +# (success, taglist, nextindex) +# where: success = 0/1 +# taglist = the produced list or None +# nextindex = the index+1 of the last char that matched +# (in case of failure, this points to the beginning +# of the substring that caused the problem) +# + +### Module init. + +def _module_init(): + + global id2cmd + + import types + id2cmd = {} + IntType = types.IntType + for cmd,value in globals().items(): + if type(value) == IntType: + if value == 0: + id2cmd[0] = 'Fail/Jump' + else: + id2cmd[value] = cmd + +_module_init() diff --git a/intern/python/modules/TextTools/Constants/__init__.py b/intern/python/modules/TextTools/Constants/__init__.py new file mode 100644 index 00000000000..0519ecba6ea --- /dev/null +++ b/intern/python/modules/TextTools/Constants/__init__.py @@ -0,0 +1 @@ +
\ No newline at end of file diff --git a/intern/python/modules/TextTools/TextTools.py b/intern/python/modules/TextTools/TextTools.py new file mode 100644 index 00000000000..7eae2bcfc39 --- /dev/null +++ b/intern/python/modules/TextTools/TextTools.py @@ -0,0 +1,766 @@ +""" mxTextTools - A tools package for fast text processing. + + (c) Copyright Marc-Andre Lemburg; All Rights Reserved. + See the documentation for further information on copyrights, + or contact the author (mal@lemburg.com). +""" +import string,types + +# +# import the C module and the version number +# +from mxTextTools import * +from mxTextTools import __version__ + +# +# import the symbols needed to write tag tables +# +from Constants.TagTables import * + +# +# import the some handy character sets +# +from Constants.Sets import * + +# +# format and print tables, taglists and joinlists: +# +def format_entry(table,i, + + TupleType=types.TupleType): + + """ Returns a pp-formatted tag table entry as string + """ + e = table[i] + jne = 0 + je = 1 + t,c,m = e[:3] + if len(e)>3: jne = e[3] + if len(e)>4: je = e[4] + flags,cmd = divmod(c,256) + c = id2cmd[cmd] + if type(m) == TupleType and c in ('Table','SubTable'): + m = '<table>' + elif m == None: + m = 'Here/To' + else: + m = repr(m) + if len(m) > 17: + m = m[:17]+'...' + return '%-15.15s : %-30s : jne=%+i : je=%+i' % \ + (repr(t),'%-.15s : %s'%(c,m),jne,je) + +def format_table(table,i=-1): + + """ Returns a pp-formatted version of the tag table as string """ + + l = [] + for j in range(len(table)): + if i == j: + l.append('--> '+format_entry(table,j)) + else: + l.append(' '+format_entry(table,j)) + return string.join(l,'\n')+'\n' + +def print_tagtable(table): + + """ Print the tag table + """ + print format_table(table) + +def print_tags(text,tags,indent=0): + + """ Print the taglist tags for text using the given indent level + """ + for tag,l,r,subtags in tags: + tagname = repr(tag) + if len(tagname) > 20: + tagname = tagname[:20] + '...' 
+ target = repr(text[l:r]) + if len(target) > 60: + target = target[:60] + '...' + if subtags == None: + print ' '+indent*' |',tagname,': ',target,(l,r) + else: + print ' '+indent*' |',tagname,': ',target,(l,r) + print_tags(text,subtags,indent+1) + +def print_joinlist(joins,indent=0, + + StringType=types.StringType): + + """ Print the joinlist joins using the given indent level + """ + for j in joins: + if type(j) == StringType: + text = repr(j) + if len(text) > 40: + text = text[:40] + '...' + print ' '+indent*' |',text,' (len = %i)' % len(j) + else: + text = j[0] + l,r = j[1:3] + text = repr(text[l:r]) + if len(text) > 40: + text = text[:40] + '...' + print ' '+indent*' |',text,' (len = %i)' % (r-l),(l,r) + +def normlist(jlist, + + StringType=types.StringType): + + """ Return a normalized joinlist. + + All tuples in the joinlist are turned into real strings. The + resulting list is a equivalent copy of the joinlist only + consisting of strings. + + """ + l = [''] * len(jlist) + for i in range(len(jlist)): + entry = jlist[i] + if type(entry) == StringType: + l[i] = entry + else: + l[i] = entry[0][entry[1]:entry[2]] + return l + +# +# aid for matching from a list of words +# +def _lookup_dict(l,index=0): + + d = {} + for w in l: + c = w[index] + if d.has_key(c): + d[c].append(w) + else: + d[c] = [w] + return d + +def word_in_list(l): + + """ Creates a lookup table that matches the words in l + """ + t = [] + d = _lookup_dict(l) + keys = d.keys() + if len(keys) < 18: # somewhat arbitrary bound + # fast hint for small sets + t.append((None,IsIn,string.join(d.keys(),''))) + t.append((None,Skip,-1)) + # test groups + for c, group in d.items(): + t.append(None) # hint will be filled in later + i = len(t)-1 + for w in group: + t.append((None,Word,w[1:],+1,MatchOk)) + t.append((None,Fail,Here)) + # add hint + t[i] = (None,Is,c,len(t)-i) + t.append((None,Fail,Here)) + return tuple(t) + +# +# Extra stuff useful in combination with the C functions +# + +def 
replace(text,what,with,start=0,stop=None, + + SearchObject=BMS,join=join,joinlist=joinlist,tag=tag, + string_replace=string.replace,type=type, + StringType=types.StringType): + + """A fast replacement for string.replace. + + what can be given as string or search object. + + This function is a good example for the AppendTagobj-flag usage + (the taglist can be used directly as joinlist). + + """ + if type(what) == StringType: + so = SearchObject(what) + else: + so = what + what = so.match + if stop is None: + if start == 0 and len(what) < 2: + return string_replace(text,what,with) + stop = len(text) + t = ((text,sWordStart,so,+2), + # Found something, replace and continue searching + (with,Skip+AppendTagobj,len(what),-1,-1), + # Rest of text + (text,Move,ToEOF) + ) + found,taglist,last = tag(text,t,start,stop) + if not found: + return text + return join(taglist) + +# Alternative (usually slower) versions using different techniques: + +def _replace2(text,what,with,start=0,stop=None, + + join=join,joinlist=joinlist,tag=tag, + StringType=types.StringType,BMS=BMS): + + """Analogon to string.replace; returns a string with all occurences + of what in text[start:stop] replaced by with + - uses a one entry tag-table and a Boyer-Moore-Search-object + - what can be a string or a BMS/FS search object + - it's faster than string.replace in those cases, where + the what-string gets long and/or many replacements are found; + faster meaning from a few percent up to many times as fast + - start and stop define the slice of text to work in + - stop defaults to len(text) + """ + if stop is None: + stop = len(text) + if type(what) == StringType: + what=BMS(what) + t = ((with,sFindWord,what,+1,+0),) + found,taglist,last = tag(text,t,start,stop) + if not found: + return text + return join(joinlist(text,taglist)) + +def _replace3(text,what,with, + + join=string.join,FS=FS, + StringType=types.StringType): + + if type(what) == StringType: + what=FS(what) + slices = what.findall(text) + if 
not slices:
+        return text
+    l = []
+    x = 0
+    for left,right in slices:
+        l.append(text[x:left] + with)
+        x = right
+    l.append(text[x:])
+    return join(l,'')
+
+def _replace4(text,what,with,
+
+              join=join,joinlist=joinlist,tag=tag,FS=FS,
+              StringType=types.StringType):
+
+    if type(what) == StringType:
+        what=FS(what)
+    slices = what.findall(text)
+    if not slices:
+        return text
+    repl = [None]*len(slices)
+    for i in range(len(slices)):
+        repl[i] = (with,)+slices[i]
+    return join(joinlist(text,repl))
+
+
+def find(text,what,start=0,stop=None,
+
+         SearchObject=FS):
+
+    """ A faster replacement for string.find().
+
+        Builds a search object from what and returns the result of
+        its .find() method: the position of the first occurrence of
+        what in text[start:stop], or -1 in case no occurrence was
+        found. stop defaults to len(text).
+
+    """
+    # Test against None, not truthiness: an explicit stop=0 must be
+    # handed on to the search object instead of being dropped (which
+    # would silently search up to the end of text).
+    if stop is not None:
+        return SearchObject(what).find(text,start,stop)
+    else:
+        return SearchObject(what).find(text,start)
+
+def findall(text,what,start=0,stop=None,
+
+            SearchObject=FS):
+
+    """ Find all occurrences of what in text.
+
+        Builds a search object from what and returns the result of
+        its .findall() method: a list of slice tuples (l,r) marking
+        all occurrences in text[start:stop], or an empty list in
+        case no occurrence was found. stop defaults to len(text).
+
+    """
+    # Same None test as in find(): stop=0 is a legitimate value.
+    if stop is not None:
+        return SearchObject(what).findall(text,start,stop)
+    else:
+        return SearchObject(what).findall(text,start)
+
+def split(text,sep,start=0,stop=None,translate=None,
+
+          SearchObject=FS):
+
+    """ A faster replacement for string.split().
+
+        Uses a search object for the task. Returns the result of
+        cutting the text[start:stop] string into snippets at every sep
+        occurrence in form of a list of substrings. translate is passed
+        to the search object as translation string.
+
+        XXX convert to a C function... or even better, add as method
+        to search objects. 
+
+    """
+    if translate:
+        so = SearchObject(sep,translate)
+    else:
+        so = SearchObject(sep)
+    # Test against None, not truthiness, so that an explicit stop=0
+    # is passed on to the search object.
+    if stop is not None:
+        cuts = so.findall(text,start,stop)
+    else:
+        cuts = so.findall(text,start)
+    # Only text[start:stop] is cut up: the first snippet begins at
+    # start (not at 0) and the last one ends at stop (not at the end
+    # of text). With the default arguments this is the whole text.
+    pos = start
+    snippets = []
+    append = snippets.append
+    for left,right in cuts:
+        append(text[pos:left])
+        pos = right
+    if stop is not None:
+        append(text[pos:stop])
+    else:
+        append(text[pos:])
+    return snippets
+
+# helper for tagdict
+def _tagdict(text,dict,prefix,taglist):
+
+    # Stores text[l:r] in dict under prefix + str(tagobj) for every
+    # taglist entry and recurses into non-empty sub-taglists, using
+    # the entry's key followed by '.' as the new prefix.
+    for o,l,r,s in taglist:
+        pfx = prefix + str(o)
+        dict[pfx] = text[l:r]
+        if s:
+            _tagdict(text,dict,pfx+'.',s)
+
+def tagdict(text,*args):
+
+    """ Tag a text just like the function tag() and then convert
+        its output into a dictionary where the tagobjects reference
+        their respective strings
+        - this function emulates the interface of tag()
+        - in contrast to tag() this function *does* make copies
+          of the found strings
+        - keys are str(tagobj); entries found in sub-taglists are
+          stored under 'parentkey.' + str(tagobj)
+        - returns a tuple (rc,tagdict,next) with the same meaning
+          of rc and next as tag(); tagdict is the new dictionary -
+          None in case rc is 0
+    """
+    rc,taglist,nextindex = apply(tag,(text,)+args)
+    if not rc:
+        return (rc,None,nextindex)
+    d = {}
+    # Let the helper fill d. The previous inline loop handed the name
+    # 'dict' instead of d to the helper when descending into
+    # sub-taglists, so nested tags never reached the result.
+    _tagdict(text,d,'',taglist)
+    return (rc,d,nextindex)
+
+def invset(chars):
+
+    """ Return a set with all characters *except* the ones in chars.
+    """
+    return set(chars,0)
+
+def is_whitespace(text,start=0,stop=None,
+
+                  nonwhitespace=nonwhitespace_set,setfind=setfind):
+
+    """ Return 1 iff text[start:stop] only contains whitespace
+        characters (as defined in Constants/Sets.py), 0 otherwise.
+    """
+    if stop is None:
+        stop = len(text)
+    i = setfind(text,nonwhitespace,start,stop)
+    return (i < 0)
+
+def collapse(text,seperator=' ',
+
+             join=join,setsplit=setsplit,collapse_set=set(newline+whitespace)):
+
+    """ Eliminates newline characters and compresses whitespace
+        characters into one space.
+
+        The result is a one line text string. 
Tim Peters will like + this function called with '-' seperator ;-) + + """ + return join(setsplit(text,collapse_set),seperator) + +_linesplit_table = ( + (None,Is,'\r',+1), + (None,Is,'\n',+1), + ('line',AllInSet+AppendMatch,set('\r\n',0),+1,-2), + (None,EOF,Here,+1,MatchOk), + ('empty line',Skip+AppendMatch,0,0,-4), + ) + +def splitlines(text, + + tag=tag,linesplit_table=_linesplit_table): + + """ Split text into a list of single lines. + + The following combinations are considered to be line-ends: + '\r', '\r\n', '\n'; they may be used in any combination. The + line-end indicators are removed from the strings prior to + adding them to the list. + + This function allows dealing with text files from Macs, PCs + and Unix origins in a portable way. + + """ + return tag(text,linesplit_table)[1] + +_linecount_table = ( + (None,Is,'\r',+1), + (None,Is,'\n',+1), + ('line',AllInSet+AppendTagobj,set('\r\n',0),+1,-2), + (None,EOF,Here,+1,MatchOk), + ('empty line',Skip+AppendTagobj,0,0,-4), + ) + +def countlines(text, + + linecount_table=_linecount_table): + + """ Returns the number of lines in text. + + Line ends are treated just like for splitlines() in a + portable way. + """ + return len(tag(text,linecount_table)[1]) + +_wordsplit_table = ( + (None,AllInSet,whitespace_set,+1), + ('word',AllInSet+AppendMatch,nonwhitespace_set,+1,-1), + (None,EOF,Here,+1,MatchOk), + ) + +def splitwords(text, + + setsplit=setsplit,whitespace_set=whitespace_set): + + """ Split text into a list of single words. + + Words are separated by whitespace. The whitespace is stripped + before adding the words to the list. 
+
+    """
+    return setsplit(text,whitespace_set)
+
+#
+# Testing and benchmarking
+#
+
+# Taken from my hack.py module:
+import time
+class _timer:
+
+    """ timer class with a quite obvious interface
+        - .start() starts a fairly accurate CPU-time timer plus an
+          absolute timer
+        - .stop() stops the timer and returns a tuple: the CPU-time in seconds
+          and the absolute time elapsed since .start() was called
+        - .usertime() stops the timer like .stop() but returns only
+          the CPU-time
+        - .abstime() stops the timer like .stop() but returns only
+          the absolute time
+
+        Note: .stop(), .usertime() and .abstime() all overwrite the
+        stored start values with the elapsed times, so call only one
+        of them per .start().
+    """
+
+    # CPU-time: clock() reading after .start(), elapsed seconds after
+    # the timer has been stopped
+    utime = 0
+    # absolute time: time() reading after .start(), elapsed seconds
+    # after the timer has been stopped
+    atime = 0
+
+    def start(self,
+              clock=time.clock,time=time.time):
+        self.atime = time()
+        self.utime = clock()
+
+    def stop(self,
+             clock=time.clock,time=time.time):
+        self.utime = clock() - self.utime
+        self.atime = time() - self.atime
+        return self.utime,self.atime
+
+    def usertime(self,
+                 clock=time.clock,time=time.time):
+        self.utime = clock() - self.utime
+        self.atime = time() - self.atime
+        return self.utime
+
+    def abstime(self,
+                clock=time.clock,time=time.time):
+        self.utime = clock() - self.utime
+        self.atime = time() - self.atime
+        # return the absolute time; this used to return self.utime,
+        # which made abstime() a duplicate of usertime()
+        return self.atime
+
+    def __str__(self):
+
+        return '%0.2fu %0.2fa sec.' 
% (self.utime,self.atime) + +def _bench(file='mxTextTools/mxTextTools.c'): + + def mismatch(orig,new): + print + for i in range(len(orig)): + if orig[i] != new[i]: + break + else: + print 'Length mismatch: orig=%i new=%i' % (len(orig),len(new)) + if len(orig) > len(new): + print 'Missing chars:'+repr(orig[len(new):]) + else: + print 'Excess chars:'+repr(new[len(orig):]) + print + return + print 'Mismatch at offset %i:' % i + print (orig[i-100:i] + + '<- %s != %s ->' % (repr(orig[i]),repr(new[i])) + + orig[i+1:i+100]) + print + + text = open(file).read() + import string + + t = _timer() + print 'Working on a %i byte string' % len(text) + + if 0: + print + print 'Replacing strings' + print '-'*72 + print + for what,with in (('m','M'),('mx','MX'),('mxText','MXTEXT'), + ('hmm','HMM'),('hmmm','HMM'),('hmhmm','HMM')): + print 'Replace "%s" with "%s"' % (what,with) + t.start() + for i in range(100): + rtext = string.replace(text,what,with) + print 'with string.replace:',t.stop(),'sec.' + t.start() + for i in range(100): + ttext = replace(text,what,with) + print 'with tag.replace:',t.stop(),'sec.' + if ttext != rtext: + print 'results are NOT ok !' + print '-'*72 + mismatch(rtext,ttext) + t.start() + for i in range(100): + ttext = _replace2(text,what,with) + print 'with tag._replace2:',t.stop(),'sec.' + if ttext != rtext: + print 'results are NOT ok !' + print '-'*72 + print rtext + t.start() + for i in range(100): + ttext = _replace3(text,what,with) + print 'with tag._replace3:',t.stop(),'sec.' + if ttext != rtext: + print 'results are NOT ok !' + print '-'*72 + print rtext + t.start() + for i in range(100): + ttext = _replace4(text,what,with) + print 'with tag._replace4:',t.stop(),'sec.' + if ttext != rtext: + print 'results are NOT ok !' 
+ print '-'*72 + print rtext + print + + if 0: + print + print 'String lower/upper' + print '-'*72 + print + + op = string.lower + t.start() + for i in range(1000): + op(text) + t.stop() + print ' string.lower:',t + + op = string.upper + t.start() + for i in range(1000): + op(text) + t.stop() + print ' string.upper:',t + + op = upper + t.start() + for i in range(1000): + op(text) + t.stop() + print ' TextTools.upper:',t + + op = lower + t.start() + for i in range(1000): + op(text) + t.stop() + print ' TextTools.lower:',t + + print 'Testing...', + ltext = string.lower(text) + assert ltext == lower(text) + utext = string.upper(text) + assert utext == upper(text) + print 'ok.' + + if 0: + print + print 'Joining lists' + print '-'*72 + print + + l = setsplit(text,whitespace_set) + + op = string.join + t.start() + for i in range(1000): + op(l) + t.stop() + print ' string.join:',t + + op = join + t.start() + for i in range(1000): + op(l) + t.stop() + print ' TextTools.join:',t + + op = string.join + t.start() + for i in range(1000): + op(l,' ') + t.stop() + print ' string.join with seperator:',t + + op = join + t.start() + for i in range(1000): + op(l,' ') + t.stop() + print ' TextTools.join with seperator:',t + + if 0: + print + print 'Creating join lists' + print '-'*72 + print + + repl = [] + for i in range(0,len(text),10): + repl.append(str(i),i,i+1) + + op = joinlist + t.start() + for i in range(1000): + op(text,repl) + t.stop() + print ' TextTools.joinlist:',t + + if 0: + print + print 'Splitting text' + print '-'*72 + print + + op = string.split + t.start() + for i in range(100): + op(text) + t.stop() + print ' string.split whitespace:',t,'(',len(op(text)),'snippets )' + + op = setsplit + ws = whitespace_set + t.start() + for i in range(100): + op(text,ws) + t.stop() + print ' TextTools.setsplit whitespace:',t,'(',len(op(text,ws)),'snippets )' + + assert string.split(text) == setsplit(text,ws) + + op = string.split + sep = 'a' + t.start() + for i in range(100): + 
op(text,sep) + t.stop() + print ' string.split at "a":',t,'(',len(op(text,sep)),'snippets )' + + op = split + sep = 'a' + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' TextTools.split at "a":',t,'(',len(op(text,sep)),'snippets )' + + op = charsplit + sep = 'a' + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' TextTools.charsplit at "a":',t,'(',len(op(text,sep)),'snippets )' + + op = setsplit + sep = set('a') + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' TextTools.setsplit at "a":',t,'(',len(op(text,sep)),'snippets )' + + # Note: string.split and setsplit don't work identically ! + + op = string.split + sep = 'int' + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' string.split at "int":',t,'(',len(op(text,sep)),'snippets )' + + op = split + sep = 'int' + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' TextTools.split at "int":',t,'(',len(op(text,sep)),'snippets )' + + op = setsplit + sep = set('int') + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' TextTools.setsplit at "i", "n", "t":',t,'(',len(op(text,sep)),'snippets )' + + op = string.split + sep = 'register' + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' string.split at "register":',t,'(',len(op(text,sep)),'snippets )' + + op = split + sep = 'register' + t.start() + for i in range(100): + op(text,sep) + t.stop() + print ' TextTools.split at "register":',t,'(',len(op(text,sep)),'snippets )' + +if __name__=='__main__': + _bench() + diff --git a/intern/python/modules/TextTools/__init__.py b/intern/python/modules/TextTools/__init__.py new file mode 100644 index 00000000000..f9255aca276 --- /dev/null +++ b/intern/python/modules/TextTools/__init__.py @@ -0,0 +1,48 @@ +""" mxTextTools - A tools package for fast text processing. + + (c) Copyright Marc-Andre Lemburg; All Rights Reserved. 
""" mxTextTools - A tools package for fast text processing.

    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
    See the documentation for further information on copyrights,
    or contact the author (mal@lemburg.com).
"""
__package_info__ = """
BEGIN PYTHON-PACKAGE-INFO 1.0
Title: mxTextTools - Tools for fast text processing
Current-Version: 1.1.1
Home-Page: http://starship.skyport.net/~lemburg/mxTextTools.html
Primary-Site: http://starship.skyport.net/~lemburg/mxTextTools-1.1.1.zip

This package provides several different functions and mechanisms
to do fast text text processing. Amongst these are character set
operations, parsing & tagging tools (using a finite state machine
executing byte code) and common things such as Boyer-Moore search
objects. For full documentation see the home page.
END PYTHON-PACKAGE-INFO
"""
from TextTools import *
from TextTools import __version__

### Pickle support for the two search object types

# Constructor shortcuts handed to copy_reg below.  They are kept as
# short, named module-level functions (the short names reduce the
# pickle's length); each one simply forwards to the real constructor.
def _BMS(match, translate):
    """ Rebuild a BMS search object from (match, translate). """
    return BMS(match, translate)

def _FS(match, translate):
    """ Rebuild an FS search object from (match, translate). """
    return FS(match, translate)

# Module init: the class body is used as a throw-away namespace.  The
# registration runs while the body executes, and the class is deleted
# right afterwards so that neither copy_reg nor the reducer helpers
# remain as module-level names.
class modinit:

    import copy_reg

    def reduce_BMS(searchobj):
        # Reduce value: (constructor, constructor arguments)
        ctor_args = (searchobj.match, searchobj.translate)
        return _BMS, ctor_args

    def reduce_FS(searchobj):
        # Reduce value: (constructor, constructor arguments)
        ctor_args = (searchobj.match, searchobj.translate)
        return _FS, ctor_args

    # Register the two types, BMS first, then FS
    for objtype, reducer, ctor in ((BMSType, reduce_BMS, _BMS),
                                   (FSType, reduce_FS, _FS)):
        copy_reg.pickle(objtype, reducer, ctor)

del modinit
""" mxTextTools - A tools package for fast text processing.

    (c) Copyright Marc-Andre Lemburg; All Rights Reserved.
    See the documentation for further information on copyrights,
    or contact the author (mal@lemburg.com).
"""
from mxTextTools import *
from mxTextTools import __version__

#
# Fallback for builds without the Fast Search object: if the star-import
# above did not provide the name FS, let BMS (and its type) stand in
# for it so that FS and FSType are always defined.
#
try:
    # Bare name lookup; raises NameError when FS was not provided
    FS
except NameError:
    FS, FSType = BMS, BMSType