diff options
Diffstat (limited to 'src/stringprep/uni_parse.tcl')
-rw-r--r-- | src/stringprep/uni_parse.tcl | 437 |
1 files changed, 0 insertions, 437 deletions
diff --git a/src/stringprep/uni_parse.tcl b/src/stringprep/uni_parse.tcl deleted file mode 100644 index 100631b6b..000000000 --- a/src/stringprep/uni_parse.tcl +++ /dev/null @@ -1,437 +0,0 @@ -# uni_parse.tcl -- -# -# This program parses the UnicodeData file and generates the -# corresponding uni_data.c file with compressed character -# data tables. The input to this program should be rfc3454.txt -# -# Copyright (c) 1998-1999 by Scriptics Corporation. -# All rights reserved. -# -# Modified for ejabberd by Alexey Shchepin -# -# RCS: @(#) $Id$ - - -namespace eval uni { - set shift 8; # number of bits of data within a page - # This value can be adjusted to find the - # best split to minimize table size - - variable pMap; # map from page to page index, each entry is - # an index into the pages table, indexed by - # page number - variable pages; # map from page index to page info, each - # entry is a list of indices into the groups - # table, the list is indexed by the offset - variable groups; # list of character info values, indexed by - # group number, initialized with the - # unassigned character group -} - -proc uni::getValue {i} { - variable casemap - variable casemap2 - variable tablemap - - if {[info exists tablemap($i)]} { - set tables $tablemap($i) - } else { - set tables {} - } - - if {[info exists casemap2($i)]} { - set multicase 1 - set delta $casemap2($i) - } else { - set multicase 0 - if {[info exists casemap($i)]} { - set delta $casemap($i) - } else { - set delta 0 - } - } - - if {abs($delta) > 0xFFFFF} { - puts "delta must be less than 22 bits wide" - exit - } - - set ac 0 - set c11 0 - set c21 0 - set b1 0 - set d1 0 - set d2 0 - set xnp 0 - - foreach tab $tables { - switch -glob -- $tab { - C.1.1 {set c11 1} - C.2.1 {set c21 1} - C.* {set ac 1} - A.1 {set ac 1} - B.1 {set b1 1} - D.1 {set d1 1} - D.2 {set d2 1} - XNP {set xnp 1} - } - } - - set val [expr {($ac << 0) | - ($c11 << 1) | - ($c21 << 2) | - ($b1 << 3) | - ($d1 << 4) | - ($d2 << 5) | - ($xnp << 6) | - ($multicase << 7) | - ($delta << 11)}] - - return $val -} - -proc uni::getGroup {value} { - variable groups - - set gIndex [lsearch -exact $groups $value] - if {$gIndex == -1} { - set gIndex [llength $groups] - lappend groups $value - } - return $gIndex -} - -proc uni::addPage {info} { - variable pMap - variable pages - variable pages_map - - if {[info exists pages_map($info)]} { - lappend pMap $pages_map($info) - } else { - set pIndex [llength $pages] - lappend pages $info - set pages_map($info) $pIndex - lappend pMap $pIndex - } - return -} - - -proc uni::load_tables {data} { - variable casemap - variable casemap2 - variable multicasemap - variable tablemap - - set multicasemap {} - set table "" - - foreach line [split $data \n] { - if {$table == ""} { - if {[regexp { ----- Start Table (.*) -----} $line temp table]} { - #puts "Start table '$table'" - } - } else { - if {[regexp { ----- End Table (.*) -----} $line temp table1]} { - set table "" - } else { - if {$table == "B.1"} { - if {[regexp {^ ([[:xdigit:]]+); ;} $line \ - temp val]} { - scan $val %x val - if {$val <= 0x10ffff} { - lappend tablemap($val) $table - } - } - } elseif {$table == "B.2"} { - if {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+);} $line \ - temp from to]} { - scan $from %x from - scan $to %x to - if {$from <= 0x10ffff && $to <= 0x10ffff} { - set casemap($from) [expr {$to - $from}] - } - } elseif {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \ - temp from to1 to2]} { - scan $from %x from - scan $to1 %x to1 - scan $to2 %x to2 - if {$from <= 0x10ffff && \ - $to1 <= 0x10ffff && $to2 <= 0x10ffff} { - set casemap2($from) [llength $multicasemap] - lappend multicasemap [list $to1 $to2] - } - } elseif {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \ - temp from to1 to2 to3]} { - scan $from %x from - scan $to1 %x to1 - scan $to2 %x to2 - scan $to3 %x to3 - if {$from <= 0x10ffff && \ - $to1 <= 0x10ffff && $to2 <= 0x10ffff && \ - $to3 <= 0x10ffff} { - set casemap2($from) [llength $multicasemap] - lappend multicasemap [list $to1 $to2 $to3] - } - } else { - #puts "missed: $line" - } - - } elseif {$table != "B.3"} { - if {[regexp {^ ([[:xdigit:]]+)-([[:xdigit:]]+)} $line \ - temp from to]} { - scan $from %x from - scan $to %x to - for {set i $from} {$i <= $to && $i <= 0x10ffff} {incr i} { - lappend tablemap($i) $table - } - } elseif {[regexp {^ ([[:xdigit:]]+)} $line \ - temp val]} { - scan $val %x val - if {$val <= 0x10ffff} { - lappend tablemap($val) $table - } - } - } - } - } - } - - # XMPP nodeprep prohibited - foreach val {22 26 27 2f 3a 3c 3e 40} { - scan $val %x val - lappend tablemap($val) XNP - } -} - -proc uni::buildTables {} { - variable shift - - variable casemap - variable tablemap - - variable pMap {} - variable pages {} - variable groups {} - set info {} ;# temporary page info - - set mask [expr {(1 << $shift) - 1}] - - set next 0 - - for {set i 0} {$i <= 0x10ffff} {incr i} { - set gIndex [getGroup [getValue $i]] - - # Split character index into offset and page number - set offset [expr {$i & $mask}] - set page [expr {($i >> $shift)}] - - # Add the group index to the info for the current page - lappend info $gIndex - - # If this is the last entry in the page, add the page - if {$offset == $mask} { - addPage $info - set info {} - } - } - return -} - -proc uni::main {} { - global argc argv0 argv - variable pMap - variable pages - variable groups - variable shift - variable multicasemap - - if {$argc != 2} { - puts stderr "\nusage: $argv0 <datafile> <outdir>\n" - exit 1 - } - set f [open [lindex $argv 0] r] - set data [read $f] - close $f - - load_tables $data - buildTables - puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" - set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] - puts "shift = $shift, space = $size" - - set f [open [file join [lindex $argv 1] uni_data.c] w] - fconfigure $f -translation lf - puts $f "/* - * uni_data.c -- - * - * Declarations of Unicode character information tables. This file is - * automatically generated by the uni_parse.tcl script. Do not - * modify this file by hand. - * - * Copyright (c) 1998 by Scriptics Corporation. - * All rights reserved. - * - * Modified for ejabberd by Alexey Shchepin - * - * RCS: @(#) \$Id\$ - */ - -/* - * A 16-bit Unicode character is split into two parts in order to index - * into the following tables. The lower OFFSET_BITS comprise an offset - * into a page of characters. The upper bits comprise the page number. - */ - -#define OFFSET_BITS $shift - -/* - * The pageMap is indexed by page number and returns an alternate page number - * that identifies a unique page of characters. Many Unicode characters map - * to the same alternate page number. - */ - -static unsigned char pageMap\[\] = {" - set line " " - set last [expr {[llength $pMap] - 1}] - for {set i 0} {$i <= $last} {incr i} { - append line [lindex $pMap $i] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - -/* - * The groupMap is indexed by combining the alternate page number with - * the page offset and returns a group number that identifies a unique - * set of character attributes. - */ - -static unsigned short int groupMap\[\] = {" - set line " " - set lasti [expr {[llength $pages] - 1}] - for {set i 0} {$i <= $lasti} {incr i} { - set page [lindex $pages $i] - set lastj [expr {[llength $page] - 1}] - for {set j 0} {$j <= $lastj} {incr j} { - append line [lindex $page $j] - if {$j != $lastj || $i != $lasti} { - append line ", " - } - if {[string length $line] > 70} { - puts $f $line - set line " " - } - } - } - puts $f $line - puts $f "}; - -/* - * Each group represents a unique set of character attributes. The attributes - * are encoded into a 32-bit value as follows: - * - * Bit 0 A.1 | C.1.2 | C.2.2 | C.3 -- C.9 - * - * Bit 1 C.1.1 - * - * Bit 2 C.2.1 - * - * Bit 3 B.1 - * - * Bit 4 D.1 - * - * Bit 5 D.2 - * - * Bit 6 XNP - * - * Bit 7 Case maps to several characters - * - * Bits 8-10 Reserved for future use. - * - * Bits 11-31 Case delta: delta for case conversions. This should be the - * highest field so we can easily sign extend. - */ - -static int groups\[\] = {" - set line " " - set last [expr {[llength $groups] - 1}] - for {set i 0} {$i <= $last} {incr i} { - set val [lindex $groups $i] - - append line [format "%d" $val] - if {$i != $last} { - append line ", " - } - if {[string length $line] > 65} { - puts $f $line - set line " " - } - } - puts $f $line - puts $f "}; - -/* - * Table for characters that lowercased to multiple ones - */ - -static int multiCaseTable\[\]\[4\] = {" - set last [expr {[llength $multicasemap] - 1}] - for {set i 0} {$i <= $last} {incr i} { - set val [lindex $multicasemap $i] - - set line " " - append line [format "{%d, %s}" [llength $val] [join $val ", "]] - if {$i != $last} { - append line ", " - } - puts $f $line - } - puts $f "}; - -/* - * The following constants are used to determine the category of a - * Unicode character. - */ - -#define ACMask (1 << 0) -#define C11Mask (1 << 1) -#define C21Mask (1 << 2) -#define B1Mask (1 << 3) -#define D1Mask (1 << 4) -#define D2Mask (1 << 5) -#define XNPMask (1 << 6) -#define MCMask (1 << 7) - -/* - * The following macros extract the fields of the character info. The - * GetDelta() macro is complicated because we can't rely on the C compiler - * to do sign extension on right shifts. - */ - -#define GetCaseType(info) (((info) & 0xE0) >> 5) -#define GetCategory(info) ((info) & 0x1F) -#define GetDelta(info) (((info) > 0) ? ((info) >> 11) : (~(~((info)) >> 11))) -#define GetMC(info) (multiCaseTable\[GetDelta(info)\]) - -/* - * This macro extracts the information about a character from the - * Unicode character tables. - */ - -#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0x1fffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) -" - - close $f -} - -uni::main - -return |