diff options
Diffstat (limited to 'irstlm')
-rw-r--r-- | irstlm/src/src/Makefile.am | 30 | ||||
-rw-r--r-- | irstlm/src/src/cmd.c | 661 | ||||
-rw-r--r-- | irstlm/src/src/cmd.h | 68 | ||||
-rw-r--r-- | irstlm/src/src/compile-lm.cpp | 221 | ||||
-rw-r--r-- | irstlm/src/src/dictionary.cpp | 425 | ||||
-rw-r--r-- | irstlm/src/src/dictionary.h | 186 | ||||
-rw-r--r-- | irstlm/src/src/gzfilebuf.h | 80 | ||||
-rw-r--r-- | irstlm/src/src/htable.cpp | 329 | ||||
-rw-r--r-- | irstlm/src/src/htable.h | 130 | ||||
-rw-r--r-- | irstlm/src/src/index.h | 19 | ||||
-rw-r--r-- | irstlm/src/src/lmtable.cpp | 1217 | ||||
-rw-r--r-- | irstlm/src/src/lmtable.h | 366 | ||||
-rw-r--r-- | irstlm/src/src/mempool.cpp | 497 | ||||
-rw-r--r-- | irstlm/src/src/mempool.h | 172 | ||||
-rw-r--r-- | irstlm/src/src/n_gram.cpp | 214 | ||||
-rw-r--r-- | irstlm/src/src/n_gram.h | 118 | ||||
-rw-r--r-- | irstlm/src/src/ngramcache.cpp | 89 | ||||
-rw-r--r-- | irstlm/src/src/ngramcache.h | 51 | ||||
-rw-r--r-- | irstlm/src/src/quantize-lm.cpp | 424 | ||||
-rw-r--r-- | irstlm/src/src/util.cpp | 77 | ||||
-rw-r--r-- | irstlm/src/src/util.h | 25 |
21 files changed, 5399 insertions, 0 deletions
diff --git a/irstlm/src/src/Makefile.am b/irstlm/src/src/Makefile.am new file mode 100644 index 000000000..2c3cc5caf --- /dev/null +++ b/irstlm/src/src/Makefile.am @@ -0,0 +1,30 @@ +lib_LIBRARIES = libirstlm.a + +libirstlm_a_SOURCES = \ + dictionary.cpp \ + htable.cpp \ + lmtable.cpp \ + mempool.cpp \ + n_gram.cpp \ + ngramcache.cpp \ + util.cpp + +library_includedir=$(includedir) +library_include_HEADERS = \ + dictionary.h \ + htable.h \ + lmtable.h \ + mempool.h \ + n_gram.h \ + ngramcache.h \ + util.h \ + gzfilebuf.h + +bin_PROGRAMS = compile-lm quantize-lm + +AM_LDFLAGS=-L. +LIBS=-lirstlm -lz + +compile_lm_SOURCES = compile-lm.cpp +quantize_lm_SOURCES = quantize-lm.cpp + diff --git a/irstlm/src/src/cmd.c b/irstlm/src/src/cmd.c new file mode 100644 index 000000000..aeb36d7b9 --- /dev/null +++ b/irstlm/src/src/cmd.c @@ -0,0 +1,661 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <string.h> + +#include "cmd.h" + +static Enum_T BoolEnum[] = { + { "FALSE", 0 }, + { "TRUE", 1 }, + { 0, 0 } +}; + +#ifdef NEEDSTRDUP +char *strdup(); +#endif + +#define FALSE 0 +#define TRUE 1 + +#define LINSIZ 10240 +#define MAXPARAM 256 + +static char *GetLine(), + **str2array(); +static int Scan(), + SetParam(), + SetEnum(), + SetSubrange(), + SetStrArray(), + SetGte(), + SetLte(), + CmdError(), + EnumError(), + SubrangeError(), + GteError(), + LteError(), + PrintParam(), + PrintEnum(), + PrintStrArray(); + +static Cmd_T cmds[MAXPARAM+1]; +static char *SepString = " \t\n"; + +#if defined(__STDC__) +#include <stdarg.h> +int DeclareParams(char *ParName, ...) +#else +#include <varargs.h> +int DeclareParams(ParName, va_alist) +char *ParName; +va_dcl +#endif +{ + va_list args; + static int ParamN = 0; + int j, + c; + char *s; + +#if defined(__STDC__) + va_start(args, ParName); +#else + va_start(args); +#endif + for(;ParName;) { + if(ParamN==MAXPARAM) { + fprintf(stderr, "Too many parameters !!\n"); + break; + } + for(j=0,c=1; j<ParamN&&(c=strcmp(cmds[j].Name,ParName))<0; j++) + ; + if(!c) { + fprintf(stderr, + "Warning: parameter \"%s\" declared twice.\n", + ParName); + } + for(c=ParamN; c>j; c--) { + cmds[c] = cmds[c-1]; + } + cmds[j].Name = ParName; + cmds[j].Type = va_arg(args, int); + cmds[j].Val = va_arg(args, void *); + switch(cmds[j].Type) { + case CMDENUMTYPE: /* get the pointer to Enum_T struct */ + cmds[j].p = va_arg(args, void *); + break; + case CMDSUBRANGETYPE: /* get the two extremes */ + cmds[j].p = (void*) calloc(2, sizeof(int)); + ((int*)cmds[j].p)[0] = va_arg(args, int); + ((int*)cmds[j].p)[1] = va_arg(args, int); + break; + case CMDGTETYPE: /* get lower or upper bound */ + case CMDLTETYPE: + cmds[j].p = (void*) calloc(1, sizeof(int)); + ((int*)cmds[j].p)[0] = va_arg(args, int); + break; + case CMDSTRARRAYTYPE: /* get the separators string */ + cmds[j].p = (s=va_arg(args, char*)) + ? (void*)strdup(s) : 0; + break; + case CMDBOOLTYPE: + cmds[j].Type = CMDENUMTYPE; + cmds[j].p = BoolEnum; + break; + case CMDDOUBLETYPE: /* nothing else is needed */ + case CMDINTTYPE: + case CMDSTRINGTYPE: + break; + default: + fprintf(stderr, "%s: %s %d %s \"%s\"\n", + "DeclareParam()", "Unknown Type", + cmds[j].Type, "for parameter", cmds[j].Name); + exit(1); + } + ParamN++; + ParName = va_arg(args, char *); + } + cmds[ParamN].Name = NULL; + va_end(args); + return 0; +} + +int GetParams(n, a, CmdFileName) +int *n; +char ***a; +char *CmdFileName; +{ + char *Line, + *ProgName; + int argc = *n; + char **argv = *a, + *s; + FILE *fp; + int IsPipe; + +#ifdef MSDOS +#define PATHSEP '\\' + char *dot = NULL; +#else +#define PATHSEP '/' +#endif + + if(!(Line=malloc(LINSIZ))) { + fprintf(stderr, "GetParams(): Unable to alloc %d bytes\n", + LINSIZ); + exit(1); + } + if((ProgName=strrchr(*argv, PATHSEP))) { + ++ProgName; + } else { + ProgName = *argv; + } +#ifdef MSDOS + if(dot=strchr(ProgName, '.')) *dot = 0; +#endif + --argc; + ++argv; + for(;;) { + if(argc && argv[0][0]=='-' && argv[0][1]=='=') { + CmdFileName = argv[0]+2; + ++argv; + --argc; + } + if(!CmdFileName) { + break; + } + IsPipe = !strncmp(CmdFileName, "@@", 2); + fp = IsPipe + ? popen(CmdFileName+2, "r") + : strcmp(CmdFileName, "-") + ? fopen(CmdFileName, "r") + : stdin; + if(!fp) { + fprintf(stderr, "Unable to open command file %s\n", + CmdFileName); + exit(1); + } + while(GetLine(fp, LINSIZ, Line) && strcmp(Line, "\\End")) { + if(Scan(ProgName, cmds, Line)) { + CmdError(Line); + } + } + if(fp!=stdin) { + if(IsPipe) pclose(fp); else fclose(fp); + } + CmdFileName = NULL; + } + while(argc && **argv=='-' && (s=strchr(*argv, '='))) { + *s = ' '; + sprintf(Line, "%s/%s", ProgName, *argv+1); + *s = '='; + if(Scan(ProgName, cmds, Line)) CmdError(*argv); + --argc; + ++argv; + } + *n = argc; + *a = argv; +#ifdef MSDOS + if(dot) *dot = '.'; +#endif + free(Line); + return 0; +} + +int PrintParams(ValFlag, fp) +int ValFlag; +FILE *fp; +{ + int i; + + fflush(fp); + if(ValFlag) { + fprintf(fp, "Parameters Values:\n"); + } else { + fprintf(fp, "Parameters:\n"); + } + for(i=0; cmds[i].Name; i++) PrintParam(cmds+i, ValFlag, fp); + fprintf(fp, "\n"); + fflush(fp); + return 0; +} + +int SPrintParams(a, pfx) +char ***a, + *pfx; +{ + int l, + n; + Cmd_T *cmd; + + if(!pfx) pfx=""; + l = strlen(pfx); + for(n=0, cmd=cmds; cmd->Name; cmd++) n += !!cmd->ArgStr; + a[0] = calloc(n, sizeof(char*)); + for(n=0, cmd=cmds; cmd->Name; cmd++) { + if(!cmd->ArgStr) continue; + a[0][n] = malloc(strlen(cmd->Name)+strlen(cmd->ArgStr)+l+2); + sprintf(a[0][n], "%s%s=%s", pfx, cmd->Name, cmd->ArgStr); + ++n; + } + return n; +} + +static int CmdError(opt) +char *opt; +{ + fprintf(stderr, "Invalid option \"%s\"\n", opt); + fprintf(stderr, "This program expectes the following parameters:\n"); + PrintParams(FALSE, stderr); + exit(0); +} + +static int PrintParam(cmd, ValFlag, fp) +Cmd_T *cmd; +int ValFlag; +FILE *fp; +{ + fprintf(fp, "%4s", ""); + switch(cmd->Type) { + case CMDDOUBLETYPE: + fprintf(fp, "%s", cmd->Name); + if(ValFlag) fprintf(fp, ": %22.15e", *(double *)cmd->Val); + fprintf(fp, "\n"); + break; + case CMDENUMTYPE: + PrintEnum(cmd, ValFlag, fp); + break; + case CMDINTTYPE: + case CMDSUBRANGETYPE: + case CMDGTETYPE: + case CMDLTETYPE: + fprintf(fp, "%s", cmd->Name); + if(ValFlag) fprintf(fp, ": %d", *(int *)cmd->Val); + fprintf(fp, "\n"); + break; + case CMDSTRINGTYPE: + fprintf(fp, "%s", cmd->Name); + if(ValFlag) { + if(*(char **)cmd->Val) { + fprintf(fp, ": \"%s\"", *(char **)cmd->Val); + } else { + fprintf(fp, ": %s", "NULL"); + } + } + fprintf(fp, "\n"); + break; + case CMDSTRARRAYTYPE: + PrintStrArray(cmd, ValFlag, fp); + break; + default: + fprintf(stderr, "%s: %s %d %s \"%s\"\n", + "PrintParam", + "Unknown Type", + cmd->Type, + "for parameter", + cmd->Name); + exit(1); + } + return 0; +} + +static char *GetLine(fp, n, Line) +FILE *fp; +int n; +char *Line; +{ + int j, + l, + offs=0; + + for(;;) { + if(!fgets(Line+offs, n-offs, fp)) { + return NULL; + } + if(Line[offs]=='#') continue; + l = strlen(Line+offs)-1; + Line[offs+l] = 0; + for(j=offs; Line[j] && isspace(Line[j]); j++, l--) + ; + if(l<1) continue; + if(j > offs) { + char *s = Line+offs, + *q = Line+j; + + while((*s++=*q++)) + ; + } + if(Line[offs+l-1]=='\\') { + offs += l; + Line[offs-1] = ' '; + } else { + break; + } + } + return Line; +} + +static int Scan(ProgName, cmds, Line) +char *ProgName, + *Line; +Cmd_T *cmds; +{ + char *q, + *p; + int i, + hl, + HasToMatch = FALSE, + c0, + c; + + p = Line+strspn(Line, SepString); + if(!(hl=strcspn(p, SepString))) { + return 0; + } + if((q=strchr(p, '/')) && q-p<hl) { + *q = 0; + if(strcmp(p, ProgName)) { + *q = '/'; + return 0; + } + *q = '/'; + HasToMatch=TRUE; + p = q+1; + } + if(!(hl = strcspn(p, SepString))) { + return 0; + } + c0 = p[hl]; + p[hl] = 0; + for(i=0, c=1; cmds[i].Name&&(c=strcmp(cmds[i].Name, p))<0; i++) + ; + p[hl] = c0; + if(!c) return SetParam(cmds+i, p+hl+strspn(p+hl, SepString)); + return HasToMatch && c; +} + +static int SetParam(cmd, s) +Cmd_T *cmd; +char *s; +{ + if(!*s && cmd->Type != CMDSTRINGTYPE) { + fprintf(stderr, + "WARNING: No value specified for parameter \"%s\"\n", + cmd->Name); + return 0; + } + switch(cmd->Type) { + case CMDDOUBLETYPE: + if(sscanf(s, "%lf", (double*)cmd->Val)!=1) { + fprintf(stderr, + "Float value required for parameter \"%s\"\n", + cmd->Name); + exit(1); + } + break; + case CMDENUMTYPE: + SetEnum(cmd, s); + break; + case CMDINTTYPE: + if(sscanf(s, "%d", (int*)cmd->Val)!=1) { + fprintf(stderr, + "Integer value required for parameter \"%s\"\n", + cmd->Name); + exit(1); + } + break; + case CMDSTRINGTYPE: + *(char **)cmd->Val = (strcmp(s, "<NULL>") && strcmp(s, "NULL")) + ? strdup(s) + : 0; + break; + case CMDSTRARRAYTYPE: + SetStrArray(cmd, s); + break; + case CMDGTETYPE: + SetGte(cmd, s); + break; + case CMDLTETYPE: + SetLte(cmd, s); + break; + case CMDSUBRANGETYPE: + SetSubrange(cmd, s); + break; + default: + fprintf(stderr, "%s: %s %d %s \"%s\"\n", + "SetParam", + "Unknown Type", + cmd->Type, + "for parameter", + cmd->Name); + exit(1); + } + cmd->ArgStr = strdup(s); + return 0; +} + +static int SetEnum(cmd, s) +Cmd_T *cmd; +char *s; +{ + Enum_T *en; + + for(en=(Enum_T *)cmd->p; en->Name; en++) { + if(*en->Name && !strcmp(s, en->Name)) { + *(int *) cmd->Val = en->Idx; + return 0; + } + } + return EnumError(cmd, s); +} + +static int SetSubrange(cmd, s) +Cmd_T *cmd; +char *s; +{ + int n; + + if(sscanf(s, "%d", &n)!=1) { + fprintf(stderr, + "Integer value required for parameter \"%s\"\n", + cmd->Name); + exit(1); + } + if(n < *(int *)cmd->p || n > *((int *)cmd->p+1)) { + return SubrangeError(cmd, n); + } + *(int *)cmd->Val = n; + return 0; +} + +static int SetGte(cmd, s) +Cmd_T *cmd; +char *s; +{ + int n; + + if(sscanf(s, "%d", &n)!=1) { + fprintf(stderr, + "Integer value required for parameter \"%s\"\n", + cmd->Name); + exit(1); + } + if(n<*(int *)cmd->p) { + return GteError(cmd, n); + } + *(int *)cmd->Val = n; + return 0; +} + +static int SetStrArray(cmd, s) +Cmd_T *cmd; +char *s; +{ + *(char***)cmd->Val = str2array(s, (char*)cmd->p); + return 0; +} + +static int SetLte(cmd, s) +Cmd_T *cmd; +char *s; +{ + int n; + + if(sscanf(s, "%d", &n)!=1) { + fprintf(stderr, + "Integer value required for parameter \"%s\"\n", + cmd->Name); + exit(1); + } + if(n > *(int *)cmd->p) { + return LteError(cmd, n); + } + *(int *)cmd->Val = n; + return 0; +} + +static int EnumError(cmd, s) +Cmd_T *cmd; +char *s; +{ + Enum_T *en; + + fprintf(stderr, + "Invalid value \"%s\" for parameter \"%s\"\n", s, cmd->Name); + fprintf(stderr, "Valid values are:\n"); + for(en=(Enum_T *)cmd->p; en->Name; en++) { + if(*en->Name) { + fprintf(stderr, " %s\n", en->Name); + } + } + fprintf(stderr, "\n"); + exit(1); +} + +static int GteError(cmd, n) +Cmd_T *cmd; +int n; +{ + fprintf(stderr, + "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); + fprintf(stderr, "Valid values must be greater than or equal to %d\n", + *(int *)cmd->p); + exit(1); +} + +static int LteError(cmd, n) +Cmd_T *cmd; +int n; +{ + fprintf(stderr, + "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); + fprintf(stderr, "Valid values must be less than or equal to %d\n", + *(int *)cmd->p); + exit(1); +} + +static int SubrangeError(cmd, n) +Cmd_T *cmd; +int n; +{ + fprintf(stderr, + "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); + fprintf(stderr, "Valid values range from %d to %d\n", + *(int *)cmd->p, *((int *)cmd->p+1)); + exit(1); +} + +static int PrintEnum(cmd, ValFlag, fp) +Cmd_T *cmd; +int ValFlag; +FILE *fp; +{ + Enum_T *en; + + fprintf(fp, "%s", cmd->Name); + if(ValFlag) { + for(en=(Enum_T *)cmd->p; en->Name; en++) { + if(*en->Name && en->Idx==*(int *)cmd->Val) { + fprintf(fp, ": %s", en->Name); + } + } + } + fprintf(fp, "\n"); + return 0; +} + +static int PrintStrArray(cmd, ValFlag, fp) +Cmd_T *cmd; +int ValFlag; +FILE *fp; +{ + char *indent, + **s = *(char***)cmd->Val; + int l = 4+strlen(cmd->Name); + + fprintf(fp, "%s", cmd->Name); + indent = malloc(l+2); + memset(indent, ' ', l+1); + indent[l+1] = 0; + if(ValFlag) { + fprintf(fp, ": %s", s ? (*s ? *s++ : "NULL") : ""); + if(s) while(*s) { + fprintf(fp, "\n%s %s", indent, *s++); + } + } + free(indent); + fprintf(fp, "\n"); + return 0; +} + +static char **str2array(s, sep) +char *s, + *sep; +{ + char *p, + **a; + int n = 0, + l; + + if(!sep) sep = SepString; + p = s += strspn(s, sep); + while(*p) { + p += strcspn(p, sep); + p += strspn(p, sep); + ++n; + } + a = calloc(n+1, sizeof(char *)); + p = s; + n = 0; + while(*p) { + l = strcspn(p, sep); + a[n] = malloc(l+1); + memcpy(a[n], p, l); + a[n][l] = 0; + ++n; + p += l; + p += strspn(p, sep); + } + return a; +} diff --git a/irstlm/src/src/cmd.h b/irstlm/src/src/cmd.h new file mode 100644 index 000000000..708905f6f --- /dev/null +++ b/irstlm/src/src/cmd.h @@ -0,0 +1,68 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#if !defined(CMD_H) + +#define CMD_H + +#define CMDDOUBLETYPE 1 +#define CMDENUMTYPE 2 +#define CMDINTTYPE 3 +#define CMDSTRINGTYPE 4 +#define CMDSUBRANGETYPE 5 +#define CMDGTETYPE 6 +#define CMDLTETYPE 7 +#define CMDSTRARRAYTYPE 8 +#define CMDBOOLTYPE 9 + +typedef struct { + char *Name; + int Idx; +} Enum_T; + +typedef struct { + int Type; + char *Name, + *ArgStr; + void *Val, + *p; +} Cmd_T; + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__STDC__) +int DeclareParams(char *, ...); +#else +int DeclareParams(); +#endif + +int GetParams(int *n, char ***a,char *CmdFileName), + SPrintParams(), + PrintParams(); + +#ifdef __cplusplus +} +#endif +#endif + + + diff --git a/irstlm/src/src/compile-lm.cpp b/irstlm/src/src/compile-lm.cpp new file mode 100644 index 000000000..09f01fc94 --- /dev/null +++ b/irstlm/src/src/compile-lm.cpp @@ -0,0 +1,221 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit, compile LM + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +using namespace std; + +#include <iostream> +#include <fstream> +#include <vector> +#include <string> +#include <stdlib.h> +#include "util.h" +#include "math.h" +#include "lmtable.h" + + +/* GLOBAL OPTIONS ***************/ + +std::string stxt = "no"; +std::string seval = ""; +std::string sdebug = "0"; +std::string ssubdict = ""; + +/********************************/ + +void usage(const char *msg = 0) { + if (msg) { std::cerr << msg << std::endl; } + std::cerr << "Usage: compile-lm [options] input-file.lm [output-file.blm]" << std::endl; + if (!msg) std::cerr << std::endl + << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl + << " a compiled representation that the IRST LM toolkit can quickly" << std::endl + << " read and process. LM file can be compressed with gzip." << std::endl << std::endl; + std::cerr << "Options:\n" + << "--text=[yes|no] -t=[yes|no] (output is again in text format)\n" + << "--eval=text-file -e=text-file (computes perplexity of text-file and returns)\n" + << "--debug=1 -d=1 (verbose output for --eval option)\n" + << "--subdict=text-file --sd=tex-file (limits LM to include only words in text-file)\n" ; +} + +bool starts_with(const std::string &s, const std::string &pre) { + if (pre.size() > s.size()) return false; + + if (pre == s) return true; + std::string pre_equals(pre+'='); + if (pre_equals.size() > s.size()) return false; + return (s.substr(0,pre_equals.size()) == pre_equals); +} + +std::string get_param(const std::string& opt, int argc, const char **argv, int& argi) +{ + std::string::size_type equals = opt.find_first_of('='); + if (equals != std::string::npos && equals < opt.size()-1) { + return opt.substr(equals+1); + } + std::string nexto; + if (argi + 1 < argc) { + nexto = argv[++argi]; + } else { + usage((opt + " requires a value!").c_str()); + exit(1); + } + return nexto; +} + +void handle_option(const std::string& opt, int argc, const char **argv, int& argi) +{ + if (opt == "--help" || opt == "-h") { usage(); exit(1); } + if (starts_with(opt, "--text") || starts_with(opt, "-t")) + stxt = get_param(opt, argc, argv, argi); + else + if (starts_with(opt, "--eval") || starts_with(opt, "-e")) + seval = get_param(opt, argc, argv, argi); + else + if (starts_with(opt, "--debug") || starts_with(opt, "-d")) + sdebug = get_param(opt, argc, argv, argi); + + else + if (starts_with(opt, "--subdict") || starts_with(opt, "-sd")) + ssubdict = get_param(opt, argc, argv, argi); + + else { + usage(("Don't understand option " + opt).c_str()); + exit(1); + } +} + +int main(int argc, const char **argv) +{ + + if (argc < 2) { usage(); exit(1); } + std::vector<std::string> files; + for (int i=1; i < argc; i++) { + std::string opt = argv[i]; + if (opt[0] == '-') { handle_option(opt, argc, argv, i); } + else files.push_back(opt); + } + if (files.size() > 2) { usage("Too many arguments"); exit(1); } + if (files.size() < 1) { usage("Please specify a LM file to read from"); exit(1); } + + bool textoutput = (stxt == "yes"? true : false); + int debug = atoi(sdebug.c_str()); + + std::string infile = files[0]; + std::string outfile=""; + + if (files.size() == 1) { + outfile=infile; + + //remove path information + std::string::size_type p = outfile.rfind('/'); + if (p != std::string::npos && ((p+1) < outfile.size())) + outfile.erase(0,p+1); + + //eventually strip .gz + if (outfile.compare(outfile.size()-3,3,".gz")==0) + outfile.erase(outfile.size()-3,3); + + outfile+=(textoutput?".lm":".blm"); + } + else + outfile = files[1]; + + cerr << "outfile: " << outfile << std::endl; + + + lmtable lmt; + + + if (ssubdict == ""){ + + std::cout << "Reading " << infile << "..." << std::endl; + inputfilestream inp(infile.c_str()); + if (!inp.good()) { + std::cerr << "Failed to open " << infile << "!\n"; + exit(1); + } + lmt.load(inp); + + } + else{ //load reduced LM from a binary LM file! + lmt.dict->generate((char *)ssubdict.c_str()); + //eventually add OOV code so that it can be found in the table + lmt.dict->genoovcode(); //generate OOV code + //filter table from the large binary table on disk + lmt.filter2(infile.c_str(),1); + //regenerate OOV code as dictionary has been rebuild + lmt.dict->genoovcode(); + } + + if (seval != ""){ + std::cerr << "Start Eval\n"; + std::cerr << "OOV code: " << lmt.dict->oovcode() << "\n"; + ngram ng(lmt.dict); + std::cout.setf(ios::fixed); + std::cout.precision(2); + if (debug>1) std::cout.precision(8); + std::fstream inptxt(seval.c_str(),std::ios::in); + + int Nbo=0,Nw=0,Noov=0; + double logPr=0,PP=0,PPwp=0,Pr; + + int bos=ng.dict->encode(ng.dict->BoS()); + +#ifdef TRACE_CACHE + lmt.init_probcache(); +#endif + + while(inptxt >> ng){ + + if (ng.size>lmt.maxlevel()) ng.size=lmt.maxlevel(); + + // reset ngram at begin of sentence + if (*ng.wordp(1)==bos) continue; + + lmt.bo_state(0); + if (ng.size>=1){ + logPr+=(Pr=lmt.clprob(ng)); + if (debug>1) + std::cout << ng << "[" << ng.size << "-gram]" << " " << Pr << "\n"; + + if (*ng.wordp(1) == lmt.dict->oovcode()) Noov++; + Nw++; if (lmt.bo_state()) Nbo++; + } + + } + + PP=exp((-logPr * log(10.0)) /Nw); + PPwp= PP * exp(Noov * log(10000000.0-lmt.dict->size())/Nw); + + std::cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp + << " Nbo=" << Nbo << " Noov=" << Noov + << " OOV=" << (float)Noov/Nw * 100.0 << "%\n"; + + return 0; + }; + + std::cout << "Saving to " << outfile << std::endl; + if (textoutput) + lmt.savetxt(outfile.c_str()); + else + lmt.savebin(outfile.c_str()); + + return 0; +} + diff --git a/irstlm/src/src/dictionary.cpp b/irstlm/src/src/dictionary.cpp new file mode 100644 index 000000000..44627c1c6 --- /dev/null +++ b/irstlm/src/src/dictionary.cpp @@ -0,0 +1,425 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#include <iomanip> +#include <iostream> +#include <fstream> +#include "mempool.h" +#include "htable.h" +#include "dictionary.h" +#include "index.h" + +using namespace std; + +dictionary::dictionary(char *filename,int size,char* isymb,char* oovlexfile){ + + // unitialized memory + if (oovlexfile!=NULL) + oovlex=new dictionary(oovlexfile,size,isymb,NULL); + else + oovlex=(dictionary *)NULL; + + htb = new htable(size/LOAD_FACTOR); + tb = new dict_entry[size]; + st = new strstack(size * 10); + + for (int i=0;i<size;i++) tb[i].freq=0; + + is=(char*) NULL; + intsymb(isymb); + + oov_code = -1; + in_oov_lex=0; + n = 0; + N = 0; + dubv = 0; + lim = size; + ifl=0; //increment flag + + if (filename==NULL) return; + + std::ifstream inp(filename,ios::in); + + if (!inp){ + cerr << "cannot open " << filename << "\n"; + exit(1); + } + + char buffer[100]; + + inp >> setw(100) >> buffer; + + inp.close(); + + if ((strncmp(buffer,"dict",4)==0) || + (strncmp(buffer,"DICT",4)==0)) + load(filename); + else + generate(filename); + + cerr << "loaded \n"; + + +} + + + +void dictionary::generate(char *filename){ + + char buffer[MAX_WORD]; + int k; + + ifstream inp(filename,ios::in); + + if (!inp){ + cerr << "cannot open " << filename << "\n"; + exit(1); + } + + cerr << "dict:"; + + ifl=1; k=0; + while (inp >> setw(MAX_WORD) >> buffer){ + + if (strlen(buffer)==(MAX_WORD-1)){ + cerr << "dictionary: a too long word was read (" + << buffer << ")\n"; + }; + + + if (strlen(buffer)==0){ + cerr << "zero lenght word!\n"; + continue; + } + + //if (is && (strlen(buffer)==1) && !index(is,buffer[0])) + if (is && (strlen(buffer)==1) && (index(is,buffer[0])!=NULL)) + continue; //skip over the interruption symbol + + incfreq(encode(buffer),1); + + if (!(++k % 1000000)) cerr << "."; + } + ifl=0; + cerr << "\n"; + + inp.close(); + +} + +void dictionary::load(char* filename){ + char header[100]; + char buffer[MAX_WORD]; + char *addr; + int freqflag=0; + + ifstream inp(filename,ios::in); + + if (!inp){ + cerr << "\ncannot open " << filename << "\n"; + exit(1); + } + + cerr << "dict:"; + + inp.getline(header,100); + if (strncmp(header,"DICT",4)==0) + freqflag=1; + else + if (strncmp(header,"dict",4)!=0){ + cerr << "\ndictionary file " << filename << " has a wrong header\n"; + exit(1); + } + + + while (inp >> setw(MAX_WORD) >> buffer){ + + if (strlen(buffer)==(MAX_WORD-1)){ + cerr << "\ndictionary: a too long word was read (" + << buffer << ")\n"; + exit(1); + }; + + tb[n].word=st->push(buffer); + tb[n].code=n; + + if (freqflag) + inp >> tb[n].freq; + else + tb[n].freq=0; + + if ((addr=htb->search((char *)&tb[n].word,HT_ENTER))) + if (addr!=(char *)&tb[n].word){ + cerr << "dictionary::loadtxt wrong entry was found (" + << buffer << ") in position " << n << "\n"; + exit(1); + } + + N+=tb[n].freq; + + if (strcmp(buffer,OOV())==0) oov_code=n; + + if (++n==lim) grow(); + + } + + inp.close(); +} + + +void dictionary::load(std::istream& inp){ + + char buffer[MAX_WORD]; + char *addr; + int size; + + inp >> size; + + for (int i=0;i<size;i++){ + + inp >> setw(MAX_WORD) >> buffer; + + if (strlen(buffer)==MAX_WORD-1){ + cerr << "\ndictionary::load found word exceeding max length (" + << MAX_WORD << ")" << buffer << "\n"; + exit(1); + }; + + tb[n].word=st->push(buffer); + tb[n].code=n; + inp >> tb[n].freq; + N+=tb[n].freq; + + if ((addr=htb->search((char *)&tb[n].word,HT_ENTER))) + if (addr!=(char *)&tb[n].word){ + cerr << "dictionary::loadtxt wrong entry was found (" + << buffer << ") in position " << n << "\n"; + exit(1); + } + + if (strcmp(tb[n].word,OOV())==0) + oov_code=n; + + if (++n==lim) grow(); + } + inp.getline(buffer,MAX_WORD-1); +} + +void dictionary::save(std::ostream& out){ + out << n << "\n"; + for (int i=0;i<n;i++) + out << tb[i].word << " " << tb[i].freq << "\n"; +} + + +int cmpdictentry(const void *a,const void *b){ + dict_entry *ae=(dict_entry *)a; + dict_entry *be=(dict_entry *)b; + return be->freq-ae->freq; +} + +dictionary::dictionary(dictionary* d){ + + //transfer values + + n=d->n; //total entries + N=d->N; //total frequency + lim=d->lim; //limit of entries + oov_code=-1; //code od oov must be re-defined + ifl=0; //increment flag=0; + dubv=d->dubv; //dictionary upperbound transferred + in_oov_lex=0; //does not copy oovlex; + + + //creates a sorted copy of the table + + tb = new dict_entry[lim]; + htb = new htable(lim/LOAD_FACTOR); + st = new strstack(lim * 10); + + for (int i=0;i<n;i++){ + tb[i].code=d->tb[i].code; + tb[i].freq=d->tb[i].freq; + tb[i].word=st->push(d->tb[i].word); + } + + //sort all entries according to frequency + cerr << "sorting dictionary ..."; + qsort(tb,n,sizeof(dict_entry),cmpdictentry); + cerr << "done\n"; + + for (int i=0;i<n;i++){ + + //eventually re-assign oov code + if (d->oov_code==tb[i].code) oov_code=i; + + tb[i].code=i; + htb->search((char *)&tb[i].word,HT_ENTER); + }; + +} + + + +dictionary::~dictionary(){ + delete htb; + delete st; + delete [] tb; +} + +void dictionary::stat(){ + cout << "dictionary class statistics\n"; + cout << "size " << n + << " used memory " + << (lim * sizeof(int) + + htb->used() + + st->used())/1024 << " Kb\n"; +} + +void dictionary::grow(){ + + delete htb; + + cerr << "+\b"; + + dict_entry *tb2=new dict_entry[lim+GROWTH_STEP]; + + memcpy(tb2,tb,sizeof(dict_entry) * lim ); + + delete [] tb; tb=tb2; + + htb=new htable((lim+GROWTH_STEP)/LOAD_FACTOR); + + for (int i=0;i<lim;i++) + + htb->search((char *)&tb[i].word,HT_ENTER); + + for (int i=lim;i<lim+GROWTH_STEP;i++) tb[i].freq=0; + + lim+=GROWTH_STEP; + + +} + +void dictionary::save(char *filename,int freqflag){ + + std::ofstream out(filename,ios::out); + + if (!out){ + cerr << "cannot open " << filename << "\n"; + } + + // header + if (freqflag) + out << "DICTIONARY 0 " << n << "\n"; + else + out << "dictionary 0 " << n << "\n"; + + for (int i=0;i<n;i++){ + out << tb[i].word; + if (freqflag) + out << " " << tb[i].freq; + out << "\n"; + } + + out.close(); +} + + +int dictionary::getcode(const char *w){ + dict_entry* ptr=(dict_entry *)htb->search((char *)&w,HT_FIND); + if (ptr==NULL) return -1; + return ptr->code; +} + +int dictionary::encode(const char *w){ + + //case of strange characters + if (strlen(w)==0){cerr << "0";w=OOV();} + + dict_entry* ptr; + + if ((ptr=(dict_entry *)htb->search((char *)&w,HT_FIND))!=NULL) + return ptr->code; + else{ + if (!ifl){ //do not extend dictionary + if (oov_code==-1){ //did not use OOV yet + cerr << "starting to use OOV words [" << w << "]\n"; + tb[n].word=st->push(OOV()); + htb->search((char *)&tb[n].word,HT_ENTER); + tb[n].code=n; + tb[n].freq=0; + oov_code=n; + if (++n==lim) grow(); + } + //if there is an oov lexicon, check if this word belongs to + dict_entry* oovptr; + if (oovlex){ + if ((oovptr=(dict_entry *)oovlex->htb->search((char *)&w,HT_FIND))!=NULL){ + in_oov_lex=1; + oov_lex_code=oovptr->code; + }else + in_oov_lex=0; + } + return encode(OOV()); + } + else{ //extend dictionary + tb[n].word=st->push((char *)w); + htb->search((char *)&tb[n].word,HT_ENTER); + tb[n].code=n; + tb[n].freq=0; + if (++n==lim) grow(); + return n-1; + } + } +} + + +char *dictionary::decode(int c){ + if (c>=0 && c < n) + return tb[c].word; + else{ + cerr << "decode: code out of boundary\n"; + return OOV(); + } +} + + +dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict) { + m_dict->htb->scan(HT_INIT); +} + +dict_entry* dictionary_iter::next() { + return (dict_entry*)m_dict->htb->scan(HT_CONT); +} + + + + + +/* +main(int argc,char **argv){ + dictionary d(argv[1],40000); + d.stat(); + cout << "ROMA" << d.decode(0) << "\n"; + cout << "ROMA:" << d.encode("ROMA") << "\n"; + d.save(argv[2]); +} +*/ diff --git a/irstlm/src/src/dictionary.h b/irstlm/src/src/dictionary.h new file mode 100644 index 000000000..be625e810 --- /dev/null +++ b/irstlm/src/src/dictionary.h @@ -0,0 +1,186 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#ifndef MF_DICTIONARY_H +#define MF_DICTIONARY_H + +#include <cstring> +#include <iostream> + +#define MAX_WORD 1000 +#define LOAD_FACTOR 5 + +#ifndef GROWTH_STEP +#define GROWTH_STEP 100000 +#endif + +#ifndef DICT_INITSIZE +#define DICT_INITSIZE 100000 +#endif + +//Begin of sentence symbol +#ifndef BOS_ +#define BOS_ "<s>" +#endif + + +//End of sentence symbol +#ifndef EOS_ +#define EOS_ "</s>" +#endif + +//Out-Of-Vocabulary symbol +#ifndef OOV_ +#define OOV_ "_unk_" +#endif + +typedef struct{ + char *word; + int code; + int freq; +}dict_entry; + +class strstack; +class htable; + +class dictionary{ + strstack *st; //!< stack of strings + dict_entry *tb; //!< entry table + htable *htb; //!< hash table + int n; //!< number of entries + int N; //!< total frequency + int lim; //!< limit of entries + int oov_code; //!< code assigned to oov words + char* is; //!< interruption symbol list + char ifl; //!< increment flag + int dubv; //!< dictionary size upper bound + int in_oov_lex; //!< flag + int oov_lex_code; //!< dictionary + char* oov_str; //!< oov string + + public: + + friend class dictionary_iter; + + dictionary* oovlex; //<! additional dictionary + + inline int dub(){return dubv;} + inline int dub(int value){return (dubv=value);} + + inline char *OOV(){return (OOV_);} + inline char *BoS(){return (BOS_);} + inline char *EoS(){return (EOS_);} + + inline int oovcode(int v=-1){return oov_code=(v>=0?v:oov_code);} + + inline char *intsymb(char* isymb=NULL){ + if (isymb==NULL) return is; + if (is!=NULL) delete [] is; + is=new char[strlen(isymb+1)]; + strcpy(is,isymb); + return is=isymb; + } + + inline int incflag(){return ifl;} + inline int incflag(int v){return ifl=v;} + inline int oovlexsize(){return oovlex?oovlex->n:0;} + inline int inoovlex(){return in_oov_lex;} + inline int oovlexcode(){return oov_lex_code;} + + + int isprintable(char* w){ + char buffer[MAX_WORD]; + sprintf(buffer,"%s",w); + return strcmp(w,buffer)==0; + } + + inline void genoovcode(){ + int c=encode(OOV()); + std::cerr << "OOV code is "<< c << std::endl; + oovcode(c); + } + + inline dictionary* oovlexp(char *fname=NULL){ + if (fname==NULL) return oovlex; + if (oovlex!=NULL) delete oovlex; + oovlex=new dictionary(fname,DICT_INITSIZE); + return oovlex; + } + + inline int setoovrate(double oovrate){ + encode(OOV()); //be sure OOV code exists + int oovfreq=(int)(oovrate * totfreq()); + std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl; + return freq(oovcode(),oovfreq); + } + + + inline int incfreq(int code,int value){N+=value;return tb[code].freq+=value;} + + inline int multfreq(int code,double value){ + N+=(int)(value * tb[code].freq)-tb[code].freq; + return tb[code].freq=(int)(value * tb[code].freq); + } + + inline int freq(int code,int value=-1){ + if (value>=0){ + N+=value-tb[code].freq; + tb[code].freq=value; + } + return tb[code].freq; + } + + inline int totfreq(){return N;} + + void grow(); + //dictionary(int size=400,char* isym=NULL,char* oovlex=NULL); + dictionary(char *filename=NULL,int size=DICT_INITSIZE,char* isymb=NULL,char* oovlex=NULL); + dictionary(dictionary* d); + + ~dictionary(); + void generate(char *filename); + void load(char *filename); + void save(char *filename,int freqflag=0); + void load(std::istream& fd); + void save(std::ostream& fd); + + int size(){return n;}; + int getcode(const char *w); + int encode(const char *w); + char *decode(int c); + void stat(); + + void cleanfreq(){ + for (int i=0;i<n;tb[i++].freq=0); + N=0; + } + +}; + +class dictionary_iter { + public: + dictionary_iter(dictionary *dict); + dict_entry* next(); + private: + dictionary* m_dict; +}; + +#endif + diff --git a/irstlm/src/src/gzfilebuf.h b/irstlm/src/src/gzfilebuf.h new file mode 100644 index 000000000..5e215935a --- /dev/null +++ b/irstlm/src/src/gzfilebuf.h @@ -0,0 +1,80 @@ +#ifndef _GZFILEBUF_H_
+#define _GZFILEBUF_H_
+
+#include <streambuf>
+#include <zlib.h>
+
+class gzfilebuf : public std::streambuf {
+public:
+ gzfilebuf(const char *filename)
+ { _gzf = gzopen(filename, "rb");
+ setg (_buff+sizeof(int), // beginning of putback area
+ _buff+sizeof(int), // read position
+ _buff+sizeof(int)); // end position
+ }
+ ~gzfilebuf() { gzclose(_gzf); }
+protected:
+ virtual int_type overflow (int_type c) {
+ throw;
+ }
+
+ // write multiple characters
+ virtual
+ std::streamsize xsputn (const char* s,
+ std::streamsize num) {
+ throw;
+ }
+
+ virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw;
+ }
+
+ //read one character
+ virtual int_type underflow () {
+ // is read position before end of _buff?
+ if (gptr() < egptr()) {
+ return traits_type::to_int_type(*gptr());
+ }
+
+ /* process size of putback area
+ * - use number of characters read
+ * - but at most four
+ */
+ unsigned int numPutback = gptr() - eback();
+ if (numPutback > sizeof(int)) {
+ numPutback = sizeof(int);
+ }
+
+ /* copy up to four characters previously read into
+ * the putback _buff (area of first four characters)
+ */
+ std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
+ numPutback);
+
+ // read new characters
+ int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
+ if (num <= 0) {
+ // ERROR or EOF
+ return EOF;
+ }
+
+ // reset _buff pointers
+ setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
+ _buff+sizeof(int), // read position
+ _buff+sizeof(int)+num); // end of buffer
+
+ // return next character
+ return traits_type::to_int_type(*gptr());
+ }
+
+ std::streamsize xsgetn (char* s,
+ std::streamsize num) {
+ return gzread(_gzf,s,num);
+ }
+
+private:
+ gzFile _gzf;
+ static const unsigned int _buffsize = 1024;
+ char _buff[_buffsize];
+};
+
+#endif
diff --git a/irstlm/src/src/htable.cpp b/irstlm/src/src/htable.cpp new file mode 100644 index 000000000..73dab4d10 --- /dev/null +++ b/irstlm/src/src/htable.cpp @@ -0,0 +1,329 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#include <iostream> +#include <cassert> +#include "mempool.h" +#include "htable.h" + +//bitwise rotation of unsigned integers + +#define rot_right(a,k) (((a) >> k ) | ((a) << (32-(k)))) +#define rot_left(a,k) (((a) << k ) | ((a) >> (32-(k)))) + +using namespace std; + +htable::htable(int n,int kl,HTYPE ht,size_t (*klf)(const char* )){ + + if (ht!=STRPTR && ht!=STR && kl==0){ + cerr << "htable: key length must be specified for non-string entries!"; + exit(1); + } + + memory=new mempool( sizeof(entry) , BlockSize ); + + table = new entry* [ size=n ]; + + memset(table,0,sizeof(entry *) * n ); + + keylen=(ht==INT || ht==INTPTR?kl/sizeof(int):kl); + + htype=ht; + + keys = accesses = collisions = 0; + + keylenfunc=(klf?klf:&strlen); + +} + + +char *htable::search(char *item, HT_ACTION action) + +{ + address h; + entry *q,**p; + int i; + + //if (action == HT_FIND) + accesses++; + + h = Hash(item); + + i=(h % size); + + //cout << "htable::search() hash i=" << i << "\n"; + + p = &table[h % size]; + + q=*p; + + /* + ** Follow collision chain + */ + + while (q != NULL && Comp((char *)q->key,(char *)item)) + { + p = (entry **)&q->next; + q=*p; + //if (action == HT_FIND) + collisions++; + } + + if ( + q != NULL /* found */ + || + action == HT_FIND /* not found, search only */ + || + (q = (entry *)memory->allocate()) + == + NULL /* not found, no room */ + ) + + return((q!=NULL)?(char *)q->key:(char *)NULL); + + *p = q; /* link into chain */ + /* + ** Initialize new element + */ + + q->key = item; + q->next = NULL; + keys++; + + return((char *)q->key); +} + + +char *htable::scan(HT_ACTION action){ + + char *k; + + if (action == HT_INIT) + { + scan_i=0;scan_p=table[0]; + return NULL; + } + + // if scan_p==NULL go to the first non null pointer + while ((scan_p==NULL) && (++scan_i<size)) scan_p=table[scan_i]; + + if (scan_p!=NULL) + { + k=scan_p->key; + scan_p=(entry *)scan_p->next; + return k; + }; + + return NULL; +} + + +void htable::map(ostream& co,int cols){ + + entry *p; + char* img=new char[cols+1]; + + img[cols]='\0'; + memset(img,'.',cols); + + co << "htable memory map: . (0 items), - (<5), # (>5)\n"; + + for (int i=0; i<size;i++) + { + int n=0;p=table[i]; + + while(p!=NULL){ + n++; + p=(entry *)p->next; + }; + + if (i && (i % cols)==0){ + co << img << "\n"; + memset(img,'.',cols); + } + + if (n>0) + img[i % cols]=n<=5?'-':'#'; + + } + + img[size % cols]='\0'; + co << img << "\n"; + + delete []img; +} + + +void htable::stat(){ + cerr << "htable class statistics\n"; + cerr << "size " << size + << " keys " << keys + << " acc " << accesses + << " coll " << collisions + << " used memory " << used()/1024 << "Kb\n"; +} + +htable::~htable() +{ + delete [] table; + delete memory; +} + +address htable::HashStr(char *key) +{ + char *Key=(htype==STRPTR? *(char **)key:key); + int length=(keylen?keylen:keylenfunc(Key)); + + //cerr << "hash: " << Key << " length:" << length << "\n"; + + register address h=0; + register int i; + + for (i=0,h=0;i<length;i++) + h = h * Prime1 ^ (Key[i] - ' '); + h %= Prime2; + + return h; +} + +//Herbert Glarner's "HSH 11/13" hash function. +/* +address htable::HashInt(char *key){ + +int *Key=(htype==INTPTR? *(int **)key:(int *)key); + +address state=12345,h=0; +register int i,j; + +int p=7; //precision=8 * sizeof(int)-1, in general must be >=7 + + for (i=0,h=0;i<keylen;i++){ + h = h ^ ((address) Key[i]); + for (j=0;j<p;j++){ + state = rot_left(state,11); //state = state left-rotate 11 bits + h = rot_left(h,13); //h = h left-rotate 13 bits + h ^= state ; //h = h xor state + h = rot_left(h,(state & (address)31)); //h = h left-rotate (state mod 32) bits + h = rot_left(h, (h & (address)31)); //h = h left-rotate (h mod 32) bits + } + } + + return h; +} + +*/ + +address htable::HashInt(char *key) +{ + int *Key=(htype==INTPTR? *(int **)key:(int *)key); + + + address h; + register int i; + + //Thomas Wang's 32 bit Mix Function + for (i=0,h=0;i<keylen;i++){ + h+=Key[i]; + h += ~(h << 15); + h ^= (h >> 10); + h += (h << 3); + h ^= (h >> 6); + h += ~(h << 11); + h ^= (h >> 16); + }; + + return h; +} + +int htable::CompStr(char *key1, char *key2) +{ + assert(key1 && key2); + + char *Key1=(htype==STRPTR?*(char **)key1:key1); + char *Key2=(htype==STRPTR?*(char **)key2:key2); + + assert(Key1 && Key2); + + int length1=(keylen?keylen:keylenfunc(Key1)); + int length2=(keylen?keylen:keylenfunc(Key2)); + + if (length1!=length2) return 1; + + register int i; + + for (i=0;i<length1;i++) + if (Key1[i]!=Key2[i]) return 1; + return 0; +} + +int htable::CompInt(char *key1, char *key2) +{ + assert(key1 && key2); + + int *Key1=(htype==INTPTR?*(int **)key1:(int*)key1); + int *Key2=(htype==INTPTR?*(int **)key2:(int*)key2); + + assert(Key1 && Key2); + + register int i; + + for (i=0;i<keylen;i++) + if (Key1[i]!=Key2[i]) return 1; + return 0; +} + + +/* +main(){ + +const int n=1000; + +htable *ht=new htable(1000/5); + + char w[n][20]; + char *c; + + for (int i=0;i<n;i++) + { + sprintf(w[i],"ciao%d",i); + ht->search((char *)&w[i],HT_ENTER); + } + + for (int i=0;i<n;i++) + if (ht->search((char *)&w[i],HT_FIND)) + cout << w[i] << " trovato\n" ; + else + cout << w[i] << " non trovato\n"; + + ht->stat(); + + delete ht; + htable *ht2=new htable(n); + for (int i=0;i<n;i++) + ht2->search((char *)&w[i],HT_ENTER); + + ht2->scan(INIT); + cout << "elenco:\n"; + while ((c=ht2->scan(CONT))!=NULL) + cout << *(char **) c << "\n"; + + ht2->map(); +} +*/ diff --git a/irstlm/src/src/htable.h b/irstlm/src/src/htable.h new file mode 100644 index 000000000..476d45e74 --- /dev/null +++ b/irstlm/src/src/htable.h @@ -0,0 +1,130 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#ifndef MF_HTABLE_H +#define MF_HTABLE_H + +#include <iostream> + +#define Prime1 37 +#define Prime2 1048583 +#define BlockSize 100 + + +// Fast arithmetic, relying on powers of 2, +// and on pre-processor concatenation property + +typedef struct{ + char* key; + char* next; // secret from user +}entry; + +typedef unsigned int address; + +typedef enum {HT_FIND, //!< search: find an entry + HT_ENTER, //!< search: enter an entry + HT_INIT, //!< scan: start scan + HT_CONT //!< scan: continue scan +} HT_ACTION; + +typedef enum { + STR, //!< string + STRPTR, //!< pointer to string + INT, //!< pointer to int + INTPTR //!< pointer to pointer to int +}HTYPE; + +//! Hash Table for strings + +class htable { + int size; //!< table size + int keylen; //!< key length + HTYPE htype; //!< type of entry pointer + entry **table; //!< hash table + int scan_i; //!< scan support + entry *scan_p; //!< scan support + // statistics + long keys; //!< # of entries + long accesses; //!< # of accesses + long collisions; //!< # of collisions + + mempool *memory; //!< memory pool + + size_t (*keylenfunc)(const char*); //!< function computing key length + address (*hashfunc)(const char*); //!< hash function + int (*compfunc)(const char*, const char*); //!< comparison function + + public: + + //! Creates an hash table + htable(int n,int kl=0,HTYPE ht=STRPTR,size_t (*klf)(const char* )=NULL); + + //! Destroys an and hash table + ~htable(); + + //! Computes the hash function + address Hash(char *key){ + switch (htype){ + case INT:case INTPTR: return HashInt(key); + break; + case STR:case STRPTR: return HashStr(key); + default: exit(1); + } + }; + address HashInt(char *key); + address HashStr(char *key); + + //! Compares the keys of two entries + int Comp(char *Key1,char *Key2){ + switch (htype){ + case INT:case INTPTR: return CompInt(Key1,Key2); + break; + case STR:case STRPTR: return CompStr(Key1,Key2); + default: exit(1); + }; + } + + int CompInt(char *Key1,char *Key2); + int CompStr(char *Key1,char *Key2); + + //! Searches for an item + char *search(char *item, HT_ACTION action); + + //! Scans the content + char *scan(HT_ACTION action); + + //! Prints statistics + void stat(); + + //! Print a map of memory use + void map(std::ostream& co=std::cout, int cols=80); + + //! Returns amount of used memory + int used(){return + size * sizeof(entry **) + + memory->used();}; +}; + + + +#endif + + + diff --git a/irstlm/src/src/index.h b/irstlm/src/src/index.h new file mode 100644 index 000000000..e4e29d793 --- /dev/null +++ b/irstlm/src/src/index.h @@ -0,0 +1,19 @@ + + +#pragma once + +#ifdef WIN32 + +inline const char *index(const char *str, char search) +{ + size_t i=0; + while (i< strlen(str) ){ + if (str[i]==search) return &str[i]; + } + return NULL; +} + + +#endif + + diff --git a/irstlm/src/src/lmtable.cpp b/irstlm/src/src/lmtable.cpp new file mode 100644 index 000000000..05ab115eb --- /dev/null +++ b/irstlm/src/src/lmtable.cpp @@ -0,0 +1,1217 @@ +/****************************************************************************** +IrstLM: IRST Language Model Toolkit +Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ +#include <iostream> +#include <fstream> +#include <stdexcept> +#include <cassert> +#include "math.h" +#include "mempool.h" +#include "htable.h" +#include "ngramcache.h" +#include "dictionary.h" +#include "n_gram.h" +#include "lmtable.h" +#include "util.h" + +using namespace std; + +inline void error(char* message){ + cerr << message << "\n"; + throw std::runtime_error(message); +} + + +//instantiate an empty lm table +lmtable::lmtable(){ + + configure(1,false); + + dict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL); + + memset(cursize, 0, sizeof(cursize)); + memset(tbltype, 0, sizeof(tbltype)); + memset(maxsize, 0, sizeof(maxsize)); + memset(info, 0, sizeof(info)); + memset(NumCenters, 0, sizeof(NumCenters)); + + max_cache_lev=0; + for (int i=0;i<=LMTMAXLEV+1;i++) lmtcache[i]=NULL; + probcache=NULL; + statecache=NULL; + +}; + + +//loadstd::istream& inp a lmtable from a lm file + +void lmtable::load(istream& inp){ + + //give a look at the header to select loading method + char header[1024]; + + inp >> header; cerr << header << "\n"; + + if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0) + loadbin(inp,header); + else + loadtxt(inp,header); + + dict->genoovcode(); + + + cerr << "OOV code is " << dict->oovcode() << "\n"; + +} + + + +int parseWords(char *sentence, char **words, int max) +{ + char *word; + int i = 0; + + char *const wordSeparators = " \t\r\n"; + + for (word = strtok(sentence, wordSeparators); + i < max && word != 0; + i++, word = strtok(0, wordSeparators)) + { + words[i] = word; + } + + if (i < max){words[i] = 0;} + + return i; +} + + + +//Load a LM as a text file. LM could have been generated either with the +//IRST LM toolkit or with the SRILM Toolkit. In the latter we are not +//sure that n-grams are lexically ordered (according to the 1-grams). +//However, we make the following assumption: +//"all successors of any prefix are sorted and written in contiguous lines!" +//This method also loads files processed with the quantization +//tool: qlm + +int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow){ + + char* words[1+ LMTMAXLEV + 1 + 1]; + int howmany; + char line[MAX_LINE]; + + inp.getline(line,MAX_LINE); + if (strlen(line)==MAX_LINE-1){ + cerr << "parseline: input line exceed MAXLINE (" + << MAX_LINE << ") chars " << line << "\n"; + exit(1); + } + + howmany = parseWords(line, words, Order + 3); + assert(howmany == (Order+ 1) || howmany == (Order + 2)); + + //read words + ng.size=0; + for (int i=1;i<=Order;i++) + ng.pushw(strcmp(words[i],"<unk>")?words[i]:ng.dict->OOV()); + + //read logprob/code and logbow/code + assert(sscanf(words[0],"%f",&prob)); + if (howmany==(Order+2)) + assert(sscanf(words[Order+1],"%f",&bow)); + else + bow=0.0; //this is log10prob=0 for implicit backoff + + return 1; +} + + +void lmtable::loadcenters(istream& inp,int Order){ + char line[MAX_LINE]; + + //first read the coodebook + cerr << Order << " read code book "; + inp >> NumCenters[Order]; + Pcenters[Order]=new float[NumCenters[Order]]; + Bcenters[Order]=(Order<maxlev?new float[NumCenters[Order]]:NULL); + + for (int c=0;c<NumCenters[Order];c++){ + inp >> Pcenters[Order][c]; + if (Order<maxlev) inp >> Bcenters[Order][c]; + }; + //empty the last line + inp.getline((char*)line,MAX_LINE); + +} + + +void lmtable::loadtxt(istream& inp,const char* header){ + + + //open input stream and prepare an input string + char line[MAX_LINE]; + + //prepare word dictionary + //dict=(dictionary*) new dictionary(NULL,1000000,NULL,NULL); + dict->incflag(1); + + //put here ngrams, log10 probabilities or their codes + ngram ng(dict); + float prob,bow;; + + //check the header to decide if the LM is quantized or not + isQtable=(strncmp(header,"qARPA",5)==0?true:false); + + //we will configure the table later we we know the maxlev; + bool yetconfigured=false; + + cerr << "loadtxt()\n"; + + // READ ARPA Header + int Order,n; + + while (inp.getline(line,MAX_LINE)){ + + if (strlen(line)==MAX_LINE-1){ + cerr << "lmtable::loadtxt: input line exceed MAXLINE (" + << MAX_LINE << ") chars " << line << "\n"; + exit(1); + } + + bool backslash = (line[0] == '\\'); + + if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { + maxsize[Order] = n; maxlev=Order; //upadte Order + + } + + if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { + + //at this point we are sure about the size of the LM + if (!yetconfigured){ + configure(maxlev,isQtable);yetconfigured=true; + //allocate space for loading the table of this level + for (int i=1;i<=maxlev;i++) + table[i]= new char[maxsize[i] * nodesize(tbltype[i])]; + } + + cerr << Order << "-grams: reading "; + + if (isQtable) loadcenters(inp,Order); + + //allocate support vector to manage badly ordered n-grams + if (maxlev>1 && Order<maxlev) { + startpos[Order]=new int[maxsize[Order]]; + for (int c=0;c<maxsize[Order];c++) startpos[Order][c]=-1; + } + + //prepare to read the n-grams entries + cerr << maxsize[Order] << " entries\n"; + + //WE ASSUME A WELL STRUCTURED FILE!!! + + for (int c=0;c<maxsize[Order];c++){ + + if (parseline(inp,Order,ng,prob,bow)) + add(ng, + (int)(isQtable?prob:*((int *)&prob)), + (int)(isQtable?bow:*((int *)&bow))); + } + // now we can fix table at level Order -1 + if (maxlev>1 && Order>1) checkbounds(Order-1); + } + } + + dict->incflag(0); + cerr << "done\n"; + +} + + +//Checkbound with sorting of n-gram table on disk + +void lmtable::checkbounds(int level){ + + char* tbl=table[level]; + char* succtbl=table[level+1]; + + LMT_TYPE ndt=tbltype[level], succndt=tbltype[level+1]; + int ndsz=nodesize(ndt), succndsz=nodesize(succndt); + + //re-order table at level+1 on disk + //generate random filename to avoid collisions + ofstream out;string filePath; + createtempfile(out,filePath,ios::out); + + int start,end,newstart; + + //re-order table at level l+1 + newstart=0; + for (int c=0;c<cursize[level];c++){ + start=startpos[level][c]; end=bound(tbl+c*ndsz,ndt); + //is start==-1 there are no successors for this entry and end==-2 + if (end==-2) end=start; + assert(start<=end); + assert(newstart+(end-start)<=cursize[level+1]); + assert(end<=cursize[level+1]); + + if (start<end) + out.write((char*)(succtbl + start * succndsz),(end-start) * succndsz); + + bound(tbl+c*ndsz,ndt,newstart+(end-start)); + newstart+=(end-start); + } + out.close(); + + fstream inp(filePath.c_str(),ios::in); + + inp.read(succtbl,cursize[level+1]*succndsz); + inp.close(); + removefile(filePath); + +} + +//Add method inserts n-grams in the table structure. It is ONLY used during +//loading of LMs in text format. It searches for the prefix, then it adds the +//suffix to the last level and updates the start-end positions. + +int lmtable::add(ngram& ng,int iprob,int ibow){ + + char *found; LMT_TYPE ndt; int ndsz; + + if (ng.size>1){ + + // find the prefix starting from the first level + int start=0, end=cursize[1]; + + for (int l=1;l<ng.size;l++){ + + ndt=tbltype[l]; ndsz=nodesize(ndt); + + if (search(table[l] + (start * ndsz),ndt,l,(end-start),ndsz, + ng.wordp(ng.size-l+1),LMT_FIND, &found)){ + + //update start-end positions for next step + if (l< (ng.size-1)){ + //set start position + if (found==table[l]) start=0; //first pos in table + else start=bound(found - ndsz,ndt); //end of previous entry + + //set end position + end=bound(found,ndt); + } + } + else{ + cerr << "warning: missing back-off for ngram " << ng << "\n"; + return 0; + } + } + + // update book keeping information about level ng-size -1. + // if this is the first successor update start position + int position=(found-table[ng.size-1])/ndsz; + if (startpos[ng.size-1][position]==-1) + startpos[ng.size-1][position]=cursize[ng.size]; + + //always update ending position + bound(found,ndt,cursize[ng.size]+1); + //cout << "startpos: " << startpos[ng.size-1][position] + //<< " endpos: " << bound(found,ndt) << "\n"; + + } + + // just add at the end of table[ng.size] + + assert(cursize[ng.size]< maxsize[ng.size]); // is there enough space? + ndt=tbltype[ng.size];ndsz=nodesize(ndt); + + found=table[ng.size] + (cursize[ng.size] * ndsz); + word(found,*ng.wordp(1)); + prob(found,ndt,iprob); + if (ng.size<maxlev){bow(found,ndt,ibow);bound(found,ndt,-2);} + + cursize[ng.size]++; + + return 1; + +} + + +void *lmtable::search(char* tb, + LMT_TYPE ndt, + int lev, + int n, + int sz, + int *ngp, + LMT_ACTION action, + char **found){ + + if (lev==1) return *found=(*ngp <n ? tb + *ngp * sz:NULL); + + //prepare search pattern + char w[LMTCODESIZE];putmem(w,ngp[0],0,LMTCODESIZE); + + int idx=0; // index returned by mybsearch + *found=NULL; //initialize output variable + switch(action){ + case LMT_FIND: + if (!tb || !mybsearch(tb,n,sz,(unsigned char *)w,&idx)) return NULL; + else + return *found=tb + (idx * sz); + default: + error("lmtable::search: this option is available"); + }; + + return NULL; +} + + +int lmtable::mybsearch(char *ar, int n, int size, + unsigned char *key, int *idx) +{ + + totbsearch++; + + register int low, high; + register unsigned char *p; + register int result; + register int i; + + /* return idx with the first + position equal or greater than key */ + + /* Warning("start bsearch \n"); */ + + low = 0;high = n; *idx=0; + while (low < high) + { + *idx = (low + high) / 2; + p = (unsigned char *) (ar + (*idx * size)); + + //comparison + for (i=(LMTCODESIZE-1);i>=0;i--){ + result=key[i]-p[i]; + if (result) break; + } + + if (result < 0) + high = *idx; + else if (result > 0) + low = *idx + 1; + else + return 1; + } + + *idx=low; + + return 0; + +} + + +// saves a LM table in text format + +void lmtable::savetxt(const char *filename){ + + fstream out(filename,ios::out); + int l; + + out.precision(7); + + if (isQtable) out << "qARPA\n"; + + + ngram ng(dict,0); + + cerr << "savetxt: " << filename << "\n"; + + out << "\n\\data\\\n"; + for (l=1;l<=maxlev;l++){ + out << "ngram " << l << "= " << cursize[l] << "\n"; + } + + for (l=1;l<=maxlev;l++){ + + out << "\n\\" << l << "-grams:\n"; + cerr << "save: " << cursize[l] << " " << l << "-grams\n"; + if (isQtable){ + out << NumCenters[l] << "\n"; + for (int c=0;c<NumCenters[l];c++){ + out << Pcenters[l][c]; + if (l<maxlev) out << " " << Bcenters[l][c]; + out << "\n"; + } + } + + ng.size=0; + dumplm(out,ng,1,l,0,cursize[1]); + + } + + out << "\\end\\\n"; + cerr << "done\n"; +} + + +void lmtable::savebin(const char *filename){ + + fstream out(filename,ios::out); + cerr << "savebin: " << filename << "\n"; + + // print header + if (isQtable){ + out << "Qblmt " << maxlev; + for (int i=1;i<=maxlev;i++) out << " " << cursize[i]; + out << "\nNumCenters"; + for (int i=1;i<=maxlev;i++) out << " " << NumCenters[i]; + out << "\n"; + + }else{ + out << "blmt " << maxlev; + for (int i=1;i<=maxlev;i++) out << " " << cursize[i] ; + out << "\n"; + } + + dict->save(out); + + for (int i=1;i<=maxlev;i++){ + cerr << "saving " << cursize[i] << " " << i << "-grams\n"; + if (isQtable){ + out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float)); + if (i<maxlev) + out.write((char *)Bcenters[i],NumCenters[i] * sizeof(float)); + } + out.write(table[i],cursize[i]*nodesize(tbltype[i])); + } + + cerr << "done\n"; +} + + +//manages the long header of a bin file + +void lmtable::loadbinheader(istream& inp,const char* header){ + + // read rest of header + inp >> maxlev; + + if (strncmp(header,"Qblmt",5)==0) isQtable=1; + else if(strncmp(header,"blmt",4)==0) isQtable=0; + else error("loadbin: LM file is not in binary format"); + + configure(maxlev,isQtable); + + for (int i=1;i<=maxlev;i++){ + inp >> cursize[i]; maxsize[i]=cursize[i]; + table[i]=new char[cursize[i] * nodesize(tbltype[i])]; + } + + + if (isQtable){ + char header2[100]; + cerr << "reading num centers:"; + inp >> header2; + for (int i=1;i<=maxlev;i++){ + inp >> NumCenters[i];cerr << " " << NumCenters[i]; + + } + cerr << "\n"; + } +} + +//load codebook of level l + +void lmtable::loadbincodebook(istream& inp,int l){ + + Pcenters[l]=new float [NumCenters[l]]; + inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float)); + if (l<maxlev){ + Bcenters[l]=new float [NumCenters[l]]; + inp.read((char *)Bcenters[l],NumCenters[l]*sizeof(float)); + } + +} + +//load a binary lmfile + +void lmtable::loadbin(istream& inp, const char* header){ + + cerr << "loadbin()\n"; + + loadbinheader(inp,header); + + dict->load(inp); + + for (int l=1;l<=maxlev;l++){ + if (isQtable) loadbincodebook(inp,l); + + cerr << "loading " << cursize[l] << " " << l << "-grams\n"; + inp.read(table[l],cursize[l]*nodesize(tbltype[l])); + }; + + cerr << "done\n"; +} + + + +int lmtable::get(ngram& ng,int n,int lev){ + + // cout << "cerco:" << ng << "\n"; + totget[lev]++; + + if (lev > maxlev) error("get: lev exceeds maxlevel"); + if (n < lev) error("get: ngram is too small"); + + //set boudaries for 1-gram + int offset=0,limit=cursize[1]; + + //information of table entries + int hit;char* found; LMT_TYPE ndt; + ng.link=NULL; + ng.lev=0; + + for (int l=1;l<=lev;l++){ + + //initialize entry information + hit = 0 ; found = NULL; ndt=tbltype[l]; + + //if (l==2) cout <<"bicache: searching:" << ng <<"\n"; + + if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),(char *)&found)) + hit=1; + else + search(table[l] + (offset * nodesize(ndt)), + ndt, + l, + (limit-offset), + nodesize(ndt), + ng.wordp(n-l+1), + LMT_FIND, + &found); + + //insert both found and not found items!!! + if (lmtcache[l] && hit==0) + lmtcache[l]->add(ng.wordp(n),(char *)&found); + + if (!found) return 0; + + ng.link=found; + ng.info=ndt; + ng.lev=l; + + if (l<maxlev){ //set start/end point for next search + + //if current offset is at the bottom also that of successors will be + if (offset+1==cursize[l]) limit=cursize[l+1]; + else limit=bound(found,ndt); + + //if current start is at the begin, then also that of successors will be + if (found==table[l]) offset=0; + else offset=bound((found - nodesize(ndt)),ndt); + + assert(offset!=-1); assert(limit!=-1); + } + } + + //put information inside ng + ng.size=n; ng.freq=0; + ng.succ=(lev<maxlev?limit-offset:0); + + return 1; +} + + +//recursively prints the language model table + +void lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, int ipos,int epos){ + + LMT_TYPE ndt=tbltype[ilev]; + int ndsz=nodesize(ndt); + + + assert(ng.size==ilev-1); + assert(ipos>=0 && epos<=cursize[ilev] && ipos<epos); + ng.pushc(0); + + for (int i=ipos;i<epos;i++){ + *ng.wordp(1)=word(table[ilev]+i*ndsz); + if (ilev<elev){ + //get first and last successor position + int isucc=(i>0?bound(table[ilev]+(i-1)*ndsz,ndt):0); + int esucc=bound(table[ilev]+i*ndsz,ndt); + if (isucc < esucc) //there are successors! + dumplm(out,ng,ilev+1,elev,isucc,esucc); + //else + //cout << "no successors for " << ng << "\n"; + } + else{ + //out << i << " "; //this was just to count printed n-grams + int ipr=prob(table[ilev]+ i * ndsz,ndt); + out << (isQtable?ipr:*(float *)&ipr) <<"\t"; + for (int k=ng.size;k>=1;k--){ + if (k<ng.size) out << " "; + out << dict->decode(*ng.wordp(k)); + } + + if (ilev<maxlev){ + int ibo=bow(table[ilev]+ i * ndsz,ndt); + if (isQtable) out << "\t" << ibo; + else + if (*((float *)&ibo)!=0.0) + out << "\t" << *((float *)&ibo); + + } + out << "\n"; + } + } +} + +//succscan iteratively returns all successors of an ngram h for which +//get(h,h.size,h.size) returned true. + + +int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){ + assert(lev==h.lev+1 && h.size==lev && lev<=maxlev); + + LMT_TYPE ndt=tbltype[h.lev]; + int ndsz=nodesize(ndt); + + switch (action){ + + case LMT_INIT: + //reset ngram local indexes + + ng.size=lev; + ng.trans(h); + ng.midx[lev]=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0); + + return 1; + + case LMT_CONT: + + if (ng.midx[lev]<bound(h.link,ndt)) + { + //put current word into ng + *ng.wordp(1)=word(table[lev]+ng.midx[lev]*nodesize(tbltype[lev])); + ng.midx[lev]++; + return 1; + } + else + return 0; + + default: + cerr << "succscan: only permitted options are LMT_INIT and LMT_CONT\n"; + exit(0); + } + +} + +//maxsuffptr returns the largest suffix of an n-gram that is contained +//in the LM table. This can be used as a compact representation of the +//(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it +//is trimmed to its n-1 suffix. + +const char *lmtable::maxsuffptr(ngram ong){ + + if (ong.size==0) return (char*) NULL; + if (ong.size>=maxlev) ong.size=maxlev-1; + + ngram ng=ong; + //ngram ng(dict); //eventually use the <unk> word + //ng.trans(ong); + + if (get(ng,ng.size,ng.size)) + return ng.link; + else{ + ong.size--; + return maxsuffptr(ong); + } +} + + +const char *lmtable::cmaxsuffptr(ngram ong){ + + if (ong.size==0) return (char*) NULL; + if (ong.size>=maxlev) ong.size=maxlev-1; + + char* found; + + if (statecache && (ong.size==maxlev-1) && statecache->get(ong.wordp(maxlev-1),(char *)&found)) + return found; + + found=(char *)maxsuffptr(ong); + + if (statecache && ong.size==maxlev-1){ + //if (statecache->isfull()) statecache->reset(); + statecache->add(ong.wordp(maxlev-1),(char *)&found); + }; + + return found; + +} + + +// returns the probability of an n-gram + +double lmtable::prob(ngram ong){ + + if (ong.size==0) return 1.0; + if (ong.size>maxlev) ong.size=maxlev; + + ngram ng(dict); + ng.trans(ong); + + double rbow; + int ibow,iprob; + LMT_TYPE ndt; + + + if (get(ng,ng.size,ng.size)){ + ndt=(LMT_TYPE)ng.info; iprob=prob(ng.link,ndt); + return exp((double)(isQtable?Pcenters[ng.size][iprob]:*((float *)&iprob))); + } + else{ //size==1 means an OOV word + if (ng.size==1) + return 1.0/UNIGRAM_RESOLUTION; + else{ // compute backoff + //set backoff state, shift n-gram, set default bow prob + bo_state(1); ng.shift();rbow=1.0; + if (ng.lev==ng.size){ + ndt= (LMT_TYPE)ng.info; ibow=bow(ng.link,ndt); + rbow= (double) (isQtable?Bcenters[ng.size][ibow]:*((float *)&ibow)); + } + //prepare recursion step + ong.size--; + return exp(rbow) * prob(ong); + } + } +} + +//return log10 probs + +double lmtable::lprob(ngram ong){ + + if (ong.size==0) return 0.0; + if (ong.size>maxlev) ong.size=maxlev; + + ngram ng=ong; + //ngram ng(dict); //avoid dictionary transfer + //ng.trans(ong); + + double rbow; + int ibow,iprob; + LMT_TYPE ndt; + + + if (get(ng,ng.size,ng.size)){ + ndt=(LMT_TYPE)ng.info; iprob=prob(ng.link,ndt); + return (double)(isQtable?Pcenters[ng.size][iprob]:*((float *)&iprob)); + } + else{ //size==1 means an OOV word + if (ng.size==1) + return -log(UNIGRAM_RESOLUTION)/log(10.0); + else{ // compute backoff + //set backoff state, shift n-gram, set default bow prob + bo_state(1); ng.shift();rbow=0.0; + if (ng.lev==ng.size){ + ndt= (LMT_TYPE)ng.info; ibow=bow(ng.link,ndt); + rbow= (double) (isQtable?Bcenters[ng.size][ibow]:*((float *)&ibow)); + } + //prepare recursion step + ong.size--; + return rbow + lprob(ong); + } + } +} + +//return log10 probsL use cache memory + +double lmtable::clprob(ngram ong){ + + if (ong.size==0) return 0.0; + + if (ong.size>maxlev) ong.size=maxlev; + + double logpr; + +#ifdef TRACE_CACHE + if (probcache && ong.size==maxlev && sentence_id>0lo){ + *cacheout << sentence_id << " " << ong << "\n"; + } +#endif + + if (probcache && ong.size==maxlev && probcache->get(ong.wordp(maxlev),(char *)&logpr)){ + return logpr; + } + + logpr=lprob(ong); + + if (probcache && ong.size==maxlev){ + //if (probcache->isfull()) probcache->reset(); + probcache->add(ong.wordp(maxlev),(char *)&logpr); + }; + + return logpr; + +}; + + + +//Fill the lmtable with the n-grams in a huge lmfile. +//Use the local dictionary to select the needed ngrams + + +void lmtable::filter2(const char* binlmfile, int buffMb){ + + //load header and dictionary of binary lm on disk + lmtable* dsklmt=new lmtable(); + fstream inp(binlmfile,ios::in); + + // read header + char header[MAX_LINE]; + inp >> header; + + dsklmt->loadbinheader(inp, header); + dsklmt->dict->load(inp); + + //inherit properties of the dsklmt + isQtable=dsklmt->isQtable; + configure(dsklmt->maxlevel(),dsklmt->isQtable); + if (isQtable) + for (int i=1;i<=maxlev;i++) NumCenters[i]=dsklmt->NumCenters[i]; + + + //prepare word code conversion table; words which + //are not in the local dictionary will have code -1 + //prepare a new dictionary sorted as the large dictionary + + dictionary* newdict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL); + newdict->incflag(1); + int* code2code=new int[dsklmt->dict->size()]; + for (int w=0;w<dsklmt->dict->size();w++){ + if (dict->getcode(dsklmt->dict->decode(w))!=-1) + code2code[w]=newdict->encode(dsklmt->dict->decode(w)); + else code2code[w]=-1; + } + newdict->incflag(0); + delete dict; + dict=newdict; + + //service variables + char* p; char* q; char* r; + int ndsz; LMT_TYPE type; int w; + long i,j,l; + + + disktable* dtbl; + for (l=1;l<=maxlev;l++){ + + //shortcuts for current table + type=tbltype[l]; ndsz=nodesize(type); + + //steel eventual coodebooks from dsklm; + if (isQtable) loadbincodebook(inp,l); + + //allocate the maxumum number of entries to be load at each time + cerr << "loading part of " << dsklmt->cursize[l] << " " << l << "-grams\n"; + + //open a on-disk table buffer starting from current position in stream inp + dtbl=new disktable(inp, (buffMb * 1024 *1024)/ndsz,ndsz,dsklmt->maxsize[l]); + + if (l==1){ + + //compute actual table size + maxsize[l]=0; + for (i=0;i<dsklmt->maxsize[l];i++){ + p=dtbl->get(inp,i); + if ((code2code[dsklmt->word(p)]) != -1) maxsize[l]++; + } + + assert(maxsize[l]<=dsklmt->maxsize[l]); + + //allocate memory for table and start positions + table[l]=new char[maxsize[l] * ndsz]; + startpos[l]=new int[maxsize[l]]; + + //reset position of dsklmt + dtbl->rewind(inp); + + //copy elements into table[l] + cursize[l]=0; + for (i=0;i<dsklmt->maxsize[l];i++){ + p=dtbl->get(inp,i); + if ((w=code2code[dsklmt->word(p)]) != -1) { + r=table[l] + (long) cursize[l] * ndsz; + memcpy(r,p,ndsz); + //store the initial poition in startpos + startpos[l][cursize[l]]=(i==0?0:bound(dtbl->get(inp,i-1),tbltype[l])); + word(r,w); + //cout << "1-gram bound:" << bound(r,tbltype[l]) << "\n"; + cursize[l]++; + } + } + + for (i=0;i<cursize[l];i++) assert(word(table[l]+ (long) i * ndsz)==i); + + assert(maxsize[l]==cursize[l]); + + } + else{ + + //shortcuts for the predecessors table + char* ptable=table[l-1]; + LMT_TYPE ptype=tbltype[l-1]; + int pndsz=nodesize(ptype); + + //count actual table size, allocate memory, and copy elements + //we scan elements through the previous table: ptable + //cout << inp.tellp() << "\n"; + + maxsize[l]=0; + + for (i=0;i<cursize[l-1];i++){ + p=ptable+i*pndsz; + for (j=startpos[l-1][i];j<bound(p,ptype);j++){ + assert(startpos[l-1][i]<bound(p,ptype)); + q=dtbl->get(inp,j); + if ((w=code2code[dsklmt->word(q)]) != -1) maxsize[l]++; + + } + } + + //allocate memory for the table, and fill it + assert(maxsize[l]<=dsklmt->maxsize[l]); + + table[l]=new char[maxsize[l] * ndsz]; + if (l<maxlev) startpos[l]=new int[maxsize[l]]; + + //reset position of dsklmt + dtbl->rewind(inp); + + r=table[l]; //next available position in table[l] + cursize[l]=0; + + for (i=0;i<cursize[l-1];i++){ + p=ptable+ i * pndsz; + for (j=startpos[l-1][i];j<bound(p,ptype);j++){ + assert(startpos[l-1][i]<bound(p,ptype)); + + q=dtbl->get(inp,j); + + + if ((w=code2code[dsklmt->word(q)]) != -1){ + //copy element + r=table[l] + cursize[l] * ndsz; + memcpy(r,q,ndsz); + if (l<maxlev) startpos[l][cursize[l]]=(j>0?bound(dtbl->get(inp,j-1),type):0); + word(r,w); + //cout << "+" << dict->decode(word(q)) << " - bound " + //<< startpos[l][cursize[l]] << " " << bound(p,ptype) << "\n"; + cursize[l]++; //increment index in startpos + } + + } + //update bounds of predecessor + bound(p,ptype,cursize[l]); + } + + assert(cursize[l]==maxsize[l]); + } + + + delete dtbl; + if (l>1) delete [] startpos[l-1]; + } + + +} + +void lmtable::filter(const char* binlmfile){ + + //load header and dictionary of binary lm on disk + lmtable* dsklmt=new lmtable(); + fstream inp(binlmfile,ios::in); + + // read header + char header[MAX_LINE]; + inp >> header; + + dsklmt->loadbinheader(inp, header); + + dsklmt->dict->load(inp); + + //inherit properties of the dsklmt + configure(dsklmt->maxlevel(),dsklmt->isQuantized()); + + //prepare word code conversion table; words which + //are not in the local dictionary will have code -1 + //prepare a new dictionary sorted as the large dictionary + + dictionary* newdict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL); + newdict->incflag(1); + int* code2code=new int[dsklmt->dict->size()]; + for (int w=0;w<dsklmt->dict->size();w++){ + if (dict->getcode(dsklmt->dict->decode(w))!=-1) + code2code[w]=newdict->encode(dsklmt->dict->decode(w)); + else code2code[w]=-1; + } + newdict->incflag(0); + delete dict; + dict=newdict; + + //service variables + char* p; char* q; char* r; + int ndsz; LMT_TYPE type; int w; + int i,j,l; + + for (l=1;l<=maxlev;l++){ + + //shortcuts for current table + type=tbltype[l]; ndsz=nodesize(type); + + //steel eventual coodebooks from dsklm; + if (isQtable) loadbincodebook(inp,l); + + //load single l-table from dsklmt: this table can be + //removed at the ned of this cycle + cerr << "loading " << dsklmt->cursize[l] << " " << l << "-grams\n"; + dsklmt->table[l]=new char[dsklmt->cursize[l]*ndsz]; + inp.read(dsklmt->table[l],dsklmt->cursize[l]*ndsz); + + //shortcuts for dsktable + char* dtbl=dsklmt->table[l]; + int dsize=dsklmt->cursize[l]; + + if (l==1){ + + //count actual table size + maxsize[l]=0; + for (i=0;i<dsize;i++) + if ((code2code[dsklmt->word(dtbl+i*ndsz)]) != -1) maxsize[l]++; + + assert(maxsize[l]<=dsklmt->maxsize[l]); + //allocate memory for table and start positions + table[l]=new char[maxsize[l] * ndsz]; + startpos[l]=new int[maxsize[l]]; + + //copy elements one by one + + for (i=0;i<dsize;i++){ + p=dtbl+i*ndsz; + if ((w=code2code[dsklmt->word(p)]) != -1) { + r=table[l] + cursize[l] * ndsz; + memcpy(r,p,ndsz); + //store the initial poition in startpos + startpos[l][cursize[l]]=(i==0?0:bound(p-ndsz,tbltype[l])); + word(r,w); + cursize[l]++; + } + } + + for (i=0;i<cursize[l];i++) assert(word(table[l]+i*ndsz)==i); + + assert(maxsize[l]==cursize[l]); + + } + else{ //l>=1; + + //shortcuts for the predecessors table + char* ptable=table[l-1]; + LMT_TYPE ptype=tbltype[l-1]; + int pndsz=nodesize(ptype); + + //count actual table size, allocate memory, and copy elements + //we scan elements through the previous table: ptable + + maxsize[l]=0; + for (i=0;i<cursize[l-1];i++){ + p=ptable+i*pndsz; + for (j=startpos[l-1][i];j<bound(p,ptype);j++){ + q=dsklmt->table[l] + j * ndsz; + if ((w=code2code[dsklmt->word(q)]) != -1){ + maxsize[l]++; + } + } + } + + //allocate memory for the table, and fill it + assert(maxsize[l]<=dsklmt->maxsize[l]); + table[l]=new char[maxsize[l] * ndsz]; + if (l<maxlev) startpos[l]=new int[maxsize[l]]; + + r=table[l]; //next available position in table[l] + cursize[l]=0; + for (i=0;i<cursize[l-1];i++){ + p=ptable+i*pndsz; + for (j=startpos[l-1][i];j<bound(p,ptype);j++){ + q=dsklmt->table[l] + j * ndsz; + if ((w=code2code[dsklmt->word(q)]) != -1){ + //copy element + r=table[l] + cursize[l] * ndsz; + memcpy(r,q,ndsz); + if (l<maxlev) + startpos[l][cursize[l]]=(j==0?0:dsklmt->bound(q-ndsz,type)); + word(r,w); + //cout << "+" << dict->decode(word(q)) << " - bound " + //<< startpos[l][cursize[l]] << " " << bound(p,ptype) << "\n"; + cursize[l]++; //increment index in startpos + } + } + //update bounds of predecessor + bound(p,ptype,cursize[l]); + } + + } + + delete [] dsklmt->table[l]; + if (l>1) delete [] startpos[l-1]; + } + +} + + +void lmtable::stat(int level){ + int totmem=0,memory; + float mega=1024 * 1024; + + cout.precision(2); + + cout << "lmtable class statistics\n"; + + cout << "levels " << maxlev << "\n"; + for (int l=1;l<=maxlev;l++){ + memory=cursize[l] * nodesize(tbltype[l]); + cout << "lev " << l + << " entries "<< cursize[l] + << " used mem " << memory/mega << "Mb\n"; + totmem+=memory; + } + + cout << "total allocated mem " << totmem/mega << "Mb\n"; + + cout << "total number of get calls\n"; + for (int l=1;l<=maxlev;l++){ + cout << "level " << l << " " << totget[l] << "\n"; + } + cout << "total binary search : " << totbsearch << "\n"; + + if (level >1 ) dict->stat(); + +} diff --git a/irstlm/src/src/lmtable.h b/irstlm/src/src/lmtable.h new file mode 100644 index 000000000..4171dabea --- /dev/null +++ b/irstlm/src/src/lmtable.h @@ -0,0 +1,366 @@ +/****************************************************************************** +IrstLM: IRST Language Model Toolkit +Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + + +#ifndef MF_LMTABLE_H +#define MF_LMTABLE_H + +#include "ngramcache.h" +#include "dictionary.h" +#include "n_gram.h" + +#undef TRACE_CACHE + +#define LMTMAXLEV 20 +#define MAX_LINE 1024 + +#ifndef LMTCODESIZE +#define LMTCODESIZE (int)3 +#endif + +#define SHORTSIZE (int)2 +#define PTRSIZE (int)sizeof(char *) +#define INTSIZE (int)4 +#define CHARSIZE (int)1 + +#define PROBSIZE (int)4 //use float +#define QPROBSIZE (int)1 +#define BOUNDSIZE (int)4 + +#define UNIGRAM_RESOLUTION 10000000.0 + +typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE; +typedef char* node; + +typedef enum {LMT_FIND, //!< search: find an entry + LMT_ENTER, //!< search: enter an entry + LMT_INIT, //!< scan: start scan + LMT_CONT //!< scan: continue scan +} LMT_ACTION; + + +//disktable or accessing tables stored on disk + +class disktable{ +private: + char* buffer; //!< buffer of disk table + int buffer_size; //!< size of buffer + int entry_size; //!< size of each single entry in buffer + long current_border; //!< current last available entry in buffer + long file_border; //!< last element in disk table + long start_position; /* */ +public: + + disktable(std::fstream& inp, int buffersize, int entrysize,long max_entries){ + buffer_size=buffersize; + entry_size=entrysize; + buffer=new char[(buffer_size+1) * entry_size]; + current_border=0; + file_border=max_entries; + start_position=inp.tellp(); + }; + + ~disktable() {delete [] buffer;}; + + char* get(std::fstream& inp,long position){ + assert(position < file_border); + + //you can look back at maximum one position before the first in the buffer! + assert(position >= (current_border-buffer_size -1)); + + while (position>=current_border){ + int how_many=(current_border + buffer_size <= file_border?buffer_size:file_border-current_border); + //store last value in position buffer_size; + memcpy(buffer + buffer_size * entry_size, buffer+(buffer_size-1) * entry_size, entry_size); + //read the number of elements + inp.read(buffer,how_many * entry_size); + //update curent border + current_border+=how_many; + } + if (position > (current_border-buffer_size-1) ) //then it is in buffer + return buffer + (position % buffer_size) * entry_size; + else + if (current_border>buffer_size) //asks for last of previous block + return buffer + buffer_size * entry_size; + else return NULL; + }; + + void rewind(std::fstream& inp){ + inp.seekp(start_position); + current_border=0; + } + +}; + +class lmtable{ + + char* table[LMTMAXLEV+1]; //storage of all levels + LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels + int cursize[LMTMAXLEV+1]; //current size of levels + int maxsize[LMTMAXLEV+1]; //current size of levels + int* startpos[LMTMAXLEV+1]; //support vector to store start positions + + int maxlev; //max level of table + char info[100]; //information put in the header + + //statistics + int totget[LMTMAXLEV+1]; + int totbsearch; + + //probability quantization + bool isQtable; + + int NumCenters[LMTMAXLEV+1]; + float* Pcenters[LMTMAXLEV+1]; + float* Bcenters[LMTMAXLEV+1]; + + int lmt_oov_code; + int lmt_oov_size; + int backoff_state; + + //improve access speed + ngramcache* lmtcache[LMTMAXLEV+1]; + ngramcache* probcache; + ngramcache* statecache; + int max_cache_lev; + +public: + +#ifdef TRACE_CACHE + std::fstream* cacheout; + int sentence_id; +#endif + + dictionary *dict; // dictionary + + lmtable(); + + ~lmtable(){ + for (int i=2;i<=LMTMAXLEV;i++) + if (lmtcache[i]){ + std::cerr << i <<"-gram cache: "; lmtcache[i]->stat(); + delete lmtcache[i]; + } + + if (probcache){ + std::cerr << "Prob Cache: "; probcache->stat(); + delete probcache; +#if TRACE_CACHE + cacheout->close(); + delete cacheout; +#endif + + } + if (statecache){ + std::cerr << "State Cache: "; statecache->stat(); + delete statecache; + } + + + for (int i=1;i<=maxlev;i++){ + if (table[i]) delete [] table[i]; + if (isQtable){ + if (Pcenters[i]) delete [] Pcenters[i]; + if (i<maxlev) + if (Bcenters[i]) delete [] Bcenters[i]; + } + } + } + + void init_probcache(){ + assert(probcache==NULL); + probcache=new ngramcache(maxlev,sizeof(double),400000); +#ifdef TRACE_CACHE + cacheout=new std::fstream("/tmp/tracecache",std::ios::out); + sentence_id=0; +#endif + } + + void init_statecache(){ + assert(statecache==NULL); + statecache=new ngramcache(maxlev-1,sizeof(char *),200000); + } + + void init_lmtcaches(int uptolev){ + max_cache_lev=uptolev; + for (int i=2;i<=max_cache_lev;i++){ + assert(lmtcache[i]==NULL); + lmtcache[i]=new ngramcache(i,sizeof(char *),200000); + } + } + + void check_cache_levels(){ + if (probcache && probcache->isfull()) probcache->reset(probcache->cursize()); + if (statecache && statecache->isfull()) statecache->reset(statecache->cursize()); + for (int i=2;i<=max_cache_lev;i++) + if (lmtcache[i]->isfull()) lmtcache[i]->reset(lmtcache[i]->cursize()); + } + + bool is_probcache_active(){return probcache!=NULL;} + bool is_statecache_active(){return statecache!=NULL;} + bool are_lmtcaches_active(){return lmtcache[2]!=NULL;} + + void configure(int n,bool quantized){ + maxlev=n; + if (n==1) + tbltype[1]=(quantized?QLEAF:LEAF); + else{ + for (int i=1;i<n;i++) tbltype[i]=(quantized?QINTERNAL:INTERNAL); + tbltype[n]=(quantized?QLEAF:LEAF); + } + }; + + int maxlevel(){return maxlev;}; + bool isQuantized(){return isQtable;} + + + void savetxt(const char *filename); + void savebin(const char *filename); + void dumplm(std::fstream& out,ngram ng, int ilev, int elev, int ipos,int epos); + + void load(std::istream& inp); + void loadtxt(std::istream& inp,const char* header); + void loadbin(std::istream& inp,const char* header); + + void loadbinheader(std::istream& inp, const char* header); + void loadbincodebook(std::istream& inp,int l); + + void filter(const char* lmfile); + void filter2(const char* lmfile,int buffMb=512); + + void loadcenters(std::istream& inp,int Order); + + double prob(ngram ng); + double lprob(ngram ng); + double clprob(ngram ng); + + + void *search(char *tb,LMT_TYPE ndt,int lev,int n,int sz,int *w, + LMT_ACTION action,char **found=(char **)NULL); + + int mybsearch(char *ar, int n, int size, unsigned char *key, int *idx); + + int add(ngram& ng,int prob,int bow); + void checkbounds(int level); + + int get(ngram& ng){return get(ng,ng.size,ng.size);} + int get(ngram& ng,int n,int lev); + + int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev); + + const char *maxsuffptr(ngram ong); + const char *cmaxsuffptr(ngram ong); + + inline int putmem(char* ptr,int value,int offs,int size){ + assert(ptr!=NULL); + for (int i=0;i<size;i++) + ptr[offs+i]=(value >> (8 * i)) & 0xff; + return value; + }; + + inline int getmem(char* ptr,int* value,int offs,int size){ + assert(ptr!=NULL); + *value=ptr[offs] & 0xff; + for (int i=1;i<size;i++) + *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i)); + return *value; + }; + + + int bo_state(int value=-1){ + return (value==-1?backoff_state:backoff_state=value); + }; + + + int nodesize(LMT_TYPE ndt){ + switch (ndt){ + case INTERNAL: + return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE; + case QINTERNAL: + return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE; + case QLEAF: + return LMTCODESIZE + QPROBSIZE; + case LEAF: + return LMTCODESIZE + PROBSIZE; + default: + assert(0); + return 0; + } + } + + inline int word(node nd,int value=-1) + { + int offset=0; + + if (value==-1) + getmem(nd,&value,offset,LMTCODESIZE); + else + putmem(nd,value,offset,LMTCODESIZE); + + return value; + }; + + inline int prob(node nd,LMT_TYPE ndt, int value=-1) + { + int offs=LMTCODESIZE; + int size=(ndt==QINTERNAL || ndt==QLEAF?QPROBSIZE:PROBSIZE); + + if (value==-1) + getmem(nd,&value,offs,size); + else + putmem(nd,value,offs,size); + + return value; + }; + + + inline int bow(node nd,LMT_TYPE ndt, int value=-1) + { + assert(ndt==INTERNAL || ndt==QINTERNAL); + int size=(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); + int offs=LMTCODESIZE+size; + + if (value==-1) + getmem(nd,&value,offs,size); + else + putmem(nd,value,offs,size); + + return value; + }; + + inline int bound(node nd,LMT_TYPE ndt, int value=-1) + { + assert(ndt==INTERNAL || ndt==QINTERNAL); + int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); + + if (value==-1) + getmem(nd,&value,offs,BOUNDSIZE); + else + putmem(nd,value,offs,BOUNDSIZE); + + return value; + }; + + void stat(int lev=0); + +}; + +#endif + diff --git a/irstlm/src/src/mempool.cpp b/irstlm/src/src/mempool.cpp new file mode 100644 index 000000000..1c690213d --- /dev/null +++ b/irstlm/src/src/mempool.cpp @@ -0,0 +1,497 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +// An efficient memory pool manager +// by M. Federico +// Copyright Marcello Federico, ITC-irst, 1998 + +#include <iostream> +#include <ostream> +#include <cassert> +#include "mempool.h" + +using namespace std; + +#ifdef TRACE_ENABLE +#define TRACE_ERR(str) { std::cerr << str; } +#else +#define TRACE_ERR(str) { } +#endif + +/*! The pool contains: + - entries of size is + - tables for bs entries +*/ + + +mempool::mempool(int is, int bs){ + + // item size must be multiple of memory alignment step (4 bytes) + // example: is is=9 becomes i=12 (9 + 4 - 9 %4 ) + + is=(is>(int)sizeof(char *)?is:0); + + is=is + sizeof(char *) - (is % sizeof(char *)); + + item_size = is; + + block_size = bs; + + true_size = is * bs; + + block_list = new memnode; + + block_list->block = new char[true_size]; + + memset(block_list->block,'0',true_size); + + block_list->next = 0; + + blocknum = 1; + + entries = 0; + + // build free list + + char *ptr = free_list = block_list->block; + + for (int i=0;i<block_size-1;i++) { + *(char **)ptr= ptr + item_size; + ptr+=item_size; + } + *(char **)ptr = NULL; //last item + +} + + +char * mempool::allocate(){ + + char *ptr; + + if (free_list==NULL) + { + memnode *new_block = new memnode; + + new_block->block = new char[true_size]; + + //memset(new_block->block,'0',true_size); + + new_block->next = block_list; + + block_list=new_block; // update block list + + /* update free list */ + + ptr = free_list = block_list->block; + + for (int i=0;i<block_size-1;i++) { + *(char **)ptr = ptr + item_size; + ptr = ptr + item_size; + } + + *(char **)ptr=NULL; + + blocknum++; + } + + assert(free_list); + + ptr = free_list; + + free_list=*(char **)ptr; + + *(char **)ptr=NULL; // reset the released item + + entries++; + + return ptr; + +} + + +int mempool::free(char* addr){ + + // do not check if it belongs to this pool !! + /* + memnode *list=block_list; + while ((list != NULL) && + ((addr < list->block) || + (addr >= (list->block + true_size)))) + list=list->next; + + if ((list==NULL) || (((addr - list->block) % item_size)!=0)) + { + //cerr << "mempool::free-> addr does not belong to this pool\n"; + return 0; + } + */ + + *(char **)addr=free_list; + free_list=addr; + + entries--; + + return 1; +} + + +mempool::~mempool() +{ + memnode *ptr; + + while (block_list !=NULL){ + ptr=block_list->next; + delete [] block_list->block; + delete block_list; + block_list=ptr; + } + +} + +void mempool::map (ostream& co){ + + co << "mempool memory map:\n"; + //percorri piu` volte la lista libera + + memnode *bl=block_list; + char *fl=free_list; + + char* img=new char[block_size+1]; + img[block_size]='\0'; + + while (bl !=NULL){ + + memset(img,'#',block_size); + + fl=free_list; + while (fl != NULL){ + if ((fl >= bl->block) + && + (fl < bl->block + true_size)) + { + img[(fl-bl->block)/item_size]='-'; + } + + fl=*(char **)fl; + } + + co << img << "\n"; + bl=bl->next; + } + delete [] img; +} + +void mempool::stat(){ + + TRACE_ERR("mempool class statistics\n" + << "entries " << entries + << " blocks " << blocknum + << " used memory " << (blocknum * true_size)/1024 << " Kb\n"); +} + + + +strstack::strstack(int bs){ + + size=bs; + list=new memnode; + + list->block=new char[size]; + + list->next=0; + + memset(list->block,'\0',size); + idx=0; + + waste=0; + memory=size; + entries=0; + blocknum=1; + +} + + +void strstack::stat(){ + + TRACE_ERR("strstack class statistics\n" + << "entries " << entries + << " blocks " << blocknum + << " used memory " << memory/1024 << " Kb\n"); +} + + +char *strstack::push(char *s){ + int len=strlen(s); + + if ((len+1) >= size){ + cerr << "strstack::push string is too long\n"; + exit(1); + }; + + if ((idx+len+1) >= size){ + //append a new block + //there must be space to + //put the index after + //the word + + waste+=size-idx; + blocknum++; + memory+=size; + + memnode* nd=new memnode; + nd->block=new char[size]; + nd->next=list; + + list=nd; + + memset(list->block,'\0',size); + + idx=0; + + } + + // append in current block + + strcpy(&list->block[idx],s); + + idx+=len+1; + + entries++; + + return &list->block[idx-len-1]; + +} + + +char *strstack::pop(){ + + if (list==0) return 0; + + if (idx==0){ + + // free this block and go to next + + memnode *ptr=list->next; + + delete [] list->block; + delete list; + + list=ptr; + + if (list==0) + return 0; + else + idx=size-1; + } + + //go back to first non \0 + while (idx>0) + if (list->block[idx--]!='\0') + break; + + //go back to first \0 + while (idx>0) + if (list->block[idx--]=='\0') + break; + + entries--; + + if (list->block[idx+1]=='\0') + { + idx+=2; + memset(&list->block[idx],'\0',size-idx); + return &list->block[idx]; + } + else{ + idx=0; + memset(&list->block[idx],'\0',size); + return &list->block[0]; + } +} + + +char *strstack::top(){ + + int tidx=idx; + memnode *tlist=list; + + if (tlist==0) return 0; + + if (idx==0){ + + tlist=tlist->next; + + if (tlist==0) return 0; + + tidx=size-1; + } + + //go back to first non \0 + while (tidx>0) + if (tlist->block[tidx--]!='\0') + break; + + //aaa\0bbb\0\0\0\0 + + //go back to first \0 + while (tidx>0) + if (tlist->block[tidx--]=='\0') + break; + + if (tlist->block[tidx+1]=='\0') + { + tidx+=2; + return &tlist->block[tidx]; + } + else{ + tidx=0; + return &tlist->block[0]; + } + +} + + +strstack::~strstack(){ + memnode *ptr; + while (list !=NULL){ + ptr=list->next; + delete [] list->block; + delete list; + list=ptr; + } +} + + +storage::storage(int maxsize,int blocksize) +{ + newmemory=0; + newcalls=0; + setsize=maxsize; + poolsize=blocksize; //in bytes + poolset=new mempool* [setsize+1]; + for (int i=0;i<=setsize;i++) + poolset[i]=NULL; +} + + +storage::~storage(){ + for (int i=0;i<=setsize;i++) + if (poolset[i]) + delete poolset[i]; + delete [] poolset; +} + +char *storage::allocate(int size){ + + if (size<=setsize){ + if (!poolset[size]){ + poolset[size]=new mempool(size,poolsize/size); + } + return poolset[size]->allocate(); + } + else{ + + newmemory+=size+8; + newcalls++; + char* p=(char *)calloc(sizeof(char),size); + if (p==NULL){ + cerr << "storage::alloc insufficient memory\n"; + exit(1); + } + return p; + } +} + +char *storage::reallocate(char *oldptr,int oldsize,int newsize){ + + char *newptr; + + assert(newsize>oldsize); + + if (oldsize<=setsize){ + if (newsize<=setsize){ + if (!poolset[newsize]) + poolset[newsize]=new mempool(newsize,poolsize/newsize); + newptr=poolset[newsize]->allocate(); + memset((char*)newptr,0,newsize); + } + else + newptr=(char *)calloc(sizeof(char),newsize); + + if (oldptr && oldsize){ + memcpy(newptr,oldptr,oldsize); + poolset[oldsize]->free(oldptr); + } + } + else{ + newptr=(char *)realloc(oldptr,newsize); + if (newptr==oldptr) + cerr << "r\b"; + else + cerr << "a\b"; + } + if (newptr==NULL){ + cerr << "storage::realloc insufficient memory\n"; + exit(1); + } + + return newptr; +} + +int storage::free(char *addr,int size){ + + /* + while(size<=setsize){ + if (poolset[size] && poolset[size]->free(addr)) + break; + size++; + } + */ + + if (size>setsize) + return free(addr),1; + else{ + poolset[size] && poolset[size]->free(addr); + } + return 1; +} + + +void storage::stat(){ + int used=0; + int memory=sizeof(char *) * setsize; + int waste=0; + + for (int i=0;i<=setsize;i++) + if (poolset[i]){ + used++; + memory+=poolset[i]->used(); + waste+=poolset[i]->wasted(); + } + + TRACE_ERR("storage class statistics\n" + << "alloc entries " << newcalls + << " used memory " << newmemory/1024 << "Kb\n" + << "mpools " << setsize + << " active " << used + << " used memory " << memory/1024 << "Kb" + << " wasted " << waste/1024 << "Kb\n"); +} + diff --git a/irstlm/src/src/mempool.h b/irstlm/src/src/mempool.h new file mode 100644 index 000000000..eafdcd110 --- /dev/null +++ b/irstlm/src/src/mempool.h @@ -0,0 +1,172 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +// An efficient memory manager +// by M. Federico +// Copyright Marcello Federico, ITC-irst, 1998 + +#ifndef MF_MEMPOOL_H +#define MF_MEMPOOL_H + +#ifndef NULL +const int NULL=0; +//#define NULL=0; +#endif + +#include <iostream> // std::ostream + +//! Memory block +/*! This can be used by: +- mempool to store items of fixed size +- strstack to store strings of variable size +*/ + +class memnode{ + friend class mempool; //!< grant access + friend class strstack; //!< grant access + char *block; //!< block of memory + memnode *next; //!< next block ptr +}; + + +//! Memory pool + +/*! A memory pool is composed of: + - a linked list of block_num memory blocks + - each block might contain up to block_size items + - each item is made of exactly item_size bytes +*/ + +class mempool{ + int block_size; //!< number of entries per block + int item_size; //!< number of bytes per entry + int true_size; //!< number of bytes per block + memnode* block_list; //!< list of blocks + char* free_list; //!< free entry list + int entries; //!< number of stored entries + int blocknum; //!< number of allocated blocks + public: + + //! Creates a memory pool + mempool(int is, int bs); + + //! Destroys memory pool + ~mempool(); + + //! Prints a map of memory occupancy + void map(std::ostream& co); + + //! Allocates a single memory entry + char *allocate(); + + //! Frees a single memory entry + int free(char* addr); + + //! Prints statistics about this mempool + void stat(); + + //! Returns effectively used memory (bytes) + /*! includes 8 bytes required by each call of new */ + + int used(){return blocknum * (true_size + 8);}; + + //! Returns amount of wasted memory (bytes) + int wasted(){return used()-(entries * item_size);}; +}; + +//! A stack to store strings + +/*! + The stack is composed of + - a list of blocks memnode of fixed size + - attribute blocknum tells the block on top + - attribute idx tells position of the top string +*/ + +class strstack{ + memnode* list; //!< list of memory blocks + int size; //!< size of each block + int idx; //!< index of last stored string + int waste; //!< current waste of memory + int memory; //!< current use of memory + int entries; //!< current number of stored strings + int blocknum; //!< current number of used blocks + + public: + + strstack(int bs=1000); + + ~strstack(); + + char *push(char *s); + + char *pop(); + + char *top(); + + void stat(); + + int used(){return memory;}; + + int wasted(){return waste;}; + +}; + + +//! Manages multiple memory pools + +/*! + This class permits to manage memory pools + with items up to a specified size. + - items within the allowed range are stored in memory pools + - items larger than the limit are allocated with new +*/ + + +class storage{ + mempool **poolset; //!< array of memory pools + int setsize; //!< number of memory pools/maximum elem size + int poolsize; //!< size of each block + int newmemory; //!< stores amount of used memory + int newcalls; //!< stores number of allocated blocks + public: + + //! Creates storage + storage(int maxsize,int blocksize); + + //! Destroys storage + ~storage(); + + /* names of below functions have been changed so as not to interfere with macros for malloc/realloc/etc -- EVH */ + + //! Allocates memory + char *allocate(int size); + + //! Realloc memory + char *reallocate(char *oldptr,int oldsize,int newsize); + + //! Frees memory of an entry + int free(char *addr,int size=0); + + //! Prints statistics about storage + void stat(); +}; + +#endif diff --git a/irstlm/src/src/n_gram.cpp b/irstlm/src/src/n_gram.cpp new file mode 100644 index 000000000..bf12b7015 --- /dev/null +++ b/irstlm/src/src/n_gram.cpp @@ -0,0 +1,214 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#include <iomanip> +#include <cassert> +#include "mempool.h" +#include "htable.h" +#include "dictionary.h" +#include "n_gram.h" +#include "index.h" + +using namespace std; + +ngram::ngram(dictionary* d,int sz){ + dict=d; + size=sz; + succ=0; + freq=0; + info=0; + pinfo=0; + link=NULL; + isym=-1; + memset(word,0,sizeof(int)*MAX_NGRAM); + memset(midx,0,sizeof(int)*MAX_NGRAM); +} + +ngram::ngram(ngram& ng){ + size=ng.size; + freq=ng.freq; + succ=0; + info=0; + pinfo=0; + link=NULL; + isym=-1; + dict=ng.dict; + memcpy(word,ng.word,sizeof(int)*MAX_NGRAM); + memcpy(midx,ng.word,sizeof(int)*MAX_NGRAM); + +} + +void ngram::trans (const ngram& ng){ + size=ng.size; + freq=ng.freq; + if (dict == ng.dict){ + info=ng.info; + isym=ng.isym; + memcpy(word,ng.word,sizeof(int)*MAX_NGRAM); + memcpy(midx,ng.midx,sizeof(int)*MAX_NGRAM); + } + else{ + info=0; + memset(midx,0,sizeof(int)*MAX_NGRAM); + isym=-1; + for (int i=1;i<=size;i++) + word[MAX_NGRAM-i]=dict->encode(ng.dict->decode(*ng.wordp(i))); + } +} + + +ifstream& operator>> ( ifstream& fi , ngram& ng){ + char w[MAX_WORD]; + memset(w,0,MAX_WORD); + w[0]='\0'; + + if (!(fi >> setw(MAX_WORD) >> w)) + return fi; + + if (strlen(w)==(MAX_WORD-1)) + cerr << "ngram: a too long word was read (" + << w << ")\n"; + + if (ng.dict->intsymb() && + (strlen(w)==1) && (index(ng.dict->intsymb(),w[0])!=NULL)){ + + ng.isym=(long)index(ng.dict->intsymb(),w[0]) - + (long)ng.dict->intsymb(); + ng.size=0; + return fi; + } + + int c=ng.dict->encode(w); + + if (c == -1 ){ + cerr << "ngram: " << w << " is OOV \n"; + exit(1); + } + + memcpy(ng.word,ng.word+1,(MAX_NGRAM-1)*sizeof(int)); + + ng.word[MAX_NGRAM-1]=(int)c; + ng.freq=1; + + if (ng.size<MAX_NGRAM) ng.size++; + + return fi; + +} + + +int ngram::pushw(char* w){ + + assert(dict!=NULL); + + int c=dict->encode(w); + + if (c == -1 ){ + cerr << "ngram: " << w << " is OOV \n"; + exit(1); + } + + pushc(c); + + return 1; + +} + +int ngram::pushc(int c){ + + int buff[MAX_NGRAM-1]; + memcpy(buff,word+1,(MAX_NGRAM-1)*sizeof(int)); + memcpy(word,buff,(MAX_NGRAM-1)*sizeof(int)); + + word[MAX_NGRAM-1]=(int)c; + if (size<MAX_NGRAM) size++; + + return 1; + +} + + +istream& operator>> ( istream& fi , ngram& ng){ + char w[MAX_WORD]; + memset(w,0,MAX_WORD); + w[0]='\0'; + + assert(ng.dict != NULL); + + if (!(fi >> setw(MAX_WORD) >> w)) + return fi; + + if (strlen(w)==(MAX_WORD-1)) + cerr << "ngram: a too long word was read (" + << w << ")\n"; + + if (ng.dict->intsymb() && + (strlen(w)==1) && (index(ng.dict->intsymb(),w[0])!=NULL)){ + ng.isym=(long)index(ng.dict->intsymb(),w[0])-(long)ng.dict->intsymb(); + ng.size=0; + return fi; + } + + ng.pushw(w); + + ng.freq=1; + + return fi; + +} + +ofstream& operator<< (ofstream& fo,ngram& ng){ + + assert(ng.dict != NULL); + + for (int i=ng.size;i>0;i--) + fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << " "; + //fo << "[size " << ng.size << " freq " << ng.freq << "]"; + fo << ng.freq; + return fo; +} + +ostream& operator<< (ostream& fo,ngram& ng){ + + assert(ng.dict != NULL); + + for (int i=ng.size;i>0;i--) + fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << " "; + //fo << "[size " << ng.size << " freq " << ng.freq << "]"; + fo << ng.freq; + + return fo; +} + +/* +main(int argc, char** argv){ + dictionary d(argv[1]); + ifstream txt(argv[1]); + ngram ng(&d); + + while (txt >> ng){ + cout << ng << "\n"; + } + + ngram ng2=ng; + cerr << "copia l'ultimo =" << ng << "\n"; +} +*/ + diff --git a/irstlm/src/src/n_gram.h b/irstlm/src/src/n_gram.h new file mode 100644 index 000000000..f36b5d2df --- /dev/null +++ b/irstlm/src/src/n_gram.h @@ -0,0 +1,118 @@ +/****************************************************************************** + IrstLM: IRST Language Model Toolkit + Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +// n-gram tables +// by M. Federico +// Copyright Marcello Federico, ITC-irst, 1998 + +#ifndef MF_NGRAM_H +#define MF_NGRAM_H + +#include <fstream> +#include <cassert> +#include "dictionary.h" + +#ifdef MYMAXNGRAM +#define MAX_NGRAM MYMAXNGRAM +#else +#define MAX_NGRAM 20 +#endif + +class dictionary; + +//typedef int code; + +class ngram{ + int word[MAX_NGRAM]; //encoded ngram + public: + dictionary *dict; //dictionary + char* link; // ngram-tree pointer + int midx[MAX_NGRAM]; // ngram-tree scan pointer + int lev; // ngram-tree level + int size; // ngram size + int freq; // ngram frequency + int succ; // number of successors + + unsigned char info; // ngram-tree info flags + unsigned char pinfo; // ngram-tree parent info flags + int isym; // last interruption symbol + + ngram(dictionary* d,int sz=0); + ngram(ngram& ng); + + int *wordp()// n-gram pointer + {return wordp(size);}; + int *wordp(int k) // n-gram pointer + {return size>=k?&word[MAX_NGRAM-k]:0;}; + const int *wordp() const // n-gram pointer + {return wordp(size);}; + const int *wordp(int k) const // n-gram pointer + {return size>=k?&word[MAX_NGRAM-k]:0;}; + + int shift(){ + for (int i=(MAX_NGRAM-1);i>0;i--){ + word[i]=word[i-1]; + } + size--; + return 1; + } + + + int containsWord(char* s,int lev){ + + int c=dict->encode(s); + if (c == -1) return 0; + + assert(lev <= size); + for (int i=0;i<lev;i++){ + if (*wordp(size-i)== c) return 1; + } + return 0; + } + + + void trans(const ngram& ng); + + friend std::ifstream& operator>> (std::ifstream& fi,ngram& ng); + friend std::ofstream& operator<< (std::ofstream& fi,ngram& ng); + friend std::istream& operator>> (std::istream& fi,ngram& ng); + friend std::ostream& operator<< (std::ostream& fi,ngram& ng); + + inline int ckhisto(int sz){ + + for (int i=sz;i>1;i--) + if (*wordp(i)==dict->oovcode()) + return 0; + return 1; + } + + int pushc(int c); + int pushw(char* w); + + //~ngram(); + + + +}; + +#endif + + + diff --git a/irstlm/src/src/ngramcache.cpp b/irstlm/src/src/ngramcache.cpp new file mode 100644 index 000000000..3b4234bcc --- /dev/null +++ b/irstlm/src/src/ngramcache.cpp @@ -0,0 +1,89 @@ +/****************************************************************************** +IrstLM: IRST Language Model Toolkit +Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ +#include <iostream> +#include <fstream> +#include <stdexcept> +#include <cassert> +#include "math.h" +#include "mempool.h" +#include "htable.h" + +#include "ngramcache.h" + +using namespace std; + +ngramcache::ngramcache(int n,int size,int maxentries){ + ngsize=n; + infosize=size; + maxn=maxentries; + entries=0; + ht=new htable(maxn * 2, ngsize * sizeof(int),INT,NULL); //lower load factor to reduce collisions + mp=new mempool(ngsize * sizeof(int)+infosize,1000000); + accesses=0; + hits=0; + }; + +ngramcache::~ngramcache(){ + ht->stat();//ht->map(); + mp->stat(); + delete ht;delete mp; +}; + + +//resize cache to specified number of entries +void ngramcache::reset(int n){ + ht->stat(); + delete ht;delete mp; + if (n>0) maxn=n; + ht=new htable(maxn * 2, ngsize * sizeof(int),INT,NULL); //load factor 2 + mp=new mempool(ngsize * sizeof(int)+infosize,1000000); + entries=0; + } + + + + +char* ngramcache::get(const int* ngp,char* info){ + char *found; + // cout << "ngramcache::get() "; + //for (int i=0;i<ngsize;i++) cout << ngp[i] << " "; cout <<"\n"; + accesses++; + if (found=ht->search((char *)ngp,HT_FIND)){ + if (info) memcpy(info,found+ngsize*sizeof(int),infosize); + hits++; + }; + return found; + }; + + +int ngramcache::add(const int* ngp,const char* info){ + + char* entry=mp->allocate(); + memcpy(entry,(char*) ngp,sizeof(int) * ngsize); + memcpy(entry + ngsize * sizeof(int),(char *)info,infosize); + char *found=ht->search((char *)entry,HT_ENTER); + assert(found == entry); //false if key is already insided + entries++; + return 1; + } + +void ngramcache::stat(){ + cerr << "ngramcache stats: entries=" << entries << " acc=" << accesses << " hits=" << hits << "\n"; +}; diff --git a/irstlm/src/src/ngramcache.h b/irstlm/src/src/ngramcache.h new file mode 100644 index 000000000..d7d98e644 --- /dev/null +++ b/irstlm/src/src/ngramcache.h @@ -0,0 +1,51 @@ +/****************************************************************************** +IrstLM: IRST Language Model Toolkit +Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +#ifndef MF_NGRAMCACHE_H +#define MF_NGRAMCACHE_H + +#include "mempool.h" +#include "htable.h" + +class ngramcache{ +private: + htable* ht; + mempool *mp; + int maxn; + int ngsize; + int infosize; + int accesses; + int hits; + int entries; + +public: + + int cursize(){return entries;}; + ngramcache(int n,int size,int maxentries); + ~ngramcache(); + void reset(int n=0); + char* get(const int* ngp,char* info=NULL); + int add(const int* ngp,const char* info); + int isfull(){return (entries >= maxn);}; + void stat(); +}; + +#endif + diff --git a/irstlm/src/src/quantize-lm.cpp b/irstlm/src/src/quantize-lm.cpp new file mode 100644 index 000000000..1b326ce97 --- /dev/null +++ b/irstlm/src/src/quantize-lm.cpp @@ -0,0 +1,424 @@ +/****************************************************************************** +IrstLM: IRST Language Model Toolkit, compile LM +Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +******************************************************************************/ + +using namespace std; + +#include <iostream> +#include <fstream> +#include <vector> +#include <string> +#include <stdlib.h> +#include <assert.h> +#include "math.h" +#include "util.h" + +#define MAX_LINE 1024 + +//---------------------------------------------------------------------- +// Special type and global variable for the BIN CLUSTERING algorithm +// +// +//---------------------------------------------------------------------- + +typedef struct{ + double pt; + int idx; + short code; +}BinEntry; + + +int cmpBinEntry(const void* a,const void* b){ + if (*(double *)a > *(double*)b) + return 1; + else if (*(double *)a < *(double*)b) + return -1; + else + return 0; +} + +BinEntry* bintable=NULL; + +//---------------------------------------------------------------------- +// Global entry points +//---------------------------------------------------------------------- + +int parseWords(char *sentence, char **words, int max); + +int ComputeCluster(int nc, double* cl,int N,double* Pts); + +//---------------------------------------------------------------------- +// Global parameters (some are set in getArgs()) +//---------------------------------------------------------------------- + +int k = 256; // number of centers +const int MAXLEV = 11; //maximum n-gram size + +//---------------------------------------------------------------------- +// Main program +//---------------------------------------------------------------------- + +void usage(const char *msg = 0) { + if (msg) { std::cerr << msg << std::endl; } + std::cerr << "Usage: quantize-lm input-file.lm [output-file.qlm]" << std::endl; + if (!msg) std::cerr << std::endl + << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl + << " a version of it with quantized probabilities and back-off weights"<< std::endl + << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl; + } + + +int main(int argc, const char **argv) +{ + + //Process Parameters + + if (argc < 2) { usage(); exit(1); } + std::vector<std::string> files; + for (int i=1; i < argc; i++) { + std::string opt = argv[i]; + files.push_back(opt); + } + if (files.size() > 2) { usage("Too many arguments"); exit(1); } + if (files.size() < 1) { usage("Please specify a LM file to read from"); exit(1); } + + + std::string infile = files[0]; + std::string outfile=""; + + if (files.size() == 1) { + outfile=infile; + + //remove path information + std::string::size_type p = outfile.rfind('/'); + if (p != std::string::npos && ((p+1) < outfile.size())) + outfile.erase(0,p+1); + + //eventually strip .gz + if (outfile.compare(outfile.size()-3,3,".gz")==0) + outfile.erase(outfile.size()-3,3); + + outfile+=".qlm"; + } + else + outfile = files[1]; + + + + std::cout << "Reading " << infile << "..." << std::endl; + + inputfilestream inp(infile.c_str()); + if (!inp.good()) { + std::cerr << "Failed to open " << infile << "!\n"; + exit(1); + } + + + std::ofstream out(outfile.c_str()); + std::cout << "Writing " << outfile << "..." << std::endl; + + //prepare temporary file to save n-gram blocks for multiple reads + //this avoids using seeks which do not work with inputfilestream + //it's odd but i need a bidirectional filestream! + + string filePath;ofstream dummy; + createtempfile(dummy,filePath,ios::out); + dummy.close(); + + fstream filebuff(filePath.c_str(),ios::out|ios::in); + + int nPts = 0; // actual number of points + + // *** Read ARPA FILE ** + + int numNgrams[MAXLEV + 1]; /* # n-grams for each order */ + int Order,MaxOrder; + int n; + + float logprob,logbow, logten=log(10.0); + + double* dataPts=NULL; + double* centersP=NULL; double* centersB=NULL; + + int* mapP=NULL; int* mapB=NULL; + + int centers=k; + streampos iposition; + + out << "qARPA\n"; //print output header + + + for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0; + + char line[MAX_LINE]; + + while (inp.getline(line,MAX_LINE)){ + + bool backslash = (line[0] == '\\'); + + if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { + numNgrams[Order] = n; + MaxOrder=Order; + } + + if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { + + out << line << "\n"; + cerr << "-- Start processing of " << Order << "-grams\n"; + assert(Order <= MAXLEV); + + int N=numNgrams[Order]; + centers=k; + if (Order==1) centers=256; // always use 256 centers + + char* words[MAXLEV+3]; + dataPts=new double[N]; // allocate data + + //reset tempout file + filebuff.seekg(0); + + for (nPts=0;nPts<N;nPts++){ + inp.getline(line,MAX_LINE); + filebuff << line << std::endl; + int howmany = parseWords(line, words, Order + 3); + assert(howmany == Order+2 || howmany == Order+1); + sscanf(words[0],"%f",&logprob); + dataPts[nPts]=exp(logprob * logten); + } + + cerr << "quantizing " << N << " probabilities\n"; + + centersP=new double[centers]; + mapP=new int[N]; + + ComputeCluster(centers,centersP,N,dataPts); + + + assert(bintable !=NULL); + for (int p=0;p<N;p++){ + mapP[bintable[p].idx]=bintable[p].code; + } + + if (Order<MaxOrder){ + //second pass to read back-off weights + + filebuff.seekg(0); + + for (nPts=0;nPts<N;nPts++){ + + filebuff.getline(line,MAX_LINE); + + int howmany = parseWords(line, words, Order + 3); + if (howmany==Order+2) //backoff is written + sscanf(words[Order+1],"%f",&logbow); + else + logbow=0; // backoff is implicit + dataPts[nPts]=exp(logbow * logten); + } + + centersB=new double[centers]; + mapB=new int[N]; + + cerr << "quantizing " << N << " backoff weights\n"; + ComputeCluster(centers,centersB,N,dataPts); + + assert(bintable !=NULL); + for (int p=0;p<N;p++){ + mapB[bintable[p].idx]=bintable[p].code; + } + + } + + + out << centers << "\n"; + for (nPts=0;nPts<centers;nPts++){ + out << log(centersP[nPts])/logten; + if (Order<MaxOrder) out << " " << log(centersB[nPts])/logten; + out << "\n"; + } + + filebuff.seekg(0); + + for (nPts=0;nPts<numNgrams[Order];nPts++){ + + filebuff.getline(line,MAX_LINE); + + parseWords(line, words, Order + 3); + + out << mapP[nPts]; + + for (int i=1;i<=Order;i++) out << "\t" << words[i]; + + if (Order < MaxOrder) out << "\t" << mapB[nPts]; + + out << "\n"; + + } + + if (mapP){delete [] mapP;mapP=NULL;} + if (mapB){delete [] mapB;mapB=NULL;} + + if (centersP){delete [] centersP; centersP=NULL;} + if (centersB){delete [] centersB; centersB=NULL;} + + delete [] dataPts; + + continue; + + + } + + out << line << "\n"; + } + + cerr << "---- done\n"; + + out.flush(); + + out.close(); + inp.close(); + + removefile(filePath); +} + +// Compute Clusters + +int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){ + + + //cerr << "\nExecuting Clutering Algorithm: k=" << centers<< "\n"; + + if (bintable) delete [] bintable; + + bintable=new BinEntry[N]; + for (int i=0;i<N;i++){ + bintable[i].pt=dataPts[i]; + bintable[i].idx=i; + bintable[i].code=0; + } + + //cout << "start sort \n"; + qsort(bintable,N,sizeof(BinEntry),cmpBinEntry); + + int different=1; + + for (int i=1;i<N;i++) + if (bintable[i].pt!=bintable[i-1].pt) + different++; + + int interval=different/centers; + if (interval==0) interval++; + + int* population=new int[centers]; + int* species=new int[centers]; + + //cout << " Different entries=" << different + // << " Total Entries=" << N << " Bin Size=" << interval << "\n"; + + for (int i=0;i<centers;i++){ + population[i]=species[i]=0; + ctrs[i]=0.0; + } + + // initial values + bintable[0].code=0; + population[0]=1; + species[0]=1; + + int currcode=0; + different=1; + + for (int i=1;i<N;i++){ + + if ((bintable[i].pt!=bintable[i-1].pt)){ + different++; + if ((different % interval) == 0) + if ((currcode+1) < centers + && + population[currcode]>0){ + currcode++; + } + } + + if (bintable[i].pt == bintable[i-1].pt) + bintable[i].code=bintable[i-1].code; + else{ + bintable[i].code=currcode; + species[currcode]++; + } + + population[bintable[i].code]++; + + assert(bintable[i].code < centers); + + ctrs[bintable[i].code]+=bintable[i].pt; + + } + + + for (int i=0;i<centers;i++){ + if (population[i]>0){ + ctrs[i]/=(float)population[i]; + if (ctrs[i]<1e-99){ + cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n"; + ctrs[i]=1e-99; + } + } + //cout << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n"; + } + + cout.flush(); + + delete [] population; + delete [] species; + + + return 1; + +} + +//---------------------------------------------------------------------- +// Reading/Printing utilities +// readPt - read a point from input stream into data storage +// at position i. Returns false on error or EOF. +// printPt - prints a points to output file +//---------------------------------------------------------------------- + + +int parseWords(char *sentence, char **words, int max) +{ + char *word; + int i = 0; + + char *const wordSeparators = " \t\r\n"; + + for (word = strtok(sentence, wordSeparators); + i < max && word != 0; + i++, word = strtok(0, wordSeparators)) + { + words[i] = word; + } + if (i < max) { + words[i] = 0; + } + + return i; +} + + diff --git a/irstlm/src/src/util.cpp b/irstlm/src/src/util.cpp new file mode 100644 index 000000000..1b81218d1 --- /dev/null +++ b/irstlm/src/src/util.cpp @@ -0,0 +1,77 @@ + +#ifdef WIN32 +#include <windows.h> +#endif + +#include "util.h" + +using namespace std; + +string gettempfolder() +{ +#ifdef _WIN32 + char *tmpPath = getenv("TMP"); + string str(tmpPath); + if (str.substr(str.size() - 1, 1) != "\\") + str += "\\"; + return str; +#else + return "/tmp/"; +#endif +} + + +void createtempfile(ofstream &fileStream, string &filePath, std::ios_base::openmode flags) +{ +#ifdef _WIN32 + char buffer[BUFSIZ]; + ::GetTempFileNameA(gettempfolder().c_str(), "", 0, buffer); + filePath = buffer; +#else + char buffer[L_tmpnam]; + strcpy(buffer, gettempfolder().c_str()); + strcat(buffer, "dskbuff--XXXXXX"); + mkstemp(buffer); + filePath = buffer; +#endif + fileStream.open(filePath.c_str(), flags); +} + +void removefile(const std::string &filePath) +{ +#ifdef _WIN32 + ::DeleteFileA(filePath.c_str()); +#else + char cmd[100]; + sprintf(cmd,"rm %s",filePath.c_str()); + system(cmd); +#endif +} + + + +inputfilestream::inputfilestream(const std::string &filePath) +: std::istream(0), +m_streambuf(0) +{ + if (filePath.size() > 3 && + filePath.substr(filePath.size() - 3, 3) == ".gz") + { + m_streambuf = new gzfilebuf(filePath.c_str()); + } else { + std::filebuf* fb = new std::filebuf(); + fb->open(filePath.c_str(), std::ios::in); + m_streambuf = fb; + } + this->init(m_streambuf); +} + +inputfilestream::~inputfilestream() +{ + delete m_streambuf; m_streambuf = 0; +} + +void inputfilestream::close() +{ +} + diff --git a/irstlm/src/src/util.h b/irstlm/src/src/util.h new file mode 100644 index 000000000..229d301cc --- /dev/null +++ b/irstlm/src/src/util.h @@ -0,0 +1,25 @@ +#ifndef IRSTLM_UTIL_H +#define IRSTLM_UTIL_H + +#include <string> +#include <fstream> +#include "gzfilebuf.h" + +std::string gettempfolder(); +void createtempfile(std::ofstream &fileStream, std::string &filePath, std::ios_base::openmode flags); +void removefile(const std::string &filePath); + +class inputfilestream : public std::istream +{ +protected: + std::streambuf *m_streambuf; +public: + + inputfilestream(const std::string &filePath); + ~inputfilestream(); + + void close(); +}; + +#endif + |