Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorhieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>2006-11-05 16:02:33 +0300
committerhieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>2006-11-05 16:02:33 +0300
commit8313b93157fdeedf7e8c939071b085b294450c56 (patch)
tree64e1ef0674cb6ceb4830649d4d160f886df103e2 /irstlm/src
parente8cd4e752f4e239d99e427c0cceafcea8bcaeb1f (diff)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@950 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'irstlm/src')
-rw-r--r--irstlm/src/src/Makefile.am30
-rw-r--r--irstlm/src/src/cmd.c661
-rw-r--r--irstlm/src/src/cmd.h68
-rw-r--r--irstlm/src/src/compile-lm.cpp221
-rw-r--r--irstlm/src/src/dictionary.cpp425
-rw-r--r--irstlm/src/src/dictionary.h186
-rw-r--r--irstlm/src/src/gzfilebuf.h80
-rw-r--r--irstlm/src/src/htable.cpp329
-rw-r--r--irstlm/src/src/htable.h130
-rw-r--r--irstlm/src/src/index.h19
-rw-r--r--irstlm/src/src/lmtable.cpp1217
-rw-r--r--irstlm/src/src/lmtable.h366
-rw-r--r--irstlm/src/src/mempool.cpp497
-rw-r--r--irstlm/src/src/mempool.h172
-rw-r--r--irstlm/src/src/n_gram.cpp214
-rw-r--r--irstlm/src/src/n_gram.h118
-rw-r--r--irstlm/src/src/ngramcache.cpp89
-rw-r--r--irstlm/src/src/ngramcache.h51
-rw-r--r--irstlm/src/src/quantize-lm.cpp424
-rw-r--r--irstlm/src/src/util.cpp77
-rw-r--r--irstlm/src/src/util.h25
21 files changed, 5399 insertions, 0 deletions
diff --git a/irstlm/src/src/Makefile.am b/irstlm/src/src/Makefile.am
new file mode 100644
index 000000000..2c3cc5caf
--- /dev/null
+++ b/irstlm/src/src/Makefile.am
@@ -0,0 +1,30 @@
+lib_LIBRARIES = libirstlm.a
+
+libirstlm_a_SOURCES = \
+ dictionary.cpp \
+ htable.cpp \
+ lmtable.cpp \
+ mempool.cpp \
+ n_gram.cpp \
+ ngramcache.cpp \
+ util.cpp
+
+library_includedir=$(includedir)
+library_include_HEADERS = \
+ dictionary.h \
+ htable.h \
+ lmtable.h \
+ mempool.h \
+ n_gram.h \
+ ngramcache.h \
+ util.h \
+ gzfilebuf.h
+
+bin_PROGRAMS = compile-lm quantize-lm
+
+AM_LDFLAGS=-L.
+LIBS=-lirstlm -lz
+
+compile_lm_SOURCES = compile-lm.cpp
+quantize_lm_SOURCES = quantize-lm.cpp
+
diff --git a/irstlm/src/src/cmd.c b/irstlm/src/src/cmd.c
new file mode 100644
index 000000000..aeb36d7b9
--- /dev/null
+++ b/irstlm/src/src/cmd.c
@@ -0,0 +1,661 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "cmd.h"
+
+static Enum_T BoolEnum[] = {
+ { "FALSE", 0 },
+ { "TRUE", 1 },
+ { 0, 0 }
+};
+
+#ifdef NEEDSTRDUP
+char *strdup();
+#endif
+
+#define FALSE 0
+#define TRUE 1
+
+#define LINSIZ 10240
+#define MAXPARAM 256
+
+static char *GetLine(),
+ **str2array();
+static int Scan(),
+ SetParam(),
+ SetEnum(),
+ SetSubrange(),
+ SetStrArray(),
+ SetGte(),
+ SetLte(),
+ CmdError(),
+ EnumError(),
+ SubrangeError(),
+ GteError(),
+ LteError(),
+ PrintParam(),
+ PrintEnum(),
+ PrintStrArray();
+
+static Cmd_T cmds[MAXPARAM+1];
+static char *SepString = " \t\n";
+
+#if defined(__STDC__)
+#include <stdarg.h>
+int DeclareParams(char *ParName, ...)
+#else
+#include <varargs.h>
+int DeclareParams(ParName, va_alist)
+char *ParName;
+va_dcl
+#endif
+{
+ va_list args;
+ static int ParamN = 0;
+ int j,
+ c;
+ char *s;
+
+#if defined(__STDC__)
+ va_start(args, ParName);
+#else
+ va_start(args);
+#endif
+ for(;ParName;) {
+ if(ParamN==MAXPARAM) {
+ fprintf(stderr, "Too many parameters !!\n");
+ break;
+ }
+ for(j=0,c=1; j<ParamN&&(c=strcmp(cmds[j].Name,ParName))<0; j++)
+ ;
+ if(!c) {
+ fprintf(stderr,
+ "Warning: parameter \"%s\" declared twice.\n",
+ ParName);
+ }
+ for(c=ParamN; c>j; c--) {
+ cmds[c] = cmds[c-1];
+ }
+ cmds[j].Name = ParName;
+ cmds[j].Type = va_arg(args, int);
+ cmds[j].Val = va_arg(args, void *);
+ switch(cmds[j].Type) {
+ case CMDENUMTYPE: /* get the pointer to Enum_T struct */
+ cmds[j].p = va_arg(args, void *);
+ break;
+ case CMDSUBRANGETYPE: /* get the two extremes */
+ cmds[j].p = (void*) calloc(2, sizeof(int));
+ ((int*)cmds[j].p)[0] = va_arg(args, int);
+ ((int*)cmds[j].p)[1] = va_arg(args, int);
+ break;
+ case CMDGTETYPE: /* get lower or upper bound */
+ case CMDLTETYPE:
+ cmds[j].p = (void*) calloc(1, sizeof(int));
+ ((int*)cmds[j].p)[0] = va_arg(args, int);
+ break;
+ case CMDSTRARRAYTYPE: /* get the separators string */
+ cmds[j].p = (s=va_arg(args, char*))
+ ? (void*)strdup(s) : 0;
+ break;
+ case CMDBOOLTYPE:
+ cmds[j].Type = CMDENUMTYPE;
+ cmds[j].p = BoolEnum;
+ break;
+ case CMDDOUBLETYPE: /* nothing else is needed */
+ case CMDINTTYPE:
+ case CMDSTRINGTYPE:
+ break;
+ default:
+ fprintf(stderr, "%s: %s %d %s \"%s\"\n",
+ "DeclareParam()", "Unknown Type",
+ cmds[j].Type, "for parameter", cmds[j].Name);
+ exit(1);
+ }
+ ParamN++;
+ ParName = va_arg(args, char *);
+ }
+ cmds[ParamN].Name = NULL;
+ va_end(args);
+ return 0;
+}
+
+int GetParams(n, a, CmdFileName)
+int *n;
+char ***a;
+char *CmdFileName;
+{
+ char *Line,
+ *ProgName;
+ int argc = *n;
+ char **argv = *a,
+ *s;
+ FILE *fp;
+ int IsPipe;
+
+#ifdef MSDOS
+#define PATHSEP '\\'
+ char *dot = NULL;
+#else
+#define PATHSEP '/'
+#endif
+
+ if(!(Line=malloc(LINSIZ))) {
+ fprintf(stderr, "GetParams(): Unable to alloc %d bytes\n",
+ LINSIZ);
+ exit(1);
+ }
+ if((ProgName=strrchr(*argv, PATHSEP))) {
+ ++ProgName;
+ } else {
+ ProgName = *argv;
+ }
+#ifdef MSDOS
+ if(dot=strchr(ProgName, '.')) *dot = 0;
+#endif
+ --argc;
+ ++argv;
+ for(;;) {
+ if(argc && argv[0][0]=='-' && argv[0][1]=='=') {
+ CmdFileName = argv[0]+2;
+ ++argv;
+ --argc;
+ }
+ if(!CmdFileName) {
+ break;
+ }
+ IsPipe = !strncmp(CmdFileName, "@@", 2);
+ fp = IsPipe
+ ? popen(CmdFileName+2, "r")
+ : strcmp(CmdFileName, "-")
+ ? fopen(CmdFileName, "r")
+ : stdin;
+ if(!fp) {
+ fprintf(stderr, "Unable to open command file %s\n",
+ CmdFileName);
+ exit(1);
+ }
+ while(GetLine(fp, LINSIZ, Line) && strcmp(Line, "\\End")) {
+ if(Scan(ProgName, cmds, Line)) {
+ CmdError(Line);
+ }
+ }
+ if(fp!=stdin) {
+ if(IsPipe) pclose(fp); else fclose(fp);
+ }
+ CmdFileName = NULL;
+ }
+ while(argc && **argv=='-' && (s=strchr(*argv, '='))) {
+ *s = ' ';
+ sprintf(Line, "%s/%s", ProgName, *argv+1);
+ *s = '=';
+ if(Scan(ProgName, cmds, Line)) CmdError(*argv);
+ --argc;
+ ++argv;
+ }
+ *n = argc;
+ *a = argv;
+#ifdef MSDOS
+ if(dot) *dot = '.';
+#endif
+ free(Line);
+ return 0;
+}
+
+int PrintParams(ValFlag, fp)
+int ValFlag;
+FILE *fp;
+{
+ int i;
+
+ fflush(fp);
+ if(ValFlag) {
+ fprintf(fp, "Parameters Values:\n");
+ } else {
+ fprintf(fp, "Parameters:\n");
+ }
+ for(i=0; cmds[i].Name; i++) PrintParam(cmds+i, ValFlag, fp);
+ fprintf(fp, "\n");
+ fflush(fp);
+ return 0;
+}
+
+int SPrintParams(a, pfx)
+char ***a,
+ *pfx;
+{
+ int l,
+ n;
+ Cmd_T *cmd;
+
+ if(!pfx) pfx="";
+ l = strlen(pfx);
+ for(n=0, cmd=cmds; cmd->Name; cmd++) n += !!cmd->ArgStr;
+ a[0] = calloc(n, sizeof(char*));
+ for(n=0, cmd=cmds; cmd->Name; cmd++) {
+ if(!cmd->ArgStr) continue;
+ a[0][n] = malloc(strlen(cmd->Name)+strlen(cmd->ArgStr)+l+2);
+ sprintf(a[0][n], "%s%s=%s", pfx, cmd->Name, cmd->ArgStr);
+ ++n;
+ }
+ return n;
+}
+
+static int CmdError(opt)
+char *opt;
+{
+ fprintf(stderr, "Invalid option \"%s\"\n", opt);
+ fprintf(stderr, "This program expectes the following parameters:\n");
+ PrintParams(FALSE, stderr);
+ exit(0);
+}
+
+static int PrintParam(cmd, ValFlag, fp)
+Cmd_T *cmd;
+int ValFlag;
+FILE *fp;
+{
+ fprintf(fp, "%4s", "");
+ switch(cmd->Type) {
+ case CMDDOUBLETYPE:
+ fprintf(fp, "%s", cmd->Name);
+ if(ValFlag) fprintf(fp, ": %22.15e", *(double *)cmd->Val);
+ fprintf(fp, "\n");
+ break;
+ case CMDENUMTYPE:
+ PrintEnum(cmd, ValFlag, fp);
+ break;
+ case CMDINTTYPE:
+ case CMDSUBRANGETYPE:
+ case CMDGTETYPE:
+ case CMDLTETYPE:
+ fprintf(fp, "%s", cmd->Name);
+ if(ValFlag) fprintf(fp, ": %d", *(int *)cmd->Val);
+ fprintf(fp, "\n");
+ break;
+ case CMDSTRINGTYPE:
+ fprintf(fp, "%s", cmd->Name);
+ if(ValFlag) {
+ if(*(char **)cmd->Val) {
+ fprintf(fp, ": \"%s\"", *(char **)cmd->Val);
+ } else {
+ fprintf(fp, ": %s", "NULL");
+ }
+ }
+ fprintf(fp, "\n");
+ break;
+ case CMDSTRARRAYTYPE:
+ PrintStrArray(cmd, ValFlag, fp);
+ break;
+ default:
+ fprintf(stderr, "%s: %s %d %s \"%s\"\n",
+ "PrintParam",
+ "Unknown Type",
+ cmd->Type,
+ "for parameter",
+ cmd->Name);
+ exit(1);
+ }
+ return 0;
+}
+
+static char *GetLine(fp, n, Line)
+FILE *fp;
+int n;
+char *Line;
+{
+ int j,
+ l,
+ offs=0;
+
+ for(;;) {
+ if(!fgets(Line+offs, n-offs, fp)) {
+ return NULL;
+ }
+ if(Line[offs]=='#') continue;
+ l = strlen(Line+offs)-1;
+ Line[offs+l] = 0;
+ for(j=offs; Line[j] && isspace(Line[j]); j++, l--)
+ ;
+ if(l<1) continue;
+ if(j > offs) {
+ char *s = Line+offs,
+ *q = Line+j;
+
+ while((*s++=*q++))
+ ;
+ }
+ if(Line[offs+l-1]=='\\') {
+ offs += l;
+ Line[offs-1] = ' ';
+ } else {
+ break;
+ }
+ }
+ return Line;
+}
+
+static int Scan(ProgName, cmds, Line)
+char *ProgName,
+ *Line;
+Cmd_T *cmds;
+{
+ char *q,
+ *p;
+ int i,
+ hl,
+ HasToMatch = FALSE,
+ c0,
+ c;
+
+ p = Line+strspn(Line, SepString);
+ if(!(hl=strcspn(p, SepString))) {
+ return 0;
+ }
+ if((q=strchr(p, '/')) && q-p<hl) {
+ *q = 0;
+ if(strcmp(p, ProgName)) {
+ *q = '/';
+ return 0;
+ }
+ *q = '/';
+ HasToMatch=TRUE;
+ p = q+1;
+ }
+ if(!(hl = strcspn(p, SepString))) {
+ return 0;
+ }
+ c0 = p[hl];
+ p[hl] = 0;
+ for(i=0, c=1; cmds[i].Name&&(c=strcmp(cmds[i].Name, p))<0; i++)
+ ;
+ p[hl] = c0;
+ if(!c) return SetParam(cmds+i, p+hl+strspn(p+hl, SepString));
+ return HasToMatch && c;
+}
+
+static int SetParam(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+ if(!*s && cmd->Type != CMDSTRINGTYPE) {
+ fprintf(stderr,
+ "WARNING: No value specified for parameter \"%s\"\n",
+ cmd->Name);
+ return 0;
+ }
+ switch(cmd->Type) {
+ case CMDDOUBLETYPE:
+ if(sscanf(s, "%lf", (double*)cmd->Val)!=1) {
+ fprintf(stderr,
+ "Float value required for parameter \"%s\"\n",
+ cmd->Name);
+ exit(1);
+ }
+ break;
+ case CMDENUMTYPE:
+ SetEnum(cmd, s);
+ break;
+ case CMDINTTYPE:
+ if(sscanf(s, "%d", (int*)cmd->Val)!=1) {
+ fprintf(stderr,
+ "Integer value required for parameter \"%s\"\n",
+ cmd->Name);
+ exit(1);
+ }
+ break;
+ case CMDSTRINGTYPE:
+ *(char **)cmd->Val = (strcmp(s, "<NULL>") && strcmp(s, "NULL"))
+ ? strdup(s)
+ : 0;
+ break;
+ case CMDSTRARRAYTYPE:
+ SetStrArray(cmd, s);
+ break;
+ case CMDGTETYPE:
+ SetGte(cmd, s);
+ break;
+ case CMDLTETYPE:
+ SetLte(cmd, s);
+ break;
+ case CMDSUBRANGETYPE:
+ SetSubrange(cmd, s);
+ break;
+ default:
+ fprintf(stderr, "%s: %s %d %s \"%s\"\n",
+ "SetParam",
+ "Unknown Type",
+ cmd->Type,
+ "for parameter",
+ cmd->Name);
+ exit(1);
+ }
+ cmd->ArgStr = strdup(s);
+ return 0;
+}
+
+static int SetEnum(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+ Enum_T *en;
+
+ for(en=(Enum_T *)cmd->p; en->Name; en++) {
+ if(*en->Name && !strcmp(s, en->Name)) {
+ *(int *) cmd->Val = en->Idx;
+ return 0;
+ }
+ }
+ return EnumError(cmd, s);
+}
+
+static int SetSubrange(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+ int n;
+
+ if(sscanf(s, "%d", &n)!=1) {
+ fprintf(stderr,
+ "Integer value required for parameter \"%s\"\n",
+ cmd->Name);
+ exit(1);
+ }
+ if(n < *(int *)cmd->p || n > *((int *)cmd->p+1)) {
+ return SubrangeError(cmd, n);
+ }
+ *(int *)cmd->Val = n;
+ return 0;
+}
+
+static int SetGte(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+ int n;
+
+ if(sscanf(s, "%d", &n)!=1) {
+ fprintf(stderr,
+ "Integer value required for parameter \"%s\"\n",
+ cmd->Name);
+ exit(1);
+ }
+ if(n<*(int *)cmd->p) {
+ return GteError(cmd, n);
+ }
+ *(int *)cmd->Val = n;
+ return 0;
+}
+
+static int SetStrArray(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+ *(char***)cmd->Val = str2array(s, (char*)cmd->p);
+ return 0;
+}
+
+static int SetLte(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+ int n;
+
+ if(sscanf(s, "%d", &n)!=1) {
+ fprintf(stderr,
+ "Integer value required for parameter \"%s\"\n",
+ cmd->Name);
+ exit(1);
+ }
+ if(n > *(int *)cmd->p) {
+ return LteError(cmd, n);
+ }
+ *(int *)cmd->Val = n;
+ return 0;
+}
+
+static int EnumError(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+ Enum_T *en;
+
+ fprintf(stderr,
+ "Invalid value \"%s\" for parameter \"%s\"\n", s, cmd->Name);
+ fprintf(stderr, "Valid values are:\n");
+ for(en=(Enum_T *)cmd->p; en->Name; en++) {
+ if(*en->Name) {
+ fprintf(stderr, " %s\n", en->Name);
+ }
+ }
+ fprintf(stderr, "\n");
+ exit(1);
+}
+
+static int GteError(cmd, n)
+Cmd_T *cmd;
+int n;
+{
+ fprintf(stderr,
+ "Value %d out of range for parameter \"%s\"\n", n, cmd->Name);
+ fprintf(stderr, "Valid values must be greater than or equal to %d\n",
+ *(int *)cmd->p);
+ exit(1);
+}
+
+static int LteError(cmd, n)
+Cmd_T *cmd;
+int n;
+{
+ fprintf(stderr,
+ "Value %d out of range for parameter \"%s\"\n", n, cmd->Name);
+ fprintf(stderr, "Valid values must be less than or equal to %d\n",
+ *(int *)cmd->p);
+ exit(1);
+}
+
+static int SubrangeError(cmd, n)
+Cmd_T *cmd;
+int n;
+{
+ fprintf(stderr,
+ "Value %d out of range for parameter \"%s\"\n", n, cmd->Name);
+ fprintf(stderr, "Valid values range from %d to %d\n",
+ *(int *)cmd->p, *((int *)cmd->p+1));
+ exit(1);
+}
+
+static int PrintEnum(cmd, ValFlag, fp)
+Cmd_T *cmd;
+int ValFlag;
+FILE *fp;
+{
+ Enum_T *en;
+
+ fprintf(fp, "%s", cmd->Name);
+ if(ValFlag) {
+ for(en=(Enum_T *)cmd->p; en->Name; en++) {
+ if(*en->Name && en->Idx==*(int *)cmd->Val) {
+ fprintf(fp, ": %s", en->Name);
+ }
+ }
+ }
+ fprintf(fp, "\n");
+ return 0;
+}
+
+static int PrintStrArray(cmd, ValFlag, fp)
+Cmd_T *cmd;
+int ValFlag;
+FILE *fp;
+{
+ char *indent,
+ **s = *(char***)cmd->Val;
+ int l = 4+strlen(cmd->Name);
+
+ fprintf(fp, "%s", cmd->Name);
+ indent = malloc(l+2);
+ memset(indent, ' ', l+1);
+ indent[l+1] = 0;
+ if(ValFlag) {
+ fprintf(fp, ": %s", s ? (*s ? *s++ : "NULL") : "");
+ if(s) while(*s) {
+ fprintf(fp, "\n%s %s", indent, *s++);
+ }
+ }
+ free(indent);
+ fprintf(fp, "\n");
+ return 0;
+}
+
+static char **str2array(s, sep)
+char *s,
+ *sep;
+{
+ char *p,
+ **a;
+ int n = 0,
+ l;
+
+ if(!sep) sep = SepString;
+ p = s += strspn(s, sep);
+ while(*p) {
+ p += strcspn(p, sep);
+ p += strspn(p, sep);
+ ++n;
+ }
+ a = calloc(n+1, sizeof(char *));
+ p = s;
+ n = 0;
+ while(*p) {
+ l = strcspn(p, sep);
+ a[n] = malloc(l+1);
+ memcpy(a[n], p, l);
+ a[n][l] = 0;
+ ++n;
+ p += l;
+ p += strspn(p, sep);
+ }
+ return a;
+}
diff --git a/irstlm/src/src/cmd.h b/irstlm/src/src/cmd.h
new file mode 100644
index 000000000..708905f6f
--- /dev/null
+++ b/irstlm/src/src/cmd.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#if !defined(CMD_H)
+
+#define CMD_H
+
+#define CMDDOUBLETYPE 1
+#define CMDENUMTYPE 2
+#define CMDINTTYPE 3
+#define CMDSTRINGTYPE 4
+#define CMDSUBRANGETYPE 5
+#define CMDGTETYPE 6
+#define CMDLTETYPE 7
+#define CMDSTRARRAYTYPE 8
+#define CMDBOOLTYPE 9
+
+typedef struct {
+ char *Name;
+ int Idx;
+} Enum_T;
+
+typedef struct {
+ int Type;
+ char *Name,
+ *ArgStr;
+ void *Val,
+ *p;
+} Cmd_T;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__STDC__)
+int DeclareParams(char *, ...);
+#else
+int DeclareParams();
+#endif
+
+int GetParams(int *n, char ***a,char *CmdFileName),
+ SPrintParams(),
+ PrintParams();
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+
+
diff --git a/irstlm/src/src/compile-lm.cpp b/irstlm/src/src/compile-lm.cpp
new file mode 100644
index 000000000..09f01fc94
--- /dev/null
+++ b/irstlm/src/src/compile-lm.cpp
@@ -0,0 +1,221 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit, compile LM
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+using namespace std;
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include "util.h"
+#include "math.h"
+#include "lmtable.h"
+
+
+/* GLOBAL OPTIONS ***************/
+
+std::string stxt = "no";
+std::string seval = "";
+std::string sdebug = "0";
+std::string ssubdict = "";
+
+/********************************/
+
+void usage(const char *msg = 0) {
+ if (msg) { std::cerr << msg << std::endl; }
+ std::cerr << "Usage: compile-lm [options] input-file.lm [output-file.blm]" << std::endl;
+ if (!msg) std::cerr << std::endl
+ << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl
+ << " a compiled representation that the IRST LM toolkit can quickly" << std::endl
+ << " read and process. LM file can be compressed with gzip." << std::endl << std::endl;
+ std::cerr << "Options:\n"
+ << "--text=[yes|no] -t=[yes|no] (output is again in text format)\n"
+ << "--eval=text-file -e=text-file (computes perplexity of text-file and returns)\n"
+ << "--debug=1 -d=1 (verbose output for --eval option)\n"
+ << "--subdict=text-file --sd=tex-file (limits LM to include only words in text-file)\n" ;
+}
+
+bool starts_with(const std::string &s, const std::string &pre) {
+ if (pre.size() > s.size()) return false;
+
+ if (pre == s) return true;
+ std::string pre_equals(pre+'=');
+ if (pre_equals.size() > s.size()) return false;
+ return (s.substr(0,pre_equals.size()) == pre_equals);
+}
+
+std::string get_param(const std::string& opt, int argc, const char **argv, int& argi)
+{
+ std::string::size_type equals = opt.find_first_of('=');
+ if (equals != std::string::npos && equals < opt.size()-1) {
+ return opt.substr(equals+1);
+ }
+ std::string nexto;
+ if (argi + 1 < argc) {
+ nexto = argv[++argi];
+ } else {
+ usage((opt + " requires a value!").c_str());
+ exit(1);
+ }
+ return nexto;
+}
+
+void handle_option(const std::string& opt, int argc, const char **argv, int& argi)
+{
+ if (opt == "--help" || opt == "-h") { usage(); exit(1); }
+ if (starts_with(opt, "--text") || starts_with(opt, "-t"))
+ stxt = get_param(opt, argc, argv, argi);
+ else
+ if (starts_with(opt, "--eval") || starts_with(opt, "-e"))
+ seval = get_param(opt, argc, argv, argi);
+ else
+ if (starts_with(opt, "--debug") || starts_with(opt, "-d"))
+ sdebug = get_param(opt, argc, argv, argi);
+
+ else
+ if (starts_with(opt, "--subdict") || starts_with(opt, "-sd"))
+ ssubdict = get_param(opt, argc, argv, argi);
+
+ else {
+ usage(("Don't understand option " + opt).c_str());
+ exit(1);
+ }
+}
+
+int main(int argc, const char **argv)
+{
+
+ if (argc < 2) { usage(); exit(1); }
+ std::vector<std::string> files;
+ for (int i=1; i < argc; i++) {
+ std::string opt = argv[i];
+ if (opt[0] == '-') { handle_option(opt, argc, argv, i); }
+ else files.push_back(opt);
+ }
+ if (files.size() > 2) { usage("Too many arguments"); exit(1); }
+ if (files.size() < 1) { usage("Please specify a LM file to read from"); exit(1); }
+
+ bool textoutput = (stxt == "yes"? true : false);
+ int debug = atoi(sdebug.c_str());
+
+ std::string infile = files[0];
+ std::string outfile="";
+
+ if (files.size() == 1) {
+ outfile=infile;
+
+ //remove path information
+ std::string::size_type p = outfile.rfind('/');
+ if (p != std::string::npos && ((p+1) < outfile.size()))
+ outfile.erase(0,p+1);
+
+ //eventually strip .gz
+ if (outfile.compare(outfile.size()-3,3,".gz")==0)
+ outfile.erase(outfile.size()-3,3);
+
+ outfile+=(textoutput?".lm":".blm");
+ }
+ else
+ outfile = files[1];
+
+ cerr << "outfile: " << outfile << std::endl;
+
+
+ lmtable lmt;
+
+
+ if (ssubdict == ""){
+
+ std::cout << "Reading " << infile << "..." << std::endl;
+ inputfilestream inp(infile.c_str());
+ if (!inp.good()) {
+ std::cerr << "Failed to open " << infile << "!\n";
+ exit(1);
+ }
+ lmt.load(inp);
+
+ }
+ else{ //load reduced LM from a binary LM file!
+ lmt.dict->generate((char *)ssubdict.c_str());
+ //eventually add OOV code so that it can be found in the table
+ lmt.dict->genoovcode(); //generate OOV code
+ //filter table from the large binary table on disk
+ lmt.filter2(infile.c_str(),1);
+ //regenerate OOV code as dictionary has been rebuild
+ lmt.dict->genoovcode();
+ }
+
+ if (seval != ""){
+ std::cerr << "Start Eval\n";
+ std::cerr << "OOV code: " << lmt.dict->oovcode() << "\n";
+ ngram ng(lmt.dict);
+ std::cout.setf(ios::fixed);
+ std::cout.precision(2);
+ if (debug>1) std::cout.precision(8);
+ std::fstream inptxt(seval.c_str(),std::ios::in);
+
+ int Nbo=0,Nw=0,Noov=0;
+ double logPr=0,PP=0,PPwp=0,Pr;
+
+ int bos=ng.dict->encode(ng.dict->BoS());
+
+#ifdef TRACE_CACHE
+ lmt.init_probcache();
+#endif
+
+ while(inptxt >> ng){
+
+ if (ng.size>lmt.maxlevel()) ng.size=lmt.maxlevel();
+
+ // reset ngram at begin of sentence
+ if (*ng.wordp(1)==bos) continue;
+
+ lmt.bo_state(0);
+ if (ng.size>=1){
+ logPr+=(Pr=lmt.clprob(ng));
+ if (debug>1)
+ std::cout << ng << "[" << ng.size << "-gram]" << " " << Pr << "\n";
+
+ if (*ng.wordp(1) == lmt.dict->oovcode()) Noov++;
+ Nw++; if (lmt.bo_state()) Nbo++;
+ }
+
+ }
+
+ PP=exp((-logPr * log(10.0)) /Nw);
+ PPwp= PP * exp(Noov * log(10000000.0-lmt.dict->size())/Nw);
+
+ std::cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp
+ << " Nbo=" << Nbo << " Noov=" << Noov
+ << " OOV=" << (float)Noov/Nw * 100.0 << "%\n";
+
+ return 0;
+ };
+
+ std::cout << "Saving to " << outfile << std::endl;
+ if (textoutput)
+ lmt.savetxt(outfile.c_str());
+ else
+ lmt.savebin(outfile.c_str());
+
+ return 0;
+}
+
diff --git a/irstlm/src/src/dictionary.cpp b/irstlm/src/src/dictionary.cpp
new file mode 100644
index 000000000..44627c1c6
--- /dev/null
+++ b/irstlm/src/src/dictionary.cpp
@@ -0,0 +1,425 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#include <iomanip>
+#include <iostream>
+#include <fstream>
+#include "mempool.h"
+#include "htable.h"
+#include "dictionary.h"
+#include "index.h"
+
+using namespace std;
+
+dictionary::dictionary(char *filename,int size,char* isymb,char* oovlexfile){
+
+ // unitialized memory
+ if (oovlexfile!=NULL)
+ oovlex=new dictionary(oovlexfile,size,isymb,NULL);
+ else
+ oovlex=(dictionary *)NULL;
+
+ htb = new htable(size/LOAD_FACTOR);
+ tb = new dict_entry[size];
+ st = new strstack(size * 10);
+
+ for (int i=0;i<size;i++) tb[i].freq=0;
+
+ is=(char*) NULL;
+ intsymb(isymb);
+
+ oov_code = -1;
+ in_oov_lex=0;
+ n = 0;
+ N = 0;
+ dubv = 0;
+ lim = size;
+ ifl=0; //increment flag
+
+ if (filename==NULL) return;
+
+ std::ifstream inp(filename,ios::in);
+
+ if (!inp){
+ cerr << "cannot open " << filename << "\n";
+ exit(1);
+ }
+
+ char buffer[100];
+
+ inp >> setw(100) >> buffer;
+
+ inp.close();
+
+ if ((strncmp(buffer,"dict",4)==0) ||
+ (strncmp(buffer,"DICT",4)==0))
+ load(filename);
+ else
+ generate(filename);
+
+ cerr << "loaded \n";
+
+
+}
+
+
+
+void dictionary::generate(char *filename){
+
+ char buffer[MAX_WORD];
+ int k;
+
+ ifstream inp(filename,ios::in);
+
+ if (!inp){
+ cerr << "cannot open " << filename << "\n";
+ exit(1);
+ }
+
+ cerr << "dict:";
+
+ ifl=1; k=0;
+ while (inp >> setw(MAX_WORD) >> buffer){
+
+ if (strlen(buffer)==(MAX_WORD-1)){
+ cerr << "dictionary: a too long word was read ("
+ << buffer << ")\n";
+ };
+
+
+ if (strlen(buffer)==0){
+ cerr << "zero lenght word!\n";
+ continue;
+ }
+
+ //if (is && (strlen(buffer)==1) && !index(is,buffer[0]))
+ if (is && (strlen(buffer)==1) && (index(is,buffer[0])!=NULL))
+ continue; //skip over the interruption symbol
+
+ incfreq(encode(buffer),1);
+
+ if (!(++k % 1000000)) cerr << ".";
+ }
+ ifl=0;
+ cerr << "\n";
+
+ inp.close();
+
+}
+
+void dictionary::load(char* filename){
+ char header[100];
+ char buffer[MAX_WORD];
+ char *addr;
+ int freqflag=0;
+
+ ifstream inp(filename,ios::in);
+
+ if (!inp){
+ cerr << "\ncannot open " << filename << "\n";
+ exit(1);
+ }
+
+ cerr << "dict:";
+
+ inp.getline(header,100);
+ if (strncmp(header,"DICT",4)==0)
+ freqflag=1;
+ else
+ if (strncmp(header,"dict",4)!=0){
+ cerr << "\ndictionary file " << filename << " has a wrong header\n";
+ exit(1);
+ }
+
+
+ while (inp >> setw(MAX_WORD) >> buffer){
+
+ if (strlen(buffer)==(MAX_WORD-1)){
+ cerr << "\ndictionary: a too long word was read ("
+ << buffer << ")\n";
+ exit(1);
+ };
+
+ tb[n].word=st->push(buffer);
+ tb[n].code=n;
+
+ if (freqflag)
+ inp >> tb[n].freq;
+ else
+ tb[n].freq=0;
+
+ if ((addr=htb->search((char *)&tb[n].word,HT_ENTER)))
+ if (addr!=(char *)&tb[n].word){
+ cerr << "dictionary::loadtxt wrong entry was found ("
+ << buffer << ") in position " << n << "\n";
+ exit(1);
+ }
+
+ N+=tb[n].freq;
+
+ if (strcmp(buffer,OOV())==0) oov_code=n;
+
+ if (++n==lim) grow();
+
+ }
+
+ inp.close();
+}
+
+
+void dictionary::load(std::istream& inp){
+
+ char buffer[MAX_WORD];
+ char *addr;
+ int size;
+
+ inp >> size;
+
+ for (int i=0;i<size;i++){
+
+ inp >> setw(MAX_WORD) >> buffer;
+
+ if (strlen(buffer)==MAX_WORD-1){
+ cerr << "\ndictionary::load found word exceeding max length ("
+ << MAX_WORD << ")" << buffer << "\n";
+ exit(1);
+ };
+
+ tb[n].word=st->push(buffer);
+ tb[n].code=n;
+ inp >> tb[n].freq;
+ N+=tb[n].freq;
+
+ if ((addr=htb->search((char *)&tb[n].word,HT_ENTER)))
+ if (addr!=(char *)&tb[n].word){
+ cerr << "dictionary::loadtxt wrong entry was found ("
+ << buffer << ") in position " << n << "\n";
+ exit(1);
+ }
+
+ if (strcmp(tb[n].word,OOV())==0)
+ oov_code=n;
+
+ if (++n==lim) grow();
+ }
+ inp.getline(buffer,MAX_WORD-1);
+}
+
+void dictionary::save(std::ostream& out){
+ out << n << "\n";
+ for (int i=0;i<n;i++)
+ out << tb[i].word << " " << tb[i].freq << "\n";
+}
+
+
+int cmpdictentry(const void *a,const void *b){
+ dict_entry *ae=(dict_entry *)a;
+ dict_entry *be=(dict_entry *)b;
+ return be->freq-ae->freq;
+}
+
+dictionary::dictionary(dictionary* d){
+
+ //transfer values
+
+ n=d->n; //total entries
+ N=d->N; //total frequency
+ lim=d->lim; //limit of entries
+ oov_code=-1; //code od oov must be re-defined
+ ifl=0; //increment flag=0;
+ dubv=d->dubv; //dictionary upperbound transferred
+ in_oov_lex=0; //does not copy oovlex;
+
+
+ //creates a sorted copy of the table
+
+ tb = new dict_entry[lim];
+ htb = new htable(lim/LOAD_FACTOR);
+ st = new strstack(lim * 10);
+
+ for (int i=0;i<n;i++){
+ tb[i].code=d->tb[i].code;
+ tb[i].freq=d->tb[i].freq;
+ tb[i].word=st->push(d->tb[i].word);
+ }
+
+ //sort all entries according to frequency
+ cerr << "sorting dictionary ...";
+ qsort(tb,n,sizeof(dict_entry),cmpdictentry);
+ cerr << "done\n";
+
+ for (int i=0;i<n;i++){
+
+ //eventually re-assign oov code
+ if (d->oov_code==tb[i].code) oov_code=i;
+
+ tb[i].code=i;
+ htb->search((char *)&tb[i].word,HT_ENTER);
+ };
+
+}
+
+
+
+dictionary::~dictionary(){
+ delete htb;
+ delete st;
+ delete [] tb;
+}
+
+void dictionary::stat(){
+ cout << "dictionary class statistics\n";
+ cout << "size " << n
+ << " used memory "
+ << (lim * sizeof(int) +
+ htb->used() +
+ st->used())/1024 << " Kb\n";
+}
+
+void dictionary::grow(){
+
+ delete htb;
+
+ cerr << "+\b";
+
+ dict_entry *tb2=new dict_entry[lim+GROWTH_STEP];
+
+ memcpy(tb2,tb,sizeof(dict_entry) * lim );
+
+ delete [] tb; tb=tb2;
+
+ htb=new htable((lim+GROWTH_STEP)/LOAD_FACTOR);
+
+ for (int i=0;i<lim;i++)
+
+ htb->search((char *)&tb[i].word,HT_ENTER);
+
+ for (int i=lim;i<lim+GROWTH_STEP;i++) tb[i].freq=0;
+
+ lim+=GROWTH_STEP;
+
+
+}
+
+void dictionary::save(char *filename,int freqflag){
+
+ std::ofstream out(filename,ios::out);
+
+ if (!out){
+ cerr << "cannot open " << filename << "\n";
+ }
+
+ // header
+ if (freqflag)
+ out << "DICTIONARY 0 " << n << "\n";
+ else
+ out << "dictionary 0 " << n << "\n";
+
+ for (int i=0;i<n;i++){
+ out << tb[i].word;
+ if (freqflag)
+ out << " " << tb[i].freq;
+ out << "\n";
+ }
+
+ out.close();
+}
+
+
+int dictionary::getcode(const char *w){
+ dict_entry* ptr=(dict_entry *)htb->search((char *)&w,HT_FIND);
+ if (ptr==NULL) return -1;
+ return ptr->code;
+}
+
+int dictionary::encode(const char *w){
+
+ //case of strange characters
+ if (strlen(w)==0){cerr << "0";w=OOV();}
+
+ dict_entry* ptr;
+
+ if ((ptr=(dict_entry *)htb->search((char *)&w,HT_FIND))!=NULL)
+ return ptr->code;
+ else{
+ if (!ifl){ //do not extend dictionary
+ if (oov_code==-1){ //did not use OOV yet
+ cerr << "starting to use OOV words [" << w << "]\n";
+ tb[n].word=st->push(OOV());
+ htb->search((char *)&tb[n].word,HT_ENTER);
+ tb[n].code=n;
+ tb[n].freq=0;
+ oov_code=n;
+ if (++n==lim) grow();
+ }
+ //if there is an oov lexicon, check if this word belongs to
+ dict_entry* oovptr;
+ if (oovlex){
+ if ((oovptr=(dict_entry *)oovlex->htb->search((char *)&w,HT_FIND))!=NULL){
+ in_oov_lex=1;
+ oov_lex_code=oovptr->code;
+ }else
+ in_oov_lex=0;
+ }
+ return encode(OOV());
+ }
+ else{ //extend dictionary
+ tb[n].word=st->push((char *)w);
+ htb->search((char *)&tb[n].word,HT_ENTER);
+ tb[n].code=n;
+ tb[n].freq=0;
+ if (++n==lim) grow();
+ return n-1;
+ }
+ }
+}
+
+
+char *dictionary::decode(int c){
+ if (c>=0 && c < n)
+ return tb[c].word;
+ else{
+ cerr << "decode: code out of boundary\n";
+ return OOV();
+ }
+}
+
+
+dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict) {
+ m_dict->htb->scan(HT_INIT);
+}
+
+dict_entry* dictionary_iter::next() {
+ return (dict_entry*)m_dict->htb->scan(HT_CONT);
+}
+
+
+
+
+
+/*
+main(int argc,char **argv){
+ dictionary d(argv[1],40000);
+ d.stat();
+ cout << "ROMA" << d.decode(0) << "\n";
+ cout << "ROMA:" << d.encode("ROMA") << "\n";
+ d.save(argv[2]);
+}
+*/
diff --git a/irstlm/src/src/dictionary.h b/irstlm/src/src/dictionary.h
new file mode 100644
index 000000000..be625e810
--- /dev/null
+++ b/irstlm/src/src/dictionary.h
@@ -0,0 +1,186 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#ifndef MF_DICTIONARY_H
+#define MF_DICTIONARY_H
+
+#include <cstring>
+#include <iostream>
+
+#define MAX_WORD 1000
+#define LOAD_FACTOR 5
+
+#ifndef GROWTH_STEP
+#define GROWTH_STEP 100000
+#endif
+
+#ifndef DICT_INITSIZE
+#define DICT_INITSIZE 100000
+#endif
+
+//Begin of sentence symbol
+#ifndef BOS_
+#define BOS_ "<s>"
+#endif
+
+
+//End of sentence symbol
+#ifndef EOS_
+#define EOS_ "</s>"
+#endif
+
+//Out-Of-Vocabulary symbol
+#ifndef OOV_
+#define OOV_ "_unk_"
+#endif
+
+typedef struct{
+ char *word;
+ int code;
+ int freq;
+}dict_entry;
+
+class strstack;
+class htable;
+
+class dictionary{
+ strstack *st; //!< stack of strings
+ dict_entry *tb; //!< entry table
+ htable *htb; //!< hash table
+ int n; //!< number of entries
+ int N; //!< total frequency
+ int lim; //!< limit of entries
+ int oov_code; //!< code assigned to oov words
+ char* is; //!< interruption symbol list
+ char ifl; //!< increment flag
+ int dubv; //!< dictionary size upper bound
+ int in_oov_lex; //!< flag
+ int oov_lex_code; //!< dictionary
+ char* oov_str; //!< oov string
+
+ public:
+
+ friend class dictionary_iter;
+
+ dictionary* oovlex; //<! additional dictionary
+
+ inline int dub(){return dubv;}
+ inline int dub(int value){return (dubv=value);}
+
+ inline char *OOV(){return (OOV_);}
+ inline char *BoS(){return (BOS_);}
+ inline char *EoS(){return (EOS_);}
+
+ inline int oovcode(int v=-1){return oov_code=(v>=0?v:oov_code);}
+
+ inline char *intsymb(char* isymb=NULL){
+ if (isymb==NULL) return is;
+ if (is!=NULL) delete [] is;
+ is=new char[strlen(isymb+1)];
+ strcpy(is,isymb);
+ return is=isymb;
+ }
+
+ inline int incflag(){return ifl;}
+ inline int incflag(int v){return ifl=v;}
+ inline int oovlexsize(){return oovlex?oovlex->n:0;}
+ inline int inoovlex(){return in_oov_lex;}
+ inline int oovlexcode(){return oov_lex_code;}
+
+
+ int isprintable(char* w){
+ char buffer[MAX_WORD];
+ sprintf(buffer,"%s",w);
+ return strcmp(w,buffer)==0;
+ }
+
+ inline void genoovcode(){
+ int c=encode(OOV());
+ std::cerr << "OOV code is "<< c << std::endl;
+ oovcode(c);
+ }
+
+ inline dictionary* oovlexp(char *fname=NULL){
+ if (fname==NULL) return oovlex;
+ if (oovlex!=NULL) delete oovlex;
+ oovlex=new dictionary(fname,DICT_INITSIZE);
+ return oovlex;
+ }
+
+ inline int setoovrate(double oovrate){
+ encode(OOV()); //be sure OOV code exists
+ int oovfreq=(int)(oovrate * totfreq());
+ std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl;
+ return freq(oovcode(),oovfreq);
+ }
+
+
+ inline int incfreq(int code,int value){N+=value;return tb[code].freq+=value;}
+
+ inline int multfreq(int code,double value){
+ N+=(int)(value * tb[code].freq)-tb[code].freq;
+ return tb[code].freq=(int)(value * tb[code].freq);
+ }
+
+ inline int freq(int code,int value=-1){
+ if (value>=0){
+ N+=value-tb[code].freq;
+ tb[code].freq=value;
+ }
+ return tb[code].freq;
+ }
+
+ inline int totfreq(){return N;}
+
+ void grow();
+ //dictionary(int size=400,char* isym=NULL,char* oovlex=NULL);
+ dictionary(char *filename=NULL,int size=DICT_INITSIZE,char* isymb=NULL,char* oovlex=NULL);
+ dictionary(dictionary* d);
+
+ ~dictionary();
+ void generate(char *filename);
+ void load(char *filename);
+ void save(char *filename,int freqflag=0);
+ void load(std::istream& fd);
+ void save(std::ostream& fd);
+
+ int size(){return n;};
+ int getcode(const char *w);
+ int encode(const char *w);
+ char *decode(int c);
+ void stat();
+
+ void cleanfreq(){
+ for (int i=0;i<n;tb[i++].freq=0);
+ N=0;
+ }
+
+};
+
+class dictionary_iter {
+ public:
+ dictionary_iter(dictionary *dict);
+ dict_entry* next();
+ private:
+ dictionary* m_dict;
+};
+
+#endif
+
diff --git a/irstlm/src/src/gzfilebuf.h b/irstlm/src/src/gzfilebuf.h
new file mode 100644
index 000000000..5e215935a
--- /dev/null
+++ b/irstlm/src/src/gzfilebuf.h
@@ -0,0 +1,80 @@
+#ifndef _GZFILEBUF_H_
+#define _GZFILEBUF_H_
+
+#include <streambuf>
+#include <zlib.h>
+
+class gzfilebuf : public std::streambuf {
+public:
+ gzfilebuf(const char *filename)
+ { _gzf = gzopen(filename, "rb");
+ setg (_buff+sizeof(int), // beginning of putback area
+ _buff+sizeof(int), // read position
+ _buff+sizeof(int)); // end position
+ }
+ ~gzfilebuf() { gzclose(_gzf); }
+protected:
+ virtual int_type overflow (int_type c) {
+ throw;
+ }
+
+ // write multiple characters
+ virtual
+ std::streamsize xsputn (const char* s,
+ std::streamsize num) {
+ throw;
+ }
+
+ virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ){ throw;
+ }
+
+ //read one character
+ virtual int_type underflow () {
+ // is read position before end of _buff?
+ if (gptr() < egptr()) {
+ return traits_type::to_int_type(*gptr());
+ }
+
+ /* process size of putback area
+ * - use number of characters read
+ * - but at most four
+ */
+ unsigned int numPutback = gptr() - eback();
+ if (numPutback > sizeof(int)) {
+ numPutback = sizeof(int);
+ }
+
+ /* copy up to four characters previously read into
+ * the putback _buff (area of first four characters)
+ */
+ std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
+ numPutback);
+
+ // read new characters
+ int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
+ if (num <= 0) {
+ // ERROR or EOF
+ return EOF;
+ }
+
+ // reset _buff pointers
+ setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
+ _buff+sizeof(int), // read position
+ _buff+sizeof(int)+num); // end of buffer
+
+ // return next character
+ return traits_type::to_int_type(*gptr());
+ }
+
+ std::streamsize xsgetn (char* s,
+ std::streamsize num) {
+ return gzread(_gzf,s,num);
+ }
+
+private:
+ gzFile _gzf;
+ static const unsigned int _buffsize = 1024;
+ char _buff[_buffsize];
+};
+
+#endif
diff --git a/irstlm/src/src/htable.cpp b/irstlm/src/src/htable.cpp
new file mode 100644
index 000000000..73dab4d10
--- /dev/null
+++ b/irstlm/src/src/htable.cpp
@@ -0,0 +1,329 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include "mempool.h"
+#include "htable.h"
+
+//bitwise rotation of unsigned integers
+
+#define rot_right(a,k) (((a) >> k ) | ((a) << (32-(k))))
+#define rot_left(a,k) (((a) << k ) | ((a) >> (32-(k))))
+
+using namespace std;
+
+htable::htable(int n,int kl,HTYPE ht,size_t (*klf)(const char* )){
+
+ if (ht!=STRPTR && ht!=STR && kl==0){
+ cerr << "htable: key length must be specified for non-string entries!";
+ exit(1);
+ }
+
+ memory=new mempool( sizeof(entry) , BlockSize );
+
+ table = new entry* [ size=n ];
+
+ memset(table,0,sizeof(entry *) * n );
+
+ keylen=(ht==INT || ht==INTPTR?kl/sizeof(int):kl);
+
+ htype=ht;
+
+ keys = accesses = collisions = 0;
+
+ keylenfunc=(klf?klf:&strlen);
+
+}
+
+
+char *htable::search(char *item, HT_ACTION action)
+
+{
+ address h;
+ entry *q,**p;
+ int i;
+
+ //if (action == HT_FIND)
+ accesses++;
+
+ h = Hash(item);
+
+ i=(h % size);
+
+ //cout << "htable::search() hash i=" << i << "\n";
+
+ p = &table[h % size];
+
+ q=*p;
+
+ /*
+ ** Follow collision chain
+ */
+
+ while (q != NULL && Comp((char *)q->key,(char *)item))
+ {
+ p = (entry **)&q->next;
+ q=*p;
+ //if (action == HT_FIND)
+ collisions++;
+ }
+
+ if (
+ q != NULL /* found */
+ ||
+ action == HT_FIND /* not found, search only */
+ ||
+ (q = (entry *)memory->allocate())
+ ==
+ NULL /* not found, no room */
+ )
+
+ return((q!=NULL)?(char *)q->key:(char *)NULL);
+
+ *p = q; /* link into chain */
+ /*
+ ** Initialize new element
+ */
+
+ q->key = item;
+ q->next = NULL;
+ keys++;
+
+ return((char *)q->key);
+}
+
+
+char *htable::scan(HT_ACTION action){
+
+ char *k;
+
+ if (action == HT_INIT)
+ {
+ scan_i=0;scan_p=table[0];
+ return NULL;
+ }
+
+ // if scan_p==NULL go to the first non null pointer
+ while ((scan_p==NULL) && (++scan_i<size)) scan_p=table[scan_i];
+
+ if (scan_p!=NULL)
+ {
+ k=scan_p->key;
+ scan_p=(entry *)scan_p->next;
+ return k;
+ };
+
+ return NULL;
+}
+
+
+void htable::map(ostream& co,int cols){
+
+ entry *p;
+ char* img=new char[cols+1];
+
+ img[cols]='\0';
+ memset(img,'.',cols);
+
+ co << "htable memory map: . (0 items), - (<5), # (>5)\n";
+
+ for (int i=0; i<size;i++)
+ {
+ int n=0;p=table[i];
+
+ while(p!=NULL){
+ n++;
+ p=(entry *)p->next;
+ };
+
+ if (i && (i % cols)==0){
+ co << img << "\n";
+ memset(img,'.',cols);
+ }
+
+ if (n>0)
+ img[i % cols]=n<=5?'-':'#';
+
+ }
+
+ img[size % cols]='\0';
+ co << img << "\n";
+
+ delete []img;
+}
+
+
+void htable::stat(){
+ cerr << "htable class statistics\n";
+ cerr << "size " << size
+ << " keys " << keys
+ << " acc " << accesses
+ << " coll " << collisions
+ << " used memory " << used()/1024 << "Kb\n";
+}
+
+htable::~htable()
+{
+ delete [] table;
+ delete memory;
+}
+
+address htable::HashStr(char *key)
+{
+ char *Key=(htype==STRPTR? *(char **)key:key);
+ int length=(keylen?keylen:keylenfunc(Key));
+
+ //cerr << "hash: " << Key << " length:" << length << "\n";
+
+ register address h=0;
+ register int i;
+
+ for (i=0,h=0;i<length;i++)
+ h = h * Prime1 ^ (Key[i] - ' ');
+ h %= Prime2;
+
+ return h;
+}
+
+//Herbert Glarner's "HSH 11/13" hash function.
+/*
+address htable::HashInt(char *key){
+
+int *Key=(htype==INTPTR? *(int **)key:(int *)key);
+
+address state=12345,h=0;
+register int i,j;
+
+int p=7; //precision=8 * sizeof(int)-1, in general must be >=7
+
+ for (i=0,h=0;i<keylen;i++){
+ h = h ^ ((address) Key[i]);
+ for (j=0;j<p;j++){
+ state = rot_left(state,11); //state = state left-rotate 11 bits
+ h = rot_left(h,13); //h = h left-rotate 13 bits
+ h ^= state ; //h = h xor state
+ h = rot_left(h,(state & (address)31)); //h = h left-rotate (state mod 32) bits
+ h = rot_left(h, (h & (address)31)); //h = h left-rotate (h mod 32) bits
+ }
+ }
+
+ return h;
+}
+
+*/
+
+address htable::HashInt(char *key)
+{
+ int *Key=(htype==INTPTR? *(int **)key:(int *)key);
+
+
+ address h;
+ register int i;
+
+ //Thomas Wang's 32 bit Mix Function
+ for (i=0,h=0;i<keylen;i++){
+ h+=Key[i];
+ h += ~(h << 15);
+ h ^= (h >> 10);
+ h += (h << 3);
+ h ^= (h >> 6);
+ h += ~(h << 11);
+ h ^= (h >> 16);
+ };
+
+ return h;
+}
+
+int htable::CompStr(char *key1, char *key2)
+{
+ assert(key1 && key2);
+
+ char *Key1=(htype==STRPTR?*(char **)key1:key1);
+ char *Key2=(htype==STRPTR?*(char **)key2:key2);
+
+ assert(Key1 && Key2);
+
+ int length1=(keylen?keylen:keylenfunc(Key1));
+ int length2=(keylen?keylen:keylenfunc(Key2));
+
+ if (length1!=length2) return 1;
+
+ register int i;
+
+ for (i=0;i<length1;i++)
+ if (Key1[i]!=Key2[i]) return 1;
+ return 0;
+}
+
+int htable::CompInt(char *key1, char *key2)
+{
+ assert(key1 && key2);
+
+ int *Key1=(htype==INTPTR?*(int **)key1:(int*)key1);
+ int *Key2=(htype==INTPTR?*(int **)key2:(int*)key2);
+
+ assert(Key1 && Key2);
+
+ register int i;
+
+ for (i=0;i<keylen;i++)
+ if (Key1[i]!=Key2[i]) return 1;
+ return 0;
+}
+
+
+/*
+main(){
+
+const int n=1000;
+
+htable *ht=new htable(1000/5);
+
+ char w[n][20];
+ char *c;
+
+ for (int i=0;i<n;i++)
+ {
+ sprintf(w[i],"ciao%d",i);
+ ht->search((char *)&w[i],HT_ENTER);
+ }
+
+ for (int i=0;i<n;i++)
+ if (ht->search((char *)&w[i],HT_FIND))
+ cout << w[i] << " trovato\n" ;
+ else
+ cout << w[i] << " non trovato\n";
+
+ ht->stat();
+
+ delete ht;
+ htable *ht2=new htable(n);
+ for (int i=0;i<n;i++)
+ ht2->search((char *)&w[i],HT_ENTER);
+
+ ht2->scan(INIT);
+ cout << "elenco:\n";
+ while ((c=ht2->scan(CONT))!=NULL)
+ cout << *(char **) c << "\n";
+
+ ht2->map();
+}
+*/
diff --git a/irstlm/src/src/htable.h b/irstlm/src/src/htable.h
new file mode 100644
index 000000000..476d45e74
--- /dev/null
+++ b/irstlm/src/src/htable.h
@@ -0,0 +1,130 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#ifndef MF_HTABLE_H
+#define MF_HTABLE_H
+
+#include <iostream>
+
+#define Prime1 37
+#define Prime2 1048583
+#define BlockSize 100
+
+
+// Fast arithmetic, relying on powers of 2,
+// and on pre-processor concatenation property
+
+typedef struct{
+ char* key;
+ char* next; // secret from user
+}entry;
+
+typedef unsigned int address;
+
+typedef enum {HT_FIND, //!< search: find an entry
+ HT_ENTER, //!< search: enter an entry
+ HT_INIT, //!< scan: start scan
+ HT_CONT //!< scan: continue scan
+} HT_ACTION;
+
+typedef enum {
+ STR, //!< string
+ STRPTR, //!< pointer to string
+ INT, //!< pointer to int
+ INTPTR //!< pointer to pointer to int
+}HTYPE;
+
+//! Hash Table for strings
+
+class htable {
+ int size; //!< table size
+ int keylen; //!< key length
+ HTYPE htype; //!< type of entry pointer
+ entry **table; //!< hash table
+ int scan_i; //!< scan support
+ entry *scan_p; //!< scan support
+ // statistics
+ long keys; //!< # of entries
+ long accesses; //!< # of accesses
+ long collisions; //!< # of collisions
+
+ mempool *memory; //!< memory pool
+
+ size_t (*keylenfunc)(const char*); //!< function computing key length
+ address (*hashfunc)(const char*); //!< hash function
+ int (*compfunc)(const char*, const char*); //!< comparison function
+
+ public:
+
+ //! Creates an hash table
+ htable(int n,int kl=0,HTYPE ht=STRPTR,size_t (*klf)(const char* )=NULL);
+
+ //! Destroys an and hash table
+ ~htable();
+
+ //! Computes the hash function
+ address Hash(char *key){
+ switch (htype){
+ case INT:case INTPTR: return HashInt(key);
+ break;
+ case STR:case STRPTR: return HashStr(key);
+ default: exit(1);
+ }
+ };
+ address HashInt(char *key);
+ address HashStr(char *key);
+
+ //! Compares the keys of two entries
+ int Comp(char *Key1,char *Key2){
+ switch (htype){
+ case INT:case INTPTR: return CompInt(Key1,Key2);
+ break;
+ case STR:case STRPTR: return CompStr(Key1,Key2);
+ default: exit(1);
+ };
+ }
+
+ int CompInt(char *Key1,char *Key2);
+ int CompStr(char *Key1,char *Key2);
+
+ //! Searches for an item
+ char *search(char *item, HT_ACTION action);
+
+ //! Scans the content
+ char *scan(HT_ACTION action);
+
+ //! Prints statistics
+ void stat();
+
+ //! Print a map of memory use
+ void map(std::ostream& co=std::cout, int cols=80);
+
+ //! Returns amount of used memory
+ int used(){return
+ size * sizeof(entry **) +
+ memory->used();};
+};
+
+
+
+#endif
+
+
+
diff --git a/irstlm/src/src/index.h b/irstlm/src/src/index.h
new file mode 100644
index 000000000..e4e29d793
--- /dev/null
+++ b/irstlm/src/src/index.h
@@ -0,0 +1,19 @@
+
+
+#pragma once
+
+#ifdef WIN32
+
+inline const char *index(const char *str, char search)
+{
+ size_t i=0;
+ while (i< strlen(str) ){
+ if (str[i]==search) return &str[i];
+ }
+ return NULL;
+}
+
+
+#endif
+
+
diff --git a/irstlm/src/src/lmtable.cpp b/irstlm/src/src/lmtable.cpp
new file mode 100644
index 000000000..05ab115eb
--- /dev/null
+++ b/irstlm/src/src/lmtable.cpp
@@ -0,0 +1,1217 @@
+/******************************************************************************
+IrstLM: IRST Language Model Toolkit
+Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+#include <iostream>
+#include <fstream>
+#include <stdexcept>
+#include <cassert>
+#include "math.h"
+#include "mempool.h"
+#include "htable.h"
+#include "ngramcache.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "lmtable.h"
+#include "util.h"
+
+using namespace std;
+
+inline void error(char* message){
+ cerr << message << "\n";
+ throw std::runtime_error(message);
+}
+
+
+//instantiate an empty lm table
+lmtable::lmtable(){
+
+ configure(1,false);
+
+ dict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL);
+
+ memset(cursize, 0, sizeof(cursize));
+ memset(tbltype, 0, sizeof(tbltype));
+ memset(maxsize, 0, sizeof(maxsize));
+ memset(info, 0, sizeof(info));
+ memset(NumCenters, 0, sizeof(NumCenters));
+
+ max_cache_lev=0;
+ for (int i=0;i<=LMTMAXLEV+1;i++) lmtcache[i]=NULL;
+ probcache=NULL;
+ statecache=NULL;
+
+};
+
+
+//loadstd::istream& inp a lmtable from a lm file
+
+void lmtable::load(istream& inp){
+
+ //give a look at the header to select loading method
+ char header[1024];
+
+ inp >> header; cerr << header << "\n";
+
+ if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0)
+ loadbin(inp,header);
+ else
+ loadtxt(inp,header);
+
+ dict->genoovcode();
+
+
+ cerr << "OOV code is " << dict->oovcode() << "\n";
+
+}
+
+
+
+int parseWords(char *sentence, char **words, int max)
+{
+ char *word;
+ int i = 0;
+
+ char *const wordSeparators = " \t\r\n";
+
+ for (word = strtok(sentence, wordSeparators);
+ i < max && word != 0;
+ i++, word = strtok(0, wordSeparators))
+ {
+ words[i] = word;
+ }
+
+ if (i < max){words[i] = 0;}
+
+ return i;
+}
+
+
+
+//Load a LM as a text file. LM could have been generated either with the
+//IRST LM toolkit or with the SRILM Toolkit. In the latter we are not
+//sure that n-grams are lexically ordered (according to the 1-grams).
+//However, we make the following assumption:
+//"all successors of any prefix are sorted and written in contiguous lines!"
+//This method also loads files processed with the quantization
+//tool: qlm
+
+int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow){
+
+ char* words[1+ LMTMAXLEV + 1 + 1];
+ int howmany;
+ char line[MAX_LINE];
+
+ inp.getline(line,MAX_LINE);
+ if (strlen(line)==MAX_LINE-1){
+ cerr << "parseline: input line exceed MAXLINE ("
+ << MAX_LINE << ") chars " << line << "\n";
+ exit(1);
+ }
+
+ howmany = parseWords(line, words, Order + 3);
+ assert(howmany == (Order+ 1) || howmany == (Order + 2));
+
+ //read words
+ ng.size=0;
+ for (int i=1;i<=Order;i++)
+ ng.pushw(strcmp(words[i],"<unk>")?words[i]:ng.dict->OOV());
+
+ //read logprob/code and logbow/code
+ assert(sscanf(words[0],"%f",&prob));
+ if (howmany==(Order+2))
+ assert(sscanf(words[Order+1],"%f",&bow));
+ else
+ bow=0.0; //this is log10prob=0 for implicit backoff
+
+ return 1;
+}
+
+
+void lmtable::loadcenters(istream& inp,int Order){
+ char line[MAX_LINE];
+
+ //first read the coodebook
+ cerr << Order << " read code book ";
+ inp >> NumCenters[Order];
+ Pcenters[Order]=new float[NumCenters[Order]];
+ Bcenters[Order]=(Order<maxlev?new float[NumCenters[Order]]:NULL);
+
+ for (int c=0;c<NumCenters[Order];c++){
+ inp >> Pcenters[Order][c];
+ if (Order<maxlev) inp >> Bcenters[Order][c];
+ };
+ //empty the last line
+ inp.getline((char*)line,MAX_LINE);
+
+}
+
+
+void lmtable::loadtxt(istream& inp,const char* header){
+
+
+ //open input stream and prepare an input string
+ char line[MAX_LINE];
+
+ //prepare word dictionary
+ //dict=(dictionary*) new dictionary(NULL,1000000,NULL,NULL);
+ dict->incflag(1);
+
+ //put here ngrams, log10 probabilities or their codes
+ ngram ng(dict);
+ float prob,bow;;
+
+ //check the header to decide if the LM is quantized or not
+ isQtable=(strncmp(header,"qARPA",5)==0?true:false);
+
+ //we will configure the table later we we know the maxlev;
+ bool yetconfigured=false;
+
+ cerr << "loadtxt()\n";
+
+ // READ ARPA Header
+ int Order,n;
+
+ while (inp.getline(line,MAX_LINE)){
+
+ if (strlen(line)==MAX_LINE-1){
+ cerr << "lmtable::loadtxt: input line exceed MAXLINE ("
+ << MAX_LINE << ") chars " << line << "\n";
+ exit(1);
+ }
+
+ bool backslash = (line[0] == '\\');
+
+ if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
+ maxsize[Order] = n; maxlev=Order; //upadte Order
+
+ }
+
+ if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
+
+ //at this point we are sure about the size of the LM
+ if (!yetconfigured){
+ configure(maxlev,isQtable);yetconfigured=true;
+ //allocate space for loading the table of this level
+ for (int i=1;i<=maxlev;i++)
+ table[i]= new char[maxsize[i] * nodesize(tbltype[i])];
+ }
+
+ cerr << Order << "-grams: reading ";
+
+ if (isQtable) loadcenters(inp,Order);
+
+ //allocate support vector to manage badly ordered n-grams
+ if (maxlev>1 && Order<maxlev) {
+ startpos[Order]=new int[maxsize[Order]];
+ for (int c=0;c<maxsize[Order];c++) startpos[Order][c]=-1;
+ }
+
+ //prepare to read the n-grams entries
+ cerr << maxsize[Order] << " entries\n";
+
+ //WE ASSUME A WELL STRUCTURED FILE!!!
+
+ for (int c=0;c<maxsize[Order];c++){
+
+ if (parseline(inp,Order,ng,prob,bow))
+ add(ng,
+ (int)(isQtable?prob:*((int *)&prob)),
+ (int)(isQtable?bow:*((int *)&bow)));
+ }
+ // now we can fix table at level Order -1
+ if (maxlev>1 && Order>1) checkbounds(Order-1);
+ }
+ }
+
+ dict->incflag(0);
+ cerr << "done\n";
+
+}
+
+
+//Checkbound with sorting of n-gram table on disk
+
+void lmtable::checkbounds(int level){
+
+ char* tbl=table[level];
+ char* succtbl=table[level+1];
+
+ LMT_TYPE ndt=tbltype[level], succndt=tbltype[level+1];
+ int ndsz=nodesize(ndt), succndsz=nodesize(succndt);
+
+ //re-order table at level+1 on disk
+ //generate random filename to avoid collisions
+ ofstream out;string filePath;
+ createtempfile(out,filePath,ios::out);
+
+ int start,end,newstart;
+
+ //re-order table at level l+1
+ newstart=0;
+ for (int c=0;c<cursize[level];c++){
+ start=startpos[level][c]; end=bound(tbl+c*ndsz,ndt);
+ //is start==-1 there are no successors for this entry and end==-2
+ if (end==-2) end=start;
+ assert(start<=end);
+ assert(newstart+(end-start)<=cursize[level+1]);
+ assert(end<=cursize[level+1]);
+
+ if (start<end)
+ out.write((char*)(succtbl + start * succndsz),(end-start) * succndsz);
+
+ bound(tbl+c*ndsz,ndt,newstart+(end-start));
+ newstart+=(end-start);
+ }
+ out.close();
+
+ fstream inp(filePath.c_str(),ios::in);
+
+ inp.read(succtbl,cursize[level+1]*succndsz);
+ inp.close();
+ removefile(filePath);
+
+}
+
+//Add method inserts n-grams in the table structure. It is ONLY used during
+//loading of LMs in text format. It searches for the prefix, then it adds the
+//suffix to the last level and updates the start-end positions.
+
+int lmtable::add(ngram& ng,int iprob,int ibow){
+
+ char *found; LMT_TYPE ndt; int ndsz;
+
+ if (ng.size>1){
+
+ // find the prefix starting from the first level
+ int start=0, end=cursize[1];
+
+ for (int l=1;l<ng.size;l++){
+
+ ndt=tbltype[l]; ndsz=nodesize(ndt);
+
+ if (search(table[l] + (start * ndsz),ndt,l,(end-start),ndsz,
+ ng.wordp(ng.size-l+1),LMT_FIND, &found)){
+
+ //update start-end positions for next step
+ if (l< (ng.size-1)){
+ //set start position
+ if (found==table[l]) start=0; //first pos in table
+ else start=bound(found - ndsz,ndt); //end of previous entry
+
+ //set end position
+ end=bound(found,ndt);
+ }
+ }
+ else{
+ cerr << "warning: missing back-off for ngram " << ng << "\n";
+ return 0;
+ }
+ }
+
+ // update book keeping information about level ng-size -1.
+ // if this is the first successor update start position
+ int position=(found-table[ng.size-1])/ndsz;
+ if (startpos[ng.size-1][position]==-1)
+ startpos[ng.size-1][position]=cursize[ng.size];
+
+ //always update ending position
+ bound(found,ndt,cursize[ng.size]+1);
+ //cout << "startpos: " << startpos[ng.size-1][position]
+ //<< " endpos: " << bound(found,ndt) << "\n";
+
+ }
+
+ // just add at the end of table[ng.size]
+
+ assert(cursize[ng.size]< maxsize[ng.size]); // is there enough space?
+ ndt=tbltype[ng.size];ndsz=nodesize(ndt);
+
+ found=table[ng.size] + (cursize[ng.size] * ndsz);
+ word(found,*ng.wordp(1));
+ prob(found,ndt,iprob);
+ if (ng.size<maxlev){bow(found,ndt,ibow);bound(found,ndt,-2);}
+
+ cursize[ng.size]++;
+
+ return 1;
+
+}
+
+
+void *lmtable::search(char* tb,
+ LMT_TYPE ndt,
+ int lev,
+ int n,
+ int sz,
+ int *ngp,
+ LMT_ACTION action,
+ char **found){
+
+ if (lev==1) return *found=(*ngp <n ? tb + *ngp * sz:NULL);
+
+ //prepare search pattern
+ char w[LMTCODESIZE];putmem(w,ngp[0],0,LMTCODESIZE);
+
+ int idx=0; // index returned by mybsearch
+ *found=NULL; //initialize output variable
+ switch(action){
+ case LMT_FIND:
+ if (!tb || !mybsearch(tb,n,sz,(unsigned char *)w,&idx)) return NULL;
+ else
+ return *found=tb + (idx * sz);
+ default:
+ error("lmtable::search: this option is available");
+ };
+
+ return NULL;
+}
+
+
+int lmtable::mybsearch(char *ar, int n, int size,
+ unsigned char *key, int *idx)
+{
+
+ totbsearch++;
+
+ register int low, high;
+ register unsigned char *p;
+ register int result;
+ register int i;
+
+ /* return idx with the first
+ position equal or greater than key */
+
+ /* Warning("start bsearch \n"); */
+
+ low = 0;high = n; *idx=0;
+ while (low < high)
+ {
+ *idx = (low + high) / 2;
+ p = (unsigned char *) (ar + (*idx * size));
+
+ //comparison
+ for (i=(LMTCODESIZE-1);i>=0;i--){
+ result=key[i]-p[i];
+ if (result) break;
+ }
+
+ if (result < 0)
+ high = *idx;
+ else if (result > 0)
+ low = *idx + 1;
+ else
+ return 1;
+ }
+
+ *idx=low;
+
+ return 0;
+
+}
+
+
+// saves a LM table in text format
+
+void lmtable::savetxt(const char *filename){
+
+ fstream out(filename,ios::out);
+ int l;
+
+ out.precision(7);
+
+ if (isQtable) out << "qARPA\n";
+
+
+ ngram ng(dict,0);
+
+ cerr << "savetxt: " << filename << "\n";
+
+ out << "\n\\data\\\n";
+ for (l=1;l<=maxlev;l++){
+ out << "ngram " << l << "= " << cursize[l] << "\n";
+ }
+
+ for (l=1;l<=maxlev;l++){
+
+ out << "\n\\" << l << "-grams:\n";
+ cerr << "save: " << cursize[l] << " " << l << "-grams\n";
+ if (isQtable){
+ out << NumCenters[l] << "\n";
+ for (int c=0;c<NumCenters[l];c++){
+ out << Pcenters[l][c];
+ if (l<maxlev) out << " " << Bcenters[l][c];
+ out << "\n";
+ }
+ }
+
+ ng.size=0;
+ dumplm(out,ng,1,l,0,cursize[1]);
+
+ }
+
+ out << "\\end\\\n";
+ cerr << "done\n";
+}
+
+
+void lmtable::savebin(const char *filename){
+
+ fstream out(filename,ios::out);
+ cerr << "savebin: " << filename << "\n";
+
+ // print header
+ if (isQtable){
+ out << "Qblmt " << maxlev;
+ for (int i=1;i<=maxlev;i++) out << " " << cursize[i];
+ out << "\nNumCenters";
+ for (int i=1;i<=maxlev;i++) out << " " << NumCenters[i];
+ out << "\n";
+
+ }else{
+ out << "blmt " << maxlev;
+ for (int i=1;i<=maxlev;i++) out << " " << cursize[i] ;
+ out << "\n";
+ }
+
+ dict->save(out);
+
+ for (int i=1;i<=maxlev;i++){
+ cerr << "saving " << cursize[i] << " " << i << "-grams\n";
+ if (isQtable){
+ out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float));
+ if (i<maxlev)
+ out.write((char *)Bcenters[i],NumCenters[i] * sizeof(float));
+ }
+ out.write(table[i],cursize[i]*nodesize(tbltype[i]));
+ }
+
+ cerr << "done\n";
+}
+
+
+//manages the long header of a bin file
+
+void lmtable::loadbinheader(istream& inp,const char* header){
+
+ // read rest of header
+ inp >> maxlev;
+
+ if (strncmp(header,"Qblmt",5)==0) isQtable=1;
+ else if(strncmp(header,"blmt",4)==0) isQtable=0;
+ else error("loadbin: LM file is not in binary format");
+
+ configure(maxlev,isQtable);
+
+ for (int i=1;i<=maxlev;i++){
+ inp >> cursize[i]; maxsize[i]=cursize[i];
+ table[i]=new char[cursize[i] * nodesize(tbltype[i])];
+ }
+
+
+ if (isQtable){
+ char header2[100];
+ cerr << "reading num centers:";
+ inp >> header2;
+ for (int i=1;i<=maxlev;i++){
+ inp >> NumCenters[i];cerr << " " << NumCenters[i];
+
+ }
+ cerr << "\n";
+ }
+}
+
+//load codebook of level l
+
+void lmtable::loadbincodebook(istream& inp,int l){
+
+ Pcenters[l]=new float [NumCenters[l]];
+ inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float));
+ if (l<maxlev){
+ Bcenters[l]=new float [NumCenters[l]];
+ inp.read((char *)Bcenters[l],NumCenters[l]*sizeof(float));
+ }
+
+}
+
+//load a binary lmfile
+
+void lmtable::loadbin(istream& inp, const char* header){
+
+ cerr << "loadbin()\n";
+
+ loadbinheader(inp,header);
+
+ dict->load(inp);
+
+ for (int l=1;l<=maxlev;l++){
+ if (isQtable) loadbincodebook(inp,l);
+
+ cerr << "loading " << cursize[l] << " " << l << "-grams\n";
+ inp.read(table[l],cursize[l]*nodesize(tbltype[l]));
+ };
+
+ cerr << "done\n";
+}
+
+
+
+int lmtable::get(ngram& ng,int n,int lev){
+
+ // cout << "cerco:" << ng << "\n";
+ totget[lev]++;
+
+ if (lev > maxlev) error("get: lev exceeds maxlevel");
+ if (n < lev) error("get: ngram is too small");
+
+ //set boudaries for 1-gram
+ int offset=0,limit=cursize[1];
+
+ //information of table entries
+ int hit;char* found; LMT_TYPE ndt;
+ ng.link=NULL;
+ ng.lev=0;
+
+ for (int l=1;l<=lev;l++){
+
+ //initialize entry information
+ hit = 0 ; found = NULL; ndt=tbltype[l];
+
+ //if (l==2) cout <<"bicache: searching:" << ng <<"\n";
+
+ if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),(char *)&found))
+ hit=1;
+ else
+ search(table[l] + (offset * nodesize(ndt)),
+ ndt,
+ l,
+ (limit-offset),
+ nodesize(ndt),
+ ng.wordp(n-l+1),
+ LMT_FIND,
+ &found);
+
+ //insert both found and not found items!!!
+ if (lmtcache[l] && hit==0)
+ lmtcache[l]->add(ng.wordp(n),(char *)&found);
+
+ if (!found) return 0;
+
+ ng.link=found;
+ ng.info=ndt;
+ ng.lev=l;
+
+ if (l<maxlev){ //set start/end point for next search
+
+ //if current offset is at the bottom also that of successors will be
+ if (offset+1==cursize[l]) limit=cursize[l+1];
+ else limit=bound(found,ndt);
+
+ //if current start is at the begin, then also that of successors will be
+ if (found==table[l]) offset=0;
+ else offset=bound((found - nodesize(ndt)),ndt);
+
+ assert(offset!=-1); assert(limit!=-1);
+ }
+ }
+
+ //put information inside ng
+ ng.size=n; ng.freq=0;
+ ng.succ=(lev<maxlev?limit-offset:0);
+
+ return 1;
+}
+
+
+//recursively prints the language model table
+
+void lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, int ipos,int epos){
+
+ LMT_TYPE ndt=tbltype[ilev];
+ int ndsz=nodesize(ndt);
+
+
+ assert(ng.size==ilev-1);
+ assert(ipos>=0 && epos<=cursize[ilev] && ipos<epos);
+ ng.pushc(0);
+
+ for (int i=ipos;i<epos;i++){
+ *ng.wordp(1)=word(table[ilev]+i*ndsz);
+ if (ilev<elev){
+ //get first and last successor position
+ int isucc=(i>0?bound(table[ilev]+(i-1)*ndsz,ndt):0);
+ int esucc=bound(table[ilev]+i*ndsz,ndt);
+ if (isucc < esucc) //there are successors!
+ dumplm(out,ng,ilev+1,elev,isucc,esucc);
+ //else
+ //cout << "no successors for " << ng << "\n";
+ }
+ else{
+ //out << i << " "; //this was just to count printed n-grams
+ int ipr=prob(table[ilev]+ i * ndsz,ndt);
+ out << (isQtable?ipr:*(float *)&ipr) <<"\t";
+ for (int k=ng.size;k>=1;k--){
+ if (k<ng.size) out << " ";
+ out << dict->decode(*ng.wordp(k));
+ }
+
+ if (ilev<maxlev){
+ int ibo=bow(table[ilev]+ i * ndsz,ndt);
+ if (isQtable) out << "\t" << ibo;
+ else
+ if (*((float *)&ibo)!=0.0)
+ out << "\t" << *((float *)&ibo);
+
+ }
+ out << "\n";
+ }
+ }
+}
+
+//succscan iteratively returns all successors of an ngram h for which
+//get(h,h.size,h.size) returned true.
+
+
+int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){
+ assert(lev==h.lev+1 && h.size==lev && lev<=maxlev);
+
+ LMT_TYPE ndt=tbltype[h.lev];
+ int ndsz=nodesize(ndt);
+
+ switch (action){
+
+ case LMT_INIT:
+ //reset ngram local indexes
+
+ ng.size=lev;
+ ng.trans(h);
+ ng.midx[lev]=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0);
+
+ return 1;
+
+ case LMT_CONT:
+
+ if (ng.midx[lev]<bound(h.link,ndt))
+ {
+ //put current word into ng
+ *ng.wordp(1)=word(table[lev]+ng.midx[lev]*nodesize(tbltype[lev]));
+ ng.midx[lev]++;
+ return 1;
+ }
+ else
+ return 0;
+
+ default:
+ cerr << "succscan: only permitted options are LMT_INIT and LMT_CONT\n";
+ exit(0);
+ }
+
+}
+
+//maxsuffptr returns the largest suffix of an n-gram that is contained
+//in the LM table. This can be used as a compact representation of the
+//(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it
+//is trimmed to its n-1 suffix.
+
+const char *lmtable::maxsuffptr(ngram ong){
+
+ if (ong.size==0) return (char*) NULL;
+ if (ong.size>=maxlev) ong.size=maxlev-1;
+
+ ngram ng=ong;
+ //ngram ng(dict); //eventually use the <unk> word
+ //ng.trans(ong);
+
+ if (get(ng,ng.size,ng.size))
+ return ng.link;
+ else{
+ ong.size--;
+ return maxsuffptr(ong);
+ }
+}
+
+
+const char *lmtable::cmaxsuffptr(ngram ong){
+
+ if (ong.size==0) return (char*) NULL;
+ if (ong.size>=maxlev) ong.size=maxlev-1;
+
+ char* found;
+
+ if (statecache && (ong.size==maxlev-1) && statecache->get(ong.wordp(maxlev-1),(char *)&found))
+ return found;
+
+ found=(char *)maxsuffptr(ong);
+
+ if (statecache && ong.size==maxlev-1){
+ //if (statecache->isfull()) statecache->reset();
+ statecache->add(ong.wordp(maxlev-1),(char *)&found);
+ };
+
+ return found;
+
+}
+
+
+// returns the probability of an n-gram
+
+double lmtable::prob(ngram ong){
+
+ if (ong.size==0) return 1.0;
+ if (ong.size>maxlev) ong.size=maxlev;
+
+ ngram ng(dict);
+ ng.trans(ong);
+
+ double rbow;
+ int ibow,iprob;
+ LMT_TYPE ndt;
+
+
+ if (get(ng,ng.size,ng.size)){
+ ndt=(LMT_TYPE)ng.info; iprob=prob(ng.link,ndt);
+ return exp((double)(isQtable?Pcenters[ng.size][iprob]:*((float *)&iprob)));
+ }
+ else{ //size==1 means an OOV word
+ if (ng.size==1)
+ return 1.0/UNIGRAM_RESOLUTION;
+ else{ // compute backoff
+ //set backoff state, shift n-gram, set default bow prob
+ bo_state(1); ng.shift();rbow=1.0;
+ if (ng.lev==ng.size){
+ ndt= (LMT_TYPE)ng.info; ibow=bow(ng.link,ndt);
+ rbow= (double) (isQtable?Bcenters[ng.size][ibow]:*((float *)&ibow));
+ }
+ //prepare recursion step
+ ong.size--;
+ return exp(rbow) * prob(ong);
+ }
+ }
+}
+
+//return log10 probs
+
+double lmtable::lprob(ngram ong){
+
+ if (ong.size==0) return 0.0;
+ if (ong.size>maxlev) ong.size=maxlev;
+
+ ngram ng=ong;
+ //ngram ng(dict); //avoid dictionary transfer
+ //ng.trans(ong);
+
+ double rbow;
+ int ibow,iprob;
+ LMT_TYPE ndt;
+
+
+ if (get(ng,ng.size,ng.size)){
+ ndt=(LMT_TYPE)ng.info; iprob=prob(ng.link,ndt);
+ return (double)(isQtable?Pcenters[ng.size][iprob]:*((float *)&iprob));
+ }
+ else{ //size==1 means an OOV word
+ if (ng.size==1)
+ return -log(UNIGRAM_RESOLUTION)/log(10.0);
+ else{ // compute backoff
+ //set backoff state, shift n-gram, set default bow prob
+ bo_state(1); ng.shift();rbow=0.0;
+ if (ng.lev==ng.size){
+ ndt= (LMT_TYPE)ng.info; ibow=bow(ng.link,ndt);
+ rbow= (double) (isQtable?Bcenters[ng.size][ibow]:*((float *)&ibow));
+ }
+ //prepare recursion step
+ ong.size--;
+ return rbow + lprob(ong);
+ }
+ }
+}
+
+//return log10 probsL use cache memory
+
+double lmtable::clprob(ngram ong){
+
+ if (ong.size==0) return 0.0;
+
+ if (ong.size>maxlev) ong.size=maxlev;
+
+ double logpr;
+
+#ifdef TRACE_CACHE
+ if (probcache && ong.size==maxlev && sentence_id>0lo){
+ *cacheout << sentence_id << " " << ong << "\n";
+ }
+#endif
+
+ if (probcache && ong.size==maxlev && probcache->get(ong.wordp(maxlev),(char *)&logpr)){
+ return logpr;
+ }
+
+ logpr=lprob(ong);
+
+ if (probcache && ong.size==maxlev){
+ //if (probcache->isfull()) probcache->reset();
+ probcache->add(ong.wordp(maxlev),(char *)&logpr);
+ };
+
+ return logpr;
+
+};
+
+
+
+//Fill the lmtable with the n-grams in a huge lmfile.
+//Use the local dictionary to select the needed ngrams
+
+
+void lmtable::filter2(const char* binlmfile, int buffMb){
+
+ //load header and dictionary of binary lm on disk
+ lmtable* dsklmt=new lmtable();
+ fstream inp(binlmfile,ios::in);
+
+ // read header
+ char header[MAX_LINE];
+ inp >> header;
+
+ dsklmt->loadbinheader(inp, header);
+ dsklmt->dict->load(inp);
+
+ //inherit properties of the dsklmt
+ isQtable=dsklmt->isQtable;
+ configure(dsklmt->maxlevel(),dsklmt->isQtable);
+ if (isQtable)
+ for (int i=1;i<=maxlev;i++) NumCenters[i]=dsklmt->NumCenters[i];
+
+
+ //prepare word code conversion table; words which
+ //are not in the local dictionary will have code -1
+ //prepare a new dictionary sorted as the large dictionary
+
+ dictionary* newdict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL);
+ newdict->incflag(1);
+ int* code2code=new int[dsklmt->dict->size()];
+ for (int w=0;w<dsklmt->dict->size();w++){
+ if (dict->getcode(dsklmt->dict->decode(w))!=-1)
+ code2code[w]=newdict->encode(dsklmt->dict->decode(w));
+ else code2code[w]=-1;
+ }
+ newdict->incflag(0);
+ delete dict;
+ dict=newdict;
+
+ //service variables
+ char* p; char* q; char* r;
+ int ndsz; LMT_TYPE type; int w;
+ long i,j,l;
+
+
+ disktable* dtbl;
+ for (l=1;l<=maxlev;l++){
+
+ //shortcuts for current table
+ type=tbltype[l]; ndsz=nodesize(type);
+
+ //steel eventual coodebooks from dsklm;
+ if (isQtable) loadbincodebook(inp,l);
+
+ //allocate the maxumum number of entries to be load at each time
+ cerr << "loading part of " << dsklmt->cursize[l] << " " << l << "-grams\n";
+
+ //open a on-disk table buffer starting from current position in stream inp
+ dtbl=new disktable(inp, (buffMb * 1024 *1024)/ndsz,ndsz,dsklmt->maxsize[l]);
+
+ if (l==1){
+
+ //compute actual table size
+ maxsize[l]=0;
+ for (i=0;i<dsklmt->maxsize[l];i++){
+ p=dtbl->get(inp,i);
+ if ((code2code[dsklmt->word(p)]) != -1) maxsize[l]++;
+ }
+
+ assert(maxsize[l]<=dsklmt->maxsize[l]);
+
+ //allocate memory for table and start positions
+ table[l]=new char[maxsize[l] * ndsz];
+ startpos[l]=new int[maxsize[l]];
+
+ //reset position of dsklmt
+ dtbl->rewind(inp);
+
+ //copy elements into table[l]
+ cursize[l]=0;
+ for (i=0;i<dsklmt->maxsize[l];i++){
+ p=dtbl->get(inp,i);
+ if ((w=code2code[dsklmt->word(p)]) != -1) {
+ r=table[l] + (long) cursize[l] * ndsz;
+ memcpy(r,p,ndsz);
+ //store the initial poition in startpos
+ startpos[l][cursize[l]]=(i==0?0:bound(dtbl->get(inp,i-1),tbltype[l]));
+ word(r,w);
+ //cout << "1-gram bound:" << bound(r,tbltype[l]) << "\n";
+ cursize[l]++;
+ }
+ }
+
+ for (i=0;i<cursize[l];i++) assert(word(table[l]+ (long) i * ndsz)==i);
+
+ assert(maxsize[l]==cursize[l]);
+
+ }
+ else{
+
+ //shortcuts for the predecessors table
+ char* ptable=table[l-1];
+ LMT_TYPE ptype=tbltype[l-1];
+ int pndsz=nodesize(ptype);
+
+ //count actual table size, allocate memory, and copy elements
+ //we scan elements through the previous table: ptable
+ //cout << inp.tellp() << "\n";
+
+ maxsize[l]=0;
+
+ for (i=0;i<cursize[l-1];i++){
+ p=ptable+i*pndsz;
+ for (j=startpos[l-1][i];j<bound(p,ptype);j++){
+ assert(startpos[l-1][i]<bound(p,ptype));
+ q=dtbl->get(inp,j);
+ if ((w=code2code[dsklmt->word(q)]) != -1) maxsize[l]++;
+
+ }
+ }
+
+ //allocate memory for the table, and fill it
+ assert(maxsize[l]<=dsklmt->maxsize[l]);
+
+ table[l]=new char[maxsize[l] * ndsz];
+ if (l<maxlev) startpos[l]=new int[maxsize[l]];
+
+ //reset position of dsklmt
+ dtbl->rewind(inp);
+
+ r=table[l]; //next available position in table[l]
+ cursize[l]=0;
+
+ for (i=0;i<cursize[l-1];i++){
+ p=ptable+ i * pndsz;
+ for (j=startpos[l-1][i];j<bound(p,ptype);j++){
+ assert(startpos[l-1][i]<bound(p,ptype));
+
+ q=dtbl->get(inp,j);
+
+
+ if ((w=code2code[dsklmt->word(q)]) != -1){
+ //copy element
+ r=table[l] + cursize[l] * ndsz;
+ memcpy(r,q,ndsz);
+ if (l<maxlev) startpos[l][cursize[l]]=(j>0?bound(dtbl->get(inp,j-1),type):0);
+ word(r,w);
+ //cout << "+" << dict->decode(word(q)) << " - bound "
+ //<< startpos[l][cursize[l]] << " " << bound(p,ptype) << "\n";
+ cursize[l]++; //increment index in startpos
+ }
+
+ }
+ //update bounds of predecessor
+ bound(p,ptype,cursize[l]);
+ }
+
+ assert(cursize[l]==maxsize[l]);
+ }
+
+
+ delete dtbl;
+ if (l>1) delete [] startpos[l-1];
+ }
+
+
+}
+
+void lmtable::filter(const char* binlmfile){
+
+ //load header and dictionary of binary lm on disk
+ lmtable* dsklmt=new lmtable();
+ fstream inp(binlmfile,ios::in);
+
+ // read header
+ char header[MAX_LINE];
+ inp >> header;
+
+ dsklmt->loadbinheader(inp, header);
+
+ dsklmt->dict->load(inp);
+
+ //inherit properties of the dsklmt
+ configure(dsklmt->maxlevel(),dsklmt->isQuantized());
+
+ //prepare word code conversion table; words which
+ //are not in the local dictionary will have code -1
+ //prepare a new dictionary sorted as the large dictionary
+
+ dictionary* newdict=new dictionary((char *)NULL,1000000,(char*)NULL,(char*)NULL);
+ newdict->incflag(1);
+ int* code2code=new int[dsklmt->dict->size()];
+ for (int w=0;w<dsklmt->dict->size();w++){
+ if (dict->getcode(dsklmt->dict->decode(w))!=-1)
+ code2code[w]=newdict->encode(dsklmt->dict->decode(w));
+ else code2code[w]=-1;
+ }
+ newdict->incflag(0);
+ delete dict;
+ dict=newdict;
+
+ //service variables
+ char* p; char* q; char* r;
+ int ndsz; LMT_TYPE type; int w;
+ int i,j,l;
+
+ for (l=1;l<=maxlev;l++){
+
+ //shortcuts for current table
+ type=tbltype[l]; ndsz=nodesize(type);
+
+ //steel eventual coodebooks from dsklm;
+ if (isQtable) loadbincodebook(inp,l);
+
+ //load single l-table from dsklmt: this table can be
+ //removed at the ned of this cycle
+ cerr << "loading " << dsklmt->cursize[l] << " " << l << "-grams\n";
+ dsklmt->table[l]=new char[dsklmt->cursize[l]*ndsz];
+ inp.read(dsklmt->table[l],dsklmt->cursize[l]*ndsz);
+
+ //shortcuts for dsktable
+ char* dtbl=dsklmt->table[l];
+ int dsize=dsklmt->cursize[l];
+
+ if (l==1){
+
+ //count actual table size
+ maxsize[l]=0;
+ for (i=0;i<dsize;i++)
+ if ((code2code[dsklmt->word(dtbl+i*ndsz)]) != -1) maxsize[l]++;
+
+ assert(maxsize[l]<=dsklmt->maxsize[l]);
+ //allocate memory for table and start positions
+ table[l]=new char[maxsize[l] * ndsz];
+ startpos[l]=new int[maxsize[l]];
+
+ //copy elements one by one
+
+ for (i=0;i<dsize;i++){
+ p=dtbl+i*ndsz;
+ if ((w=code2code[dsklmt->word(p)]) != -1) {
+ r=table[l] + cursize[l] * ndsz;
+ memcpy(r,p,ndsz);
+ //store the initial poition in startpos
+ startpos[l][cursize[l]]=(i==0?0:bound(p-ndsz,tbltype[l]));
+ word(r,w);
+ cursize[l]++;
+ }
+ }
+
+ for (i=0;i<cursize[l];i++) assert(word(table[l]+i*ndsz)==i);
+
+ assert(maxsize[l]==cursize[l]);
+
+ }
+ else{ //l>=1;
+
+ //shortcuts for the predecessors table
+ char* ptable=table[l-1];
+ LMT_TYPE ptype=tbltype[l-1];
+ int pndsz=nodesize(ptype);
+
+ //count actual table size, allocate memory, and copy elements
+ //we scan elements through the previous table: ptable
+
+ maxsize[l]=0;
+ for (i=0;i<cursize[l-1];i++){
+ p=ptable+i*pndsz;
+ for (j=startpos[l-1][i];j<bound(p,ptype);j++){
+ q=dsklmt->table[l] + j * ndsz;
+ if ((w=code2code[dsklmt->word(q)]) != -1){
+ maxsize[l]++;
+ }
+ }
+ }
+
+ //allocate memory for the table, and fill it
+ assert(maxsize[l]<=dsklmt->maxsize[l]);
+ table[l]=new char[maxsize[l] * ndsz];
+ if (l<maxlev) startpos[l]=new int[maxsize[l]];
+
+ r=table[l]; //next available position in table[l]
+ cursize[l]=0;
+ for (i=0;i<cursize[l-1];i++){
+ p=ptable+i*pndsz;
+ for (j=startpos[l-1][i];j<bound(p,ptype);j++){
+ q=dsklmt->table[l] + j * ndsz;
+ if ((w=code2code[dsklmt->word(q)]) != -1){
+ //copy element
+ r=table[l] + cursize[l] * ndsz;
+ memcpy(r,q,ndsz);
+ if (l<maxlev)
+ startpos[l][cursize[l]]=(j==0?0:dsklmt->bound(q-ndsz,type));
+ word(r,w);
+ //cout << "+" << dict->decode(word(q)) << " - bound "
+ //<< startpos[l][cursize[l]] << " " << bound(p,ptype) << "\n";
+ cursize[l]++; //increment index in startpos
+ }
+ }
+ //update bounds of predecessor
+ bound(p,ptype,cursize[l]);
+ }
+
+ }
+
+ delete [] dsklmt->table[l];
+ if (l>1) delete [] startpos[l-1];
+ }
+
+}
+
+
+void lmtable::stat(int level){
+ int totmem=0,memory;
+ float mega=1024 * 1024;
+
+ cout.precision(2);
+
+ cout << "lmtable class statistics\n";
+
+ cout << "levels " << maxlev << "\n";
+ for (int l=1;l<=maxlev;l++){
+ memory=cursize[l] * nodesize(tbltype[l]);
+ cout << "lev " << l
+ << " entries "<< cursize[l]
+ << " used mem " << memory/mega << "Mb\n";
+ totmem+=memory;
+ }
+
+ cout << "total allocated mem " << totmem/mega << "Mb\n";
+
+ cout << "total number of get calls\n";
+ for (int l=1;l<=maxlev;l++){
+ cout << "level " << l << " " << totget[l] << "\n";
+ }
+ cout << "total binary search : " << totbsearch << "\n";
+
+ if (level >1 ) dict->stat();
+
+}
diff --git a/irstlm/src/src/lmtable.h b/irstlm/src/src/lmtable.h
new file mode 100644
index 000000000..4171dabea
--- /dev/null
+++ b/irstlm/src/src/lmtable.h
@@ -0,0 +1,366 @@
+/******************************************************************************
+IrstLM: IRST Language Model Toolkit
+Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+
+#ifndef MF_LMTABLE_H
+#define MF_LMTABLE_H
+
+#include "ngramcache.h"
+#include "dictionary.h"
+#include "n_gram.h"
+
+#undef TRACE_CACHE
+
+#define LMTMAXLEV 20
+#define MAX_LINE 1024
+
+#ifndef LMTCODESIZE
+#define LMTCODESIZE (int)3
+#endif
+
+#define SHORTSIZE (int)2
+#define PTRSIZE (int)sizeof(char *)
+#define INTSIZE (int)4
+#define CHARSIZE (int)1
+
+#define PROBSIZE (int)4 //use float
+#define QPROBSIZE (int)1
+#define BOUNDSIZE (int)4
+
+#define UNIGRAM_RESOLUTION 10000000.0
+
+typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE;
+typedef char* node;
+
+typedef enum {LMT_FIND, //!< search: find an entry
+ LMT_ENTER, //!< search: enter an entry
+ LMT_INIT, //!< scan: start scan
+ LMT_CONT //!< scan: continue scan
+} LMT_ACTION;
+
+
+//disktable or accessing tables stored on disk
+
+class disktable{
+private:
+ char* buffer; //!< buffer of disk table
+ int buffer_size; //!< size of buffer
+ int entry_size; //!< size of each single entry in buffer
+ long current_border; //!< current last available entry in buffer
+ long file_border; //!< last element in disk table
+ long start_position; /* */
+public:
+
+ disktable(std::fstream& inp, int buffersize, int entrysize,long max_entries){
+ buffer_size=buffersize;
+ entry_size=entrysize;
+ buffer=new char[(buffer_size+1) * entry_size];
+ current_border=0;
+ file_border=max_entries;
+ start_position=inp.tellp();
+ };
+
+ ~disktable() {delete [] buffer;};
+
+ char* get(std::fstream& inp,long position){
+ assert(position < file_border);
+
+ //you can look back at maximum one position before the first in the buffer!
+ assert(position >= (current_border-buffer_size -1));
+
+ while (position>=current_border){
+ int how_many=(current_border + buffer_size <= file_border?buffer_size:file_border-current_border);
+ //store last value in position buffer_size;
+ memcpy(buffer + buffer_size * entry_size, buffer+(buffer_size-1) * entry_size, entry_size);
+ //read the number of elements
+ inp.read(buffer,how_many * entry_size);
+ //update curent border
+ current_border+=how_many;
+ }
+ if (position > (current_border-buffer_size-1) ) //then it is in buffer
+ return buffer + (position % buffer_size) * entry_size;
+ else
+ if (current_border>buffer_size) //asks for last of previous block
+ return buffer + buffer_size * entry_size;
+ else return NULL;
+ };
+
+ void rewind(std::fstream& inp){
+ inp.seekp(start_position);
+ current_border=0;
+ }
+
+};
+
+class lmtable{
+
+ char* table[LMTMAXLEV+1]; //storage of all levels
+ LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels
+ int cursize[LMTMAXLEV+1]; //current size of levels
+ int maxsize[LMTMAXLEV+1]; //current size of levels
+ int* startpos[LMTMAXLEV+1]; //support vector to store start positions
+
+ int maxlev; //max level of table
+ char info[100]; //information put in the header
+
+ //statistics
+ int totget[LMTMAXLEV+1];
+ int totbsearch;
+
+ //probability quantization
+ bool isQtable;
+
+ int NumCenters[LMTMAXLEV+1];
+ float* Pcenters[LMTMAXLEV+1];
+ float* Bcenters[LMTMAXLEV+1];
+
+ int lmt_oov_code;
+ int lmt_oov_size;
+ int backoff_state;
+
+ //improve access speed
+ ngramcache* lmtcache[LMTMAXLEV+1];
+ ngramcache* probcache;
+ ngramcache* statecache;
+ int max_cache_lev;
+
+public:
+
+#ifdef TRACE_CACHE
+ std::fstream* cacheout;
+ int sentence_id;
+#endif
+
+ dictionary *dict; // dictionary
+
+ lmtable();
+
+ ~lmtable(){
+ for (int i=2;i<=LMTMAXLEV;i++)
+ if (lmtcache[i]){
+ std::cerr << i <<"-gram cache: "; lmtcache[i]->stat();
+ delete lmtcache[i];
+ }
+
+ if (probcache){
+ std::cerr << "Prob Cache: "; probcache->stat();
+ delete probcache;
+#if TRACE_CACHE
+ cacheout->close();
+ delete cacheout;
+#endif
+
+ }
+ if (statecache){
+ std::cerr << "State Cache: "; statecache->stat();
+ delete statecache;
+ }
+
+
+ for (int i=1;i<=maxlev;i++){
+ if (table[i]) delete [] table[i];
+ if (isQtable){
+ if (Pcenters[i]) delete [] Pcenters[i];
+ if (i<maxlev)
+ if (Bcenters[i]) delete [] Bcenters[i];
+ }
+ }
+ }
+
+ void init_probcache(){
+ assert(probcache==NULL);
+ probcache=new ngramcache(maxlev,sizeof(double),400000);
+#ifdef TRACE_CACHE
+ cacheout=new std::fstream("/tmp/tracecache",std::ios::out);
+ sentence_id=0;
+#endif
+ }
+
+ void init_statecache(){
+ assert(statecache==NULL);
+ statecache=new ngramcache(maxlev-1,sizeof(char *),200000);
+ }
+
+ void init_lmtcaches(int uptolev){
+ max_cache_lev=uptolev;
+ for (int i=2;i<=max_cache_lev;i++){
+ assert(lmtcache[i]==NULL);
+ lmtcache[i]=new ngramcache(i,sizeof(char *),200000);
+ }
+ }
+
+ void check_cache_levels(){
+ if (probcache && probcache->isfull()) probcache->reset(probcache->cursize());
+ if (statecache && statecache->isfull()) statecache->reset(statecache->cursize());
+ for (int i=2;i<=max_cache_lev;i++)
+ if (lmtcache[i]->isfull()) lmtcache[i]->reset(lmtcache[i]->cursize());
+ }
+
+ bool is_probcache_active(){return probcache!=NULL;}
+ bool is_statecache_active(){return statecache!=NULL;}
+ bool are_lmtcaches_active(){return lmtcache[2]!=NULL;}
+
+ void configure(int n,bool quantized){
+ maxlev=n;
+ if (n==1)
+ tbltype[1]=(quantized?QLEAF:LEAF);
+ else{
+ for (int i=1;i<n;i++) tbltype[i]=(quantized?QINTERNAL:INTERNAL);
+ tbltype[n]=(quantized?QLEAF:LEAF);
+ }
+ };
+
+ int maxlevel(){return maxlev;};
+ bool isQuantized(){return isQtable;}
+
+
+ void savetxt(const char *filename);
+ void savebin(const char *filename);
+ void dumplm(std::fstream& out,ngram ng, int ilev, int elev, int ipos,int epos);
+
+ void load(std::istream& inp);
+ void loadtxt(std::istream& inp,const char* header);
+ void loadbin(std::istream& inp,const char* header);
+
+ void loadbinheader(std::istream& inp, const char* header);
+ void loadbincodebook(std::istream& inp,int l);
+
+ void filter(const char* lmfile);
+ void filter2(const char* lmfile,int buffMb=512);
+
+ void loadcenters(std::istream& inp,int Order);
+
+ double prob(ngram ng);
+ double lprob(ngram ng);
+ double clprob(ngram ng);
+
+
+ void *search(char *tb,LMT_TYPE ndt,int lev,int n,int sz,int *w,
+ LMT_ACTION action,char **found=(char **)NULL);
+
+ int mybsearch(char *ar, int n, int size, unsigned char *key, int *idx);
+
+ int add(ngram& ng,int prob,int bow);
+ void checkbounds(int level);
+
+ int get(ngram& ng){return get(ng,ng.size,ng.size);}
+ int get(ngram& ng,int n,int lev);
+
+ int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);
+
+ const char *maxsuffptr(ngram ong);
+ const char *cmaxsuffptr(ngram ong);
+
+ inline int putmem(char* ptr,int value,int offs,int size){
+ assert(ptr!=NULL);
+ for (int i=0;i<size;i++)
+ ptr[offs+i]=(value >> (8 * i)) & 0xff;
+ return value;
+ };
+
+ inline int getmem(char* ptr,int* value,int offs,int size){
+ assert(ptr!=NULL);
+ *value=ptr[offs] & 0xff;
+ for (int i=1;i<size;i++)
+ *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i));
+ return *value;
+ };
+
+
+ int bo_state(int value=-1){
+ return (value==-1?backoff_state:backoff_state=value);
+ };
+
+
+ int nodesize(LMT_TYPE ndt){
+ switch (ndt){
+ case INTERNAL:
+ return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE;
+ case QINTERNAL:
+ return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE;
+ case QLEAF:
+ return LMTCODESIZE + QPROBSIZE;
+ case LEAF:
+ return LMTCODESIZE + PROBSIZE;
+ default:
+ assert(0);
+ return 0;
+ }
+ }
+
+ inline int word(node nd,int value=-1)
+ {
+ int offset=0;
+
+ if (value==-1)
+ getmem(nd,&value,offset,LMTCODESIZE);
+ else
+ putmem(nd,value,offset,LMTCODESIZE);
+
+ return value;
+ };
+
+ inline int prob(node nd,LMT_TYPE ndt, int value=-1)
+ {
+ int offs=LMTCODESIZE;
+ int size=(ndt==QINTERNAL || ndt==QLEAF?QPROBSIZE:PROBSIZE);
+
+ if (value==-1)
+ getmem(nd,&value,offs,size);
+ else
+ putmem(nd,value,offs,size);
+
+ return value;
+ };
+
+
+ inline int bow(node nd,LMT_TYPE ndt, int value=-1)
+ {
+ assert(ndt==INTERNAL || ndt==QINTERNAL);
+ int size=(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+ int offs=LMTCODESIZE+size;
+
+ if (value==-1)
+ getmem(nd,&value,offs,size);
+ else
+ putmem(nd,value,offs,size);
+
+ return value;
+ };
+
+ inline int bound(node nd,LMT_TYPE ndt, int value=-1)
+ {
+ assert(ndt==INTERNAL || ndt==QINTERNAL);
+ int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE);
+
+ if (value==-1)
+ getmem(nd,&value,offs,BOUNDSIZE);
+ else
+ putmem(nd,value,offs,BOUNDSIZE);
+
+ return value;
+ };
+
+ void stat(int lev=0);
+
+};
+
+#endif
+
diff --git a/irstlm/src/src/mempool.cpp b/irstlm/src/src/mempool.cpp
new file mode 100644
index 000000000..1c690213d
--- /dev/null
+++ b/irstlm/src/src/mempool.cpp
@@ -0,0 +1,497 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+// An efficient memory pool manager
+// by M. Federico
+// Copyright Marcello Federico, ITC-irst, 1998
+
+#include <iostream>
+#include <ostream>
+#include <cassert>
+#include "mempool.h"
+
+using namespace std;
+
+#ifdef TRACE_ENABLE
+#define TRACE_ERR(str) { std::cerr << str; }
+#else
+#define TRACE_ERR(str) { }
+#endif
+
+/*! The pool contains:
+ - entries of size is
+ - tables for bs entries
+*/
+
+
+mempool::mempool(int is, int bs){
+
+ // item size must be multiple of memory alignment step (4 bytes)
+ // example: is is=9 becomes i=12 (9 + 4 - 9 %4 )
+
+ is=(is>(int)sizeof(char *)?is:0);
+
+ is=is + sizeof(char *) - (is % sizeof(char *));
+
+ item_size = is;
+
+ block_size = bs;
+
+ true_size = is * bs;
+
+ block_list = new memnode;
+
+ block_list->block = new char[true_size];
+
+ memset(block_list->block,'0',true_size);
+
+ block_list->next = 0;
+
+ blocknum = 1;
+
+ entries = 0;
+
+ // build free list
+
+ char *ptr = free_list = block_list->block;
+
+ for (int i=0;i<block_size-1;i++) {
+ *(char **)ptr= ptr + item_size;
+ ptr+=item_size;
+ }
+ *(char **)ptr = NULL; //last item
+
+}
+
+
+char * mempool::allocate(){
+
+ char *ptr;
+
+ if (free_list==NULL)
+ {
+ memnode *new_block = new memnode;
+
+ new_block->block = new char[true_size];
+
+ //memset(new_block->block,'0',true_size);
+
+ new_block->next = block_list;
+
+ block_list=new_block; // update block list
+
+ /* update free list */
+
+ ptr = free_list = block_list->block;
+
+ for (int i=0;i<block_size-1;i++) {
+ *(char **)ptr = ptr + item_size;
+ ptr = ptr + item_size;
+ }
+
+ *(char **)ptr=NULL;
+
+ blocknum++;
+ }
+
+ assert(free_list);
+
+ ptr = free_list;
+
+ free_list=*(char **)ptr;
+
+ *(char **)ptr=NULL; // reset the released item
+
+ entries++;
+
+ return ptr;
+
+}
+
+
+int mempool::free(char* addr){
+
+ // do not check if it belongs to this pool !!
+ /*
+ memnode *list=block_list;
+ while ((list != NULL) &&
+ ((addr < list->block) ||
+ (addr >= (list->block + true_size))))
+ list=list->next;
+
+ if ((list==NULL) || (((addr - list->block) % item_size)!=0))
+ {
+ //cerr << "mempool::free-> addr does not belong to this pool\n";
+ return 0;
+ }
+ */
+
+ *(char **)addr=free_list;
+ free_list=addr;
+
+ entries--;
+
+ return 1;
+}
+
+
+mempool::~mempool()
+{
+ memnode *ptr;
+
+ while (block_list !=NULL){
+ ptr=block_list->next;
+ delete [] block_list->block;
+ delete block_list;
+ block_list=ptr;
+ }
+
+}
+
+void mempool::map (ostream& co){
+
+ co << "mempool memory map:\n";
+ //percorri piu` volte la lista libera
+
+ memnode *bl=block_list;
+ char *fl=free_list;
+
+ char* img=new char[block_size+1];
+ img[block_size]='\0';
+
+ while (bl !=NULL){
+
+ memset(img,'#',block_size);
+
+ fl=free_list;
+ while (fl != NULL){
+ if ((fl >= bl->block)
+ &&
+ (fl < bl->block + true_size))
+ {
+ img[(fl-bl->block)/item_size]='-';
+ }
+
+ fl=*(char **)fl;
+ }
+
+ co << img << "\n";
+ bl=bl->next;
+ }
+ delete [] img;
+}
+
+void mempool::stat(){
+
+ TRACE_ERR("mempool class statistics\n"
+ << "entries " << entries
+ << " blocks " << blocknum
+ << " used memory " << (blocknum * true_size)/1024 << " Kb\n");
+}
+
+
+
+strstack::strstack(int bs){
+
+ size=bs;
+ list=new memnode;
+
+ list->block=new char[size];
+
+ list->next=0;
+
+ memset(list->block,'\0',size);
+ idx=0;
+
+ waste=0;
+ memory=size;
+ entries=0;
+ blocknum=1;
+
+}
+
+
+void strstack::stat(){
+
+ TRACE_ERR("strstack class statistics\n"
+ << "entries " << entries
+ << " blocks " << blocknum
+ << " used memory " << memory/1024 << " Kb\n");
+}
+
+
+char *strstack::push(char *s){
+ int len=strlen(s);
+
+ if ((len+1) >= size){
+ cerr << "strstack::push string is too long\n";
+ exit(1);
+ };
+
+ if ((idx+len+1) >= size){
+ //append a new block
+ //there must be space to
+ //put the index after
+ //the word
+
+ waste+=size-idx;
+ blocknum++;
+ memory+=size;
+
+ memnode* nd=new memnode;
+ nd->block=new char[size];
+ nd->next=list;
+
+ list=nd;
+
+ memset(list->block,'\0',size);
+
+ idx=0;
+
+ }
+
+ // append in current block
+
+ strcpy(&list->block[idx],s);
+
+ idx+=len+1;
+
+ entries++;
+
+ return &list->block[idx-len-1];
+
+}
+
+
+char *strstack::pop(){
+
+ if (list==0) return 0;
+
+ if (idx==0){
+
+ // free this block and go to next
+
+ memnode *ptr=list->next;
+
+ delete [] list->block;
+ delete list;
+
+ list=ptr;
+
+ if (list==0)
+ return 0;
+ else
+ idx=size-1;
+ }
+
+ //go back to first non \0
+ while (idx>0)
+ if (list->block[idx--]!='\0')
+ break;
+
+ //go back to first \0
+ while (idx>0)
+ if (list->block[idx--]=='\0')
+ break;
+
+ entries--;
+
+ if (list->block[idx+1]=='\0')
+ {
+ idx+=2;
+ memset(&list->block[idx],'\0',size-idx);
+ return &list->block[idx];
+ }
+ else{
+ idx=0;
+ memset(&list->block[idx],'\0',size);
+ return &list->block[0];
+ }
+}
+
+
+char *strstack::top(){
+
+ int tidx=idx;
+ memnode *tlist=list;
+
+ if (tlist==0) return 0;
+
+ if (idx==0){
+
+ tlist=tlist->next;
+
+ if (tlist==0) return 0;
+
+ tidx=size-1;
+ }
+
+ //go back to first non \0
+ while (tidx>0)
+ if (tlist->block[tidx--]!='\0')
+ break;
+
+ //aaa\0bbb\0\0\0\0
+
+ //go back to first \0
+ while (tidx>0)
+ if (tlist->block[tidx--]=='\0')
+ break;
+
+ if (tlist->block[tidx+1]=='\0')
+ {
+ tidx+=2;
+ return &tlist->block[tidx];
+ }
+ else{
+ tidx=0;
+ return &tlist->block[0];
+ }
+
+}
+
+
+strstack::~strstack(){
+ memnode *ptr;
+ while (list !=NULL){
+ ptr=list->next;
+ delete [] list->block;
+ delete list;
+ list=ptr;
+ }
+}
+
+
+storage::storage(int maxsize,int blocksize)
+{
+ newmemory=0;
+ newcalls=0;
+ setsize=maxsize;
+ poolsize=blocksize; //in bytes
+ poolset=new mempool* [setsize+1];
+ for (int i=0;i<=setsize;i++)
+ poolset[i]=NULL;
+}
+
+
+storage::~storage(){
+ for (int i=0;i<=setsize;i++)
+ if (poolset[i])
+ delete poolset[i];
+ delete [] poolset;
+}
+
+char *storage::allocate(int size){
+
+ if (size<=setsize){
+ if (!poolset[size]){
+ poolset[size]=new mempool(size,poolsize/size);
+ }
+ return poolset[size]->allocate();
+ }
+ else{
+
+ newmemory+=size+8;
+ newcalls++;
+ char* p=(char *)calloc(sizeof(char),size);
+ if (p==NULL){
+ cerr << "storage::alloc insufficient memory\n";
+ exit(1);
+ }
+ return p;
+ }
+}
+
+char *storage::reallocate(char *oldptr,int oldsize,int newsize){
+
+ char *newptr;
+
+ assert(newsize>oldsize);
+
+ if (oldsize<=setsize){
+ if (newsize<=setsize){
+ if (!poolset[newsize])
+ poolset[newsize]=new mempool(newsize,poolsize/newsize);
+ newptr=poolset[newsize]->allocate();
+ memset((char*)newptr,0,newsize);
+ }
+ else
+ newptr=(char *)calloc(sizeof(char),newsize);
+
+ if (oldptr && oldsize){
+ memcpy(newptr,oldptr,oldsize);
+ poolset[oldsize]->free(oldptr);
+ }
+ }
+ else{
+ newptr=(char *)realloc(oldptr,newsize);
+ if (newptr==oldptr)
+ cerr << "r\b";
+ else
+ cerr << "a\b";
+ }
+ if (newptr==NULL){
+ cerr << "storage::realloc insufficient memory\n";
+ exit(1);
+ }
+
+ return newptr;
+}
+
+int storage::free(char *addr,int size){
+
+ /*
+ while(size<=setsize){
+ if (poolset[size] && poolset[size]->free(addr))
+ break;
+ size++;
+ }
+ */
+
+ if (size>setsize)
+ return free(addr),1;
+ else{
+ poolset[size] && poolset[size]->free(addr);
+ }
+ return 1;
+}
+
+
+void storage::stat(){
+ int used=0;
+ int memory=sizeof(char *) * setsize;
+ int waste=0;
+
+ for (int i=0;i<=setsize;i++)
+ if (poolset[i]){
+ used++;
+ memory+=poolset[i]->used();
+ waste+=poolset[i]->wasted();
+ }
+
+ TRACE_ERR("storage class statistics\n"
+ << "alloc entries " << newcalls
+ << " used memory " << newmemory/1024 << "Kb\n"
+ << "mpools " << setsize
+ << " active " << used
+ << " used memory " << memory/1024 << "Kb"
+ << " wasted " << waste/1024 << "Kb\n");
+}
+
diff --git a/irstlm/src/src/mempool.h b/irstlm/src/src/mempool.h
new file mode 100644
index 000000000..eafdcd110
--- /dev/null
+++ b/irstlm/src/src/mempool.h
@@ -0,0 +1,172 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+// An efficient memory manager
+// by M. Federico
+// Copyright Marcello Federico, ITC-irst, 1998
+
+#ifndef MF_MEMPOOL_H
+#define MF_MEMPOOL_H
+
+#ifndef NULL
+const int NULL=0;
+//#define NULL=0;
+#endif
+
+#include <iostream> // std::ostream
+
+//! Memory block
+/*! This can be used by:
+- mempool to store items of fixed size
+- strstack to store strings of variable size
+*/
+
+class memnode{
+ friend class mempool; //!< grant access
+ friend class strstack; //!< grant access
+ char *block; //!< block of memory
+ memnode *next; //!< next block ptr
+};
+
+
+//! Memory pool
+
+/*! A memory pool is composed of:
+ - a linked list of block_num memory blocks
+ - each block might contain up to block_size items
+ - each item is made of exactly item_size bytes
+*/
+
+class mempool{
+ int block_size; //!< number of entries per block
+ int item_size; //!< number of bytes per entry
+ int true_size; //!< number of bytes per block
+ memnode* block_list; //!< list of blocks
+ char* free_list; //!< free entry list
+ int entries; //!< number of stored entries
+ int blocknum; //!< number of allocated blocks
+ public:
+
+ //! Creates a memory pool
+ mempool(int is, int bs);
+
+ //! Destroys memory pool
+ ~mempool();
+
+ //! Prints a map of memory occupancy
+ void map(std::ostream& co);
+
+ //! Allocates a single memory entry
+ char *allocate();
+
+ //! Frees a single memory entry
+ int free(char* addr);
+
+ //! Prints statistics about this mempool
+ void stat();
+
+ //! Returns effectively used memory (bytes)
+ /*! includes 8 bytes required by each call of new */
+
+ int used(){return blocknum * (true_size + 8);};
+
+ //! Returns amount of wasted memory (bytes)
+ int wasted(){return used()-(entries * item_size);};
+};
+
+//! A stack to store strings
+
+/*!
+ The stack is composed of
+ - a list of blocks memnode of fixed size
+ - attribute blocknum tells the block on top
+ - attribute idx tells position of the top string
+*/
+
+class strstack{
+ memnode* list; //!< list of memory blocks
+ int size; //!< size of each block
+ int idx; //!< index of last stored string
+ int waste; //!< current waste of memory
+ int memory; //!< current use of memory
+ int entries; //!< current number of stored strings
+ int blocknum; //!< current number of used blocks
+
+ public:
+
+ strstack(int bs=1000);
+
+ ~strstack();
+
+ char *push(char *s);
+
+ char *pop();
+
+ char *top();
+
+ void stat();
+
+ int used(){return memory;};
+
+ int wasted(){return waste;};
+
+};
+
+
+//! Manages multiple memory pools
+
+/*!
+ This class permits to manage memory pools
+ with items up to a specified size.
+ - items within the allowed range are stored in memory pools
+ - items larger than the limit are allocated with new
+*/
+
+
+class storage{
+ mempool **poolset; //!< array of memory pools
+ int setsize; //!< number of memory pools/maximum elem size
+ int poolsize; //!< size of each block
+ int newmemory; //!< stores amount of used memory
+ int newcalls; //!< stores number of allocated blocks
+ public:
+
+ //! Creates storage
+ storage(int maxsize,int blocksize);
+
+ //! Destroys storage
+ ~storage();
+
+ /* names of below functions have been changed so as not to interfere with macros for malloc/realloc/etc -- EVH */
+
+ //! Allocates memory
+ char *allocate(int size);
+
+ //! Realloc memory
+ char *reallocate(char *oldptr,int oldsize,int newsize);
+
+ //! Frees memory of an entry
+ int free(char *addr,int size=0);
+
+ //! Prints statistics about storage
+ void stat();
+};
+
+#endif
diff --git a/irstlm/src/src/n_gram.cpp b/irstlm/src/src/n_gram.cpp
new file mode 100644
index 000000000..bf12b7015
--- /dev/null
+++ b/irstlm/src/src/n_gram.cpp
@@ -0,0 +1,214 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#include <iomanip>
+#include <cassert>
+#include "mempool.h"
+#include "htable.h"
+#include "dictionary.h"
+#include "n_gram.h"
+#include "index.h"
+
+using namespace std;
+
+ngram::ngram(dictionary* d,int sz){
+ dict=d;
+ size=sz;
+ succ=0;
+ freq=0;
+ info=0;
+ pinfo=0;
+ link=NULL;
+ isym=-1;
+ memset(word,0,sizeof(int)*MAX_NGRAM);
+ memset(midx,0,sizeof(int)*MAX_NGRAM);
+}
+
+ngram::ngram(ngram& ng){
+ size=ng.size;
+ freq=ng.freq;
+ succ=0;
+ info=0;
+ pinfo=0;
+ link=NULL;
+ isym=-1;
+ dict=ng.dict;
+ memcpy(word,ng.word,sizeof(int)*MAX_NGRAM);
+ memcpy(midx,ng.word,sizeof(int)*MAX_NGRAM);
+
+}
+
+void ngram::trans (const ngram& ng){
+ size=ng.size;
+ freq=ng.freq;
+ if (dict == ng.dict){
+ info=ng.info;
+ isym=ng.isym;
+ memcpy(word,ng.word,sizeof(int)*MAX_NGRAM);
+ memcpy(midx,ng.midx,sizeof(int)*MAX_NGRAM);
+ }
+ else{
+ info=0;
+ memset(midx,0,sizeof(int)*MAX_NGRAM);
+ isym=-1;
+ for (int i=1;i<=size;i++)
+ word[MAX_NGRAM-i]=dict->encode(ng.dict->decode(*ng.wordp(i)));
+ }
+}
+
+
+ifstream& operator>> ( ifstream& fi , ngram& ng){
+ char w[MAX_WORD];
+ memset(w,0,MAX_WORD);
+ w[0]='\0';
+
+ if (!(fi >> setw(MAX_WORD) >> w))
+ return fi;
+
+ if (strlen(w)==(MAX_WORD-1))
+ cerr << "ngram: a too long word was read ("
+ << w << ")\n";
+
+ if (ng.dict->intsymb() &&
+ (strlen(w)==1) && (index(ng.dict->intsymb(),w[0])!=NULL)){
+
+ ng.isym=(long)index(ng.dict->intsymb(),w[0]) -
+ (long)ng.dict->intsymb();
+ ng.size=0;
+ return fi;
+ }
+
+ int c=ng.dict->encode(w);
+
+ if (c == -1 ){
+ cerr << "ngram: " << w << " is OOV \n";
+ exit(1);
+ }
+
+ memcpy(ng.word,ng.word+1,(MAX_NGRAM-1)*sizeof(int));
+
+ ng.word[MAX_NGRAM-1]=(int)c;
+ ng.freq=1;
+
+ if (ng.size<MAX_NGRAM) ng.size++;
+
+ return fi;
+
+}
+
+
+int ngram::pushw(char* w){
+
+ assert(dict!=NULL);
+
+ int c=dict->encode(w);
+
+ if (c == -1 ){
+ cerr << "ngram: " << w << " is OOV \n";
+ exit(1);
+ }
+
+ pushc(c);
+
+ return 1;
+
+}
+
+int ngram::pushc(int c){
+
+ int buff[MAX_NGRAM-1];
+ memcpy(buff,word+1,(MAX_NGRAM-1)*sizeof(int));
+ memcpy(word,buff,(MAX_NGRAM-1)*sizeof(int));
+
+ word[MAX_NGRAM-1]=(int)c;
+ if (size<MAX_NGRAM) size++;
+
+ return 1;
+
+}
+
+
+istream& operator>> ( istream& fi , ngram& ng){
+ char w[MAX_WORD];
+ memset(w,0,MAX_WORD);
+ w[0]='\0';
+
+ assert(ng.dict != NULL);
+
+ if (!(fi >> setw(MAX_WORD) >> w))
+ return fi;
+
+ if (strlen(w)==(MAX_WORD-1))
+ cerr << "ngram: a too long word was read ("
+ << w << ")\n";
+
+ if (ng.dict->intsymb() &&
+ (strlen(w)==1) && (index(ng.dict->intsymb(),w[0])!=NULL)){
+ ng.isym=(long)index(ng.dict->intsymb(),w[0])-(long)ng.dict->intsymb();
+ ng.size=0;
+ return fi;
+ }
+
+ ng.pushw(w);
+
+ ng.freq=1;
+
+ return fi;
+
+}
+
+ofstream& operator<< (ofstream& fo,ngram& ng){
+
+ assert(ng.dict != NULL);
+
+ for (int i=ng.size;i>0;i--)
+ fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << " ";
+ //fo << "[size " << ng.size << " freq " << ng.freq << "]";
+ fo << ng.freq;
+ return fo;
+}
+
+ostream& operator<< (ostream& fo,ngram& ng){
+
+ assert(ng.dict != NULL);
+
+ for (int i=ng.size;i>0;i--)
+ fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << " ";
+ //fo << "[size " << ng.size << " freq " << ng.freq << "]";
+ fo << ng.freq;
+
+ return fo;
+}
+
+/*
+main(int argc, char** argv){
+ dictionary d(argv[1]);
+ ifstream txt(argv[1]);
+ ngram ng(&d);
+
+ while (txt >> ng){
+ cout << ng << "\n";
+ }
+
+ ngram ng2=ng;
+ cerr << "copia l'ultimo =" << ng << "\n";
+}
+*/
+
diff --git a/irstlm/src/src/n_gram.h b/irstlm/src/src/n_gram.h
new file mode 100644
index 000000000..f36b5d2df
--- /dev/null
+++ b/irstlm/src/src/n_gram.h
@@ -0,0 +1,118 @@
+/******************************************************************************
+ IrstLM: IRST Language Model Toolkit
+ Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+// n-gram tables
+// by M. Federico
+// Copyright Marcello Federico, ITC-irst, 1998
+
+#ifndef MF_NGRAM_H
+#define MF_NGRAM_H
+
+#include <fstream>
+#include <cassert>
+#include "dictionary.h"
+
+#ifdef MYMAXNGRAM
+#define MAX_NGRAM MYMAXNGRAM
+#else
+#define MAX_NGRAM 20
+#endif
+
+class dictionary;
+
+//typedef int code;
+
+class ngram{
+ int word[MAX_NGRAM]; //encoded ngram
+ public:
+ dictionary *dict; //dictionary
+ char* link; // ngram-tree pointer
+ int midx[MAX_NGRAM]; // ngram-tree scan pointer
+ int lev; // ngram-tree level
+ int size; // ngram size
+ int freq; // ngram frequency
+ int succ; // number of successors
+
+ unsigned char info; // ngram-tree info flags
+ unsigned char pinfo; // ngram-tree parent info flags
+ int isym; // last interruption symbol
+
+ ngram(dictionary* d,int sz=0);
+ ngram(ngram& ng);
+
+ int *wordp()// n-gram pointer
+ {return wordp(size);};
+ int *wordp(int k) // n-gram pointer
+ {return size>=k?&word[MAX_NGRAM-k]:0;};
+ const int *wordp() const // n-gram pointer
+ {return wordp(size);};
+ const int *wordp(int k) const // n-gram pointer
+ {return size>=k?&word[MAX_NGRAM-k]:0;};
+
+ int shift(){
+ for (int i=(MAX_NGRAM-1);i>0;i--){
+ word[i]=word[i-1];
+ }
+ size--;
+ return 1;
+ }
+
+
+ int containsWord(char* s,int lev){
+
+ int c=dict->encode(s);
+ if (c == -1) return 0;
+
+ assert(lev <= size);
+ for (int i=0;i<lev;i++){
+ if (*wordp(size-i)== c) return 1;
+ }
+ return 0;
+ }
+
+
+ void trans(const ngram& ng);
+
+ friend std::ifstream& operator>> (std::ifstream& fi,ngram& ng);
+ friend std::ofstream& operator<< (std::ofstream& fi,ngram& ng);
+ friend std::istream& operator>> (std::istream& fi,ngram& ng);
+ friend std::ostream& operator<< (std::ostream& fi,ngram& ng);
+
+ inline int ckhisto(int sz){
+
+ for (int i=sz;i>1;i--)
+ if (*wordp(i)==dict->oovcode())
+ return 0;
+ return 1;
+ }
+
+ int pushc(int c);
+ int pushw(char* w);
+
+ //~ngram();
+
+
+
+};
+
+#endif
+
+
+
diff --git a/irstlm/src/src/ngramcache.cpp b/irstlm/src/src/ngramcache.cpp
new file mode 100644
index 000000000..3b4234bcc
--- /dev/null
+++ b/irstlm/src/src/ngramcache.cpp
@@ -0,0 +1,89 @@
+/******************************************************************************
+IrstLM: IRST Language Model Toolkit
+Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+#include <iostream>
+#include <fstream>
+#include <stdexcept>
+#include <cassert>
+#include "math.h"
+#include "mempool.h"
+#include "htable.h"
+
+#include "ngramcache.h"
+
+using namespace std;
+
+ngramcache::ngramcache(int n,int size,int maxentries){
+ ngsize=n;
+ infosize=size;
+ maxn=maxentries;
+ entries=0;
+ ht=new htable(maxn * 2, ngsize * sizeof(int),INT,NULL); //lower load factor to reduce collisions
+ mp=new mempool(ngsize * sizeof(int)+infosize,1000000);
+ accesses=0;
+ hits=0;
+ };
+
+ngramcache::~ngramcache(){
+ ht->stat();//ht->map();
+ mp->stat();
+ delete ht;delete mp;
+};
+
+
+//resize cache to specified number of entries
+void ngramcache::reset(int n){
+ ht->stat();
+ delete ht;delete mp;
+ if (n>0) maxn=n;
+ ht=new htable(maxn * 2, ngsize * sizeof(int),INT,NULL); //load factor 2
+ mp=new mempool(ngsize * sizeof(int)+infosize,1000000);
+ entries=0;
+ }
+
+
+
+
+char* ngramcache::get(const int* ngp,char* info){
+ char *found;
+ // cout << "ngramcache::get() ";
+ //for (int i=0;i<ngsize;i++) cout << ngp[i] << " "; cout <<"\n";
+ accesses++;
+ if (found=ht->search((char *)ngp,HT_FIND)){
+ if (info) memcpy(info,found+ngsize*sizeof(int),infosize);
+ hits++;
+ };
+ return found;
+ };
+
+
+int ngramcache::add(const int* ngp,const char* info){
+
+ char* entry=mp->allocate();
+ memcpy(entry,(char*) ngp,sizeof(int) * ngsize);
+ memcpy(entry + ngsize * sizeof(int),(char *)info,infosize);
+ char *found=ht->search((char *)entry,HT_ENTER);
+ assert(found == entry); //false if key is already insided
+ entries++;
+ return 1;
+ }
+
+void ngramcache::stat(){
+ cerr << "ngramcache stats: entries=" << entries << " acc=" << accesses << " hits=" << hits << "\n";
+};
diff --git a/irstlm/src/src/ngramcache.h b/irstlm/src/src/ngramcache.h
new file mode 100644
index 000000000..d7d98e644
--- /dev/null
+++ b/irstlm/src/src/ngramcache.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+IrstLM: IRST Language Model Toolkit
+Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+#ifndef MF_NGRAMCACHE_H
+#define MF_NGRAMCACHE_H
+
+#include "mempool.h"
+#include "htable.h"
+
+class ngramcache{
+private:
+ htable* ht;
+ mempool *mp;
+ int maxn;
+ int ngsize;
+ int infosize;
+ int accesses;
+ int hits;
+ int entries;
+
+public:
+
+ int cursize(){return entries;};
+ ngramcache(int n,int size,int maxentries);
+ ~ngramcache();
+ void reset(int n=0);
+ char* get(const int* ngp,char* info=NULL);
+ int add(const int* ngp,const char* info);
+ int isfull(){return (entries >= maxn);};
+ void stat();
+};
+
+#endif
+
diff --git a/irstlm/src/src/quantize-lm.cpp b/irstlm/src/src/quantize-lm.cpp
new file mode 100644
index 000000000..1b326ce97
--- /dev/null
+++ b/irstlm/src/src/quantize-lm.cpp
@@ -0,0 +1,424 @@
+/******************************************************************************
+IrstLM: IRST Language Model Toolkit, compile LM
+Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+******************************************************************************/
+
+using namespace std;
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include <assert.h>
+#include "math.h"
+#include "util.h"
+
+#define MAX_LINE 1024
+
+//----------------------------------------------------------------------
+// Special type and global variable for the BIN CLUSTERING algorithm
+//
+//
+//----------------------------------------------------------------------
+
+typedef struct{
+ double pt;
+ int idx;
+ short code;
+}BinEntry;
+
+
+int cmpBinEntry(const void* a,const void* b){
+ if (*(double *)a > *(double*)b)
+ return 1;
+ else if (*(double *)a < *(double*)b)
+ return -1;
+ else
+ return 0;
+}
+
+BinEntry* bintable=NULL;
+
+//----------------------------------------------------------------------
+// Global entry points
+//----------------------------------------------------------------------
+
+int parseWords(char *sentence, char **words, int max);
+
+int ComputeCluster(int nc, double* cl,int N,double* Pts);
+
+//----------------------------------------------------------------------
+// Global parameters (some are set in getArgs())
+//----------------------------------------------------------------------
+
+int k = 256; // number of centers
+const int MAXLEV = 11; //maximum n-gram size
+
+//----------------------------------------------------------------------
+// Main program
+//----------------------------------------------------------------------
+
+void usage(const char *msg = 0) {
+ if (msg) { std::cerr << msg << std::endl; }
+ std::cerr << "Usage: quantize-lm input-file.lm [output-file.qlm]" << std::endl;
+ if (!msg) std::cerr << std::endl
+ << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
+ << " a version of it with quantized probabilities and back-off weights"<< std::endl
+ << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl;
+ }
+
+
+int main(int argc, const char **argv)
+{
+
+ //Process Parameters
+
+ if (argc < 2) { usage(); exit(1); }
+ std::vector<std::string> files;
+ for (int i=1; i < argc; i++) {
+ std::string opt = argv[i];
+ files.push_back(opt);
+ }
+ if (files.size() > 2) { usage("Too many arguments"); exit(1); }
+ if (files.size() < 1) { usage("Please specify a LM file to read from"); exit(1); }
+
+
+ std::string infile = files[0];
+ std::string outfile="";
+
+ if (files.size() == 1) {
+ outfile=infile;
+
+ //remove path information
+ std::string::size_type p = outfile.rfind('/');
+ if (p != std::string::npos && ((p+1) < outfile.size()))
+ outfile.erase(0,p+1);
+
+ //eventually strip .gz
+ if (outfile.compare(outfile.size()-3,3,".gz")==0)
+ outfile.erase(outfile.size()-3,3);
+
+ outfile+=".qlm";
+ }
+ else
+ outfile = files[1];
+
+
+
+ std::cout << "Reading " << infile << "..." << std::endl;
+
+ inputfilestream inp(infile.c_str());
+ if (!inp.good()) {
+ std::cerr << "Failed to open " << infile << "!\n";
+ exit(1);
+ }
+
+
+ std::ofstream out(outfile.c_str());
+ std::cout << "Writing " << outfile << "..." << std::endl;
+
+ //prepare temporary file to save n-gram blocks for multiple reads
+ //this avoids using seeks which do not work with inputfilestream
+ //it's odd but i need a bidirectional filestream!
+
+ string filePath;ofstream dummy;
+ createtempfile(dummy,filePath,ios::out);
+ dummy.close();
+
+ fstream filebuff(filePath.c_str(),ios::out|ios::in);
+
+ int nPts = 0; // actual number of points
+
+ // *** Read ARPA FILE **
+
+ int numNgrams[MAXLEV + 1]; /* # n-grams for each order */
+ int Order,MaxOrder;
+ int n;
+
+ float logprob,logbow, logten=log(10.0);
+
+ double* dataPts=NULL;
+ double* centersP=NULL; double* centersB=NULL;
+
+ int* mapP=NULL; int* mapB=NULL;
+
+ int centers=k;
+ streampos iposition;
+
+ out << "qARPA\n"; //print output header
+
+
+ for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0;
+
+ char line[MAX_LINE];
+
+ while (inp.getline(line,MAX_LINE)){
+
+ bool backslash = (line[0] == '\\');
+
+ if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
+ numNgrams[Order] = n;
+ MaxOrder=Order;
+ }
+
+ if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
+
+ out << line << "\n";
+ cerr << "-- Start processing of " << Order << "-grams\n";
+ assert(Order <= MAXLEV);
+
+ int N=numNgrams[Order];
+ centers=k;
+ if (Order==1) centers=256; // always use 256 centers
+
+ char* words[MAXLEV+3];
+ dataPts=new double[N]; // allocate data
+
+ //reset tempout file
+ filebuff.seekg(0);
+
+ for (nPts=0;nPts<N;nPts++){
+ inp.getline(line,MAX_LINE);
+ filebuff << line << std::endl;
+ int howmany = parseWords(line, words, Order + 3);
+ assert(howmany == Order+2 || howmany == Order+1);
+ sscanf(words[0],"%f",&logprob);
+ dataPts[nPts]=exp(logprob * logten);
+ }
+
+ cerr << "quantizing " << N << " probabilities\n";
+
+ centersP=new double[centers];
+ mapP=new int[N];
+
+ ComputeCluster(centers,centersP,N,dataPts);
+
+
+ assert(bintable !=NULL);
+ for (int p=0;p<N;p++){
+ mapP[bintable[p].idx]=bintable[p].code;
+ }
+
+ if (Order<MaxOrder){
+ //second pass to read back-off weights
+
+ filebuff.seekg(0);
+
+ for (nPts=0;nPts<N;nPts++){
+
+ filebuff.getline(line,MAX_LINE);
+
+ int howmany = parseWords(line, words, Order + 3);
+ if (howmany==Order+2) //backoff is written
+ sscanf(words[Order+1],"%f",&logbow);
+ else
+ logbow=0; // backoff is implicit
+ dataPts[nPts]=exp(logbow * logten);
+ }
+
+ centersB=new double[centers];
+ mapB=new int[N];
+
+ cerr << "quantizing " << N << " backoff weights\n";
+ ComputeCluster(centers,centersB,N,dataPts);
+
+ assert(bintable !=NULL);
+ for (int p=0;p<N;p++){
+ mapB[bintable[p].idx]=bintable[p].code;
+ }
+
+ }
+
+
+ out << centers << "\n";
+ for (nPts=0;nPts<centers;nPts++){
+ out << log(centersP[nPts])/logten;
+ if (Order<MaxOrder) out << " " << log(centersB[nPts])/logten;
+ out << "\n";
+ }
+
+ filebuff.seekg(0);
+
+ for (nPts=0;nPts<numNgrams[Order];nPts++){
+
+ filebuff.getline(line,MAX_LINE);
+
+ parseWords(line, words, Order + 3);
+
+ out << mapP[nPts];
+
+ for (int i=1;i<=Order;i++) out << "\t" << words[i];
+
+ if (Order < MaxOrder) out << "\t" << mapB[nPts];
+
+ out << "\n";
+
+ }
+
+ if (mapP){delete [] mapP;mapP=NULL;}
+ if (mapB){delete [] mapB;mapB=NULL;}
+
+ if (centersP){delete [] centersP; centersP=NULL;}
+ if (centersB){delete [] centersB; centersB=NULL;}
+
+ delete [] dataPts;
+
+ continue;
+
+
+ }
+
+ out << line << "\n";
+ }
+
+ cerr << "---- done\n";
+
+ out.flush();
+
+ out.close();
+ inp.close();
+
+ removefile(filePath);
+}
+
+// Compute Clusters
+
+int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){
+
+
+ //cerr << "\nExecuting Clutering Algorithm: k=" << centers<< "\n";
+
+ if (bintable) delete [] bintable;
+
+ bintable=new BinEntry[N];
+ for (int i=0;i<N;i++){
+ bintable[i].pt=dataPts[i];
+ bintable[i].idx=i;
+ bintable[i].code=0;
+ }
+
+ //cout << "start sort \n";
+ qsort(bintable,N,sizeof(BinEntry),cmpBinEntry);
+
+ int different=1;
+
+ for (int i=1;i<N;i++)
+ if (bintable[i].pt!=bintable[i-1].pt)
+ different++;
+
+ int interval=different/centers;
+ if (interval==0) interval++;
+
+ int* population=new int[centers];
+ int* species=new int[centers];
+
+ //cout << " Different entries=" << different
+ // << " Total Entries=" << N << " Bin Size=" << interval << "\n";
+
+ for (int i=0;i<centers;i++){
+ population[i]=species[i]=0;
+ ctrs[i]=0.0;
+ }
+
+ // initial values
+ bintable[0].code=0;
+ population[0]=1;
+ species[0]=1;
+
+ int currcode=0;
+ different=1;
+
+ for (int i=1;i<N;i++){
+
+ if ((bintable[i].pt!=bintable[i-1].pt)){
+ different++;
+ if ((different % interval) == 0)
+ if ((currcode+1) < centers
+ &&
+ population[currcode]>0){
+ currcode++;
+ }
+ }
+
+ if (bintable[i].pt == bintable[i-1].pt)
+ bintable[i].code=bintable[i-1].code;
+ else{
+ bintable[i].code=currcode;
+ species[currcode]++;
+ }
+
+ population[bintable[i].code]++;
+
+ assert(bintable[i].code < centers);
+
+ ctrs[bintable[i].code]+=bintable[i].pt;
+
+ }
+
+
+ for (int i=0;i<centers;i++){
+ if (population[i]>0){
+ ctrs[i]/=(float)population[i];
+ if (ctrs[i]<1e-99){
+ cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n";
+ ctrs[i]=1e-99;
+ }
+ }
+ //cout << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n";
+ }
+
+ cout.flush();
+
+ delete [] population;
+ delete [] species;
+
+
+ return 1;
+
+}
+
+//----------------------------------------------------------------------
+// Reading/Printing utilities
+// readPt - read a point from input stream into data storage
+// at position i. Returns false on error or EOF.
+// printPt - prints a points to output file
+//----------------------------------------------------------------------
+
+
+int parseWords(char *sentence, char **words, int max)
+{
+ char *word;
+ int i = 0;
+
+ char *const wordSeparators = " \t\r\n";
+
+ for (word = strtok(sentence, wordSeparators);
+ i < max && word != 0;
+ i++, word = strtok(0, wordSeparators))
+ {
+ words[i] = word;
+ }
+ if (i < max) {
+ words[i] = 0;
+ }
+
+ return i;
+}
+
+
diff --git a/irstlm/src/src/util.cpp b/irstlm/src/src/util.cpp
new file mode 100644
index 000000000..1b81218d1
--- /dev/null
+++ b/irstlm/src/src/util.cpp
@@ -0,0 +1,77 @@
+
+#ifdef WIN32
+#include <windows.h>
+#endif
+
+#include "util.h"
+
+using namespace std;
+
+string gettempfolder()
+{
+#ifdef _WIN32
+ char *tmpPath = getenv("TMP");
+ string str(tmpPath);
+ if (str.substr(str.size() - 1, 1) != "\\")
+ str += "\\";
+ return str;
+#else
+ return "/tmp/";
+#endif
+}
+
+
+void createtempfile(ofstream &fileStream, string &filePath, std::ios_base::openmode flags)
+{
+#ifdef _WIN32
+ char buffer[BUFSIZ];
+ ::GetTempFileNameA(gettempfolder().c_str(), "", 0, buffer);
+ filePath = buffer;
+#else
+ char buffer[L_tmpnam];
+ strcpy(buffer, gettempfolder().c_str());
+ strcat(buffer, "dskbuff--XXXXXX");
+ mkstemp(buffer);
+ filePath = buffer;
+#endif
+ fileStream.open(filePath.c_str(), flags);
+}
+
+void removefile(const std::string &filePath)
+{
+#ifdef _WIN32
+ ::DeleteFileA(filePath.c_str());
+#else
+ char cmd[100];
+ sprintf(cmd,"rm %s",filePath.c_str());
+ system(cmd);
+#endif
+}
+
+
+
+inputfilestream::inputfilestream(const std::string &filePath)
+: std::istream(0),
+m_streambuf(0)
+{
+ if (filePath.size() > 3 &&
+ filePath.substr(filePath.size() - 3, 3) == ".gz")
+ {
+ m_streambuf = new gzfilebuf(filePath.c_str());
+ } else {
+ std::filebuf* fb = new std::filebuf();
+ fb->open(filePath.c_str(), std::ios::in);
+ m_streambuf = fb;
+ }
+ this->init(m_streambuf);
+}
+
+inputfilestream::~inputfilestream()
+{
+ delete m_streambuf; m_streambuf = 0;
+}
+
+void inputfilestream::close()
+{
+}
+
diff --git a/irstlm/src/src/util.h b/irstlm/src/src/util.h
new file mode 100644
index 000000000..229d301cc
--- /dev/null
+++ b/irstlm/src/src/util.h
@@ -0,0 +1,25 @@
+#ifndef IRSTLM_UTIL_H
+#define IRSTLM_UTIL_H
+
+#include <string>
+#include <fstream>
+#include "gzfilebuf.h"
+
+std::string gettempfolder();
+void createtempfile(std::ofstream &fileStream, std::string &filePath, std::ios_base::openmode flags);
+void removefile(const std::string &filePath);
+
+class inputfilestream : public std::istream
+{
+protected:
+ std::streambuf *m_streambuf;
+public:
+
+ inputfilestream(const std::string &filePath);
+ ~inputfilestream();
+
+ void close();
+};
+
+#endif
+