Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/salm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <hieu@hoang.co.uk>2013-11-25 13:56:37 +0400
committerHieu Hoang <hieu@hoang.co.uk>2013-11-25 13:56:37 +0400
commita146dbec8f0391e247db1ae4c9b7af5c225436f9 (patch)
tree1fa97934675448cdcffb26b4737887d551822a39
initial add of salm to github
-rwxr-xr-xCopyRight97
-rwxr-xr-xDistribution/Linux/Makefile283
-rwxr-xr-xDistribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsp138
-rwxr-xr-xDistribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsw29
-rwxr-xr-xDistribution/Win32/CalcCountOfCounts/CalcCountOfCounts.ncbbin0 -> 107520 bytes
-rwxr-xr-xDistribution/Win32/CalcCountOfCounts/CalcCountOfCounts.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/CalcCountOfCounts/CalcCountOfCounts.plg44
-rwxr-xr-xDistribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsp138
-rwxr-xr-xDistribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsw29
-rwxr-xr-xDistribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.ncbbin0 -> 107520 bytes
-rwxr-xr-xDistribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.plg44
-rwxr-xr-xDistribution/Win32/EvaluateLM/EvaluateLM.dsp146
-rwxr-xr-xDistribution/Win32/EvaluateLM/EvaluateLM.dsw29
-rwxr-xr-xDistribution/Win32/EvaluateLM/EvaluateLM.ncbbin0 -> 156672 bytes
-rwxr-xr-xDistribution/Win32/EvaluateLM/EvaluateLM.optbin0 -> 49664 bytes
-rwxr-xr-xDistribution/Win32/EvaluateLM/EvaluateLM.plg47
-rwxr-xr-xDistribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsp138
-rwxr-xr-xDistribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsw29
-rwxr-xr-xDistribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.ncbbin0 -> 107520 bytes
-rwxr-xr-xDistribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.plg44
-rwxr-xr-xDistribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsp138
-rwxr-xr-xDistribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsw29
-rwxr-xr-xDistribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.ncbbin0 -> 115712 bytes
-rwxr-xr-xDistribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.plg44
-rwxr-xr-xDistribution/Win32/IndexSA/IndexSA.dsp130
-rwxr-xr-xDistribution/Win32/IndexSA/IndexSA.dsw29
-rwxr-xr-xDistribution/Win32/IndexSA/IndexSA.ncbbin0 -> 91136 bytes
-rwxr-xr-xDistribution/Win32/IndexSA/IndexSA.optbin0 -> 49664 bytes
-rwxr-xr-xDistribution/Win32/IndexSA/IndexSA.plg41
-rwxr-xr-xDistribution/Win32/InitializeVocabulary/InitializeVocabulary.dsp122
-rwxr-xr-xDistribution/Win32/InitializeVocabulary/InitializeVocabulary.dsw29
-rwxr-xr-xDistribution/Win32/InitializeVocabulary/InitializeVocabulary.ncbbin0 -> 58368 bytes
-rwxr-xr-xDistribution/Win32/InitializeVocabulary/InitializeVocabulary.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/InitializeVocabulary/InitializeVocabulary.plg38
-rwxr-xr-xDistribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsp138
-rwxr-xr-xDistribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsw29
-rwxr-xr-xDistribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.ncbbin0 -> 91136 bytes
-rwxr-xr-xDistribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.optbin0 -> 49664 bytes
-rwxr-xr-xDistribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.plg44
-rwxr-xr-xDistribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsp138
-rwxr-xr-xDistribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsw29
-rwxr-xr-xDistribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.ncbbin0 -> 91136 bytes
-rwxr-xr-xDistribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.plg44
-rwxr-xr-xDistribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsp138
-rwxr-xr-xDistribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsw29
-rwxr-xr-xDistribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.ncbbin0 -> 107520 bytes
-rwxr-xr-xDistribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.plg16
-rwxr-xr-xDistribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsp138
-rwxr-xr-xDistribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsw29
-rwxr-xr-xDistribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.ncbbin0 -> 115712 bytes
-rwxr-xr-xDistribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.plg44
-rwxr-xr-xDistribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsp138
-rwxr-xr-xDistribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsw29
-rwxr-xr-xDistribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.ncbbin0 -> 107520 bytes
-rwxr-xr-xDistribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.plg16
-rwxr-xr-xDistribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsp138
-rwxr-xr-xDistribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsw29
-rwxr-xr-xDistribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.ncbbin0 -> 123904 bytes
-rwxr-xr-xDistribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.plg44
-rwxr-xr-xDistribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsp137
-rwxr-xr-xDistribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsw29
-rwxr-xr-xDistribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.ncbbin0 -> 107520 bytes
-rwxr-xr-xDistribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.plg44
-rwxr-xr-xDistribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsp138
-rwxr-xr-xDistribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsw29
-rwxr-xr-xDistribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.ncbbin0 -> 107520 bytes
-rwxr-xr-xDistribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.plg44
-rwxr-xr-xDistribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsp130
-rwxr-xr-xDistribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsw29
-rwxr-xr-xDistribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.ncbbin0 -> 74752 bytes
-rwxr-xr-xDistribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.optbin0 -> 48640 bytes
-rwxr-xr-xDistribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.plg41
-rwxr-xr-xReadme70
-rwxr-xr-xSrc/IndexSA/IndexSA.cpp58
-rwxr-xr-xSrc/IndexSA/IndexSA.cpp~57
-rwxr-xr-xSrc/IndexSA/_MonoCorpus.cpp440
-rwxr-xr-xSrc/IndexSA/_MonoCorpus.cpp~439
-rwxr-xr-xSrc/IndexSA/_MonoCorpus.h60
-rwxr-xr-xSrc/SALM-API-Description.txt24
-rwxr-xr-xSrc/Shared/_IDVocabulary.cpp219
-rwxr-xr-xSrc/Shared/_IDVocabulary.cpp~218
-rwxr-xr-xSrc/Shared/_IDVocabulary.h55
-rwxr-xr-xSrc/Shared/_String.cpp253
-rwxr-xr-xSrc/Shared/_String.h45
-rwxr-xr-xSrc/Shared/salm_shared.h36
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp63
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~62
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt5
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp1113
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h210
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp691
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~690
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h137
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp34
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp70
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp32
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp338
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~338
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h53
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp130
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~129
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp72
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~71
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp47
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~46
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp85
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~84
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp67
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~66
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp132
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~131
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp50
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~49
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp144
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~145
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp178
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~177
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp754
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~753
-rwxr-xr-xSrc/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h127
-rwxr-xr-xSrc/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp314
-rwxr-xr-xSrc/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~313
-rwxr-xr-xSrc/SuffixArrayApplications/_SuffixArrayApplicationBase.h58
-rwxr-xr-xSrc/Utils/InitializeVocabulary.cpp30
-rwxr-xr-xSrc/Utils/UpdateUniversalVoc.cpp28
-rwxr-xr-xSrc/Utils/_UniversalVocabulary.cpp118
-rwxr-xr-xSrc/Utils/_UniversalVocabulary.cpp~117
-rwxr-xr-xSrc/Utils/_UniversalVocabulary.h38
138 files changed, 13929 insertions, 0 deletions
diff --git a/CopyRight b/CopyRight
new file mode 100755
index 0000000..a84184f
--- /dev/null
+++ b/CopyRight
@@ -0,0 +1,97 @@
+GNU GENERAL PUBLIC LICENSE
+
+Version 2, June 1991
+
+Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+
+Everyone is permitted to copy and distribute verbatim copies
+of this license document, but changing it is not allowed.
+
+Preamble
+
+The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too.
+
+When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things.
+
+To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it.
+
+For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights.
+
+We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software.
+
+Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations.
+
+Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all.
+
+The precise terms and conditions for copying, distribution and modification follow.
+TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does.
+
+1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee.
+
+2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions:
+
+
+ a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License.
+
+3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following:
+
+
+ a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable.
+
+If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code.
+
+4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance.
+
+5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it.
+
+6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License.
+
+7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice.
+
+This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License.
+
+8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License.
+
+9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation.
+
+10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally.
+
+NO WARRANTY
+
+11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+END OF TERMS AND CONDITIONS
+
+GNU General Public License - GNU Project - Free Software Foundation (FSF)
+
+
\ No newline at end of file
diff --git a/Distribution/Linux/Makefile b/Distribution/Linux/Makefile
new file mode 100755
index 0000000..baec989
--- /dev/null
+++ b/Distribution/Linux/Makefile
@@ -0,0 +1,283 @@
+# compilor program
+CC = g++
+
+# directories
+# source code directories
+PROGRAM_SRC_ROOT_DIR = ../../Src
+SA_SHARED_SRC_DIR = ${PROGRAM_SRC_ROOT_DIR}/Shared
+SA_INDEX_SRC_DIR = ${PROGRAM_SRC_ROOT_DIR}/IndexSA
+SA_APPLICATION_SRC_DIR = ${PROGRAM_SRC_ROOT_DIR}/SuffixArrayApplications
+SA_SEARCH_SRC_DIR = ${SA_APPLICATION_SRC_DIR}/SuffixArraySearch
+SA_SEARCH_APP_SRC_DIR = ${SA_SEARCH_SRC_DIR}/Applications
+SA_LM_SRC_DIR = ${SA_APPLICATION_SRC_DIR}/SuffixArrayLanguageModel
+SA_LM_APP_SRC_DIR = ${SA_LM_SRC_DIR}/Applications
+SA_SCAN_SRC_DIR = ${SA_APPLICATION_SRC_DIR}/SuffixArrayScan
+SA_SCAN_APP_SRC_DIR = ${SA_SCAN_SRC_DIR}/Applications
+SA_UTIL_SRC_DIR = ${PROGRAM_SRC_ROOT_DIR}/Utils
+
+# directories for object files
+OBJ_DIR = Objs
+SA_SHARED_OBJ_DIR = ${OBJ_DIR}/Shared
+SA_INDEX_OBJ_DIR = ${OBJ_DIR}/Index
+SA_SEARCH_OBJ_DIR = ${OBJ_DIR}/Search
+SA_SCAN_OBJ_DIR = ${OBJ_DIR}/Scan
+SA_LM_OBJ_DIR = ${OBJ_DIR}/LM
+SA_UTIL_OBJ_DIR = ${OBJ_DIR}/Utils
+
+# directories for final binary executables
+BIN_DIR = ../../Bin/Linux
+SA_INDEX_BIN_DIR = ${BIN_DIR}/Index
+SA_SEARCH_BIN_DIR = ${BIN_DIR}/Search
+SA_SCAN_BIN_DIR = ${BIN_DIR}/Scan
+SA_LM_BIN_DIR = ${BIN_DIR}/LM
+SA_UTIL_BIN_DIR = ${BIN_DIR}/Utils
+
+# compilor flags
+CFLAGS_O32 = -c -O -I${PROGRAM_SRC_ROOT_DIR}/Shared -I${SA_APPLICATION_SRC_DIR} -I${SA_SEARCH_SRC_DIR} -I${SA_SCAN_SRC_DIR} -I${SA_UTIL_SRC_DIR} -I${SA_LM_SRC_DIR}
+CFLAGS_G32 = -c -g -I${PROGRAM_SRC_ROOT_DIR}/Shared -I${SA_APPLICATION_SRC_DIR} -I${SA_SEARCH_SRC_DIR} -I${SA_SCAN_SRC_DIR} -I${SA_UTIL_SRC_DIR} -I${SA_LM_SRC_DIR}
+CFLAGS_O64 = -c -O -m64 -I${PROGRAM_SRC_ROOT_DIR}/Shared -I${SA_APPLICATION_SRC_DIR} -I${SA_SEARCH_SRC_DIR} -I${SA_SCAN_SRC_DIR} -I${SA_UTIL_SRC_DIR} -I${SA_LM_SRC_DIR}
+
+# shared objects
+SHARED_OBJ_o32 = \
+ ${SA_SHARED_OBJ_DIR}/_IDVocabulary.o32 \
+ ${SA_SHARED_OBJ_DIR}/_String.o32 \
+
+SHARED_OBJ_g32 = \
+ ${SA_SHARED_OBJ_DIR}/_IDVocabulary.g32 \
+ ${SA_SHARED_OBJ_DIR}/_String.g32 \
+
+SHARED_OBJ_o64 = \
+ ${SA_SHARED_OBJ_DIR}/_IDVocabulary.o64 \
+ ${SA_SHARED_OBJ_DIR}/_String.o64 \
+
+#
+# Target to create all 32-bit optimized binaries
+allO32 : CREATE_DIR \
+ ${SA_INDEX_BIN_DIR}/IndexSA.O32 \
+ ${SA_SEARCH_BIN_DIR}/FilterDuplicatedSentences.O32 \
+ ${SA_SEARCH_BIN_DIR}/FrequencyOfNgrams.O32 \
+ ${SA_SEARCH_BIN_DIR}/CollectNgramFreqCount.O32 \
+ ${SA_SEARCH_BIN_DIR}/NGramMatchingStat4TestSet.O32 \
+ ${SA_SEARCH_BIN_DIR}/NgramMatchingFreq4Sent.O32 \
+ ${SA_SEARCH_BIN_DIR}/NgramTypeInTestSetMatchedInCorpus.O32 \
+ ${SA_SEARCH_BIN_DIR}/NgramMatchingFreqAndNonCompositionality4Sent.O32 \
+ ${SA_SEARCH_BIN_DIR}/LocateNgramInCorpus.O32 \
+ ${SA_SEARCH_BIN_DIR}/LocateEmbeddedNgramsInCorpus.O32 \
+ ${SA_SEARCH_BIN_DIR}/SampleNGramIns.O32 \
+ ${SA_SCAN_BIN_DIR}/CalcCountOfCounts.O32 \
+ ${SA_SCAN_BIN_DIR}/OutputHighFreqNgram.O32 \
+ ${SA_LM_BIN_DIR}/EvaluateLM.O32 \
+ ${SA_SCAN_BIN_DIR}/TypeTokenFreqInCorpus.O32 \
+ ${SA_UTIL_BIN_DIR}/InitializeVocabulary.O32 \
+ ${SA_UTIL_BIN_DIR}/UpdateUniversalVoc.O32
+
+# Target to createall 32-bit binaries with debugging information
+allG32 : CREATE_DIR \
+ ${SA_INDEX_BIN_DIR}/IndexSA.G32 \
+ ${SA_SEARCH_BIN_DIR}/FilterDuplicatedSentences.G32 \
+ ${SA_SEARCH_BIN_DIR}/FrequencyOfNgrams.G32 \
+ ${SA_SEARCH_BIN_DIR}/CollectNgramFreqCount.G32 \
+ ${SA_SEARCH_BIN_DIR}/NGramMatchingStat4TestSet.G32 \
+ ${SA_SEARCH_BIN_DIR}/NgramMatchingFreq4Sent.G32 \
+ ${SA_SEARCH_BIN_DIR}/NgramTypeInTestSetMatchedInCorpus.G32 \
+ ${SA_SEARCH_BIN_DIR}/NgramMatchingFreqAndNonCompositionality4Sent.G32 \
+ ${SA_SEARCH_BIN_DIR}/LocateNgramInCorpus.G32 \
+ ${SA_SEARCH_BIN_DIR}/LocateEmbeddedNgramsInCorpus.G32 \
+ ${SA_SEARCH_BIN_DIR}/SampleNGramIns.G32 \
+ ${SA_SCAN_BIN_DIR}/CalcCountOfCounts.G32 \
+ ${SA_SCAN_BIN_DIR}/OutputHighFreqNgram.G32 \
+ ${SA_SCAN_BIN_DIR}/TypeTokenFreqInCorpus.G32 \
+ ${SA_LM_BIN_DIR}/EvaluateLM.G32 \
+ ${SA_UTIL_BIN_DIR}/InitializeVocabulary.G32 \
+ ${SA_UTIL_BIN_DIR}/UpdateUniversalVoc.G32
+
+# Target to create all 64-bit optimized binaries
+allO64 : CREATE_DIR \
+ ${SA_INDEX_BIN_DIR}/IndexSA.O64 \
+ ${SA_SEARCH_BIN_DIR}/FilterDuplicatedSentences.O64 \
+ ${SA_SEARCH_BIN_DIR}/FrequencyOfNgrams.O64 \
+ ${SA_SEARCH_BIN_DIR}/CollectNgramFreqCount.O64 \
+ ${SA_SEARCH_BIN_DIR}/NGramMatchingStat4TestSet.O64 \
+ ${SA_SEARCH_BIN_DIR}/NgramMatchingFreq4Sent.O64 \
+ ${SA_SEARCH_BIN_DIR}/NgramTypeInTestSetMatchedInCorpus.O64 \
+ ${SA_SEARCH_BIN_DIR}/NgramMatchingFreqAndNonCompositionality4Sent.O64 \
+ ${SA_SEARCH_BIN_DIR}/LocateNgramInCorpus.O64 \
+ ${SA_SEARCH_BIN_DIR}/LocateEmbeddedNgramsInCorpus.O64 \
+ ${SA_SCAN_BIN_DIR}/CalcCountOfCounts.O64 \
+ ${SA_SCAN_BIN_DIR}/OutputHighFreqNgram.O64 \
+ ${SA_SCAN_BIN_DIR}/TypeTokenFreqInCorpus.O64 \
+ ${SA_LM_BIN_DIR}/EvaluateLM.O64 \
+ ${SA_UTIL_BIN_DIR}/InitializeVocabulary.O64 \
+ ${SA_UTIL_BIN_DIR}/UpdateUniversalVoc.O64
+
+# Target to create the directory hierarchy needed
+CREATE_DIR :
+ mkdir -p ${OBJ_DIR}
+ mkdir -p ${SA_SHARED_OBJ_DIR}
+ mkdir -p ${SA_INDEX_OBJ_DIR}
+ mkdir -p ${SA_SEARCH_OBJ_DIR}
+ mkdir -p ${SA_SCAN_OBJ_DIR}
+ mkdir -p ${SA_LM_OBJ_DIR}
+ mkdir -p ${SA_UTIL_OBJ_DIR}
+ mkdir -p ${BIN_DIR}
+ mkdir -p ${SA_INDEX_BIN_DIR}
+ mkdir -p ${SA_SEARCH_BIN_DIR}
+ mkdir -p ${SA_SCAN_BIN_DIR}
+ mkdir -p ${SA_LM_BIN_DIR}
+ mkdir -p ${SA_UTIL_BIN_DIR}
+
+#-------------------------------------------------------------------
+# Top level executables 32-bit optimized
+#-------------------------------------------------------------------
+${SA_INDEX_BIN_DIR}/IndexSA.O32: ${SA_INDEX_OBJ_DIR}/IndexSA.o32 ${SA_INDEX_OBJ_DIR}/_MonoCorpus.o32 ${SHARED_OBJ_o32}
+ ${CC} -o $@ $^
+
+${SA_SEARCH_BIN_DIR}/%.O32 : ${SA_SEARCH_OBJ_DIR}/%.o32 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o32 ${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.o32 ${SHARED_OBJ_o32}
+ ${CC} -o $@ $^
+
+${SA_SCAN_BIN_DIR}/%.O32 : ${SA_SCAN_OBJ_DIR}/%.o32 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o32 ${SA_SCAN_OBJ_DIR}/_SuffixArrayScanningBase.o32 ${SHARED_OBJ_o32}
+ ${CC} -o $@ $^
+
+${SA_LM_BIN_DIR}/%.O32 : ${SA_LM_OBJ_DIR}/%.o32 ${SA_LM_OBJ_DIR}/_SuffixArrayLanguageModel.o32 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o32 ${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.o32 ${SHARED_OBJ_o32}
+ ${CC} -o $@ $^
+
+${SA_UTIL_BIN_DIR}/%.O32 : ${SA_UTIL_OBJ_DIR}/%.o32 ${SA_UTIL_OBJ_DIR}/_UniversalVocabulary.o32 ${SHARED_OBJ_o32}
+ ${CC} -o $@ $^
+
+# 32-bit objects
+${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o32 : ${SA_APPLICATION_SRC_DIR}/_SuffixArrayApplicationBase.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.o32 : ${SA_SEARCH_SRC_DIR}/_SuffixArraySearchApplicationBase.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_SCAN_OBJ_DIR}/_SuffixArrayScanningBase.o32 : ${SA_SCAN_SRC_DIR}/_SuffixArrayScanningBase.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_LM_OBJ_DIR}/_SuffixArrayLanguageModel.o32 : ${SA_LM_SRC_DIR}/_SuffixArrayLanguageModel.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_SHARED_OBJ_DIR}/%.o32 : ${SA_SHARED_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_INDEX_OBJ_DIR}/%.o32 : ${SA_INDEX_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_SEARCH_OBJ_DIR}/%.o32 : ${SA_SEARCH_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_SCAN_OBJ_DIR}/%.o32 : ${SA_SCAN_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_LM_OBJ_DIR}/%.o32 : ${SA_LM_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+${SA_UTIL_OBJ_DIR}/%.o32 : ${SA_UTIL_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O32} -o $@ $<
+
+
+#-------------------------------------------------------------------
+# Top level executables 32-bit debug
+#-------------------------------------------------------------------
+${SA_INDEX_BIN_DIR}/IndexSA.G32: ${SA_INDEX_OBJ_DIR}/IndexSA.g32 ${SA_INDEX_OBJ_DIR}/_MonoCorpus.g32 ${SHARED_OBJ_g32}
+ ${CC} -o $@ $^
+
+${SA_SEARCH_BIN_DIR}/%.G32 : ${SA_SEARCH_OBJ_DIR}/%.g32 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.g32 ${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.g32 ${SHARED_OBJ_g32}
+ ${CC} -o $@ $^
+
+${SA_SCAN_BIN_DIR}/%.G32 : ${SA_SCAN_OBJ_DIR}/%.g32 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.g32 ${SA_SCAN_OBJ_DIR}/_SuffixArrayScanningBase.g32 ${SHARED_OBJ_g32}
+ ${CC} -o $@ $^
+
+${SA_LM_BIN_DIR}/%.G32 : ${SA_LM_OBJ_DIR}/%.g32 ${SA_LM_OBJ_DIR}/_SuffixArrayLanguageModel.g32 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.g32 ${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.g32 ${SHARED_OBJ_g32}
+ ${CC} -o $@ $^
+
+${SA_UTIL_BIN_DIR}/%.G32 : ${SA_UTIL_OBJ_DIR}/%.g32 ${SA_UTIL_OBJ_DIR}/_UniversalVocabulary.g32 ${SHARED_OBJ_g32}
+ ${CC} -o $@ $^
+
+# 32-bit objects with debug information
+${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.g32 : ${SA_APPLICATION_SRC_DIR}/_SuffixArrayApplicationBase.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.g32 : ${SA_SEARCH_SRC_DIR}/_SuffixArraySearchApplicationBase.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_SCAN_OBJ_DIR}/_SuffixArrayScanningBase.g32 : ${SA_SCAN_SRC_DIR}/_SuffixArrayScanningBase.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_LM_OBJ_DIR}/_SuffixArrayLanguageModel.g32 : ${SA_LM_SRC_DIR}/_SuffixArrayLanguageModel.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_SHARED_OBJ_DIR}/%.g32 : ${SA_SHARED_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_INDEX_OBJ_DIR}/%.g32 : ${SA_INDEX_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_SEARCH_OBJ_DIR}/%.g32 : ${SA_SEARCH_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_SCAN_OBJ_DIR}/%.g32 : ${SA_SCAN_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_LM_OBJ_DIR}/%.g32 : ${SA_LM_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+${SA_UTIL_OBJ_DIR}/%.g32 : ${SA_UTIL_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_G32} -o $@ $<
+
+
+#-------------------------------------------------------------------
+# Top level executables of 64-bit optimized version
+#-------------------------------------------------------------------
+${SA_INDEX_BIN_DIR}/IndexSA.O64: ${SA_INDEX_OBJ_DIR}/IndexSA.o64 ${SA_INDEX_OBJ_DIR}/_MonoCorpus.o64 ${SHARED_OBJ_o64}
+ ${CC} -o $@ $^
+
+${SA_SEARCH_BIN_DIR}/%.O64 : ${SA_SEARCH_OBJ_DIR}/%.o64 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o64 ${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.o64 ${SHARED_OBJ_o64}
+ ${CC} -o $@ $^
+
+${SA_SCAN_BIN_DIR}/%.O64 : ${SA_SCAN_OBJ_DIR}/%.o64 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o64 ${SA_SCAN_OBJ_DIR}/_SuffixArrayScanningBase.o64 ${SHARED_OBJ_o64}
+ ${CC} -o $@ $^
+
+${SA_LM_BIN_DIR}/%.O64 : ${SA_LM_OBJ_DIR}/%.o64 ${SA_LM_OBJ_DIR}/_SuffixArrayLanguageModel.o64 ${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o64 ${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.o64 ${SHARED_OBJ_o64}
+ ${CC} -o $@ $^
+
+${SA_UTIL_BIN_DIR}/%.O64 : ${SA_UTIL_OBJ_DIR}/%.o64 ${SA_UTIL_OBJ_DIR}/_UniversalVocabulary.o64 ${SHARED_OBJ_o64}
+ ${CC} -o $@ $^
+
+# 64-bit objects
+${SA_SEARCH_OBJ_DIR}/_SuffixArrayApplicationBase.o64 : ${SA_APPLICATION_SRC_DIR}/_SuffixArrayApplicationBase.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_SEARCH_OBJ_DIR}/_SuffixArraySearchApplicationBase.o64 : ${SA_SEARCH_SRC_DIR}/_SuffixArraySearchApplicationBase.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_SCAN_OBJ_DIR}/_SuffixArrayScanningBase.o64 : ${SA_SCAN_SRC_DIR}/_SuffixArrayScanningBase.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_LM_OBJ_DIR}/_SuffixArrayLanguageModel.o64 : ${SA_LM_SRC_DIR}/_SuffixArrayLanguageModel.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_SHARED_OBJ_DIR}/%.o64 : ${SA_SHARED_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_INDEX_OBJ_DIR}/%.o64 : ${SA_INDEX_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_SEARCH_OBJ_DIR}/%.o64 : ${SA_SEARCH_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_SCAN_OBJ_DIR}/%.o64 : ${SA_SCAN_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_LM_OBJ_DIR}/%.o64 : ${SA_LM_APP_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+${SA_UTIL_OBJ_DIR}/%.o64 : ${SA_UTIL_SRC_DIR}/%.cpp
+ ${CC} ${CFLAGS_O64} -o $@ $<
+
+# Target to clean all the executables, objects and directories
+clean :
+ rm -rf ${OBJ_DIR}/
+ rm -rf ${BIN_DIR}/
+
+
diff --git a/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsp b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsp
new file mode 100755
index 0000000..f1cc038
--- /dev/null
+++ b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="CalcCountOfCounts" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=CalcCountOfCounts - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "CalcCountOfCounts.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "CalcCountOfCounts.mak" CFG="CalcCountOfCounts - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "CalcCountOfCounts - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "CalcCountOfCounts - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "CalcCountOfCounts - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/CalcCountOfCounts.exe"
+
+!ELSEIF "$(CFG)" == "CalcCountOfCounts - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "CalcCountOfCounts - Win32 Release"
+# Name "CalcCountOfCounts - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\Applications\CalcCountOfCounts.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsw b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsw
new file mode 100755
index 0000000..bae317b
--- /dev/null
+++ b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "CalcCountOfCounts"=".\CalcCountOfCounts.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.ncb b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.ncb
new file mode 100755
index 0000000..cc0fe64
--- /dev/null
+++ b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.ncb
Binary files differ
diff --git a/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.opt b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.opt
new file mode 100755
index 0000000..ba562e1
--- /dev/null
+++ b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.opt
Binary files differ
diff --git a/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.plg b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.plg
new file mode 100755
index 0000000..df436b5
--- /dev/null
+++ b/Distribution/Win32/CalcCountOfCounts/CalcCountOfCounts.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: CalcCountOfCounts - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12FB.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/CalcCountOfCounts.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayScan\Applications\CalcCountOfCounts.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12FB.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12FC.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/CalcCountOfCounts.pdb" /machine:I386 /out:"../../../Bin/Win32/CalcCountOfCounts.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArrayScanningBase.obj"
+".\Release\CalcCountOfCounts.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12FC.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArrayScanningBase.cpp
+CalcCountOfCounts.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+CalcCountOfCounts.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsp b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsp
new file mode 100755
index 0000000..24a8d31
--- /dev/null
+++ b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="CollectNgramFreqCount" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=CollectNgramFreqCount - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "CollectNgramFreqCount.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "CollectNgramFreqCount.mak" CFG="CollectNgramFreqCount - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "CollectNgramFreqCount - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "CollectNgramFreqCount - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "CollectNgramFreqCount - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/CollectNgramFreqCount.exe"
+
+!ELSEIF "$(CFG)" == "CollectNgramFreqCount - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/CollectNgramFreqCount.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "CollectNgramFreqCount - Win32 Release"
+# Name "CollectNgramFreqCount - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\CollectNgramFreqCount.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsw b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsw
new file mode 100755
index 0000000..0d05375
--- /dev/null
+++ b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "CollectNgramFreqCount"=".\CollectNgramFreqCount.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.ncb b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.ncb
new file mode 100755
index 0000000..e9d66bf
--- /dev/null
+++ b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.ncb
Binary files differ
diff --git a/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.opt b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.opt
new file mode 100755
index 0000000..b8d7e23
--- /dev/null
+++ b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.opt
Binary files differ
diff --git a/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.plg b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.plg
new file mode 100755
index 0000000..384f10e
--- /dev/null
+++ b/Distribution/Win32/CollectNgramFreqCount/CollectNgramFreqCount.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: CollectNgramFreqCount - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1300.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/CollectNgramFreqCount.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\Applications\CollectNgramFreqCount.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1300.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1301.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/CollectNgramFreqCount.pdb" /machine:I386 /out:"../../../Bin/Win32/CollectNgramFreqCount.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\CollectNgramFreqCount.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1301.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArraySearchApplicationBase.cpp
+CollectNgramFreqCount.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+CollectNgramFreqCount.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/EvaluateLM/EvaluateLM.dsp b/Distribution/Win32/EvaluateLM/EvaluateLM.dsp
new file mode 100755
index 0000000..d4a8168
--- /dev/null
+++ b/Distribution/Win32/EvaluateLM/EvaluateLM.dsp
@@ -0,0 +1,146 @@
+# Microsoft Developer Studio Project File - Name="EvaluateLM" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=EvaluateLM - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "EvaluateLM.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "EvaluateLM.mak" CFG="EvaluateLM - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "EvaluateLM - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "EvaluateLM - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "EvaluateLM - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/EvaluateLM.exe"
+
+!ELSEIF "$(CFG)" == "EvaluateLM - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/EvaluateLM.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "EvaluateLM - Win32 Release"
+# Name "EvaluateLM - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayLanguageModel\_SuffixArrayLanguageModel.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayLanguageModel\Applications\EvaluateLM.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayLanguageModel\_SuffixArrayLanguageModel.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/EvaluateLM/EvaluateLM.dsw b/Distribution/Win32/EvaluateLM/EvaluateLM.dsw
new file mode 100755
index 0000000..8b33192
--- /dev/null
+++ b/Distribution/Win32/EvaluateLM/EvaluateLM.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "EvaluateLM"=.\EvaluateLM.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/EvaluateLM/EvaluateLM.ncb b/Distribution/Win32/EvaluateLM/EvaluateLM.ncb
new file mode 100755
index 0000000..b632a1d
--- /dev/null
+++ b/Distribution/Win32/EvaluateLM/EvaluateLM.ncb
Binary files differ
diff --git a/Distribution/Win32/EvaluateLM/EvaluateLM.opt b/Distribution/Win32/EvaluateLM/EvaluateLM.opt
new file mode 100755
index 0000000..0c47756
--- /dev/null
+++ b/Distribution/Win32/EvaluateLM/EvaluateLM.opt
Binary files differ
diff --git a/Distribution/Win32/EvaluateLM/EvaluateLM.plg b/Distribution/Win32/EvaluateLM/EvaluateLM.plg
new file mode 100755
index 0000000..9f39fec
--- /dev/null
+++ b/Distribution/Win32/EvaluateLM/EvaluateLM.plg
@@ -0,0 +1,47 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: EvaluateLM - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12F6.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/EvaluateLM.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayLanguageModel\_SuffixArrayLanguageModel.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayLanguageModel\Applications\EvaluateLM.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12F6.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12F7.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/EvaluateLM.pdb" /machine:I386 /out:"../../../Bin/Win32/EvaluateLM.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArrayLanguageModel.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\EvaluateLM.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP12F7.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArrayLanguageModel.cpp
+_SuffixArraySearchApplicationBase.cpp
+EvaluateLM.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+EvaluateLM.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsp b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsp
new file mode 100755
index 0000000..b45411b
--- /dev/null
+++ b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="FilterDuplicatedSentences" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=FilterDuplicatedSentences - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "FilterDuplicatedSentences.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "FilterDuplicatedSentences.mak" CFG="FilterDuplicatedSentences - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "FilterDuplicatedSentences - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "FilterDuplicatedSentences - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "FilterDuplicatedSentences - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/FilterDuplicatedSentences.exe"
+
+!ELSEIF "$(CFG)" == "FilterDuplicatedSentences - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "FilterDuplicatedSentences - Win32 Release"
+# Name "FilterDuplicatedSentences - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\FilterDuplicatedSentences.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsw b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsw
new file mode 100755
index 0000000..eceef71
--- /dev/null
+++ b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "FilterDuplicatedSentences"=.\FilterDuplicatedSentences.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.ncb b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.ncb
new file mode 100755
index 0000000..6e66b27
--- /dev/null
+++ b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.ncb
Binary files differ
diff --git a/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.opt b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.opt
new file mode 100755
index 0000000..3b2cd95
--- /dev/null
+++ b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.opt
Binary files differ
diff --git a/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.plg b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.plg
new file mode 100755
index 0000000..7476099
--- /dev/null
+++ b/Distribution/Win32/FilterDuplicatedSentences/FilterDuplicatedSentences.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: FilterDuplicatedSentences - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1305.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/FilterDuplicatedSentences.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\Applications\FilterDuplicatedSentences.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1305.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1306.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/FilterDuplicatedSentences.pdb" /machine:I386 /out:"../../../Bin/Win32/FilterDuplicatedSentences.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\FilterDuplicatedSentences.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1306.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArraySearchApplicationBase.cpp
+FilterDuplicatedSentences.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+FilterDuplicatedSentences.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsp b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsp
new file mode 100755
index 0000000..ac9d3dc
--- /dev/null
+++ b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="FrequencyOfNgrams" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=FrequencyOfNgrams - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "FrequencyOfNgrams.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "FrequencyOfNgrams.mak" CFG="FrequencyOfNgrams - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "FrequencyOfNgrams - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "FrequencyOfNgrams - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "FrequencyOfNgrams - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/FrequencyOfNgrams.exe"
+
+!ELSEIF "$(CFG)" == "FrequencyOfNgrams - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/FrequencyOfNgrams.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "FrequencyOfNgrams - Win32 Release"
+# Name "FrequencyOfNgrams - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\FrequencyOfNgrams.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsw b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsw
new file mode 100755
index 0000000..6f77d53
--- /dev/null
+++ b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "FrequencyOfNgrams"=.\FrequencyOfNgrams.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.ncb b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.ncb
new file mode 100755
index 0000000..a95fdad
--- /dev/null
+++ b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.ncb
Binary files differ
diff --git a/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.opt b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.opt
new file mode 100755
index 0000000..9a113d3
--- /dev/null
+++ b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.opt
Binary files differ
diff --git a/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.plg b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.plg
new file mode 100755
index 0000000..60184b9
--- /dev/null
+++ b/Distribution/Win32/FrequencyOfNgrams/FrequencyOfNgrams.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: FrequencyOfNgrams - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP130C.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/FrequencyOfNgrams.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\Applications\FrequencyOfNgrams.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP130C.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP130D.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/FrequencyOfNgrams.pdb" /machine:I386 /out:"../../../Bin/Win32/FrequencyOfNgrams.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\FrequencyOfNgrams.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP130D.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArraySearchApplicationBase.cpp
+FrequencyOfNgrams.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+FrequencyOfNgrams.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/IndexSA/IndexSA.dsp b/Distribution/Win32/IndexSA/IndexSA.dsp
new file mode 100755
index 0000000..eb5b244
--- /dev/null
+++ b/Distribution/Win32/IndexSA/IndexSA.dsp
@@ -0,0 +1,130 @@
+# Microsoft Developer Studio Project File - Name="IndexSA" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=IndexSA - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "IndexSA.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "IndexSA.mak" CFG="IndexSA - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "IndexSA - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "IndexSA - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "IndexSA - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/IndexSA.exe"
+
+!ELSEIF "$(CFG)" == "IndexSA - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "IndexSA - Win32 Release"
+# Name "IndexSA - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\IndexSA\_MonoCorpus.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\IndexSA\IndexSA.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\IndexSA\_MonoCorpus.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/IndexSA/IndexSA.dsw b/Distribution/Win32/IndexSA/IndexSA.dsw
new file mode 100755
index 0000000..e50f943
--- /dev/null
+++ b/Distribution/Win32/IndexSA/IndexSA.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "IndexSA"=.\IndexSA.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/IndexSA/IndexSA.ncb b/Distribution/Win32/IndexSA/IndexSA.ncb
new file mode 100755
index 0000000..dcead0d
--- /dev/null
+++ b/Distribution/Win32/IndexSA/IndexSA.ncb
Binary files differ
diff --git a/Distribution/Win32/IndexSA/IndexSA.opt b/Distribution/Win32/IndexSA/IndexSA.opt
new file mode 100755
index 0000000..d581c7c
--- /dev/null
+++ b/Distribution/Win32/IndexSA/IndexSA.opt
Binary files differ
diff --git a/Distribution/Win32/IndexSA/IndexSA.plg b/Distribution/Win32/IndexSA/IndexSA.plg
new file mode 100755
index 0000000..3864401
--- /dev/null
+++ b/Distribution/Win32/IndexSA/IndexSA.plg
@@ -0,0 +1,41 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: IndexSA - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1313.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/IndexSA.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\IndexSA\_MonoCorpus.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\IndexSA\IndexSA.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1313.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1314.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/IndexSA.pdb" /machine:I386 /out:"../../../Bin/Win32/IndexSA.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_MonoCorpus.obj"
+".\Release\_String.obj"
+".\Release\IndexSA.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1314.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_MonoCorpus.cpp
+_String.cpp
+IndexSA.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+IndexSA.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.dsp b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.dsp
new file mode 100755
index 0000000..dfc1711
--- /dev/null
+++ b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.dsp
@@ -0,0 +1,122 @@
+# Microsoft Developer Studio Project File - Name="InitializeVocabulary" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=InitializeVocabulary - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "InitializeVocabulary.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "InitializeVocabulary.mak" CFG="InitializeVocabulary - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "InitializeVocabulary - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "InitializeVocabulary - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "InitializeVocabulary - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/InitializeVocabulary.exe"
+
+!ELSEIF "$(CFG)" == "InitializeVocabulary - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/InitializeVocabulary.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "InitializeVocabulary - Win32 Release"
+# Name "InitializeVocabulary - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Utils\InitializeVocabulary.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.dsw b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.dsw
new file mode 100755
index 0000000..55a8eaf
--- /dev/null
+++ b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "InitializeVocabulary"=.\InitializeVocabulary.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.ncb b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.ncb
new file mode 100755
index 0000000..d7d5535
--- /dev/null
+++ b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.ncb
Binary files differ
diff --git a/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.opt b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.opt
new file mode 100755
index 0000000..9704514
--- /dev/null
+++ b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.opt
Binary files differ
diff --git a/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.plg b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.plg
new file mode 100755
index 0000000..09ddfc7
--- /dev/null
+++ b/Distribution/Win32/InitializeVocabulary/InitializeVocabulary.plg
@@ -0,0 +1,38 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: InitializeVocabulary - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP131A.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/InitializeVocabulary.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Utils\InitializeVocabulary.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP131A.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP131B.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/InitializeVocabulary.pdb" /machine:I386 /out:"../../../Bin/Win32/InitializeVocabulary.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\InitializeVocabulary.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP131B.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+InitializeVocabulary.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+InitializeVocabulary.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsp b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsp
new file mode 100755
index 0000000..7610e6a
--- /dev/null
+++ b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="LocateEmbeddedNgramsInCorpus" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=LocateEmbeddedNgramsInCorpus - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "LocateEmbeddedNgramsInCorpus.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "LocateEmbeddedNgramsInCorpus.mak" CFG="LocateEmbeddedNgramsInCorpus - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "LocateEmbeddedNgramsInCorpus - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "LocateEmbeddedNgramsInCorpus - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "LocateEmbeddedNgramsInCorpus - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/LocateEmbeddedNgramsInCorpus.exe"
+
+!ELSEIF "$(CFG)" == "LocateEmbeddedNgramsInCorpus - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/LocateEmbeddedNgramsInCorpus.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "LocateEmbeddedNgramsInCorpus - Win32 Release"
+# Name "LocateEmbeddedNgramsInCorpus - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\LocateEmbeddedNgramsInCorpus.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsw b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsw
new file mode 100755
index 0000000..265f023
--- /dev/null
+++ b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "LocateEmbeddedNgramsInCorpus"=.\LocateEmbeddedNgramsInCorpus.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.ncb b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.ncb
new file mode 100755
index 0000000..4e8ed51
--- /dev/null
+++ b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.ncb
Binary files differ
diff --git a/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.opt b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.opt
new file mode 100755
index 0000000..6c65529
--- /dev/null
+++ b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.opt
Binary files differ
diff --git a/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.plg b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.plg
new file mode 100755
index 0000000..dec0e33
--- /dev/null
+++ b/Distribution/Win32/LocateEmbeddedNgramsInCorpus/LocateEmbeddedNgramsInCorpus.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: LocateEmbeddedNgramsInCorpus - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1323.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/LocateEmbeddedNgramsInCorpus.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\Applications\LocateEmbeddedNgramsInCorpus.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1323.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1324.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/LocateEmbeddedNgramsInCorpus.pdb" /machine:I386 /out:"../../../Bin/Win32/LocateEmbeddedNgramsInCorpus.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\LocateEmbeddedNgramsInCorpus.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1324.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArraySearchApplicationBase.cpp
+LocateEmbeddedNgramsInCorpus.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+LocateEmbeddedNgramsInCorpus.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsp b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsp
new file mode 100755
index 0000000..493205e
--- /dev/null
+++ b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="LocateNgramInCorpus" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=LocateNgramInCorpus - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "LocateNgramInCorpus.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "LocateNgramInCorpus.mak" CFG="LocateNgramInCorpus - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "LocateNgramInCorpus - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "LocateNgramInCorpus - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "LocateNgramInCorpus - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/LocateNgramInCorpus.exe"
+
+!ELSEIF "$(CFG)" == "LocateNgramInCorpus - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/LocateNgramInCorpus.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "LocateNgramInCorpus - Win32 Release"
+# Name "LocateNgramInCorpus - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\LocateNgramInCorpus.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsw b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsw
new file mode 100755
index 0000000..a1e1567
--- /dev/null
+++ b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "LocateNgramInCorpus"=.\LocateNgramInCorpus.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.ncb b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.ncb
new file mode 100755
index 0000000..afd51f3
--- /dev/null
+++ b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.ncb
Binary files differ
diff --git a/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.opt b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.opt
new file mode 100755
index 0000000..f70ea81
--- /dev/null
+++ b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.opt
Binary files differ
diff --git a/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.plg b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.plg
new file mode 100755
index 0000000..3c248e3
--- /dev/null
+++ b/Distribution/Win32/LocateNgramInCorpus/LocateNgramInCorpus.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: LocateNgramInCorpus - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP132A.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/LocateNgramInCorpus.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\Applications\LocateNgramInCorpus.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP132A.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP132B.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/LocateNgramInCorpus.pdb" /machine:I386 /out:"../../../Bin/Win32/LocateNgramInCorpus.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\LocateNgramInCorpus.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP132B.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArraySearchApplicationBase.cpp
+LocateNgramInCorpus.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+LocateNgramInCorpus.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsp b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsp
new file mode 100755
index 0000000..0d46f34
--- /dev/null
+++ b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="NGramMatchingStat4TestSet" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=NGramMatchingStat4TestSet - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "NGramMatchingStat4TestSet.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "NGramMatchingStat4TestSet.mak" CFG="NGramMatchingStat4TestSet - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "NGramMatchingStat4TestSet - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "NGramMatchingStat4TestSet - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "NGramMatchingStat4TestSet - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/NGramMatchingStat4TestSet.exe"
+
+!ELSEIF "$(CFG)" == "NGramMatchingStat4TestSet - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/NGramMatchingStat4TestSet.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "NGramMatchingStat4TestSet - Win32 Release"
+# Name "NGramMatchingStat4TestSet - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\NGramMatchingStat4TestSet.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsw b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsw
new file mode 100755
index 0000000..1fd185b
--- /dev/null
+++ b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "NGramMatchingStat4TestSet"=.\NGramMatchingStat4TestSet.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.ncb b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.ncb
new file mode 100755
index 0000000..89f1956
--- /dev/null
+++ b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.ncb
Binary files differ
diff --git a/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.opt b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.opt
new file mode 100755
index 0000000..859f567
--- /dev/null
+++ b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.opt
Binary files differ
diff --git a/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.plg b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.plg
new file mode 100755
index 0000000..de5db5d
--- /dev/null
+++ b/Distribution/Win32/NGramMatchingStat4TestSet/NGramMatchingStat4TestSet.plg
@@ -0,0 +1,16 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: NGramMatchingStat4TestSet - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+
+
+
+<h3>Results</h3>
+NGramMatchingStat4TestSet.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsp b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsp
new file mode 100755
index 0000000..a0c44c5
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="NgramMatchingFreq4Sent" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=NgramMatchingFreq4Sent - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "NgramMatchingFreq4Sent.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "NgramMatchingFreq4Sent.mak" CFG="NgramMatchingFreq4Sent - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "NgramMatchingFreq4Sent - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "NgramMatchingFreq4Sent - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "NgramMatchingFreq4Sent - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/NgramMatchingFreq4Sent.exe"
+
+!ELSEIF "$(CFG)" == "NgramMatchingFreq4Sent - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/NgramMatchingFreq4Sent.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "NgramMatchingFreq4Sent - Win32 Release"
+# Name "NgramMatchingFreq4Sent - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\NgramMatchingFreq4Sent.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsw b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsw
new file mode 100755
index 0000000..217d92a
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "NgramMatchingFreq4Sent"=.\NgramMatchingFreq4Sent.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.ncb b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.ncb
new file mode 100755
index 0000000..4cba86d
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.ncb
Binary files differ
diff --git a/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.opt b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.opt
new file mode 100755
index 0000000..d8ee671
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.opt
Binary files differ
diff --git a/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.plg b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.plg
new file mode 100755
index 0000000..2a34d1e
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreq4Sent/NgramMatchingFreq4Sent.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: NgramMatchingFreq4Sent - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1333.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/NgramMatchingFreq4Sent.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\Applications\NgramMatchingFreq4Sent.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1333.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1334.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/NgramMatchingFreq4Sent.pdb" /machine:I386 /out:"../../../Bin/Win32/NgramMatchingFreq4Sent.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\NgramMatchingFreq4Sent.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1334.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArraySearchApplicationBase.cpp
+NgramMatchingFreq4Sent.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+NgramMatchingFreq4Sent.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsp b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsp
new file mode 100755
index 0000000..819bbe3
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="NgramMatchingFreqAndNonCompositionality4Sent" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=NgramMatchingFreqAndNonCompositionality4Sent - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "NgramMatchingFreqAndNonCompositionality4Sent.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "NgramMatchingFreqAndNonCompositionality4Sent.mak" CFG="NgramMatchingFreqAndNonCompositionality4Sent - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "NgramMatchingFreqAndNonCompositionality4Sent - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "NgramMatchingFreqAndNonCompositionality4Sent - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "NgramMatchingFreqAndNonCompositionality4Sent - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/NgramMatchingFreqAndNonCompositionality4Sent.exe"
+
+!ELSEIF "$(CFG)" == "NgramMatchingFreqAndNonCompositionality4Sent - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/NgramMatchingFreqAndNonCompositionality4Sent.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "NgramMatchingFreqAndNonCompositionality4Sent - Win32 Release"
+# Name "NgramMatchingFreqAndNonCompositionality4Sent - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\NgramMatchingFreqAndNonCompositionality4Sent.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsw b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsw
new file mode 100755
index 0000000..18dd072
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "NgramMatchingFreqAndNonCompositionality4Sent"=".\NgramMatchingFreqAndNonCompositionality4Sent.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.ncb b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.ncb
new file mode 100755
index 0000000..1c3486b
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.ncb
Binary files differ
diff --git a/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.opt b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.opt
new file mode 100755
index 0000000..077b7bf
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.opt
Binary files differ
diff --git a/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.plg b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.plg
new file mode 100755
index 0000000..3aa5d04
--- /dev/null
+++ b/Distribution/Win32/NgramMatchingFreqAndNonCompositionality4Sent/NgramMatchingFreqAndNonCompositionality4Sent.plg
@@ -0,0 +1,16 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: NgramMatchingFreqAndNonCompositionality4Sent - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+
+
+
+<h3>Results</h3>
+NgramMatchingFreqAndNonCompositionality4Sent.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsp b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsp
new file mode 100755
index 0000000..430193e
--- /dev/null
+++ b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="NgramTypeInTestSetMatchedInCorpus" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=NgramTypeInTestSetMatchedInCorpus - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "NgramTypeInTestSetMatchedInCorpus.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "NgramTypeInTestSetMatchedInCorpus.mak" CFG="NgramTypeInTestSetMatchedInCorpus - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "NgramTypeInTestSetMatchedInCorpus - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "NgramTypeInTestSetMatchedInCorpus - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "NgramTypeInTestSetMatchedInCorpus - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/NgramTypeInTestSetMatchedInCorpus.exe"
+
+!ELSEIF "$(CFG)" == "NgramTypeInTestSetMatchedInCorpus - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "NgramTypeInTestSetMatchedInCorpus - Win32 Release"
+# Name "NgramTypeInTestSetMatchedInCorpus - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\Applications\NgramTypeInTestSetMatchedInCorpus.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsw b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsw
new file mode 100755
index 0000000..a024e5a
--- /dev/null
+++ b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "NgramTypeInTestSetMatchedInCorpus"=.\NgramTypeInTestSetMatchedInCorpus.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.ncb b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.ncb
new file mode 100755
index 0000000..17322e1
--- /dev/null
+++ b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.ncb
Binary files differ
diff --git a/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.opt b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.opt
new file mode 100755
index 0000000..da07052
--- /dev/null
+++ b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.opt
Binary files differ
diff --git a/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.plg b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.plg
new file mode 100755
index 0000000..0f6d44c
--- /dev/null
+++ b/Distribution/Win32/NgramTypeInTestSetMatchedInCorpus/NgramTypeInTestSetMatchedInCorpus.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: NgramTypeInTestSetMatchedInCorpus - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1348.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/NgramTypeInTestSetMatchedInCorpus.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArraySearch\Applications\NgramTypeInTestSetMatchedInCorpus.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1348.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1349.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/NgramTypeInTestSetMatchedInCorpus.pdb" /machine:I386 /out:"../../../Bin/Win32/NgramTypeInTestSetMatchedInCorpus.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArraySearchApplicationBase.obj"
+".\Release\NgramTypeInTestSetMatchedInCorpus.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1349.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArraySearchApplicationBase.cpp
+NgramTypeInTestSetMatchedInCorpus.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+NgramTypeInTestSetMatchedInCorpus.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsp b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsp
new file mode 100755
index 0000000..9201330
--- /dev/null
+++ b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsp
@@ -0,0 +1,137 @@
+# Microsoft Developer Studio Project File - Name="OutputHighFreqNgram" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=OutputHighFreqNgram - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "OutputHighFreqNgram.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "OutputHighFreqNgram.mak" CFG="OutputHighFreqNgram - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "OutputHighFreqNgram - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "OutputHighFreqNgram - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "OutputHighFreqNgram - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/OutputHighFreqNgram.exe"
+
+!ELSEIF "$(CFG)" == "OutputHighFreqNgram - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "OutputHighFreqNgram - Win32 Release"
+# Name "OutputHighFreqNgram - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\Applications\OutputHighFreqNgram.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsw b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsw
new file mode 100755
index 0000000..d9cfbde
--- /dev/null
+++ b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "OutputHighFreqNgram"=".\OutputHighFreqNgram.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.ncb b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.ncb
new file mode 100755
index 0000000..1910394
--- /dev/null
+++ b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.ncb
Binary files differ
diff --git a/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.opt b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.opt
new file mode 100755
index 0000000..83019d2
--- /dev/null
+++ b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.opt
Binary files differ
diff --git a/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.plg b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.plg
new file mode 100755
index 0000000..7eb44c4
--- /dev/null
+++ b/Distribution/Win32/OutputHighFreqNgram/OutputHighFreqNgram.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: OutputHighFreqNgram - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1351.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/OutputHighFreqNgram.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayScan\Applications\OutputHighFreqNgram.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1351.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1352.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/OutputHighFreqNgram.pdb" /machine:I386 /out:"../../../Bin/Win32/OutputHighFreqNgram.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArrayScanningBase.obj"
+".\Release\OutputHighFreqNgram.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1352.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArrayScanningBase.cpp
+OutputHighFreqNgram.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+OutputHighFreqNgram.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsp b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsp
new file mode 100755
index 0000000..5efedc2
--- /dev/null
+++ b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsp
@@ -0,0 +1,138 @@
+# Microsoft Developer Studio Project File - Name="TypeTokenFreqInCorpus" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=TypeTokenFreqInCorpus - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "TypeTokenFreqInCorpus.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "TypeTokenFreqInCorpus.mak" CFG="TypeTokenFreqInCorpus - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "TypeTokenFreqInCorpus - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "TypeTokenFreqInCorpus - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "TypeTokenFreqInCorpus - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x804 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/TypeTokenFreqInCorpus.exe"
+
+!ELSEIF "$(CFG)" == "TypeTokenFreqInCorpus - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x804 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/TypeTokenFreqInCorpus.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "TypeTokenFreqInCorpus - Win32 Release"
+# Name "TypeTokenFreqInCorpus - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\Applications\TypeTokenFreqInCorpus.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsw b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsw
new file mode 100755
index 0000000..b712001
--- /dev/null
+++ b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "TypeTokenFreqInCorpus"=".\TypeTokenFreqInCorpus.dsp" - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.ncb b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.ncb
new file mode 100755
index 0000000..fecf539
--- /dev/null
+++ b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.ncb
Binary files differ
diff --git a/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.opt b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.opt
new file mode 100755
index 0000000..d8ab043
--- /dev/null
+++ b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.opt
Binary files differ
diff --git a/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.plg b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.plg
new file mode 100755
index 0000000..35dcdf2
--- /dev/null
+++ b/Distribution/Win32/TypeTokenFreqInCorpus/TypeTokenFreqInCorpus.plg
@@ -0,0 +1,44 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: TypeTokenFreqInCorpus - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP135A.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/TypeTokenFreqInCorpus.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayScan\_SuffixArrayScanningBase.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\SuffixArrayApplications\SuffixArrayScan\Applications\TypeTokenFreqInCorpus.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP135A.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP135B.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/TypeTokenFreqInCorpus.pdb" /machine:I386 /out:"../../../Bin/Win32/TypeTokenFreqInCorpus.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_SuffixArrayApplicationBase.obj"
+".\Release\_SuffixArrayScanningBase.obj"
+".\Release\TypeTokenFreqInCorpus.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP135B.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_SuffixArrayApplicationBase.cpp
+_SuffixArrayScanningBase.cpp
+TypeTokenFreqInCorpus.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+TypeTokenFreqInCorpus.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsp b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsp
new file mode 100755
index 0000000..e4692f6
--- /dev/null
+++ b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsp
@@ -0,0 +1,130 @@
+# Microsoft Developer Studio Project File - Name="UpdateUniversalVoc" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=UpdateUniversalVoc - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "UpdateUniversalVoc.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "UpdateUniversalVoc.mak" CFG="UpdateUniversalVoc - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "UpdateUniversalVoc - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "UpdateUniversalVoc - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "UpdateUniversalVoc - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD BASE RSC /l 0x409 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /out:"../../../Bin/Win32/UpdateUniversalVoc.exe"
+
+!ELSEIF "$(CFG)" == "UpdateUniversalVoc - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x409 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /out:"../../../Bin/Win32/UpdateUniversalVoc.exe" /pdbtype:sept
+
+!ENDIF
+
+# Begin Target
+
+# Name "UpdateUniversalVoc - Win32 Release"
+# Name "UpdateUniversalVoc - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Utils\_UniversalVocabulary.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Utils\UpdateUniversalVoc.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_IDVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\_String.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Utils\_UniversalVocabulary.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\Src\Shared\salm_shared.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# End Target
+# End Project
diff --git a/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsw b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsw
new file mode 100755
index 0000000..ce291e3
--- /dev/null
+++ b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.dsw
@@ -0,0 +1,29 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "UpdateUniversalVoc"=.\UpdateUniversalVoc.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
diff --git a/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.ncb b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.ncb
new file mode 100755
index 0000000..95f10be
--- /dev/null
+++ b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.ncb
Binary files differ
diff --git a/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.opt b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.opt
new file mode 100755
index 0000000..a931ed0
--- /dev/null
+++ b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.opt
Binary files differ
diff --git a/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.plg b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.plg
new file mode 100755
index 0000000..04874e2
--- /dev/null
+++ b/Distribution/Win32/UpdateUniversalVoc/UpdateUniversalVoc.plg
@@ -0,0 +1,41 @@
+<html>
+<body>
+<pre>
+<h1>Build Log</h1>
+<h3>
+--------------------Configuration: UpdateUniversalVoc - Win32 Release--------------------
+</h3>
+<h3>Command Lines</h3>
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1365.tmp" with contents
+[
+/nologo /ML /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /Fp"Release/UpdateUniversalVoc.pch" /YX /Fo"Release/" /Fd"Release/" /FD /c
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_IDVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Shared\_String.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Utils\_UniversalVocabulary.cpp"
+"E:\SVN-working-copy\Programs\suffixArray\Release\Src\Utils\UpdateUniversalVoc.cpp"
+]
+Creating command line "cl.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1365.tmp"
+Creating temporary file "C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1366.tmp" with contents
+[
+kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /incremental:no /pdb:"Release/UpdateUniversalVoc.pdb" /machine:I386 /out:"../../../Bin/Win32/UpdateUniversalVoc.exe"
+".\Release\_IDVocabulary.obj"
+".\Release\_String.obj"
+".\Release\_UniversalVocabulary.obj"
+".\Release\UpdateUniversalVoc.obj"
+]
+Creating command line "link.exe @C:\DOCUME~1\joy\LOCALS~1\Temp\RSP1366.tmp"
+<h3>Output Window</h3>
+Compiling...
+_IDVocabulary.cpp
+_String.cpp
+_UniversalVocabulary.cpp
+UpdateUniversalVoc.cpp
+Linking...
+
+
+
+<h3>Results</h3>
+UpdateUniversalVoc.exe - 0 error(s), 0 warning(s)
+</pre>
+</body>
+</html>
diff --git a/Readme b/Readme
new file mode 100755
index 0000000..dbf8b3e
--- /dev/null
+++ b/Readme
@@ -0,0 +1,70 @@
+SALM: Suffix Array tool kit for empirical Language Manipulations.
+By Joy, joy@cs.cmu.edu
+
+1) Download the source code from: http://projectile.is.cs.cmu.edu/research/public/tools/salm/salm.htm or http://www.sourceforge.net/projects/salm
+2) Build binaries:
+ a) For Linux platform:
+ cd Distribution/Linux
+ make allO32 (for 32-bit platform)
+ or
+ make allO64 (for 64-bit platform)
+
+ Binaries are created under Bin/Linux
+
+ b) For Win32 platform
+ open project files under Distribution/Win32 and use Visual C++ to build executables.
+ Executables are placed under Bin/Win32
+
+3) Index a corpus.
+ The first step is to index a corpus using IndexSA program.
+ There is no limitation to the size of the corpus as long as there is enough RAM.
+ A corpus of N words requires 9N bytes memory during indexing.
+
+ Another constraint is that no sentence can have more than 254 words.
+
+ Synopsis of IndexSA:
+ IndexSA corpusFileName [existingIDVocabularyFile]
+
+ Optional existingIDVocabularyFile can be used to specify an existing vocabulary.
+ It will be updated if the words in the corpus are new to the existing vocabulary.
+ This is useful if several corpora want to share a common vocabulary.
+
+
+4) Applications
+ The key functions to suffix array applications are provided in class C_SuffixArraySearchApplicationBase and C_SuffixArrayScanningBase
+ Please check the documentation and API for more details.
+
+ Sample programs such as:
+
+ FrequencyOfNgrams:
+ Output the frequency of an n-gram in the training corpus
+
+ NGramMatchingStat4TestSet
+ Output the n-gram token matching statistics of a testing data
+
+ NgramTypeInTestSetMatchedInCorpus
+ Output the n-gram type matching statistics of a testing data
+
+ NgramMatchingFreq4Sent
+ Output the frequencies of all the embedded n-grams in a sentence
+
+ NgramMatchingFreqAndNonCompositionality4Sent
+ Output the non-compositionalities of the embedded n-grams in a sentence
+
+ FilterDuplicatedSentences
+ Filter out duplicated sentences in the training corpus and output the unique ones
+
+ CollectNgramFreqCount
+ Given a list of n-grams and a list of training corpora indexed by their suffix arrays, collect counts of the n-grams in these corpora. E.g. given a Chinese word list, one can collect the frequency of these words (as character n-grams) from several large corpora (segmented into characters).
+
+ CalcCountOfCounts
+ Output the count-of-counts information of a corpus
+
+ OutputHighFreqNgram
+ Specified by a configuration file, output the n-gram types that have frequencies higher than the threshold
+
+ TypeTokenFreqInCorpus
+ Output the type/token statistics of the corpus
+
+5) Questions, comments and suggestions?
+Please email joy+salm@cs.cmu.edu
diff --git a/Src/IndexSA/IndexSA.cpp b/Src/IndexSA/IndexSA.cpp
new file mode 100755
index 0000000..3013d4c
--- /dev/null
+++ b/Src/IndexSA/IndexSA.cpp
@@ -0,0 +1,58 @@
+/**
+* Main function to index a corpus according to its suffix array
+* Revision: $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <cstring>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include "_MonoCorpus.h"
+#include "salm_shared.h"
+
+using namespace std;
+
+IndexType * corpus; //because the compare function needs to see this, make it global
+TextLenType actualCorpusSize;
+
+int main(int argc, char* argv[]){
+
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:");
+ fprintf(stderr,"\n%s fileNameStem [existingIDVocFileName]\n",argv[0]);
+
+ exit(0);
+ }
+
+ C_MonoCorpus corpus;
+
+ char vocFileName[1024];
+ sprintf(vocFileName, "%s.id_voc", argv[1]);
+
+ if(argc==2){ //no existing vocabulary given
+ cerr<<"Initialize vocabulary file: "<<vocFileName<<endl;
+ corpus.initializeVocabulary(argv[1]);
+ corpus.loadCorpusAndSort(argv[1], vocFileName, true);
+ }
+ else{
+ if(strcmp(vocFileName, argv[2])!=0){
+ cerr<<"Error! ExistingIDVocFileName has to be called: "<<vocFileName<<" and cover all the words in the corpus."<<endl;
+ exit(-1);
+ }
+ corpus.loadCorpusAndSort(argv[1], argv[2], false);
+ }
+
+ corpus.output(argv[1]);
+
+ return 0;
+}
+
diff --git a/Src/IndexSA/IndexSA.cpp~ b/Src/IndexSA/IndexSA.cpp~
new file mode 100755
index 0000000..d8ad043
--- /dev/null
+++ b/Src/IndexSA/IndexSA.cpp~
@@ -0,0 +1,57 @@
+/**
+* Main function to index a corpus according to its suffix array
+* Revision: $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include "_MonoCorpus.h"
+#include "salm_shared.h"
+
+using namespace std;
+
+IndexType * corpus; //because the compare function needs to see this, make it global
+TextLenType actualCorpusSize;
+
+int main(int argc, char* argv[]){
+
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:");
+ fprintf(stderr,"\n%s fileNameStem [existingIDVocFileName]\n",argv[0]);
+
+ exit(0);
+ }
+
+ C_MonoCorpus corpus;
+
+ char vocFileName[1024];
+ sprintf(vocFileName, "%s.id_voc", argv[1]);
+
+ if(argc==2){ //no existing vocabulary given
+ cerr<<"Initialize vocabulary file: "<<vocFileName<<endl;
+ corpus.initializeVocabulary(argv[1]);
+ corpus.loadCorpusAndSort(argv[1], vocFileName, true);
+ }
+ else{
+ if(strcmp(vocFileName, argv[2])!=0){
+ cerr<<"Error! ExistingIDVocFileName has to be called: "<<vocFileName<<" and cover all the words in the corpus."<<endl;
+ exit(-1);
+ }
+ corpus.loadCorpusAndSort(argv[1], argv[2], false);
+ }
+
+ corpus.output(argv[1]);
+
+ return 0;
+}
+
diff --git a/Src/IndexSA/_MonoCorpus.cpp b/Src/IndexSA/_MonoCorpus.cpp
new file mode 100755
index 0000000..ab53813
--- /dev/null
+++ b/Src/IndexSA/_MonoCorpus.cpp
@@ -0,0 +1,440 @@
+/**
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "_MonoCorpus.h"
+#include "malloc.h"
+#include "time.h"
+
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <string>
+#include <algorithm>
+
+using namespace std;
+
+extern IndexType * corpus;
+extern TextLenType actualCorpusSize;
+
+bool operator<(const C_SuffixPointer& a, const C_SuffixPointer& b)
+{
+ bool stillEqual = true;
+ TextLenType currentPosOfA = a.pointer;
+ TextLenType currentPosOfB = b.pointer;
+
+ if(currentPosOfA==currentPosOfB){
+ return false;
+ }
+
+ while(stillEqual){
+ if(corpus[currentPosOfA]<corpus[currentPosOfB]){
+ return true;
+ }
+
+ if(corpus[currentPosOfA]>corpus[currentPosOfB]){
+ return false;
+ }
+
+ //then still equal at these two positions
+ currentPosOfA++;
+ currentPosOfB++;
+
+ if(currentPosOfA>=actualCorpusSize){
+ currentPosOfA=0;
+ }
+
+ if(currentPosOfB>=actualCorpusSize){
+ currentPosOfB=0;
+ }
+ }
+
+ //equal
+ return false;
+}
+
+
+C_SuffixPointer::C_SuffixPointer()
+{
+
+}
+
+//copy constructor
+C_SuffixPointer::C_SuffixPointer(const C_SuffixPointer & obj)
+{
+ this->pointer = obj.pointer;
+}
+
+C_SuffixPointer::~C_SuffixPointer()
+{
+
+}
+
+
+C_SuffixPointer::C_SuffixPointer(TextLenType pointer)
+{
+ this->pointer = pointer;
+}
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_MonoCorpus::C_MonoCorpus()
+{
+ this->currentPosInCorpus = 0;
+ this->maxVocIdFromCorpus = 0;
+}
+
+C_MonoCorpus::~C_MonoCorpus()
+{
+ free(corpus);
+ free(this->suffix);
+ free(this->offsetList);
+}
+
+
+/**
+* Initialize an IDVocabulary file
+**/
+void C_MonoCorpus::initializeVocabulary(char *fileNameStem)
+{
+ C_IDVocabulary tmpVoc;
+ tmpVoc.addingReservedWords();
+
+ char vocFileName[1024];
+ sprintf(vocFileName, "%s.id_voc", fileNameStem);
+
+ tmpVoc.outputToFile(vocFileName);
+}
+
+
+void C_MonoCorpus::loadCorpusAndSort(const char *fileName, const char * idVocFileName, bool vocNeedsToBeUpdated)
+{
+ IndexType id = 0;
+
+ //load vocabulary
+ this->voc = new C_IDVocabulary(idVocFileName);
+ this->vocNeedsToBeUpdated = vocNeedsToBeUpdated;
+
+ this->vocIdForSentIdPlaceHolder = this->voc->returnId(C_String("_SENT_ID_PLACEHOLDER_"));
+ if(this->vocIdForSentIdPlaceHolder==0){
+ cerr<<"ID vocabulary does not have the type _SENT_ID_PLACEHOLDER_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentIdPlaceHolder>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentIdPlaceHolder;
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"ID vocabulary does not have the type _SENTENCE_START_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentStart>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentStart;
+ }
+
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+ if(this->vocIdForSentEnd==0){
+ cerr<<"ID vocabulary does not have the type _END_OF_SENTENCE_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentEnd>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentEnd;
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"ID vocabulary does not have the type _END_OF_CORPUS_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForCorpusEnd>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForCorpusEnd;
+ }
+
+ ifstream textStream1;
+ textStream1.open(fileName);
+
+ if(textStream1==NULL){
+ fprintf(stderr,"Text %s does not exist. Exit!\n",fileName);
+ exit(-1);
+ }
+
+ long ltime1, ltime2;
+ time( &ltime1 );
+
+ string aLine;
+ unsigned int sentNumber = 1;
+ unsigned int sentLen = 0;
+ unsigned int corpusSize = 0;
+
+ char * thisToken;
+ char delimit[] =" \t\r\n";
+
+ //first, scan the corpus to estimate the size and check if each line is shorter than 256 words
+ getline(textStream1, aLine);
+ while(!textStream1.eof()){
+
+ if(aLine.length()>0){
+ sentLen = 0;
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ if(this->vocNeedsToBeUpdated){
+ id = this->voc->getId(C_String(thisToken));
+ }
+ else{ //the provided vocabulary should cover all the words in this corpus
+ id = this->voc->returnId(C_String(thisToken));
+
+ if(id==0){ //word does not exist
+ cerr<<"Vocabulary: "<<idVocFileName<<" does not cover all the words in the corpus!"<<endl;
+ cerr<<"Word: "<<thisToken<<" does not exist in the voc!\n";
+ exit(-1);
+ }
+ }
+
+
+
+ sentLen++;
+
+ if(id>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = id;
+ }
+
+ if(sentLen>=256){
+ cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n";
+ exit(-1);
+ }
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+ corpusSize+=sentLen;
+
+ sentLen = 0;
+ sentNumber++;
+ }
+ else{
+ cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this message if this is the last sentence.\n";
+ }
+ getline(textStream1, aLine);
+ }
+
+ sentNumber--;
+ unsigned int estimatedSize = corpusSize+3*sentNumber+1000; //with some redundancy
+ cerr<<sentNumber<<" sentences and "<<corpusSize<<" words in corpus\n";
+ cerr<<"Reserve "<<estimatedSize*2<<" bytes in RAM for sorting\n";
+ textStream1.close();
+
+
+ //second pass, convert the corpus into vocIDs and create suffix array
+ ifstream textStream2;
+ textStream2.open(fileName);
+
+ this->allocateMem(estimatedSize);
+ this->currentPosInCorpus = 0;
+ sentNumber = 1;
+
+ getline(textStream2, aLine);
+ while(!textStream2.eof()){
+
+ if(aLine.length()>0){
+ sentLen = 0;
+
+ //add sentId
+ //offset at this position will store the acutal sentence length
+ corpus[this->currentPosInCorpus]=this->vocIdForSentIdPlaceHolder;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->currentPosInCorpus++;
+
+ //add <s>
+ sentLen++; //not real sentence length, but to keep track of offset
+ corpus[this->currentPosInCorpus]=this->vocIdForSentStart;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen;
+ this->currentPosInCorpus++;
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ id = this->voc->returnId(C_String(thisToken));
+ if(id==0){
+ cerr<<"Word \""<<thisToken<<"\" is not listed in the IDVocabulary.\n";
+ exit(-1);
+ }
+
+ sentLen++;
+
+ if(id>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = id;
+ }
+
+ corpus[this->currentPosInCorpus]=id;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen;
+ this->currentPosInCorpus++;
+
+ if(sentLen>=256){
+ cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n";
+ exit(-1);
+ }
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+
+ //add <sentEnd>
+ corpus[this->currentPosInCorpus]=this->vocIdForSentEnd;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) (sentLen + 1);
+ this->offsetList[this->currentPosInCorpus - sentLen - 1] = (unsigned char) (sentLen-1); //write the sentLen to sent begin correspond to <sentId>
+ this->currentPosInCorpus++;
+
+ sentLen = 0;
+ sentNumber++;
+ }
+ else{
+ cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this if this is the last sentence.\n";
+ }
+
+ aLine[0]=0;
+ getline(textStream2, aLine);
+ }
+ textStream2.close();
+
+ //add <endOfCorpus> to the end of data
+ corpus[this->currentPosInCorpus]=this->vocIdForCorpusEnd;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) 0;
+ this->currentPosInCorpus++;
+
+ actualCorpusSize = this->currentPosInCorpus;
+
+ time( &ltime2 );
+ cerr<<"\nCorpus loaded in: "<<ltime2-ltime1<<" seconds."<<endl;
+ cerr<<"Total "<<sentNumber-1<<" sentences loaded.\n";
+
+ //replace the sentId place holder to actual sentId
+ time( &ltime1 );
+ cerr<<"Inserting sentence IDs into the corpus...\n";
+ IndexType sentId = this->maxVocIdFromCorpus+1;
+ for(TextLenType i=0;i<actualCorpusSize;i++){
+ if(corpus[i]==this->vocIdForSentIdPlaceHolder){
+ corpus[i]=sentId;
+ sentId++;
+ }
+ }
+ time( &ltime2 );
+ cerr<<"\nSentence IDs inserted in: "<<ltime2-ltime1<<" seconds."<<endl;
+
+ //sorting
+ time( &ltime1 );
+ cerr<<"Sorting the suffix...\n";
+ sort(this->suffix, this->suffix+actualCorpusSize);
+ time( &ltime2 );
+ cerr<<"\nCorpus sorted in: "<<ltime2-ltime1<<" seconds."<<endl;
+ cerr<<"Done."<<endl;
+
+}
+
+void C_MonoCorpus::allocateMem(TextLenType corpusSize)
+{
+ corpus = (IndexType *) malloc(sizeof(IndexType)*corpusSize);
+
+ if(corpus==0){
+ cerr<<"Failed to allocate memory for corpus. Quit!\n";
+ exit(-1);
+ }
+
+ this->suffix = (C_SuffixPointer *) malloc(sizeof(C_SuffixPointer)*corpusSize);
+ if(this->suffix==0){
+ cerr<<"Failed to allocate memory for suffix. Quit!\n";
+ exit(-1);
+ }
+
+ this->offsetList = (unsigned char *) malloc(sizeof(unsigned char)*corpusSize);
+ if(this->offsetList==0){
+ cerr<<"Failed to allocate memory for offset. Quit!\n";
+ exit(-1);
+ }
+
+}
+
+
+void C_MonoCorpus::outputCorpus(char *filename)
+{
+ cerr<<"Writing corpus to file: "<<filename<<endl;
+ ofstream textOutStream;
+ textOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ textOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0; i<actualCorpusSize;i++){
+ textOutStream.write((char *)&(corpus[i]), sizeof(IndexType));
+ }
+
+ textOutStream.close();
+
+}
+
+void C_MonoCorpus::outputOffset(char *filename)
+{
+ cerr<<"Writing offset to file: "<<filename<<endl;
+
+ ofstream offsetOutStream;
+ offsetOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ offsetOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0; i<actualCorpusSize; i++){
+ offsetOutStream.write((char *)& (this->offsetList[i]), sizeof(unsigned char));
+ }
+ offsetOutStream.close();
+}
+
+void C_MonoCorpus::outputSuffix(char *filename)
+{
+ cerr<<"Writing suffix information to file: "<<filename<<endl;
+
+ ofstream saOutStream;
+ saOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ saOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0;i<actualCorpusSize; i++){
+ saOutStream.write((char *) & (this->suffix[i].pointer), sizeof(TextLenType));
+ }
+
+ saOutStream.close();
+}
+
+void C_MonoCorpus::output(char *filename)
+{
+ char outputVocFileName[1024];
+ char outputCorpusFileName[1024];
+ char outputOffsetFileName[1024];
+ char outputSuffixFileName[1024];
+
+
+ if(this->vocNeedsToBeUpdated){
+ sprintf(outputVocFileName, "%s.id_voc", filename);
+ this->voc->outputToFile(outputVocFileName);
+ }
+
+ sprintf(outputCorpusFileName, "%s.sa_corpus", filename);
+ sprintf(outputOffsetFileName, "%s.sa_offset", filename);
+ sprintf(outputSuffixFileName, "%s.sa_suffix", filename);
+
+
+ this->outputCorpus(outputCorpusFileName);
+ this->outputOffset(outputOffsetFileName);
+ this->outputSuffix(outputSuffixFileName);
+}
+
diff --git a/Src/IndexSA/_MonoCorpus.cpp~ b/Src/IndexSA/_MonoCorpus.cpp~
new file mode 100755
index 0000000..3e3a29b
--- /dev/null
+++ b/Src/IndexSA/_MonoCorpus.cpp~
@@ -0,0 +1,439 @@
+/**
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "_MonoCorpus.h"
+#include "malloc.h"
+#include "time.h"
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <algorithm>
+
+using namespace std;
+
+extern IndexType * corpus;
+extern TextLenType actualCorpusSize;
+
+bool operator<(const C_SuffixPointer& a, const C_SuffixPointer& b)
+{
+ bool stillEqual = true;
+ TextLenType currentPosOfA = a.pointer;
+ TextLenType currentPosOfB = b.pointer;
+
+ if(currentPosOfA==currentPosOfB){
+ return false;
+ }
+
+ while(stillEqual){
+ if(corpus[currentPosOfA]<corpus[currentPosOfB]){
+ return true;
+ }
+
+ if(corpus[currentPosOfA]>corpus[currentPosOfB]){
+ return false;
+ }
+
+ //then still equal at these two positions
+ currentPosOfA++;
+ currentPosOfB++;
+
+ if(currentPosOfA>=actualCorpusSize){
+ currentPosOfA=0;
+ }
+
+ if(currentPosOfB>=actualCorpusSize){
+ currentPosOfB=0;
+ }
+ }
+
+ //equal
+ return false;
+}
+
+
+C_SuffixPointer::C_SuffixPointer()
+{
+
+}
+
+//copy constructor
+C_SuffixPointer::C_SuffixPointer(const C_SuffixPointer & obj)
+{
+ this->pointer = obj.pointer;
+}
+
+C_SuffixPointer::~C_SuffixPointer()
+{
+
+}
+
+
+C_SuffixPointer::C_SuffixPointer(TextLenType pointer)
+{
+ this->pointer = pointer;
+}
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_MonoCorpus::C_MonoCorpus()
+{
+ this->currentPosInCorpus = 0;
+ this->maxVocIdFromCorpus = 0;
+}
+
+C_MonoCorpus::~C_MonoCorpus()
+{
+ free(corpus);
+ free(this->suffix);
+ free(this->offsetList);
+}
+
+
+/**
+* Initialize an IDVocabulary file
+**/
+void C_MonoCorpus::initializeVocabulary(char *fileNameStem)
+{
+ C_IDVocabulary tmpVoc;
+ tmpVoc.addingReservedWords();
+
+ char vocFileName[1024];
+ sprintf(vocFileName, "%s.id_voc", fileNameStem);
+
+ tmpVoc.outputToFile(vocFileName);
+}
+
+
+void C_MonoCorpus::loadCorpusAndSort(const char *fileName, const char * idVocFileName, bool vocNeedsToBeUpdated)
+{
+ IndexType id = 0;
+
+ //load vocabulary
+ this->voc = new C_IDVocabulary(idVocFileName);
+ this->vocNeedsToBeUpdated = vocNeedsToBeUpdated;
+
+ this->vocIdForSentIdPlaceHolder = this->voc->returnId(C_String("_SENT_ID_PLACEHOLDER_"));
+ if(this->vocIdForSentIdPlaceHolder==0){
+ cerr<<"ID vocabulary does not have the type _SENT_ID_PLACEHOLDER_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentIdPlaceHolder>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentIdPlaceHolder;
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"ID vocabulary does not have the type _SENTENCE_START_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentStart>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentStart;
+ }
+
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+ if(this->vocIdForSentEnd==0){
+ cerr<<"ID vocabulary does not have the type _END_OF_SENTENCE_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentEnd>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentEnd;
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"ID vocabulary does not have the type _END_OF_CORPUS_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForCorpusEnd>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForCorpusEnd;
+ }
+
+ ifstream textStream1;
+ textStream1.open(fileName);
+
+ if(textStream1==NULL){
+ fprintf(stderr,"Text %s does not exist. Exit!\n",fileName);
+ exit(-1);
+ }
+
+ long ltime1, ltime2;
+ time( &ltime1 );
+
+ string aLine;
+ unsigned int sentNumber = 1;
+ unsigned int sentLen = 0;
+ unsigned int corpusSize = 0;
+
+ char * thisToken;
+ char delimit[] =" \t\r\n";
+
+ //first, scan the corpus to estimate the size and check if each line is shorter than 256 words
+ getline(textStream1, aLine);
+ while(!textStream1.eof()){
+
+ if(aLine.length()>0){
+ sentLen = 0;
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ if(this->vocNeedsToBeUpdated){
+ id = this->voc->getId(C_String(thisToken));
+ }
+ else{ //the provided vocabulary should cover all the words in this corpus
+ id = this->voc->returnId(C_String(thisToken));
+
+ if(id==0){ //word does not exist
+ cerr<<"Vocabulary: "<<idVocFileName<<" does not cover all the words in the corpus!"<<endl;
+ cerr<<"Word: "<<thisToken<<" does not exist in the voc!\n";
+ exit(-1);
+ }
+ }
+
+
+
+ sentLen++;
+
+ if(id>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = id;
+ }
+
+ if(sentLen>=256){
+ cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n";
+ exit(-1);
+ }
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+ corpusSize+=sentLen;
+
+ sentLen = 0;
+ sentNumber++;
+ }
+ else{
+ cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this message if this is the last sentence.\n";
+ }
+ getline(textStream1, aLine);
+ }
+
+ sentNumber--;
+ unsigned int estimatedSize = corpusSize+3*sentNumber+1000; //with some redundancy
+ cerr<<sentNumber<<" sentences and "<<corpusSize<<" words in corpus\n";
+ cerr<<"Reserve "<<estimatedSize*2<<" bytes in RAM for sorting\n";
+ textStream1.close();
+
+
+ //second pass, convert the corpus into vocIDs and create suffix array
+ ifstream textStream2;
+ textStream2.open(fileName);
+
+ this->allocateMem(estimatedSize);
+ this->currentPosInCorpus = 0;
+ sentNumber = 1;
+
+ getline(textStream2, aLine);
+ while(!textStream2.eof()){
+
+ if(aLine.length()>0){
+ sentLen = 0;
+
+ //add sentId
+ //offset at this position will store the acutal sentence length
+ corpus[this->currentPosInCorpus]=this->vocIdForSentIdPlaceHolder;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->currentPosInCorpus++;
+
+ //add <s>
+ sentLen++; //not real sentence length, but to keep track of offset
+ corpus[this->currentPosInCorpus]=this->vocIdForSentStart;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen;
+ this->currentPosInCorpus++;
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ id = this->voc->returnId(C_String(thisToken));
+ if(id==0){
+ cerr<<"Word \""<<thisToken<<"\" is not listed in the IDVocabulary.\n";
+ exit(-1);
+ }
+
+ sentLen++;
+
+ if(id>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = id;
+ }
+
+ corpus[this->currentPosInCorpus]=id;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen;
+ this->currentPosInCorpus++;
+
+ if(sentLen>=256){
+ cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n";
+ exit(-1);
+ }
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+
+ //add <sentEnd>
+ corpus[this->currentPosInCorpus]=this->vocIdForSentEnd;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) (sentLen + 1);
+ this->offsetList[this->currentPosInCorpus - sentLen - 1] = (unsigned char) (sentLen-1); //write the sentLen to sent begin correspond to <sentId>
+ this->currentPosInCorpus++;
+
+ sentLen = 0;
+ sentNumber++;
+ }
+ else{
+ cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this if this is the last sentence.\n";
+ }
+
+ aLine[0]=0;
+ getline(textStream2, aLine);
+ }
+ textStream2.close();
+
+ //add <endOfCorpus> to the end of data
+ corpus[this->currentPosInCorpus]=this->vocIdForCorpusEnd;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) 0;
+ this->currentPosInCorpus++;
+
+ actualCorpusSize = this->currentPosInCorpus;
+
+ time( &ltime2 );
+ cerr<<"\nCorpus loaded in: "<<ltime2-ltime1<<" seconds."<<endl;
+ cerr<<"Total "<<sentNumber-1<<" sentences loaded.\n";
+
+ //replace the sentId place holder to actual sentId
+ time( &ltime1 );
+ cerr<<"Inserting sentence IDs into the corpus...\n";
+ IndexType sentId = this->maxVocIdFromCorpus+1;
+ for(TextLenType i=0;i<actualCorpusSize;i++){
+ if(corpus[i]==this->vocIdForSentIdPlaceHolder){
+ corpus[i]=sentId;
+ sentId++;
+ }
+ }
+ time( &ltime2 );
+ cerr<<"\nSentence IDs inserted in: "<<ltime2-ltime1<<" seconds."<<endl;
+
+ //sorting
+ time( &ltime1 );
+ cerr<<"Sorting the suffix...\n";
+ sort(this->suffix, this->suffix+actualCorpusSize);
+ time( &ltime2 );
+ cerr<<"\nCorpus sorted in: "<<ltime2-ltime1<<" seconds."<<endl;
+ cerr<<"Done."<<endl;
+
+}
+
+void C_MonoCorpus::allocateMem(TextLenType corpusSize)
+{
+ corpus = (IndexType *) malloc(sizeof(IndexType)*corpusSize);
+
+ if(corpus==0){
+ cerr<<"Failed to allocate memory for corpus. Quit!\n";
+ exit(-1);
+ }
+
+ this->suffix = (C_SuffixPointer *) malloc(sizeof(C_SuffixPointer)*corpusSize);
+ if(this->suffix==0){
+ cerr<<"Failed to allocate memory for suffix. Quit!\n";
+ exit(-1);
+ }
+
+ this->offsetList = (unsigned char *) malloc(sizeof(unsigned char)*corpusSize);
+ if(this->offsetList==0){
+ cerr<<"Failed to allocate memory for offset. Quit!\n";
+ exit(-1);
+ }
+
+}
+
+
+void C_MonoCorpus::outputCorpus(char *filename)
+{
+ cerr<<"Writing corpus to file: "<<filename<<endl;
+ ofstream textOutStream;
+ textOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ textOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0; i<actualCorpusSize;i++){
+ textOutStream.write((char *)&(corpus[i]), sizeof(IndexType));
+ }
+
+ textOutStream.close();
+
+}
+
+void C_MonoCorpus::outputOffset(char *filename)
+{
+ cerr<<"Writing offset to file: "<<filename<<endl;
+
+ ofstream offsetOutStream;
+ offsetOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ offsetOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0; i<actualCorpusSize; i++){
+ offsetOutStream.write((char *)& (this->offsetList[i]), sizeof(unsigned char));
+ }
+ offsetOutStream.close();
+}
+
+void C_MonoCorpus::outputSuffix(char *filename)
+{
+ cerr<<"Writing suffix information to file: "<<filename<<endl;
+
+ ofstream saOutStream;
+ saOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ saOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0;i<actualCorpusSize; i++){
+ saOutStream.write((char *) & (this->suffix[i].pointer), sizeof(TextLenType));
+ }
+
+ saOutStream.close();
+}
+
+void C_MonoCorpus::output(char *filename)
+{
+ char outputVocFileName[1024];
+ char outputCorpusFileName[1024];
+ char outputOffsetFileName[1024];
+ char outputSuffixFileName[1024];
+
+
+ if(this->vocNeedsToBeUpdated){
+ sprintf(outputVocFileName, "%s.id_voc", filename);
+ this->voc->outputToFile(outputVocFileName);
+ }
+
+ sprintf(outputCorpusFileName, "%s.sa_corpus", filename);
+ sprintf(outputOffsetFileName, "%s.sa_offset", filename);
+ sprintf(outputSuffixFileName, "%s.sa_suffix", filename);
+
+
+ this->outputCorpus(outputCorpusFileName);
+ this->outputOffset(outputOffsetFileName);
+ this->outputSuffix(outputSuffixFileName);
+}
+
diff --git a/Src/IndexSA/_MonoCorpus.h b/Src/IndexSA/_MonoCorpus.h
new file mode 100755
index 0000000..4c834b0
--- /dev/null
+++ b/Src/IndexSA/_MonoCorpus.h
@@ -0,0 +1,60 @@
+#if !defined(__MonoCorpus__H__INCLUDED_)
+#define __MonoCorpus__H__INCLUDED_
+
+#include "_IDVocabulary.h"
+#include "salm_shared.h"
+
+/**
+* \ingroup index
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+* Defines the wrapper class for the comparing function
+**/
+class C_SuffixPointer
+{
+public:
+ C_SuffixPointer(const C_SuffixPointer &);
+ C_SuffixPointer();
+ ~C_SuffixPointer();
+ C_SuffixPointer(TextLenType pointer);
+ TextLenType pointer;
+};
+
+/**
+* \ingroup index
+* Monolingual corpus class for loading the corpus from file, sort it according to the suffix array order
+* and convert it to the binary format for suffix array applications
+**/
+class C_MonoCorpus
+{
+public:
+ void initializeVocabulary(char * fileNameStem);
+ void output(char * filename);
+ void loadCorpusAndSort(const char * fileName, const char * idVocFileName, bool vocNeedsToBeUpdated);
+
+ C_MonoCorpus();
+ virtual ~C_MonoCorpus();
+
+private:
+ IndexType maxVocIdFromCorpus;
+ void outputSuffix(char * filename);
+ void outputOffset(char * filename);
+ void outputCorpus(char * filename);
+
+ IndexType vocIdForSentIdPlaceHolder;
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+ TextLenType currentPosInCorpus;
+ void allocateMem(TextLenType corpusSize);
+
+ C_SuffixPointer * suffix;
+ unsigned char * offsetList;
+ C_IDVocabulary * voc;
+
+ bool vocNeedsToBeUpdated;
+
+};
+
+#endif // !defined(__MonoCorpus__H__INCLUDED_)
diff --git a/Src/SALM-API-Description.txt b/Src/SALM-API-Description.txt
new file mode 100755
index 0000000..c36f60c
--- /dev/null
+++ b/Src/SALM-API-Description.txt
@@ -0,0 +1,24 @@
+/**
+* \defgroup index Indexing the corpus
+* \defgroup search Search Applications
+* \defgroup scan Scan Applications
+* \defgroup lm Suffix Array Language Model
+* \defgroup utils Utilities
+*
+* \mainpage SALM API Documentation
+* Author: <a href=mailto:joy+salm@cs.cmu.edu > Ying (Joy) Zhang </a>
+* \section intro Introduction
+*
+* There are three main modules in <a href=http://projectile.is.cs.cmu.edu/research/public/tools/salm/salm.htm > SALM </a> : Indexing, Searching and Scanning.
+* To start, use IndexSA to index the corpus according to its suffix array.
+* This is the first step for all applications.
+* Once the corpus is indexed, we can use SALM to carry out all kinds of interesting processing on this corpus.
+* \section search Applications based on searching the corpus
+* These applications search for the occurrences of an n-gram or all the embedded n-grams of a sentence in the corpus.
+* \section scan Applications based on scanning the corpus
+* These applications scan through the corpus in linear time and collect information such as the type/token frequency of the n-grams in the data.
+* \section lm Suffix Array Language Model
+* An online language model based on the suffix array indexing. Suffix array language model can use arbitrarily long history and very large corpus.
+* \section utils Utilities
+* Utility functions such as updating the universal ID vocabulary after observing a new corpus
+**/
diff --git a/Src/Shared/_IDVocabulary.cpp b/Src/Shared/_IDVocabulary.cpp
new file mode 100755
index 0000000..a34b043
--- /dev/null
+++ b/Src/Shared/_IDVocabulary.cpp
@@ -0,0 +1,219 @@
+/**
+* _IDVocabulary.cpp: implementation of the C_IDVocabulary class.
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+
+#include "_IDVocabulary.h"
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <memory.h>
+#include <stdlib.h>
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+/// Default constructor: starts with an empty vocabulary (max vocID 0).
+C_IDVocabulary::C_IDVocabulary()
+{
+	this->maxIdInVoc = 0;
+}
+
+/// Construct the vocabulary and load word/ID pairs from the given file.
+C_IDVocabulary::C_IDVocabulary(const char * fileName)
+{
+
+	this->maxIdInVoc = 0;
+
+	this->loadFromFile(fileName);
+}
+
+/// Destructor: the std::map members clean up after themselves.
+C_IDVocabulary::~C_IDVocabulary()
+{
+
+}
+
+/// Return the vocID of word "text" if it exists in the vocabulary,
+/// otherwise return 0 (the ID reserved for <unk>). Pure lookup: the
+/// vocabulary is never modified (contrast with getId()).
+IndexType C_IDVocabulary::returnId(C_String text)
+{
+	IndexType id;
+
+	map<C_String, IndexType, ltstr>::iterator iterText2Id;
+	iterText2Id = this->text2id.find(text);
+
+	if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk>
+		id = 0;
+	}
+	else{
+		id = iterText2Id->second;
+	}
+
+	return id;
+}
+
+/// Return the text of the word given its vocID;
+/// return "<UNK>" if the specified vocID does not exist.
+C_String C_IDVocabulary::getText(IndexType id)
+{
+	map<IndexType, C_String>::iterator iterId2Text;
+	iterId2Text = this->id2text.find(id);
+
+	if(iterId2Text==this->id2text.end()){
+		return C_String("<UNK>");
+	}
+
+	return iterId2Text->second;
+}
+
+/// Number of distinct word types currently in the vocabulary.
+/// Note: map::size() (size_t) is narrowed into IndexType (unsigned int).
+IndexType C_IDVocabulary::getSize()
+{
+	return this->text2id.size();
+}
+
+
+/// Load the vocabulary file into memory.
+/// The vocabulary file contains one "word<whitespace>vocID" pair per line.
+/// Malformed lines (anything other than exactly two tokens) are reported
+/// and skipped. Updates maxIdInVoc with the largest vocID seen.
+void C_IDVocabulary::loadFromFile(const char *fileName)
+{
+
+	ifstream existingVocFile;
+	existingVocFile.open(fileName);
+
+	if(!existingVocFile){
+		cerr<<"Can not open existing vocabulary file "<<fileName<<endl;
+		exit(-1);	//exit(0) would report success to the calling shell
+	}
+
+	cerr<<"Loading existing vocabulary file: "<<fileName<<endl;
+
+	char aLine[1024];
+	char * aToken;
+	char delimit[] = " \t\r\n";
+	IndexType vocId = 0;
+
+	while(!existingVocFile.eof()){
+		existingVocFile.getline(aLine, 1024, '\n');
+
+		if(strlen(aLine)>0){ //a meaningful word, esp for the last line during reading file
+			vector<C_String> tokensInLine;
+
+			aToken = strtok(aLine, delimit);
+			while( aToken != NULL ) {
+				tokensInLine.push_back(C_String(aToken));
+				aToken = strtok( NULL, delimit);
+			}
+
+			if(tokensInLine.size()!=2){
+				//the previous code fell through and indexed tokensInLine[1],
+				//which is out of bounds for a malformed line; skip it instead
+				cerr<<"Not valid format for Vocabulary: "<<aLine<<endl;
+				aLine[0]=0;
+				continue;
+			}
+
+			vocId = atoi(tokensInLine[1].toString());
+
+			if(vocId>this->maxIdInVoc){
+				this->maxIdInVoc = vocId;
+			}
+
+			this->text2id.insert(make_pair(tokensInLine[0], vocId));
+			this->id2text.insert(make_pair(vocId, tokensInLine[0] ));
+
+		}
+
+		aLine[0]=0;
+	}
+	cerr<<"Total "<<this->text2id.size()<<" word types loaded\n";
+	cerr<<"Max VocID="<<this->maxIdInVoc<<endl;
+}
+
+/// Return the maximum ID from all words in the vocabulary.
+/// Usually equals the size of the vocabulary if the vocabulary was created from this corpus only.
+/// If the vocabulary includes words from other corpora and only lists words in this corpus,
+/// then the max vocID can differ from the vocabulary size.
+IndexType C_IDVocabulary::returnMaxID()
+{
+	return this->maxIdInVoc;
+}
+
+/// vocID 0 is reserved for the null/unknown word (see returnId()).
+IndexType C_IDVocabulary::returnNullWordID()
+{
+	return 0;
+}
+
+/**
+* Output the vocabulary to a file, one "word<tab>vocID" line per entry,
+* in word-sorted order (the iteration order of text2id).
+**/
+void C_IDVocabulary::outputToFile(char *filename)
+{
+
+	ofstream outputVocFile;
+	outputVocFile.open(filename);
+
+	if(!outputVocFile){
+		cerr<<"Can not open "<<filename<<" to write vocabulary\n";
+		exit(-1);
+	}
+
+	map<C_String, IndexType, ltstr>::iterator iterText2Id;
+
+	iterText2Id = this->text2id.begin();
+	while(iterText2Id!=this->text2id.end()){
+		outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl;
+		iterText2Id++;
+	}
+
+	outputVocFile.close();
+}
+
+/// Reserve vocIDs 1..NUMBER_OF_RESERVED_WORDS_IN_VOC for special words that might be useful for applications.
+/// Here we reserved 5 words:
+///		_SENT_ID_PLACEHOLDER_	1
+///		_END_OF_SENTENCE_	2
+///		_TOO_LONG_TOKEN_	3
+///		_SENTENCE_START_	4
+///		_END_OF_CORPUS_		5
+/// You can add other special words to the list as long as the assignment of vocID and its interpretation is consistent between application and indexing
+void C_IDVocabulary::addingReservedWords()
+{
+	this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1);
+	this->insertWord(C_String("_END_OF_SENTENCE_"), 2);
+	this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3);
+	this->insertWord(C_String("_SENTENCE_START_"), 4);
+	this->insertWord(C_String("_END_OF_CORPUS_"), 5);
+
+	//"_RESERVED_WORDS_" (16 chars) + up to 3 digits + '\0' needs exactly 20
+	//bytes; use a larger buffer so that raising NUMBER_OF_RESERVED_WORDS_IN_VOC
+	//above 999 cannot overflow the sprintf below
+	char reservedWord[32];
+	for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){
+		memset(reservedWord, 0, 32);
+		sprintf(reservedWord, "_RESERVED_WORDS_%d", i);
+		this->insertWord(C_String(reservedWord), i);
+	}
+}
+
+/// Insert the (word, id) pair into both direction mappings.
+/// map::insert leaves an already-existing key unchanged, so duplicates
+/// are silently ignored rather than overwritten.
+void C_IDVocabulary::insertWord(C_String text, IndexType id)
+{
+	this->text2id.insert(make_pair(text, id));
+	this->id2text.insert(make_pair(id, text));
+
+}
+
+/**
+* Check if the word already exists in the voc:
+* if so, return the vocID of the word,
+* otherwise assign the next unused ID (maxIdInVoc+1) to this word
+* and insert it into the voc.
+**/
+IndexType C_IDVocabulary::getId(C_String text)
+{
+	IndexType id = this->returnId(text);
+	if(id==0){
+		this->maxIdInVoc++;
+		this->insertWord(text, this->maxIdInVoc);
+		return this->maxIdInVoc;
+	}
+
+	//else, already exist
+	return id;
+}
diff --git a/Src/Shared/_IDVocabulary.cpp~ b/Src/Shared/_IDVocabulary.cpp~
new file mode 100755
index 0000000..d5e6a14
--- /dev/null
+++ b/Src/Shared/_IDVocabulary.cpp~
@@ -0,0 +1,218 @@
+/**
+* _IDVocabulary.cpp: implementation of the C_IDVocabulary class.
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+
+#include "_IDVocabulary.h"
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <memory.h>
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_IDVocabulary::C_IDVocabulary()
+{
+ this->maxIdInVoc = 0;
+}
+
+C_IDVocabulary::C_IDVocabulary(const char * fileName)
+{
+
+ this->maxIdInVoc = 0;
+
+ this->loadFromFile(fileName);
+}
+
+C_IDVocabulary::~C_IDVocabulary()
+{
+
+}
+
+/// Return the vocID of word "text" if it exist in the vocabulary
+/// Otherwise return 0
+IndexType C_IDVocabulary::returnId(C_String text)
+{
+ IndexType id;
+
+ map<C_String, IndexType, ltstr>::iterator iterText2Id;
+ iterText2Id = this->text2id.find(text);
+
+ if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk>
+ id = 0;
+ }
+ else{
+ id = iterText2Id->second;
+ }
+
+ return id;
+}
+
+/// Return the text of the word given its vocID
+/// return <UNK> if specified vocID does not exist
+C_String C_IDVocabulary::getText(IndexType id)
+{
+ map<IndexType, C_String>::iterator iterId2Text;
+ iterId2Text = this->id2text.find(id);
+
+ if(iterId2Text==this->id2text.end()){
+ return C_String("<UNK>");
+ }
+
+ return iterId2Text->second;
+}
+
+IndexType C_IDVocabulary::getSize()
+{
+ return this->text2id.size();
+}
+
+
+/// Load the vocabulary file into memory
+/// The format of the vocabulary file is:
+/// word vocID
+// in each line.
+void C_IDVocabulary::loadFromFile(const char *fileName)
+{
+
+ ifstream existingVocFile;
+ existingVocFile.open(fileName);
+
+ if(!existingVocFile){
+ cerr<<"Can not open existing vocabulary file "<<fileName<<endl;
+ exit(0);
+ }
+
+ cerr<<"Loading existing vocabulary file: "<<fileName<<endl;
+
+ char aLine[1024];
+ char * aToken;
+ char delimit[] = " \t\r\n";
+ IndexType vocId = 0;
+
+ while(!existingVocFile.eof()){
+ existingVocFile.getline(aLine, 1024, '\n');
+
+ if(strlen(aLine)>0){ //a meaningful word, esp for the last line during reading file
+ vector<C_String> tokensInLine;
+
+ aToken = strtok(aLine, delimit);
+ while( aToken != NULL ) {
+ tokensInLine.push_back(C_String(aToken));
+ aToken = strtok( NULL, delimit);
+ }
+
+ if(tokensInLine.size()!=2){
+ cerr<<"Not valid format for Vocabulary: "<<aLine<<endl;
+ }
+
+ vocId = atoi(tokensInLine[1].toString());
+
+ if(vocId>this->maxIdInVoc){
+ this->maxIdInVoc = vocId;
+ }
+
+ this->text2id.insert(make_pair(tokensInLine[0], vocId));
+ this->id2text.insert(make_pair(vocId, tokensInLine[0] ));
+
+ }
+
+ aLine[0]=0;
+ }
+ cerr<<"Total "<<this->text2id.size()<<" word types loaded\n";
+ cerr<<"Max VocID="<<this->maxIdInVoc<<endl;
+}
+
+/// Return the maximum ID from all words in the vocabulary
+/// Usually equals to the size of the vocabulary if the vocabulary is created from this corpus only.
+/// If the vocabulary includes words from other corpora and the vocabulary only lists words in this corpus,
+/// then max voc ID could be different from the vocabulary size
+IndexType C_IDVocabulary::returnMaxID()
+{
+ return this->maxIdInVoc;
+}
+
+IndexType C_IDVocabulary::returnNullWordID()
+{
+ return 0;
+}
+
+/**
+* Output the vocabulary to a file
+**/
+void C_IDVocabulary::outputToFile(char *filename)
+{
+
+ ofstream outputVocFile;
+ outputVocFile.open(filename);
+
+ if(!outputVocFile){
+ cerr<<"Can not open "<<filename<<" to write vocabulary\n";
+ exit(-1);
+ }
+
+ map<C_String, IndexType, ltstr>::iterator iterText2Id;
+
+ iterText2Id = this->text2id.begin();
+ while(iterText2Id!=this->text2id.end()){
+ outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl;
+ iterText2Id++;
+ }
+
+ outputVocFile.close();
+}
+
+/// Reserver vocID 0-NUMBER_OF_RESERVED_WORDS_IN_VOC for special words that might be useful for applications
+/// Here we reserved 5 words:
+/// _SENT_ID_PLACEHOLDER_ 1
+/// _END_OF_SENTENCE_ 2
+/// _TOO_LONG_TOKEN_ 3
+/// _SENTENCE_START_ 4
+/// _END_OF_CORPUS_ 5
+/// You can add other special words to the list as long as the assignment of vocID and its interpretation is consistent between application and indexing
+void C_IDVocabulary::addingReservedWords()
+{
+ this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1);
+ this->insertWord(C_String("_END_OF_SENTENCE_"), 2);
+ this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3);
+ this->insertWord(C_String("_SENTENCE_START_"), 4);
+ this->insertWord(C_String("_END_OF_CORPUS_"), 5);
+
+ char reservedWord[20];
+ for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){
+ memset(reservedWord, 0, 20);
+ sprintf(reservedWord, "_RESERVED_WORDS_%d", i);
+ this->insertWord(C_String(reservedWord), i);
+ }
+}
+
+void C_IDVocabulary::insertWord(C_String text, IndexType id)
+{
+ this->text2id.insert(make_pair(text, id));
+ this->id2text.insert(make_pair(id, text));
+
+}
+
+/**
+* Check if the word already exist in the voc,
+* if so, return the vocID of the word,
+* otherwise assign an ID to this word and insert it into the voc
+**/
+IndexType C_IDVocabulary::getId(C_String text)
+{
+ IndexType id = this->returnId(text);
+ if(id==0){
+ this->maxIdInVoc++;
+ this->insertWord(text, this->maxIdInVoc);
+ return this->maxIdInVoc;
+ }
+
+ //else, already exist
+ return id;
+}
diff --git a/Src/Shared/_IDVocabulary.h b/Src/Shared/_IDVocabulary.h
new file mode 100755
index 0000000..fa50add
--- /dev/null
+++ b/Src/Shared/_IDVocabulary.h
@@ -0,0 +1,55 @@
+#if !defined(__IDVocabulary_H__INCLUDED_)
+#define __IDVocabulary_H__INCLUDED_
+
+#include "_String.h"
+#include <string>
+#include <map>
+#include <vector>
+#include "salm_shared.h"
+
+using namespace std;
+
+
+/// Strict-weak-ordering functor over C_String for std::map.
+/// Takes const references: passing by value forced two deep copies
+/// (malloc + strcpy in C_String's copy constructor) per comparison.
+struct ltstr
+{
+	bool operator()(const C_String & s1, const C_String & s2) const
+	{
+		return s1<s2;	//C_String::operator< is a const member taking const&
+	}
+};
+
+/**
+* Vocabulary class.
+* Bidirectional mapping between words and their vocIDs.
+* vocID 0 is reserved for <unk>; IDs 1..NUMBER_OF_RESERVED_WORDS_IN_VOC
+* are reserved for special tokens (see addingReservedWords()).
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_IDVocabulary
+{
+
+public:
+	///Return the ID of word "text", if the word does not exist, add the word into the voc and return the newly assigned ID
+	IndexType getId(C_String text);
+
+	///Insert the reserved special tokens with their fixed IDs
+	void addingReservedWords();
+	///Write "word<tab>vocID" lines, sorted by word
+	void outputToFile(char * filename);
+	///ID of the null/unknown word (always 0)
+	IndexType returnNullWordID();
+	///Largest vocID assigned or loaded so far
+	IndexType returnMaxID();
+	///Look-up only; returns 0 when the word is unknown
+	IndexType returnId(C_String text);
+
+	///Number of distinct word types
+	IndexType getSize();
+	///Text for an ID; "<UNK>" when the ID is unknown
+	C_String getText(IndexType);
+
+	C_IDVocabulary();
+	///Construct and immediately load an existing vocabulary file
+	C_IDVocabulary(const char * fileName);
+	virtual ~C_IDVocabulary();
+
+private:
+	void insertWord(C_String text, IndexType id);
+	void loadFromFile(const char * fileName);
+	IndexType maxIdInVoc;				//largest ID assigned/loaded so far
+	map<C_String, IndexType, ltstr> text2id;	//word -> vocID
+	map<IndexType, C_String> id2text;		//vocID -> word
+};
+
+#endif // !defined(__IDVocabulary_H__INCLUDED_)
diff --git a/Src/Shared/_String.cpp b/Src/Shared/_String.cpp
new file mode 100755
index 0000000..75ba8e8
--- /dev/null
+++ b/Src/Shared/_String.cpp
@@ -0,0 +1,253 @@
+/**
+* _String.cpp: implementation of the C_String class.
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_String.h"
+#include "malloc.h"
+#include "string.h"
+#include "stdio.h"
+#include "stdlib.h"
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+/// Default constructor: allocates a 1-byte buffer holding the empty string.
+C_String::C_String()
+{
+	this->content = (char *) malloc(sizeof(char));
+	this->content[0]='\0';
+	this->hasContent = true;
+}
+
+/// Release the owned buffer; idempotent because hasContent guards
+/// against a double free.
+void C_String::freeContent()
+{
+	if(this->hasContent){
+		this->hasContent = false;
+		free(this->content);
+	}
+}
+
+/// Destructor: frees the character buffer.
+C_String::~C_String()
+{
+	this->freeContent();
+}
+
+/**
+* Constructor from a C string: allocates strlen(str1)+1 bytes and copies.
+**/
+C_String::C_String(char * str1)
+{
+
+	this->content = (char *) malloc(sizeof(char)*strlen(str1)+1);
+	if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n");
+		//NOTE(review): execution continues after a failed malloc; the
+		//strcpy below would then dereference NULL -- consider exiting here
+	}
+
+	strcpy(this->content, str1);
+
+	this->hasContent = true;
+}
+
+
+/// Copy constructor. hasContent is set to false first so that the
+/// freeContent() call inside copy() does not free an uninitialized pointer.
+C_String::C_String(C_String const &strObj1)
+{
+	this->hasContent = false;
+	copy(strObj1);
+}
+
+/// Concatenation constructor: builds a new string holding obj1 followed by obj2.
+C_String::C_String(const C_String & obj1, const C_String & obj2)
+{
+	//this is a constructor: no previous content exists, so do not call
+	//freeContent() here -- that read the uninitialized hasContent flag
+	//and could free a wild pointer
+	this->hasContent = false;
+
+	int len1 = strlen(obj1.content);
+	int len2 = strlen(obj2.content);
+
+	int fullLen = len1+len2;
+	this->content = (char *) malloc(sizeof(char)*len1 + sizeof(char)*len2 + 1);
+
+	if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n");
+	}
+
+	char * pointer = (char*) this->content;
+	strcpy(pointer, obj1.content);	//copy first part
+	pointer += len1;
+	strcpy(pointer, obj2.content);	//copy second part
+
+	this->content[fullLen]='\0';
+
+	this->hasContent = true;
+}
+
+/// Assignment. Guards against self-assignment: copy() frees this->content
+/// first, so "s = s" would otherwise read from a just-freed buffer.
+void C_String::operator=(const C_String &strObj2)
+{
+	if(this != &strObj2){
+		copy(strObj2);
+	}
+}
+
+/// Replace this string's content with a copy of strObj's content.
+/// NOTE(review): not self-copy safe -- freeContent() frees the buffer
+/// that is about to be read; TODO confirm callers never pass *this.
+void C_String::copy(const C_String &strObj)
+{
+	this->freeContent();
+
+	this->content = (char *) malloc(sizeof(char)*strlen(strObj.content)+1);
+	if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n");
+	}
+
+	strcpy(this->content, strObj.content);
+	this->hasContent = true;
+}
+
+/// Replace this string's content with the first copyLen characters of strObj.
+/// getCharAtPos() bounds-checks each read, so copyLen must not exceed
+/// strObj's length.
+void C_String::copy(const C_String &strObj, int copyLen)
+{
+	this->freeContent();
+
+	this->content = (char *) malloc(sizeof(char)*(copyLen+1) );
+	if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n");
+	}
+
+	for(int i=0;i<copyLen;i++){
+		this->content[i]=strObj.getCharAtPos(i);
+	}
+
+	this->content[copyLen]='\0';
+
+	this->hasContent = true;
+
+}
+
+/// Print the string to the given stream.
+/// The content is passed as an argument, not as the format string: a '%'
+/// stored in the string would otherwise be interpreted by fprintf
+/// (classic format-string bug).
+void C_String::print2stream(FILE *stream)
+{
+	fprintf(stream, "%s", content);
+}
+
+
+/// Length of the string (strlen of the buffer); 0 when no content is held.
+int C_String::length() const
+{
+	if(this->hasContent){
+		return strlen(this->content);
+	}
+
+	return 0;
+}
+
+/// Equality: true when both strings hold identical character sequences.
+bool C_String::operator==(const C_String &obj1) const
+{
+	return strcmp(this->content, obj1.content) == 0;
+}
+
+/// Inequality: true when the character sequences differ.
+bool C_String::operator!=(const C_String &obj1) const
+{
+	return strcmp(this->content, obj1.content) != 0;
+}
+
+/// Lexicographic less-than via strcmp; used by the ltstr map comparator.
+bool C_String::operator<(const C_String &obj1) const
+{
+	return strcmp(this->content, obj1.content) < 0;
+}
+
+/// Expose the internal NUL-terminated buffer; no copy is made, so the
+/// returned pointer is invalidated by any mutating operation on this object.
+char * C_String::toString() const
+{
+	return this->content;
+}
+
+/// Reset to the empty string: free the old buffer, allocate a 1-byte one.
+void C_String::clear()
+{
+	this->freeContent();
+
+	this->content = (char *) malloc(sizeof(char));
+	this->content[0]='\0';
+	this->hasContent = true;
+}
+
+
+/// Bounds-checked character access; terminates the process when pos is
+/// past the end. NOTE(review): exit(0) reports success to the shell even
+/// though this is an error path.
+char C_String::getCharAtPos(int pos) const
+{
+	if(pos>=this->length()){
+		fprintf(stderr,"Can not get char at pos %d, out of bound! Exit.\n", pos);
+		exit(0);
+	}
+
+	return this->content[pos];
+}
+
+
+/**
+* Append obj's content to this string: allocate a buffer large enough for
+* both parts, copy them in, then swap the new buffer for the old one.
+**/
+void C_String::appending(const C_String &obj)
+{
+	int len1 = 0;
+
+	if(this->hasContent){
+		len1 = strlen(this->content);
+	}
+
+	int len2 = strlen(obj.content);
+
+	int fullLen = len1+len2;
+
+	char * newContent = (char *) malloc(sizeof(char)*fullLen + 1);
+
+	if(newContent==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n");
+	}
+
+	char * pointer = newContent;
+	if(this->hasContent){
+		strcpy(pointer, content); //copy first part
+		pointer += len1;
+	}
+
+	strcpy(pointer, obj.content); //copy second part
+	newContent[fullLen]='\0';
+
+	//free old content
+	this->freeContent();
+
+	//point to new content
+	this->content = newContent;
+
+	this->hasContent = true;
+}
+
+/**
+* Append a single character to this string.
+**/
+void C_String::appending(const char nextChar)
+{
+	int len1 = 0;
+
+	if(this->hasContent){
+		len1 = strlen(this->content);
+	}
+
+	int fullLen = len1+1;
+
+	char * newContent = (char *) malloc(sizeof(char)*fullLen + 1);
+
+	if(newContent==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n");
+	}
+
+	//only copy the old content when one is actually held: the sibling
+	//appending(const C_String&) guards this strcpy, but this overload
+	//read the stale "content" pointer unconditionally
+	if(this->hasContent){
+		strcpy(newContent, content); //copy first part
+	}
+
+	newContent[len1]=nextChar; //copy second part
+	newContent[fullLen]='\0';
+
+	//free old content
+	this->freeContent();
+
+	//point to new content
+	this->content = newContent;
+
+	this->hasContent = true;
+}
diff --git a/Src/Shared/_String.h b/Src/Shared/_String.h
new file mode 100755
index 0000000..d8f633d
--- /dev/null
+++ b/Src/Shared/_String.h
@@ -0,0 +1,45 @@
+#if !defined(__STRING_H__INCLUDED_)
+#define __STRING_H__INCLUDED_
+
+/**
+* Definition of class C_String
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+#include "stdio.h"
+
+/// Minimal malloc/free based string class used throughout SALM.
+/// Owns its NUL-terminated character buffer ("content"); the "hasContent"
+/// flag guards against double-free.
+class C_String
+{
+public:
+
+	char getCharAtPos(int) const;	//bounds-checked access; exits on out-of-range
+	void clear();			//reset to the empty string
+	char * toString() const;	//raw pointer to the internal buffer (not a copy)
+	int length() const;		//strlen of the content; 0 when empty
+	void print2stream(FILE *);
+
+	C_String(const C_String & obj1, const C_String & obj2);	//concatenation constructor
+	C_String(C_String const&);
+	C_String(char *);
+	C_String();
+
+	bool operator==(const C_String &) const;
+	bool operator!=(const C_String &) const;
+	bool operator<(const C_String &) const;	//strcmp ordering; used by map comparators
+	void operator=(const C_String &strObj2);
+
+	void appending(const C_String & obj);
+	void appending(const char nextChar);
+
+	virtual ~C_String();
+
+private:
+	void freeContent();
+	void copy(const C_String &);
+	void copy(const C_String &strObj, int copyLen);
+
+	bool hasContent;	//true while "content" points to an owned buffer
+	char * content;		//malloc'ed NUL-terminated buffer
+};
+
+#endif // !defined(__STRING_H__INCLUDED_)
diff --git a/Src/Shared/salm_shared.h b/Src/Shared/salm_shared.h
new file mode 100755
index 0000000..2c0e186
--- /dev/null
+++ b/Src/Shared/salm_shared.h
@@ -0,0 +1,36 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+#if !defined(_SA_common_h)
+#define _SA_common_h
+
+#include "math.h"
+
+typedef unsigned int IndexType;
+typedef unsigned int TextLenType;
+typedef unsigned short int SearchLenType;
+
+//constants
+const int SIZE_ONE_READ = 16384; //when loading the data, each I/O read in SIZE_ONE_READ data points
+const int MAX_TOKEN_LEN = 1024; //length of the longest word
+
+const int NUMBER_OF_RESERVED_WORDS_IN_VOC = 100;
+
+/// for language modeling
+const double SALM_PROB_UNK = 0.00000000023283064365386962890625; // 1/4G
+const double SALM_LOG_PROB_UNK = log(SALM_PROB_UNK);
+const double SALM_LOG_0 = -20;
+
+/**
+* \ingroup scan
+* Bookkeeping element used while scanning the corpus for n-grams:
+* the word ID at this n-gram position, the frequency threshold an n-gram
+* must reach to be output, and the frequency accumulated so far.
+**/
+typedef struct s_nGramScanningInfoElement
+{
+	IndexType vocId;
+	TextLenType freqThreshForOutput;
+	TextLenType freqSoFar;
+}S_nGramScanningInfoElement;
+
+#endif
+
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp
new file mode 100755
index 0000000..ab2915d
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp
@@ -0,0 +1,63 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "time.h"
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+/**
+* A simple example of using the C_SuffixArrayLanguageModel class to calculate the LM prob of input sentences.
+* Reads sentences from stdin (one per line) and prints "LogProb=..." for each.
+*
+* Revision $Rev: 3816 $
+* Last Modified $LastChangedDate: 2007-07-06 14:36:11 -0400 (Fri, 06 Jul 2007) $
+**/
+int main(int argc, char * argv[]){
+	if(argc<2){
+		cerr<<"\nUsage:\n\t"<<argv[0]<<" configurationFileName < sentences\n";
+		exit(-1);	//a missing argument is a failure; exit(0) reported success
+	}
+
+	C_SuffixArrayLanguageModel salm(argv[1]);
+
+	long ltime1, ltime2;
+	time( &ltime1 );
+
+	string aWord;
+	char aLine[10240];
+	while(!cin.eof()){
+		cin.getline(aLine, 10240, '\n');
+
+		if(strlen(aLine)>0){
+			istringstream inputLine(aLine, istringstream::in);
+
+			//start each sentence from the <s> state and accumulate the
+			//log-prob word by word, threading the LM state through
+			LMState lmState = salm.beginOfSentenceState();
+
+			LMState nextState;
+			double logProb = 0;
+
+			while(! inputLine.eof()){
+				inputLine>>aWord;
+				if(aWord.length()>0){
+					IndexType vocId = salm.returnVocId(C_String((char *) aWord.c_str()));
+					logProb+=salm.logProb(lmState, vocId, nextState);
+					lmState = nextState;
+				}
+				aWord="";
+			}
+
+			logProb+=salm.logProbEnd(lmState);	//account for the </s> transition
+			cout<<"LogProb="<<logProb<<endl;
+
+		}
+
+		aLine[0]=0;
+	}
+
+	time( &ltime2 );
+	cerr<<"\n"<<ltime2-ltime1<<" seconds spent."<<endl;
+
+	return 0;	//success; the previous "return 1" signalled an error to the shell
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~
new file mode 100755
index 0000000..95e7993
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~
@@ -0,0 +1,62 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "time.h"
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+
+using namespace std;
+
+/**
+* A simple example of using the C_SuffixArrayLanguageModel class to calculate the LM prob of input sentences
+*
+* Revision $Rev: 3816 $
+* Last Modified $LastChangedDate: 2007-07-06 14:36:11 -0400 (Fri, 06 Jul 2007) $
+**/
+int main(int argc, char * argv[]){
+ if(argc<2){
+ cerr<<"\nUsage:\n\t"<<argv[0]<<" configurationFileName < sentences\n";
+ exit(0);
+ }
+
+ C_SuffixArrayLanguageModel salm(argv[1]);
+
+ long ltime1, ltime2;
+ time( &ltime1 );
+
+ string aWord;
+ char aLine[10240];
+ while(!cin.eof()){
+ cin.getline(aLine, 10240, '\n');
+
+ if(strlen(aLine)>0){
+ istringstream inputLine(aLine, istringstream::in);
+ LMState lmState = salm.beginOfSentenceState();
+
+ LMState nextState;
+ double logProb = 0;
+
+ while(! inputLine.eof()){
+ inputLine>>aWord;
+ if(aWord.length()>0){
+ IndexType vocId = salm.returnVocId(C_String((char *) aWord.c_str()));
+ logProb+=salm.logProb(lmState, vocId, nextState);
+ lmState = nextState;
+ }
+ aWord="";
+ }
+
+ logProb+=salm.logProbEnd(lmState);
+ cout<<"LogProb="<<logProb<<endl;
+
+ }
+
+ aLine[0]=0;
+ }
+
+ time( &ltime2 );
+ cerr<<"\n"<<ltime2-ltime1<<" seconds spent."<<endl;
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt
new file mode 100755
index 0000000..17cd5a8
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt
@@ -0,0 +1,5 @@
+June 27, 2007
+
+Working branch of applying KN smoothing in LM.
+Not finished yet.
+Do not distribute! \ No newline at end of file
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp
new file mode 100755
index 0000000..583b222
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp
@@ -0,0 +1,1113 @@
+/**
+* Revision $Rev: 3665 $
+* $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <fstream>
+#include <set>
+
+#include "math.h"
+
+using namespace std;
+
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
+{
+
+}
+
+C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
+{
+
+}
+
+
+/**
+* Construct the suffix array language model object
+* Take the configuration filename as the parameter for the constructor
+*
+* The configuration file is of the following format for each line:
+*
+* Keyword<tab>value
+* <p>
+* Note: keywords are all case sensitive.
+* <ul>
+* <li> <b>CORPUS</b> filename of the corpus for LM training. It should be the same as used in IndexSA
+* <li> <b>N</b> Highest order of n considered for n-gram LM. Default value = <i>5</i>
+* <li> <b>SMOOTHING_STRATEGY</b> Smoothing strategy.
+* <ul>
+* <li> <i>k</i> : default value. Modified Kneser-Ney Smoothing @see <a href=http://acl.ldc.upenn.edu/P/P96/P96-1041.pdf> An Empirical Study of Smoothing Techniques for Language Modeling </a>
+* <li> <i>g</i> : Good-Turing discounting @see <a href=http://l2r.cs.uiuc.edu/~danr/Teaching/CS598-05/Papers/Gale-Sampson-smoothgoodturing.pdf> Good Turing without Tears</a>
+* </ul>
+* <li> <b>INTERPOLATION_STRATEGY</b> : Interpolation strategy
+* <ul>
+* <li> <i>e</i> : Probability of the next word predicted by histories of different orders are equally interpolated
+* <li> <i>m</i> : Use the maximum conditional probability from all different order of history as the probability for the next word
+* <li> <i>i</i> : Use deleted interpolation based on heuristics developed by IBM
+* </ul>
+* <li> <b>MAX_FREQ_DISC</b>: <br>
+* <i>default</i>=50<br>
+* If the frequency of an n-gram is lower than this value and SMOOTHING is set, discounting will be applied. <br>
+* If this value is set to 0 or negative values, smoothing/discounting will not be used. <br>
+* <li> <b>PURGE_CACHE</b>: Check entries in the cache after "PURGE_CACHE" number of sentences have been processed. Default = 100.
+* <li> <b>FRESH_TIME</b>: Entries in the cache that are not used since "current time - FRESH_TIME" will be purged from the cache. Measured in seconds of wall clock time.
+** </ul>
+* @param cfgFileName Configuration file name
+* (Note: the training corpus filename used by IndexSA is not a constructor parameter; it is read from the CORPUS entry of the configuration file.)
+**/
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
+{
+
+ fstream cfgFile;
+ cfgFile.open(cfgFileName,ios::in);
+
+ if(!cfgFile){
+ fprintf(stderr,"Configuration file %s does not exist! quit!!\n", cfgFileName);
+ exit(-1);
+ }
+
+ //-----------------------------------------------------------------------------
+ //reading parameters
+ char paraName[1024];
+ char corpusFileNameStem[1024];
+
+ corpusFileNameStem[0]='\0';
+
+ //default values for member variables
+ this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob
+ this->smoothingStrategy = 'k'; //default smoothing strategy: modified Kneser-Ney smoothing
+ this->maxFreqForDiscounting = 50; //default, freq that is lower than this value will not be applied with discounting
+ this->maxN= 5; // default value; consider up to 5 words
+
+ this->numberOfSentSeenToPurgeCache = 100; //default value, purge cache after processing 100 sentences
+ this->freshTime = 50; //entries in the cache that are older than 50 seconds are subject to purging
+ this->sentenceProcessedSoFar = 0;
+ this->typeOfBigrams = 0;
+
+ while(!cfgFile.eof()){
+ cfgFile>>paraName;
+
+ if(strcmp(paraName,"CORPUS")==0){
+ cfgFile>>corpusFileNameStem;
+ }
+ else if(strcmp(paraName, "SMOOTHING_STRATEGY")==0){
+ cfgFile>>this->smoothingStrategy;
+ }
+ else if(strcmp(paraName,"N")==0){
+ cfgFile>>this->maxN;
+ }
+ else if(strcmp(paraName,"MAX_FREQ_DISC")==0){
+ cfgFile>>this->maxFreqForDiscounting;
+ }
+ else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){
+ cfgFile>>this->interpolationStrategy;
+ }
+ else if(strcmp(paraName,"FRESH_TIME")==0){
+ cfgFile>>this->freshTime;
+ }
+ else if(strcmp(paraName, "PURGE_CACHE")==0){
+ cfgFile>>this->numberOfSentSeenToPurgeCache;
+ }
+
+ paraName[0]=0;
+
+ }
+
+
+ if(strlen(corpusFileNameStem)==0){
+ cerr<<"CORPUS not specified in the configuration file! Quit!"<<endl;
+ exit(-1);
+ }
+
+
+ this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class
+ //corpusName, with vocabulary, no offset,
+
+
+ this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+
+ //initialize the scanning list
+ for(int i=0;i<this->maxN;i++){
+ this->nGramScanningList[i].freqSoFar=0;
+ this->nGramScanningList[i].vocId = 0;
+ this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ if(this->maxFreqForDiscounting<=0){
+ this->applyDiscounting = false;
+ }
+ else{
+ if(this->maxFreqForDiscounting<3){
+ cerr<<"MAX_FREQ_DISC has to be at least 3!"<<endl;
+ exit(-1);
+ }
+
+ this->applyDiscounting = true;
+ this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map
+ }
+
+}
+
+/**
+* Set strategy to interpolate the conditional probabilities of next word given different order of histories
+* 'e' for equal weighted interpolation of unigram, bigram, trigram... probabilities
+* 'm' for using the maximum probability from all histories as the value of P(next word | history)
+* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable
+**/
+void C_SuffixArrayLanguageModel::setParam_interpolationStrategy(char interpolationStrategy)
+{
+ this->interpolationStrategy = interpolationStrategy;
+}
+
+/**
+* Set the value for parameter :numberOfSentSeenToPurgeCache
+* LM will purge the entries in the cache that have not been used in 'freshTime'
+**/
+void C_SuffixArrayLanguageModel::setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache)
+{
+ this->numberOfSentSeenToPurgeCache = numberOfSentSeenToPurgeCache;
+}
+
+/**
+* Set the value for parameter: freshTime
+* LM will purge the entries in the cache that have not been used in 'freshTime'
+**/
+void C_SuffixArrayLanguageModel::setParam_freshTime(long freshTime)
+{
+ this->freshTime = freshTime;
+}
+
+/**
+* Similar to the function in C_SuffixArrayScanningBase
+* Scan the corpus to obtain count of counts information
+* and construct the discounting using Good-Turing smoothing
+* Also, estimate the Y, D1, D2, D3+ values as needed for the modified Kneser-Ney smoothing
+**/
+void C_SuffixArrayLanguageModel::constructDiscountingMap()
+{
+ unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);
+ this->typeOfBigrams = 0;
+
+ if(countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){
+ countOfCountsTable[c]=0;
+ }
+
+
+ int i,j;
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+ if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match
+
+ this->nGramScanningList[i].freqSoFar++;
+ }
+ else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+
+
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){ //perform actions depends on actionType
+
+ if(j==1){ //a new bigram type, this information is important for KN-smoothing
+ this->typeOfBigrams++;
+ }
+
+
+ freqSoFar = this->nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+
+ //finished output, now clear the list from point of i
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ this->nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ this->nGramScanningList[j].freqSoFar = 1;
+ }
+
+ this->nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+ quit=true; //at i+1 gram, already not match, no need to check for longer
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+ stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+ if(this->nGramScanningList[i].vocId==0){ //invalide word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+
+ if(i==1){
+ this->typeOfBigrams++;
+ }
+
+ freqSoFar = this->nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+ }
+
+ //now, use Good-Turing discounting to create frequency mapping
+ //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted
+ this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting);
+
+ for(i=0;i<this->maxN;i++){
+ //for (i+1)-gram
+
+ unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+ double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;
+
+ for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq
+ //for all (freq+1) ngrams
+ if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists
+ discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
+ }
+ else{
+ discountingMapForThisN[freq] = -1;
+ }
+ }
+
+ discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency
+ }
+
+
+ //estimate the Y, D1, D2 and D3+ values for each order of n.
+ //these values will be used for KN-smoothing to estimate the gamma, the discounting factor
+ this->Y = (double *) malloc(sizeof(double) * this->maxN);
+ this->D1 = (double *) malloc(sizeof(double) * this->maxN);
+ this->D2 = (double *) malloc(sizeof(double) * this->maxN);
+ this->D3plus = (double *) malloc(sizeof(double) * this->maxN);
+
+ for(i=0;i<this->maxN;i++){
+ unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+ double n1 = ccTableForThisN[0]; //number of n-gram types that have freq equals 1
+ double n2 = ccTableForThisN[1]; //number of n-gram types that have freq equals 2;
+ double n3 = ccTableForThisN[2]; //number of n-gram types that have freq equals 3;
+ double n4 = ccTableForThisN[3]; //number of n-gram types that have freq equals 4;
+
+ this->Y[i] = n1/(n1+2*n2); //for (i+1)-gram
+ this->D1[i] = 1-2*Y[i]*n2/n1;
+ this->D2[i] = 2-3*Y[i]*n3/n2;
+ this->D3plus[i] = 3 - 4*Y[i]*n4/n3;
+ }
+
+ free(countOfCountsTable);
+}
+
+///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1]
+///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord
+///only need to get freq(w_n | history) of different history
+///return in freq table, freq(history+Wn, history) for all the matched n
+///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history
+/// 3-gram freq, freq of 3-gram history
+///freqTable should have length of 2*n
+///return the longest match with this updated n-gram
+void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ vector<IndexType> nGram;
+
+ if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk>
+ if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram
+ currentMatchStart++;
+ currentMatchLen--;
+ }
+
+ for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
+ nGram.push_back(this->corpus_list[pos]);
+ }
+ }
+
+ nGram.push_back(nextWord);
+
+ int sentLen = nGram.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
+
+ int startPosForNgram;
+ int startPosForLongestMatchingWithNextWord;
+ int cellIndexForLongestMatchingWithNextWord;
+
+ bool stillMatched = true;
+ bool atLeastOneMatched = false;
+
+ int indexForNgram;
+
+ unsigned int totalOccurrences;
+ unsigned int totalOccurrencesOfHistory;
+
+ //for unigram
+ indexForNgram = sentLen - 1;
+ if(table[indexForNgram].found){
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ if(this->smoothingStrategy=='g'){ //if use Good-Turing for discounting
+ freqTable[0] = this->discountFreq_GT(1, totalOccurrences);
+ }
+ else{
+ freqTable[0] = totalOccurrences;
+ }
+
+ freqTable[1] = this->corpusSize;
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = sentLen-1;
+ atLeastOneMatched = true;
+ }
+ else{
+ stillMatched = false;
+ }
+
+ int n=2; //considering 2-gram and longer n-gram now
+ startPosForNgram = sentLen - 2;
+ while((stillMatched)&&(startPosForNgram>=0)){
+
+ indexForNgram = (n-1) * sentLen + startPosForNgram;
+ int indexForHistory = (n-2) * sentLen + startPosForNgram;
+
+ if(table[indexForNgram].found){
+
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
+
+
+ if(this->applyDiscounting){
+ freqTable[2*n-2] = this->discountFreq_GT(n, totalOccurrences);
+ }
+ else{
+ freqTable[2*n-2] = (double)totalOccurrences;
+ }
+
+ freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history
+
+ if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = startPosForNgram;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ startPosForNgram--;
+ n++;
+ }
+
+ if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord'
+ updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
+ updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
+ }
+ else{
+ updatedMatchingStart = (TextLenType) -1;
+ updatedMatchingLen = 0;
+ }
+
+ free(table);
+
+}
+
+
+void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ vector<IndexType> nGram;
+
+ if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk>
+ if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram
+ currentMatchStart++;
+ currentMatchLen--;
+ }
+
+ for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
+ nGram.push_back(this->corpus_list[pos]);
+ }
+ }
+
+ nGram.push_back(nextWord);
+
+ int sentLen = nGram.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
+
+ int startPosForNgram;
+ int startPosForLongestMatchingWithNextWord;
+ int cellIndexForLongestMatchingWithNextWord;
+
+ bool stillMatched = true;
+ bool atLeastOneMatched = false;
+
+ int indexForNgram;
+
+ unsigned int totalOccurrences;
+ unsigned int totalOccurrencesOfHistory;
+
+ //for unigram
+ indexForNgram = sentLen - 1;
+ if(table[indexForNgram].found){
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+
+ freqTable[0] = totalOccurrences;
+ freqTable[1] = this->corpusSize;
+
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = sentLen-1;
+ atLeastOneMatched = true;
+ }
+ else{
+ stillMatched = false;
+ }
+
+ int n=2; //considering 2-gram and longer n-gram now for token freq
+ startPosForNgram = sentLen - n;
+ while((stillMatched)&&(startPosForNgram>=0)){
+
+ indexForNgram = (n-1) * sentLen + startPosForNgram;
+ int indexForHistory = (n-2) * sentLen + startPosForNgram;
+
+ if(table[indexForNgram].found){
+
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
+
+
+ freqTable[2*n-2] = (double)totalOccurrences;
+ freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history
+
+ if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = startPosForNgram;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ startPosForNgram--;
+ n++;
+ }
+
+ if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord'
+ updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
+ updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
+ }
+ else{
+ updatedMatchingStart = (TextLenType) -1;
+ updatedMatchingLen = 0;
+ }
+
+
+ //estimate the context type information which will be used for KN-smoothing
+ for(n=2;n<=sentLen;n++){
+ startPosForNgram = sentLen - n;
+ TextLenType w_in2_i1_startPos_in_SA = 0;
+ TextLenType w_in2_i1_endPos_in_SA = 0;
+
+ if(n>2){
+ int indexForW_in2_i1 = (n-3) * sentLen + startPosForNgram + 1; //the location information for w_{i-n+2}^{i-1} of length n-2
+ w_in2_i1_startPos_in_SA = table[indexForW_in2_i1].startPosInSA;
+ w_in2_i1_endPos_in_SA = table[indexForW_in2_i1].endingPosInSA;
+ }
+
+ int indexForW_in1_i1 = (n-2) * sentLen + startPosForNgram; //the location information of w_{i-n+1}^{i-1} of length n-1
+
+ this->scanCorpusForContextTypeInfo(n, nextWord,
+ w_in2_i1_startPos_in_SA, w_in2_i1_endPos_in_SA,
+ table[indexForW_in1_i1].startPosInSA, table[indexForW_in1_i1].endingPosInSA,
+ contextTypeInfo[n-1]);
+ }
+
+ free(table);
+
+
+}
+
+///given observedFreq of n-gram, return discounted freq using Good-Turing smoothing
+double C_SuffixArrayLanguageModel::discountFreq_GT(int n, unsigned int observedFreq)
+{
+ if(n>=this->maxN){ //do not discount
+ return (double) observedFreq;
+ }
+
+ if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq
+ return (double) observedFreq;
+ }
+
+ //else, check the discount map
+ double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1];
+
+ if(discountedFreq>0){
+ return discountedFreq;
+ }
+
+ //else, no discounting
+ return (double) observedFreq;
+}
+
+
+///Start a new sentence now, clear up the sentence LM state
+///Increase the count of 'sentenceProcessedSoFar'
+///If LM has processed 'numberOfSentSeenToPurgeCache' sentences
+///it is time to check if old entries in the cache should be cleaned
+LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
+{
+ long currentTime;
+ time(&currentTime);
+
+ this->resetLmStates();
+ this->initialLmState();
+
+ this->sentenceProcessedSoFar++;
+
+ if(this->sentenceProcessedSoFar==this->numberOfSentSeenToPurgeCache){
+ //purge the cache
+ this->purgeCache(currentTime-this->freshTime);
+
+ this->sentenceProcessedSoFar = 0;
+ }
+
+ return 0;
+}
+
+void C_SuffixArrayLanguageModel::initialLmState()
+{
+ //add sentence start
+ S_LMStateInfo sentStartNode;
+ sentStartNode.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s>
+ sentStartNode.len = 1;
+
+ this->allLMStates.push_back(sentStartNode);
+ this->lmStateInfo2Id.insert(make_pair(sentStartNode, 0));
+}
+
+void C_SuffixArrayLanguageModel::resetLmStates()
+{
+ this->buffer.clear();
+ this->allLMStates.clear();
+ this->lmStateInfo2Id.clear();
+}
+
+/**
+* Purge entries in the cache that are not visited after "lastVisitedTime"
+* @param lastVisitedTime Entries in the cache that are older than 'lastVisitedTime' parameter will be purged
+**/
+void C_SuffixArrayLanguageModel::purgeCache(long lastVisitedTime)
+{
+ //cerr<<this->cached_sa_access.size()<<" entries in cache, purged to ";
+
+ map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter1,iter2;
+
+ iter1 = this->cached_sa_access.begin();
+
+ while(iter1!=this->cached_sa_access.end()){
+ iter2=iter1;
+ iter2++;
+
+ if(iter1->second.lastTimedUsed<lastVisitedTime){
+ this->cached_sa_access.erase(iter1);
+ }
+
+ iter1=iter2;
+ }
+ //cerr<<this->cached_sa_access.size()<<" entries"<<endl;
+}
+
+/**
+* Given the current history (as represented by the 'lmState'),
+* calculate the log prob of nextWord given this history, P(nextWord|history)
+* and return the updated language model state with next word appended to the history
+* @param lmState Current language model state
+* @param nextWord The vocId of the next word (the word to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
+{
+
+ //first check if we have already seen this before
+ map< pair<LMState, IndexType>, S_BufferedLmInfo>::iterator iterBuffer;
+ iterBuffer = this->buffer.find( make_pair( lmState, nextWord) );
+
+ if(iterBuffer==this->buffer.end()){ //we haven't seen this lmState+word yet
+ //search for it in the corpus
+ S_LMStateInfo lmStateInfo = this->allLMStates[lmState];
+ TextLenType updatedMatchingStart;
+ unsigned char updatedMatchingLen;
+
+ double logProb = this->logProbOfNgramFromCorpusInfo(lmStateInfo.posInCorpus, lmStateInfo.len, nextWord, updatedMatchingStart, updatedMatchingLen);
+
+
+ S_LMStateInfo updatedLmStateInfo;
+ updatedLmStateInfo.posInCorpus = updatedMatchingStart;
+ updatedLmStateInfo.len = updatedMatchingLen;
+
+ int updatedLmStateId;
+ map<S_LMStateInfo, int, lt_lmStateInfo>::iterator iterLmStateInfo2Id;
+ iterLmStateInfo2Id = this->lmStateInfo2Id.find(updatedLmStateInfo);
+ if(iterLmStateInfo2Id==this->lmStateInfo2Id.end()){ //this updated lm state does not exist yet
+ this->allLMStates.push_back(updatedLmStateInfo);
+ updatedLmStateId = this->allLMStates.size()-1;
+ this->lmStateInfo2Id.insert(make_pair(updatedLmStateInfo, updatedLmStateId));
+ }
+ else{
+ updatedLmStateId = iterLmStateInfo2Id->second;
+ }
+
+ //buffer this
+ S_BufferedLmInfo bufferedLmInfo;
+ bufferedLmInfo.logProb = logProb;
+ bufferedLmInfo.nextState = updatedLmStateId;
+
+ this->buffer.insert(make_pair( make_pair(lmState, nextWord), bufferedLmInfo));
+
+ //updated next state
+ nextState = updatedLmStateId;
+
+ return logProb;
+ }
+
+ nextState = iterBuffer->second.nextState;
+
+ return iterBuffer->second.logProb;
+}
+
+
+/**
+* Given the history as lmState and append a phrase as a vector of IndexType,
+* calculate the LM prob and update the lm state
+* @param lmState Current language model state
+* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
+{
+ double logProb = 0;
+ for(int i=0;i<phrase.size();i++){
+ logProb+=this->logProb(lmState, phrase[i], nextState);
+ lmState = nextState;
+ }
+
+ return logProb;
+}
+
+/**
+* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s>
+**/
+double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
+{
+ LMState dummyNextState;
+ return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState);
+}
+
+/**
+* Extend the current matched n-gram with next word, calculate the prob and update the updated range
+* the n-gram is represented by its position in the suffix array and the length
+* @param currentMatchStart Starting position of the current matched n-gram in corpus
+* @param currentMatchLen Length of the matched n-gram
+* @param nextWord Vocabulary ID of the next word (the word to be predicted)
+* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus
+* @param &updatedMatchingLen The length of the extended n-gram
+**/
+double C_SuffixArrayLanguageModel::logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ long currentTime;
+ time(&currentTime);
+
+ double logProb;
+
+ //first check if information is already in cache
+ S_CachedSA_Access_Key accessKey;
+ accessKey.currentMatchStart = currentMatchStart;
+ accessKey.currentMatchLen = currentMatchLen;
+ accessKey.nextWord = nextWord;
+
+ map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter_cached_sa_access;
+
+ iter_cached_sa_access = this->cached_sa_access.find(accessKey);
+
+ if(iter_cached_sa_access==this->cached_sa_access.end()){ //information not in cache yet
+ double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN));
+ memset(freqTable, 0, 2*this->maxN*sizeof(double));
+
+ S_ContextTypeInfo * contextTypeInfo = (S_ContextTypeInfo *) malloc(sizeof(S_ContextTypeInfo)*this->maxN);
+
+ switch(this->smoothingStrategy){
+ case 'k': //for Modified Kneser-Ney smoothing
+
+ this->calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, contextTypeInfo, updatedMatchingStart, updatedMatchingLen);
+ logProb = this->calcLogProb_kneserNeySmoothing(freqTable, contextTypeInfo);
+ break;
+ default: //all other cases including 'g' (Good-Turing smoothing)
+ this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen);
+ logProb = this->calcLogProb(freqTable);
+ }
+
+ free(freqTable);
+ free(contextTypeInfo);
+
+ //insert the info into the cache
+ S_Cached_SA_Access_Info accessInfo;
+ accessInfo.updatedMatchingStart = updatedMatchingStart;
+ accessInfo.updatedMatchingLen = updatedMatchingLen;
+ accessInfo.logProb = logProb;
+ accessInfo.lastTimedUsed = currentTime;
+
+ this->cached_sa_access.insert(make_pair(accessKey, accessInfo));
+
+ return logProb;
+ }
+
+ //otherwise, already exist in the cache, just update the last touched time
+ updatedMatchingStart = iter_cached_sa_access->second.updatedMatchingStart;
+ updatedMatchingLen = iter_cached_sa_access->second.updatedMatchingLen;
+ logProb = iter_cached_sa_access->second.logProb;
+
+ return logProb;
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
+{
+ switch(this->interpolationStrategy){
+ case 'e':
+ return this->calcLogProb_equalWeightedInterpolation(freq);
+ break;
+ case 'i':
+ return this->calcLogProb_ibmHeuristicInterpolation(freq);
+ break;
+ case 'm':
+ return this->calcLogProb_maxProbInterpolation(freq);
+ break;
+ default:
+ cerr<<"Unknown interpolation strategy!\n";
+ exit(0);
+ }
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
+{
+ double prob = 0.0;
+
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ prob+=freq[2*i]/freq[2*i+1];
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(prob/(double)this->maxN);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
+{
+ double prob = 0.0;
+ if(freq[0]==0){ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+
+ double remainingWeightSum = 1.0;
+
+ //find the first non-zero match
+ int i = this->maxN - 1;
+
+ while(freq[2*i]==0){ //will stop for sure because freq[0]!=0
+ i--;
+ }
+
+ for(int j=i;j>=0;j--){
+ //for (j+1)-gram
+ double historyFreq = freq[2*j+1];
+ double logHistoryFreq = log(historyFreq);
+ if(logHistoryFreq>1){
+ logHistoryFreq = 1.0; //cap it to 1
+ }
+
+ double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history
+ double adjustedWeights = remainingWeightSum * reliability;
+
+ prob+=adjustedWeights * freq[2*i]/freq[2*i+1];
+
+ remainingWeightSum -= adjustedWeights;
+ }
+
+ return log(prob);
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
+{
+ double maxProb = 0.0;
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ double prob=freq[2*i]/freq[2*i+1];
+
+ if(prob>maxProb){
+ maxProb = prob;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(maxProb);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
+
+/**
+* Follow the implementation described in page 23 of Chen & Goodman tech report (section 4.1.6 and 4.1.7)
+* Use notation described in James 2000 pp3 for MODKN-COUNT
+**/
+double C_SuffixArrayLanguageModel::calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq)
+{
+ double prob = 0.0;
+ int i;
+
+ if(freq[0]>0){
+ contextTypeFreq[i].
+ }
+
+ //unknown word
+ return SALM_LOG_PROB_UNK;
+}
+
+
+IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
+{
+ return this->voc->returnId(aWord);
+}
+
+
+/**
+* Scan corpus to collect important context-type information needed for KN-smoothing
+* Knowing where n-gram w_(i-n+2)^(i-1) occurs, scan corpus for N_{1+}(dot w_{i-n+2}^i)
+* and N_{1+}(dot w_{i-n+2}^{i-1} dot)
+* Also, collect type freq of n-grams w_{i-n+1}^{i-1} that occur exactly 1, 2 and 3+ times
+* to estimate the discounting factor gamma
+*
+* @see Chen & Goodman 1998 page 19-20 for detailed description
+*
+* @param n order of n-gram
+* @param w_in1 VocId of w<sub>i-n+1</sub>
+* @param w_i VocId of w<sub>i</sub>, the next word to be predicted
+* @param leftBoundaryOfSaRangeFor_w_in2_i1
+* @param rightBoundaryOfSaRangeFor_w_in2_i1 [leftBoundaryOfSaRangeFor_w_in2_i1, rightBoundaryOfSaRangeFor_w_in2_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+2</sub><sup>i-1</sup>
+* @param leftBoundaryOfSaRangeFor_w_in1
+* @param rigthBoundaryOfSaRangeFor_w_i1 [leftBoundaryOfSaRangeFor_w_in1, rigthBoundaryOfSaRangeFor_w_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+1</sub><sup>i-1</sup>
+* @return S_ContextTypeInfo containing the context type information
+**/
+void C_SuffixArrayLanguageModel::scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result)
+{
+
+	TextLenType i;
+	TextLenType posInCorpus;
+	IndexType nextWordInCorpus;
+	int n1 = n-1;	//this value will be used frequently here
+
+	//first scan the corpus for all the word types that follow w_{i-n+1}^{i-1}
+	//to collect the N1(w_in1^i1 dot), N2 and N3+ info needed.
+	//Inside the SA range of a fixed history, equal following words are
+	//adjacent, so each maximal run of an identical 'next word' is one type.
+	result.N1_w_in1_i1_dot = 0;
+	result.N2_w_in1_i1_dot = 0;
+	result.N3plus_w_in1_i1_dot = 0;
+
+	unsigned int freqOfCurrentType = 0;	//freq of 'dot' with current type
+	IndexType currentNextWordType = 0;
+	bool hasCurrentType = false;	//bug fix: the original started the count at -1
+	//and flushed unconditionally on the first type change, so the first type of
+	//the range (and an empty range) spuriously incremented N3plus because a
+	//count of 0 fell through to the '>=3' else branch
+
+	for(i=leftBoundaryOfSaRangeFor_w_in1_i1;i<=rigthBoundaryOfSaRangeFor_w_in1_i1;i++){
+		posInCorpus = this->suffix_list[i] + n1;
+		//suffix_list[i] is the position of w_{i-n+1} in the corpus
+		//suffix_list[i]+n-1 is the position of the word (the dot in the equation) that follows w_{i-n+1}^{i-1}
+		nextWordInCorpus = this->corpus_list[posInCorpus];
+
+		if(hasCurrentType && (nextWordInCorpus==currentNextWordType)){
+			freqOfCurrentType++;	//still inside the current type's run
+		}
+		else{	//a new type starts here; classify the finished one first
+			if(hasCurrentType){
+				if(freqOfCurrentType==1){
+					result.N1_w_in1_i1_dot++;
+				}
+				else if(freqOfCurrentType==2){
+					result.N2_w_in1_i1_dot++;
+				}
+				else{	//freq of this type is >=3
+					result.N3plus_w_in1_i1_dot++;
+				}
+			}
+
+			currentNextWordType = nextWordInCorpus;
+			freqOfCurrentType = 1;
+			hasCurrentType = true;
+		}
+	}
+
+	//for the last type in the range (nothing to flush for an empty range)
+	if(hasCurrentType){
+		if(freqOfCurrentType==1){
+			result.N1_w_in1_i1_dot++;
+		}
+		else if(freqOfCurrentType==2){
+			result.N2_w_in1_i1_dot++;
+		}
+		else{	//freq of this type is >=3
+			result.N3plus_w_in1_i1_dot++;
+		}
+	}
+
+
+	//step 2, scan the corpus for N_{1+}(dot w_{i-n+2}^{i}) and N_{1+}(dot w_{i-n+2}^{i-1} dot)
+	IndexType precedingWord;
+	IndexType followingWord;
+	if(n==2){	//the special case: the middle context w_{i-n+2}^{i-1} is empty
+		result.N1plus_dot_w_in2_i1_dot = this->typeOfBigrams;
+
+		//check if we have the N_1+(dot w_i) information already
+		map<IndexType, unsigned int>::iterator iterTypeFreqPrecedingWord;
+		iterTypeFreqPrecedingWord = this->typeFreqPrecedingWord.find(w_i);
+
+		if(iterTypeFreqPrecedingWord==this->typeFreqPrecedingWord.end()){	//does not exist yet
+			TextLenType startPosInSA = this->level1Buckets[w_i].first;
+			TextLenType endPosInSA = this->level1Buckets[w_i].last;
+
+			//NOTE(review): suffix_list[i]-1 assumes w_i never occurs at corpus
+			//position 0 (i.e. the indexed corpus starts with a non-word token) — confirm
+			set<IndexType> wordTypePrecedesW_i;
+			for(i=startPosInSA;i<=endPosInSA;i++){
+				posInCorpus = this->suffix_list[i] - 1;
+				precedingWord = this->corpus_list[posInCorpus];
+
+				wordTypePrecedesW_i.insert(precedingWord);
+			}
+
+			result.N1plus_dot_w_in2_i = (double) wordTypePrecedesW_i.size();
+
+			//and save this for future references
+			this->typeFreqPrecedingWord.insert(make_pair(w_i, wordTypePrecedesW_i.size()));
+		}
+		else{	//already has the information in typeFreqPrecedingWord
+			result.N1plus_dot_w_in2_i = (double) (iterTypeFreqPrecedingWord->second);
+		}
+	}
+	else{
+		set<IndexType> wordTypesPrecedesW_in2_i;
+		set< pair<IndexType, IndexType> > wordTypesSurroundW_in2_i1;
+
+		for(i=leftBoundaryOfSaRangeFor_w_in2_i1;i<=rightBoundaryOfSaRangeFor_w_in2_i1;i++){
+			posInCorpus = this->suffix_list[i] -1;	//pos of preceding word (w_{i-n+1}) in the corpus
+			precedingWord = this->corpus_list[posInCorpus];
+
+			posInCorpus+=n1;	//pos of following word w_i: suffix_list[i] + (n-2)
+			followingWord = this->corpus_list[posInCorpus];
+
+			pair<IndexType, IndexType> tmpPair = make_pair(precedingWord, followingWord);
+
+			//if w_i equals next word, add the preceding word to set
+			if(followingWord==w_i){
+				wordTypesPrecedesW_in2_i.insert(precedingWord);
+			}
+
+			//add the pair to set
+			wordTypesSurroundW_in2_i1.insert(tmpPair);
+
+		}
+
+
+		result.N1plus_dot_w_in2_i = wordTypesPrecedesW_in2_i.size();
+		result.N1plus_dot_w_in2_i1_dot = wordTypesSurroundW_in2_i1.size();
+	}
+
+	result.valid = true;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h
new file mode 100755
index 0000000..9f9155a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h
@@ -0,0 +1,210 @@
+#if ! defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__)
+#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__
+
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include "salm_shared.h"
+#include "time.h"
+
+/**
+* \ingroup lm
+* Context type information needed in KN-smoothing
+**/
+typedef struct s_contextTypeInfo{
+	double N1plus_dot_w_in2_i;	//N_{1+}(dot w_{i-n+2}^{i}): #distinct word types preceding w_{i-n+2}^{i}; Goodman and Chen 98, eq 23
+	double N1plus_dot_w_in2_i1_dot;	//N_{1+}(dot w_{i-n+2}^{i-1} dot): #distinct (preceding, following) word-type pairs around the history
+	double N1_w_in1_i1_dot;	//#word types following w_{i-n+1}^{i-1} exactly once; Goodman and Chen 98, eq 19
+	double N2_w_in1_i1_dot;	//#word types following the history exactly twice
+	double N3plus_w_in1_i1_dot;	//#word types following the history three or more times
+	bool valid;	//set true by scanCorpusForContextTypeInfo() once the counts are filled in
+}S_ContextTypeInfo;
+
+
+/**
+* \ingroup lm
+**/
+typedef unsigned int LMState;	//opaque handle for a history: index into the internal table of LM states
+
+
+/**
+* \ingroup lm
+* A language model state: the matched history n-gram, identified by where
+* it occurs in the indexed corpus and how many words long it is.
+**/
+typedef struct s_lmStateInfo{
+	TextLenType posInCorpus;	//starting position of the matched n-gram in the corpus
+	unsigned char len;	//length (in words) of the matched n-gram
+}S_LMStateInfo;
+
+/**
+* \ingroup lm
+* Cached result of extending an LM state by one word.
+**/
+typedef struct s_bufferedLmInfo{
+	int nextState;	//id of the LM state reached after appending the word
+	double logProb;	//log P(word | history) for that extension
+}S_BufferedLmInfo;
+
+
+/**
+* \ingroup lm
+**/
+/**
+* Strict weak ordering on S_LMStateInfo (by corpus position, then length)
+* so it can serve as the comparator of a std::map.
+* The operands are taken by const reference: the original passed the
+* structs by value, copying them on every comparison.
+**/
+struct lt_lmStateInfo
+{
+	bool operator()(const S_LMStateInfo & a, const S_LMStateInfo & b) const{
+		if(a.posInCorpus!=b.posInCorpus){
+			return a.posInCorpus<b.posInCorpus;
+		}
+
+		return a.len<b.len;	//positions equal: order by match length
+	}
+};
+
+
+/**
+* \ingroup lm
+* structure for elements in the cache for accessing the suffix array for LM prob
+**/
+typedef struct s_cached_SA_access_key{	//cache key: (current match, next word)
+	TextLenType currentMatchStart;	//corpus position where the currently matched n-gram starts
+	unsigned char currentMatchLen;	//length of the currently matched n-gram
+	IndexType nextWord;	//vocId of the word extending the match
+}S_CachedSA_Access_Key;
+
+typedef struct s_cached_SA_access_info{	//cached value for one key
+	TextLenType updatedMatchingStart;	//start of the extended match in the corpus
+	unsigned char updatedMatchingLen;	//length of the extended match
+	double logProb;	//cached log P(nextWord | match)
+	long lastTimedUsed;	//time of last access; purgeCache() evicts stale entries
+}S_Cached_SA_Access_Info;
+
+/**
+* Strict weak ordering on the SA-access cache key: by match start, then
+* match length, then next word. Operands are taken by const reference to
+* avoid copying the key struct on every map comparison (the original
+* passed by value).
+**/
+struct lt_s_cached_SA_access_key
+{
+	bool operator()(const S_CachedSA_Access_Key & a, const S_CachedSA_Access_Key & b) const{
+		if(a.currentMatchStart!=b.currentMatchStart){
+			return a.currentMatchStart<b.currentMatchStart;
+		}
+
+		if(a.currentMatchLen!=b.currentMatchLen){
+			return a.currentMatchLen<b.currentMatchLen;
+		}
+
+		return a.nextWord<b.nextWord;	//first two fields equal: order by next word
+	}
+};
+
+
+/**
+* \ingroup lm
+* C_SuffixArrayLanguageModel inherit the C_SuffixArraySearchApplicationBase class and C_SuffixArrayScanningBase
+* to provide functionalities of estimating the likelihood of a sentence given an indexed training corpus
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase
+{
+
+public:
+	///Look up the vocabulary ID of a word in the indexed corpus (0 = not found)
+	IndexType returnVocId(C_String aWord);
+
+	/// At the beginning of a sentence, return the LMState and reset the cache
+	LMState beginOfSentenceState();
+
+	/// Calculate the log prob of a word predicted by the history LM state
+	double logProb(LMState lmState, IndexType nextWord, LMState & nextState);
+
+	/// The log prob of a phrase extending the history as a LMState
+	double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState);
+
+	/// End of sentence
+	double logProbEnd(LMState lmState);
+
+	/// Constructors
+	C_SuffixArrayLanguageModel(const char * cfgFileName);
+	C_SuffixArrayLanguageModel();
+	~C_SuffixArrayLanguageModel();
+
+
+private:
+	///Collect the context-type counts (N1/N2/N3+, N_{1+}) needed by KN-smoothing; see the .cpp for details
+	void scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result);
+
+	///Extend the current corpus match by one word; fill freqTable with (n-gram freq, history freq) pairs
+	void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+	///As above, but also gathering context-type information for KN-smoothing
+	void calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+
+	//Log prob calculation
+	double logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+	///Dispatch on 'interpolationStrategy' to one of the routines below
+	double calcLogProb(double *freq);
+	double calcLogProb_equalWeightedInterpolation(double *freq);
+	double calcLogProb_ibmHeuristicInterpolation(double *freq);
+	double calcLogProb_maxProbInterpolation(double * freq);
+	double calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq);
+
+	///parameter and settings
+	///set the interploation strategy
+	void setParam_interpolationStrategy(char interpolationStrategy);
+
+	///set the number of sentences processed by the LM before purging the cache
+	void setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache);
+
+	///set the fresh time thresh for the cache entries
+	void setParam_freshTime(long freshTime);
+
+	char smoothingStrategy;	//smoothing method selector
+	char interpolationStrategy;	//'e' equal-weighted, 'i' IBM heuristic, 'm' max-prob
+	int maxN;	//highest n-gram order considered
+	IndexType vocIdForSentStart;	//vocId of _SENTENCE_START_
+	IndexType vocIdForSentEnd;	//vocId of _END_OF_SENTENCE_
+	IndexType vocIdForCorpusEnd;	//vocId of _END_OF_CORPUS_
+
+
+	///Discounting
+	void constructDiscountingMap();
+	double discountFreq_GT(int n, unsigned int observedFreq);
+
+	double * Y;	// following the notation of Chen&Goodman 98, Eq. 26
+	double * D1;
+	double * D2;
+	double * D3plus;
+	double typeOfBigrams;	//will be needed for KN-smoothing
+
+	double *discountingMap;	//maxN x maxFreqForDiscounting table of Good-Turing discounted freqs (-1 = no discount)
+	bool applyDiscounting;	//true when MAX_FREQ_DISC > 0
+	int maxFreqForDiscounting;	//freqs above this value are never discounted
+	S_nGramScanningInfoElement * nGramScanningList;
+	map<IndexType, unsigned int> typeFreqPrecedingWord;	//memo of N_{1+}(dot w) per word
+
+	///LM State and related functions
+	void resetLmStates();
+	void initialLmState();
+	map< pair<LMState, IndexType>, S_BufferedLmInfo> buffer;	//(state, word) -> cached extension
+	vector<S_LMStateInfo> allLMStates;	//LMState values index this vector
+	map<S_LMStateInfo, int, lt_lmStateInfo> lmStateInfo2Id;	//reverse lookup: n-gram location -> state id
+
+	//caching information for SA access
+	unsigned int sentenceProcessedSoFar;
+	long freshTime;	//entries older than this are evicted by purgeCache()
+	unsigned int numberOfSentSeenToPurgeCache;
+	map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key> cached_sa_access;
+	void purgeCache(long lastVisitedTime);
+
+};
+
+#endif
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp
new file mode 100755
index 0000000..0a94ff0
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp
@@ -0,0 +1,691 @@
+/**
+* Revision $Rev: 3815 $
+* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
+**/
+
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <memory.h>
+#include <cstring>
+
+#include "math.h"
+
+using namespace std;
+
+
+/**
+* Default constructor.
+* Initializes the tunable parameters to the same defaults used by the
+* config-file constructor and NULLs the owned discounting table, so that
+* later code never reads indeterminate members (the original left every
+* member uninitialized).
+**/
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
+{
+	this->maxN = 5;	//default: consider up to 5-grams
+	this->maxFreqForDiscounting = -1;	//default: discounting disabled
+	this->applyDiscounting = false;
+	this->discountingMap = NULL;	//allocated by constructDiscountingMap() when enabled
+	this->interpolationStrategy = 'e';	//default: equal-weighted interpolation
+}
+
+C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
+{
+	//NOTE(review): discountingMap (malloc'ed in constructDiscountingMap())
+	//is never freed here, so it leaks for the object's lifetime. It cannot
+	//safely be freed without guaranteeing the pointer was initialized
+	//(the default constructor leaves it indeterminate) — confirm and fix.
+}
+
+
+/**
+* Construct the suffix array language model object
+* Using the training data corpusFileNameStem that has been indexed by IndexSA
+* Consider at most maxN-gram in language modeling
+* For frequencies that are lower than maxFreqForDiscounting, use Good-Turing for discounting
+* If maxFreqForDiscounting is set to be 0 or negative value, then discounting is turned off. Use MLE to estimate the probability of a word given history
+* @param cfgFileName Configuration file that specifies the value of parameters for SALM
+*
+* Each line in the configuration file is a Keyword Value pair. Legal keywords are:
+* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified!
+* N : Highest order of n considered for n-gram LM estimation, default value = 5
+* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams which have frequencies higher than this value will not be discounted. Negative value will disable the discounting. default value = -1.
+* INTERPOLATION_STRATEGY : Set strategy to interploate the conditional probabilities of next word given different order of histories
+* 'e' default. Equal weighted interpolation of unigram, bigram, trigram... probabiblities
+* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history)
+* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable
+**/
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
+{
+
+	fstream cfgFile;
+	cfgFile.open(cfgFileName,ios::in);
+
+	if(!cfgFile){
+		fprintf(stderr,"Configuration file does not exist! quit!!\n");
+		exit(0);
+	}
+
+	//-----------------------------------------------------------------------------
+	//reading parameters
+	char paraName[1024];
+	char corpusFileNameStem[1024];
+	corpusFileNameStem[0]=0;
+	this->maxFreqForDiscounting=-1;
+
+	this->interpolationStrategy = 'e';	//default interpolation strategy: equally weighted n-gram conditional prob
+	this->maxN = 5;	// default value; consider up to 5 words
+
+	//read "KEYWORD value" pairs; testing the extraction itself (instead of
+	//eof()) avoids re-processing the last keyword on a trailing newline
+	while(cfgFile>>paraName){
+
+		if(strcmp(paraName,"CORPUS")==0){
+			cfgFile>>corpusFileNameStem;
+		}
+		else if(strcmp(paraName,"N")==0){
+			cfgFile>>this->maxN;
+		}
+		else if(strcmp(paraName,"MAX_FREQ_DISC")==0){
+			cfgFile>>this->maxFreqForDiscounting;
+		}
+		else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){
+			cfgFile>>this->interpolationStrategy;
+		}
+
+		paraName[0]=0;
+
+	}
+
+	//load corpus and suffix array
+	if(strlen(corpusFileNameStem)==0){
+		cerr<<"CORPUS need to be specified in the configuration file. This should be the corpus name used for LM.\n";
+		exit(-1);
+	}
+	this->loadData_forSearch(corpusFileNameStem, false, true);	//call the constructor of the super class to load suffix array for corpusName, with vocabulary, no offset
+
+
+	//if apply discounting construct the discounting map
+	if(this->maxFreqForDiscounting<=0){
+		this->applyDiscounting = false;
+	}
+	else{
+		this->applyDiscounting = true;
+		this->constructDiscountingMap();	//scan the corpus and construct the count of counts table and then discounting map
+	}
+
+	//get vocID for sentEnd
+	this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+	if(this->vocIdForSentEnd==0){
+		cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+		exit(0);
+	}
+
+	this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+	if(this->vocIdForSentStart==0){
+		cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+		exit(0);
+	}
+
+	this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+	if(this->vocIdForCorpusEnd==0){
+		cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+		exit(0);
+	}
+
+	//bug fix: the original unconditionally reset interpolationStrategy to 'e'
+	//here, silently discarding any INTERPOLATION_STRATEGY read from the
+	//configuration file; the configured strategy is now honored.
+}
+
+
+/**
+* Similar to the function in C_SuffixArrayScanningBase
+* Scan the corpus to obtain count of counts information
+* and construct the discounting using Good-Turing smoothing
+**/
+/**
+* Scan the indexed corpus (in suffix-array order) to collect count-of-counts
+* statistics for 1..maxN-grams, then build the Good-Turing discounting table:
+* for an n-gram observed r times (r < maxFreqForDiscounting), the table stores
+* r* = (r+1) * n_{r+1} / n_r, where n_r is the number of distinct n-gram types
+* with frequency r. Entries with no reliable estimate are marked -1.
+**/
+void C_SuffixArrayLanguageModel::constructDiscountingMap()
+{
+	int i,j;
+	unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);
+
+	if(countOfCountsTable==NULL){
+		cerr<<"Count of counts table can not be initialized. Exit\n";
+		exit(0);
+	}
+
+	//initialize count of counts table
+	for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){
+		countOfCountsTable[c]=0;
+	}
+
+	//initialize the scanning list; slot i tracks the currently scanned (i+1)-gram
+	S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+	for(i=0;i<this->maxN;i++){
+		nGramScanningList[i].freqSoFar=0;
+		nGramScanningList[i].vocId = 0;
+		nGramScanningList[i].freqThreshForOutput = (unsigned int) -1;	//default, do not output
+	}
+
+	bool stillMeaningful = true;
+	TextLenType saPos=0;
+
+	//single pass over the suffix array: suffixes sharing a prefix are adjacent,
+	//so each slot accumulates the frequency of the current (i+1)-gram and
+	//flushes it into the count-of-counts table when the n-gram changes
+	while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+		TextLenType posInCorpus = this->suffix_list[saPos];
+		IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+		if(wordInCorpus<this->sentIdStart){	//SA positions pointing to sentID are not interesting
+
+			if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){	//n-grams start with <s> and </s>, or <end of corpus> are not interested
+
+				bool quit =false;
+				i=0;
+
+				while(!quit && (i<this->maxN)){
+					wordInCorpus = this->corpus_list[posInCorpus+i];
+					if(
+						(wordInCorpus<this->sentIdStart)&&
+						(wordInCorpus!=this->vocIdForSentEnd)&&
+						(wordInCorpus!=this->vocIdForSentStart)&&
+						(wordInCorpus==nGramScanningList[i].vocId)){	//still match
+
+						nGramScanningList[i].freqSoFar++;
+					}
+					else{	//we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type
+
+						bool validNgramUpSoFar = true;
+						unsigned int freqSoFar;
+
+						for(j=i;j<this->maxN;j++){
+
+
+							if(nGramScanningList[j].vocId==0){	//a NULL word, then this n-gram and longer ones in the scan window are invalid
+								validNgramUpSoFar = false;
+							}
+
+							if(validNgramUpSoFar){	//flush this (j+1)-gram type into the table
+
+								freqSoFar = nGramScanningList[j].freqSoFar;
+								if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+									//increase the count for (j+1)-gram with freq freqSoFar
+									countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
+								}
+							}
+
+							//finished output, now clear the list from point of i
+							if((posInCorpus+j)<this->corpusSize){
+								wordInCorpus = this->corpus_list[posInCorpus+j];
+							}
+							else{
+								wordInCorpus = 0;	//out of bound for corpus
+							}
+
+							if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+								wordInCorpus=0;	//write 0 for <sentId>, <s> and </s>
+								nGramScanningList[j].freqSoFar = 0;
+							}
+							else{
+								nGramScanningList[j].freqSoFar = 1;
+							}
+
+							nGramScanningList[j].vocId = wordInCorpus;
+						}
+
+						quit=true;	//at i+1 gram, already not match, no need to check for longer
+					}
+
+					i++;
+				}
+			}
+		}
+		else{
+			stillMeaningful = false;	//once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text
+		}
+
+		saPos++;
+	}
+
+	//at the end of corpus (according to suffix order): flush what remains in the window
+	bool validNgramUpSoFar = true;
+	unsigned int freqSoFar;
+	for(i=0;i<this->maxN;i++){
+		if(nGramScanningList[i].vocId==0){	//invalid word
+			validNgramUpSoFar = false;
+		}
+
+		if(validNgramUpSoFar){
+
+			freqSoFar = nGramScanningList[i].freqSoFar;
+			if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+				//increase the count for (i+1)-gram with freq freqSoFar
+				countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
+			}
+		}
+	}
+
+	//now, use Good-Turing discounting to create frequency mapping
+	//still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted
+	this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting);
+
+	for(i=0;i<this->maxN;i++){
+		//for (i+1)-gram
+
+		unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+		double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;
+
+		for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){	//only goes to maxFreq-1, because we can not discount maxFreq
+			//for all (freq+1) ngrams: Good-Turing r* = (r+1) * n_{r+1} / n_r with r = freq+1
+			if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){	//both freq exists
+				discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
+			}
+			else{
+				discountingMapForThisN[freq] = -1;	//no reliable estimate; discountFreq() keeps the raw freq
+			}
+		}
+
+		discountingMapForThisN[this->maxFreqForDiscounting-1] = -1;	//won't be used, just for consistency
+	}
+
+
+	free(countOfCountsTable);
+	free(nGramScanningList);	//bug fix: the original leaked this scan buffer
+
+}
+
+///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1]
+///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord
+///only need to get freq(w_n | history) of different history
+///return in freq table, freq(history+Wn, history) for all the matched n
+///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history
+/// 3-gram freq, freq of 3-gram history
+///freqTable should have length of 2*n
+///return the longest match with this updated n-gram
+void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+	//rebuild the history n-gram from the corpus and append nextWord;
+	//freqTable layout: freqTable[2k] = (discounted) freq of the (k+1)-gram
+	//ending in nextWord, freqTable[2k+1] = freq of its (undiscounted) history
+	vector<IndexType> nGram;
+
+	if(currentMatchStart!=(TextLenType) -1){	//-1 will be <unk>
+		if(currentMatchLen==this->maxN){	//we consider only up to this->maxN for the extended n-gram
+			currentMatchStart++;	//drop the oldest history word
+			currentMatchLen--;
+		}
+
+		for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
+			nGram.push_back(this->corpus_list[pos]);
+		}
+	}
+
+	nGram.push_back(nextWord);
+
+	int sentLen = nGram.size();
+
+	//construct the n-gram search table
+	S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
+
+	int startPosForNgram;
+	int startPosForLongestMatchingWithNextWord;
+	int cellIndexForLongestMatchingWithNextWord;
+
+	bool stillMatched = true;
+	bool atLeastOneMatched = false;
+
+	int indexForNgram;
+
+	unsigned int totalOccurrences;
+	unsigned int totalOccurrencesOfHistory;
+
+	//for unigram
+	//NOTE: atLeastOneMatched can only be set here — if the unigram is unseen,
+	//no longer n-gram can match either and the extension fails as a whole
+	indexForNgram = sentLen - 1;
+	if(table[indexForNgram].found){
+		totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+		if(this->applyDiscounting){
+			freqTable[0] = this->discountFreq(1, totalOccurrences);
+		}
+		else{
+			freqTable[0] = totalOccurrences;
+		}
+
+		freqTable[1] = this->corpusSize;	//unigram history is the whole corpus
+		cellIndexForLongestMatchingWithNextWord = indexForNgram;
+		startPosForLongestMatchingWithNextWord = sentLen-1;
+		atLeastOneMatched = true;
+	}
+	else{
+		stillMatched = false;
+	}
+
+	//extend to 2-grams, 3-grams, ... as long as the longer n-gram still occurs
+	int n=2;	//considering 2-gram and longer n-gram now
+	startPosForNgram = sentLen - 2;
+	while((stillMatched)&&(startPosForNgram>=0)){
+
+		indexForNgram = (n-1) * sentLen + startPosForNgram;
+		int indexForHistory = (n-2) * sentLen + startPosForNgram;
+
+		if(table[indexForNgram].found){
+
+			totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+			totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
+
+
+			if(this->applyDiscounting){
+				freqTable[2*n-2] = this->discountFreq(n, totalOccurrences);
+			}
+			else{
+				freqTable[2*n-2] = (double)totalOccurrences;
+			}
+
+			freqTable[2*n-1] = (double) totalOccurrencesOfHistory;	//do not discount the history
+
+			if(n<this->maxN){	//new history is at most this->maxN-1 words long; don't extend the state beyond that
+				cellIndexForLongestMatchingWithNextWord = indexForNgram;
+				startPosForLongestMatchingWithNextWord = startPosForNgram;
+			}
+		}
+		else{
+			stillMatched = false;
+		}
+
+		startPosForNgram--;
+		n++;
+	}
+
+	if(atLeastOneMatched){	//at least one n-gram can be matched with 'nextWord'
+		updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
+		updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
+	}
+	else{	//extension unseen: signal <unk>-style state to the caller
+		updatedMatchingStart = (TextLenType) -1;
+		updatedMatchingLen = 0;
+	}
+
+	free(table);
+
+}
+
+
+//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing
+/**
+* Map an observed n-gram frequency to its Good-Turing discounted value.
+* @param n order of the n-gram
+* @param observedFreq raw corpus frequency of the n-gram
+* @return the discounted frequency, or the observed frequency unchanged when
+*         no discounting applies (highest order, high frequency, or no
+*         reliable count-of-counts estimate in the map)
+**/
+double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq)
+{
+	if(observedFreq==0){	//bug fix/guard: freq 0 would underflow the unsigned map index below
+		return 0.0;
+	}
+
+	if(n>=this->maxN){	//do not discount the highest order
+		return (double) observedFreq;
+	}
+
+	if(observedFreq>=(unsigned int)(this->maxFreqForDiscounting-1)){	//no discounting for high freq
+		return (double) observedFreq;
+	}
+
+	//else, check the discount map
+	double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1];
+
+	if(discountedFreq>0){
+		return discountedFreq;
+	}
+
+	//no reliable count-of-counts estimate was available: keep the observed freq
+	return (double) observedFreq;
+}
+
+
+///Start a new sentence now, clear up the sentence LM state
+/**
+* Start a new sentence: drop all per-sentence LM states, re-create the
+* sentence-start state and hand back its id (always 0).
+**/
+LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
+{
+	resetLmStates();	//clear the per-sentence state tables
+	initialLmState();	//re-seed them with the sentence-start state
+
+	return 0;	//the sentence-start state is always state 0
+}
+
+/**
+* Seed the LM state tables with the sentence-start state, which the
+* indexing convention places at corpus position 1 with length 1.
+**/
+void C_SuffixArrayLanguageModel::initialLmState()
+{
+	S_LMStateInfo startState;
+	startState.locationInCorpus.posInCorpus = 1;	//if corpus is indexed correctly position 1 should be <s>
+	startState.locationInCorpus.len = 1;
+	startState.cachedNextWordExtension.clear();	//no extensions cached yet
+
+	allLMStates.push_back(startState);	//becomes state id 0
+	ngramLocation2LmStateId.insert(make_pair(startState.locationInCorpus, 0));
+}
+
+///Drop every LM state accumulated for the previous sentence.
+void C_SuffixArrayLanguageModel::resetLmStates()
+{
+	ngramLocation2LmStateId.clear();	//forget the location -> id index
+	allLMStates.clear();	//and the states themselves
+}
+
+
+/**
+* Given the current history (as represented by the 'lmState'
+* caculate the log prob of nextWord given this history P(nextword|history)
+* and return the updated language model state with next word appended to the history
+* @param lmState Current language model state
+* @param nextWord The vocId of the next word (the word to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
+{
+	if(lmState>=this->allLMStates.size()){	//lmState is an index into allLMStates
+		cerr<<"Invalid LM State: "<<lmState<<endl;
+		exit(-1);
+	}
+
+	//first check if we have already seen this 'nextWord' before
+	map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache;
+	iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord );
+
+	if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){	//we haven't seen this lmState+word yet
+
+		//search for it in the corpus
+		S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus;
+		S_NgramLocationInCorpus updatedNgramLocation;
+
+		//score nextWord given the history and learn where the extended
+		//n-gram (the next history) lives in the corpus
+		double logProb = this->logProbFromFreq(
+			correspondingNgramLocation.posInCorpus,
+			correspondingNgramLocation.len,
+			nextWord,
+			updatedNgramLocation.posInCorpus,
+			updatedNgramLocation.len);
+
+		//caching the logprob of 'nextword' given the lmState
+		int updatedLmStateId;
+		map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId;
+		iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation);
+		if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){	//this updated lm state does not exist yet
+			S_LMStateInfo newLmStateNode;
+
+			newLmStateNode.locationInCorpus = updatedNgramLocation;
+			newLmStateNode.cachedNextWordExtension.clear();
+
+			this->allLMStates.push_back(newLmStateNode);
+			updatedLmStateId = this->allLMStates.size() -1 ;	//id = index of the appended state
+			this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId));
+		}
+		else{	//reuse the existing state for this corpus location
+			updatedLmStateId = iterNgramLocation2LmStateId->second;
+		}
+
+		//cache this (state, word) extension so repeats are O(log) map lookups
+		S_CachedLmInfo cachedLmInfo;
+		cachedLmInfo.logProb = logProb;
+		cachedLmInfo.nextState = updatedLmStateId;
+
+		this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo));
+
+		//updated next state
+		nextState = updatedLmStateId;
+
+		return logProb;
+	}
+
+	//cache hit: return the memoized state transition and log prob
+	nextState = iterNextWordExtensionCache->second.nextState;
+
+	return iterNextWordExtensionCache->second.logProb;
+}
+
+
+/**
+* Given the history as lmState and append a phrase as a vector of IndexType,
+* calculate the LM prob and update the lm state
+* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of phrase.
+* For cases where phrase is empty, i.e. phrase.size()==0, nextState will not be updated correctly and may cause problems in the calling function.
+ * @param lmState Current language model state
+* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
+{
+	double logProb = 0;
+
+	if (phrase.size() == 0) {	//empty phrase: zero cost, state unchanged (see note above)
+		nextState = lmState;
+		return logProb;
+	}
+
+	//score word by word, threading the LM state through the phrase
+	for(size_t i=0;i<phrase.size();i++){	//size_t index: the original 'int' triggered a signed/unsigned comparison
+		logProb+=this->logProb(lmState, phrase[i], nextState);
+		lmState = nextState;
+	}
+
+	return logProb;
+}
+
+/**
+* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s>
+**/
+/**
+* Close the sentence: score the sentence-end symbol given the final history.
+* The resulting state is discarded since nothing follows the sentence end.
+**/
+double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
+{
+	LMState discardedState;	//nothing can extend the sentence end, so the updated state is unused
+	return this->logProb(lmState, this->vocIdForSentEnd, discardedState);
+}
+
+/**
+* Extend the current matched n-gram with next word, calculate the prob and update the updated range
+* the n-gram is represented by its position in the suffix array and the length
+* @param currentMatchStart Starting position of the current matched n-gram in corpus
+* @param currentMatchLen Length of the matched n-gram \
+* @param nextWord Vocabulary ID of the next word (the word to be predicted)
+* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus
+* @param &updatedMatchingLen The length of the extended n-gram
+**/
+double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+	//zero-initialized table of (n-gram freq, history freq) pairs for n = 1..maxN;
+	//a vector replaces the original malloc/memset/free so the buffer cannot
+	//leak and allocation failure throws instead of dereferencing NULL
+	vector<double> freqTable(2*this->maxN, 0.0);
+
+	this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, &freqTable[0], updatedMatchingStart, updatedMatchingLen);
+
+	return this->calcLogProb(&freqTable[0]);	//interpolate according to the configured strategy
+}
+
+///Dispatch to the interpolation routine selected by 'interpolationStrategy'.
+double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
+{
+	if(this->interpolationStrategy=='e'){	//equal-weighted interpolation
+		return this->calcLogProb_equalWeightedInterpolation(freq);
+	}
+
+	if(this->interpolationStrategy=='i'){	//IBM deleted-interpolation heuristic
+		return this->calcLogProb_ibmHeuristicInterpolation(freq);
+	}
+
+	if(this->interpolationStrategy=='m'){	//maximum probability over all orders
+		return this->calcLogProb_maxProbInterpolation(freq);
+	}
+
+	cerr<<"Unknown interpolation strategy!\n";
+	exit(0);
+}
+
+/**
+* Interpolation strategy 'e': average P(nextWord | history) over all n-gram
+* orders with equal weight 1/maxN; orders beyond the longest match
+* contribute 0. freq[2k] is the (discounted) freq of the (k+1)-gram,
+* freq[2k+1] the freq of its history.
+**/
+double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
+{
+	if(freq[0]<=0){	//unigram unseen: unknown word
+		return SALM_LOG_PROB_UNK;
+	}
+
+	double probSum = 0.0;
+
+	for(int order=0;order<this->maxN;order++){
+		if(freq[2*order]<=0){	//no match at this order; longer orders cannot match
+			break;
+		}
+
+		probSum += freq[2*order]/freq[2*order+1];
+	}
+
+	return log(probSum/(double)this->maxN);
+}
+
+/**
+* Interpolation strategy 'i': IBM-style deleted-interpolation heuristic.
+* Starting from the longest matched order, each order receives a share of
+* the remaining interpolation weight proportional to the estimated
+* reliability of its history (a more frequent history gives a more
+* reliable relative-frequency estimate).
+**/
+double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
+{
+	double prob = 0.0;
+	if(freq[0]==0){	//unknown word
+		return SALM_LOG_PROB_UNK;
+	}
+
+	double remainingWeightSum = 1.0;
+
+	//find the first non-zero match
+	int i = this->maxN - 1;
+
+	while(freq[2*i]==0){	//will stop for sure because freq[0]!=0
+		i--;
+	}
+
+	for(int j=i;j>=0;j--){
+		//for (j+1)-gram
+		double historyFreq = freq[2*j+1];
+		double logHistoryFreq = log(historyFreq);
+		if(logHistoryFreq>1){
+			logHistoryFreq = 1.0;	//cap it to 1
+		}
+
+		double reliability = 0.1*logHistoryFreq+0.3;	//heuristics for reliability of the history; stays in [0.3, 0.4]
+		double adjustedWeights = remainingWeightSum * reliability;
+
+		//bug fix: the original indexed with the loop-invariant 'i' here, so
+		//every order was credited with the longest match's probability; the
+		//(j+1)-gram's own freq / history freq must be used
+		prob+=adjustedWeights * freq[2*j]/freq[2*j+1];
+
+		remainingWeightSum -= adjustedWeights;
+	}
+
+	return log(prob);
+}
+
+/**
+* Interpolation strategy 'm': use the maximum conditional probability
+* observed among all matched n-gram orders as P(nextWord | history).
+* A zero (k+1)-gram freq means no longer n-gram can match, so the scan
+* stops at the first unmatched order.
+**/
+double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
+{
+	if(freq[0]<=0){	//the unigram itself is unseen: unknown word
+		return SALM_LOG_PROB_UNK;
+	}
+
+	double bestProb = 0.0;
+
+	for(int order=0;order<this->maxN;order++){
+		if(freq[2*order]<=0){	//no match at this order; longer orders cannot match
+			break;
+		}
+
+		double condProb = freq[2*order]/freq[2*order+1];
+		if(condProb>bestProb){
+			bestProb = condProb;
+		}
+	}
+
+	return log(bestProb);
+}
+
+/**
+* Look up the vocabulary ID assigned to 'aWord' by the indexed corpus.
+* A return value of 0 means the word is not in the vocabulary.
+**/
+IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
+{
+	return voc->returnId(aWord);	//delegate to the corpus vocabulary
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~
new file mode 100755
index 0000000..5241621
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~
@@ -0,0 +1,690 @@
+/**
+* Revision $Rev: 3815 $
+* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
+**/
+
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <memory.h>
+
+#include "math.h"
+
+using namespace std;
+
+
/**
* Default constructor; performs no initialization.
* NOTE(review): members (discountingMap, maxN, interpolationStrategy, ...) are
* left uninitialized here — only the configuration-file constructor sets them.
* Confirm no caller uses a default-constructed object directly.
**/
C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
{

}
+
/**
* Destructor.
* NOTE(review): discountingMap (malloc'ed in constructDiscountingMap) is never
* freed — a leak. It cannot safely be freed here because the pointer is not
* null-initialized when discounting is disabled; would need init-to-NULL first.
**/
C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
{

}
+
+
+/**
+* Construct the suffix array language model object
+* Using the training data corpusFileNameStem that has been indexed by IndexSA
+* Consider at most maxN-gram in language modeling
+* For frequencies that are lower than maxFreqForDiscounting, use Good-Turing for discounting
+* If maxFreqForDiscounting is set to be 0 or negative value, then discounting is turned off. Use MLE to estimate the probability of a word given history
+* @param cfgFileName Configuration file that specifies the value of parameters for SALM
+*
+* Each line in the configuration file is a Keyword Value pair. Legal keywords are:
+* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified!
+* N : Highest order of n considered for n-gram LM estimation, default value = 5
+* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams which have frequencies higher than this value will not be discounted. Negative value will disable the discounting. default value = -1.
+* INTERPOLATION_STRATEGY : Set strategy to interploate the conditional probabilities of next word given different order of histories
+* 'e' default. Equal weighted interpolation of unigram, bigram, trigram... probabiblities
+* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history)
+* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable
+**/
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
+{
+
+ fstream cfgFile;
+ cfgFile.open(cfgFileName,ios::in);
+
+ if(!cfgFile){
+ fprintf(stderr,"Configuration file does not exist! quit!!\n");
+ exit(0);
+ }
+
+ //-----------------------------------------------------------------------------
+ //reading parameters
+ char paraName[1024];
+ char corpusFileNameStem[1024];
+ corpusFileNameStem[0]=0;
+ this->maxFreqForDiscounting=-1;
+
+ this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob
+ this->maxN = 5; // default value; consider up to 5 words
+
+ while(!cfgFile.eof()){
+ cfgFile>>paraName;
+
+ if(strcmp(paraName,"CORPUS")==0){
+ cfgFile>>corpusFileNameStem;
+ }
+ else if(strcmp(paraName,"N")==0){
+ cfgFile>>this->maxN;
+ }
+ else if(strcmp(paraName,"MAX_FREQ_DISC")==0){
+ cfgFile>>maxFreqForDiscounting;
+ }
+ else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){
+ cfgFile>>this->interpolationStrategy;
+ }
+
+ paraName[0]=0;
+
+ }
+
+ //load corpus and suffix array
+ if(strlen(corpusFileNameStem)==0){
+ cerr<<"CORPUS need to be specified in the configuration file. This should be the corpus name used for LM.\n";
+ exit(-1);
+ }
+ this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class to load suffix array for corpusName, with vocabulary, no offset,
+
+
+ //if apply discounting construct the discounting map
+ if(this->maxFreqForDiscounting<=0){
+ this->applyDiscounting = false;
+ }
+ else{
+ this->applyDiscounting = true;
+ this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->interpolationStrategy = 'e'; //default: interpolation strategy: equally weighted n-gram conditional prob
+
+}
+
+
+/**
+* Similar to the function in C_SuffixArrayScanningBase
+* Scan the corpus to obtain count of counts information
+* and construct the discounting using Good-Turing smoothing
+**/
+void C_SuffixArrayLanguageModel::constructDiscountingMap()
+{
+ int i,j;
+ unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);
+
+ if(countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ //initialize count of counts table
+ for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){
+ countOfCountsTable[c]=0;
+ }
+
+ //initialize the scanning list
+ S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+ for(i=0;i<this->maxN;i++){
+ nGramScanningList[i].freqSoFar=0;
+ nGramScanningList[i].vocId = 0;
+ nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+ if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==nGramScanningList[i].vocId)){ //still match
+
+ nGramScanningList[i].freqSoFar++;
+ }
+ else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){ //perform actions depends on actionType
+
+ freqSoFar = nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+
+ //finished output, now clear the list from point of i
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ nGramScanningList[j].freqSoFar = 1;
+ }
+
+ nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+ quit=true; //at i+1 gram, already not match, no need to check for longer
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+ stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+ if(nGramScanningList[i].vocId==0){ //invalide word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+
+ freqSoFar = nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+ }
+
+ //now, use Good-Turing discounting to create frequency mapping
+ //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted
+ this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting);
+
+ for(i=0;i<this->maxN;i++){
+ //for (i+1)-gram
+
+ unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+ double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;
+
+ for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq
+ //for all (freq+1) ngrams
+ if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists
+ discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
+ }
+ else{
+ discountingMapForThisN[freq] = -1;
+ }
+ }
+
+ discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency
+ }
+
+
+ free(countOfCountsTable);
+
+}
+
///If currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1],
///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord.
///Only need to get freq(w_n | history) for each history length.
///Return in freqTable, pairs of freq(history+Wn, history) for all matched n:
///freqTable[0]=1-gram freq, freqTable[1]=corpusSize, freqTable[2]=2-gram freq,
///freqTable[3]=freq of 2-gram history, etc.; freqTable must have length 2*maxN.
///Returns the longest match including 'nextWord' via updatedMatchingStart /
///updatedMatchingLen ((TextLenType)-1 and 0 when nothing matched).
void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
{
	vector<IndexType> nGram;

	if(currentMatchStart!=(TextLenType) -1){	//-1 will be <unk>
		if(currentMatchLen==this->maxN){	//we consider only up to this->maxN for the extended n-gram
			currentMatchStart++;	//drop the oldest history word so the extension stays within maxN words
			currentMatchLen--;
		}

		//copy the matched history out of the corpus, then append the new word
		for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
			nGram.push_back(this->corpus_list[pos]);
		}
	}

	nGram.push_back(nextWord);

	int sentLen = nGram.size();

	//construct the n-gram search table
	S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);

	int startPosForNgram;
	int startPosForLongestMatchingWithNextWord;
	int cellIndexForLongestMatchingWithNextWord;

	bool stillMatched = true;
	bool atLeastOneMatched = false;

	int indexForNgram;

	unsigned int totalOccurrences;
	unsigned int totalOccurrencesOfHistory;

	//for unigram: occurrence count is the size of the suffix-array range
	indexForNgram = sentLen - 1;
	if(table[indexForNgram].found){
		totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
		if(this->applyDiscounting){
			freqTable[0] = this->discountFreq(1, totalOccurrences);
		}
		else{
			freqTable[0] = totalOccurrences;
		}

		freqTable[1] = this->corpusSize;	//unigram history is the whole corpus
		cellIndexForLongestMatchingWithNextWord = indexForNgram;
		startPosForLongestMatchingWithNextWord = sentLen-1;
		atLeastOneMatched = true;
	}
	else{
		stillMatched = false;
	}

	int n=2;	//considering 2-gram and longer n-gram now
	startPosForNgram = sentLen - 2;
	while((stillMatched)&&(startPosForNgram>=0)){

		indexForNgram = (n-1) * sentLen + startPosForNgram;
		int indexForHistory = (n-2) * sentLen + startPosForNgram;

		if(table[indexForNgram].found){

			totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
			totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;


			if(this->applyDiscounting){
				freqTable[2*n-2] = this->discountFreq(n, totalOccurrences);
			}
			else{
				freqTable[2*n-2] = (double)totalOccurrences;
			}

			freqTable[2*n-1] = (double) totalOccurrencesOfHistory;	//do not discount the history

			if(n<this->maxN){	//new history is at most this->maxN-1 words long, so the returned state never exceeds maxN after the next extension
				cellIndexForLongestMatchingWithNextWord = indexForNgram;
				startPosForLongestMatchingWithNextWord = startPosForNgram;
			}
		}
		else{
			stillMatched = false;
		}

		startPosForNgram--;
		n++;
	}

	if(atLeastOneMatched){	//at least one n-gram can be matched with 'nextWord'
		//convert the suffix-array cell back to a corpus position + length
		updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
		updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
	}
	else{
		updatedMatchingStart = (TextLenType) -1;	//sentinel for "no match / <unk>"
		updatedMatchingLen = 0;
	}

	free(table);

}
+
+
+//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing
+double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq)
+{
+ if(n>=this->maxN){ //do not discount
+ return (double) observedFreq;
+ }
+
+ if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq
+ return (double) observedFreq;
+ }
+
+ //else, check the discount map
+ double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1];
+
+ if(discountedFreq>0){
+ return discountedFreq;
+ }
+
+ //else, no discounting
+ return (double) observedFreq;
+}
+
+
+///Start a new sentence now, clear up the sentence LM state
+LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
+{
+
+ this->resetLmStates();
+ this->initialLmState();
+
+ return 0;
+}
+
+void C_SuffixArrayLanguageModel::initialLmState()
+{
+ //add sentence start
+ S_LMStateInfo sentStartNode;
+ sentStartNode.locationInCorpus.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s>
+ sentStartNode.locationInCorpus.len = 1;
+ sentStartNode.cachedNextWordExtension.clear();
+
+ this->allLMStates.push_back(sentStartNode);
+ this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0));
+}
+
+void C_SuffixArrayLanguageModel::resetLmStates()
+{
+ this->allLMStates.clear();
+ this->ngramLocation2LmStateId.clear();
+}
+
+
/**
* Given the current history (as represented by 'lmState'),
* calculate the log prob of nextWord given this history, P(nextword|history),
* and return the updated language model state with the next word appended.
* Results are cached per state: a repeated (state, word) query is answered
* from allLMStates without touching the suffix array again.
* @param lmState Current language model state (index into allLMStates)
* @param nextWord The vocId of the next word (the word to be predicted)
* @param &nextState Returning the updated language model state when the next word is appended
**/
double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
{
	if(lmState>=this->allLMStates.size()){
		cerr<<"Invalid LM State: "<<lmState<<endl;
		exit(-1);
	}

	//first check if we have already seen this 'nextWord' before
	map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache;
	iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord );

	if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){	//we haven't seen this lmState+word yet

		//search for it in the corpus
		S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus;
		S_NgramLocationInCorpus updatedNgramLocation;

		double logProb = this->logProbFromFreq(
			correspondingNgramLocation.posInCorpus,
			correspondingNgramLocation.len,
			nextWord,
			updatedNgramLocation.posInCorpus,
			updatedNgramLocation.len);

		//caching the logprob of 'nextword' given the lmState
		int updatedLmStateId;
		map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId;
		iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation);
		if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){	//this updated lm state does not exist yet

			//register a fresh state for the extended n-gram location
			S_LMStateInfo newLmStateNode;

			newLmStateNode.locationInCorpus = updatedNgramLocation;
			newLmStateNode.cachedNextWordExtension.clear();

			this->allLMStates.push_back(newLmStateNode);
			updatedLmStateId = this->allLMStates.size() -1 ;
			this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId));
		}
		else{
			updatedLmStateId = iterNgramLocation2LmStateId->second;
		}

		//cache this
		S_CachedLmInfo cachedLmInfo;
		cachedLmInfo.logProb = logProb;
		cachedLmInfo.nextState = updatedLmStateId;

		this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo));

		//updated next state
		nextState = updatedLmStateId;

		return logProb;
	}

	//cache hit: no corpus search needed
	nextState = iterNextWordExtensionCache->second.nextState;

	return iterNextWordExtensionCache->second.logProb;
}
+
+
+/**
+* Given the history as lmState and append a phrase as a vector of IndexType,
+* calculate the LM prob and update the lm state
+* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of phrase.
+* For cases where phrase is empty, i.e. phrase.size()==0, nextState will not be updated correctly and may cause problems in the calling function.
+ * @param lmState Current language model state
+* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
+{
+ double logProb = 0;
+
+ if (phrase.size() == 0) {
+ nextState = lmState;
+ return logProb;
+ }
+
+ for(int i=0;i<phrase.size();i++){
+ logProb+=this->logProb(lmState, phrase[i], nextState);
+ lmState = nextState;
+ }
+
+ return logProb;
+}
+
+/**
+* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s>
+**/
+double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
+{
+ LMState dummyNextState;
+ return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState);
+}
+
+/**
+* Extend the current matched n-gram with next word, calculate the prob and update the updated range
+* the n-gram is represented by its position in the suffix array and the length
+* @param currentMatchStart Starting position of the current matched n-gram in corpus
+* @param currentMatchLen Length of the matched n-gram \
+* @param nextWord Vocabulary ID of the next word (the word to be predicted)
+* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus
+* @param &updatedMatchingLen The length of the extended n-gram
+**/
+double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+
+ double logProb;
+
+ double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN));
+ memset(freqTable, 0, 2*this->maxN*sizeof(double));
+
+ this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen);
+
+ logProb = this->calcLogProb(freqTable);
+
+ free(freqTable);
+
+ return logProb;
+
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
+{
+ switch(this->interpolationStrategy){
+ case 'e':
+ return this->calcLogProb_equalWeightedInterpolation(freq);
+ break;
+ case 'i':
+ return this->calcLogProb_ibmHeuristicInterpolation(freq);
+ break;
+ case 'm':
+ return this->calcLogProb_maxProbInterpolation(freq);
+ break;
+ default:
+ cerr<<"Unknown interpolation strategy!\n";
+ exit(0);
+ }
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
+{
+ double prob = 0.0;
+
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ prob+=freq[2*i]/freq[2*i+1];
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(prob/(double)this->maxN);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
+{
+ double prob = 0.0;
+ if(freq[0]==0){ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+
+ double remainingWeightSum = 1.0;
+
+ //find the first non-zero match
+ int i = this->maxN - 1;
+
+ while(freq[2*i]==0){ //will stop for sure because freq[0]!=0
+ i--;
+ }
+
+ for(int j=i;j>=0;j--){
+ //for (j+1)-gram
+ double historyFreq = freq[2*j+1];
+ double logHistoryFreq = log(historyFreq);
+ if(logHistoryFreq>1){
+ logHistoryFreq = 1.0; //cap it to 1
+ }
+
+ double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history
+ double adjustedWeights = remainingWeightSum * reliability;
+
+ prob+=adjustedWeights * freq[2*i]/freq[2*i+1];
+
+ remainingWeightSum -= adjustedWeights;
+ }
+
+ return log(prob);
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
+{
+ double maxProb = 0.0;
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ double prob=freq[2*i]/freq[2*i+1];
+
+ if(prob>maxProb){
+ maxProb = prob;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(maxProb);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
+
+IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
+{
+ return this->voc->returnId(aWord);
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h
new file mode 100755
index 0000000..62427e5
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h
@@ -0,0 +1,137 @@
+// Revision $Rev: 3794 $
+// Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+
+#if ! defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__)
+#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__
+
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include "salm_shared.h"
+
+/**
+* \ingroup lm
+**/
+typedef unsigned int LMState;
+
+
/**
* \ingroup lm
* Cached result of extending an LM state with one word: the log probability
* of that word and the id of the resulting state.
**/
typedef struct s_cachedLmInfo{
	int nextState;	//index into allLMStates of the state reached by this extension
	double logProb;	//log P(word | this state's history)
}S_CachedLmInfo;
+
/**
* \ingroup lm
* An n-gram identified by an occurrence in the corpus: starting position plus
* length in words (kept small enough for unsigned char — capped at maxN by the
* matching code).
**/
typedef struct s_NgramLocationInCorpus{
	TextLenType posInCorpus;	//starting position of the occurrence
	unsigned char len;	//number of words in the n-gram
}S_NgramLocationInCorpus;
+
/**
* \ingroup lm
* A language-model state: the corpus location of the matched history n-gram
* plus a cache of the extensions already computed from this state.
**/
typedef struct s_lmStateInfo{
	S_NgramLocationInCorpus locationInCorpus;
	map<IndexType, S_CachedLmInfo> cachedNextWordExtension;	//cached information of this LMState extended by the next word
}S_LMStateInfo;
+
+/**
+* \ingroup lm
+**/
+struct lt_ngramLocationInCorpus
+{
+ bool operator()(S_NgramLocationInCorpus a, S_NgramLocationInCorpus b) const{
+ if(a.posInCorpus<b.posInCorpus){
+ return true;
+ }
+
+ if(a.posInCorpus>b.posInCorpus){
+ return false;
+ }
+
+ if(a.len<b.len){
+ return true;
+ }
+
+ return false;
+ }
+};
+
+
/**
* \ingroup lm
* C_SuffixArrayLanguageModel inherit the C_SuffixArraySearchApplicationBase class and C_SuffixArrayScanningBase
* to provide functionalities of estimating the likelihood of a sentence given an indexed training corpus
*
* Revision $Rev: 3794 $
* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
**/
class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase
{

public:
	IndexType returnVocId(C_String aWord);	//vocabulary id of a surface word (0 = unknown)

	/// At the beginning of a sentence, return the LMState and reset the cache
	LMState beginOfSentenceState();

	/// Calculate the log prob of a word predicted by the history LM state
	double logProb(LMState lmState, IndexType nextWord, LMState & nextState);

	/// The log prob of a phrase extending the history as a LMState
	double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState);

	/// End of sentence
	double logProbEnd(LMState lmState);

	///set the interploation strategy ('e', 'i' or 'm' — see the .cpp constructor)
	///NOTE(review): no definition is visible in this chunk; confirm one exists
	void setParam_interpolationStrategy(char interpolationStrategy);


	C_SuffixArrayLanguageModel(const char * cfgFileName);
	C_SuffixArrayLanguageModel();
	~C_SuffixArrayLanguageModel();


private:

	//extend the current corpus match with one word and fill the per-order frequency table
	void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);

	//Log prob calculation
	double logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
	double calcLogProb(double *freq);	//dispatch on interpolationStrategy
	double calcLogProb_equalWeightedInterpolation(double *freq);
	double calcLogProb_ibmHeuristicInterpolation(double *freq);
	double calcLogProb_maxProbInterpolation(double * freq);

	char interpolationStrategy;	//'e', 'i' or 'm'; set from the configuration file
	int maxN;	//highest n-gram order considered
	IndexType vocIdForSentStart;	//vocId of _SENTENCE_START_
	IndexType vocIdForSentEnd;	//vocId of _END_OF_SENTENCE_
	IndexType vocIdForCorpusEnd;	//vocId of _END_OF_CORPUS_

	///Discounting
	void constructDiscountingMap();
	double *discountingMap;	//maxN x maxFreqForDiscounting Good-Turing table; -1 entries mean "no discounting"
	double discountFreq(int n, unsigned int observedFreq);
	bool applyDiscounting;	//false when MAX_FREQ_DISC <= 0
	int maxFreqForDiscounting;
	S_nGramScanningInfoElement * nGramScanningList;


	///LM State and related functions
	void resetLmStates();
	void initialLmState();

	//caching lm prob for each sentence
	vector<S_LMStateInfo> allLMStates;	//LMState is an index into this vector
	map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus> ngramLocation2LmStateId;



};
+
+#endif
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp
new file mode 100755
index 0000000..d7c96a2
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp
@@ -0,0 +1,34 @@
+
+#include "_SuffixArrayScanningBase.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include <iostream>
+#include <fstream>
+#include <map>
+
+using namespace std;
+
+/**
+* Given a corpus indexed by its suffix array, output the count-of-count information
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ //-----------------------------------------------------------------------------
+ if(argc<4){
+ fprintf(stderr,"\nGiven an indexed corpus, output the count of counts for n-grams.\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem maxN maxFreq\n\n",argv[0]);
+ exit(0);
+ }
+
+ unsigned int maxN = atoi(argv[2]);
+ unsigned int maxFreq = atoi(argv[3]);
+
+ C_SuffixArrayScanningBase saObj(argv[1], maxN);
+ saObj.scanSuffixArrayForCountofCounts(maxFreq);
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp
new file mode 100755
index 0000000..8e9544a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp
@@ -0,0 +1,70 @@
+#include "_SuffixArrayScanningBase.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include <iostream>
+#include <fstream>
+#include <map>
+
+using namespace std;
+
+/**
+* Output n-gram types that have frequencies equal or higher than specified
+*
+*
+* CfgFile Format:
+* n1<tab>freq thresh for output n1-gram
+* n2<tab>freq thresh for output n2-gram
+* ... ... ...
+* n1<tab>freq thresh for output n1-gram
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ //-----------------------------------------------------------------------------
+ if(argc<3){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem cfgFile\n\n",argv[0]);
+
+ fprintf(stderr,"\n\tCfgFile Format:");
+ fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram");
+ fprintf(stderr,"\n\t\tn2<tab>freq thresh for output n2-gram");
+ fprintf(stderr,"\n\t\t... ... ...");
+ fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram\n");
+
+
+ exit(0);
+ }
+
+ //processing the threshold file
+ map<int, unsigned int> threshMap;
+ map<int, unsigned int>::iterator iterThreshMap;
+ fstream threshFile;
+ threshFile.open(argv[2]);
+ int n;
+ int maxN = 0;
+ unsigned int thresh;
+ while(! threshFile.eof()){
+ threshFile>>n>>thresh;
+ if(n>maxN){
+ maxN=n;
+ }
+ iterThreshMap = threshMap.find(n);
+ if(iterThreshMap==threshMap.end()){
+ threshMap.insert(make_pair(n,thresh)); //a little over-kill here, should have a well defined cfg file
+ }
+ }
+
+ C_SuffixArrayScanningBase saObj(argv[1], maxN);
+ iterThreshMap = threshMap.begin();
+ while(iterThreshMap!=threshMap.end()){
+ saObj.setNgramOutputFreqThresh(iterThreshMap->first, iterThreshMap->second);
+ iterThreshMap++;
+ }
+
+ saObj.scanSuffixArrayForHighFreqNgramType();
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp
new file mode 100755
index 0000000..35f9d3d
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp
@@ -0,0 +1,32 @@
+#include "_SuffixArrayScanningBase.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include <iostream>
+#include <fstream>
+#include <map>
+
+using namespace std;
+
+/**
+* Given an indexed corpus, output the type/token information of the n-grams in the corpus.
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ //-----------------------------------------------------------------------------
+ if(argc<3){
+ fprintf(stderr,"\nGiven an indexed corpus, output the type token information for n-grams.\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem maxN \n\n",argv[0]);
+ exit(0);
+ }
+
+ unsigned int maxN = atoi(argv[2]);
+
+ C_SuffixArrayScanningBase saObj(argv[1], maxN);
+ saObj.scanSuffixArrayForTypeToken();
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp
new file mode 100755
index 0000000..9050408
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp
@@ -0,0 +1,338 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArrayScanningBase.h"
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase()
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+}
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN)
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+
+ //load suffix array
+ this->loadData(filename, false, true, true);
+
+ this->initializeForScanning(filename, maxN);
+}
+
+void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+}
+
+
+/**
+* Initialize data structure needed for scanning after the suffix array has been loaded
+**/
+void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN)
+{
+ this->maxN = maxN;
+ this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+ this->countOfCountsTable = 0; //no memory has been allocated
+
+ //initialize the scanning list
+ for(int i=0;i<this->maxN;i++){
+ this->nGramScanningList[i].freqSoFar=0;
+ this->nGramScanningList[i].vocId = 0;
+ this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+}
+
+C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase()
+{
+ free(this->nGramScanningList);
+
+ if(this->countOfCountsTable!=0){
+ free(this->countOfCountsTable);
+ }
+
+}
+
+void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh)
+{
+ if(n>this->maxN){
+ cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl;
+ exit(0);
+ }
+
+ this->nGramScanningList[n-1].freqThreshForOutput = freqThresh;
+}
+
+void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType()
+{
+ this->scanSuffixArray('H');
+
+}
+
+/// Count of counts is the number of n-gram types that occur a certain times in the corpus.
+/// Count of counts is important information in LM smoothing
+/// We scan the corpus for n-gram's type/token frequency and collect information for 1-gram, 2-gram,...and up to maxFreqConsidered-gram
+void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+ this->constructCountOfCountsTable();
+
+ //output the count of counts
+ cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl;
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<endl;
+
+ unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered;
+ for(int freq=0;freq<maxFreqConsidered;freq++){
+ cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl;
+ }
+ }
+
+}
+
+///Check from 1-gram to maxN-gram for type-token information
+///the process is similar to "scanSuffixArrayForHighFreqNgramType"
+void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken()
+{
+ this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+ this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+
+ //initialize
+ for(int n=0;n<maxN;n++){
+ this->typeFreq[n]=0;
+ this->tokenFreq[n]=0;
+ }
+
+
+ //scan the suffix array
+ this->scanSuffixArray('T');
+
+ //output
+ cout<<"n\tType\tToken\n";
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl;
+ }
+}
+
+/**
+* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts
+* memory will be freed in the destructor
+**/
+void C_SuffixArrayScanningBase::constructCountOfCountsTable()
+{
+ if(this->countOfCountsTable!=0){ //if there is already a count of counts table
+ free(this->countOfCountsTable);
+ }
+
+ this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered);
+
+ if(this->countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){
+ this->countOfCountsTable[c]=0;
+ }
+
+ this->scanSuffixArray('C');
+
+
+}
+
+/**
+* Scan through the indexed corpus and according to the action type,
+* perform actions accordingly when seeing a new n-gram type
+**/
+void C_SuffixArrayScanningBase::scanSuffixArray(char actionType)
+{
+
+ int i,j;
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+ if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match
+
+ this->nGramScanningList[i].freqSoFar++;
+ }
+ else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ C_String tmpPhrase; //for output high freq n-grams
+
+ //prepare the prefix of the n-grams
+ if(actionType=='H'){
+ //common i-gram
+ for(j=0;j<=i-1;j++){
+ if(this->nGramScanningList[j].vocId==0){ //one of the word in the common i-gram is a NULL word, not a valid n-gram
+ validNgramUpSoFar = false;
+ }
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+ }
+ }
+
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){ //perform actions depends on actionType
+
+ switch(actionType){
+
+ case 'C': //count of counts
+ freqSoFar = this->nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //output high-freq n-grams
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+
+ if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){
+ cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //type-token statistics
+ if(this->nGramScanningList[j].freqSoFar>0){
+ typeFreq[j]++;
+ }
+
+ tokenFreq[j]+=this->nGramScanningList[j].freqSoFar;
+
+ break;
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+
+ //finished output, now clear the list from point of i
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ this->nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ this->nGramScanningList[j].freqSoFar = 1;
+ }
+
+ this->nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+ quit=true; //at i+1 gram, already not match, no need to check for longer
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+ stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ C_String finalTmpString; //for output high-freq n-gram type
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+ if(this->nGramScanningList[i].vocId==0){ //invalide word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+ switch(actionType){
+ case 'C': //for count-of-counts
+ freqSoFar = this->nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //for high-freq n-gram types
+ finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId));
+ finalTmpString.appending(C_String(" "));
+ if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){
+ cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //for type-token statistics
+ if(this->nGramScanningList[i].freqSoFar>0){
+ typeFreq[i]++;
+ }
+
+ tokenFreq[i]+=this->nGramScanningList[i].freqSoFar;
+ break;
+
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+ }
+
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~
new file mode 100755
index 0000000..fd8bae8
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~
@@ -0,0 +1,338 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArrayScanningBase.h"
+#include <iostream>
+#include <cstring>
+
+using namespace std;
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase()
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+}
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN)
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+
+ //load suffix array
+ this->loadData(filename, false, true, true);
+
+ this->initializeForScanning(filename, maxN);
+}
+
+void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+}
+
+
+/**
+* Initialize data structure needed for scanning after the suffix array has been loaded
+**/
+void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN)
+{
+ this->maxN = maxN;
+ this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+ this->countOfCountsTable = 0; //no memory has been allocated
+
+ //initialize the scanning list
+ for(int i=0;i<this->maxN;i++){
+ this->nGramScanningList[i].freqSoFar=0;
+ this->nGramScanningList[i].vocId = 0;
+ this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+}
+
+C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase()
+{
+ free(this->nGramScanningList);
+
+ if(this->countOfCountsTable!=0){
+ free(this->countOfCountsTable);
+ }
+
+}
+
+void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh)
+{
+ if(n>this->maxN){
+ cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl;
+ exit(0);
+ }
+
+ this->nGramScanningList[n-1].freqThreshForOutput = freqThresh;
+}
+
+void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType()
+{
+ this->scanSuffixArray('H');
+
+}
+
+/// Count of counts is the number of n-gram types that occur a certain times in the corpus.
+/// Count of counts is important information in LM smoothing
+/// We scan the corpus for n-gram's type/token frequency and collect information for 1-gram, 2-gram,...and up to maxFreqConsidered-gram
+void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+ this->constructCountOfCountsTable();
+
+ //output the count of counts
+ cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl;
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<endl;
+
+ unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered;
+ for(int freq=0;freq<maxFreqConsidered;freq++){
+ cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl;
+ }
+ }
+
+}
+
+///Check from 1-gram to maxN-gram for type-token information
+///the process is similar to "scanSuffixArrayForHighFreqNgramType"
+void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken()
+{
+ this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+ this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+
+ //initialize
+ for(int n=0;n<maxN;n++){
+ this->typeFreq[n]=0;
+ this->tokenFreq[n]=0;
+ }
+
+
+ //scan the suffix array
+ this->scanSuffixArray('T');
+
+ //output
+ cout<<"n\tType\tToken\n";
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl;
+ }
+}
+
+/**
+* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts
+* memory will be freed in the destructor
+**/
+void C_SuffixArrayScanningBase::constructCountOfCountsTable()
+{
+ if(this->countOfCountsTable!=0){ //if there is already a count of counts table
+ free(this->countOfCountsTable);
+ }
+
+ this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered);
+
+ if(this->countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){
+ this->countOfCountsTable[c]=0;
+ }
+
+ this->scanSuffixArray('C');
+
+
+}
+
+/**
+* Scan through the indexed corpus and according to the action type,
+* perform actions accordingly when seeing a new n-gram type
+**/
+void C_SuffixArrayScanningBase::scanSuffixArray(char actionType)
+{
+
+ int i,j;
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+ if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match
+
+ this->nGramScanningList[i].freqSoFar++;
+ }
+ else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ C_String tmpPhrase; //for output high freq n-grams
+
+ //prepare the prefix of the n-grams
+ if(actionType=='H'){
+ //common i-gram
+ for(j=0;j<=i-1;j++){
+ if(this->nGramScanningList[j].vocId==0){ //one of the word in the common i-gram is a NULL word, not a valid n-gram
+ validNgramUpSoFar = false;
+ }
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+ }
+ }
+
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){ //perform actions depends on actionType
+
+ switch(actionType){
+
+ case 'C': //count of counts
+ freqSoFar = this->nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //output high-freq n-grams
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+
+ if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){
+ cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //type-token statistics
+ if(this->nGramScanningList[j].freqSoFar>0){
+ typeFreq[j]++;
+ }
+
+ tokenFreq[j]+=this->nGramScanningList[j].freqSoFar;
+
+ break;
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+
+ //finished output, now clear the list from point of i
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ this->nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ this->nGramScanningList[j].freqSoFar = 1;
+ }
+
+ this->nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+ quit=true; //at i+1 gram, already not match, no need to check for longer
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+ stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ C_String finalTmpString; //for output high-freq n-gram type
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+ if(this->nGramScanningList[i].vocId==0){ //invalide word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+ switch(actionType){
+ case 'C': //for count-of-counts
+ freqSoFar = this->nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //for high-freq n-gram types
+ finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId));
+ finalTmpString.appending(C_String(" "));
+ if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){
+ cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //for type-token statistics
+ if(this->nGramScanningList[i].freqSoFar>0){
+ typeFreq[i]++;
+ }
+
+ tokenFreq[i]+=this->nGramScanningList[i].freqSoFar;
+ break;
+
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+ }
+
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h
new file mode 100755
index 0000000..c517b72
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h
@@ -0,0 +1,53 @@
+#if !defined (_HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_)
+#define _HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_
+
+
+#include "_SuffixArrayApplicationBase.h"
+
+
+
+
+/**
+* \ingroup scan
+* C_SuffixArrayScanningBase class provides functions to scan through an indexed corpus
+* and output information such as the type/token frequency of the data
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_SuffixArrayScanningBase : public C_SuffixArrayApplicationBase
+{
+public:
+ void setNgramOutputFreqThresh(int n, unsigned int freqThresh);
+ void scanSuffixArrayForHighFreqNgramType();
+ void scanSuffixArrayForCountofCounts(int maxFreqConsidered);
+ void scanSuffixArrayForTypeToken();
+
+ C_SuffixArrayScanningBase(const char * filename, unsigned int maxN);
+ C_SuffixArrayScanningBase();
+ ~C_SuffixArrayScanningBase();
+
+protected:
+ void setParam_maxFreqConsidered(int maxFreqConsidered);
+ void constructCountOfCountsTable();
+ void initializeForScanning(const char * filename, unsigned int maxN);
+
+ int maxN;
+ int maxFreqConsidered;
+
+ unsigned int * countOfCountsTable;
+
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+private:
+ void scanSuffixArray(char actionType);
+
+ S_nGramScanningInfoElement * nGramScanningList;
+
+
+ unsigned int * typeFreq;
+ unsigned int * tokenFreq;
+};
+
+#endif
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp
new file mode 100755
index 0000000..24b8cc4
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp
@@ -0,0 +1,130 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+int SHOW_DEBUG_INFO = 0;
+
+typedef struct s_ngram_freq_info{
+ C_String ngramText;
+ vector<IndexType> ngram;
+ unsigned int freq;
+}S_Ngram_Freq_Info;
+
+/**
+* Given several corpora indexed by their suffix array,
+* collect counts of n-grams in a list from all the corpora.
+* This is useful when a corpus is very large,
+* one can split the data into many chunks and sum up the n-gram frquencies.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //check parameters
+ if(argc<2){
+ cerr<<"\n-------------------------------------------";
+ cerr<<"\nUsage:";
+ cerr<<"\n\t"<<argv[0]<<" n-gram_list_filename < list of suffix arry used";
+ cerr<<"\nNote:";
+ cerr<<"\n\tn-gram_list_filename.id_voc must exist first.";
+ cerr<<"\n-------------------------------------------\n\n";
+
+ exit(0);
+ }
+
+ //load vocabulary
+ char id_voc_filename[1024];
+ sprintf(id_voc_filename, "%s.id_voc", argv[1]);
+ C_IDVocabulary voc(id_voc_filename);
+
+ //load the n-gram list
+ vector<S_Ngram_Freq_Info> ngramList;
+
+ ifstream NgramListFile;
+ NgramListFile.open(argv[1]);
+ char tmpString[4096];
+ while(!NgramListFile.eof()){
+
+ NgramListFile.getline(tmpString, 4096, '\n');
+
+ if(strlen(tmpString)>0){
+ S_Ngram_Freq_Info tmpNode;
+ tmpNode.ngramText = C_String(tmpString);
+ tmpNode.freq = 1;
+ tmpNode.ngram.clear();
+
+ //conver the n-gram as string to vocId
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+ int pos = 0;
+ int inputLen = strlen(tmpString);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = tmpString[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken)));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+ if(pos>=MAX_TOKEN_LEN){ //we can handle it
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken)));
+ }
+
+ ngramList.push_back(tmpNode);
+ }
+ tmpString[0]='\0';
+ }
+ cerr<<"Total "<<ngramList.size()<<" ngrams loaded.\n";
+
+ //loop over all suffix array and collec the n-gram counts
+ char sa_filename[1024];
+ while(! cin.eof()){
+ cin>>sa_filename;
+
+ if(strlen(sa_filename)>0){
+ cerr<<"Considering "<<sa_filename<<endl;
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(sa_filename, true, true);
+
+ for(int i=0; i<ngramList.size(); i++){
+ unsigned int freq;
+
+ freq = sa.freqOfExactPhraseMatch(ngramList[i].ngram);
+
+ ngramList[i].freq+=freq;
+ }
+ }
+
+ sa_filename[0]=0;
+ }
+
+
+ for(int m=0;m<ngramList.size();m++){
+ cout<<ngramList[m].freq<<"\t";
+ cout<<ngramList[m].ngramText.toString()<<"\n";
+ }
+
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~
new file mode 100755
index 0000000..492b770
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~
@@ -0,0 +1,129 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+using namespace std;
+int SHOW_DEBUG_INFO = 0;
+
+typedef struct s_ngram_freq_info{
+ C_String ngramText;
+ vector<IndexType> ngram;
+ unsigned int freq;
+}S_Ngram_Freq_Info;
+
+/**
+* Given several corpora indexed by their suffix array,
+* collect counts of n-grams in a list from all the corpora.
+* This is useful when a corpus is very large,
+* one can split the data into many chunks and sum up the n-gram frquencies.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //check parameters
+ if(argc<2){
+ cerr<<"\n-------------------------------------------";
+ cerr<<"\nUsage:";
+ cerr<<"\n\t"<<argv[0]<<" n-gram_list_filename < list of suffix arry used";
+ cerr<<"\nNote:";
+ cerr<<"\n\tn-gram_list_filename.id_voc must exist first.";
+ cerr<<"\n-------------------------------------------\n\n";
+
+ exit(0);
+ }
+
+ //load vocabulary
+ char id_voc_filename[1024];
+ sprintf(id_voc_filename, "%s.id_voc", argv[1]);
+ C_IDVocabulary voc(id_voc_filename);
+
+ //load the n-gram list
+ vector<S_Ngram_Freq_Info> ngramList;
+
+ ifstream NgramListFile;
+ NgramListFile.open(argv[1]);
+ char tmpString[4096];
+ while(!NgramListFile.eof()){
+
+ NgramListFile.getline(tmpString, 4096, '\n');
+
+ if(strlen(tmpString)>0){
+ S_Ngram_Freq_Info tmpNode;
+ tmpNode.ngramText = C_String(tmpString);
+ tmpNode.freq = 1;
+ tmpNode.ngram.clear();
+
+ //conver the n-gram as string to vocId
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+ int pos = 0;
+ int inputLen = strlen(tmpString);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = tmpString[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken)));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+ if(pos>=MAX_TOKEN_LEN){ //we can handle it
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken)));
+ }
+
+ ngramList.push_back(tmpNode);
+ }
+ tmpString[0]='\0';
+ }
+ cerr<<"Total "<<ngramList.size()<<" ngrams loaded.\n";
+
+ //loop over all suffix array and collec the n-gram counts
+ char sa_filename[1024];
+ while(! cin.eof()){
+ cin>>sa_filename;
+
+ if(strlen(sa_filename)>0){
+ cerr<<"Considering "<<sa_filename<<endl;
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(sa_filename, true, true);
+
+ for(int i=0; i<ngramList.size(); i++){
+ unsigned int freq;
+
+ freq = sa.freqOfExactPhraseMatch(ngramList[i].ngram);
+
+ ngramList[i].freq+=freq;
+ }
+ }
+
+ sa_filename[0]=0;
+ }
+
+
+ for(int m=0;m<ngramList.size();m++){
+ cout<<ngramList[m].freq<<"\t";
+ cout<<ngramList[m].ngramText.toString()<<"\n";
+ }
+
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp
new file mode 100755
index 0000000..9d47f3a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp
@@ -0,0 +1,72 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <map>
+#include <cstring>
+
+using namespace std;
+
+/**
+* Given a corpus indexed by its suffix array, filter out the duplicated sentences in the data
+* and output the unique sentences within.
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem < original corpus > corpus with uniq sentences\n",argv[0]);
+
+ exit(0);
+ }
+
+ map< pair<TextLenType, int>, bool> duplicatedSentAlreadyOutput;
+ map< pair<TextLenType, int>, bool>::iterator iterDuplicatedSentAlreadyOutput;
+
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(argv[1], false, true);
+
+ unsigned long totalFilteredSent = 0;
+
+ cerr<<"Filtering duplicated sentences:\n";
+ char tmpString[4000];
+ while(!cin.eof()){
+ cin.getline(tmpString,100000,'\n');
+ if(strlen(tmpString)>0){
+ TextLenType freq = 0;
+ TextLenType firstOccurrence;
+ int sentLen;
+
+ freq = sa.freqOfExactPhraseMatchAndFirstOccurrence(tmpString, firstOccurrence, sentLen);
+
+ if(freq>1){ //freq is at least 1, because this is the same corpus
+ //then there are multiple occurrences of this sentence
+ //check if we have already output it
+ iterDuplicatedSentAlreadyOutput = duplicatedSentAlreadyOutput.find(make_pair(firstOccurrence, sentLen));
+
+ if(iterDuplicatedSentAlreadyOutput == duplicatedSentAlreadyOutput.end()){ //we haven't output it
+ cout<<tmpString<<endl;
+ duplicatedSentAlreadyOutput.insert(make_pair(make_pair(firstOccurrence, sentLen), true));
+ }
+ else{
+ //it has been output already, ignore it
+ totalFilteredSent++;
+ }
+ }
+ else{ //freq==1, no duplication
+ cout<<tmpString<<endl;
+ }
+
+ }
+ }
+
+ cerr<<"Total "<<totalFilteredSent<<" duplicated sentences are filtered\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~
new file mode 100755
index 0000000..1278b3f
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~
@@ -0,0 +1,71 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <map>
+
+using namespace std;
+
+/**
+* Given a corpus indexed by its suffix array, filter out the duplicated sentences in the data
+* and output the unique sentences within.
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem < original corpus > corpus with uniq sentences\n",argv[0]);
+
+ exit(0);
+ }
+
+ map< pair<TextLenType, int>, bool> duplicatedSentAlreadyOutput;
+ map< pair<TextLenType, int>, bool>::iterator iterDuplicatedSentAlreadyOutput;
+
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(argv[1], false, true);
+
+ unsigned long totalFilteredSent = 0;
+
+ cerr<<"Filtering duplicated sentences:\n";
+ char tmpString[4000];
+ while(!cin.eof()){
+ cin.getline(tmpString,4000,'\n');
+ if(strlen(tmpString)>0){
+ TextLenType freq = 0;
+ TextLenType firstOccurrence;
+ int sentLen;
+
+ freq = sa.freqOfExactPhraseMatchAndFirstOccurrence(tmpString, firstOccurrence, sentLen);
+
+ if(freq>1){ //freq is at least 1, because this is the same corpus
+ //then there are multiple occurrences of this sentence
+ //check if we have already output it
+ iterDuplicatedSentAlreadyOutput = duplicatedSentAlreadyOutput.find(make_pair(firstOccurrence, sentLen));
+
+ if(iterDuplicatedSentAlreadyOutput == duplicatedSentAlreadyOutput.end()){ //we haven't output it
+ cout<<tmpString<<endl;
+ duplicatedSentAlreadyOutput.insert(make_pair(make_pair(firstOccurrence, sentLen), true));
+ }
+ else{
+ //it has been output already, ignore it
+ totalFilteredSent++;
+ }
+ }
+ else{ //freq==1, no duplication
+ cout<<tmpString<<endl;
+ }
+
+ }
+ }
+
+ cerr<<"Total "<<totalFilteredSent<<" duplicated sentences are filtered\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp
new file mode 100755
index 0000000..3daf337
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp
@@ -0,0 +1,47 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+
+/**
+* Application main function: ExactNgramMatchingFreq
+* Input from stdin ngrams with each line containing one n-gram
+* Search the corpus for the occurrences of each n-gram and output their frequencies in the corpus
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem \n",argv[0]);
+
+ exit(0);
+ }
+
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(argv[1], false, true); //we need vocabulary, but do not need offset information here
+
+ cerr<<"Input N-grams:\n";
+ char tmpString[1000];
+ while(!cin.eof()){
+ cin.getline(tmpString,1000,'\n');
+ if(strlen(tmpString)>0){
+ TextLenType freq = 0;
+ freq = sa.freqOfExactPhraseMatch(tmpString);
+ cout<<freq<<": "<<tmpString<<endl;
+ }
+ }
+
+ return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~
new file mode 100755
index 0000000..4c63c0b
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~
@@ -0,0 +1,46 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+
+/**
+* Application main function: ExactNgramMatchingFreq
+* Input from stdin ngrams with each line containing one n-gram
+* Search the corpus for the occurrences of each n-gram and output their frequencies in the corpus
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem \n",argv[0]);
+
+ exit(0);
+ }
+
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(argv[1], false, true); //we need vocabulary, but do not need offset information here
+
+ cerr<<"Input N-grams:\n";
+ char tmpString[1000];
+ while(!cin.eof()){
+ cin.getline(tmpString,1000,'\n');
+ if(strlen(tmpString)>0){
+ TextLenType freq = 0;
+ freq = sa.freqOfExactPhraseMatch(tmpString);
+ cout<<freq<<": "<<tmpString<<endl;
+ }
+ }
+
+ return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp
new file mode 100755
index 0000000..421e503
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp
@@ -0,0 +1,85 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include <vector>
+#include <iostream>
+#include <cstring>
+#include "_SuffixArraySearchApplicationBase.h"
+
+using namespace std;
+
+
+/**
+* Return locations of all the embedded n-grams of a sentence in the indexed corpus
+*
+* Revision $Rev: 3794 $
+* Last modified: $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+
+ //-----------------------------------------------------------------------------
+ //check arguments
+ if(argc<2){
+ fprintf(stderr,"\n\nOutput locations of all the matched embedded n-grams of a sentence in an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem [highestFreq maxRet smallestUnit longestUnit] < list of sentences\n\n",argv[0]);
+
+ exit(-1);
+ }
+
+
+ int highFreq;
+ int maxRet;
+ int smallestUnit;
+ int longestUnit;
+
+ C_SuffixArraySearchApplicationBase saObj;
+
+ saObj.loadData_forSearch(argv[1], false, false);
+
+ if(argc>=6){ //if argument of highestFreq, maxRet, smallestUnits are set
+ highFreq = atoi(argv[2]);
+ maxRet = atoi(argv[3]);
+ smallestUnit = atoi(argv[4]);
+ longestUnit = atoi(argv[5]);
+
+ saObj.setParam_highestFreqThresholdForReport(highFreq);
+ saObj.setParam_reportMaxOccurrenceOfOneNgram(maxRet);
+ saObj.setParam_shortestUnitToReport(smallestUnit);
+ saObj.setParam_longestUnitToReport(longestUnit);
+ }
+
+ cerr<<"Input sentences:\n";
+
+ char sentence[10000];
+
+ while(!cin.eof()){
+ cin.getline(sentence,10000,'\n');
+ if(strlen(sentence)>0){
+
+ vector<C_String> sentAsCStringVector = saObj.convertCharStringToCStringVector(sentence); //for later display purpose
+
+
+ vector<S_phraseLocationElement> locations;
+ locations = saObj.findPhrasesInASentence(sentence);
+
+ if(locations.size()==0){
+ cout<<"Nothing can be found in the corpus.\n";
+ }
+ else{
+ for(int i=0;i<locations.size(); i++){
+ cout<<"N-gram ["<<(int)locations[i].posStartInSrcSent<<", "<<(int)locations[i].posEndInSrcSent<<"]: ";
+ for(int j=locations[i].posStartInSrcSent; j<=locations[i].posEndInSrcSent; j++){
+ cout<<sentAsCStringVector[j-1].toString()<<" ";
+ }
+ cout<<" found in corpus: ";
+ cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
+ }
+ }
+ cout<<endl;
+ }
+ }
+
+
+
+ return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~
new file mode 100755
index 0000000..cd7a86a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~
@@ -0,0 +1,84 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include <vector>
+#include <iostream>
+#include "_SuffixArraySearchApplicationBase.h"
+
+using namespace std;
+
+
+/**
+* Return locations of all the embedded n-grams of a sentence in the indexed corpus
+*
+* Revision $Rev: 3794 $
+* Last modified: $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+
+ //-----------------------------------------------------------------------------
+ //check arguments
+ if(argc<2){
+ fprintf(stderr,"\n\nOutput locations of all the matched embedded n-grams of a sentence in an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem [highestFreq maxRet smallestUnit longestUnit] < list of sentences\n\n",argv[0]);
+
+ exit(-1);
+ }
+
+
+ int highFreq;
+ int maxRet;
+ int smallestUnit;
+ int longestUnit;
+
+ C_SuffixArraySearchApplicationBase saObj;
+
+ saObj.loadData_forSearch(argv[1], false, false);
+
+ if(argc>=6){ //if argument of highestFreq, maxRet, smallestUnits are set
+ highFreq = atoi(argv[2]);
+ maxRet = atoi(argv[3]);
+ smallestUnit = atoi(argv[4]);
+ longestUnit = atoi(argv[5]);
+
+ saObj.setParam_highestFreqThresholdForReport(highFreq);
+ saObj.setParam_reportMaxOccurrenceOfOneNgram(maxRet);
+ saObj.setParam_shortestUnitToReport(smallestUnit);
+ saObj.setParam_longestUnitToReport(longestUnit);
+ }
+
+ cerr<<"Input sentences:\n";
+
+ char sentence[10000];
+
+ while(!cin.eof()){
+ cin.getline(sentence,10000,'\n');
+ if(strlen(sentence)>0){
+
+ vector<C_String> sentAsCStringVector = saObj.convertCharStringToCStringVector(sentence); //for later display purpose
+
+
+ vector<S_phraseLocationElement> locations;
+ locations = saObj.findPhrasesInASentence(sentence);
+
+ if(locations.size()==0){
+ cout<<"Nothing can be found in the corpus.\n";
+ }
+ else{
+ for(int i=0;i<locations.size(); i++){
+ cout<<"N-gram ["<<(int)locations[i].posStartInSrcSent<<", "<<(int)locations[i].posEndInSrcSent<<"]: ";
+ for(int j=locations[i].posStartInSrcSent; j<=locations[i].posEndInSrcSent; j++){
+ cout<<sentAsCStringVector[j-1].toString()<<" ";
+ }
+ cout<<" found in corpus: ";
+ cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
+ }
+ }
+ cout<<endl;
+ }
+ }
+
+
+
+ return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp
new file mode 100755
index 0000000..deb8b81
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp
@@ -0,0 +1,67 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <vector>
+#include <iostream>
+#include <cstring>
+
+using namespace std;
+
+/**
+* \ingroup search
+*
+* Locate an n-gram in the indexed corpus, return its locations as <sentId, offsetInSent> pairs
+* SentID and offset are all 1-based
+*
+* Note:
+* The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement
+* To output it as a number, one needs to cast it to integer type for proper display
+*
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]);
+
+ exit(-1);
+ }
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase saObj;
+
+ //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
+ saObj.loadData_forSearch(argv[1], false, false);
+
+
+ cerr<<"Input N-grams:\n";
+ char tmpString[10000];
+ while(!cin.eof()){
+ cin.getline(tmpString,10000,'\n');
+ if(strlen(tmpString)>0){
+ vector<S_SimplePhraseLocationElement> locations;
+
+ locations = saObj.locateExactPhraseInCorpus(tmpString);
+
+ if(locations.size()==0){
+ cout<<"No occurrences found.\n";
+ }
+ else{
+ for(int i=0;i<locations.size(); i++){
+ cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
+ }
+ }
+ cout<<endl;
+ }
+ }
+
+ return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~
new file mode 100755
index 0000000..71097f9
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~
@@ -0,0 +1,66 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <vector>
+#include <iostream>
+
+using namespace std;
+
+/**
+* \ingroup search
+*
+* Locate an n-gram in the indexed corpus, return its locations as <sentId, offsetInSent> pairs
+* SentID and offset are all 1-based
+*
+* Note:
+* The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement
+* To output it as a number, one needs to cast it to integer type for proper display
+*
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]);
+
+ exit(-1);
+ }
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase saObj;
+
+ //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
+ saObj.loadData_forSearch(argv[1], false, false);
+
+
+ cerr<<"Input N-grams:\n";
+ char tmpString[10000];
+ while(!cin.eof()){
+ cin.getline(tmpString,10000,'\n');
+ if(strlen(tmpString)>0){
+ vector<S_SimplePhraseLocationElement> locations;
+
+ locations = saObj.locateExactPhraseInCorpus(tmpString);
+
+ if(locations.size()==0){
+ cout<<"No occurrences found.\n";
+ }
+ else{
+ for(int i=0;i<locations.size(); i++){
+ cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
+ }
+ }
+ cout<<endl;
+ }
+ }
+
+ return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp
new file mode 100755
index 0000000..e614fdc
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp
@@ -0,0 +1,132 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstring>
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+
+#include <time.h>
+#include <stdio.h>
+#include <map>
+
+using namespace std;
+
+/**
+* Given the indexed training corpus, analyze the token/type matching ratio of the n-grams in the testing data.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+ fprintf(stderr,"\nOutput the n-gram matching statistics of a testing data given an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ map<int, pair<int, unsigned long> > results4OneSent;
+ map<int, pair<int, unsigned long> >::iterator iterResult;
+
+ vector<int> nGramTokenCountsInTest;
+ vector<int> nGramInTestMatched;
+ vector<double> nGramFreqInTrainMatched;
+
+ int maxSentLen = 4086;
+ nGramTokenCountsInTest.reserve(maxSentLen);
+ nGramInTestMatched.reserve(maxSentLen);
+ nGramFreqInTrainMatched.reserve(maxSentLen);
+
+ //initialize
+ for(int i=0;i<maxSentLen;i++){
+ nGramTokenCountsInTest.push_back(0);
+ nGramInTestMatched.push_back(0);
+ nGramFreqInTrainMatched.push_back(0);
+ }
+
+ char fileName[1000];
+ char tmpString[10000];
+
+ strcpy(fileName, argv[1]);
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(fileName, false, true);
+
+ fprintf(stderr,"Input sentences:\n");
+
+ long ltime1, ltime2;
+
+ time( &ltime1 );
+
+ int totalSentences = 0;
+ int matchedSentences = 0;
+ while(!cin.eof()){
+ int sentLen;
+ cin.getline(tmpString,10000,'\n');
+
+ if(strlen(tmpString)>0){
+
+ totalSentences++;
+
+ results4OneSent.clear();
+ results4OneSent = SA.returnNGramMatchingStatForOneSent(tmpString, sentLen);
+
+ if(sentLen>=maxSentLen){
+ cerr<<"Sentence too long, we can not handle it! Exit.\n";
+ exit(0);
+ }
+
+ for(int j=1;j<=sentLen;j++){ //j-gram
+ nGramTokenCountsInTest[j]+=(sentLen-j+1); //number of j-grams in the sentence;
+ }
+
+ iterResult=results4OneSent.begin();
+ while(iterResult!=results4OneSent.end()){
+
+ nGramInTestMatched[iterResult->first]+=iterResult->second.first;
+ nGramFreqInTrainMatched[iterResult->first]+=iterResult->second.second;
+
+ if(iterResult->first==sentLen){ //a complete match
+ matchedSentences++;
+ }
+
+ iterResult++;
+ }
+ }
+
+ tmpString[0]=0;
+
+ }
+
+ int n = 1;
+ while(nGramInTestMatched[n]!=0){
+ int matched = nGramInTestMatched[n];
+ int totalInTest = nGramTokenCountsInTest[n];
+ cout<<"N="<<n<<":\t"<<matched<<" / "<<totalInTest<<"\t";
+ printf("%.1f\t", double(matched)/double(totalInTest)*100.0);
+ cout<<"OccInTrain= "<<nGramFreqInTrainMatched[n]<<endl;
+
+ n++;
+ }
+
+ cout<<"\nOut of "<<totalSentences<<" input sentences, "<<matchedSentences<<" can be found in the training data.\n";;
+ time( &ltime2 );
+ cout<<"Time cost:"<<ltime2-ltime1<<" seconds\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~
new file mode 100755
index 0000000..d33d3a9
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~
@@ -0,0 +1,131 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+
+#include <time.h>
+#include <stdio.h>
+#include <map>
+
+using namespace std;
+
+/**
+* Given the indexed training corpus, analyze the token/type matching ratio of the n-grams in the testing data.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+ fprintf(stderr,"\nOutput the n-gram matching statistics of a testing data given an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ map<int, pair<int, unsigned long> > results4OneSent;
+ map<int, pair<int, unsigned long> >::iterator iterResult;
+
+ vector<int> nGramTokenCountsInTest;
+ vector<int> nGramInTestMatched;
+ vector<double> nGramFreqInTrainMatched;
+
+ int maxSentLen = 4086;
+ nGramTokenCountsInTest.reserve(maxSentLen);
+ nGramInTestMatched.reserve(maxSentLen);
+ nGramFreqInTrainMatched.reserve(maxSentLen);
+
+ //initialize
+ for(int i=0;i<maxSentLen;i++){
+ nGramTokenCountsInTest.push_back(0);
+ nGramInTestMatched.push_back(0);
+ nGramFreqInTrainMatched.push_back(0);
+ }
+
+ char fileName[1000];
+ char tmpString[10000];
+
+ strcpy(fileName, argv[1]);
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(fileName, false, true);
+
+ fprintf(stderr,"Input sentences:\n");
+
+ long ltime1, ltime2;
+
+ time( &ltime1 );
+
+ int totalSentences = 0;
+ int matchedSentences = 0;
+ while(!cin.eof()){
+ int sentLen;
+ cin.getline(tmpString,10000,'\n');
+
+ if(strlen(tmpString)>0){
+
+ totalSentences++;
+
+ results4OneSent.clear();
+ results4OneSent = SA.returnNGramMatchingStatForOneSent(tmpString, sentLen);
+
+ if(sentLen>=maxSentLen){
+ cerr<<"Sentence too long, we can not handle it! Exit.\n";
+ exit(0);
+ }
+
+ for(int j=1;j<=sentLen;j++){ //j-gram
+ nGramTokenCountsInTest[j]+=(sentLen-j+1); //number of j-grams in the sentence;
+ }
+
+ iterResult=results4OneSent.begin();
+ while(iterResult!=results4OneSent.end()){
+
+ nGramInTestMatched[iterResult->first]+=iterResult->second.first;
+ nGramFreqInTrainMatched[iterResult->first]+=iterResult->second.second;
+
+ if(iterResult->first==sentLen){ //a complete match
+ matchedSentences++;
+ }
+
+ iterResult++;
+ }
+ }
+
+ tmpString[0]=0;
+
+ }
+
+ int n = 1;
+ while(nGramInTestMatched[n]!=0){
+ int matched = nGramInTestMatched[n];
+ int totalInTest = nGramTokenCountsInTest[n];
+ cout<<"N="<<n<<":\t"<<matched<<" / "<<totalInTest<<"\t";
+ printf("%.1f\t", double(matched)/double(totalInTest)*100.0);
+ cout<<"OccInTrain= "<<nGramFreqInTrainMatched[n]<<endl;
+
+ n++;
+ }
+
+ cout<<"\nOut of "<<totalSentences<<" input sentences, "<<matchedSentences<<" can be found in the training data.\n";;
+ time( &ltime2 );
+ cout<<"Time cost:"<<ltime2-ltime1<<" seconds\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp
new file mode 100755
index 0000000..ca12119
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp
@@ -0,0 +1,50 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+/**
+* Given a corpus indexed by its suffix array, input a sentence from STDIN and output the frequencies of its embedded n-grams in the corpus.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ char tmpString[1000];
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(argv[1], false, true);
+
+ fprintf(stderr,"Input Sentences:\n");
+
+ while(!cin.eof()){
+ cin.getline(tmpString,1000,'\n');
+ if(strlen(tmpString)>0){
+ SA.displayNgramMatchingFreq4Sent(tmpString);
+ }
+ }
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~
new file mode 100755
index 0000000..5e2433b
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~
@@ -0,0 +1,49 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+/**
+* Given a corpus indexed by its suffix array, input a sentence from STDIN and output the frequencies of its embedded n-grams in the corpus.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ char tmpString[1000];
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(argv[1], false, true);
+
+ fprintf(stderr,"Input Sentences:\n");
+
+ while(!cin.eof()){
+ cin.getline(tmpString,1000,'\n');
+ if(strlen(tmpString)>0){
+ SA.displayNgramMatchingFreq4Sent(tmpString);
+ }
+ }
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp
new file mode 100755
index 0000000..544a230
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp
@@ -0,0 +1,144 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "float.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+///Given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n>
+///startingPosInSrcSent starts at 0, n is the n-gram length
+void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
+{
+ n = index / sentLen + 1;
+ posInSrcSent = index % sentLen;
+}
+
+///Given the starting position in src sentence and the length of the n-gram
+///calculate the index in the table
+///posInSent starts at 0, n is the actual len of n-gram, starts at 1
+unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
+{
+ unsigned int indexInTable = (n-1)*sentLen + posInSent;
+
+ return indexInTable;
+}
+
+/**
+* Given a corpus indexed by its suffix array
+* calculate the non-compositionalities of the embedded n-grams in a testing sentence
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ char tmpString[1000];
+ double bigN = 1000000;
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(argv[1], false, true);
+
+ fprintf(stderr,"Input Sentences:\n");
+
+ while(!cin.eof()){
+ cin.getline(tmpString,1000,'\n');
+ if(strlen(tmpString)>0){
+
+ SA.displayNgramMatchingFreq4Sent(tmpString);
+
+ printf("\n");
+
+ int sentLen;
+
+ S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen);
+
+ //convert this to frequency table
+ double * freqTable = (double *) malloc (sizeof(double)*sentLen*sentLen);
+
+ for(unsigned int i=0;i<(sentLen*sentLen);i++){
+ //all the short n-grams should all exist and their frequency information should be in table now
+ unsigned int startPos, n;
+ double minNc;
+ int leftNWithMinNc;
+
+ local_oneDimensionTableIndexToTwoDimension(i, sentLen, startPos, n);
+
+ if(matchingTable[i].found){
+ double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1;
+ freqTable[i]=freq;
+
+
+
+ //consider all splitting method
+ minNc = DBL_MAX;
+
+ for(unsigned int leftN=1;leftN<n;leftN++){
+ int index_left = local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, sentLen);
+ int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, sentLen);
+
+ double leftFreq = freqTable[index_left];
+ double rightFreq = freqTable[index_right];
+
+ double nc = freq*bigN/(leftFreq*rightFreq);
+
+ if(nc<minNc){
+ minNc = nc;
+ leftNWithMinNc = leftN;
+ }
+
+ }
+ }
+ else{
+ freqTable[i]=0;
+ minNc = 0;
+ }
+
+ if(startPos==0){
+ printf("\n%d\t",n);
+ }
+
+ if(n==1){
+ printf("A\t"); //atom word, no way to break it
+ }
+ else{
+ if(minNc>0){
+ printf("%.1f[%d]\t", minNc, leftNWithMinNc);
+ }
+ else{
+ printf("_\t");
+ }
+ }
+ }
+
+ printf("\n");
+
+
+ free(matchingTable);
+ free(freqTable);
+
+
+ }
+ }
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~
new file mode 100755
index 0000000..294724e
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~
@@ -0,0 +1,145 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "float.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+///Given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n>
+///startingPosInSrcSent starts at 0, n is the n-gram length
+void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
+{
+ n = index / sentLen + 1;
+ posInSrcSent = index % sentLen;
+}
+
+///Given the starting position in src sentence and the length of the n-gram
+///calculate the index in the table
+///posInSent starts at 0, n is the actual len of n-gram, starts at 1
+unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
+{
+ unsigned int indexInTable = (n-1)*sentLen + posInSent;
+
+ return indexInTable;
+}
+
+/**
+* Given a corpus indexed by its suffix array
+* calculate the non-compositionalities of the embedded n-grams in a testing sentence
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ char tmpString[1000];
+ double bigN = 1000000;
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(argv[1], false, true);
+
+ fprintf(stderr,"Input Sentences:\n");
+
+ while(!cin.eof()){
+ cin.getline(tmpString,1000,'\n');
+ if(strlen(tmpString)>0){
+
+ SA.displayNgramMatchingFreq4Sent(tmpString);
+
+ printf("\n");
+
+ int sentLen;
+
+ S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen);
+
+ //convert this to frequency table
+ double * freqTable = (double *) malloc (sizeof(double)*sentLen*sentLen);
+
+ for(unsigned int i=0;i<(sentLen*sentLen);i++){
+ //all the short n-grams should all exist and their frequency information should be in table now
+ unsigned int startPos, n;
+ double minNc;
+ int leftNWithMinNc;
+
+ local_oneDimensionTableIndexToTwoDimension(i, sentLen, startPos, n);
+
+ if(matchingTable[i].found){
+ double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1;
+ freqTable[i]=freq;
+
+
+
+ //consider all splitting method
+ minNc = DBL_MAX;
+
+ for(unsigned int leftN=1;leftN<n;leftN++){
+ int index_left = local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, sentLen);
+ int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, sentLen);
+
+ double leftFreq = freqTable[index_left];
+ double rightFreq = freqTable[index_right];
+
+ double nc = freq*bigN/(leftFreq*rightFreq);
+
+ if(nc<minNc){
+ minNc = nc;
+ leftNWithMinNc = leftN;
+ }
+
+ }
+ }
+ else{
+ freqTable[i]=0;
+ minNc = 0;
+ }
+
+ if(startPos==0){
+ printf("\n%d\t",n);
+ }
+
+ if(n==1){
+ printf("A\t"); //atom word, no way to break it
+ }
+ else{
+ if(minNc>0){
+ printf("%.1f[%d]\t", minNc, leftNWithMinNc);
+ }
+ else{
+ printf("_\t");
+ }
+ }
+ }
+
+ printf("\n");
+
+
+ free(matchingTable);
+ free(freqTable);
+
+
+ }
+ }
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp
new file mode 100755
index 0000000..9697f4a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp
@@ -0,0 +1,178 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include "_String.h"
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <time.h>
+#include <stdio.h>
+#include <map>
+#include <cstring>
+
+using namespace std;
+
+
+vector<C_String> convertTextToStringVector(const char * sentText)
+{
+
+ vector<C_String> sentAsStringVect;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVect.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+			if(pos>=MAX_TOKEN_LEN){	//we can't handle tokens this long; bail out below
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVect.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVect;
+}
+
+/**
+* \ingroup search
+*
+* Given the training corpus indexed by its suffix array,
+* output all the n-grams in a testing data that can be found in the training corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+ fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ map<C_String, double> matchedNgrams;
+ map<C_String, double>::iterator iterMatchedNgrams;
+
+
+ int maxSentLen = 4086;
+
+
+ char fileName[1000];
+ char tmpString[10000];
+
+ strcpy(fileName, argv[1]);
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(fileName, false, true);
+
+ cerr<<"Input sentences:\n";
+
+ long ltime1, ltime2;
+
+ time( &ltime1 );
+
+ int totalSentences = 0;
+ int matchedSentences = 0;
+ while(!cin.eof()){
+ cin.getline(tmpString,10000,'\n');
+
+ if(strlen(tmpString)>0){
+ vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString);
+
+ int sentLen;
+ S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen);
+
+ if(sentLen!=sentAsStringVector.size()){
+ cerr<<"Something wrong, can not proceed.!\n";
+ exit(-1);
+ }
+
+
+ //go over the frequency table
+ for(int startPos = 0; startPos<sentLen; startPos++){
+ C_String ngram;
+ bool stillMatching = true;
+ int n=1;
+				while(stillMatching && (n<=(sentLen-startPos)) ){
+
+ ngram.appending(sentAsStringVector[startPos+n-1]);
+
+ int posInFreqTable = (n-1)*sentLen+startPos;
+ if(freqTable[posInFreqTable].found){
+ double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1;
+
+ iterMatchedNgrams = matchedNgrams.find(ngram);
+ if(iterMatchedNgrams!=matchedNgrams.end()){ //exist already
+ iterMatchedNgrams->second=frequency; //frequency is not meaningful in this case, just use it because map need some values to be mapped to
+ }
+ else{
+ matchedNgrams.insert(make_pair(ngram, frequency));
+ }
+ }
+ else{
+ stillMatching = false;
+ }
+
+
+ ngram.appending(C_String(" "));
+
+ n++;
+ }
+ }
+
+ }
+
+ tmpString[0]=0;
+
+ }
+
+
+ //now output all the n-grams
+ iterMatchedNgrams = matchedNgrams.begin();
+ while(iterMatchedNgrams != matchedNgrams.end()){
+ cout<<(iterMatchedNgrams->first).toString()<<endl;
+
+ iterMatchedNgrams++;
+ }
+
+
+ time( &ltime2 );
+	cerr<<"Time spent:"<<ltime2-ltime1<<" seconds\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~
new file mode 100755
index 0000000..5418db6
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~
@@ -0,0 +1,177 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include "_String.h"
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <time.h>
+#include <stdio.h>
+#include <map>
+
+using namespace std;
+
+
+vector<C_String> convertTextToStringVector(const char * sentText)
+{
+
+ vector<C_String> sentAsStringVect;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVect.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+			if(pos>=MAX_TOKEN_LEN){	//we can't handle tokens this long; bail out below
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVect.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVect;
+}
+
+/**
+* \ingroup search
+*
+* Given the training corpus indexed by its suffix array,
+* output all the n-grams in a testing data that can be found in the training corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+ fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ map<C_String, double> matchedNgrams;
+ map<C_String, double>::iterator iterMatchedNgrams;
+
+
+ int maxSentLen = 4086;
+
+
+ char fileName[1000];
+ char tmpString[10000];
+
+ strcpy(fileName, argv[1]);
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(fileName, false, true);
+
+ cerr<<"Input sentences:\n";
+
+ long ltime1, ltime2;
+
+ time( &ltime1 );
+
+ int totalSentences = 0;
+ int matchedSentences = 0;
+ while(!cin.eof()){
+ cin.getline(tmpString,10000,'\n');
+
+ if(strlen(tmpString)>0){
+ vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString);
+
+ int sentLen;
+ S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen);
+
+ if(sentLen!=sentAsStringVector.size()){
+ cerr<<"Something wrong, can not proceed.!\n";
+ exit(-1);
+ }
+
+
+ //go over the frequency table
+ for(int startPos = 0; startPos<sentLen; startPos++){
+ C_String ngram;
+ bool stillMatching = true;
+ int n=1;
+				while(stillMatching && (n<=(sentLen-startPos)) ){
+
+ ngram.appending(sentAsStringVector[startPos+n-1]);
+
+ int posInFreqTable = (n-1)*sentLen+startPos;
+ if(freqTable[posInFreqTable].found){
+ double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1;
+
+ iterMatchedNgrams = matchedNgrams.find(ngram);
+ if(iterMatchedNgrams!=matchedNgrams.end()){ //exist already
+ iterMatchedNgrams->second=frequency; //frequency is not meaningful in this case, just use it because map need some values to be mapped to
+ }
+ else{
+ matchedNgrams.insert(make_pair(ngram, frequency));
+ }
+ }
+ else{
+ stillMatching = false;
+ }
+
+
+ ngram.appending(C_String(" "));
+
+ n++;
+ }
+ }
+
+ }
+
+ tmpString[0]=0;
+
+ }
+
+
+ //now output all the n-grams
+ iterMatchedNgrams = matchedNgrams.begin();
+ while(iterMatchedNgrams != matchedNgrams.end()){
+ cout<<(iterMatchedNgrams->first).toString()<<endl;
+
+ iterMatchedNgrams++;
+ }
+
+
+ time( &ltime2 );
+	cerr<<"Time spent:"<<ltime2-ltime1<<" seconds\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp
new file mode 100755
index 0000000..ebb2ed5
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp
@@ -0,0 +1,754 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <stdlib.h>
+#include <cstring>
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase()
+{
+
+ this->reportMaxOccurrenceOfOneNgram = -1;
+ this->highestFreqThresholdForReport = -1;
+ this->shortestUnitToReport = 1;
+ this->longestUnitToReport = -1; //no constraint
+
+ this->level1Buckets = NULL;
+ this->noLevel1Bucket = false; //by default, build level1 bucket
+
+ this->noOffset = false; //by default, load offset
+}
+
+C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase()
+{
+
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter highestFreqThresholdForReport is set so that very high frequent n-grams such as unigram "the" is skipped
+* high frequent n-grams occur too often in the corpus and their statistics can often be estimated offline.
+* Default value = -1 (no effective threshold)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport)
+{
+ this->highestFreqThresholdForReport = highestFreqThresholdForReport;
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter shortestUnitToReport is set so that short n-grams can be skipped to speed up the process
+* Default value = 1 (no effective constraint)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport)
+{
+ this->shortestUnitToReport = shortestUnitToReport;
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter longestUnitToReport is set to skip long n-gram matches
+*
+* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport)
+{
+ this->longestUnitToReport = longestUnitToReport;
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter reportMaxOccurrenceOfOneNgram is set to output information of only the "first" few occurrences of the matched n-gram
+* Since the order is based on the order of the corresponding suffices in the corpus,
+* the output occurrences are usually not the first few occurrences of the n-gram in the corpus
+**/
+void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram)
+{
+ this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram;
+}
+
+
+
+/**
+* Load the indexed corpus, suffix array, offset and vocabulary into memory
+* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the sentenceId/offset in sentence for the matched n-gram
+* then noOffset needs to be set to be false (to load the offset)
+**/
+void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset)
+{
+
+ this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket
+
+ if(! this->noOffset){
+ TextLenType lastSentId;
+ unsigned char tmpOffset;
+ this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset);
+ this->totalSentNum = lastSentId;
+ }
+ else{
+ //we do not have offset information, simply travel to the sentence head
+ TextLenType pos = this->corpusSize-3;
+ while(this->corpus_list[pos]<this->sentIdStart){ //still actual words
+ pos--;
+ }
+ //at this position, it should be the <sentId> for the last sentence
+ this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1;
+ }
+ cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n";
+
+}
+
+
+///return 0 if w = text
+///return 1 if w < text
+///return 2 if w > text
+///given that the prefix of lcp words are the same
+char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText)
+{
+
+ IndexType vocInText = this->corpus_list[posInText+lcp];
+
+ if(vocInWord == vocInText){
+ return 0;
+ }
+
+ if(vocInWord < vocInText){
+ return 1;
+ }
+
+ return 2;
+}
+
+/** Utility function
+* Convert an input sentence as char string into a vector of C_String objects
+**/
+vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText)
+{
+ vector<C_String> sentAsStringVector;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVector.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+			if(pos>=MAX_TOKEN_LEN){	//we can't handle tokens this long; bail out below
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVector.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVector;
+
+}
+
+/**
+* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector)
+{
+ if(this->noVocabulary){
+ cerr<<"Vocabulary not available!\n";
+ exit(-1);
+ }
+
+ vector<IndexType> sentAsVocIdVector;
+
+ for(int i=0;i<sentAsStringVector.size();i++){
+ sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i]));
+ }
+ return sentAsVocIdVector;
+}
+
+
+/**
+* Utility function:
+* Convert a sentence as character string to a vector of vocIDs
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText)
+{
+ vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText);
+ return this->convertCStringVectorToVocIdVector(sentAsCStringVector);
+}
+
+
+/**
+* If know the range where the phrase is, search in this range for it
+* position here are all positions in SA, not the positions in the textstring
+*
+* LCP indicates that all the suffixes in the range has the same prefix with LCP length with the proposed n-gram phrase
+* only need to compare the "nextWord" at LCP+1 position
+*
+* return true if such phrase can be found inside the range, false if not
+**/
+bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos)
+{
+ TextLenType leftPos, rightPos, middlePos;
+
+ //in case the phrase to be searched is beyond the bucket although the first LCP word is the same as this bucket
+ //e.g. range correspondes to [ab, ad], but we are searching for (aa)
+ //so first step is to make sure the lcp+next word is still in this range
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){
+ //phrase+next word < text corresponding rangeStart, we could not find it inside this range
+ return false;
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){
+ //phrase+next word > text corresponding to rangeEnd
+ return false;
+ }
+
+ //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd])
+
+
+ //search for left bound ( the pos in text which is the min(text>=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop
+
+ middlePos = (TextLenType)((leftPos + rightPos) / 2);
+ if(((leftPos + rightPos) % 2) != 0){
+ middlePos++; //bias towards right
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){
+ // phrase <= middlePos in Text, go left
+ rightPos = middlePos;
+ }
+ else{
+ leftPos = middlePos; //word > middle, go right
+ }
+
+ }
+ //in previous implementation, we can gurantee that Left<w, because we take rangeStartPos-- from original range
+ //here we can only guarantee that Left<=w, so need to check if Left==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){
+ resultStartPos = leftPos;
+ }
+ else{
+ resultStartPos = rightPos;
+ }
+
+ //search for right bound ( the value which is the max(text<=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //stop when right = left + 1
+ middlePos = (TextLenType) ((leftPos + rightPos) / 2 ); //bias towards left
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right
+ leftPos = middlePos;
+ }
+ else{
+ rightPos = middlePos; // ==1, phrase < middlePos
+ }
+ }
+ //in previous implementation, we can gurantee that w<Right, because we take rangeEndPos++ from original range
+ //here we can only guarantee that w<=Right, so need to check if Right==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){
+ resultEndPos = rightPos;
+ }
+ else{
+ resultEndPos = leftPos;
+ }
+
+ if(resultEndPos>=resultStartPos){
+ return true;
+ }
+
+ return false; //could not find this phrase
+}
+
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sentText);
+ sentLen = sentInVocId.size();
+
+ return this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+}
+
+
+///constructing the n-gram search table
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+///
+///faster than constructNgramSearchTable4Sent because the suffixes in the range given by n-1 gram can
+///guaranteed to have the first n-1 words to be the same as the n-1 gram
+///only needs to compare the following one word
+///
+/// for a sentence as:w1, w2,....
+/// cell [i,j] in the table is for n-gram from w_(j-1)...w_(j+i-1), that is a
+/// (i+1)-gram starting at position j+1 in sentence
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+ S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement));
+
+ //for consistency, initialize all cells
+ for(int c=0;c<(sentLen*sentLen);c++){
+ table[c].found = false;
+ table[c].startPosInSA = 0;
+ table[c].endingPosInSA = 0;
+ }
+
+ TextLenType startPos, endPos;
+
+ //initialize word level elements
+ for(int i=0;i<sentLen;i++){
+ IndexType vocId = sentInVocId[i];
+ //cout<<vocId<<" ";
+ if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId ==0 means this word is OOV <unk>, if vocId>=sentIdStart means for this corpus, we don't know this word
+ table[i].found = false;
+ }
+ else{
+ table[i].startPosInSA = this->level1Buckets[vocId].first;
+ table[i].endingPosInSA = this->level1Buckets[vocId].last;
+
+ if(table[i].startPosInSA<=table[i].endingPosInSA){
+ table[i].found = true;
+ }
+ else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc
+ table[i].found = false;
+ }
+ }
+ }
+
+
+ //filling in the cells in the table row by row
+ //basically this means we start by looking for smaller units first
+ //if they are found, search for longer n-grams
+ for(int n=1;n<sentLen;n++){ //finding n+1 gram. when n=sentLen-1, we are search for the occurrence of the whole sent
+ int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension
+ int levelN_0 = n * sentLen;
+ for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram
+ //necessary conditions that this n+1 gram exist are:
+ //the two sub n-gram all exist in the corpus
+ if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){
+ IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram
+
+ //n+1 gram has to be in the range of the n-gram in SA
+ startPos = table[levelN_1_0 + j].startPosInSA;
+ endPos = table[levelN_1_0 + j].endingPosInSA;
+
+ TextLenType foundPosStart = 0;
+ TextLenType foundPosEnd = 0;
+
+ //the prefix of n words of all suffixes between [startPos, endPos] is the same as the
+ //prefix of the n words in the proposed n+1 gram, no need to compare
+ //only need to compare the n+1 word, which is "nextWord" here
+ if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){
+ table[levelN_0 + j].found = true;
+ table[levelN_0 + j].startPosInSA = foundPosStart;
+ table[levelN_0 + j].endingPosInSA = foundPosEnd;
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+ }
+ }
+ return table;
+}
+
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sent);
+ this->displayNgramMatchingFreq4Sent(sentInVocId);
+}
+
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ int i,j;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ //show sentence
+ cout<<"\t";
+ for(i=0;i<sentLen;i++){
+ cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t";
+ }
+ cout<<endl;
+
+ //show frequency of each n-gram
+ i=0;
+ bool stillMatch = true;
+ while(stillMatch &&( i<sentLen)){
+ cout<<i+1<<"\t";
+ int startForRow = i*sentLen;
+ bool anyGood = false;
+ for(j=0;j<= (sentLen - 1 - i); j++){
+ if(table[startForRow+j].found){
+ //this is for regular case
+ if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //more than one occurrence
+ cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1;
+ anyGood = true;
+ }
+ else{
+ cout<<"0";
+ }
+
+ }
+ else{
+ cout<<"0";
+ }
+ cout<<"\t";
+ }
+
+ stillMatch = anyGood;
+ cout<<endl;
+ i++;
+ }
+
+ free(table);
+}
+
+///given the pos of a word in corpus, return its offset in the sentence
+///and the sentence ID
+///offset has to be loaded
+///we do not check it here for efficicency purposes
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset)
+{
+ offset = this->offset_list[pos];
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen)
+{
+ offset = this->offset_list[pos];
+ sentLen = this->offset_list[pos-offset];
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs)
+{
+ if(srcSentAsVocIDs.size()>255){
+ cerr<<"Sorry, I prefer to handle sentences with less than 255 words. Please cut the sentence short and try it again.\n";
+ exit(0);
+ }
+
+ unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs);
+
+ //Now, we know all the n-grams we are looking for
+ //output the results
+ vector<S_phraseLocationElement> allFoundNgrams;
+ S_phraseLocationElement tmpNode;
+
+ int longestUnitToReportForThisSent = sentLen;
+ if(this->longestUnitToReport!=-1){
+ //and if longestUnitToReport is shorter than sentLen
+ if(this->longestUnitToReport<sentLen){
+ longestUnitToReportForThisSent = this->longestUnitToReport;
+ }
+ }
+
+ for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){
+ int firstPosInRow = r*sentLen;
+ for(unsigned char c=0; c<= (sentLen - 1 - r); c++){
+ if(table[firstPosInRow + c].found){ //at this position the ngram was found
+ tmpNode.posStartInSrcSent = c + 1; //position starts from 1
+ tmpNode.posEndInSrcSent = r + c + 1;
+
+ //now for all ocurrences, find their sentId and realative positions
+ TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA;
+ TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA;
+
+ if( (this->highestFreqThresholdForReport <= 0) || //no limit
+ ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport ))
+ ){
+ // we don't want to retrieve high-freq n-gram which is very time consuming
+ //and meaningless for translation, such as 1M occurrences of "of the" in the corpus
+
+
+ if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){
+ //and for each n-gram, report only a limited amount of occurrences
+ endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1;
+ }
+
+ TextLenType sentId;
+ unsigned char posInSent;
+ for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; iterator++ ){
+ this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent);
+ tmpNode.sentIdInCorpus = sentId;
+ tmpNode.posInSentInCorpus = posInSent;
+
+ allFoundNgrams.push_back(tmpNode);
+ }
+ }
+ }
+
+ }
+ }
+
+ free(table);
+
+ return allFoundNgrams;
+}
+
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIDs
+ vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent);
+
+ return this->findPhrasesInASentence(srcSentAsVocIDs);
+}
+
+
+bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd)
+{
+ int phraseLen = phrase.size();
+
+ //first check if there are any <unk> in the phrase
+ for(int i=0;i<phrase.size();i++){
+ if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){
+ return false; //return empty matching result
+ }
+ }
+
+ TextLenType currentRangeStart, currentRangeEnd;
+ TextLenType narrowedRangeStart, narrowedRangeEnd;
+ IndexType vocId;
+
+ //for word 1
+ vocId = phrase[0];
+ currentRangeStart = this->level1Buckets[vocId].first;
+ currentRangeEnd = this->level1Buckets[vocId].last;
+
+ if(currentRangeStart>currentRangeEnd){
+ return false; //even this 1-gram does not exist
+ }
+
+ int posInPhrase = 1;
+ while( posInPhrase<phraseLen ){
+ vocId = phrase[posInPhrase];
+ bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd);
+
+ if(! stillExist){
+ return false;
+ }
+
+ currentRangeStart = narrowedRangeStart;
+ currentRangeEnd = narrowedRangeEnd;
+
+ posInPhrase++;
+ }
+
+ //we find the range of matching phrase, now get the sentId
+ rangeStart = currentRangeStart;
+ rangeEnd = currentRangeEnd;
+
+ return true;
+}
+
+///similar to construct the freq table
+///but only search for the exact phrase matching
+///Important: because locateSentIdFromPos is called which requires the offset information
+///Suffix array has to be initialized with offset loaded
+///i.e. initialized with loadData_forSearch(corpusName, bool noVoc, noOffset=false)
+///otherwise the program will have segmentation fault
+///SALM does not check if offset has been loaded already for efficiency reasons because locateSendIdFromPos() is called frequently
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase)
+{
+ vector<S_SimplePhraseLocationElement> matchingResult;
+
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ //we find some match
+ S_SimplePhraseLocationElement tmpNode;
+ for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){
+ this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus);
+ matchingResult.push_back(tmpNode);
+ }
+ }
+
+ return matchingResult;
+}
+
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->locateExactPhraseInCorpus(phraseAsVocIDs);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatch(phraseAsVocIDs);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ sentLen = phrase.size();
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ startPosInSA = rangeStart;
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber()
+{
+ return this->totalSentNum;
+}
+
+///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n>
+///startingPosInSrcSent starts at 0, n is the n-gram length
+void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
+{
+ n = index / sentLen + 1;
+ posInSrcSent = index % sentLen;
+}
+
+///given the starting position in src sentence and the length of the n-gram
+///calculate the index in the table
+///posInSent starts at 0, n is the actual len of n-gram, starts at 1
+unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
+{
+ unsigned int indexInTable = (n-1)*sentLen + posInSent;
+
+ return indexInTable;
+}
+
+///simple return how many n-grams are matched
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->numberOfMatcedNgram(sentInVocId);
+}
+
+///simply return how many n-grams are matched
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ unsigned int totalMatched = 0;
+
+ for(unsigned int i=0;i<(sentLen*sentLen);i++){
+ if(table[i].found){
+ totalMatched++;
+ }
+ }
+
+ free(table);
+ return totalMatched;
+}
+
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen);
+}
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen)
+{
+ sentLen = sentInVocId.size();
+ map<int, pair<int, unsigned long> > nGramMatched;
+ map<int, pair<int, unsigned long> >::iterator iterNGramMatched;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ for(int n = 1; n <= sentLen; n++){
+ for(int startPos=0; startPos <= (sentLen - n); startPos++){
+ int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen);
+
+ if(table[indexInTable].found){
+
+ unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1;
+ iterNGramMatched = nGramMatched.find(n);
+ if(iterNGramMatched==nGramMatched.end()){//has not seen this before
+ nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) ));
+ }
+ else{
+ iterNGramMatched->second.first++;
+ iterNGramMatched->second.second+=freqInTraining;
+ }
+ }
+ }
+ }
+
+ free(table);
+
+ return nGramMatched;
+}
+
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~
new file mode 100755
index 0000000..94d272c
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~
@@ -0,0 +1,753 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+///Default-construct with all reporting limits disabled (-1 = no limit) and
+///all optional data (level-1 buckets, offset) scheduled to be loaded.
+C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase()
+{
+
+ this->reportMaxOccurrenceOfOneNgram = -1; //-1: report every occurrence
+ this->highestFreqThresholdForReport = -1; //-1: no frequency cut-off
+ this->shortestUnitToReport = 1;
+ this->longestUnitToReport = -1; //no constraint
+
+ this->level1Buckets = NULL;
+ this->noLevel1Bucket = false; //by default, build level1 bucket
+
+ this->noOffset = false; //by default, load offset
+}
+
+///Empty destructor: loaded data is released by the base class destructor.
+C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase()
+{
+
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter highestFreqThresholdForReport is set so that very high frequent n-grams such as unigram "the" is skipped
+* high frequent n-grams occur too often in the corpus and their statistics can often be estimated offline.
+* Default value = -1 (no effective threshold)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport)
+{
+ this->highestFreqThresholdForReport = highestFreqThresholdForReport; //values <= 0 disable the threshold
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter shortestUnitToReport is set so that short n-grams can be skipped to speed up the process
+* Default value = 1 (no effective constraint)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport)
+{
+ this->shortestUnitToReport = shortestUnitToReport; //minimum n-gram length included in reports
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter longestUnitToReport is set to skip long n-gram matches
+*
+* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport)
+{
+ this->longestUnitToReport = longestUnitToReport; //-1 means report up to full sentence length
+}
+
+/**
+* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence
+* parameter reportMaxOccurrenceOfOneNgram is set to output information of only the "first" few occurrences of the matched n-gram
+* Since the order is based on the order of the corresponding suffices in the corpus,
+* the output occurrences are usually not the first few occurrences of the n-gram in the corpus
+**/
+void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram)
+{
+ this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram; //values <= 0 disable the cap
+}
+
+
+
+/**
+* Load the indexed corpus, suffix array, offset and vocabulary into memory
+* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the sentenceId/offset in sentence for the matched n-gram
+* then noOffset needs to be set to be false (to load the offset)
+* After loading, totalSentNum is initialized with the number of sentences in the corpus.
+**/
+void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset)
+{
+
+ this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket
+
+ if(! this->noOffset){
+ TextLenType lastSentId;
+ unsigned char tmpOffset;
+ //NOTE(review): corpusSize - 3 presumably skips trailing end-of-corpus
+ //sentinel tokens so we land inside the last sentence -- confirm against the indexer
+ this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset);
+ this->totalSentNum = lastSentId;
+ }
+ else{
+ //we do not have offset information, simply travel to the sentence head
+ TextLenType pos = this->corpusSize-3;
+ while(this->corpus_list[pos]<this->sentIdStart){ //still actual words
+ pos--;
+ }
+ //at this position, it should be the <sentId> for the last sentence
+ this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1;
+ }
+ cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n";
+
+}
+
+
+///Compare one proposed word against the corpus text at a given suffix position,
+///assuming the first lcp words already match (so only position posInText+lcp is inspected).
+///return 0 if w = text
+///return 1 if w < text
+///return 2 if w > text
+///given that the prefix of lcp words are the same
+char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText)
+{
+
+ IndexType vocInText = this->corpus_list[posInText+lcp];
+
+ if(vocInWord == vocInText){
+ return 0;
+ }
+
+ if(vocInWord < vocInText){
+ return 1;
+ }
+
+ return 2;
+}
+
+/** Utility function
+* Convert an input sentence as char string into a vector of C_String objects
+* Tokens are separated by spaces or tabs; empty tokens are skipped.
+* Exits the program if any single token is MAX_TOKEN_LEN characters or longer.
+**/
+vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText)
+{
+ vector<C_String> sentAsStringVector;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVector.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+ if(pos>=MAX_TOKEN_LEN){ //token too long: we can NOT handle it, abort
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ //flush the final token (input may not end with a delimiter)
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVector.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVector;
+
+}
+
+/**
+* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs
+* Requires the vocabulary to be loaded; exits otherwise.
+* Words unknown to the vocabulary are mapped by C_Vocabulary::returnId
+* (unknown handling is defined there, not here).
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector)
+{
+ if(this->noVocabulary){
+ cerr<<"Vocabulary not available!\n";
+ exit(-1);
+ }
+
+ vector<IndexType> sentAsVocIdVector;
+
+ for(int i=0;i<sentAsStringVector.size();i++){
+ sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i]));
+ }
+ return sentAsVocIdVector;
+}
+
+
+/**
+* Utility function:
+* Convert a sentence as character string to a vector of vocIDs
+* (tokenize first, then map each token through the vocabulary)
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText)
+{
+ vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText);
+ return this->convertCStringVectorToVocIdVector(sentAsCStringVector);
+}
+
+
+/**
+* If know the range where the phrase is, search in this range for it
+* position here are all positions in SA, not the positions in the textstring
+*
+* LCP indicates that all the suffixes in the range has the same prefix with LCP length with the proposed n-gram phrase
+* only need to compare the "nextWord" at LCP+1 position
+*
+* Two binary searches are performed: one for the left boundary (first suffix
+* whose word at offset lcp equals nextWord) and one for the right boundary.
+* On success the matching SA range is written to resultStartPos/resultEndPos.
+*
+* return true if such phrase can be found inside the range, false if not
+**/
+bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos)
+{
+ TextLenType leftPos, rightPos, middlePos;
+
+ //in case the phrase to be searched is beyond the bucket although the first LCP word is the same as this bucket
+ //e.g. range correspondes to [ab, ad], but we are searching for (aa)
+ //so first step is to make sure the lcp+next word is still in this range
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){
+ //phrase+next word < text corresponding rangeStart, we could not find it inside this range
+ return false;
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){
+ //phrase+next word > text corresponding to rangeEnd
+ return false;
+ }
+
+ //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd])
+
+
+ //search for left bound ( the pos in text which is the min(text>=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop
+
+ middlePos = (TextLenType)((leftPos + rightPos) / 2);
+ if(((leftPos + rightPos) % 2) != 0){
+ middlePos++; //bias towards right
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){
+ // phrase <= middlePos in Text, go left
+ rightPos = middlePos;
+ }
+ else{
+ leftPos = middlePos; //word > middle, go right
+ }
+
+ }
+ //in previous implementation, we can gurantee that Left<w, because we take rangeStartPos-- from original range
+ //here we can only guarantee that Left<=w, so need to check if Left==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){
+ resultStartPos = leftPos;
+ }
+ else{
+ resultStartPos = rightPos;
+ }
+
+ //search for right bound ( the value which is the max(text<=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //stop when right = left + 1
+ middlePos = (TextLenType) ((leftPos + rightPos) / 2 ); //bias towards left
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right
+ leftPos = middlePos;
+ }
+ else{
+ rightPos = middlePos; // ==1, phrase < middlePos
+ }
+ }
+ //in previous implementation, we can gurantee that w<Right, because we take rangeEndPos++ from original range
+ //here we can only guarantee that w<=Right, so need to check if Right==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){
+ resultEndPos = rightPos;
+ }
+ else{
+ resultEndPos = leftPos;
+ }
+
+ //an empty (inverted) range means the word never occurs at offset lcp
+ if(resultEndPos>=resultStartPos){
+ return true;
+ }
+
+ return false; //could not find this phrase
+}
+
+///Convenience overload: tokenize sentText and build the n-gram search table.
+///sentLen is an output parameter set to the number of tokens.
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sentText);
+ sentLen = sentInVocId.size();
+
+ return this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+}
+
+
+///constructing the n-gram search table
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+///
+///faster than constructNgramSearchTable4Sent because the suffixes in the range given by n-1 gram can
+///guaranteed to have the first n-1 words to be the same as the n-1 gram
+///only needs to compare the following one word
+///
+/// for a sentence as:w1, w2,....
+/// cell [i,j] in the table is for n-gram from w_(j-1)...w_(j+i-1), that is a
+/// (i+1)-gram starting at position j+1 in sentence
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+ S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement));
+
+ //for consistency, initialize all cells
+ for(int c=0;c<(sentLen*sentLen);c++){
+ table[c].found = false;
+ table[c].startPosInSA = 0;
+ table[c].endingPosInSA = 0;
+ }
+
+ TextLenType startPos, endPos;
+
+ //initialize word level elements
+ for(int i=0;i<sentLen;i++){
+ IndexType vocId = sentInVocId[i];
+ //cout<<vocId<<" ";
+ if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId ==0 means this word is OOV <unk>, if vocId>=sentIdStart means for this corpus, we don't know this word
+ table[i].found = false;
+ }
+ else{
+ table[i].startPosInSA = this->level1Buckets[vocId].first;
+ table[i].endingPosInSA = this->level1Buckets[vocId].last;
+
+ if(table[i].startPosInSA<=table[i].endingPosInSA){
+ table[i].found = true;
+ }
+ else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc
+ table[i].found = false;
+ }
+ }
+ }
+
+
+ //filling in the cells in the table row by row
+ //basically this means we start by looking for smaller units first
+ //if they are found, search for longer n-grams
+ for(int n=1;n<sentLen;n++){ //finding n+1 gram. when n=sentLen-1, we are search for the occurrence of the whole sent
+ int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension
+ int levelN_0 = n * sentLen;
+ for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram
+ //necessary conditions that this n+1 gram exist are:
+ //the two sub n-gram all exist in the corpus
+ if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){
+ IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram
+
+ //n+1 gram has to be in the range of the n-gram in SA
+ startPos = table[levelN_1_0 + j].startPosInSA;
+ endPos = table[levelN_1_0 + j].endingPosInSA;
+
+ TextLenType foundPosStart = 0;
+ TextLenType foundPosEnd = 0;
+
+ //the prefix of n words of all suffixes between [startPos, endPos] is the same as the
+ //prefix of the n words in the proposed n+1 gram, no need to compare
+ //only need to compare the n+1 word, which is "nextWord" here
+ if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){
+ table[levelN_0 + j].found = true;
+ table[levelN_0 + j].startPosInSA = foundPosStart;
+ table[levelN_0 + j].endingPosInSA = foundPosEnd;
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+ }
+ }
+ return table;
+}
+
+///Convenience overload: tokenize the sentence and print the n-gram frequency grid.
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sent);
+ this->displayNgramMatchingFreq4Sent(sentInVocId);
+}
+
+///Print, to stdout, one row per n-gram length: the corpus frequency of every
+///n-gram starting at each position (0 when unmatched). Stops at the first
+///length where nothing matched, since longer n-grams then cannot match either.
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ int i,j;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ //show sentence
+ cout<<"\t";
+ for(i=0;i<sentLen;i++){
+ cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t";
+ }
+ cout<<endl;
+
+ //show frequency of each n-gram
+ i=0;
+ bool stillMatch = true;
+ while(stillMatch &&( i<sentLen)){
+ cout<<i+1<<"\t";
+ int startForRow = i*sentLen;
+ bool anyGood = false;
+ for(j=0;j<= (sentLen - 1 - i); j++){
+ if(table[startForRow+j].found){
+ //this is for regular case
+ if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //at least one occurrence
+ cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1;
+ anyGood = true;
+ }
+ else{
+ cout<<"0";
+ }
+
+ }
+ else{
+ cout<<"0";
+ }
+ cout<<"\t";
+ }
+
+ stillMatch = anyGood;
+ cout<<endl;
+ i++;
+ }
+
+ //table was malloc'ed by constructNgramSearchTable4SentWithLCP
+ free(table);
+}
+
+///given the pos of a word in corpus, return its offset in the sentence
+///and the sentence ID
+///offset has to be loaded
+///we do not check it here for efficicency purposes
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset)
+{
+ offset = this->offset_list[pos];
+ //pos-offset is the sentence-head marker; its corpus value encodes the sentId
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+///Same as above, but additionally returns the sentence length, which is
+///stored in the offset list at the sentence-head position.
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen)
+{
+ offset = this->offset_list[pos];
+ sentLen = this->offset_list[pos-offset];
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+///Find every embedded n-gram of the sentence that occurs in the corpus and
+///report its location(s) as <posStartInSrcSent, posEndInSrcSent, sentIdInCorpus,
+///posInSentInCorpus> tuples. Reporting is filtered by the setParam_* limits
+///(shortest/longest unit, frequency threshold, max occurrences per n-gram).
+///Requires offset information to be loaded (see loadData_forSearch).
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs)
+{
+ //positions are reported as unsigned char, hence the 255-word limit
+ if(srcSentAsVocIDs.size()>255){
+ cerr<<"Sorry, I prefer to handle sentences with less than 255 words. Please cut the sentence short and try it again.\n";
+ exit(0);
+ }
+
+ unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs);
+
+ //Now, we know all the n-grams we are looking for
+ //output the results
+ vector<S_phraseLocationElement> allFoundNgrams;
+ S_phraseLocationElement tmpNode;
+
+ int longestUnitToReportForThisSent = sentLen;
+ if(this->longestUnitToReport!=-1){
+ //and if longestUnitToReport is shorter than sentLen
+ if(this->longestUnitToReport<sentLen){
+ longestUnitToReportForThisSent = this->longestUnitToReport;
+ }
+ }
+
+ //row r of the table holds the (r+1)-grams; c is the 0-based start position
+ for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){
+ int firstPosInRow = r*sentLen;
+ for(unsigned char c=0; c<= (sentLen - 1 - r); c++){
+ if(table[firstPosInRow + c].found){ //at this position the ngram was found
+ tmpNode.posStartInSrcSent = c + 1; //position starts from 1
+ tmpNode.posEndInSrcSent = r + c + 1;
+
+ //now for all ocurrences, find their sentId and realative positions
+ TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA;
+ TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA;
+
+ //NOTE(review): endPosInSA - startPosInSA is frequency-1, so the
+ //threshold admits n-grams with frequency up to highestFreqThresholdForReport -- confirm intended
+ if( (this->highestFreqThresholdForReport <= 0) || //no limit
+ ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport ))
+ ){
+ // we don't want to retrieve high-freq n-gram which is very time consuming
+ //and meaningless for translation, such as 1M occurrences of "of the" in the corpus
+
+
+ if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){
+ //and for each n-gram, report only a limited amount of occurrences
+ endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1;
+ }
+
+ TextLenType sentId;
+ unsigned char posInSent;
+ for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; iterator++ ){
+ this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent);
+ tmpNode.sentIdInCorpus = sentId;
+ tmpNode.posInSentInCorpus = posInSent;
+
+ allFoundNgrams.push_back(tmpNode);
+ }
+ }
+ }
+
+ }
+ }
+
+ free(table);
+
+ return allFoundNgrams;
+}
+
+///Convenience overload: tokenize srcSent first, then search.
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIDs
+ vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent);
+
+ return this->findPhrasesInASentence(srcSentAsVocIDs);
+}
+
+
+///Find the suffix-array range [rangeStart, rangeEnd] whose suffixes all start
+///with the given phrase. Returns false if the phrase contains unknown words
+///or does not occur in the corpus; true (with the range set) otherwise.
+bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd)
+{
+ int phraseLen = phrase.size();
+
+ //first check if there are any <unk> in the phrase
+ for(int i=0;i<phrase.size();i++){
+ if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){
+ return false; //return empty matching result
+ }
+ }
+
+ TextLenType currentRangeStart, currentRangeEnd;
+ TextLenType narrowedRangeStart, narrowedRangeEnd;
+ IndexType vocId;
+
+ //for word 1
+ vocId = phrase[0];
+ currentRangeStart = this->level1Buckets[vocId].first;
+ currentRangeEnd = this->level1Buckets[vocId].last;
+
+ if(currentRangeStart>currentRangeEnd){
+ return false; //even this 1-gram does not exist
+ }
+
+ //extend one word at a time; posInPhrase doubles as the LCP length,
+ //since all suffixes in the current range share the first posInPhrase words
+ int posInPhrase = 1;
+ while( posInPhrase<phraseLen ){
+ vocId = phrase[posInPhrase];
+ bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd);
+
+ if(! stillExist){
+ return false;
+ }
+
+ currentRangeStart = narrowedRangeStart;
+ currentRangeEnd = narrowedRangeEnd;
+
+ posInPhrase++;
+ }
+
+ //we find the range of matching phrase, now get the sentId
+ rangeStart = currentRangeStart;
+ rangeEnd = currentRangeEnd;
+
+ return true;
+}
+
+///similar to construct the freq table
+///but only search for the exact phrase matching
+///Important: because locateSentIdFromPos is called which requires the offset information
+///Suffix array has to be initialized with offset loaded
+///i.e. initilized with loadData_forSearch(corpusName, bool noVoc, noOffset=fase)
+///otherwise the program will have segmentation fault
+///SALM does not check if offset has been loaded already for efficiency reasons because locateSendIdFromPos() is called frequently
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase)
+{
+ vector<S_SimplePhraseLocationElement> matchingResult;
+
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ //we find some match
+ //every SA position in the range is one occurrence of the phrase
+ S_SimplePhraseLocationElement tmpNode;
+ for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){
+ this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus);
+ matchingResult.push_back(tmpNode);
+ }
+ }
+
+ return matchingResult;
+}
+
+///Convenience overload: tokenize the phrase first, then locate its occurrences.
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->locateExactPhraseInCorpus(phraseAsVocIDs);
+}
+
+
+///Return the corpus frequency of the exact phrase (0 when it does not occur).
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ //size of the SA range = number of occurrences
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+///Convenience overload: tokenize the phrase first.
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatch(phraseAsVocIDs);
+}
+
+
+///Like freqOfExactPhraseMatch, but also outputs the start of the matching
+///range in the suffix array (only valid when the return value is non-zero)
+///and the phrase length in tokens.
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ sentLen = phrase.size();
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ startPosInSA = rangeStart;
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+///Convenience overload: tokenize the phrase first.
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen);
+}
+
+
+///Accessor: number of sentences in the loaded corpus (set by loadData_forSearch).
+TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber()
+{
+ return this->totalSentNum;
+}
+
+///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n>
+///startingPosInSrcSent starts at 0, n is the n-gram length
+void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
+{
+ n = index / sentLen + 1;
+ posInSrcSent = index % sentLen;
+}
+
+///given the starting position in src sentence and the length of the n-gram
+///calculate the index in the table
+///posInSent starts at 0, n is the actual len of n-gram, starts at 1
+///(inverse of oneDimensionTableIndexToTwoDimension)
+unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
+{
+ unsigned int indexInTable = (n-1)*sentLen + posInSent;
+
+ return indexInTable;
+}
+
+///simple return how many n-grams are matched
+///(note: "Matced" typo is part of the public interface and must stay)
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->numberOfMatcedNgram(sentInVocId);
+}
+
+///simply return how many n-grams are matched
+///Counts every cell of the sentLen*sentLen search table whose n-gram was
+///found in the corpus; unused cells were initialized to not-found, so
+///scanning the full table is safe.
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ unsigned int totalMatched = 0;
+
+ for(unsigned int i=0;i<(sentLen*sentLen);i++){
+ if(table[i].found){
+ totalMatched++;
+ }
+ }
+
+ //table was malloc'ed by constructNgramSearchTable4SentWithLCP
+ free(table);
+ return totalMatched;
+}
+
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen);
+}
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen)
+{
+ sentLen = sentInVocId.size();
+ map<int, pair<int, unsigned long> > nGramMatched;
+ map<int, pair<int, unsigned long> >::iterator iterNGramMatched;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ for(int n = 1; n <= sentLen; n++){
+ for(int startPos=0; startPos <= (sentLen - n); startPos++){
+ int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen);
+
+ if(table[indexInTable].found){
+
+ unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1;
+ iterNGramMatched = nGramMatched.find(n);
+ if(iterNGramMatched==nGramMatched.end()){//has not seen this before
+ nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) ));
+ }
+ else{
+ iterNGramMatched->second.first++;
+ iterNGramMatched->second.second+=freqInTraining;
+ }
+ }
+ }
+ }
+
+ free(table);
+
+ return nGramMatched;
+}
+
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h
new file mode 100755
index 0000000..2c0070d
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h
@@ -0,0 +1,127 @@
+#if !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)
+#define __SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_
+
+#include "_SuffixArrayApplicationBase.h"
+/**
+* \ingroup search
+* Used by locateExactPhraseInCorpus() to return the location of an matched n-gram in the corpus
+* as a pair of <sentenceId, offset pos in sentence>
+**/
+typedef struct simplePhraseLocationElement
+{
+ TextLenType sentIdInCorpus;
+ unsigned char posInSentInCorpus;
+}S_SimplePhraseLocationElement;
+
+/**
+* \ingroup search
+* Used by findPhraseInASentence() to return the location of an embedded n-gram in the corpus
+* <posStartInSrcSent, posEndInSrcSent> represents the embedded n-gram in the sentence
+* <sentIdInCorpus, posInSentInCorpus> represents the location in the corpus
+**/
+typedef struct phraseLocationElement
+{
+ unsigned char posStartInSrcSent;
+ unsigned char posEndInSrcSent;
+ TextLenType sentIdInCorpus;
+ unsigned char posInSentInCorpus;
+}S_phraseLocationElement;
+
+/**
+* \ingroup search
+* Like S_phraseLocationElement, but additionally carries the matched sentence text.
+**/
+typedef struct phraseLocationWithSrcSentElement
+{
+ int srcPosStart;
+ int srcPosEnd;
+ TextLenType sentId;
+ TextLenType posInSent;
+ vector<C_String> sentence;
+}S_phraseLocationWithSrcSentElement;
+
+/**
+* \ingroup search
+* One cell of the per-sentence n-gram search table: whether the n-gram was
+* found, and if so its [startPosInSA, endingPosInSA] range in the suffix array.
+**/
+typedef struct sentSearchTableElement
+{
+ bool found;
+ TextLenType startPosInSA;
+ TextLenType endingPosInSA;
+}S_sentSearchTableElement;
+
+
+/**
+* \ingroup search
+* Base class for suffix array search applications
+* Provides functions to search n-grams in the corpus
+* Including the frequency of the n-gram and the actual location (sentenceID+offset in sentence)
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_SuffixArraySearchApplicationBase : public C_SuffixArrayApplicationBase
+{
+public:
+ //load corpus/suffix array/vocabulary/offset; offset is required for sentId lookups
+ void loadData_forSearch(const char * filename, bool noVoc, bool noOffset);
+
+ //count of embedded n-grams of the sentence found in the corpus ("Matced" typo is part of the API)
+ unsigned int numberOfMatcedNgram(const char * srcSent);
+ unsigned int numberOfMatcedNgram(vector<IndexType> & sentInVocId);
+
+ //corpus frequency of an exact phrase (0 when absent)
+ TextLenType freqOfExactPhraseMatch(const char * phrase);
+ TextLenType freqOfExactPhraseMatch(vector<IndexType> & phrase);
+
+ //frequency plus the start of the matching SA range and the phrase length
+ TextLenType freqOfExactPhraseMatchAndFirstOccurrence(const char * phrase, TextLenType & startPosInSA, int & sentLen);
+ TextLenType freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen);
+
+ //all corpus locations of an exact phrase (requires offset loaded)
+ vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(const char * phrase);
+ vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(vector<IndexType> & phrase);
+
+ //locations of all embedded n-grams, filtered by the setParam_* limits
+ vector<S_phraseLocationElement> findPhrasesInASentence(const char * srcSent);
+ vector<S_phraseLocationElement> findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs);
+
+ //print an n-gram frequency grid for the sentence to stdout
+ void displayNgramMatchingFreq4Sent(const char *);
+ void displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId);
+
+ //per-length matching statistics: n -> (matched positions, total corpus frequency)
+ map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen);
+ map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int & sentLen);
+
+ //build the sentLen*sentLen n-gram search table; caller must free() the result
+ S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen);
+ S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId);
+
+ //reporting limits for findPhrasesInASentence (see the .cpp for semantics)
+ void setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram);
+ void setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport);
+ void setParam_longestUnitToReport(int longestUnitToReport);
+ void setParam_shortestUnitToReport(int shortestUnitToReport);
+
+ TextLenType returnTotalSentNumber();
+
+ //tokenization / vocabulary mapping utilities
+ vector<IndexType> convertStringToVocId(const char * sentText);
+ vector<C_String> convertCharStringToCStringVector(const char * sentText);
+ vector<IndexType> convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector);
+
+
+ C_SuffixArraySearchApplicationBase();
+ virtual ~C_SuffixArraySearchApplicationBase();
+
+protected:
+ //narrow the SA range word by word; false when the phrase is absent
+ bool locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd);
+
+ //binary search for nextWord at offset lcp inside a known SA range
+ bool searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType & resultStartPos, TextLenType & resultEndPos);
+ char comparePhraseWithTextWithLCP(IndexType, int, TextLenType);
+
+ //map a corpus position to <sentId, offset [, sentLen]>; needs offset loaded
+ void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset);
+ void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen);
+
+
+ //conversions between flat table index and (startPos, n-gram length)
+ unsigned int twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen);
+ void oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n);
+
+ //reporting limits; -1 (or <=0 where noted) means "no limit"
+ int reportMaxOccurrenceOfOneNgram;
+ int highestFreqThresholdForReport;
+ int longestUnitToReport;
+ int shortestUnitToReport;
+
+ //number of sentences in the loaded corpus
+ TextLenType totalSentNum;
+};
+
+#endif // !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)
diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp
new file mode 100755
index 0000000..91962fe
--- /dev/null
+++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp
@@ -0,0 +1,314 @@
+/**
+* Revision $Rev: 3815 $
+* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
+**/
+
+#include "_SuffixArrayApplicationBase.h"
+
+#include "malloc.h"
+#include "time.h"
+
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase()
+{
+ this->level1Buckets = NULL;
+ this->noVocabulary = false; //by default, still load the vocabulary
+ this->noOffset = false; //by default, load offset
+ this->noLevel1Bucket = false; //by default, construct level1 bucket
+}
+
+C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase()
+{
+ if(this->level1Buckets!=NULL){
+ free(this->level1Buckets);
+ }
+
+	//free(NULL) is harmless, so no null checks are needed for these lists
+ free(this->corpus_list);
+ free(this->suffix_list);
+
+ if(! this->noOffset){
+ free(this->offset_list);
+ }
+
+ if(! this->noVocabulary){
+ delete(this->voc);
+ }
+}
+
+/**
+* Load the indexed corpus, suffix array, vocabulary, offset into memory for follow up applications
+* It is optional to load vocabulary, offset depends on the argument.
+* In the case when the testing data shares the same vocabulary as the training data and only vocIDs are used to represent the sentence/n-grams
+* then vocabulary which maps between vocId and the word text can be skipped to save some memory.
+*
+* If the suffix array object does not need to locate the sentence id of an occurred n-gram, then offset information is not needed.
+*
+* Be very careful here, the suffix array class does not check if offset has been loaded in the search function to make it efficient
+* you need to know what the suffix array class will be used (whether offset is needed) and load it properly
+* @param fileNameStem The filename of the corpus. This should be the same filename used in IndexSA
+* @param noVoc If set to be 'true', vocabulary will not be loaded
+* @param noOffset If set to be 'true', the offset information will not be loaded. <sentId, offsetInSent> information for an n-gram's occurrences can not be calculated.
+* @param noLevel1Bucket Level1Bucket is used to speed up the search at the cost of additional memory. For applications which do not need to locate n-grams in the corpus (such as the corpus scanning application), then there is no need to create Level1Bucket
+**/
+void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket)
+{
+ long ltime1, ltime2;
+
+ this->noVocabulary = noVoc;
+ this->noOffset = noOffset;
+ this->noLevel1Bucket = noLevel1Bucket;
+
+
+ char tmpString[1000];
+
+ //the order of loading the data is important, do not change
+ if(! this->noVocabulary){
+ time( &ltime1 );
+ cerr<<"Loading Vocabulary...\n";
+ sprintf(tmpString,"%s.id_voc",fileNameStem);
+ this->loadVoc(tmpString);
+ time( &ltime2);
+ cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n";
+ }
+
+ time( &ltime1 );
+ cerr<<"Loading corpus...\n";
+ sprintf(tmpString,"%s.sa_corpus",fileNameStem);
+ this->loadCorpusAndInitMem(tmpString);
+ time( &ltime2);
+ cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n";
+
+ time( &ltime1 );
+ cerr<<"Loading suffix...\n";
+ sprintf(tmpString,"%s.sa_suffix",fileNameStem);
+ this->loadSuffix(tmpString);
+ time( &ltime2);
+ cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n";
+
+ if(! this->noOffset){
+ time( &ltime1 );
+ cerr<<"Loading offset...\n";
+ sprintf(tmpString,"%s.sa_offset",fileNameStem);
+ this->loadOffset(tmpString);
+ time( &ltime2);
+ cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n";
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadVoc(const char *filename)
+{
+ this->voc = new C_IDVocabulary(filename);
+}
+
+void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * CorpusInputFile = fopen(filename, "rb");
+
+ if(!CorpusInputFile){
+ cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n";
+ exit(0);
+ }
+
+ //first, read the size of the corpus
+ dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile);
+
+ //allocate memory for all data structure
+ this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize);
+ if(! this->corpus_list){
+ cerr<<"Can not allocate memory to load the corpus!\n";
+ exit(0);
+ }
+
+ this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize);
+ if(! this->suffix_list){
+ cerr<<"Can not allocate memory to load the suffix!\n";
+ exit(0);
+ }
+
+ if(! this->noOffset){
+ this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize);
+ if(! this->offset_list){
+ cerr<<"Can not allocate memory to load the offset!\n";
+ exit(0);
+ }
+ }
+
+ //read the corpus file
+ unsigned int totalRead = 0;
+ unsigned int remaining = this->corpusSize;
+ unsigned int oneBatchReadSize;
+ char * currentPosInCorpusList = (char *) this->corpus_list;
+ while(! feof(CorpusInputFile) && (totalRead<this->corpusSize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+
+ currentPosInCorpusList+=sizeof(IndexType)*dwRead;
+ }
+ if(totalRead!=this->corpusSize){
+ cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+ fclose(CorpusInputFile);
+
+ this->sentIdStart = this->corpus_list[0];
+ this->vocIdForSentStart = this->corpus_list[1];
+ this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1];
+ this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2];
+
+ if(! this->noLevel1Bucket){
+ //in this corpus, we will have at most sentIdStart-1 word types
+ //the index in the array correspond to the vocId, 0 is for <unk> and the last one is for <sentIdStart-1> which is the largest vocId observed in the data
+ this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart);
+
+ //initialize the level1 buckets
+ for(IndexType i=0;i<this->sentIdStart;i++){
+ this->level1Buckets[i].first = (TextLenType) -1;
+ this->level1Buckets[i].last = 0;
+ }
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadSuffix(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * SuffixInputFile = fopen(filename, "rb");
+ if(!SuffixInputFile){
+ cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl;
+ exit(0);
+ }
+
+ //first, read in the size of the suffix array
+ TextLenType suffixArraySize;
+ dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile);
+
+ if(suffixArraySize!=this->corpusSize){
+ cerr<<"Something wrong, the suffix array size is different from the corpus size.\n";
+ cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl;
+ exit(0);
+ }
+
+ //read all the suffix into memory
+ unsigned int totalRead = 0;
+ unsigned int remaining = suffixArraySize;
+ unsigned int oneBatchReadSize;
+ char * currentPosInSuffixList = (char *) this->suffix_list;
+ while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile);
+
+ totalRead+=dwRead;
+ remaining -= dwRead;
+
+ currentPosInSuffixList+=sizeof(TextLenType)*dwRead;
+ }
+ if(totalRead!=suffixArraySize){
+ cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+
+ fclose(SuffixInputFile);
+
+ if(! this->noLevel1Bucket){
+ //build level-1 bucket
+ cerr<<"Initialize level-1 buckets...\n";
+ IndexType currentVocId = 0;
+ IndexType vocId;
+ TextLenType pos;
+ TextLenType lastSaIndex = 0;
+
+ for(TextLenType i=0; i<suffixArraySize; i++){
+ pos = this->suffix_list[i];
+
+ //for level1 bucket
+ vocId = this->corpus_list[pos];
+
+ if(vocId<this->sentIdStart){ //is a meaningful word type
+ if(vocId!=currentVocId){
+ this->level1Buckets[currentVocId].last = lastSaIndex; //for first word which is <unk> this does not matter
+ this->level1Buckets[vocId].first = i;
+
+ currentVocId=vocId;
+ }
+
+ lastSaIndex = i;
+ }
+ }
+
+ //for the last word type
+ this->level1Buckets[currentVocId].last = lastSaIndex;
+ }
+ else{
+ this->level1Buckets = NULL;
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadOffset(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * OffsetInputFile = fopen(filename, "rb");
+
+ if(!OffsetInputFile){
+ cerr<<"Offset file: "<<filename<<" does not exist!"<<endl;
+ exit(0);
+ }
+
+	//first, read the length of the offset list (must match the corpus size)
+ TextLenType offsetListLen;
+ dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile);
+ if(offsetListLen!=this->corpusSize){
+ cerr<<"Text length is inconsistent with the length of the offset.\n";
+ exit(0);
+ }
+
+	//read all the offset entries into memory
+ unsigned int totalRead = 0;
+ unsigned int remaining = offsetListLen;
+ unsigned int oneBatchReadSize;
+ char * currentOffsetListPos = (char *) this->offset_list;
+ while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){
+ oneBatchReadSize = SIZE_ONE_READ;
+
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+
+ currentOffsetListPos+=sizeof(unsigned char)*dwRead;
+
+ }
+ if(totalRead!=offsetListLen){
+ cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+ fclose(OffsetInputFile);
+
+}
+
+TextLenType C_SuffixArrayApplicationBase::returnCorpusSize()
+{
+ return this->corpusSize;
+}
diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~
new file mode 100755
index 0000000..bd17287
--- /dev/null
+++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~
@@ -0,0 +1,313 @@
+/**
+* Revision $Rev: 3815 $
+* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
+**/
+
+#include "_SuffixArrayApplicationBase.h"
+
+#include "malloc.h"
+#include "time.h"
+
+#include <iostream>
+#include <fstream>
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase()
+{
+ this->level1Buckets = NULL;
+ this->noVocabulary = false; //by default, still load the vocabulary
+ this->noOffset = false; //by default, load offset
+ this->noLevel1Bucket = false; //by default, construct level1 bucket
+}
+
+C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase()
+{
+ if(this->level1Buckets!=NULL){
+ free(this->level1Buckets);
+ }
+
+ //not necessary too
+ free(this->corpus_list);
+ free(this->suffix_list);
+
+ if(! this->noOffset){
+ free(this->offset_list);
+ }
+
+ if(! this->noVocabulary){
+ delete(this->voc);
+ }
+}
+
+/**
+* Load the indexed corpus, suffix array, vocabulary, offset into memory for follow up applications
+* It is optional to load vocabulary, offset depends on the argument.
+* In the case when the testing data shares the same vocabulary as the training data and only vocIDs are used to represent the sentence/n-grams
+* then vocabulary which maps between vocId and the word text can be skipped to save some memory.
+*
+* If the suffix array object does not need to locate the sentence id of an occurred n-gram, then offset information is not needed.
+*
+* Be very careful here, the suffix array class does not check if offset has been loaded in the search function to make it efficient
+* you need to know what the suffix array class will be used (whether offset is needed) and load it properly
+* @param fileNameStem The filename of the corpus. This should be the same filename used in IndexSA
+* @param noVoc If set to be 'true', vocabulary will not be loaded
+* @param noOffset If set to be 'true', the offset information will not be loaded. <sentId, offsetInSent> information for an n-gram's occurrences can not be calculated.
+* @param noLevel1Bucket Level1Bucket is used to speed up the search at the cost of additional memory. For applications which do not need to locate n-grams in the corpus (such as the corpus scanning application), then there is no need to create Level1Bucket
+**/
+void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket)
+{
+ long ltime1, ltime2;
+
+ this->noVocabulary = noVoc;
+ this->noOffset = noOffset;
+ this->noLevel1Bucket = noLevel1Bucket;
+
+
+ char tmpString[1000];
+
+ //the order of loading the data is important, do not change
+ if(! this->noVocabulary){
+ time( &ltime1 );
+ cerr<<"Loading Vocabulary...\n";
+ sprintf(tmpString,"%s.id_voc",fileNameStem);
+ this->loadVoc(tmpString);
+ time( &ltime2);
+ cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n";
+ }
+
+ time( &ltime1 );
+ cerr<<"Loading corpus...\n";
+ sprintf(tmpString,"%s.sa_corpus",fileNameStem);
+ this->loadCorpusAndInitMem(tmpString);
+ time( &ltime2);
+ cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n";
+
+ time( &ltime1 );
+ cerr<<"Loading suffix...\n";
+ sprintf(tmpString,"%s.sa_suffix",fileNameStem);
+ this->loadSuffix(tmpString);
+ time( &ltime2);
+ cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n";
+
+ if(! this->noOffset){
+ time( &ltime1 );
+ cerr<<"Loading offset...\n";
+ sprintf(tmpString,"%s.sa_offset",fileNameStem);
+ this->loadOffset(tmpString);
+ time( &ltime2);
+ cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n";
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadVoc(const char *filename)
+{
+ this->voc = new C_IDVocabulary(filename);
+}
+
+void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * CorpusInputFile = fopen(filename, "rb");
+
+ if(!CorpusInputFile){
+ cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n";
+ exit(0);
+ }
+
+ //first, read the size of the corpus
+ dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile);
+
+ //allocate memory for all data structure
+ this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize);
+ if(! this->corpus_list){
+ cerr<<"Can not allocate memory to load the corpus!\n";
+ exit(0);
+ }
+
+ this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize);
+ if(! this->suffix_list){
+ cerr<<"Can not allocate memory to load the suffix!\n";
+ exit(0);
+ }
+
+ if(! this->noOffset){
+ this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize);
+ if(! this->offset_list){
+ cerr<<"Can not allocate memory to load the offset!\n";
+ exit(0);
+ }
+ }
+
+ //read the corpus file
+ unsigned int totalRead = 0;
+ unsigned int remaining = this->corpusSize;
+ unsigned int oneBatchReadSize;
+ char * currentPosInCorpusList = (char *) this->corpus_list;
+ while(! feof(CorpusInputFile) && (totalRead<this->corpusSize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+
+ currentPosInCorpusList+=sizeof(IndexType)*dwRead;
+ }
+ if(totalRead!=this->corpusSize){
+ cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+ fclose(CorpusInputFile);
+
+ this->sentIdStart = this->corpus_list[0];
+ this->vocIdForSentStart = this->corpus_list[1];
+ this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1];
+ this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2];
+
+ if(! this->noLevel1Bucket){
+ //in this corpus, we will have at most sentIdStart-1 word types
+ //the index in the array correspond to the vocId, 0 is for <unk> and the last one is for <sentIdStart-1> which is the largest vocId observed in the data
+ this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart);
+
+ //initialize the level1 buckets
+ for(IndexType i=0;i<this->sentIdStart;i++){
+ this->level1Buckets[i].first = (TextLenType) -1;
+ this->level1Buckets[i].last = 0;
+ }
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadSuffix(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * SuffixInputFile = fopen(filename, "rb");
+ if(!SuffixInputFile){
+ cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl;
+ exit(0);
+ }
+
+ //first, read in the size of the suffix array
+ TextLenType suffixArraySize;
+ dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile);
+
+ if(suffixArraySize!=this->corpusSize){
+ cerr<<"Something wrong, the suffix array size is different from the corpus size.\n";
+ cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl;
+ exit(0);
+ }
+
+ //read all the suffix into memory
+ unsigned int totalRead = 0;
+ unsigned int remaining = suffixArraySize;
+ unsigned int oneBatchReadSize;
+ char * currentPosInSuffixList = (char *) this->suffix_list;
+ while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile);
+
+ totalRead+=dwRead;
+ remaining -= dwRead;
+
+ currentPosInSuffixList+=sizeof(TextLenType)*dwRead;
+ }
+ if(totalRead!=suffixArraySize){
+ cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+
+ fclose(SuffixInputFile);
+
+ if(! this->noLevel1Bucket){
+ //build level-1 bucket
+ cerr<<"Initialize level-1 buckets...\n";
+ IndexType currentVocId = 0;
+ IndexType vocId;
+ TextLenType pos;
+ TextLenType lastSaIndex = 0;
+
+ for(TextLenType i=0; i<suffixArraySize; i++){
+ pos = this->suffix_list[i];
+
+ //for level1 bucket
+ vocId = this->corpus_list[pos];
+
+ if(vocId<this->sentIdStart){ //is a meaningful word type
+ if(vocId!=currentVocId){
+ this->level1Buckets[currentVocId].last = lastSaIndex; //for first word which is <unk> this does not matter
+ this->level1Buckets[vocId].first = i;
+
+ currentVocId=vocId;
+ }
+
+ lastSaIndex = i;
+ }
+ }
+
+ //for the last word type
+ this->level1Buckets[currentVocId].last = lastSaIndex;
+ }
+ else{
+ this->level1Buckets = NULL;
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadOffset(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * OffsetInputFile = fopen(filename, "rb");
+
+ if(!OffsetInputFile){
+ cerr<<"Offset file: "<<filename<<" does not exist!"<<endl;
+ exit(0);
+ }
+
+ //first, read the size of the corpus
+ TextLenType offsetListLen;
+ dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile);
+ if(offsetListLen!=this->corpusSize){
+ cerr<<"Text length is inconsistent with the length of the offset.\n";
+ exit(0);
+ }
+
+ //read all the suffix into memory
+ unsigned int totalRead = 0;
+ unsigned int remaining = offsetListLen;
+ unsigned int oneBatchReadSize;
+ char * currentOffsetListPos = (char *) this->offset_list;
+ while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){
+ oneBatchReadSize = SIZE_ONE_READ;
+
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+
+ currentOffsetListPos+=sizeof(unsigned char)*dwRead;
+
+ }
+ if(totalRead!=offsetListLen){
+ cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+ fclose(OffsetInputFile);
+
+}
+
+TextLenType C_SuffixArrayApplicationBase::returnCorpusSize()
+{
+ return this->corpusSize;
+}
diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h
new file mode 100755
index 0000000..74fad4e
--- /dev/null
+++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h
@@ -0,0 +1,58 @@
+#if !defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_)
+#define __SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_
+
+#include "salm_shared.h"
+#include "_IDVocabulary.h"
+#include "_String.h"
+
+using namespace std;
+
+typedef struct level1BucketElement
+{
+ TextLenType first;
+ TextLenType last;
+} S_level1BucketElement;
+
+
+/**
+* Base class of Suffix Array applications
+* Providing functions to load the suffix array and initialize the required vocIDs
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+class C_SuffixArrayApplicationBase
+{
+public:
+ void loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket);
+ TextLenType returnCorpusSize();
+
+ C_SuffixArrayApplicationBase();
+ virtual ~C_SuffixArrayApplicationBase();
+
+protected:
+ TextLenType corpusSize;
+
+ void loadVoc(const char * filename);
+ void loadOffset(const char * filename);
+ void loadSuffix(const char * filename);
+ void loadCorpusAndInitMem(const char * filename);
+
+ bool noVocabulary;
+ bool noOffset;
+ bool noLevel1Bucket;
+
+ C_IDVocabulary * voc;
+ IndexType sentIdStart;
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+ IndexType * corpus_list;
+ unsigned char * offset_list;
+ TextLenType * suffix_list;
+
+ S_level1BucketElement * level1Buckets;
+
+};
+
+#endif // !defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_)
diff --git a/Src/Utils/InitializeVocabulary.cpp b/Src/Utils/InitializeVocabulary.cpp
new file mode 100755
index 0000000..b749568
--- /dev/null
+++ b/Src/Utils/InitializeVocabulary.cpp
@@ -0,0 +1,30 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_IDVocabulary.h"
+
+#include <iostream>
+
+using namespace std;
+
+/**
+* \ingroup utils
+* Initialize an empty vocabulary with reserved words
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ if(argc<2){
+ cerr<<"\nUsage:";
+ cerr<<"\n\t"<<argv[0]<<" vocabularyFileName\n\n";
+ exit(0);
+ }
+
+ C_IDVocabulary voc;
+
+ voc.addingReservedWords();
+ voc.outputToFile(argv[1]);
+
+ return 0;
+
+}
diff --git a/Src/Utils/UpdateUniversalVoc.cpp b/Src/Utils/UpdateUniversalVoc.cpp
new file mode 100755
index 0000000..02ea6cb
--- /dev/null
+++ b/Src/Utils/UpdateUniversalVoc.cpp
@@ -0,0 +1,28 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_UniversalVocabulary.h"
+
+#include <iostream>
+
+using namespace std;
+
+/**
+* \ingroup utils
+* Update the universal vocabulary with words in corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ if(argc<3){
+ cerr<<"\nUsage:";
+ cerr<<"\n\t"<<argv[0]<<" universal_voc corpusFileName\n\n";
+ exit(0);
+ }
+
+ C_UniversalVocabulary universalVoc(argv[1]);
+
+ universalVoc.updateWithNewCorpus(argv[2]);
+
+ return 1;
+}
diff --git a/Src/Utils/_UniversalVocabulary.cpp b/Src/Utils/_UniversalVocabulary.cpp
new file mode 100755
index 0000000..3be91d2
--- /dev/null
+++ b/Src/Utils/_UniversalVocabulary.cpp
@@ -0,0 +1,118 @@
+#include "_UniversalVocabulary.h"
+#include "malloc.h"
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <stdlib.h>
+
+using namespace std;
+
+C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName)
+{
+ int fileNameSize=strlen(universalVocFileName);
+ fileNameSize++;
+
+ this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize);
+ sprintf(this->universalCorpusFileName,"%s\0", universalVocFileName);
+
+ this->universalVoc = new C_IDVocabulary(universalVocFileName);
+
+}
+
+C_UniversalVocabulary::~C_UniversalVocabulary()
+{
+ free(this->universalCorpusFileName);
+ delete(this->universalVoc);
+}
+
+
+/**
+* Update the universal vocabulary with words in a new corpus
+* Output the updated universal vocabulary
+* Output the vocabulary needed for the new corpus too
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName)
+{
+
+ ifstream textStream;
+ textStream.open(newCorpusFileName);
+
+ if(textStream==NULL){
+ fprintf(stderr,"Corpus file %s does not exist. Exit!\n",newCorpusFileName);
+ exit(-1);
+ }
+
+
+ //add reserved words from universal voc
+ for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){
+ C_String reservedWordText = this->universalVoc->getText(vocId);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId));
+ }
+
+ string aLine;
+ unsigned int sentNumber = 1;
+ unsigned int corpusSize = 0;
+
+ char * thisToken;
+ char delimit[] =" \t\r\n";
+ map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus;
+
+
+ getline(textStream, aLine);
+ while(!textStream.eof()){
+
+ if(aLine.length()>0){
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ C_String thisWord(thisToken);
+
+ //check if this word has already been seen
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord);
+
+ if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){
+ //new type
+ IndexType vocId = this->universalVoc->getId(thisWord);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId));
+ }
+
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+
+ }
+
+ getline(textStream, aLine);
+ }
+
+
+ //now output the updated universal vocabulary
+ this->universalVoc->outputToFile(this->universalCorpusFileName);
+
+ //output the vocabulary needed for the new corpus
+ char vocabularyForNewCorpusFileName[1024];
+ sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName);
+
+ ofstream outputVocFile;
+ outputVocFile.open(vocabularyForNewCorpusFileName);
+
+ if(!outputVocFile){
+ cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n";
+ exit(-1);
+ }
+
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin();
+ while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){
+ outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl;
+ iterWordsUsedInTheNewCorpus++;
+ }
+
+ outputVocFile.close();
+}
diff --git a/Src/Utils/_UniversalVocabulary.cpp~ b/Src/Utils/_UniversalVocabulary.cpp~
new file mode 100755
index 0000000..50a7396
--- /dev/null
+++ b/Src/Utils/_UniversalVocabulary.cpp~
@@ -0,0 +1,117 @@
+#include "_UniversalVocabulary.h"
+#include "malloc.h"
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cstring>
+
+using namespace std;
+
+C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName)
+{
+ int fileNameSize=strlen(universalVocFileName);
+ fileNameSize++;
+
+ this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize);
+ sprintf(this->universalCorpusFileName,"%s\0", universalVocFileName);
+
+ this->universalVoc = new C_IDVocabulary(universalVocFileName);
+
+}
+
+C_UniversalVocabulary::~C_UniversalVocabulary()
+{
+ free(this->universalCorpusFileName);
+ delete(this->universalVoc);
+}
+
+
+/**
+* Update the universal vocabulary with words in a new corpus
+* Output the updated universal vocabulary
+* Output the vocabulary needed for the new corpus too
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName)
+{
+
+ ifstream textStream;
+ textStream.open(newCorpusFileName);
+
+ if(textStream==NULL){
+ fprintf(stderr,"Corpus file %s does not exist. Exit!\n",newCorpusFileName);
+ exit(-1);
+ }
+
+
+ //add reserved words from universal voc
+ for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){
+ C_String reservedWordText = this->universalVoc->getText(vocId);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId));
+ }
+
+ string aLine;
+ unsigned int sentNumber = 1;
+ unsigned int corpusSize = 0;
+
+ char * thisToken;
+ char delimit[] =" \t\r\n";
+ map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus;
+
+
+ getline(textStream, aLine);
+ while(!textStream.eof()){
+
+ if(aLine.length()>0){
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ C_String thisWord(thisToken);
+
+ //check if this word has already been seen
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord);
+
+ if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){
+ //new type
+ IndexType vocId = this->universalVoc->getId(thisWord);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId));
+ }
+
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+
+ }
+
+ getline(textStream, aLine);
+ }
+
+
+ //now output the updated universal vocabulary
+ this->universalVoc->outputToFile(this->universalCorpusFileName);
+
+ //output the vocabulary needed for the new corpus
+ char vocabularyForNewCorpusFileName[1024];
+ sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName);
+
+ ofstream outputVocFile;
+ outputVocFile.open(vocabularyForNewCorpusFileName);
+
+ if(!outputVocFile){
+ cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n";
+ exit(-1);
+ }
+
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin();
+ while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){
+ outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl;
+ iterWordsUsedInTheNewCorpus++;
+ }
+
+ outputVocFile.close();
+}
diff --git a/Src/Utils/_UniversalVocabulary.h b/Src/Utils/_UniversalVocabulary.h
new file mode 100755
index 0000000..2df4954
--- /dev/null
+++ b/Src/Utils/_UniversalVocabulary.h
@@ -0,0 +1,38 @@
+#if !defined (__HEADER_UNIVERSAL_VOC_INCLUDED__)
+#define __HEADER_UNIVERSAL_VOC_INCLUDED__
+
+#include "salm_shared.h"
+#include "_IDVocabulary.h"
+#include "_String.h"
+
+#include <map>
+
+using namespace std;
+
+/**
+* \ingroup utils
+* Universal Vocabulary class provides functions to update the universal vocabulary
+* with the words in a new corpus
+* and output the vocabulary needed for the new corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_UniversalVocabulary{
+
+public:
+ void updateWithNewCorpus(const char * newCorpusFileName);
+
+ C_UniversalVocabulary(const char * universalVocFileName);
+ ~C_UniversalVocabulary();
+
+private:
+ char * universalCorpusFileName;
+ C_IDVocabulary * universalVoc;
+
+ map<C_String, IndexType, ltstr> wordsUsedInTheNewCorpus;
+
+};
+
+
+#endif