diff options
author | Kenneth Heafield <github@kheafield.com> | 2011-11-17 16:49:55 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2011-11-17 16:49:55 +0400 |
commit | 72a4c8a0d34529086b91c016ce32f0b03f9778a1 (patch) | |
tree | 1520b39aa01e77dda85b57f42749e992b42a886d /util | |
parent | 07a8558c02fe46b08734c8479b58ad0f9e3a1a3c (diff) |
Move kenlm up one level, simplify compilation
Diffstat (limited to 'util')
38 files changed, 4868 insertions, 0 deletions
diff --git a/util/COPYING b/util/COPYING new file mode 100644 index 000000000..94a9ed024 --- /dev/null +++ b/util/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>. diff --git a/util/COPYING.LESSER b/util/COPYING.LESSER new file mode 100644 index 000000000..cca7fc278 --- /dev/null +++ b/util/COPYING.LESSER @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/util/LICENSE b/util/LICENSE new file mode 100644 index 000000000..0ba079e30 --- /dev/null +++ b/util/LICENSE @@ -0,0 +1,20 @@ +Most of the code here is licensed under the LGPL. There are exceptions which have their own licenses, listed below. See comments in those files for more details. + +murmur_hash.cc is under the MIT license. +string_piece.hh and util/string_piece.cc are Google code and contains its own license. +file.cc contains a modified implementation of mkstemp under the LGPL. + +For the rest: + + Avenue code is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Avenue code is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with Avenue code. If not, see <http://www.gnu.org/licenses/>. diff --git a/util/Makefile.am b/util/Makefile.am new file mode 100644 index 000000000..c567793ff --- /dev/null +++ b/util/Makefile.am @@ -0,0 +1,12 @@ +lib_LTLIBRARIES = libkenutil.la + +AM_CPPFLAGS = -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES $(BOOST_CPPFLAGS) + +libkenutil_la_SOURCES = \ + bit_packing.cc \ + ersatz_progress.cc \ + exception.cc \ + file.cc \ + file_piece.cc \ + murmur_hash.cc \ + mmap.cc diff --git a/util/bit_packing.cc b/util/bit_packing.cc new file mode 100644 index 000000000..41999b726 --- /dev/null +++ b/util/bit_packing.cc @@ -0,0 +1,40 @@ +#include "util/bit_packing.hh" +#include "util/exception.hh" + +#include <string.h> + +namespace util { + +namespace { +template <bool> struct StaticCheck {}; +template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; }; + +// If your float isn't 4 bytes, we're hosed. +typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize; + +} // namespace + +uint8_t RequiredBits(uint64_t max_value) { + if (!max_value) return 0; + uint8_t ret = 1; + while (max_value >>= 1) ++ret; + return ret; +} + +void BitPackingSanity() { + const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; + if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000"); + char mem[57+8]; + memset(mem, 0, sizeof(mem)); + const uint64_t test57 = 0x123456789abcdefULL; + for (uint64_t b = 0; b < 57 * 8; b += 57) { + WriteInt57(mem, b, 57, test57); + } + for (uint64_t b = 0; b < 57 * 8; b += 57) { + if (test57 != ReadInt57(mem, b, 57, (1ULL << 57) - 1)) + UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler."); + } + // TODO: more checks. +} + +} // namespace util diff --git a/util/bit_packing.hh b/util/bit_packing.hh new file mode 100644 index 000000000..f28f71f81 --- /dev/null +++ b/util/bit_packing.hh @@ -0,0 +1,165 @@ +#ifndef UTIL_BIT_PACKING__ +#define UTIL_BIT_PACKING__ + +/* Bit-level packing routines + * + * WARNING WARNING WARNING: + * The write functions assume that memory is zero initially. This makes them + * faster and is the appropriate case for mmapped language model construction. + * These routines assume that unaligned access to uint64_t is fast. This is + * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on + * x86 but my target audience is large language models for which 64-bit is + * necessary. + * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. + * + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ + +#include <assert.h> +#ifdef __APPLE__ +#include <architecture/byte_order.h> +#elif __linux__ +#include <endian.h> +#elif !defined(_WIN32) && !defined(_WIN64) +#include <arpa/nameser_compat.h> +#endif + +#include <stdint.h> + +#include <string.h> + +namespace util { + +// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. +#if BYTE_ORDER == LITTLE_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) { + return bit; +} +#elif BYTE_ORDER == BIG_ENDIAN +inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { + return 64 - length - bit; +} +#else +#error "Bit packing code isn't written for your byte order." +#endif + +inline uint64_t ReadOff(const void *base, uint64_t bit_off) { + return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)); +} + +/* Pack integers up to 57 bits using their least significant digits. + * The length is specified using mask: + * Assumes mask == (1 << length) - 1 where length <= 57. + */ +inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) { + return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask; +} +/* Assumes value < (1 << length) and length <= 57. + * Assumes the memory is zero initially. + */ +inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + value64 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value64, sizeof(value64)); +#else + *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= + (value << BitPackShift(bit_off & 7, length)); +#endif +} + +/* Same caveats as above, but for a 25 bit limit. */ +inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { + return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask; +} + +inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + value32 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value32, sizeof(value32)); +#else + *reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= + (value << BitPackShift(bit_off & 7, length)); +#endif +} + +typedef union { float f; uint32_t i; } FloatEnc; + +inline float ReadFloat32(const void *base, uint64_t bit_off) { + FloatEnc encoded; + encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32); + return encoded.f; +} +inline void WriteFloat32(void *base, uint64_t bit_off, float value) { + FloatEnc encoded; + encoded.f = value; + WriteInt57(base, bit_off, 32, encoded.i); +} + +const uint32_t kSignBit = 0x80000000; + +inline void SetSign(float &to) { + FloatEnc enc; + enc.f = to; + enc.i |= kSignBit; + to = enc.f; +} + +inline void UnsetSign(float &to) { + FloatEnc enc; + enc.f = to; + enc.i &= ~kSignBit; + to = enc.f; +} + +inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) { + FloatEnc encoded; + encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31); + // Sign bit set means negative. + encoded.i |= kSignBit; + return encoded.f; +} +inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) { + FloatEnc encoded; + encoded.f = value; + encoded.i &= ~kSignBit; + WriteInt57(base, bit_off, 31, encoded.i); +} + +void BitPackingSanity(); + +// Return bits required to store integers upto max_value. Not the most +// efficient implementation, but this is only called a few times to size tries. +uint8_t RequiredBits(uint64_t max_value); + +struct BitsMask { + static BitsMask ByMax(uint64_t max_value) { + BitsMask ret; + ret.FromMax(max_value); + return ret; + } + static BitsMask ByBits(uint8_t bits) { + BitsMask ret; + ret.bits = bits; + ret.mask = (1ULL << bits) - 1; + return ret; + } + void FromMax(uint64_t max_value) { + bits = RequiredBits(max_value); + mask = (1ULL << bits) - 1; + } + uint8_t bits; + uint64_t mask; +}; + +} // namespace util + +#endif // UTIL_BIT_PACKING__ diff --git a/util/bit_packing_test.cc b/util/bit_packing_test.cc new file mode 100644 index 000000000..4edc2004c --- /dev/null +++ b/util/bit_packing_test.cc @@ -0,0 +1,59 @@ +#include "util/bit_packing.hh" + +#define BOOST_TEST_MODULE BitPackingTest +#include <boost/test/unit_test.hpp> + +#include <string.h> + +namespace util { +namespace { + +const uint64_t test57 = 0x123456789abcdefULL; +const uint32_t test25 = 0x1234567; + +BOOST_AUTO_TEST_CASE(ZeroBit57) { + char mem[16]; + memset(mem, 0, sizeof(mem)); + WriteInt57(mem, 0, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1)); +} + +BOOST_AUTO_TEST_CASE(EachBit57) { + char mem[16]; + for (uint8_t b = 0; b < 8; ++b) { + memset(mem, 0, sizeof(mem)); + WriteInt57(mem, b, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Consecutive57) { + char mem[57+8]; + memset(mem, 0, sizeof(mem)); + for (uint64_t b = 0; b < 57 * 8; b += 57) { + WriteInt57(mem, b, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); + } + for (uint64_t b = 0; b < 57 * 8; b += 57) { + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Consecutive25) { + char mem[25+8]; + memset(mem, 0, sizeof(mem)); + for (uint64_t b = 0; b < 25 * 8; b += 25) { + WriteInt25(mem, b, 25, test25); + BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); + } + for (uint64_t b = 0; b < 25 * 8; b += 25) { + BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Sanity) { + BitPackingSanity(); +} + +} // namespace +} // namespace util diff --git a/util/ersatz_progress.cc b/util/ersatz_progress.cc new file mode 100644 index 000000000..a82ce6726 --- /dev/null +++ b/util/ersatz_progress.cc @@ -0,0 +1,45 @@ +#include "util/ersatz_progress.hh" + +#include <algorithm> +#include <ostream> +#include <limits> +#include <string> + +namespace util { + +namespace { const unsigned char kWidth = 100; } + +ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<std::size_t>::max()), complete_(next_), out_(NULL) {} + +ErsatzProgress::~ErsatzProgress() { + if (!out_) return; + Finished(); +} + +ErsatzProgress::ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete) + : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) { + if (!out_) { + next_ = std::numeric_limits<std::size_t>::max(); + return; + } + *out_ << message << "\n----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"; +} + +void ErsatzProgress::Milestone() { + if (!out_) { current_ = 0; return; } + if (!complete_) return; + unsigned char stone = std::min(static_cast<std::size_t>(kWidth), (current_ * kWidth) / complete_); + + for (; stones_written_ < stone; ++stones_written_) { + (*out_) << '*'; + } + if (stone == kWidth) { + (*out_) << std::endl; + next_ = std::numeric_limits<std::size_t>::max(); + out_ = NULL; + } else { + next_ = std::max(next_, (stone * complete_) / kWidth); + } +} + +} // namespace util diff --git a/util/ersatz_progress.hh b/util/ersatz_progress.hh new file mode 100644 index 000000000..92c345fee --- /dev/null +++ b/util/ersatz_progress.hh @@ -0,0 +1,54 @@ +#ifndef UTIL_ERSATZ_PROGRESS__ +#define UTIL_ERSATZ_PROGRESS__ + +#include <iosfwd> +#include <string> + +// Ersatz version of boost::progress so core language model doesn't depend on +// boost. Also adds option to print nothing. + +namespace util { +class ErsatzProgress { + public: + // No output. + ErsatzProgress(); + + // Null means no output. The null value is useful for passing along the ostream pointer from another caller. + ErsatzProgress(std::ostream *to, const std::string &message, std::size_t complete); + + ~ErsatzProgress(); + + ErsatzProgress &operator++() { + if (++current_ >= next_) Milestone(); + return *this; + } + + ErsatzProgress &operator+=(std::size_t amount) { + if ((current_ += amount) >= next_) Milestone(); + return *this; + } + + void Set(std::size_t to) { + if ((current_ = to) >= next_) Milestone(); + Milestone(); + } + + void Finished() { + Set(complete_); + } + + private: + void Milestone(); + + std::size_t current_, next_, complete_; + unsigned char stones_written_; + std::ostream *out_; + + // noncopyable + ErsatzProgress(const ErsatzProgress &other); + ErsatzProgress &operator=(const ErsatzProgress &other); +}; + +} // namespace util + +#endif // UTIL_ERSATZ_PROGRESS__ diff --git a/util/exception.cc b/util/exception.cc new file mode 100644 index 000000000..c4f8c04ce --- /dev/null +++ b/util/exception.cc @@ -0,0 +1,87 @@ +#include "util/exception.hh" + +#ifdef __GXX_RTTI +#include <typeinfo> +#endif + +#include <errno.h> +#include <string.h> + +namespace util { + +Exception::Exception() throw() {} +Exception::~Exception() throw() {} + +Exception::Exception(const Exception &from) : std::exception() { + stream_ << from.stream_.str(); +} + +Exception &Exception::operator=(const Exception &from) { + stream_ << from.stream_.str(); + return *this; +} + +const char *Exception::what() const throw() { + text_ = stream_.str(); + return text_.c_str(); +} + +void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) { + /* The child class might have set some text, but we want this to come first. + * Another option would be passing this information to the constructor, but + * then child classes would have to accept constructor arguments and pass + * them down. + */ + text_ = stream_.str(); + stream_.str(""); + stream_ << file << ':' << line; + if (func) stream_ << " in " << func << " threw "; + if (child_name) { + stream_ << child_name; + } else { +#ifdef __GXX_RTTI + stream_ << typeid(this).name(); +#else + stream_ << "an exception"; +#endif + } + if (condition) stream_ << " because `" << condition; + stream_ << "'.\n"; + stream_ << text_; +} + +namespace { +// The XOPEN version. +const char *HandleStrerror(int ret, const char *buf) { + if (!ret) return buf; + return NULL; +} + +// The GNU version. +const char *HandleStrerror(const char *ret, const char * /*buf*/) { + return ret; +} +} // namespace + +ErrnoException::ErrnoException() throw() : errno_(errno) { + char buf[200]; + buf[0] = 0; +#if defined(sun) || defined(_WIN32) || defined(_WIN64) + const char *add = strerror(errno); +#else + const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); +#endif + + if (add) { + *this << add << ' '; + } +} + +ErrnoException::~ErrnoException() throw() {} + +EndOfFileException::EndOfFileException() throw() { + *this << "End of file"; +} +EndOfFileException::~EndOfFileException() throw() {} + +} // namespace util diff --git a/util/exception.hh b/util/exception.hh new file mode 100644 index 000000000..6d6a37cb1 --- /dev/null +++ b/util/exception.hh @@ -0,0 +1,116 @@ +#ifndef UTIL_EXCEPTION__ +#define UTIL_EXCEPTION__ + +#include <exception> +#include <sstream> +#include <string> + +namespace util { + +template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data); + +class Exception : public std::exception { + public: + Exception() throw(); + virtual ~Exception() throw(); + + Exception(const Exception &from); + Exception &operator=(const Exception &from); + + // Not threadsafe, but probably doesn't matter. FWIW, Boost's exception guidance implies that what() isn't threadsafe. + const char *what() const throw(); + + // For use by the UTIL_THROW macros. + void SetLocation( + const char *file, + unsigned int line, + const char *func, + const char *child_name, + const char *condition); + + private: + template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data); + + // This helps restrict operator<< defined below. + template <class T> struct ExceptionTag { + typedef T Identity; + }; + + std::stringstream stream_; + mutable std::string text_; +}; + +/* This implements the normal operator<< for Exception and all its children. + * SNIFAE means it only applies to Exception. Think of this as an ersatz + * boost::enable_if. + */ +template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) { + e.stream_ << data; + return e; +} + +#ifdef __GNUC__ +#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ +#else +#ifdef _WIN32 +#define UTIL_FUNC_NAME __FUNCTION__ +#else +#define UTIL_FUNC_NAME NULL +#endif +#endif + +#define UTIL_SET_LOCATION(UTIL_e, child, condition) do { \ + (UTIL_e).SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, (child), (condition)); \ +} while (0) + +/* Create an instance of Exception, add the message Modify, and throw it. + * Modify is appended to the what() message and can contain << for ostream + * operations. + * + * do .. while kludge to swallow trailing ; character + * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html . + */ +#define UTIL_THROW(Exception, Modify) do { \ + Exception UTIL_e; \ + UTIL_SET_LOCATION(UTIL_e, #Exception, NULL); \ + UTIL_e << Modify; \ + throw UTIL_e; \ +} while (0) + +#define UTIL_THROW_VAR(Var, Modify) do { \ + Exception &UTIL_e = (Var); \ + UTIL_SET_LOCATION(UTIL_e, NULL, NULL); \ + UTIL_e << Modify; \ + throw UTIL_e; \ +} while (0) + +#define UTIL_THROW_IF(Condition, Exception, Modify) do { \ + if (Condition) { \ + Exception UTIL_e; \ + UTIL_SET_LOCATION(UTIL_e, #Exception, #Condition); \ + UTIL_e << Modify; \ + throw UTIL_e; \ + } \ +} while (0) + +class ErrnoException : public Exception { + public: + ErrnoException() throw(); + + virtual ~ErrnoException() throw(); + + int Error() const throw() { return errno_; } + + private: + int errno_; +}; + +class EndOfFileException : public Exception { + public: + EndOfFileException() throw(); + ~EndOfFileException() throw(); +}; + +} // namespace util + +#endif // UTIL_EXCEPTION__ diff --git a/util/file.cc b/util/file.cc new file mode 100644 index 000000000..81d1d490c --- /dev/null +++ b/util/file.cc @@ -0,0 +1,246 @@ +#include "util/file.hh" + +#include "util/exception.hh" +#include "util/portability.hh" + +#include <cstdlib> +#include <cstdio> +#include <iostream> + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stdint.h> + +#if defined(_WIN32) || defined(_WIN64) +#include <windows.h> +#endif + +namespace util { + +scoped_fd::~scoped_fd() { + if (fd_ != -1 && close(fd_)) { + std::cerr << "Could not close file " << fd_ << std::endl; + std::abort(); + } +} + +scoped_FILE::~scoped_FILE() { + if (file_ && std::fclose(file_)) { + std::cerr << "Could not close file " << std::endl; + std::abort(); + } +} + +int OpenReadOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name); +#endif + return ret; +} + +uint64_t SizeFile(int fd) { + struct stat sb; + if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; + return sb.st_size; +} + +void ReadOrThrow(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast<uint8_t*>(to_void); + while (amount) { + ssize_t ret = read(fd, to, amount); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); + UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); + amount -= ret; + to += ret; + } +} + +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast<uint8_t*>(to_void); + std::size_t remaining = amount; + while (remaining) { + ssize_t ret = read(fd, to, remaining); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed."); + if (!ret) return amount - remaining; + remaining -= ret; + to += ret; + } + return amount; +} + +void WriteOrThrow(int fd, const void *data_void, std::size_t size) { + const uint8_t *data = static_cast<const uint8_t*>(data_void); + while (size) { + ssize_t ret = write(fd, data, size); + if (ret < 1) UTIL_THROW(util::ErrnoException, "Write failed"); + data += ret; + size -= ret; + } +} + +namespace { +void InternalSeek(int fd, off_t off, int whence) { + UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed"); +} +} // namespace + +void SeekOrThrow(int fd, uint64_t off) { + InternalSeek(fd, off, SEEK_SET); +} + +void AdvanceOrThrow(int fd, int64_t off) { + InternalSeek(fd, off, SEEK_CUR); +} + +void SeekEnd(int fd) { + InternalSeek(fd, 0, SEEK_END); +} + +std::FILE *FDOpenOrThrow(scoped_fd &file) { + std::FILE *ret = fdopen(file.get(), "r+b"); + if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen"); + file.release(); + return ret; +} + +TempMaker::TempMaker(const std::string &prefix) : base_(prefix) { + base_ += "XXXXXX"; +} + +// Sigh. Windows temporary file creation is full of race conditions. +#if defined(_WIN32) || defined(_WIN64) +/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright + (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. */ + +/* This has been modified from the original version to rename the function and + * set the Windows temporary flag. */ + +static const char letters[] = +"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + +/* Generate a temporary file name based on TMPL. TMPL must match the + rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed + does not exist at the time of the call to mkstemp. TMPL is + overwritten with the result. */ +int +mkstemp_and_unlink(char *tmpl) +{ + int len; + char *XXXXXX; + static unsigned long long value; + unsigned long long random_time_bits; + unsigned int count; + int fd = -1; + int save_errno = errno; + + /* A lower bound on the number of temporary files to attempt to + generate. The maximum total number of temporary file names that + can exist for a given template is 62**6. It should never be + necessary to try all these combinations. Instead if a reasonable + number of names is tried (we define reasonable as 62**3) fail to + give the system administrator the chance to remove the problems. */ +#define ATTEMPTS_MIN (62 * 62 * 62) + + /* The number of times to attempt to generate a temporary file. To + conform to POSIX, this must be no smaller than TMP_MAX. */ +#if ATTEMPTS_MIN < TMP_MAX + unsigned int attempts = TMP_MAX; +#else + unsigned int attempts = ATTEMPTS_MIN; +#endif + + len = strlen (tmpl); + if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX")) + { + errno = EINVAL; + return -1; + } + +/* This is where the Xs start. */ + XXXXXX = &tmpl[len - 6]; + + /* Get some more or less random data. */ + { + SYSTEMTIME stNow; + FILETIME ftNow; + + // get system time + GetSystemTime(&stNow); + stNow.wMilliseconds = 500; + if (!SystemTimeToFileTime(&stNow, &ftNow)) + { + errno = -1; + return -1; + } + + random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32) + | (unsigned long long)ftNow.dwLowDateTime); + } + value += random_time_bits ^ (unsigned long long)GetCurrentThreadId (); + + for (count = 0; count < attempts; value += 7777, ++count) + { + unsigned long long v = value; + + /* Fill in the random bits. */ + XXXXXX[0] = letters[v % 62]; + v /= 62; + XXXXXX[1] = letters[v % 62]; + v /= 62; + XXXXXX[2] = letters[v % 62]; + v /= 62; + XXXXXX[3] = letters[v % 62]; + v /= 62; + XXXXXX[4] = letters[v % 62]; + v /= 62; + XXXXXX[5] = letters[v % 62]; + + /* Modified for windows and to unlink */ + // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); + fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL | _O_BINARY, _S_IREAD | _S_IWRITE); + if (fd >= 0) + { + errno = save_errno; + return fd; + } + else if (errno != EEXIST) + return -1; + } + + /* We got out of the loop because we ran out of combinations to try. */ + errno = EEXIST; + return -1; +} +#else +int +mkstemp_and_unlink(char *tmpl) { + int ret = mkstemp(tmpl); + if (ret == -1) return -1; + UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl); + return ret; +} +#endif + +int TempMaker::Make() const { + std::string copy(base_); + copy.push_back(0); + int ret; + UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(©[0])), util::ErrnoException, "Failed to make a temporary based on " << base_); + return ret; +} + +std::FILE *TempMaker::MakeFile() const { + util::scoped_fd file(Make()); + return FDOpenOrThrow(file); +} + +} // namespace util diff --git a/util/file.hh b/util/file.hh new file mode 100644 index 000000000..b820ba351 --- /dev/null +++ b/util/file.hh @@ -0,0 +1,100 @@ +#ifndef UTIL_FILE__ +#define UTIL_FILE__ + +#include <cstddef> +#include <cstdio> +#include <string> + +#include <stdint.h> + +namespace util { + +class scoped_fd { + public: + scoped_fd() : fd_(-1) {} + + explicit scoped_fd(int fd) : fd_(fd) {} + + ~scoped_fd(); + + void reset(int to) { + scoped_fd other(fd_); + fd_ = to; + } + + int get() const { return fd_; } + + int operator*() const { return fd_; } + + int release() { + int ret = fd_; + fd_ = -1; + return ret; + } + + operator bool() { return fd_ != -1; } + + private: + int fd_; + + scoped_fd(const scoped_fd &); + scoped_fd &operator=(const scoped_fd &); +}; + +class scoped_FILE { + public: + explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {} + + ~scoped_FILE(); + + std::FILE *get() { return file_; } + const std::FILE *get() const { return file_; } + + void reset(std::FILE *to = NULL) { + scoped_FILE other(file_); + file_ = to; + } + + std::FILE *release() { + std::FILE *ret = file_; + file_ = NULL; + return ret; + } + + private: + std::FILE *file_; +}; + +int OpenReadOrThrow(const char *name); + +// Return value for SizeFile when it can't size properly. +const uint64_t kBadSize = (uint64_t)-1; +uint64_t SizeFile(int fd); + +void ReadOrThrow(int fd, void *to, std::size_t size); +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount); + +void WriteOrThrow(int fd, const void *data_void, std::size_t size); + +// Seeking +void SeekOrThrow(int fd, uint64_t off); +void AdvanceOrThrow(int fd, int64_t off); +void SeekEnd(int fd); + +std::FILE *FDOpenOrThrow(scoped_fd &file); + +class TempMaker { + public: + explicit TempMaker(const std::string &prefix); + + int Make() const; + + std::FILE *MakeFile() const; + + private: + std::string base_; +}; + +} // namespace util + +#endif // UTIL_FILE__ diff --git a/util/file_piece.cc b/util/file_piece.cc new file mode 100644 index 000000000..43b578b65 --- /dev/null +++ b/util/file_piece.cc @@ -0,0 +1,310 @@ +#include "util/file_piece.hh" + +#include "util/exception.hh" +#include "util/file.hh" +#include "util/mmap.hh" +#include "util/portability.hh" + +#include <iostream> +#include <string> +#include <limits> + +#include <assert.h> +#include <ctype.h> +#include <fcntl.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> + +#ifdef HAVE_ZLIB +#include <zlib.h> +#endif + +namespace util { + +ParseNumberException::ParseNumberException(StringPiece value) throw() { + *this << "Could not parse \"" << value << "\" into a number"; +} + +GZException::GZException(void *file) { +#ifdef HAVE_ZLIB + int num; + *this << gzerror(file, &num) << " from zlib"; +#endif // HAVE_ZLIB +} + +// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). +const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + +FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), + progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { + Initialize(name, show_progress, min_buffer); +} + +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), + progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { + Initialize(name, show_progress, min_buffer); +} + +FilePiece::~FilePiece() { +#ifdef HAVE_ZLIB + if (gz_file_) { + // zlib took ownership + file_.release(); + int ret; + if (Z_OK != (ret = gzclose(gz_file_))) { + std::cerr << "could not close file " << file_name_ << " using zlib" << std::endl; + abort(); + } + } +#endif +} + +StringPiece FilePiece::ReadLine(char delim) { + std::size_t skip = 0; + while (true) { + for (const char *i = position_ + skip; i < position_end_; ++i) { + if (*i == delim) { + StringPiece ret(position_, i - position_); + position_ = i + 1; + return ret; + } + } + if (at_end_) { + if (position_ == position_end_) Shift(); + return Consume(position_end_); + } + skip = position_end_ - position_; + Shift(); + } +} + +float FilePiece::ReadFloat() { + return ReadNumber<float>(); +} +double FilePiece::ReadDouble() { + return ReadNumber<double>(); +} +long int FilePiece::ReadLong() { + return ReadNumber<long int>(); +} +unsigned long int FilePiece::ReadULong() { + return ReadNumber<unsigned long int>(); +} + +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { +#ifdef HAVE_ZLIB + gz_file_ = NULL; +#endif + file_name_ = name; + + default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2); + position_ = NULL; + position_end_ = NULL; + mapped_offset_ = 0; + at_end_ = false; + + if (total_size_ == kBadSize) { + // So the assertion passes. + fallback_to_read_ = false; + if (show_progress) + *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; + TransitionToRead(); + } else { + fallback_to_read_ = false; + } + Shift(); + // gzip detect. + if ((position_end_ - position_) > 2 && *position_ == 0x1f && static_cast<unsigned char>(*(position_ + 1)) == 0x8b) { +#ifndef HAVE_ZLIB + UTIL_THROW(GZException, "Looks like a gzip file but support was not compiled in."); +#endif + if (!fallback_to_read_) { + at_end_ = false; + TransitionToRead(); + } + } +} + +namespace { +void ParseNumber(const char *begin, char *&end, float &out) { +#ifdef sun + out = static_cast<float>(strtod(begin, &end)); +#else + out = strtof(begin, &end); +#endif +} +void ParseNumber(const char *begin, char *&end, double &out) { + out = strtod(begin, &end); +} +void ParseNumber(const char *begin, char *&end, long int &out) { + out = strtol(begin, &end, 10); +} +void ParseNumber(const char *begin, char *&end, unsigned long int &out) { + out = strtoul(begin, &end, 10); +} +} // namespace + +template <class T> T FilePiece::ReadNumber() { + SkipSpaces(); + while (last_space_ < position_) { + if (at_end_) { + // Hallucinate a null off the end of the file. + std::string buffer(position_, position_end_); + char *end; + T ret; + ParseNumber(buffer.c_str(), end, ret); + if (buffer.c_str() == end) throw ParseNumberException(buffer); + position_ += end - buffer.c_str(); + return ret; + } + Shift(); + } + char *end; + T ret; + ParseNumber(position_, end, ret); + if (end == position_) throw ParseNumberException(ReadDelimited()); + position_ = end; + return ret; +} + +const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { + std::size_t skip = 0; + while (true) { + for (const char *i = position_ + skip; i < position_end_; ++i) { + if (delim[static_cast<unsigned char>(*i)]) return i; + } + if (at_end_) { + if (position_ == position_end_) Shift(); + return position_end_; + } + skip = position_end_ - position_; + Shift(); + } +} + +void FilePiece::Shift() { + if (at_end_) { + progress_.Finished(); + throw EndOfFileException(); + } + uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; + + if (!fallback_to_read_) MMapShift(desired_begin); + // Notice an mmap failure might set the fallback. + if (fallback_to_read_) ReadShift(); + + for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { + if (isspace(*last_space_)) break; + } +} + +void FilePiece::MMapShift(uint64_t desired_begin) { + // Use mmap. + uint64_t ignore = desired_begin % page_; + // Duplicate request for Shift means give more data. + if (position_ == data_.begin() + ignore) { + default_map_size_ *= 2; + } + // Local version so that in case of failure it doesn't overwrite the class variable. + uint64_t mapped_offset = desired_begin - ignore; + + uint64_t mapped_size; + if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) { + at_end_ = true; + mapped_size = total_size_ - mapped_offset; + } else { + mapped_size = default_map_size_; + } + + // Forcibly clear the existing mmap first. + data_.reset(); + try { + MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); + } catch (const util::ErrnoException &e) { + if (desired_begin) { + SeekOrThrow(*file_, desired_begin); + } + // The mmap was scheduled to end the file, but now we're going to read it. + at_end_ = false; + TransitionToRead(); + return; + } + mapped_offset_ = mapped_offset; + position_ = data_.begin() + ignore; + position_end_ = data_.begin() + mapped_size; + + progress_.Set(desired_begin); +} + +void FilePiece::TransitionToRead() { + assert(!fallback_to_read_); + fallback_to_read_ = true; + data_.reset(); + data_.reset(malloc(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); + UTIL_THROW_IF(!data_.get(), ErrnoException, "malloc failed for " << default_map_size_); + position_ = data_.begin(); + position_end_ = position_; + +#ifdef HAVE_ZLIB + assert(!gz_file_); + gz_file_ = gzdopen(file_.get(), "r"); + UTIL_THROW_IF(!gz_file_, GZException, "zlib failed to open " << file_name_); +#endif +} + +void FilePiece::ReadShift() { + assert(fallback_to_read_); + // Bytes [data_.begin(), position_) have been consumed. + // Bytes [position_, position_end_) have been read into the buffer. + + // Start at the beginning of the buffer if there's nothing useful in it. + if (position_ == position_end_) { + mapped_offset_ += (position_end_ - data_.begin()); + position_ = data_.begin(); + position_end_ = position_; + } + + std::size_t already_read = position_end_ - data_.begin(); + + if (already_read == default_map_size_) { + if (position_ == data_.begin()) { + // Buffer too small. + std::size_t valid_length = position_end_ - position_; + default_map_size_ *= 2; + data_.call_realloc(default_map_size_); + UTIL_THROW_IF(!data_.get(), ErrnoException, "realloc failed for " << default_map_size_); + position_ = data_.begin(); + position_end_ = position_ + valid_length; + } else { + size_t moving = position_end_ - position_; + memmove(data_.get(), position_, moving); + position_ = data_.begin(); + position_end_ = position_ + moving; + already_read = moving; + } + } + + ssize_t read_return; +#ifdef HAVE_ZLIB + read_return = gzread(gz_file_, static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); + if (read_return == -1) throw GZException(gz_file_); + if (total_size_ != kBadSize) { + // Just get the position, don't actually seek. Apparently this is how you do it. . . + off_t ret = lseek(file_.get(), 0, SEEK_CUR); + if (ret != -1) progress_.Set(ret); + } +#else + read_return = read(file_.get(), static_cast<char*>(data_.get()) + already_read, default_map_size_ - already_read); + UTIL_THROW_IF(read_return == -1, ErrnoException, "read failed"); + progress_.Set(mapped_offset_); +#endif + if (read_return == 0) { + at_end_ = true; + } + position_end_ += read_return; +} + +} // namespace util diff --git a/util/file_piece.hh b/util/file_piece.hh new file mode 100644 index 000000000..b81ac0e20 --- /dev/null +++ b/util/file_piece.hh @@ -0,0 +1,126 @@ +#ifndef UTIL_FILE_PIECE__ +#define UTIL_FILE_PIECE__ + +#include "util/ersatz_progress.hh" +#include "util/exception.hh" +#include "util/file.hh" +#include "util/have.hh" +#include "util/mmap.hh" +#include "util/string_piece.hh" + +#include <cstddef> +#include <string> + +#include <stdint.h> + +namespace util { + +class ParseNumberException : public Exception { + public: + explicit ParseNumberException(StringPiece value) throw(); + ~ParseNumberException() throw() {} +}; + +class GZException : public Exception { + public: + explicit GZException(void *file); + GZException() throw() {} + ~GZException() throw() {} +}; + +extern const bool kSpaces[256]; + +// Memory backing the returned StringPiece may vanish on the next call. +class FilePiece { + public: + // 32 MB default. + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); + // Takes ownership of fd. name is used for messages. + explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); + + ~FilePiece(); + + char get() { + if (position_ == position_end_) { + Shift(); + if (at_end_) throw EndOfFileException(); + } + return *(position_++); + } + + // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). + StringPiece ReadDelimited(const bool *delim = kSpaces) { + SkipSpaces(delim); + return Consume(FindDelimiterOrEOF(delim)); + } + + // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. + // It is similar to getline in that way. + StringPiece ReadLine(char delim = '\n'); + + float ReadFloat(); + double ReadDouble(); + long int ReadLong(); + unsigned long int ReadULong(); + + // Skip spaces defined by isspace. + void SkipSpaces(const bool *delim = kSpaces) { + for (; ; ++position_) { + if (position_ == position_end_) Shift(); + if (!delim[static_cast<unsigned char>(*position_)]) return; + } + } + + uint64_t Offset() const { + return position_ - data_.begin() + mapped_offset_; + } + + const std::string &FileName() const { return file_name_; } + + private: + void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); + + template <class T> T ReadNumber(); + + StringPiece Consume(const char *to) { + StringPiece ret(position_, to - position_); + position_ = to; + return ret; + } + + const char *FindDelimiterOrEOF(const bool *delim = kSpaces); + + void Shift(); + // Backends to Shift(). + void MMapShift(uint64_t desired_begin); + + void TransitionToRead(); + void ReadShift(); + + const char *position_, *last_space_, *position_end_; + + scoped_fd file_; + const uint64_t total_size_; + const uint64_t page_; + + std::size_t default_map_size_; + uint64_t mapped_offset_; + + // Order matters: file_ should always be destroyed after this. + scoped_memory data_; + + bool at_end_; + bool fallback_to_read_; + + ErsatzProgress progress_; + + std::string file_name_; + +#ifdef HAVE_ZLIB + void *gz_file_; +#endif // HAVE_ZLIB +}; + +} // namespace util + +#endif // UTIL_FILE_PIECE__ diff --git a/util/file_piece_test.cc b/util/file_piece_test.cc new file mode 100644 index 000000000..dc9ec7e7c --- /dev/null +++ b/util/file_piece_test.cc @@ -0,0 +1,102 @@ +#include "util/file_piece.hh" + +#include "util/scoped.hh" + +#define BOOST_TEST_MODULE FilePieceTest +#include <boost/test/unit_test.hpp> +#include <fstream> +#include <iostream> + +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> + +namespace util { +namespace { + +/* mmap implementation */ +BOOST_AUTO_TEST_CASE(MMapReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + FilePiece test("file_piece.cc", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +#ifndef __APPLE__ +/* Apple isn't happy with the popen, fileno, dup. And I don't want to + * reimplement popen. This is an issue with the test. + */ +/* read() implementation */ +BOOST_AUTO_TEST_CASE(StreamReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + + FILE *catter = popen("cat file_piece.cc", "r"); + BOOST_REQUIRE(catter); + + FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_REQUIRE(!pclose(catter)); +} +#endif // __APPLE__ + +#ifdef HAVE_ZLIB + +// gzip file +BOOST_AUTO_TEST_CASE(PlainZipReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + + BOOST_REQUIRE_EQUAL(0, system("gzip <file_piece.cc >file_piece.cc.gz")); + FilePiece test("file_piece.cc.gz", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +// gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with +// the test. +#ifndef __APPLE__ +BOOST_AUTO_TEST_CASE(StreamZipReadLine) { + std::fstream ref("file_piece.cc", std::ios::in); + + FILE * catter = popen("gzip <file_piece.cc", "r"); + BOOST_REQUIRE(catter); + + FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_REQUIRE(!pclose(catter)); +} +#endif // __APPLE__ + +#endif // HAVE_ZLIB + +} // namespace +} // namespace util diff --git a/util/getopt.hh b/util/getopt.hh new file mode 100755 index 000000000..33a706833 --- /dev/null +++ b/util/getopt.hh @@ -0,0 +1,190 @@ + +/* getopt.h */ +/* Declarations for getopt. + Copyright (C) 1989-1994, 1996-1999, 2001 Free Software + Foundation, Inc. This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute + it and/or modify it under the terms of the GNU Lesser + General Public License as published by the Free Software + Foundation; either version 2.1 of the License, or + (at your option) any later version. + + The GNU C Library is distributed in the hope that it will + be useful, but WITHOUT ANY WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A + PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General + Public License along with the GNU C Library; if not, write + to the Free Software Foundation, Inc., 59 Temple Place, + Suite 330, Boston, MA 02111-1307 USA. */ + + + + +#ifndef _GETOPT_H + +#ifndef __need_getopt +# define _GETOPT_H 1 +#endif + +/* If __GNU_LIBRARY__ is not already defined, either we are being used + standalone, or this is the first header included in the source file. + If we are being used with glibc, we need to include <features.h>, but + that does not exist if we are standalone. So: if __GNU_LIBRARY__ is + not defined, include <ctype.h>, which will pull in <features.h> for us + if it's from glibc. (Why ctype.h? It's guaranteed to exist and it + doesn't flood the namespace with stuff the way some other headers do.) */ +#if !defined __GNU_LIBRARY__ +# include <ctype.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +int getopt (int argc, char *const *argv, const char *optstring); + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#ifndef __need_getopt +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. + + The field `has_arg' is: + no_argument (or 0) if the option does not take an argument, + required_argument (or 1) if the option requires an argument, + optional_argument (or 2) if the option takes an optional argument. + + If the field `flag' is not NULL, it points to a variable that is set + to the value given in the field `val' when the option is found, but + left unchanged if the option is not found. + + To have a long-named option do something other than set an `int' to + a compiled-in constant, such as set a value from `optarg', set the + option's `flag' field to zero and its `val' field to a nonzero + value (the equivalent single-letter option character, if there is + one). For long options that have a zero `flag' field, `getopt' + returns the contents of the `val' field. */ + +struct option +{ +# if (defined __STDC__ && __STDC__) || defined __cplusplus + const char *name; +# else + char *name; +# endif + /* has_arg can't be an enum because some compilers complain about + type mismatches in all the code that assumes it is an int. */ + int has_arg; + int *flag; + int val; +}; + +/* Names for the values of the `has_arg' field of `struct option'. */ + +# define no_argument 0 +# define required_argument 1 +# define optional_argument 2 +#endif /* need getopt */ + + +/* Get definitions and prototypes for functions to process the + arguments in ARGV (ARGC of them, minus the program name) for + options given in OPTS. + + Return the option character from OPTS just read. Return -1 when + there are no more options. For unrecognized options, or options + missing arguments, `optopt' is set to the option letter, and '?' is + returned. + + The OPTS string is a list of characters which are recognized option + letters, optionally followed by colons, specifying that that letter + takes an argument, to be placed in `optarg'. + + If a letter in OPTS is followed by two colons, its argument is + optional. This behavior is specific to the GNU `getopt'. + + The argument `--' causes premature termination of argument + scanning, explicitly telling `getopt' that there are no more + options. + + If OPTS begins with `--', then non-option arguments are treated as + arguments to the option '\0'. This behavior is specific to the GNU + `getopt'. */ + +#if (defined __STDC__ && __STDC__) || defined __cplusplus +# ifdef __GNU_LIBRARY__ +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. */ +extern int getopt (int ___argc, char *const *___argv, const char *__shortopts); +# else /* not __GNU_LIBRARY__ */ +//extern int getopt (); +# endif /* __GNU_LIBRARY__ */ + +# ifndef __need_getopt +extern int getopt_long (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); +extern int getopt_long_only (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int ___argc, char *const *___argv, + const char *__shortopts, + const struct option *__longopts, int *__longind, + int __long_only); +# endif +#else /* not __STDC__ */ +extern int getopt (); +# ifndef __need_getopt +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +# endif +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +/* Make sure we later can get all the definitions and declarations. */ +#undef __need_getopt + +#endif /* getopt.h */ diff --git a/util/have.hh b/util/have.hh new file mode 100644 index 000000000..9e1fc20cd --- /dev/null +++ b/util/have.hh @@ -0,0 +1,9 @@ +/* This ties kenlm's config into Moses's build system. If you are using kenlm + * outside Moses, see http://kheafield.com/code/kenlm/developers/ . + */ +#ifndef UTIL_HAVE__ +#define UTIL_HAVE__ + +#define HAVE_ZLIB + +#endif // UTIL_HAVE__ diff --git a/util/joint_sort.hh b/util/joint_sort.hh new file mode 100644 index 000000000..cf3d84321 --- /dev/null +++ b/util/joint_sort.hh @@ -0,0 +1,151 @@ +#ifndef UTIL_JOINT_SORT__ +#define UTIL_JOINT_SORT__ + +/* A terrifying amount of C++ to coax std::sort into soring one range while + * also permuting another range the same way. + */ + +#include "util/proxy_iterator.hh" + +#include <algorithm> +#include <functional> +#include <iostream> + +namespace util { + +namespace detail { + +template <class KeyIter, class ValueIter> class JointProxy; + +template <class KeyIter, class ValueIter> class JointIter { + public: + JointIter() {} + + JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {} + + bool operator==(const JointIter<KeyIter, ValueIter> &other) const { return key_ == other.key_; } + + bool operator<(const JointIter<KeyIter, ValueIter> &other) const { return (key_ < other.key_); } + + std::ptrdiff_t operator-(const JointIter<KeyIter, ValueIter> &other) const { return key_ - other.key_; } + + JointIter<KeyIter, ValueIter> &operator+=(std::ptrdiff_t amount) { + key_ += amount; + value_ += amount; + return *this; + } + + void swap(const JointIter &other) { + std::swap(key_, other.key_); + std::swap(value_, other.value_); + } + + private: + friend class JointProxy<KeyIter, ValueIter>; + KeyIter key_; + ValueIter value_; +}; + +template <class KeyIter, class ValueIter> class JointProxy { + private: + typedef JointIter<KeyIter, ValueIter> InnerIterator; + + public: + typedef struct { + typename std::iterator_traits<KeyIter>::value_type key; + typename std::iterator_traits<ValueIter>::value_type value; + const typename std::iterator_traits<KeyIter>::value_type &GetKey() const { return key; } + } value_type; + + JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {} + JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {} + + operator const value_type() const { + value_type ret; + ret.key = *inner_.key_; + ret.value = *inner_.value_; + return ret; + } + + JointProxy &operator=(const JointProxy &other) { + *inner_.key_ = *other.inner_.key_; + *inner_.value_ = *other.inner_.value_; + return *this; + } + + JointProxy &operator=(const value_type &other) { + *inner_.key_ = other.key; + *inner_.value_ = other.value; + return *this; + } + + typename std::iterator_traits<KeyIter>::reference GetKey() const { + return *(inner_.key_); + } + + void swap(JointProxy<KeyIter, ValueIter> &other) { + std::swap(*inner_.key_, *other.inner_.key_); + std::swap(*inner_.value_, *other.inner_.value_); + } + + private: + friend class ProxyIterator<JointProxy<KeyIter, ValueIter> >; + + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + InnerIterator inner_; +}; + +template <class Proxy, class Less> class LessWrapper : public std::binary_function<const typename Proxy::value_type &, const typename Proxy::value_type &, bool> { + public: + explicit LessWrapper(const Less &less) : less_(less) {} + + bool operator()(const Proxy &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const Proxy &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + + private: + const Less less_; +}; + +} // namespace detail + +template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > { + public: + PairedIterator(const KeyIter &key, const ValueIter &value) : + ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {} +}; + +template <class KeyIter, class ValueIter, class Less> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) { + ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > full_begin(detail::JointProxy<KeyIter, ValueIter>(key_begin, value_begin)); + detail::LessWrapper<detail::JointProxy<KeyIter, ValueIter>, Less> less_wrap(less); + std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap); +} + + +template <class KeyIter, class ValueIter> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) { + JointSort(key_begin, key_end, value_begin, std::less<typename std::iterator_traits<KeyIter>::value_type>()); +} + +} // namespace util + +namespace std { +template <class KeyIter, class ValueIter> void swap(util::detail::JointIter<KeyIter, ValueIter> &left, util::detail::JointIter<KeyIter, ValueIter> &right) { + left.swap(right); +} + +template <class KeyIter, class ValueIter> void swap(util::detail::JointProxy<KeyIter, ValueIter> &left, util::detail::JointProxy<KeyIter, ValueIter> &right) { + left.swap(right); +} +} // namespace std + +#endif // UTIL_JOINT_SORT__ diff --git a/util/joint_sort_test.cc b/util/joint_sort_test.cc new file mode 100644 index 000000000..4dc859164 --- /dev/null +++ b/util/joint_sort_test.cc @@ -0,0 +1,50 @@ +#include "util/joint_sort.hh" + +#define BOOST_TEST_MODULE JointSortTest +#include <boost/test/unit_test.hpp> + +namespace util { namespace { + +BOOST_AUTO_TEST_CASE(just_flip) { + char keys[2]; + int values[2]; + keys[0] = 1; values[0] = 327; + keys[1] = 0; values[1] = 87897; + JointSort<char *, int *>(keys + 0, keys + 2, values + 0); + BOOST_CHECK_EQUAL(0, keys[0]); + BOOST_CHECK_EQUAL(87897, values[0]); + BOOST_CHECK_EQUAL(1, keys[1]); + BOOST_CHECK_EQUAL(327, values[1]); +} + +BOOST_AUTO_TEST_CASE(three) { + char keys[3]; + int values[3]; + keys[0] = 1; values[0] = 327; + keys[1] = 2; values[1] = 87897; + keys[2] = 0; values[2] = 10; + JointSort<char *, int *>(keys + 0, keys + 3, values + 0); + BOOST_CHECK_EQUAL(0, keys[0]); + BOOST_CHECK_EQUAL(1, keys[1]); + BOOST_CHECK_EQUAL(2, keys[2]); +} + +BOOST_AUTO_TEST_CASE(char_int) { + char keys[4]; + int values[4]; + keys[0] = 3; values[0] = 327; + keys[1] = 1; values[1] = 87897; + keys[2] = 2; values[2] = 10; + keys[3] = 0; values[3] = 24347; + JointSort<char *, int *>(keys + 0, keys + 4, values + 0); + BOOST_CHECK_EQUAL(0, keys[0]); + BOOST_CHECK_EQUAL(24347, values[0]); + BOOST_CHECK_EQUAL(1, keys[1]); + BOOST_CHECK_EQUAL(87897, values[1]); + BOOST_CHECK_EQUAL(2, keys[2]); + BOOST_CHECK_EQUAL(10, values[2]); + BOOST_CHECK_EQUAL(3, keys[3]); + BOOST_CHECK_EQUAL(327, values[3]); +} + +}} // namespace anonymous util diff --git a/util/key_value_packing.hh b/util/key_value_packing.hh new file mode 100644 index 000000000..8339980b5 --- /dev/null +++ b/util/key_value_packing.hh @@ -0,0 +1,126 @@ +#ifndef UTIL_KEY_VALUE_PACKING__ +#define UTIL_KEY_VALUE_PACKING__ + +/* Why such a general interface? I'm planning on doing bit-level packing. */ + +#include <algorithm> +#include <cstddef> +#include <cstring> + +#include <stdint.h> + +namespace util { + +template <class Key, class Value> struct Entry { + Key key; + Value value; + + const Key &GetKey() const { return key; } + const Value &GetValue() const { return value; } + + Value &MutableValue() { return value; } + + void Set(const Key &key_in, const Value &value_in) { + SetKey(key_in); + SetValue(value_in); + } + void SetKey(const Key &key_in) { key = key_in; } + void SetValue(const Value &value_in) { value = value_in; } + + bool operator<(const Entry<Key, Value> &other) const { return GetKey() < other.GetKey(); } +}; + +// And now for a brief interlude to specialize std::swap. +} // namespace util +namespace std { +template <class Key, class Value> void swap(util::Entry<Key, Value> &first, util::Entry<Key, Value> &second) { + swap(first.key, second.key); + swap(first.value, second.value); +} +}// namespace std +namespace util { + +template <class KeyT, class ValueT> class AlignedPacking { + public: + typedef KeyT Key; + typedef ValueT Value; + + public: + static const std::size_t kBytes = sizeof(Entry<Key, Value>); + static const std::size_t kBits = kBytes * 8; + + typedef Entry<Key, Value> * MutableIterator; + typedef const Entry<Key, Value> * ConstIterator; + typedef const Entry<Key, Value> & ConstReference; + + static MutableIterator FromVoid(void *start) { + return reinterpret_cast<MutableIterator>(start); + } + + static Entry<Key, Value> Make(const Key &key, const Value &value) { + Entry<Key, Value> ret; + ret.Set(key, value); + return ret; + } +}; + +template <class KeyT, class ValueT> class ByteAlignedPacking { + public: + typedef KeyT Key; + typedef ValueT Value; + + private: +#pragma pack(push) +#pragma pack(1) + struct RawEntry { + Key key; + Value value; + + const Key &GetKey() const { return key; } + const Value &GetValue() const { return value; } + + Value &MutableValue() { return value; } + + void Set(const Key &key_in, const Value &value_in) { + SetKey(key_in); + SetValue(value_in); + } + void SetKey(const Key &key_in) { key = key_in; } + void SetValue(const Value &value_in) { value = value_in; } + + bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); } + }; +#pragma pack(pop) + + friend void std::swap<>(RawEntry&, RawEntry&); + + public: + typedef RawEntry *MutableIterator; + typedef const RawEntry *ConstIterator; + typedef RawEntry &ConstReference; + + static const std::size_t kBytes = sizeof(RawEntry); + static const std::size_t kBits = kBytes * 8; + + static MutableIterator FromVoid(void *start) { + return MutableIterator(reinterpret_cast<RawEntry*>(start)); + } + + static RawEntry Make(const Key &key, const Value &value) { + RawEntry ret; + ret.Set(key, value); + return ret; + } +}; + +} // namespace util +namespace std { +template <class Key, class Value> void swap( + typename util::ByteAlignedPacking<Key, Value>::RawEntry &first, + typename util::ByteAlignedPacking<Key, Value>::RawEntry &second) { + swap(first.key, second.key); + swap(first.value, second.value); +} +}// namespace std + +#endif // UTIL_KEY_VALUE_PACKING__ diff --git a/util/key_value_packing_test.cc b/util/key_value_packing_test.cc new file mode 100644 index 000000000..a0d33fd76 --- /dev/null +++ b/util/key_value_packing_test.cc @@ -0,0 +1,75 @@ +#include "util/key_value_packing.hh" + +#include <boost/random/mersenne_twister.hpp> +#include <boost/random/uniform_int.hpp> +#include <boost/random/variate_generator.hpp> +#include <boost/scoped_array.hpp> +#define BOOST_TEST_MODULE KeyValueStoreTest +#include <boost/test/unit_test.hpp> + +#include <limits> +#include <stdlib.h> + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(basic_in_out) { + typedef ByteAlignedPacking<uint64_t, unsigned char> Packing; + void *backing = malloc(Packing::kBytes * 2); + Packing::MutableIterator i(Packing::FromVoid(backing)); + i->SetKey(10); + BOOST_CHECK_EQUAL(10, i->GetKey()); + i->SetValue(3); + BOOST_CHECK_EQUAL(3, i->GetValue()); + ++i; + i->SetKey(5); + BOOST_CHECK_EQUAL(5, i->GetKey()); + i->SetValue(42); + BOOST_CHECK_EQUAL(42, i->GetValue()); + + Packing::ConstIterator c(i); + BOOST_CHECK_EQUAL(5, c->GetKey()); + --c; + BOOST_CHECK_EQUAL(10, c->GetKey()); + BOOST_CHECK_EQUAL(42, i->GetValue()); + + BOOST_CHECK_EQUAL(5, i->GetKey()); + free(backing); +} + +BOOST_AUTO_TEST_CASE(simple_sort) { + typedef ByteAlignedPacking<uint64_t, unsigned char> Packing; + char foo[Packing::kBytes * 4]; + Packing::MutableIterator begin(Packing::FromVoid(foo)); + Packing::MutableIterator i = begin; + i->SetKey(0); ++i; + i->SetKey(2); ++i; + i->SetKey(3); ++i; + i->SetKey(1); ++i; + std::sort(begin, i); + BOOST_CHECK_EQUAL(0, begin[0].GetKey()); + BOOST_CHECK_EQUAL(1, begin[1].GetKey()); + BOOST_CHECK_EQUAL(2, begin[2].GetKey()); + BOOST_CHECK_EQUAL(3, begin[3].GetKey()); +} + +BOOST_AUTO_TEST_CASE(big_sort) { + typedef ByteAlignedPacking<uint64_t, unsigned char> Packing; + boost::scoped_array<char> memory(new char[Packing::kBytes * 1000]); + Packing::MutableIterator begin(Packing::FromVoid(memory.get())); + + boost::mt19937 rng; + boost::uniform_int<uint64_t> range(0, std::numeric_limits<uint64_t>::max()); + boost::variate_generator<boost::mt19937&, boost::uniform_int<uint64_t> > gen(rng, range); + + for (size_t i = 0; i < 1000; ++i) { + (begin + i)->SetKey(gen()); + } + std::sort(begin, begin + 1000); + for (size_t i = 0; i < 999; ++i) { + BOOST_CHECK(begin[i] < begin[i+1]); + } +} + +} // namespace +} // namespace util diff --git a/util/mmap.cc b/util/mmap.cc new file mode 100644 index 000000000..3dfe0ab2c --- /dev/null +++ b/util/mmap.cc @@ -0,0 +1,191 @@ +/* Memory mapping wrappers. + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ +#include "util/mmap.hh" + +#include "util/exception.hh" +#include "util/file.hh" + +#include <iostream> + +#include <assert.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#if defined(_WIN32) || defined(_WIN64) +#include <windows.h> +#else +#include <sys/mman.h> +#endif +#include <stdlib.h> +#include <unistd.h> + +namespace util { + +long SizePage() { +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwAllocationGranularity; +#else + return sysconf(_SC_PAGE_SIZE); +#endif +} + +void SyncOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap"); +#else + UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap"); +#endif +} + +void UnmapOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file"); +#else + UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed"); +#endif +} + +scoped_mmap::~scoped_mmap() { + if (data_ != (void*)-1) { + try { + // Thanks Denis Filimonov for pointing out NFS likes msync first. + SyncOrThrow(data_, size_); + UnmapOrThrow(data_, size_); + } catch (const util::ErrnoException &e) { + std::cerr << e.what(); + abort(); + } + } +} + +void scoped_memory::reset(void *data, std::size_t size, Alloc source) { + switch(source_) { + case MMAP_ALLOCATED: + scoped_mmap(data_, size_); + break; + case ARRAY_ALLOCATED: + delete [] reinterpret_cast<char*>(data_); + break; + case MALLOC_ALLOCATED: + free(data_); + break; + case NONE_ALLOCATED: + break; + } + data_ = data; + size_ = size; + source_ = source; +} + +void scoped_memory::call_realloc(std::size_t size) { + assert(source_ == MALLOC_ALLOCATED || source_ == NONE_ALLOCATED); + void *new_data = realloc(data_, size); + if (!new_data) { + reset(); + } else { + reset(new_data, size, MALLOC_ALLOCATED); + } +} + +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) { +#ifdef MAP_POPULATE // Linux specific + if (prefault) { + flags |= MAP_POPULATE; + } +#endif +#if defined(_WIN32) || defined(_WIN64) + int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY; + int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ; + HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, 0, size + offset, NULL); + UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed"); + ret = MapViewOfFile(hMapping, protectM, 0, offset, size); + CloseHandle(hMapping); + UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed"); +#else + int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; + void *ret = mmap(NULL, size, protect, flags, fd, offset); + UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset); +#endif + return ret; +} + +const int kFileFlags = +#if defined(_WIN32) || defined(_WIN64) + 0 // MapOrThrow ignores flags on windows +#elif defined(MAP_FILE) + MAP_FILE | MAP_SHARED +#else + MAP_SHARED +#endif + ; + +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) { + switch (method) { + case LAZY: + out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; + case POPULATE_OR_LAZY: +#ifdef MAP_POPULATE + case POPULATE_OR_READ: +#endif + out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED); + break; +#ifndef MAP_POPULATE + case POPULATE_OR_READ: +#endif + case READ: + out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED); + if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc"); + SeekOrThrow(fd, offset); + ReadOrThrow(fd, out.get(), size); + break; + } +} + +void *MapAnonymous(std::size_t size) { + return MapOrThrow(size, true, +#if defined(_WIN32) || defined(_WIN64) + 0 // MapOrThrow ignores the flags anyway. +#elif defined(MAP_ANONYMOUS) + MAP_ANONYMOUS | MAP_PRIVATE // Linux +#else + MAP_ANON | MAP_PRIVATE // BSD +#endif + , false, -1, 0); +} + +void *MapZeroedWrite(int fd, std::size_t size) { + UTIL_THROW_IF(-1 == ftruncate(fd, 0), ErrnoException, "ftruncate on fd " << fd << " to 0 failed"); + UTIL_THROW_IF(-1 == ftruncate(fd, size), ErrnoException, "ftruncate on fd " << fd << " to " << size << " failed"); + return MapOrThrow(size, true, kFileFlags, false, fd, 0); +} + +namespace { + +int CreateOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); +#endif + return ret; +} + +} // namespace + +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { + file.reset(CreateOrThrow(name)); + try { + return MapZeroedWrite(file.get(), size); + } catch (ErrnoException &e) { + e << " in file " << name; + throw; + } +} + +} // namespace util diff --git a/util/mmap.hh b/util/mmap.hh new file mode 100644 index 000000000..921147c34 --- /dev/null +++ b/util/mmap.hh @@ -0,0 +1,114 @@ +#ifndef UTIL_MMAP__ +#define UTIL_MMAP__ +// Utilities for mmaped files. + +#include <cstddef> + +#include <stdint.h> +#include <sys/types.h> + +namespace util { + +class scoped_fd; + +long SizePage(); + +// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. +class scoped_mmap { + public: + scoped_mmap() : data_((void*)-1), size_(0) {} + scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {} + ~scoped_mmap(); + + void *get() const { return data_; } + + const uint8_t *begin() const { return reinterpret_cast<uint8_t*>(data_); } + const uint8_t *end() const { return reinterpret_cast<uint8_t*>(data_) + size_; } + std::size_t size() const { return size_; } + + void reset(void *data, std::size_t size) { + scoped_mmap other(data_, size_); + data_ = data; + size_ = size; + } + + void reset() { + reset((void*)-1, 0); + } + + private: + void *data_; + std::size_t size_; + + scoped_mmap(const scoped_mmap &); + scoped_mmap &operator=(const scoped_mmap &); +}; + +/* For when the memory might come from mmap, new char[], or malloc. Uses NULL + * and 0 for blanks even though mmap signals errors with (void*)-1). The reset + * function checks that blank for mmap. + */ +class scoped_memory { + public: + typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc; + + scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {} + + ~scoped_memory() { reset(); } + + void *get() const { return data_; } + const char *begin() const { return reinterpret_cast<char*>(data_); } + const char *end() const { return reinterpret_cast<char*>(data_) + size_; } + std::size_t size() const { return size_; } + + Alloc source() const { return source_; } + + void reset() { reset(NULL, 0, NONE_ALLOCATED); } + + void reset(void *data, std::size_t size, Alloc from); + + // realloc allows the current data to escape hence the need for this call + // If realloc fails, destroys the original too and get() returns NULL. + void call_realloc(std::size_t to); + + private: + + void *data_; + std::size_t size_; + + Alloc source_; + + scoped_memory(const scoped_memory &); + scoped_memory &operator=(const scoped_memory &); +}; + +typedef enum { + // mmap with no prepopulate + LAZY, + // On linux, pass MAP_POPULATE to mmap. + POPULATE_OR_LAZY, + // Populate on Linux. malloc and read on non-Linux. + POPULATE_OR_READ, + // malloc and read. + READ +} LoadMethod; + +extern const int kFileFlags; + +// Wrapper around mmap to check it worked and hide some platform macros. +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); + +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); + +void *MapAnonymous(std::size_t size); + +// Open file name with mmap of size bytes, all of which are initially zero. +void *MapZeroedWrite(int fd, std::size_t size); +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); + +// msync wrapper +void SyncOrThrow(void *start, size_t length); + +} // namespace util + +#endif // UTIL_MMAP__ diff --git a/util/murmur_hash.cc b/util/murmur_hash.cc new file mode 100644 index 000000000..ef5783fec --- /dev/null +++ b/util/murmur_hash.cc @@ -0,0 +1,134 @@ +/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All + * code is released to the public domain. For business purposes, Murmurhash is + * under the MIT license." + * This is modified from the original: + * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit. + * length changed to unsigned int. + * placed in namespace util + * add MurmurHashNative + * default option = 0 for seed + */ + +#include "util/murmur_hash.hh" + +namespace util { + +//----------------------------------------------------------------------------- +// MurmurHash2, 64-bit versions, by Austin Appleby + +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. + +// 64-bit hash for 64-bit platforms + +uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995ULL; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + + +// 64-bit hash for 32-bit platforms + +uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h1 = seed ^ len; + unsigned int h2 = 0; + + const unsigned int * data = (const unsigned int *)key; + + while(len >= 8) + { + unsigned int k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + + unsigned int k2 = *data++; + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; + len -= 4; + } + + if(len >= 4) + { + unsigned int k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + } + + switch(len) + { + case 3: h2 ^= ((unsigned char*)data)[2] << 16; + case 2: h2 ^= ((unsigned char*)data)[1] << 8; + case 1: h2 ^= ((unsigned char*)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; h1 *= m; + h2 ^= h1 >> 22; h2 *= m; + h1 ^= h2 >> 17; h1 *= m; + h2 ^= h1 >> 19; h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} +// Trick to test for 64-bit architecture at compile time. +namespace { +template <unsigned L> uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, unsigned int seed) { + return MurmurHash64A(key, len, seed); +} +template <> uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, unsigned int seed) { + return MurmurHash64B(key, len, seed); +} +} // namespace + +uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed) { + return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed); +} + +} // namespace util diff --git a/util/murmur_hash.hh b/util/murmur_hash.hh new file mode 100644 index 000000000..638aaeb22 --- /dev/null +++ b/util/murmur_hash.hh @@ -0,0 +1,14 @@ +#ifndef UTIL_MURMUR_HASH__ +#define UTIL_MURMUR_HASH__ +#include <cstddef> +#include <stdint.h> + +namespace util { + +uint64_t MurmurHash64A(const void * key, std::size_t len, unsigned int seed = 0); +uint64_t MurmurHash64B(const void * key, std::size_t len, unsigned int seed = 0); +uint64_t MurmurHashNative(const void * key, std::size_t len, unsigned int seed = 0); + +} // namespace util + +#endif // UTIL_MURMUR_HASH__ diff --git a/util/portability.cc b/util/portability.cc new file mode 100644 index 000000000..2efd74cba --- /dev/null +++ b/util/portability.cc @@ -0,0 +1,74 @@ + +#include <stdlib.h> +#include <errno.h> +#include "util/portability.hh" + +#ifdef WIN32 + +int RUSAGE_SELF = 0; + +int sysconf(int) { return 0; } +int msync(void*, int, int) { return 0; } +int munmap(void *, int) { return 0; } +void *mmap(void*, int, int, int, FD, OFF_T) { return 0; } +int write(int, const void *, int) {return 0; } + +//FILE *popen(const char*, const char*) { return 0; } +//int pclose(FILE *) { return 0; } +int close(FD fd) { return 0; } + + +// to be implemented by boost +int mkdtemp(const char*) { return 0; } + +// done +long lrint(float x) +{ + long ret = (long) x; + return ret; +} + +float strtof(const char *begin, char **end) +{ + double ret = strtod(begin, end); + return (float) ret; +} + + +int ftruncate (FD hfile, unsigned int size) +{ + unsigned int curpos; + /* + HANDLE hfile; + + if (fd < 0) + { + errno = EBADF; + return -1; + } + + hfile = (HANDLE) _get_osfhandle (fd); + */ + curpos = SetFilePointer (hfile, 0, NULL, FILE_CURRENT); + if (curpos == ~0 + || SetFilePointer (hfile, size, NULL, FILE_BEGIN) == ~0 + || !SetEndOfFile (hfile)) + { + int error = GetLastError (); + switch (error) + { + case ERROR_INVALID_HANDLE: + errno = EBADF; + break; + default: + errno = EIO; + break; + } + return -1; + } + return 0; +} + +#endif + + diff --git a/util/portability.hh b/util/portability.hh new file mode 100644 index 000000000..acbc01922 --- /dev/null +++ b/util/portability.hh @@ -0,0 +1,115 @@ + +#pragma once + +#include <assert.h> +#include <stdint.h> + +#ifdef WIN32 + +#include <windows.h> +#include <direct.h> +#include <io.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include "util/getopt.hh" + +#undef max +#undef min + +typedef HANDLE FD; + +const FD kBadFD = INVALID_HANDLE_VALUE; + +typedef int ssize_t; + +#define _SC_PAGE_SIZE 1 +#define MS_SYNC 1 + +int sysconf(int); +int msync(void*, int, int); +int ftruncate(FD, unsigned int); + +long lrint(float); + +//inline int getrusage(int, struct rusage*) { return 0; } +//extern int RUSAGE_SELF; + +typedef __int64 OFF_T; +//#define OFF_T __int64 + +#ifndef S_ISDIR +#define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) +#endif + +#ifndef S_ISREG +#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) +#endif + +int mkdtemp(const char*); +int munmap(void *, int); +void *mmap(void*, int, int, int, FD, OFF_T); + +#define PROT_READ 1 +#define PROT_WRITE 1 +#define MAP_FAILED (void*) 0x1 +#define MAP_SHARED 1 +#define MAP_ANON 1 +#define MAP_PRIVATE 1 +#define S_IRUSR 1 +#define S_IROTH 1 +#define S_IRGRP 1 + +int write(int, const void *, int); +#define S_IRUSR 1 +#define S_IWUSR 1 + +//const char *strerror_r(int, const char *buf, int); + +float strtof(const char *begin, char **end); +//FILE *popen(const char*, const char*); +//int pclose(FILE *); +int close(FD fd); + +#define dup(x) _dup(x) +#define rmdir(x) _rmdir(x) +#define strerror_r(errNum, buffer, numberOfElements) strerror_s(buffer, numberOfElements); + +#else // assume UNIX OS + +#include <stdint.h> +#include <sys/resource.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> + +typedef int FD; +const FD kBadFD = -1; + +typedef off_t OFF_T; + +#endif + +#ifdef __GNUC__ +#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ +#else +#ifdef _WIN32 +#define UTIL_FUNC_NAME __FUNCTION__ +#else +#define UTIL_FUNC_NAME NULL +#endif +#endif + +/* Bit-level packing routines */ +#ifdef __APPLE__ + #include <architecture/byte_order.h> +#elif __linux__ + #include <endian.h> +#elif WIN32 + // nothing +#else + #include <arpa/nameser_compat.h> +#endif + diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh new file mode 100644 index 000000000..8122d69c5 --- /dev/null +++ b/util/probing_hash_table.hh @@ -0,0 +1,117 @@ +#ifndef UTIL_PROBING_HASH_TABLE__ +#define UTIL_PROBING_HASH_TABLE__ + +#include "util/exception.hh" + +#include <algorithm> +#include <cstddef> +#include <functional> + +#include <assert.h> + +namespace util { + +/* Thrown when table grows too large */ +class ProbingSizeException : public Exception { + public: + ProbingSizeException() throw() {} + ~ProbingSizeException() throw() {} +}; + +/* Non-standard hash table + * Buckets must be set at the beginning and must be greater than maximum number + * of elements, else an infinite loop happens. + * Memory management and initialization is externalized to make it easier to + * serialize these to disk and load them quickly. + * Uses linear probing to find value. + * Only insert and lookup operations. + */ + +template <class PackingT, class HashT, class EqualT = std::equal_to<typename PackingT::Key> > class ProbingHashTable { + public: + typedef PackingT Packing; + typedef typename Packing::Key Key; + typedef typename Packing::MutableIterator MutableIterator; + typedef typename Packing::ConstIterator ConstIterator; + + typedef HashT Hash; + typedef EqualT Equal; + + static std::size_t Size(std::size_t entries, float multiplier) { + return std::max(entries + 1, static_cast<std::size_t>(multiplier * static_cast<float>(entries))) * Packing::kBytes; + } + + // Must be assigned to later. + ProbingHashTable() : entries_(0) +#ifdef DEBUG + , initialized_(false) +#endif + {} + + ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) + : begin_(Packing::FromVoid(start)), + buckets_(allocated / Packing::kBytes), + end_(begin_ + (allocated / Packing::kBytes)), + invalid_(invalid), + hash_(hash_func), + equal_(equal_func), + entries_(0) +#ifdef DEBUG + , initialized_(true) +#endif + {} + + template <class T> MutableIterator Insert(const T &t) { + if (++entries_ >= buckets_) + UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); +#ifdef DEBUG + assert(initialized_); +#endif + for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { + if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } + if (++i == end_) { i = begin_; } + } + } + + void FinishedInserting() {} + + void LoadedBinary() {} + + // Don't change anything related to GetKey, + template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { + for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, key)) { out = i; return true; } + if (equal_(got, invalid_)) return false; + if (++i == end_) i = begin_; + } + } + + template <class Key> bool Find(const Key key, ConstIterator &out) const { +#ifdef DEBUG + assert(initialized_); +#endif + for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) { + Key got(i->GetKey()); + if (equal_(got, key)) { out = i; return true; } + if (equal_(got, invalid_)) return false; + if (++i == end_) i = begin_; + } + } + + private: + MutableIterator begin_; + std::size_t buckets_; + MutableIterator end_; + Key invalid_; + Hash hash_; + Equal equal_; + std::size_t entries_; +#ifdef DEBUG + bool initialized_; +#endif +}; + +} // namespace util + +#endif // UTIL_PROBING_HASH_TABLE__ diff --git a/util/probing_hash_table_test.cc b/util/probing_hash_table_test.cc new file mode 100644 index 000000000..ff2f5af31 --- /dev/null +++ b/util/probing_hash_table_test.cc @@ -0,0 +1,30 @@ +#include "util/probing_hash_table.hh" + +#include "util/key_value_packing.hh" + +#define BOOST_TEST_MODULE ProbingHashTableTest +#include <boost/test/unit_test.hpp> +#include <boost/functional/hash.hpp> + +namespace util { +namespace { + +typedef AlignedPacking<char, uint64_t> Packing; +typedef ProbingHashTable<Packing, boost::hash<char> > Table; + +BOOST_AUTO_TEST_CASE(simple) { + char mem[Table::Size(10, 1.2)]; + memset(mem, 0, sizeof(mem)); + + Table table(mem, sizeof(mem)); + Packing::ConstIterator i = Packing::ConstIterator(); + BOOST_CHECK(!table.Find(2, i)); + table.Insert(Packing::Make(3, 328920)); + BOOST_REQUIRE(table.Find(3, i)); + BOOST_CHECK_EQUAL(3, i->GetKey()); + BOOST_CHECK_EQUAL(static_cast<uint64_t>(328920), i->GetValue()); + BOOST_CHECK(!table.Find(2, i)); +} + +} // namespace +} // namespace util diff --git a/util/proxy_iterator.hh b/util/proxy_iterator.hh new file mode 100644 index 000000000..121a45fa3 --- /dev/null +++ b/util/proxy_iterator.hh @@ -0,0 +1,96 @@ +#ifndef UTIL_PROXY_ITERATOR__ +#define UTIL_PROXY_ITERATOR__ + +#include <cstddef> +#include <iterator> + +/* This is a RandomAccessIterator that uses a proxy to access the underlying + * data. Useful for packing data at bit offsets but still using STL + * algorithms. + * + * Normally I would use boost::iterator_facade but some people are too lazy to + * install boost and still want to use my language model. It's amazing how + * many operators an iterator has. + * + * The Proxy needs to provide: + * class InnerIterator; + * InnerIterator &Inner(); + * const InnerIterator &Inner() const; + * + * InnerIterator has to implement: + * operator==(InnerIterator) + * operator<(InnerIterator) + * operator+=(std::ptrdiff_t) + * operator-(InnerIterator) + * and of course whatever Proxy needs to dereference it. + * + * It's also a good idea to specialize std::swap for Proxy. + */ + +namespace util { +template <class Proxy> class ProxyIterator { + private: + // Self. + typedef ProxyIterator<Proxy> S; + typedef typename Proxy::InnerIterator InnerIterator; + + public: + typedef std::random_access_iterator_tag iterator_category; + typedef typename Proxy::value_type value_type; + typedef std::ptrdiff_t difference_type; + typedef Proxy reference; + typedef Proxy * pointer; + + ProxyIterator() {} + + // For cast from non const to const. + template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {} + explicit ProxyIterator(const Proxy &p) : p_(p) {} + + // p_'s operator= does value copying, but here we want iterator copying. + S &operator=(const S &other) { + I() = other.I(); + return *this; + } + + bool operator==(const S &other) const { return I() == other.I(); } + bool operator!=(const S &other) const { return !(*this == other); } + bool operator<(const S &other) const { return I() < other.I(); } + bool operator>(const S &other) const { return other < *this; } + bool operator<=(const S &other) const { return !(*this > other); } + bool operator>=(const S &other) const { return !(*this < other); } + + S &operator++() { return *this += 1; } + S operator++(int) { S ret(*this); ++*this; return ret; } + S &operator+=(std::ptrdiff_t amount) { I() += amount; return *this; } + S operator+(std::ptrdiff_t amount) const { S ret(*this); ret += amount; return ret; } + + S &operator--() { return *this -= 1; } + S operator--(int) { S ret(*this); --*this; return ret; } + S &operator-=(std::ptrdiff_t amount) { I() += (-amount); return *this; } + S operator-(std::ptrdiff_t amount) const { S ret(*this); ret -= amount; return ret; } + + std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); } + + Proxy operator*() { return p_; } + const Proxy operator*() const { return p_; } + Proxy *operator->() { return &p_; } + const Proxy *operator->() const { return &p_; } + Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } + + const InnerIterator &Inner() { return p_.Inner(); } + + private: + InnerIterator &I() { return p_.Inner(); } + const InnerIterator &I() const { return p_.Inner(); } + + Proxy p_; +}; + +template <class Proxy> ProxyIterator<Proxy> operator+(std::ptrdiff_t amount, const ProxyIterator<Proxy> &it) { + return it + amount; +} + +} // namespace util + +#endif // UTIL_PROXY_ITERATOR__ diff --git a/util/scoped.hh b/util/scoped.hh new file mode 100644 index 000000000..93e2e8176 --- /dev/null +++ b/util/scoped.hh @@ -0,0 +1,97 @@ +#ifndef UTIL_SCOPED__ +#define UTIL_SCOPED__ + +#include "util/exception.hh" + +/* Other scoped objects in the style of scoped_ptr. */ +#include <cstddef> +#include <cstdlib> + +namespace util { + +template <class T, class R, R (*Free)(T*)> class scoped_thing { + public: + explicit scoped_thing(T *c = static_cast<T*>(0)) : c_(c) {} + + ~scoped_thing() { if (c_) Free(c_); } + + void reset(T *c) { + if (c_) Free(c_); + c_ = c; + } + + T &operator*() { return *c_; } + const T&operator*() const { return *c_; } + T &operator->() { return *c_; } + const T&operator->() const { return *c_; } + + T *get() { return c_; } + const T *get() const { return c_; } + + private: + T *c_; + + scoped_thing(const scoped_thing &); + scoped_thing &operator=(const scoped_thing &); +}; + +class scoped_malloc { + public: + scoped_malloc() : p_(NULL) {} + + scoped_malloc(void *p) : p_(p) {} + + ~scoped_malloc() { std::free(p_); } + + void reset(void *p = NULL) { + scoped_malloc other(p_); + p_ = p; + } + + void call_realloc(std::size_t to) { + void *ret; + UTIL_THROW_IF(!(ret = std::realloc(p_, to)) && to, util::ErrnoException, "realloc to " << to << " bytes failed."); + p_ = ret; + } + + void *get() { return p_; } + const void *get() const { return p_; } + + private: + void *p_; + + scoped_malloc(const scoped_malloc &); + scoped_malloc &operator=(const scoped_malloc &); +}; + +// Hat tip to boost. +template <class T> class scoped_array { + public: + explicit scoped_array(T *content = NULL) : c_(content) {} + + ~scoped_array() { delete [] c_; } + + T *get() { return c_; } + const T* get() const { return c_; } + + T &operator*() { return *c_; } + const T&operator*() const { return *c_; } + + T &operator->() { return *c_; } + const T&operator->() const { return *c_; } + + T &operator[](std::size_t idx) { return c_[idx]; } + const T &operator[](std::size_t idx) const { return c_[idx]; } + + void reset(T *to = NULL) { + scoped_array<T> other(c_); + c_ = to; + } + + private: + T *c_; +}; + +} // namespace util + +#endif // UTIL_SCOPED__ diff --git a/util/sized_iterator.hh b/util/sized_iterator.hh new file mode 100644 index 000000000..aabcc5319 --- /dev/null +++ b/util/sized_iterator.hh @@ -0,0 +1,107 @@ +#ifndef UTIL_SIZED_ITERATOR__ +#define UTIL_SIZED_ITERATOR__ + +#include "util/proxy_iterator.hh" + +#include <functional> +#include <string> + +#include <stdint.h> +#include <string.h> + +namespace util { + +class SizedInnerIterator { + public: + SizedInnerIterator() {} + + SizedInnerIterator(void *ptr, std::size_t size) : ptr_(static_cast<uint8_t*>(ptr)), size_(size) {} + + bool operator==(const SizedInnerIterator &other) const { + return ptr_ == other.ptr_; + } + bool operator<(const SizedInnerIterator &other) const { + return ptr_ < other.ptr_; + } + SizedInnerIterator &operator+=(std::ptrdiff_t amount) { + ptr_ += amount * size_; + return *this; + } + std::ptrdiff_t operator-(const SizedInnerIterator &other) const { + return (ptr_ - other.ptr_) / size_; + } + + const void *Data() const { return ptr_; } + void *Data() { return ptr_; } + std::size_t EntrySize() const { return size_; } + + private: + uint8_t *ptr_; + std::size_t size_; +}; + +class SizedProxy { + public: + SizedProxy() {} + + SizedProxy(void *ptr, std::size_t size) : inner_(ptr, size) {} + + operator std::string() const { + return std::string(reinterpret_cast<const char*>(inner_.Data()), inner_.EntrySize()); + } + + SizedProxy &operator=(const SizedProxy &from) { + memcpy(inner_.Data(), from.inner_.Data(), inner_.EntrySize()); + return *this; + } + + SizedProxy &operator=(const std::string &from) { + memcpy(inner_.Data(), from.data(), inner_.EntrySize()); + return *this; + } + + const void *Data() const { return inner_.Data(); } + void *Data() { return inner_.Data(); } + + private: + friend class util::ProxyIterator<SizedProxy>; + + typedef std::string value_type; + + typedef SizedInnerIterator InnerIterator; + + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + InnerIterator inner_; +}; + +typedef ProxyIterator<SizedProxy> SizedIterator; + +inline SizedIterator SizedIt(void *ptr, std::size_t size) { return SizedIterator(SizedProxy(ptr, size)); } + +// Useful wrapper for a comparison function i.e. sort. +template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public std::binary_function<const Proxy &, const Proxy &, bool> { + public: + explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {} + + bool operator()(const Proxy &first, const Proxy &second) const { + return delegate_(first.Data(), second.Data()); + } + bool operator()(const Proxy &first, const std::string &second) const { + return delegate_(first.Data(), second.data()); + } + bool operator()(const std::string &first, const Proxy &second) const { + return delegate_(first.data(), second.Data()); + } + bool operator()(const std::string &first, const std::string &second) const { + return delegate_(first.data(), second.data()); + } + + const Delegate &GetDelegate() const { return delegate_; } + + private: + const Delegate delegate_; +}; + +} // namespace util +#endif // UTIL_SIZED_ITERATOR__ diff --git a/util/sorted_uniform.hh b/util/sorted_uniform.hh new file mode 100644 index 000000000..0391189f0 --- /dev/null +++ b/util/sorted_uniform.hh @@ -0,0 +1,220 @@ +#ifndef UTIL_SORTED_UNIFORM__ +#define UTIL_SORTED_UNIFORM__ + +#include <algorithm> +#include <cstddef> + +#include <assert.h> +#include <stdint.h> + +namespace util { + +template <class T> class IdentityAccessor { + public: + typedef T Key; + T operator()(const T *in) const { return *in; } +}; + +struct Pivot64 { + static inline std::size_t Calc(uint64_t off, uint64_t range, std::size_t width) { + std::size_t ret = static_cast<std::size_t>(static_cast<float>(off) / static_cast<float>(range) * static_cast<float>(width)); + // Cap for floating point rounding + return (ret < width) ? ret : width - 1; + } +}; + +// Use when off * width is <2^64. This is guaranteed when each of them is actually a 32-bit value. +struct Pivot32 { + static inline std::size_t Calc(uint64_t off, uint64_t range, uint64_t width) { + return static_cast<std::size_t>((off * width) / (range + 1)); + } +}; + +// Usage: PivotSelect<sizeof(DataType)>::T +template <unsigned> struct PivotSelect; +template <> struct PivotSelect<8> { typedef Pivot64 T; }; +template <> struct PivotSelect<4> { typedef Pivot32 T; }; +template <> struct PivotSelect<2> { typedef Pivot32 T; }; + +/* Binary search. */ +template <class Iterator, class Accessor> bool BinaryFind( + const Accessor &accessor, + Iterator begin, + Iterator end, + const typename Accessor::Key key, Iterator &out) { + while (end > begin) { + Iterator pivot(begin + (end - begin) / 2); + typename Accessor::Key mid(accessor(pivot)); + if (mid < key) { + begin = pivot + 1; + } else if (mid > key) { + end = pivot; + } else { + out = pivot; + return true; + } + } + return false; +} + +// Search the range [before_it + 1, after_it - 1] for key. +// Preconditions: +// before_v <= key <= after_v +// before_v <= all values in the range [before_it + 1, after_it - 1] <= after_v +// range is sorted. +template <class Iterator, class Accessor, class Pivot> bool BoundedSortedUniformFind( + const Accessor &accessor, + Iterator before_it, typename Accessor::Key before_v, + Iterator after_it, typename Accessor::Key after_v, + const typename Accessor::Key key, Iterator &out) { + while (after_it - before_it > 1) { + Iterator pivot(before_it + (1 + Pivot::Calc(key - before_v, after_v - before_v, after_it - before_it - 1))); + typename Accessor::Key mid(accessor(pivot)); + if (mid < key) { + before_it = pivot; + before_v = mid; + } else if (mid > key) { + after_it = pivot; + after_v = mid; + } else { + out = pivot; + return true; + } + } + return false; +} + +template <class Iterator, class Accessor, class Pivot> bool SortedUniformFind(const Accessor &accessor, Iterator begin, Iterator end, const typename Accessor::Key key, Iterator &out) { + if (begin == end) return false; + typename Accessor::Key below(accessor(begin)); + if (key <= below) { + if (key == below) { out = begin; return true; } + return false; + } + // Make the range [begin, end]. + --end; + typename Accessor::Key above(accessor(end)); + if (key >= above) { + if (key == above) { out = end; return true; } + return false; + } + return BoundedSortedUniformFind<Iterator, Accessor, Pivot>(accessor, begin, below, end, above, key, out); +} + +// May return begin - 1. +template <class Iterator, class Accessor> Iterator BinaryBelow( + const Accessor &accessor, + Iterator begin, + Iterator end, + const typename Accessor::Key key) { + while (end > begin) { + Iterator pivot(begin + (end - begin) / 2); + typename Accessor::Key mid(accessor(pivot)); + if (mid < key) { + begin = pivot + 1; + } else if (mid > key) { + end = pivot; + } else { + for (++pivot; (pivot < end) && accessor(pivot) == mid; ++pivot) {} + return pivot - 1; + } + } + return begin - 1; +} + +// To use this template, you need to define a Pivot function to match Key. +template <class PackingT> class SortedUniformMap { + public: + typedef PackingT Packing; + typedef typename Packing::ConstIterator ConstIterator; + typedef typename Packing::MutableIterator MutableIterator; + + struct Accessor { + public: + typedef typename Packing::Key Key; + const Key &operator()(const ConstIterator &i) const { return i->GetKey(); } + Key &operator()(const MutableIterator &i) const { return i->GetKey(); } + }; + + // Offer consistent API with probing hash. + static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) { + return sizeof(uint64_t) + entries * Packing::kBytes; + } + + SortedUniformMap() +#ifdef DEBUG + : initialized_(false), loaded_(false) +#endif + {} + + SortedUniformMap(void *start, std::size_t /*allocated*/) : + begin_(Packing::FromVoid(reinterpret_cast<uint64_t*>(start) + 1)), + end_(begin_), size_ptr_(reinterpret_cast<uint64_t*>(start)) +#ifdef DEBUG + , initialized_(true), loaded_(false) +#endif + {} + + void LoadedBinary() { +#ifdef DEBUG + assert(initialized_); + assert(!loaded_); + loaded_ = true; +#endif + // Restore the size. + end_ = begin_ + *size_ptr_; + } + + // Caller responsible for not exceeding specified size. Do not call after FinishedInserting. + template <class T> void Insert(const T &t) { +#ifdef DEBUG + assert(initialized_); + assert(!loaded_); +#endif + *end_ = t; + ++end_; + } + + void FinishedInserting() { +#ifdef DEBUG + assert(initialized_); + assert(!loaded_); + loaded_ = true; +#endif + std::sort(begin_, end_); + *size_ptr_ = (end_ - begin_); + } + + // Don't use this to change the key. + template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) { +#ifdef DEBUG + assert(initialized_); + assert(loaded_); +#endif + return SortedUniformFind<MutableIterator, Accessor, Pivot64>(begin_, end_, key, out); + } + + // Do not call before FinishedInserting. + template <class Key> bool Find(const Key key, ConstIterator &out) const { +#ifdef DEBUG + assert(initialized_); + assert(loaded_); +#endif + return SortedUniformFind<ConstIterator, Accessor, Pivot64>(Accessor(), ConstIterator(begin_), ConstIterator(end_), key, out); + } + + ConstIterator begin() const { return begin_; } + ConstIterator end() const { return end_; } + + private: + typename Packing::MutableIterator begin_, end_; + uint64_t *size_ptr_; +#ifdef DEBUG + bool initialized_; + bool loaded_; +#endif +}; + +} // namespace util + +#endif // UTIL_SORTED_UNIFORM__ diff --git a/util/sorted_uniform_test.cc b/util/sorted_uniform_test.cc new file mode 100644 index 000000000..4aa4c8aad --- /dev/null +++ b/util/sorted_uniform_test.cc @@ -0,0 +1,116 @@ +#include "util/sorted_uniform.hh" + +#include "util/key_value_packing.hh" + +#include <boost/random/mersenne_twister.hpp> +#include <boost/random/uniform_int.hpp> +#include <boost/random/variate_generator.hpp> +#include <boost/scoped_array.hpp> +#include <boost/unordered_map.hpp> +#define BOOST_TEST_MODULE SortedUniformTest +#include <boost/test/unit_test.hpp> + +#include <algorithm> +#include <limits> +#include <vector> + +namespace util { +namespace { + +template <class Map, class Key, class Value> void Check(const Map &map, const boost::unordered_map<Key, Value> &reference, const Key key) { + typename boost::unordered_map<Key, Value>::const_iterator ref = reference.find(key); + typename Map::ConstIterator i = typename Map::ConstIterator(); + if (ref == reference.end()) { + BOOST_CHECK(!map.Find(key, i)); + } else { + // g++ can't tell that require will crash and burn. + BOOST_REQUIRE(map.Find(key, i)); + BOOST_CHECK_EQUAL(ref->second, i->GetValue()); + } +} + +typedef SortedUniformMap<AlignedPacking<uint64_t, uint32_t> > TestMap; + +BOOST_AUTO_TEST_CASE(empty) { + char buf[TestMap::Size(0)]; + TestMap map(buf, TestMap::Size(0)); + map.FinishedInserting(); + TestMap::ConstIterator i; + BOOST_CHECK(!map.Find(42, i)); +} + +BOOST_AUTO_TEST_CASE(one) { + char buf[TestMap::Size(1)]; + TestMap map(buf, sizeof(buf)); + Entry<uint64_t, uint32_t> e; + e.Set(42,2); + map.Insert(e); + map.FinishedInserting(); + TestMap::ConstIterator i = TestMap::ConstIterator(); + BOOST_REQUIRE(map.Find(42, i)); + BOOST_CHECK(i == map.begin()); + BOOST_CHECK(!map.Find(43, i)); + BOOST_CHECK(!map.Find(41, i)); +} + +template <class Key> void RandomTest(Key upper, size_t entries, size_t queries) { + typedef unsigned char Value; + typedef SortedUniformMap<AlignedPacking<Key, unsigned char> > Map; + boost::scoped_array<char> buffer(new char[Map::Size(entries)]); + Map map(buffer.get(), entries); + boost::mt19937 rng; + boost::uniform_int<Key> range_key(0, upper); + boost::uniform_int<Value> range_value(0, 255); + boost::variate_generator<boost::mt19937&, boost::uniform_int<Key> > gen_key(rng, range_key); + boost::variate_generator<boost::mt19937&, boost::uniform_int<unsigned char> > gen_value(rng, range_value); + + boost::unordered_map<Key, unsigned char> reference; + Entry<Key, unsigned char> ent; + for (size_t i = 0; i < entries; ++i) { + Key key = gen_key(); + unsigned char value = gen_value(); + if (reference.insert(std::make_pair(key, value)).second) { + ent.Set(key, value); + map.Insert(Entry<Key, unsigned char>(ent)); + } + } + map.FinishedInserting(); + + // Random queries. + for (size_t i = 0; i < queries; ++i) { + const Key key = gen_key(); + Check<Map, Key, unsigned char>(map, reference, key); + } + + typename boost::unordered_map<Key, unsigned char>::const_iterator it = reference.begin(); + for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { + Check<Map, Key, unsigned char>(map, reference, it->second); + } +} + +BOOST_AUTO_TEST_CASE(basic) { + RandomTest<uint8_t>(11, 10, 200); +} + +BOOST_AUTO_TEST_CASE(tiny_dense_random) { + RandomTest<uint8_t>(11, 50, 200); +} + +BOOST_AUTO_TEST_CASE(small_dense_random) { + RandomTest<uint8_t>(100, 100, 200); +} + +BOOST_AUTO_TEST_CASE(small_sparse_random) { + RandomTest<uint8_t>(200, 15, 200); +} + +BOOST_AUTO_TEST_CASE(medium_sparse_random) { + RandomTest<uint16_t>(32000, 1000, 2000); +} + +BOOST_AUTO_TEST_CASE(sparse_random) { + RandomTest<uint64_t>(std::numeric_limits<uint64_t>::max(), 100000, 2000); +} + +} // namespace +} // namespace util diff --git a/util/string_piece.hh b/util/string_piece.hh new file mode 100644 index 000000000..5de053aa8 --- /dev/null +++ b/util/string_piece.hh @@ -0,0 +1,283 @@ +/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If + * you don't use ICU, then this will use the Google implementation from Chrome. + * This has been modified from the original version to let you choose. + */ + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Copied from strings/stringpiece.h with modifications +// +// A string-like object that points to a sized piece of memory. +// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. +// + +#ifndef BASE_STRING_PIECE_H__ +#define BASE_STRING_PIECE_H__ + +#include "util/have.hh" + +#ifdef HAVE_BOOST +#include <boost/functional/hash/hash.hpp> +#endif // HAVE_BOOST + +#include <cstring> +#include <iosfwd> +#include <ostream> + +#ifdef HAVE_ICU +#include <unicode/stringpiece.h> +#include <unicode/uversion.h> + +// Old versions of ICU don't define operator== and operator!=. +#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) +#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} +#endif // old version of ICU + +U_NAMESPACE_BEGIN +#else + +#include <algorithm> +#include <cstddef> +#include <string> +#include <string.h> + +class StringPiece { + public: + typedef size_t size_type; + + private: + const char* ptr_; + size_type length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() : ptr_(NULL), length_(0) { } + StringPiece(const char* str) + : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } + StringPiece(const std::string& str) + : ptr_(str.data()), length_(str.size()) { } + StringPiece(const char* offset, size_type len) + : ptr_(offset), length_(len) { } + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. + const char* data() const { return ptr_; } + size_type size() const { return length_; } + size_type length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { ptr_ = NULL; length_ = 0; } + void set(const char* data, size_type len) { ptr_ = data; length_ = len; } + void set(const char* str) { + ptr_ = str; + length_ = str ? strlen(str) : 0; + } + void set(const void* data, size_type len) { + ptr_ = reinterpret_cast<const char*>(data); + length_ = len; + } + + char operator[](size_type i) const { return ptr_[i]; } + + void remove_prefix(size_type n) { + ptr_ += n; + length_ -= n; + } + + void remove_suffix(size_type n) { + length_ -= n; + } + + int compare(const StringPiece& x) const { + int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); + if (r == 0) { + if (length_ < x.length_) r = -1; + else if (length_ > x.length_) r = +1; + } + return r; + } + + std::string as_string() const { + // std::string doesn't like to take a NULL pointer even with a 0 size. + return std::string(!empty() ? data() : "", size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + // Does "this" start with "x" + bool starts_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); + } + + // Does "this" end with "x" + bool ends_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator<const_iterator> const_reverse_iterator; + typedef std::reverse_iterator<iterator> reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + + size_type max_size() const { return length_; } + size_type capacity() const { return length_; } + + size_type copy(char* buf, size_type n, size_type pos = 0) const; + + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; + + size_type find_first_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_of(char c, size_type pos = 0) const { + return find(c, pos); + } + size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_not_of(char c, size_type pos = 0) const; + size_type find_last_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_of(char c, size_type pos = npos) const { + return rfind(c, pos); + } + size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_not_of(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; + + static int wordmemcmp(const char* p, const char* p2, size_type N) { + return memcmp(p, p2, N); + } +}; + +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +#endif // HAVE_ICU undefined + +inline bool operator<(const StringPiece& x, const StringPiece& y) { + const int r = std::memcmp(x.data(), y.data(), + std::min(x.size(), y.size())); + return ((r < 0) || ((r == 0) && (x.size() < y.size()))); +} + +inline bool operator>(const StringPiece& x, const StringPiece& y) { + return y < x; +} + +inline bool operator<=(const StringPiece& x, const StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const StringPiece& x, const StringPiece& y) { + return !(x < y); +} + +// allow StringPiece to be logged (needed for unit testing). +inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { + return o.write(piece.data(), static_cast<std::streamsize>(piece.size())); +} + +#ifdef HAVE_BOOST +inline size_t hash_value(const StringPiece &str) { + return boost::hash_range(str.data(), str.data() + str.length()); +} + +/* Support for lookup of StringPiece in boost::unordered_map<std::string> */ +struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> { + size_t operator()(const StringPiece &str) const { + return hash_value(str); + } +}; + +struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const std::string &, bool> { + bool operator()(const StringPiece &first, const StringPiece &second) const { + return first == second; + } +}; +template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +} +template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) { + return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); +} +#endif + +#ifdef HAVE_ICU +U_NAMESPACE_END +#endif + +#endif // BASE_STRING_PIECE_H__ diff --git a/util/tokenize_piece.hh b/util/tokenize_piece.hh new file mode 100644 index 000000000..413bda0b9 --- /dev/null +++ b/util/tokenize_piece.hh @@ -0,0 +1,144 @@ +#ifndef UTIL_TOKENIZE_PIECE__ +#define UTIL_TOKENIZE_PIECE__ + +#include "util/string_piece.hh" + +#include <boost/iterator/iterator_facade.hpp> + +#include <algorithm> +#include <iostream> + +/* Usage: + * + * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { + * std::cout << *i << "\n"; + * } + * + */ + +namespace util { + +// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece. +template <char d> class PieceIterator : public boost::iterator_facade<PieceIterator<d>, const StringPiece, boost::forward_traversal_tag> { + public: + // Default construct is end, which is also accessed by kEndPieceIterator; + PieceIterator() {} + + explicit PieceIterator(const StringPiece &str) + : after_(str) { + increment(); + } + + bool operator!() const { + return after_.data() == 0; + } + operator bool() const { + return after_.data() != 0; + } + + static PieceIterator<d> end() { + return PieceIterator<d>(); + } + + private: + friend class boost::iterator_core_access; + + void increment() { + const char *start = after_.data(); + for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {} + if (start == after_.data() + after_.size()) { + // End condition. + after_.clear(); + return; + } + const char *finish = start; + for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {} + current_ = StringPiece(start, finish - start); + after_ = StringPiece(finish, after_.data() + after_.size() - finish); + } + + bool equal(const PieceIterator &other) const { + return after_.data() == other.after_.data(); + } + + const StringPiece &dereference() const { return current_; } + + StringPiece current_; + StringPiece after_; +}; + +class MultiCharacter { + public: + explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size()); + } + + private: + StringPiece delimiter_; +}; + +class AnyCharacter { + public: + explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {} + + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); + } + + private: + StringPiece chars_; +}; + +template <class Find, bool SkipEmpty = false> class TokenIter : public boost::iterator_facade<TokenIter<Find, SkipEmpty>, const StringPiece, boost::forward_traversal_tag> { + public: + TokenIter() {} + + TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) { + increment(); + } + + bool operator!() const { + return current_.data() == 0; + } + operator bool() const { + return current_.data() != 0; + } + + static TokenIter<Find> end() { + return TokenIter<Find>(); + } + + private: + friend class boost::iterator_core_access; + + void increment() { + do { + StringPiece found(finder_.Find(after_)); + current_ = StringPiece(after_.data(), found.data() - after_.data()); + if (found.data() == after_.data() + after_.size()) { + after_ = StringPiece(NULL, 0); + } else { + after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size()); + } + } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. + } + + bool equal(const TokenIter<Find> &other) const { + return after_.data() == other.after_.data(); + } + + const StringPiece &dereference() const { + return current_; + } + + StringPiece current_; + StringPiece after_; + + Find finder_; +}; + +} // namespace util + +#endif // UTIL_TOKENIZE_PIECE__ diff --git a/util/tokenize_piece_test.cc b/util/tokenize_piece_test.cc new file mode 100644 index 000000000..e07ebcf5e --- /dev/null +++ b/util/tokenize_piece_test.cc @@ -0,0 +1,94 @@ +#include "util/tokenize_piece.hh" +#include "util/string_piece.hh" + +#define BOOST_TEST_MODULE TokenIteratorTest +#include <boost/test/unit_test.hpp> + +#include <iostream> + +namespace util { +namespace { + +BOOST_AUTO_TEST_CASE(simple) { + PieceIterator<' '> it("single spaced words."); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("single"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("spaced"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("words."), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(null_delimiter) { + const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0"; + PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1)); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("first"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("second"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("third"), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece("fourth"), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(null_entries) { + const char str[] = "\0split\0\0 \0me\0 "; + PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1)); + BOOST_REQUIRE(it); + const char first[] = "\0split\0\0"; + BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it); + ++it; + BOOST_REQUIRE(it); + const char second[] = "\0me\0"; + BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it); + ++it; + BOOST_CHECK(!it); +} + +/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) { + const char str[] = "nodelimit at all"; + TokenIter<MultiCharacter> it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(str), *it); + ++it; + BOOST_CHECK(!it); +} +BOOST_AUTO_TEST_CASE(pipe_pipe_two) { + const char str[] = "|||"; + TokenIter<MultiCharacter> it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(), *it); + ++it; + BOOST_CHECK(!it); +} + +BOOST_AUTO_TEST_CASE(remove_empty) { + const char str[] = "|||"; + TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||")); + BOOST_CHECK(!it); +}*/ + +BOOST_AUTO_TEST_CASE(remove_empty_keep) { + const char str[] = " |||"; + TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||")); + BOOST_REQUIRE(it); + BOOST_CHECK_EQUAL(StringPiece(" "), *it); + ++it; + BOOST_CHECK(!it); +} + +} // namespace +} // namespace util |