diff options
Diffstat (limited to 'libibex')
-rw-r--r-- | libibex/.cvsignore | 9 | ||||
-rw-r--r-- | libibex/COPYING.LIB | 481 | ||||
-rw-r--r-- | libibex/ChangeLog | 390 | ||||
-rw-r--r-- | libibex/Makefile.am | 40 | ||||
-rw-r--r-- | libibex/TODO | 61 | ||||
-rw-r--r-- | libibex/block.c | 595 | ||||
-rw-r--r-- | libibex/block.h | 114 | ||||
-rw-r--r-- | libibex/diskarray.c | 255 | ||||
-rw-r--r-- | libibex/disktail.c | 818 | ||||
-rw-r--r-- | libibex/dumpindex.c | 63 | ||||
-rw-r--r-- | libibex/hash.c | 859 | ||||
-rw-r--r-- | libibex/ibex.h | 106 | ||||
-rw-r--r-- | libibex/ibex_block.c | 345 | ||||
-rw-r--r-- | libibex/ibex_db.c | 512 | ||||
-rw-r--r-- | libibex/ibex_internal.h | 51 | ||||
-rw-r--r-- | libibex/index.h | 103 | ||||
-rw-r--r-- | libibex/lookup.c | 83 | ||||
-rw-r--r-- | libibex/mkindex.c | 84 | ||||
-rw-r--r-- | libibex/testindex.c | 200 | ||||
-rw-r--r-- | libibex/wordindex.c | 646 | ||||
-rw-r--r-- | libibex/wordindex.h | 74 | ||||
-rw-r--r-- | libibex/wordindexmem.c | 889 |
22 files changed, 0 insertions, 6778 deletions
diff --git a/libibex/.cvsignore b/libibex/.cvsignore deleted file mode 100644 index b0d27ba734..0000000000 --- a/libibex/.cvsignore +++ /dev/null @@ -1,9 +0,0 @@ -.deps -Makefile -Makefile.in -.libs -.deps -*.lo -*.la -testindex -dumpindex diff --git a/libibex/COPYING.LIB b/libibex/COPYING.LIB deleted file mode 100644 index eb685a5ec9..0000000000 --- a/libibex/COPYING.LIB +++ /dev/null @@ -1,481 +0,0 @@ - GNU LIBRARY GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1991 Free Software Foundation, Inc. - 675 Mass Ave, Cambridge, MA 02139, USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the library GPL. It is - numbered 2 because it goes with version 2 of the ordinary GPL.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Library General Public License, applies to some -specially designated Free Software Foundation software, and to any -other libraries whose authors decide to use it. You can use it for -your libraries, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if -you distribute copies of the library, or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link a program with the library, you must provide -complete object files to the recipients so that they can relink them -with the library, after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - Our method of protecting your rights has two steps: (1) copyright -the library, and (2) offer you this license which gives you legal -permission to copy, distribute and/or modify the library. - - Also, for each distributor's protection, we want to make certain -that everyone understands that there is no warranty for this free -library. If the library is modified by someone else and passed on, we -want its recipients to know that what they have is not the original -version, so that any problems introduced by others will not reflect on -the original authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that companies distributing free -software will individually obtain patent licenses, thus in effect -transforming the program into proprietary software. To prevent this, -we have made it clear that any patent must be licensed for everyone's -free use or not licensed at all. - - Most GNU software, including some libraries, is covered by the ordinary -GNU General Public License, which was designed for utility programs. This -license, the GNU Library General Public License, applies to certain -designated libraries. This license is quite different from the ordinary -one; be sure to read it in full, and don't assume that anything in it is -the same as in the ordinary license. - - The reason we have a separate public license for some libraries is that -they blur the distinction we usually make between modifying or adding to a -program and simply using it. Linking a program with a library, without -changing the library, is in some sense simply using the library, and is -analogous to running a utility program or application program. However, in -a textual and legal sense, the linked executable is a combined work, a -derivative of the original library, and the ordinary General Public License -treats it as such. - - Because of this blurred distinction, using the ordinary General -Public License for libraries did not effectively promote software -sharing, because most developers did not use the libraries. We -concluded that weaker conditions might promote sharing better. - - However, unrestricted linking of non-free programs would deprive the -users of those programs of all benefit from the free status of the -libraries themselves. This Library General Public License is intended to -permit developers of non-free programs to use free libraries, while -preserving your freedom as a user of such programs to change the free -libraries that are incorporated in them. (We have not seen how to achieve -this as regards changes in header files, but we have achieved it as regards -changes in the actual functions of the Library.) The hope is that this -will lead to faster development of free libraries. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, while the latter only -works together with the library. - - Note that it is possible for a library to be covered by the ordinary -General Public License rather than by this special one. - - GNU LIBRARY GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library which -contains a notice placed by the copyright holder or other authorized -party saying it may be distributed under the terms of this Library -General Public License (also called "this License"). Each licensee is -addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also compile or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - c) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - d) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the source code distributed need not include anything that is normally -distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Library General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - Appendix: How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - <one line to give the library's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - <signature of Ty Coon>, 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! diff --git a/libibex/ChangeLog b/libibex/ChangeLog deleted file mode 100644 index 2d3fe7bace..0000000000 --- a/libibex/ChangeLog +++ /dev/null @@ -1,390 +0,0 @@ -2000-12-24 Not Zed <NotZed@HelixCode.com> - - * Merge from camel-mt-branch. - -2000-12-18 Not Zed <NotZed@HelixCode.com> - - * dumpindex.c (main): Same here. - - * testindex.c (main): Add a g_thread_init(). Sigh, glib's thread - stuff is snot. - (read_words): Setup another flat-out thread to test - multithreadedness at little bit. - - * ibex_block.c (ibex_index_buffer): Add locking around internal - calls. - (ibex_open): Init the locking mutex. - (ibex_close): Free the locking mutex. - (ibex_unindex): - (ibex_find): - (ibex_find_name): - (ibex_contains_name): Add locking around internal calls. - - * ibex_internal.h (struct ibex): Add a lock. Include config.h - -2000-12-13 Christopher James Lahey <clahey@helixcode.com> - - * disktail.c (tail_compress): - (tail_get): Added some casts to get rid of warnings. - (tail_dump): #if 0ed this out to get rid of a warning. - (ibex_diskarray_dump): Added a prototype. - - * ibex_block.c (ibex_index_buffer): Assigned cat the value 0 to - start off with to avoid a warning. - -2000-12-12 Christopher James Lahey <clahey@helixcode.com> - - * wordindex.c (cache_sanity): Made cache_sanity only be included - if d(x) is defined as x. - - * wordindexmem.c: Made node_sanity and cache_sanity only be - included if d(x) is defined as x or if MALLOC_CHECK is defined. - Made sync_value only be included if d(x) is defined as x. - -2000-11-28 Not Zed <NotZed@HelixCode.com> - - * index.h: Turn off index stats by default. - - * ibex_block.c (ibex_save): And here. - (ibex_close): Debug out printfs. - - * wordindexmem.c (ibex_create_word_index_mem): And here. - (num): Made buf static. - - * block.c (ibex_block_cache_open): Debug out some printfs. - (ibex_block_read): And here. - -2000-11-17 Not Zed <NotZed@HelixCode.com> - - * wordindexmem.c (add_list): If we have the namecache active, and - there is no name there, we add it directly and dont look it up - first. - - * testindex.c: Some performance testing & stat gathering stuff. - -2000-11-16 Not Zed <NotZed@HelixCode.com> - - * wordindexmem.c (ibex_create_word_index_mem): Initialise nameinit - & namecache. - (contains_name): On first call, load all names into memory. We - usually do a whole lot of lookups in a row, and this saves a lot - of penalties on a big list, for not too much a memory hit. - (find_name): If we have the namelist in memory do a quick - short-circuit check to see if we have to do further processing. - (unindex_name): Cross check the namecache, if it is active. - Remove it there too/or exit (no work to do). - (word_flush): If we have the namecache active, destroy it now, as - it is not needed anymore (for now). - -2000-10-30 Kjartan Maraas <kmaraas@gnome.org> - - * hash.c: #include <stdlib.h> to remove warning. - * wordindex.c: #include <stdlib.h> and <string.h>. - -2000-10-26 Not Zed <NotZed@HelixCode.com> - - * block.c (ibex_block_cache_open): Use IBEX_VERSION rather than - hardcoded version string. - - * ibex_internal.h (IBEX_VERSION): Bumped version again. This time - I did change the index format. - (IBEX_VERSION): moved into block.h - - * hash.c (struct _hashroot): Add a linked list of keys to the table. - (struct _hashblock): Added a next pointer as a block number. - (hash_insert): Link new key blocks into the key block list. - (struct _HASHCursor): Renamed block to key and added a block item. - (hash_cursor_next): Changed to go through the linked list of all - hash items rather than through each hash chain separately. >> - faster. - (ibex_hash_dump_rec): Remove a warning. - -2000-10-25 <jpr@helixcode.com> - - * ibex_block.c: No longer include <db.h> - -2000-10-25 Not Zed <NotZed@HelixCode.com> - - * ibex_internal.h (IBEX_VERSION): Bumped to another version. The - file format hasn't changed, but earlier bugs may create invalid - files. - - * block.c (ibex_block_read): Use the root data directly. - (ibex_block_cache_open): As well. - (ibex_block_get): And here too. - (ibex_block_cache_sync): Sync the root block directly here. - - * block.h: Pad root block out to 1024 bytes. - Added root block to struct _memcache. - - * disktail.c (tail_get): Dirty the root block. - (tail_get): Fix for changes to root access. - (disk_remove): And here too. - - * wordindexmem.c (sync_cache_entry): Handle the case of not having - any files in the list, which can happen now. - (word_index_pre): Make sure we set the wordid on the new cache - entry. - - * ibex_block.c (ibex_save): Sigh. Pass the right argument to - index_post. - -2000-10-24 JP Rosevear <jpr@helixcode.com> - - * .cvsignore: Shush - -2000-10-24 Not Zed <NotZed@HelixCode.com> - - * block.c (ibex_block_cache_open): Create a word_index_mem for - indexing the words, rather than a word_index. - - * ibex_block.c (ibex_index_buffer): If we haven't called index_pre - yet, do it before indexing anything. - (ibex_save): If wehave called index_pre previously, call - index_post. - (ibex_close): And same for here. - - * index.h: Added a cursor class, and cursor retrieval function for - iterating through an index's keys. - - * wordindexmem.c (ibex_create_word_index_mem): New word class, - similar to wordindex, but meant to be faster for updates. - (word_index_pre): Implement. We load all keys into memory. - (word_index_post): Implement. We sync and free all keys. - (find): Remove lru code, its no longer a cache, but a lookup - table. - (add_index_cache): Remove lru code here too. - (find_name): And here. - (word_flush): Flush the hashtable direct. - (word_close): Call flush to flush, rather than doing it ourselves. - (add_index_cache): If we are in an index state, we can assume a - cache miss == a new word. - (word_index_post): Maintain whether or not we are in an index - state, and the depth of the state. - (word_index_pre): Likewise. Dont reread the index if we have - already. - (cache_sanity): Fixed for struct changes. - - * wordindex.h (IBEXWordClass): Added functions to prepare/cleanup - for lots of indexing. i.e. can be used to optimise indexing speed - at the cost of extra memory usage during the indexing process. - - * dumpindex.c: Dumps the contents of indexs. - - * hash.c (ibex_hash_dump_rec): Also print the word count. - (hash_cursor_create): Create a new cursor for iterating through a - hashtable. - (hash_cursor_close): 'close' the cursor. It is upto the - application to close any cursors it creates. - (hash_cursor_next): Goto the next key id. - (hash_cursor_next_key): Goto the next key, reutrn the key. - (hash_get_cursor): Return a cursor object. - - * wordindex.c (unindex_name): Cross-check the cache as well. - (word_index_post): - (word_index_pre): Added (empty) callbacks for pre/post functions. - -2000-10-12 Not Zed <NotZed@HelixCode.com> - - * ibex_internal.h (struct ibex): Bumped ibex rev. - - * block.c (ibex_block_cache_open): Bumped the ibex file revision - because of the hash table size change. - - * index.h: Added some stat stuff. - - * wordindex.c (struct _wordcache): Changed files[] to be a pointer - to an allocated block/or an individual item. - (find): Fix for changes to struct. - (find_name): " - (sync_cache_entry): " - (add): " - (add_list): " - (add_index_cache): Free the cache file array if it was created. - (word_flush): And here. - (word_close): And here too. - (ibex_create_word_index): Double the size of the hashtables. - (word_flush): Make sure we reset the wordcount to 0 if we remove - the list items. DOH. - (add_index_cache): Use a slightly more sohpisticated aging - algorithm to remove expired nodes. - -2000-10-10 Not Zed <NotZed@HelixCode.com> - - * hash.c (hash_find): - (hash_remove): - (hash_insert): Truncate key if it is too big to fit in a - single block to MAX_KEYLEN bytes. - -2000-09-28 Not Zed <NotZed@HelixCode.com> - - * block.c (ibex_block_free): Make sure we map the 'free' block to - a block number when unlinking a block (fixes a lot of assertion - failures). - (ibex_block_cache_open): Initialise sync flag on root block. If - it is not set on open then the index could be in an invalid state, - and should be rescanned. - (ibex_block_cache_sync): Sync root block last, and set the sync - flag. - (ibex_block_cache_open): Mirror root block flags in block_cache - struct. - (ibex_block_cache_sync): Likewise. - (ibex_block_read): If we write a dirty block, then we clear the - sync flag if its still set; we are no longer synced. - -2000-09-19 Not Zed <NotZed@HelixCode.com> - - ** Merged from IBEX_DISK branch to head. - - * file.c: - * find.c: - * words.c: - * index.c: Removed unused files. - - * block.h: Changed block to use only 24 bits for next and 8 for - used, and fixed all relevant code. Some cleanup. - - * disktail.c (tail_get): If we use an empty tail node, then make - sure we make it dirty. - -2000-09-15 Not Zed <NotZed@HelixCode.com> - - * wordindex.c (word_close): Free hashtable on exit too. - - * disktail.c: Implemented tail-node storage for the end of long - lists, or for short lists. Should save significant disk space - (5x?). - Implemented special case for 1-item lists, where the tailnode - pointer is used to store the index entry. - -2000-09-14 Not Zed <NotZed@HelixCode.com> - - * wordindex.c (add_index_key): Keys also handle tails. - - * hash.c (hash_set_data_block): Added new parameter to keys - a - tail block (a full 32 bit block pointer). - (hash_get_data_block): And same here. - -2000-09-12 Not Zed <NotZed@HelixCode.com> - - * wordindex.c (word_close): Dont close namestore twice. - -2000-09-11 Not Zed <NotZed@HelixCode.com> - - ** Redid almost everything, on-disk hash table to store an index - to index records, mroe on the way to modularisation (more to go), - now stores reverse indexes for deleting. - -2000-08-31 Not Zed <NotZed@HelixCode.com> - - * block.c (add_key_mem): Initialise a memory based array for newly - added index entries. - (add_record): Changed to cache updates in memory until we hit a - limit, and then flush them to disk. - (get_record): Merge in-memory records with disk records. - (remove_record): Remove from memory first, and if that fails, goto - disk. - (find_record): Check memory first, then disk if that fails. - (add_datum_list): oops, copy size * sizeof(blockid_t) - (add_indexed): Make sure we link in the head node when we create a - new one. - -2000-08-09 Christopher James Lahey <clahey@helixcode.com> - - * file.c, find.c: Fixed some warnings. - -2000-05-11 NotZed <NotZed@HelixCode.com> - - * index.c (ibex_unindex): Make sure we mark the ibex as dirty. - -2000-05-07 NotZed <NotZed@HelixCode.com> - - * file.c (ibex_save): New function, only write out the ibex if it - has changed. - -2000-05-07 <notzed@helixcode.com> - - * file.c (ibex_open): Also close the fd after we're done. - - * find.c (ibex_contains_name): New function to find out if a file - is indexed. - -2000-05-02 Matt Loper <matt@helixcode.com> - - * Makefile.am: set G_LOG_DOMAIN. - -2000-04-12 NotZed <NotZed@HelixCode.com> - - * find.c (ibex_dump_all): Debug function to dump the whole index - to stdout. - - * words.c (get_ibex_file): Use g_strdup(), not strdup(). - -2000-04-11 NotZed <NotZed@HelixCode.com> - - * file.c (write_word): Always write out all words we have (even if - its 0 ... the file expects it). No longer check for removed files. - (store_word): Check for removed files here, and only add to the - ordered tree if we have references left to this word. - (ibex_write): First insert into the tree, to determine the - wordcount to be saved in the output file, and then write that. - (ibex_open): Remove some debug. - - * words.c (ibex_index_buffer): Always set 'unread', if it is a - valid pointer (dont rely on caller to initialise it). - -2000-03-26 NotZed <NotZed@HelixCode.com> - - * lookup.c (main): Fixed call to ibex_open. - - * mkindex.c (main): Fixed call to ibex_open. - - * file.c (ibex_open): Changed to accept flags and mode equivalent - to open(2). - -2000-02-25 Dan Winship <danw@helixcode.com> - - * *.c: add gtk-doc-style comments - -2000-02-21 Matt Loper <matt@helixcode.com> - - * .cvsignore: Added mkindex. - -2000-02-21 NotZed <NotZed@HelixCode.com> - - * Makefile.am: change noinst_LIBRARIES to noinst_LTLIBRARIES, and - supply -static to LDFLAGS. Duh, and changed LDADD back to - libibex.la. - -2000-02-20 Matt Loper <matt@helixcode.com> - - * Makefile.am: changed mkindex_LDADD to libibex.a instead of - libibex.la. - -2000-02-19 Matt Loper <matt@helixcode.com> - - * .cvsignore: added lookup. - -2000-02-18 Miguel de Icaza <miguel@nuclecu.unam.mx> - - * Makefile.am (lookup_LDADD): For now. make a libibex.a library so - we can link it with the camel provider. I hate libtool - -2000-02-16 Dan Winship <danw@helixcode.com> - - * Makefile.am: automakify - -2000-02-16 NotZed <NotZed@HelixCode.com> - - * find.[ch] (ibex_find_name): Finds if a word is indexed under a - given name. - -2000-02-14 NotZed <notzed@zedzone.helixcode.com> - - * Makefile: Hack together a build using libtool. This should all - be auto*'d at some point I guess. - -2000-02-13 NotZed <notzed@zedzone.helixcode.com> - - * Added ChangeLog file. - diff --git a/libibex/Makefile.am b/libibex/Makefile.am deleted file mode 100644 index ab7c92206c..0000000000 --- a/libibex/Makefile.am +++ /dev/null @@ -1,40 +0,0 @@ -## Process this file with automake to produce Makefile.in - -noinst_LTLIBRARIES = libibex.la - -libibex_la_SOURCES = \ - wordindex.c wordindexmem.c \ - block.c ibex.h \ - hash.c \ - disktail.c \ - ibex_block.c - -libibex_la_LDFLAGS = -static - -noinst_HEADERS = \ - ibex_internal.h \ - block.h \ - wordindex.h \ - index.h - -INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \ - $(THREADS_CFLAGS) \ - -DG_LOG_DOMAIN=\"libibex\" - - -noinst_PROGRAMS = dumpindex testindex - -dumpindex_SOURCES = dumpindex.c -dumpindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) $(THREADS_LIBS) - -testindex_SOURCES = testindex.c -testindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) $(THREADS_LIBS) -lm - -#noinst_PROGRAMS = mkindex lookup -# -#mkindex_SOURCES = mkindex.c -#mkindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) -# -#lookup_SOURCES = lookup.c -#lookup_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) - diff --git a/libibex/TODO b/libibex/TODO deleted file mode 100644 index a087c8d1f3..0000000000 --- a/libibex/TODO +++ /dev/null @@ -1,61 +0,0 @@ -Stability ---------- -* ibex_open should never crash, and should never return NULL without -errno being set. Should check for errors when reading. - - -Performance ------------ -* Profiling, keep thinking about data structures, etc. - -* Check memory usage - -* See if writing the "inverse image" of long ref streams helps -compression without hurting performance now. (ie, if a word appears in -more than half of the files, write out the list of files it _doesn't_ -appear in). (I tried this before, and it wasn't working well, but the -file format and data structures have changed a lot.) - -* We could save a noticeable chunk of time if normalize_word computed -the hash of the word and then we could pass that into -g_hash_table_insert somehow. - -* Make a copy of the buffer to be indexed (or provide interface for -caller to say ibex can munge the provided data) and then use that -rather than constantly copying things. ? - - -Functionality -------------- -* ibex file locking - -* specify file mode in ibex_open - -* ibex_find* need to normalize the search words... should this be done -by the caller or by ibex_find? - -* Needs to be some way to do a secondary search after getting results -back from ibex_find* (ie, for "foo near bar"). This either has to be -done by ibex, or requires us to export the normalize interface. - -* Does there need to be an ibex_find_any, or is that easy enough for the -caller to do? - -* utf8_trans needs to cover at least two more code pages. This is -tricky because it's not clear whether some of the letters there should -be translated to ASCII or left as UTF8. This requires some -investigation. - -* ibex_index_* need to ignore HTML tags. - NAME = [A-Za-z][A-Za-z0-9.-]* - </?{NAME}(\s*{NAME}(\s*=\s*({NAME}|"[^"]*"|'[^']*')))*> - <!(--([^-]*|-[^-])--\s*)*> - - ugh. ok, simplifying, we get: - <[^!](([^"'>]*("[^"]*"|'[^']*'))*> or - <!(--([^-]*|-[^-])--\s*)*> - - which is still not simple. sigh. - -* ibex_index_* need to recognize and ignore "non-text". Particularly -BinHex and uuencoding. diff --git a/libibex/block.c b/libibex/block.c deleted file mode 100644 index b9057bc109..0000000000 --- a/libibex/block.c +++ /dev/null @@ -1,595 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* - block file/cache/utility functions -*/ - -#include <stdio.h> - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <glib.h> - -#include "block.h" - -/*#define MALLOC_CHECK*/ - -#ifdef MALLOC_CHECK -#include <mcheck.h> -#endif - -#define d(x) -/*#define DEBUG*/ - -int block_log; - -#ifdef IBEX_STATS -static void -init_stats(struct _memcache *index) -{ - index->stats = g_hash_table_new(g_direct_hash, g_direct_equal); -} - -static void -dump_1_stat(int id, struct _stat_info *info, struct _memcache *index) -{ - printf("%d %d %d %d %d\n", id, info->read, info->write, info->cache_hit, info->cache_miss); -} - -static void -dump_stats(struct _memcache *index) -{ - printf("Block reads writes hits misses\n"); - g_hash_table_foreach(index->stats, dump_1_stat, index); -} - -static void -add_read(struct _memcache *index, int id) -{ - struct _stat_info *info; - - info = g_hash_table_lookup(index->stats, (void *)id); - if (info == NULL) { - info = g_malloc0(sizeof(*info)); - g_hash_table_insert(index->stats, (void *)id, info); - } - info->read++; -} - -static void -add_write(struct _memcache *index, int id) -{ - struct _stat_info *info; - - info = g_hash_table_lookup(index->stats, (void *)id); - if (info == NULL) { - info = g_malloc0(sizeof(*info)); - g_hash_table_insert(index->stats, (void *)id, info); - } - info->write++; -} - -static void -add_hit(struct _memcache *index, int id) -{ - struct _stat_info *info; - - info = g_hash_table_lookup(index->stats, (void *)id); - if (info == NULL) { - info = g_malloc0(sizeof(*info)); - g_hash_table_insert(index->stats, (void *)id, info); - } - info->cache_hit++; -} - -static void -add_miss(struct _memcache *index, int id) -{ - struct _stat_info *info; - - info = g_hash_table_lookup(index->stats, (void *)id); - if (info == NULL) { - info = g_malloc0(sizeof(*info)); - g_hash_table_insert(index->stats, (void *)id, info); - } - info->cache_miss++; -} -#endif /* IBEX_STATS */ - -#ifdef MALLOC_CHECK -static void -checkmem(void *p) -{ - if (p) { - int status = mprobe(p); - - switch (status) { - case MCHECK_HEAD: - printf("Memory underrun at %p\n", p); - abort(); - case MCHECK_TAIL: - printf("Memory overrun at %p\n", p); - abort(); - case MCHECK_FREE: - printf("Double free %p\n", p); - abort(); - } - } -} -#endif - -/* simple list routines (for simplified memory management of cache/lists) */ - -/** - * ibex_list_new: - * @v: - * - * Initialise a list header. A list header must always be initialised - * before use. - **/ -void ibex_list_new(struct _list *v) -{ - v->head = (struct _listnode *)&v->tail; - v->tail = 0; - v->tailpred = (struct _listnode *)&v->head; -} - -/** - * ibex_list_addhead: - * @l: List. - * @n: Node to append. - * - * Prepend a listnode to the head of the list @l. - * - * Return value: Always @n. - **/ -struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n) -{ - n->next = l->head; - n->prev = (struct _listnode *)&l->head; - l->head->prev = n; - l->head = n; - return n; -} - -/** - * ibex_list_addtail: - * @l: - * @n: - * - * Append a listnode to the end of the list @l. - * - * Return value: Always the same as @n. - **/ -struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n) -{ - n->next = (struct _listnode *)&l->tail; - n->prev = l->tailpred; - l->tailpred->next = n; - l->tailpred = n; - return n; -} - -/** - * ibex_list_remove: - * @n: The node to remove. - * - * Remove a listnode from a list. - * - * Return value: Always the same as @n. - **/ -struct _listnode *ibex_list_remove(struct _listnode *n) -{ - n->next->prev = n->prev; - n->prev->next = n->next; - return n; -} - -static struct _memblock * -memblock_addr(struct _block *block) -{ - return (struct _memblock *)(((char *)block) - G_STRUCT_OFFSET(struct _memblock, data)); -} - -/* read/sync the rootblock into the block_cache structure */ -static int -ibex_block_read_root(struct _memcache *block_cache) -{ - lseek(block_cache->fd, 0, SEEK_SET); - if (read(block_cache->fd, &block_cache->root, sizeof(block_cache->root)) != sizeof(block_cache->root)) { - - return -1; - } - return 0; -} - -static int -ibex_block_sync_root(struct _memcache *block_cache) -{ - lseek(block_cache->fd, 0, SEEK_SET); - if (write(block_cache->fd, &block_cache->root, sizeof(block_cache->root)) != sizeof(block_cache->root)) { - return -1; - } - return fsync(block_cache->fd); -} - - -/** - * ibex_block_dirty: - * @block: - * - * Dirty a block. This will cause it to be written to disk on - * a cache sync, or when the block is flushed from the cache. - **/ -void -ibex_block_dirty(struct _block *block) -{ - memblock_addr(block)->flags |= BLOCK_DIRTY; -} - -static void -sync_block(struct _memcache *block_cache, struct _memblock *memblock) -{ - if (block_log) - printf("writing block %d\n", memblock->block); - - lseek(block_cache->fd, memblock->block, SEEK_SET); - if (write(block_cache->fd, &memblock->data, sizeof(memblock->data)) != -1) { - memblock->flags &= ~BLOCK_DIRTY; - } -#ifdef IBEX_STATS - add_write(block_cache, memblock->block); -#endif -} - -/** - * ibex_block_cache_sync: - * @block_cache: - * - * Ensure the block cache is fully synced to disk. - **/ -void -ibex_block_cache_sync(struct _memcache *block_cache) -{ - struct _memblock *memblock; - - memblock = (struct _memblock *)block_cache->nodes.head; - while (memblock->next) { -#ifdef MALLOC_CHECK - checkmem(memblock); -#endif - if (memblock->flags & BLOCK_DIRTY) { - sync_block(block_cache, memblock); - } - memblock = memblock->next; - } - - block_cache->root.flags |= IBEX_ROOT_SYNCF; - if (ibex_block_sync_root(block_cache) != 0) { - block_cache->root.flags &= ~IBEX_ROOT_SYNCF; - } - -#ifdef IBEX_STATS - dump_stats(block_cache); -#endif - -} - -#ifdef MALLOC_CHECK -static void -check_cache(struct _memcache *block_cache) -{ - struct _memblock *mw, *mn; - - checkmem(block_cache); - checkmem(block_cache->index); - - mw = (struct _memblock *)block_cache->nodes.head; - mn = mw->next; - while (mn) { - checkmem(mw); - mw = mn; - mn = mn->next; - } -} -#endif - -/** - * ibex_block_cache_flush: - * @block_cache: - * - * Ensure the block cache is fully synced to disk, and then flush - * its contents from memory. - **/ -void -ibex_block_cache_flush(struct _memcache *block_cache) -{ - struct _memblock *mw, *mn; - - ibex_block_cache_sync(block_cache); - - mw = (struct _memblock *)block_cache->nodes.head; - mn = mw->next; - while (mn) { - g_hash_table_remove(block_cache->index, (void *)mw->block); - g_free(mw); - mw = mn; - mn = mn->next; - } - ibex_list_new(&block_cache->nodes); -} - -/** - * ibex_block_read: - * @block_cache: - * @blockid: - * - * Read the data of a block by blockid. The data contents is backed by - * the block cache, and should be considered static. - * - * TODO; should this return a NULL block on error? - * - * Return value: The address of the block data (which may be cached). - **/ -struct _block * -ibex_block_read(struct _memcache *block_cache, blockid_t blockid) -{ - struct _memblock *memblock; - -#ifdef MALLOC_CHECK - check_cache(block_cache); -#endif - - /* nothing can read the root block directly */ - g_assert(blockid != 0); - g_assert(blockid < block_cache->root.roof); - - memblock = g_hash_table_lookup(block_cache->index, (void *)blockid); - -#ifdef MALLOC_CHECK - check_cache(block_cache); -#endif - - if (memblock) { - d(printf("foudn blockid in cache %d = %p\n", blockid, &memblock->data)); - - /* 'access' page */ - ibex_list_remove((struct _listnode *)memblock); - ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock); - -#ifdef MALLOC_CHECK - check_cache(block_cache); -#endif - -#ifdef IBEX_STATS - add_hit(block_cache, memblock->block); -#endif - -#ifdef MALLOC_CHECK - check_cache(block_cache); -#endif - - return &memblock->data; - } -#ifdef IBEX_STATS - add_miss(block_cache, blockid); - add_read(block_cache, blockid); -#endif - if (block_log) - printf("miss block %d\n", blockid); - - d(printf("loading blockid from disk %d\n", blockid)); - memblock = g_malloc(sizeof(*memblock)); - memblock->block = blockid; - memblock->flags = 0; - lseek(block_cache->fd, blockid, SEEK_SET); - memset(&memblock->data, 0, sizeof(memblock->data)); - read(block_cache->fd, &memblock->data, sizeof(memblock->data)); - - ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock); - g_hash_table_insert(block_cache->index, (void *)blockid, memblock); - if (block_cache->count >= CACHE_SIZE) { - struct _memblock *old = (struct _memblock *)block_cache->nodes.head; - d(printf("discaring cache block %d\n", old->block)); - g_hash_table_remove(block_cache->index, (void *)old->block); - ibex_list_remove((struct _listnode *)old); - if (old->flags & BLOCK_DIRTY) { - /* are we about to un-sync the file? update root and sync it */ - if (block_cache->root.flags & IBEX_ROOT_SYNCF) { - d(printf("Unsyncing root block\n")); - - block_cache->root.flags &= ~IBEX_ROOT_SYNCF; - if (ibex_block_sync_root(block_cache) != 0) { - /* what do we do? i dont know! */ - g_warning("Could not sync root block of index: %s", strerror(errno)); - } - } - sync_block(block_cache, old); - } - g_free(old); - } else { - block_cache->count++; - } - - d(printf(" --- cached blocks : %d\n", block_cache->count)); - -#ifdef MALLOC_CHECK - check_cache(block_cache); -#endif - return &memblock->data; -} - -/** - * ibex_block_cache_open: - * @name: - * @flags: Flags as to open(2), should use O_RDWR and optionally O_CREAT. - * @mode: Mose as to open(2) - * - * Open a block file. - * - * FIXME; this currently also initialises the word and name indexes - * because their pointers are stored in the root block. Should be - * upto the caller to manage these pointers/data. - * - * Return value: NULL if the backing file could not be opened. - **/ -struct _memcache * -ibex_block_cache_open(const char *name, int flags, int mode) -{ - struct _memcache *block_cache = g_malloc0(sizeof(*block_cache)); - - d(printf("opening ibex file: %s", name)); - - /* setup cache */ - ibex_list_new(&block_cache->nodes); - block_cache->count = 0; - block_cache->index = g_hash_table_new(g_direct_hash, g_direct_equal); - block_cache->fd = open(name, flags, mode); - - if (block_cache->fd == -1) { - g_hash_table_destroy(block_cache->index); - g_free(block_cache); - return NULL; - } - - ibex_block_read_root(block_cache); - if (block_cache->root.roof == 0 - || memcmp(block_cache->root.version, IBEX_VERSION, 4) - || ((block_cache->root.flags & IBEX_ROOT_SYNCF) == 0)) { - d(printf("Initialising superblock\n")); - /* reset root data */ - memcpy(block_cache->root.version, IBEX_VERSION, 4); - block_cache->root.roof = 1024; - block_cache->root.free = 0; - block_cache->root.words = 0; - block_cache->root.names = 0; - block_cache->root.tail = 0; /* list of tail blocks */ - block_cache->root.flags = 0; - ibex_block_sync_root(block_cache); - /* reset the file contents */ - ftruncate(block_cache->fd, 1024); - } else { - d(printf("superblock already initialised:\n" - " roof = %d\n free = %d\n words = %d\n names = %d\n tail = %d\n", - block_cache->root.roof, block_cache->root.free, - block_cache->root.words, block_cache->root.names, block_cache->root.tail)); - } - /* FIXME: this should be moved higher up in the object tree */ - { - struct _IBEXWord *ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); - - block_cache->words = ibex_create_word_index_mem(block_cache, &block_cache->root.words,&block_cache->root.names); - } - -#ifdef IBEX_STATS - init_stats(block_cache); -#endif - - return block_cache; -} - -/** - * ibex_block_cache_close: - * @block_cache: - * - * Close the block file, sync any remaining cached data - * to disk, and free all resources. - **/ -void -ibex_block_cache_close(struct _memcache *block_cache) -{ - struct _memblock *mw, *mn; - - ibex_block_cache_sync(block_cache); - close(block_cache->fd); - - mw = (struct _memblock *)block_cache->nodes.head; - mn = mw->next; - while (mn) { - g_free(mw); - mw = mn; - mn = mw->next; - } - - g_hash_table_destroy(block_cache->index); - - g_free(block_cache); -} - -/** - * ibex_block_free: - * @block_cache: - * @blockid: - * - * Return a block to the free pool. - **/ -void -ibex_block_free(struct _memcache *block_cache, blockid_t blockid) -{ - struct _block *block = ibex_block_read(block_cache, blockid); - - block->next = block_number(block_cache->root.free); - block_cache->root.free = blockid; - ibex_block_dirty((struct _block *)block); -} - -/** - * ibex_block_get: - * @block_cache: - * - * Allocate a new block, or access a previously freed block and return - * its block id. The block will have zeroed contents. - * - * Return value: 0 if there are no blocks left (disk full/read only - * file, etc). - **/ -blockid_t -ibex_block_get(struct _memcache *block_cache) -{ - struct _block *block; - blockid_t head; - - if (block_cache->root.free) { - head = block_cache->root.free; - block = ibex_block_read(block_cache, head); - block_cache->root.free = block_location(block->next); - } else { - /* TODO: check the block will fit first */ - /* TODO: no need to read this block, can allocate it manually (saves a syscall/read) */ - head = block_cache->root.roof; - block_cache->root.roof += BLOCK_SIZE; - block = ibex_block_read(block_cache, head); - } - - g_assert(head != 0); - - d(printf("new block = %d\n", head)); - block->next = 0; - block->used = 0; - ibex_block_dirty(block); - return head; -} diff --git a/libibex/block.h b/libibex/block.h deleted file mode 100644 index dbc8fd5ad8..0000000000 --- a/libibex/block.h +++ /dev/null @@ -1,114 +0,0 @@ - -/* public interfaces for block io routines */ - -#ifndef _BLOCK_H -#define _BLOCK_H - -/*#define IBEX_STATS*/ /* define to get/dump block access stats */ - -#include <glib.h> - -/* version of file format */ -#define IBEX_VERSION "ibx6" - -typedef guint32 nameid_t; -typedef guint32 blockid_t; - -#define BLOCK_BITS (8) -#define BLOCK_SIZE (1<<BLOCK_BITS) -#define CACHE_SIZE 256 /* blocks in disk cache */ - -/* root block */ -struct _root { - char version[4]; - - blockid_t free; /* list of free blocks */ - blockid_t roof; /* top of allocated space, everything below is in a free or used list */ - blockid_t tail; /* list of 'tail' blocks */ - - blockid_t words; /* root of words index */ - blockid_t names; /* root of names index */ - - char flags; /* state flags */ - - /* makes structure fill up to 1024 bytes */ - char dummy[1024 - (sizeof(char)*5) - (sizeof(blockid_t)*5)]; -}; - -#define IBEX_ROOT_SYNCF (1<<0) /* file is synced */ - -/* basic disk structure for (data) blocks */ -struct _block { - unsigned int next:32-BLOCK_BITS; /* next block */ - unsigned int used:BLOCK_BITS; /* number of elements used */ - - nameid_t bl_data[(BLOCK_SIZE-4)/4]; /* references */ -}; - -/* custom list structure, for a simple/efficient cache */ -struct _listnode { - struct _listnode *next; - struct _listnode *prev; -}; -struct _list { - struct _listnode *head; - struct _listnode *tail; - struct _listnode *tailpred; -}; - -void ibex_list_new(struct _list *v); -struct _listnode *ibex_list_addhead(struct _list *l, struct _listnode *n); -struct _listnode *ibex_list_addtail(struct _list *l, struct _listnode *n); -struct _listnode *ibex_list_remove(struct _listnode *n); - -/* in-memory structure for block cache */ -struct _memblock { - struct _memblock *next; - struct _memblock *prev; - - blockid_t block; - int flags; - - struct _block data; -}; -#define BLOCK_DIRTY (1<<0) - -struct _memcache { - struct _list nodes; - int count; /* nodes in cache */ - - GHashTable *index; /* blockid->memblock mapping */ - int fd; /* file fd */ - -#ifdef IBEX_STATS - GHashTable *stats; -#endif - struct _root root; /* root block */ - - /* temporary here */ - struct _IBEXWord *words; /* word index */ -}; - -#ifdef IBEX_STATS -struct _stat_info { - int read; - int write; - int cache_hit; - int cache_miss; -}; -#endif /* IBEX_STATS */ - -struct _memcache *ibex_block_cache_open(const char *name, int flags, int mode); -void ibex_block_cache_close(struct _memcache *block_cache); -void ibex_block_cache_sync(struct _memcache *block_cache); -void ibex_block_cache_flush(struct _memcache *block_cache); - -blockid_t ibex_block_get(struct _memcache *block_cache); -void ibex_block_free(struct _memcache *block_cache, blockid_t blockid); -void ibex_block_dirty(struct _block *block); -struct _block *ibex_block_read(struct _memcache *block_cache, blockid_t blockid); - -#define block_number(x) ((x)>>BLOCK_BITS) -#define block_location(x) ((x)<<BLOCK_BITS) - -#endif /* ! _BLOCK_H */ diff --git a/libibex/diskarray.c b/libibex/diskarray.c deleted file mode 100644 index 2664e0f6b5..0000000000 --- a/libibex/diskarray.c +++ /dev/null @@ -1,255 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* a disk based array storage class */ - - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> - -#include <glib.h> - -#include "block.h" -#include "index.h" - -#define d(x) -/*#define DEBUG*/ - -static struct _IBEXStore *disk_create(struct _memcache *bc); -static int disk_sync(struct _IBEXStore *store); -static int disk_close(struct _IBEXStore *store); - -static blockid_t disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data); -static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data); -static blockid_t disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data); -static void disk_free(struct _IBEXStore *store, blockid_t head); - -static gboolean disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data); -static GArray *disk_get(struct _IBEXStore *store, blockid_t head); - -struct _IBEXStoreClass ibex_diskarray_class = { - disk_create, disk_sync, disk_close, - disk_add, disk_add_list, - disk_remove, disk_free, - disk_find, disk_get -}; - -static struct _IBEXStore *disk_create(struct _memcache *bc) -{ - struct _IBEXStore *store; - - store = g_malloc0(sizeof(*store)); - store->klass = &ibex_diskarray_class; - store->blocks = bc; - - return store; -} - -static int disk_sync(struct _IBEXStore *store) -{ - /* no cache, nop */ - return 0; -} - -static int disk_close(struct _IBEXStore *store) -{ - g_free(store); - return 0; -} - -static blockid_t -disk_add(struct _IBEXStore *store, blockid_t head, nameid_t data) -{ - struct _block *block; - struct _block *newblock; - blockid_t new; - - if (head == 0) { - head = ibex_block_get(store->blocks); - } - block = ibex_block_read(store->blocks, head); - - d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next)); - - if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) { - d(printf("adding record into block %d %d\n", head, data)); - block->bl_data[block->used] = data; - block->used++; - ibex_block_dirty(block); - return head; - } else { - new = ibex_block_get(store->blocks); - newblock = ibex_block_read(store->blocks, new); - newblock->next = head; - newblock->bl_data[0] = data; - newblock->used = 1; - d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next)); - ibex_block_dirty(newblock); - return new; - } -} - -static blockid_t -disk_add_list(struct _IBEXStore *store, blockid_t head, GArray *data) -{ - struct _block *block; - struct _block *newblock; - blockid_t new; - int copied = 0; - int left, space, tocopy; - - if (head == 0) { - head = ibex_block_get(store->blocks); - } - block = ibex_block_read(store->blocks, head); - - while (copied < data->len) { - left = data->len - copied; - space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used; - if (space) { - tocopy = MIN(left, space); - memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); - block->used += tocopy; - ibex_block_dirty(block); - } else { - new = ibex_block_get(store->blocks); - newblock = ibex_block_read(store->blocks, new); - newblock->next = head; - tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0])); - memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); - newblock->used = tocopy; - block = newblock; - head = new; - ibex_block_dirty(newblock); - } - copied += tocopy; - } - return head; -} - -static blockid_t -disk_remove(struct _IBEXStore *store, blockid_t head, nameid_t data) -{ - blockid_t node = head; - - d(printf("removing %d from %d\n", data, head)); - while (node) { - struct _block *block = ibex_block_read(store->blocks, node); - int i; - - for (i=0;i<block->used;i++) { - if (block->bl_data[i] == data) { - struct _block *start = ibex_block_read(store->blocks, head); - - start->used--; - block->bl_data[i] = start->bl_data[start->used]; - if (start->used == 0) { - struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0); - blockid_t new; - - d(printf("dropping block %d, new head = %d\n", head, start->next)); - new = start->next; - start->next = root->free; - root->free = head; - head = new; - ibex_block_dirty((struct _block *)root); - } - ibex_block_dirty(block); - ibex_block_dirty(start); - return head; - } - } - node = block->next; - } - return head; -} - -static void disk_free(struct _IBEXStore *store, blockid_t head) -{ - blockid_t next; - struct _block *block; - - while (head) { - block = ibex_block_read(store->blocks, head); - next = block->next; - ibex_block_free(store->blocks, head); - head = next; - } -} - -static gboolean -disk_find(struct _IBEXStore *store, blockid_t head, nameid_t data) -{ - blockid_t node = head; - - d(printf("finding %d from %d\n", data, head)); - while (node) { - struct _block *block = ibex_block_read(store->blocks, node); - int i; - - for (i=0;i<block->used;i++) { - if (block->bl_data[i] == data) { - return TRUE; - } - } - node = block->next; - } - return FALSE; -} - -static GArray * -disk_get(struct _IBEXStore *store, blockid_t head) -{ - GArray *result = g_array_new(0, 0, sizeof(nameid_t)); - - while (head) { - struct _block *block = ibex_block_read(store->blocks, head); - - d(printf("getting data from block %d\n", head)); - - g_array_append_vals(result, block->bl_data, block->used); - head = block->next; - d(printf("next = %d\n", head)); - } - return result; -} - -void -ibex_diskarray_dump(struct _memcache *blocks, blockid_t head) -{ - blockid_t node = head; - - printf("dumping list %d\n", node); - while (node) { - struct _block *block = ibex_block_read(blocks, node); - int i; - - printf(" block %d used %d\n ", node, block->used); - for (i=0;i<block->used;i++) { - printf(" %d", block->bl_data[i]); - } - printf("\n"); - node = block->next; - } -} diff --git a/libibex/disktail.c b/libibex/disktail.c deleted file mode 100644 index 88f5ca6c55..0000000000 --- a/libibex/disktail.c +++ /dev/null @@ -1,818 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* a disk based array storage class that stores the tails of data lists - in common blocks */ - - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> - -#include <glib.h> -#include <string.h> - -#include <stdio.h> - -#include "block.h" -#include "index.h" - -#define d(x) -/*#define DEBUG*/ - -/* marker to define which root keys indicate a single length key */ -#define BLOCK_ONE (1<<BLOCK_BITS) - -/* tail blocks only contain tail data ... */ -/* and we pack it in, similar to the key data, only more compact */ -struct _tailblock { - unsigned int next:32-BLOCK_BITS; /* only needs to point to block numbers */ - unsigned int used:BLOCK_BITS; /* how many entries are used */ - union { - unsigned char offset[BLOCK_SIZE-4]; /* works upto blocksize of 1024 bytes */ - nameid_t data[(BLOCK_SIZE-4)/4]; - } tailblock_u; -}; -#define tb_offset tailblock_u.offset -#define tb_data tailblock_u.data - -/* map a tail index to a block index */ -#define TAIL_INDEX(b) ((b) & (BLOCK_SIZE-1)) -/* map a tail index to a block number */ -#define TAIL_BLOCK(b) ((b) & ~(BLOCK_SIZE-1)) -/* map a block + index to a tailid */ -#define TAIL_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1))) - -#define TAIL_THRESHOLD ((BLOCK_SIZE-8)/6) - -static struct _IBEXStore *disk_create(struct _memcache *bc); -static int disk_sync(struct _IBEXStore *store); -static int disk_close(struct _IBEXStore *store); - -static blockid_t disk_add(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); -static blockid_t disk_add_list(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data); -static blockid_t disk_remove(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); -static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail); - -static gboolean disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data); -static GArray *disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail); - -struct _IBEXStoreClass ibex_diskarray_class = { - disk_create, disk_sync, disk_close, - disk_add, disk_add_list, - disk_remove, disk_free, - disk_find, disk_get -}; - - -static int -tail_info(struct _tailblock *bucket, nameid_t tailid, blockid_t **startptr) -{ - blockid_t *start, *end; - int index; - - /* get start/end of area to zap */ - index = TAIL_INDEX(tailid); - start = &bucket->tb_data[bucket->tb_offset[index]]; - if (index == 0) { - end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]; - } else { - end = &bucket->tb_data[bucket->tb_offset[index-1]]; - } - if (startptr) - *startptr = start; - return end-start; -} - -/* compresses (or expand) the bucket entry, to the new size */ -static void -tail_compress(struct _tailblock *bucket, int index, int newsize) -{ - int i; - blockid_t *start, *end, *newstart; - - /* get start/end of area to zap */ - start = &bucket->tb_data[bucket->tb_offset[index]]; - if (index == 0) { - end = &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]; - } else { - end = &bucket->tb_data[bucket->tb_offset[index-1]]; - } - - if (end-start == newsize) - return; - - /* - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy - 0 20 25 - - newsize = 0 - end = 25 - newstart = 0 - start = 20 - - newstart+(end-start)-newsize = 5 - i = start-newstart - - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyyyyy - 0 20 25 - - newsize = 2 - end = 25 - newstart = 0 - start = 20 - - newstart+(end-start)-newsize = 3 - i = start-newstart + MIN(end-start, newsize)) = 22 - - XXXXXXXXXXXXXXXXXXXXXXXXXXXXX - 5 25 - newsize = 5 - end = 25 - start = 25 - newstart = 5 - - newstart+(end-start)-newsize = 0 - i = start-newstart = 20 - - XXXXXXXXXXXXXXXXXXXXXXXXXXXXXyy - 3 23 25 - newsize = 5 - end = 25 - start = 23 - newstart = 3 - - newstart+(end-start)-newsize = 0 - i = start-newstart + MIN(end-start, newsize) = 22 - - */ - - - /* fixup data */ - newstart = &bucket->tb_data[bucket->tb_offset[bucket->used-1]]; - - g_assert(newstart+(end-start)-newsize <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]); - g_assert(newstart + (start-newstart) + MIN(end-start, newsize) <= &bucket->tb_data[sizeof(bucket->tb_data)/sizeof(bucket->tb_data[0])]); - g_assert(newstart+(end-start)-newsize >= (blockid_t *) &bucket->tb_offset[bucket->used]); - g_assert(newstart + (start-newstart) + MIN(end-start, newsize) >= (blockid_t *) &bucket->tb_offset[bucket->used]); - - memmove(newstart+(end-start)-newsize, newstart, ((start-newstart)+MIN(end-start, newsize)) * sizeof(blockid_t)); - - /* fixup key pointers */ - for (i=index;i<bucket->used;i++) { - bucket->tb_offset[i] += (end-start)-newsize; - } - ibex_block_dirty((struct _block *)bucket); -} - -/* - returns the number of blockid's free -*/ -static int -tail_space(struct _tailblock *tail) -{ - if (tail->used == 0) - return sizeof(tail->tb_data)/sizeof(tail->tb_data[0])-1; - - return &tail->tb_data[tail->tb_offset[tail->used-1]] - - (blockid_t *)&tail->tb_offset[tail->used]; -} - -#if 0 -static void -tail_dump(struct _memcache *blocks, blockid_t tailid) -{ - int i; - struct _tailblock *tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid)); - - printf("Block %d, used %d\n", tailid, tail->used); - for (i=0;i<sizeof(struct _tailblock)/sizeof(unsigned int);i++) { - printf(" %08x", ((unsigned int *)tail)[i]); - } - printf("\n"); -} -#endif - -static blockid_t -tail_get(struct _memcache *blocks, int size) -{ - blockid_t tailid; - struct _tailblock *tail; - int freeindex; - blockid_t *end; - int i, count = 0; - - d(printf("looking for a tail node with %d items in it\n", size)); - - /* look for a node with enough space, if we dont find it fairly - quickly, just quit. needs a better free algorithm i think ... */ - tailid = blocks->root.tail; - while (tailid && count<5) { - int space; - - d(printf(" checking tail node %d\n", tailid)); - - tail = (struct _tailblock *)ibex_block_read(blocks, tailid); - - if (tail->used == 0) { - /* assume its big enough ... */ - tail->used = 1; - tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size; - d(printf("allocated %d (%d), used %d\n", tailid, tailid, tail->used)); - ibex_block_dirty((struct _block *)tail); - - g_assert(&tail->tb_offset[tail->used-1] - < (unsigned char *) &tail->tb_data[tail->tb_offset[tail->used-1]]); - - return tailid; - } - - g_assert(&tail->tb_offset[tail->used-1] - < (unsigned char *) &tail->tb_data[tail->tb_offset[tail->used-1]]); - - /* see if we have a free slot first */ - freeindex = -1; - end = &tail->tb_data[sizeof(tail->tb_data)/sizeof(tail->tb_data[0])]; - for (i=0;i<tail->used;i++) { - if (end == &tail->tb_data[tail->tb_offset[i]]) { - freeindex = i; - break; - } - end = &tail->tb_data[tail->tb_offset[i]]; - } - - /* determine how much space we have available - incl any extra header we might need */ - space = ((char *)&tail->tb_data[tail->tb_offset[tail->used-1]]) - - ((char *)&tail->tb_offset[tail->used]) - /* FIXMEL work out why this is out a little bit */ - - 8; - if (freeindex == -1) - space -= sizeof(tail->tb_offset[0]); - - /* if we have enough, set it up, creating a new entry if necessary */ - /* for some really odd reason the compiler promotes this expression to unsigned, hence - the requirement for the space>0 check ... */ - if (space>0 && space > size*sizeof(blockid_t)) { - d(printf("space = %d, size = %d size*sizeof() = %d truth = %d\n", space, size, size*sizeof(blockid_t), space>size*sizeof(blockid_t))); - if (freeindex == -1) { - freeindex = tail->used; - tail->tb_offset[tail->used] = tail->tb_offset[tail->used-1]; - tail->used++; - } - tail_compress(tail, freeindex, size); - ibex_block_dirty((struct _block *)tail); - d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, freeindex), tail->used)); - return TAIL_KEY(tailid, freeindex); - } - count++; - tailid = block_location(tail->next); - } - - d(printf("allocating new data node for tail data\n")); - tailid = ibex_block_get(blocks); - tail = (struct _tailblock *)ibex_block_read(blocks, tailid); - tail->next = block_number(blocks->root.tail); - blocks->root.tail = tailid; - tail->used = 1; - tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size; - ibex_block_dirty((struct _block *)tail); - d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, 0), tail->used)); - - g_assert(&tail->tb_offset[tail->used-1] - < (unsigned char *) &tail->tb_data[tail->tb_offset[tail->used-1]]); - - return TAIL_KEY(tailid, 0); -} - -static void -tail_free(struct _memcache *blocks, blockid_t tailid) -{ - struct _tailblock *tail; - - d(printf("freeing tail id %d\n", tailid)); - - if (tailid == 0) - return; - - tail = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tailid)); - d(printf(" tail %d used %d\n", tailid, tail->used)); - g_assert(tail->used); - if (TAIL_INDEX(tailid) == tail->used - 1) { - tail->used --; - } else { - tail_compress(tail, TAIL_INDEX(tailid), 0); - } - ibex_block_dirty((struct _block *)tail); -} - -static struct _IBEXStore *disk_create(struct _memcache *bc) -{ - struct _IBEXStore *store; - - store = g_malloc0(sizeof(*store)); - store->klass = &ibex_diskarray_class; - store->blocks = bc; - - return store; -} - -static int disk_sync(struct _IBEXStore *store) -{ - /* no cache, nop */ - return 0; -} - -static int disk_close(struct _IBEXStore *store) -{ - g_free(store); - return 0; -} - -static blockid_t -disk_add(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data) -{ - struct _block *block; - struct _block *newblock; - blockid_t new, head = *headptr /*, tail = *tailptr*/; - - g_error("Inbimplemented"); - - if (head == 0) { - head = ibex_block_get(store->blocks); - } - block = ibex_block_read(store->blocks, head); - - d(printf("adding record %d to block %d (next = %d)\n", data, head, block->next)); - - if (block->used < sizeof(block->bl_data)/sizeof(block->bl_data[0])) { - (printf("adding record into block %d %d\n", head, data)); - block->bl_data[block->used] = data; - block->used++; - ibex_block_dirty(block); - return head; - } else { - new = ibex_block_get(store->blocks); - newblock = ibex_block_read(store->blocks, new); - newblock->next = block_number(head); - newblock->bl_data[0] = data; - newblock->used = 1; - d(printf("adding record into new %d %d, next =%d\n", new, data, newblock->next)); - ibex_block_dirty(newblock); - return new; - } -} - -static blockid_t -disk_add_blocks_internal(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data) -{ - blockid_t head = *headptr, tail = *tailptr, new; - int tocopy; - struct _tailblock *tailblock; - struct _block *block, *newblock; - int space, copied = 0, left; - - /* assumes this funciton is in control of any tail creation */ - g_assert(tail == 0); - - d(printf("Adding %d items to block list\n", data->len)); - - if (head == 0) { - head = ibex_block_get(store->blocks); - d(printf("allocating new head %d\n", head)); - } - block = ibex_block_read(store->blocks, head); - - /* ensure the first block is full before proceeding */ - space = sizeof(block->bl_data)/sizeof(block->bl_data[0]) - block->used; - if (space) { - tocopy = MIN(data->len, space); - memcpy(block->bl_data+block->used, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); - block->used += tocopy; - ibex_block_dirty(block); - copied = tocopy; - d(printf("copied %d items to left over last node\n", tocopy)); - } - - while (copied < data->len) { - left = data->len - copied; - /* do we drop the rest in a tail? */ - if (left < TAIL_THRESHOLD) { - d(printf("Storing remaining %d items in tail\n", left)); - tocopy = left; - new = tail_get(store->blocks, tocopy); - tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new)); - memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(new)]], - &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); - *tailptr = new; - } else { - new = ibex_block_get(store->blocks); - newblock = (struct _block *)ibex_block_read(store->blocks, new); - newblock->next = block_number(head); - tocopy = MIN(left, sizeof(block->bl_data)/sizeof(block->bl_data[0])); - d(printf("storing %d items in own block\n", tocopy)); - memcpy(newblock->bl_data, &g_array_index(data, blockid_t, copied), tocopy*sizeof(blockid_t)); - newblock->used = tocopy; - block = newblock; - head = new; - ibex_block_dirty(newblock); - } - copied += tocopy; - } - - *headptr = head; - return head; -} -/* - case 1: - no head, no tail, adding a lot of data - add blocks until the last, which goes in a tail node - case 2: - no head, no tail, adding a little data - just add a tail node - case 3: - no head, tail, total exceeds a block - merge as much as possible into new full blocks, then the remainder in the tail - case 4: - no head, tail, room in existing tail for data - add new data to tail node - case 5: - no head, tail, no room in existing tail for data - add a new tail node, copy data across, free old tail node -*/ - -static blockid_t -disk_add_list(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, GArray *data) -{ - blockid_t new, head = *headptr, tail = *tailptr, *start; - struct _tailblock *tailblock, *tailnew; - int len; - GArray *tmpdata = NULL; - - d(printf("adding %d items head=%d tail=%d\n", data->len, head, tail)); - if (data->len == 0) - return head; - - /* store length=1 data in the tail pointer */ - if (head == 0 && tail == 0 && data->len == 1) { - *headptr = BLOCK_ONE; - *tailptr = g_array_index(data, blockid_t, 0); - return BLOCK_ONE; - } - /* if we got length=1 data to append to, upgrade it to a real block list */ - if (head == BLOCK_ONE) { - tmpdata = g_array_new(0, 0, sizeof(blockid_t)); - g_array_append_vals(tmpdata, data->data, data->len); - g_array_append_val(tmpdata, tail); - head = *headptr = 0; - tail = *tailptr = 0; - } - - /* if we have no head, then check the tail */ - if (head == 0) { - if (tail == 0) { - if (data->len >= TAIL_THRESHOLD) { - /* add normally */ - head = disk_add_blocks_internal(store, headptr, tailptr, data); - } else { - /* else add to a tail block */ - new = tail_get(store->blocks, data->len); - d(printf("adding %d items to a tail block %d\n", data->len, new)); - tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new)); - memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]], - data->data, data->len*sizeof(blockid_t)); - *tailptr = new; - ibex_block_dirty((struct _block *)tailnew); - } - } else { - tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); - len = tail_info(tailblock, tail, &start); - /* case 3 */ - if (len + data->len >= TAIL_THRESHOLD) { - /* this is suboptimal, but should work - merge the tail data with - our new data, and write it out */ - if (tmpdata == NULL) { - tmpdata = g_array_new(0, 0, sizeof(blockid_t)); - g_array_append_vals(tmpdata, data->data, data->len); - } - g_array_append_vals(tmpdata, start, len); - *tailptr = 0; - tail_free(store->blocks, tail); - head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata); - } else if (tail_space(tailblock) >= data->len) { - /* can we just expand this in our node, or do we need a new one? */ - tail_compress(tailblock, TAIL_INDEX(tail), data->len + len); - memcpy(&tailblock->tb_data[tailblock->tb_offset[TAIL_INDEX(tail)] + len], - data->data, data->len * sizeof(blockid_t)); - ibex_block_dirty((struct _block *)tailblock); - } else { - /* we need to allocate a new tail node */ - new = tail_get(store->blocks, data->len + len); - /* and copy the data across */ - tailnew = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(new)); - memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]], - start, len*sizeof(blockid_t)); - memcpy(&tailnew->tb_data[tailnew->tb_offset[TAIL_INDEX(new)]+len], - data->data, data->len*sizeof(blockid_t)); - tail_free(store->blocks, tail); - ibex_block_dirty((struct _block *)tailnew); - *tailptr = new; - } - } - } else { - if (tail) { - /* read/merge the tail with the new data, rewrite out. - suboptimal, but it should be 'ok' ? */ - tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); - len = tail_info(tailblock, tail, &start); - tmpdata = g_array_new(0, 0, sizeof(blockid_t)); - g_array_append_vals(tmpdata, start, len); - g_array_append_vals(tmpdata, data->data, data->len); - *tailptr = 0; - tail_free(store->blocks, tail); - head = disk_add_blocks_internal(store, headptr, tailptr, tmpdata); - } else { - head = disk_add_blocks_internal(store, headptr, tailptr, data); - } - } - if (tmpdata) - g_array_free(tmpdata, TRUE); - - return head; -} - -static blockid_t -disk_remove(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, nameid_t data) -{ - blockid_t head = *headptr, node = *headptr, tail = *tailptr; - int i; - - /* special case for 1-item nodes */ - if (head == BLOCK_ONE) { - if (tail == data) { - *tailptr = 0; - *headptr = 0; - head = 0; - } - return head; - } - - d(printf("removing %d from %d\n", data, head)); - while (node) { - struct _block *block = ibex_block_read(store->blocks, node); - - for (i=0;i<block->used;i++) { - if (block->bl_data[i] == data) { - struct _block *start = ibex_block_read(store->blocks, head); - - d(printf("removing data from block %d\n ", head)); - - start->used--; - block->bl_data[i] = start->bl_data[start->used]; - if (start->used == 0) { - blockid_t new; - - d(printf("dropping block %d, new head = %d\n", head, start->next)); - new = block_location(start->next); - start->next = block_number(store->blocks->root.free); - store->blocks->root.free = head; - head = new; - } - ibex_block_dirty(block); - ibex_block_dirty(start); - *headptr = head; - return head; - } - } - node = block_location(block->next); - } - - if (tail) { - struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); - int len; - blockid_t *start; - - len = tail_info(tailblock, tail, &start); - for (i=0;i<len;i++) { - if (start[i] == data) { - for (;i<len-1;i++) - start[i] = start[i+1]; - if (len == 1) - *tailptr = 0; - if (len == 1 && (tailblock->used-1) == TAIL_INDEX(tail)) { - d(printf("dropping/unlinking tailblock %d (%d) used = %d\n", - TAIL_BLOCK(tail), tail, tailblock->used)); - tailblock->used--; - /* drop/unlink block? */ - ibex_block_dirty((struct _block *)tailblock); - *tailptr = 0; - } else { - tail_compress(tailblock, TAIL_INDEX(tail), len-1); - ibex_block_dirty((struct _block *)tailblock); - } - } - } - - } - - return head; -} - -static void disk_free(struct _IBEXStore *store, blockid_t head, blockid_t tail) -{ - blockid_t next; - struct _block *block; - - if (head == BLOCK_ONE) - return; - - while (head) { - d(printf("freeing block %d\n", head)); - block = ibex_block_read(store->blocks, head); - next = block_location(block->next); - ibex_block_free(store->blocks, head); - head = next; - } - if (tail) { - struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); - d(printf("freeing tail block %d (%d)\n", TAIL_BLOCK(tail), tail)); - tail_compress(tailblock, TAIL_INDEX(tail), 0); - ibex_block_dirty((struct _block *)tailblock); - } -} - -static gboolean -disk_find(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data) -{ - blockid_t node = head; - int i; - - d(printf("finding %d from %d\n", data, head)); - - if (head == BLOCK_ONE) - return data == tail; - - /* first check full blocks */ - while (node) { - struct _block *block = ibex_block_read(store->blocks, node); - - for (i=0;i<block->used;i++) { - if (block->bl_data[i] == data) { - return TRUE; - } - } - node = block_location(block->next); - } - - /* then check tail */ - if (tail) { - struct _tailblock *tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); - int len; - blockid_t *start; - - len = tail_info(tailblock, tail, &start); - for (i=0;i<len;i++) { - if (start[i] == data) - return TRUE; - } - } - - return FALSE; -} - -static GArray * -disk_get(struct _IBEXStore *store, blockid_t head, blockid_t tail) -{ - GArray *result = g_array_new(0, 0, sizeof(nameid_t)); - - if (head == BLOCK_ONE) { - g_array_append_val(result, tail); - return result; - } - - while (head) { - struct _block *block = ibex_block_read(store->blocks, head); - - d(printf("getting data from block %d\n", head)); - - g_array_append_vals(result, block->bl_data, block->used); - head = block_location(block->next); - d(printf("next = %d\n", head)); - } - - /* chuck on any tail data as well */ - if (tail) { - struct _tailblock *tailblock; - int len; - blockid_t *start; - - tailblock = (struct _tailblock *)ibex_block_read(store->blocks, TAIL_BLOCK(tail)); - len = tail_info(tailblock, tail, &start); - g_array_append_vals(result, start, len); - } - return result; -} - -void ibex_diskarray_dump(struct _memcache *blocks, blockid_t head, blockid_t tail); - -void -ibex_diskarray_dump(struct _memcache *blocks, blockid_t head, blockid_t tail) -{ - blockid_t node = head; - - printf("dumping list %d tail %d\n", node, tail); - if (head == BLOCK_ONE) { - printf(" 1 length index: %d\n", tail); - return; - } - - while (node) { - struct _block *block = ibex_block_read(blocks, node); - int i; - - printf(" block %d used %d\n ", node, block->used); - for (i=0;i<block->used;i++) { - printf(" %d", block->bl_data[i]); - } - printf("\n"); - node = block_location(block->next); - } - - printf("tail: "); - if (tail) { - struct _tailblock *tailblock; - int len; - blockid_t *start; - int i; - - tailblock = (struct _tailblock *)ibex_block_read(blocks, TAIL_BLOCK(tail)); - len = tail_info(tailblock, tail, &start); - for (i=0;i<len;i++) - printf(" %d", start[i]); - } - printf("\n"); -} - -#if 0 -int main(int argc, char **argv) -{ - struct _memcache *bc; - struct _IBEXStore *disk; - int i; - blockid_t data[100]; - GArray adata; - blockid_t head=0, tail=0; - - for (i=0;i<100;i++) { - data[i] = i; - } - - bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600); - disk = ibex_diskarray_class.create(bc); - - head = 0; - tail = 0; - adata.data = data; - adata.len = 70; - for (i=0;i<100;i++) { - printf("Loop %d\n", i); - ibex_diskarray_class.add_list(disk, &head, &tail, &adata); - ibex_diskarray_dump(bc, head, tail); - } - -#if 0 - for (i=1;i<100;i++) { - printf("inserting %d items\n", i); - adata.data = data; - adata.len = i; - head=0; - tail=0; - ibex_diskarray_class.add_list(disk, &head, &tail, &adata); - ibex_diskarray_dump(bc, head, tail); - } -#endif - ibex_diskarray_class.close(disk); - ibex_block_cache_close(bc); - return 0; -} - -#endif diff --git a/libibex/dumpindex.c b/libibex/dumpindex.c deleted file mode 100644 index 410a7083d6..0000000000 --- a/libibex/dumpindex.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - Dump the hash tables from an ibex file. - */ - -#include <stdio.h> -#include <stdlib.h> - -#include "ibex_internal.h" - -extern void ibex_hash_dump(struct _IBEXIndex *index); - -static void -index_iterate(struct _IBEXIndex *index) -{ - struct _IBEXCursor *idc; - int len; - char *key; - int total = 0, totallen = 0; - - idc = index->klass->get_cursor(index); - key = idc->klass->next_key(idc, &len); - while (len) { - total++; - totallen += len; - printf("%s\n", key); - g_free(key); - key = idc->klass->next_key(idc, &len); - } - g_free(key); - - idc->klass->close(idc); - - printf("Iterate Totals: %d items, total bytes %d\n", total, totallen); -} - -int main(int argc, char **argv) -{ - ibex *ib; - -#ifdef ENABLE_THREADS - g_thread_init(0); -#endif - - if (argc != 2) { - printf("Usage: %s ibexfile\n", argv[0]); - return 1; - } - ib = ibex_open(argv[1], O_RDONLY, 0); - if (ib == NULL) { - perror("Opening ibex file\n"); - return 1; - } - - ibex_hash_dump(ib->words->wordindex); - ibex_hash_dump(ib->words->nameindex); - - index_iterate(ib->words->wordindex); - index_iterate(ib->words->nameindex); - - ibex_close(ib); - - return 0; -} diff --git a/libibex/hash.c b/libibex/hash.c deleted file mode 100644 index f79ee8f55e..0000000000 --- a/libibex/hash.c +++ /dev/null @@ -1,859 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* hash based index mechanism */ - -#include <stdio.h> -#include <stdlib.h> - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include <string.h> - -#include "block.h" -#include "index.h" - -#define d(x) - -#define HASH_SIZE (1024) -#define KEY_THRESHOLD (sizeof(struct _hashkey) + 4) /* minimum number of free bytes we worry about - maintaining free blocks for */ -#define ARRAY_LEN(a) (sizeof(a)/sizeof(a[0])) - -typedef guint32 hashid_t; - -struct _HASHCursor { - struct _IBEXCursor cursor; - - hashid_t key; - hashid_t block; - unsigned int index; - unsigned int size; -}; - -static struct _IBEXIndex *hash_create(struct _memcache *bc, int size); -static struct _IBEXIndex *hash_open(struct _memcache *bc, blockid_t root); -static int hash_sync(struct _IBEXIndex *index); -static int hash_close(struct _IBEXIndex *index); - -static hashid_t hash_find(struct _IBEXIndex *index, const char *key, int keylen); -static void hash_remove(struct _IBEXIndex *index, const char *key, int keylen); -static hashid_t hash_insert(struct _IBEXIndex *index, const char *key, int keylen); -static char *hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len); -static void hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail); -static blockid_t hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail); -static struct _IBEXCursor *hash_get_cursor(struct _IBEXIndex *index); - -static struct _IBEXCursor *hash_cursor_create(struct _IBEXIndex *); -static void hash_cursor_close(struct _IBEXCursor *); -static guint32 hash_cursor_next(struct _IBEXCursor *); -static char *hash_cursor_next_key(struct _IBEXCursor *, int *keylenptr); - -struct _IBEXIndexClass ibex_hash_class = { - hash_create, hash_open, - hash_sync, hash_close, - hash_find, - hash_remove, - hash_insert, - hash_get_key, - hash_set_data_block, - hash_get_data_block, - hash_get_cursor, -}; - -struct _IBEXCursorClass ibex_hash_cursor_class = { - hash_cursor_close, - hash_cursor_next, - hash_cursor_next_key -}; - -/* the reason we have the tail here is that otherwise we need to - have a 32 bit blockid for the root node; which would make this - structure the same size anyway, with about 24 wasted bits */ -struct _hashkey { - blockid_t next; /* next in hash chain */ - blockid_t tail; - unsigned int root:32-BLOCK_BITS; - unsigned int keyoffset:BLOCK_BITS; -}; - -struct _hashblock { - unsigned int next:32-BLOCK_BITS; /* next block, linked list of all key blocks: block number */ - unsigned int used:BLOCK_BITS; /* number of elements used */ - union { - struct _hashkey keys[(BLOCK_SIZE-4)/sizeof(struct _hashkey)]; - char keydata[BLOCK_SIZE-4]; - } hashblock_u; -}; -#define hb_keys hashblock_u.keys -#define hb_keydata hashblock_u.keydata - -/* size of block overhead + 2 key block overhead */ -#define MAX_KEYLEN (BLOCK_SIZE - 4 - 12 - 12) - -/* root block for a hash index */ -struct _hashroot { - hashid_t free; /* free list */ - guint32 size; /* how big the hash table is */ - hashid_t keys; /* linked list of blocks */ - hashid_t table[(BLOCK_SIZE-8)/sizeof(hashid_t)]; /* pointers to blocks of pointers */ -}; - -struct _hashtableblock { - hashid_t buckets[BLOCK_SIZE/sizeof(hashid_t)]; -}; - -/* map a hash index to a block index */ -#define HASH_INDEX(b) ((b) & (BLOCK_SIZE-1)) -/* map a hash index to a block number */ -#define HASH_BLOCK(b) ((b) & ~(BLOCK_SIZE-1)) -/* map a block + index to a hash key */ -#define HASH_KEY(b, i) (((b) & ~(BLOCK_SIZE-1)) | ((i) & (BLOCK_SIZE-1))) - -/* taken from tdb/gdbm */ -static unsigned int hash_key(const unsigned char *key, int keylen) -{ - char *newkey; - newkey = alloca(keylen+1); - memcpy(newkey, key, keylen); - newkey[keylen]=0; - return g_str_hash(newkey); -#if 0 - unsigned int value; /* Used to compute the hash value. */ - unsigned int i; /* Used to cycle through random values. */ - - /* Set the initial value from the key size. */ - value = 0x238F13AF * keylen; - for (i=0; i < keylen; i++) { - value = (value + (key[i] << (i*5 % 24))); - } - - value = (1103515243 * value + 12345); - - return value; -#endif -} - -/* create a new hash table, return a pointer to its root block */ -static struct _IBEXIndex * -hash_create(struct _memcache *bc, int size) -{ - blockid_t root, block; - struct _hashroot *hashroot; - int i; - struct _hashtableblock *table; - struct _IBEXIndex *index; - - g_assert(size<=10240); - - d(printf("initialising hash table, size = %d\n", size)); - - index = g_malloc0(sizeof(*index)); - index->blocks = bc; - index->klass = &ibex_hash_class; - root = ibex_block_get(bc); - index->root = root; - d(printf(" root = %d\n", root)); - hashroot = (struct _hashroot *)ibex_block_read(bc, root); - hashroot->free = 0; - hashroot->size = size; - ibex_block_dirty((struct _block *)hashroot); - for (i=0;i<size/(BLOCK_SIZE/sizeof(blockid_t));i++) { - d(printf("initialising hash table index block %d\n", i)); - block = hashroot->table[i] = ibex_block_get(bc); - table = (struct _hashtableblock *)ibex_block_read(bc, block); - memset(table, 0, sizeof(table)); - ibex_block_dirty((struct _block *)table); - } - - return index; -} - -static struct _IBEXIndex * -hash_open(struct _memcache *bc, blockid_t root) -{ - struct _IBEXIndex *index; - - /* FIXME: check a 'magic', and the root for validity */ - - index = g_malloc0(sizeof(*index)); - index->blocks = bc; - index->root = root; - index->klass = &ibex_hash_class; - - return index; -} - - -static int hash_sync(struct _IBEXIndex *index) -{ - /* nop, index always synced on disk (at least, to blocks) */ - return 0; -} - -static int hash_close(struct _IBEXIndex *index) -{ -#ifdef INDEX_STAT - printf("Performed %d lookups, average %f depth\n", index->lookups, (double)index->lookup_total/index->lookups); -#endif - g_free(index); - return 0; -} - -/* get an iterator class */ -static struct _IBEXCursor *hash_get_cursor(struct _IBEXIndex *index) -{ - return hash_cursor_create(index); -} - -/* convert a hashbucket id into a name */ -static char * -hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len) -{ - struct _hashblock *bucket; - int ind; - char *ret, *start, *end; - - if (hashbucket == 0) { - if (len) - *len = 0; - return g_strdup(""); - } - - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); - ind = HASH_INDEX(hashbucket); - - g_assert(ind < bucket->used); - - start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset]; - if (ind == 0) { - end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; - } else { - end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset]; - } - - ret = g_malloc(end-start+1); - memcpy(ret, start, end-start); - ret[end-start]=0; - if (len) - *len = end-start; - return ret; -} - -/* sigh, this is fnugly code ... */ -static hashid_t -hash_find(struct _IBEXIndex *index, const char *key, int keylen) -{ - struct _hashroot *hashroot; - guint32 hash; - int hashentry; - blockid_t hashtable; - hashid_t hashbucket; - struct _hashtableblock *table; - - g_assert(index != 0); - g_assert(index->root != 0); - - d(printf("finding hash %.*s\n", keylen, key)); - - /* truncate the key to the maximum size */ - if (keylen > MAX_KEYLEN) - keylen = MAX_KEYLEN; - - hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); - - /* find the table containing this entry */ - hash = hash_key(key, keylen) % hashroot->size; - hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))]; - g_assert(hashtable != 0); - table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); - hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t)); - /* and its bucket */ - hashbucket = table->buckets[hashentry]; - -#ifdef INDEX_STAT - index->lookups++; -#endif - /* go down the bucket chain, reading each entry till we are done ... */ - while (hashbucket != 0) { - struct _hashblock *bucket; - char *start, *end; - int ind; - -#ifdef INDEX_STAT - index->lookup_total++; -#endif - - d(printf(" checking bucket %d\n", hashbucket)); - - /* get the real bucket id from the hashbucket id */ - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); - /* and get the key number within the block */ - ind = HASH_INDEX(hashbucket); - - g_assert(ind < bucket->used); - - start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset]; - if (ind == 0) { - end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; - } else { - end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset]; - } - if ( (end-start) == keylen - && memcmp(start, key, keylen) == 0) { - return hashbucket; - } - hashbucket = bucket->hb_keys[ind].next; - } - return 0; -} - -static int -hash_info(struct _hashblock *bucket, int index) -{ - char *start, *end; - - g_assert(index < bucket->used); - - start = &bucket->hb_keydata[bucket->hb_keys[index].keyoffset]; - if (index == 0) { - end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; - } else { - end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset]; - } - - return end-start; -} - - -/* TODO: get rid of hash_compress/remove and just have one a-la the disktail code */ - -/* compresses the bucket 'bucket', removing data - at index 'index' */ -static void -hash_compress(struct _hashblock *bucket, int index) -{ - int i; - char *start, *end, *newstart; - - /* get start/end of area to zap */ - start = &bucket->hb_keydata[bucket->hb_keys[index].keyoffset]; - if (index == 0) { - end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; - } else { - end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset]; - } - - if (start == end) - return; - - /* fixup data */ - newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset]; - memmove(newstart+(end-start), newstart, start-newstart); - - /* fixup key pointers */ - for (i=index;i<bucket->used;i++) { - bucket->hb_keys[i].keyoffset += (end-start); - } - ibex_block_dirty((struct _block *)bucket); -} - -/* make room 'len' for the key 'index' */ -/* assumes key 'index' is already empty (0 length) */ -static void -hash_expand(struct _hashblock *bucket, int index, int len) -{ - int i; - char *end, *newstart; - - /* get start/end of area to zap */ - if (index == 0) { - end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; - } else { - end = &bucket->hb_keydata[bucket->hb_keys[index-1].keyoffset]; - } - - /* fixup data */ - newstart = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset]; - memmove(newstart-len, newstart, end-newstart); - - /* fixup key pointers */ - for (i=index;i<bucket->used;i++) { - bucket->hb_keys[i].keyoffset -= len; - } - ibex_block_dirty((struct _block *)bucket); -} - -static void -hash_remove(struct _IBEXIndex *index, const char *key, int keylen) -{ - struct _hashroot *hashroot; - guint32 hash; - int hashentry; - blockid_t hashtable; - hashid_t hashbucket, hashprev; - struct _hashtableblock *table; - - g_assert(index != 0); - g_assert(index->root != 0); - - d(printf("removing hash %.*s\n", keylen, key)); - - /* truncate the key to the maximum size */ - if (keylen > MAX_KEYLEN) - keylen = MAX_KEYLEN; - - hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); - - /* find the table containing this entry */ - hash = hash_key(key, keylen) % hashroot->size; - hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))]; - table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); - hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t)); - /* and its bucket */ - hashbucket = table->buckets[hashentry]; - - /* go down the bucket chain, reading each entry till we are done ... */ - hashprev = 0; - while (hashbucket != 0) { - struct _hashblock *bucket; - char *start, *end; - int ind; - - d(printf(" checking bucket %d\n", hashbucket)); - - /* get the real bucket id from the hashbucket id */ - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); - /* and get the key number within the block */ - ind = HASH_INDEX(hashbucket); - - g_assert(ind < bucket->used); - - start = &bucket->hb_keydata[bucket->hb_keys[ind].keyoffset]; - if (ind == 0) { - end = &bucket->hb_keydata[sizeof(bucket->hb_keydata)/sizeof(bucket->hb_keydata[0])]; - } else { - end = &bucket->hb_keydata[bucket->hb_keys[ind-1].keyoffset]; - } - if ( (end-start) == keylen - && memcmp(start, key, keylen) == 0) { - struct _hashblock *prevbucket; - - if (hashprev == 0) { - /* unlink from hash chain */ - table->buckets[hashentry] = bucket->hb_keys[HASH_INDEX(hashbucket)].next; - /* link into free list */ - bucket->hb_keys[HASH_INDEX(hashbucket)].next = hashroot->free; - hashroot->free = hashbucket; - /* compress away data */ - hash_compress(bucket, HASH_INDEX(hashbucket)); - ibex_block_dirty((struct _block *)bucket); - ibex_block_dirty((struct _block *)table); - ibex_block_dirty((struct _block *)hashroot); - } else { - prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashprev)); - prevbucket->hb_keys[HASH_INDEX(hashprev)].next = - bucket->hb_keys[ind].next; - /* link into free list */ - bucket->hb_keys[ind].next = hashroot->free; - hashroot->free = hashbucket; - /* compress entry */ - hash_compress(bucket, ind); - ibex_block_dirty((struct _block *)bucket); - ibex_block_dirty((struct _block *)prevbucket); - ibex_block_dirty((struct _block *)hashroot); - } - return; - } - hashprev = hashbucket; - hashbucket = bucket->hb_keys[ind].next; - } -} - -/* set where the datablock is located */ -static void -hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail) -{ - struct _hashblock *bucket; - - d(printf("setting data block hash %d to %d tail %d\n", keyid, blockid, tail)); - - /* map to a block number */ - g_assert((blockid & (BLOCK_SIZE-1)) == 0); - blockid >>= BLOCK_BITS; - - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid)); - if (bucket->hb_keys[HASH_INDEX(keyid)].root != blockid - || bucket->hb_keys[HASH_INDEX(keyid)].tail != tail) { - bucket->hb_keys[HASH_INDEX(keyid)].tail = tail; - bucket->hb_keys[HASH_INDEX(keyid)].root = blockid; - ibex_block_dirty((struct _block *)bucket); - } -} - -static blockid_t -hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail) -{ - struct _hashblock *bucket; - - d(printf("getting data block hash %d\n", keyid)); - - if (keyid == 0) { - if (tail) - *tail = 0; - return 0; - } - - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyid)); - if (tail) - *tail = bucket->hb_keys[HASH_INDEX(keyid)].tail; - return bucket->hb_keys[HASH_INDEX(keyid)].root << BLOCK_BITS; -} - -static hashid_t -hash_insert(struct _IBEXIndex *index, const char *key, int keylen) -{ - struct _hashroot *hashroot; - guint32 hash; - int hashentry; - blockid_t hashtable; - hashid_t hashbucket, keybucket, keyprev, keyfree; - struct _hashtableblock *table; - struct _hashblock *bucket; - int count; - - g_assert(index != 0); - g_assert(index->root != 0); - - /* truncate the key to the maximum size */ - if (keylen > MAX_KEYLEN) - keylen = MAX_KEYLEN; - - d(printf("inserting hash %.*s\n", keylen, key)); - - hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); - - /* find the table containing this entry */ - hash = hash_key(key, keylen) % hashroot->size; - hashtable = hashroot->table[hash / (BLOCK_SIZE/sizeof(blockid_t))]; - table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); - hashentry = hash % (BLOCK_SIZE/sizeof(blockid_t)); - /* and its bucket */ - hashbucket = table->buckets[hashentry]; - - /* now look for a free slot, first try the free list */ - /* but dont try too hard if our key is just too long ... so just scan upto - 4 blocks, but if we dont find a space, tough ... */ - keybucket = hashroot->free; - keyprev = 0; - count = 0; - while (keybucket && count<4) { - int space; - - d(printf(" checking free %d\n", keybucket)); - - /* read the bucket containing this free key */ - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keybucket)); - - /* check if there is enough space for the key */ - space = &bucket->hb_keydata[bucket->hb_keys[bucket->used-1].keyoffset] - - (char *)&bucket->hb_keys[bucket->used]; - if (space >= keylen) { - hash_expand(bucket, HASH_INDEX(keybucket), keylen); - memcpy(&bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(keybucket)].keyoffset], - key, keylen); - - /* check if there is free space still in this node, and there are no other empty blocks */ - keyfree = bucket->hb_keys[HASH_INDEX(keybucket)].next; - if ((space-keylen) >= KEY_THRESHOLD) { - int i; - int head = ARRAY_LEN(bucket->hb_keydata); - int found = FALSE; - - for (i=0;i<bucket->used;i++) { - if (bucket->hb_keys[i].keyoffset == head) { - /* already have a free slot in this block, leave it */ - found = TRUE; - break; - } - head = bucket->hb_keys[i].keyoffset; - } - if (!found) { - /* we should link in a new free slot for this node */ - bucket->hb_keys[bucket->used].next = bucket->hb_keys[HASH_INDEX(keybucket)].next; - bucket->hb_keys[bucket->used].keyoffset = bucket->hb_keys[bucket->used-1].keyoffset; - keyfree = HASH_KEY(HASH_BLOCK(keybucket), bucket->used); - bucket->used++; - } - } - /* link 'keyfree' back to the parent ... */ - if (keyprev == 0) { - hashroot->free = keyfree; - ibex_block_dirty((struct _block *)hashroot); - } else { - struct _hashblock *prevbucket; - prevbucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(keyprev)); - prevbucket->hb_keys[HASH_INDEX(keyprev)].next = keyfree; - ibex_block_dirty((struct _block *)prevbucket); - } - - /* link into the hash chain */ - bucket->hb_keys[HASH_INDEX(keybucket)].next = hashbucket; - bucket->hb_keys[HASH_INDEX(keybucket)].root = 0; - bucket->hb_keys[HASH_INDEX(keybucket)].tail = 0; - table->buckets[hashentry] = keybucket; - ibex_block_dirty((struct _block *)table); - ibex_block_dirty((struct _block *)bucket); - - d(printf(" new key id %d\n", keybucket)); - d(printf(" new free id %d\n", hashroot->free)); - - return keybucket; - } - - count++; - keyprev = keybucket; - keybucket = bucket->hb_keys[HASH_INDEX(keybucket)].next; - } - - /* else create a new block ... */ - keybucket = ibex_block_get(index->blocks); - bucket = (struct _hashblock *)ibex_block_read(index->blocks, keybucket); - - d(printf("creating new key bucket %d\n", keybucket)); - - memset(bucket, 0, sizeof(*bucket)); - - bucket->used = 2; - /* first block, is the new key */ - bucket->hb_keys[0].keyoffset = ARRAY_LEN(bucket->hb_keydata) - keylen; - memcpy(&bucket->hb_keydata[bucket->hb_keys[0].keyoffset], key, keylen); - bucket->hb_keys[0].next = hashbucket; - bucket->hb_keys[0].root = 0; - bucket->hb_keys[0].tail = 0; - - table->buckets[hashentry] = HASH_KEY(keybucket, 0); - - /* next block is a free block, link into free list */ - bucket->hb_keys[1].keyoffset = bucket->hb_keys[0].keyoffset; - bucket->hb_keys[1].next = hashroot->free; - hashroot->free = HASH_KEY(keybucket, 1); - - /* link new block into keys list */ - bucket->next = block_number(hashroot->keys); - hashroot->keys = keybucket; - - ibex_block_dirty((struct _block *)hashroot); - ibex_block_dirty((struct _block *)table); - ibex_block_dirty((struct _block *)bucket); - - d(printf(" new key id %d\n", HASH_KEY(keybucket, 0))); - d(printf(" new free id %d\n", hashroot->free)); - - return HASH_KEY(keybucket, 0); -} - -/* hash cursor functions */ -static struct _IBEXCursor * -hash_cursor_create(struct _IBEXIndex *idx) -{ - struct _HASHCursor *idc; - struct _hashroot *hashroot; - - idc = g_malloc(sizeof(*idc)); - idc->cursor.klass = &ibex_hash_cursor_class; - idc->cursor.index = idx; - idc->key = 0; - idc->index = 0; - - hashroot = (struct _hashroot *)ibex_block_read(idx->blocks, idx->root); - idc->size = hashroot->size; - idc->block = hashroot->keys; - - return &idc->cursor; -} - -static void -hash_cursor_close(struct _IBEXCursor *idc) -{ - g_free(idc); -} - -static guint32 -hash_cursor_next(struct _IBEXCursor *idc) -{ - struct _HASHCursor *hc = (struct _HASHCursor *)idc; - struct _hashblock *bucket; - - while (hc->block != 0) { - bucket = (struct _hashblock *)ibex_block_read(idc->index->blocks, hc->block); - while (hc->index < bucket->used) { - if (hash_info(bucket, hc->index) > 0) { - hc->key = HASH_KEY(hc->block, hc->index); - hc->index++; - if (hc->index == bucket->used) { - hc->index = 0; - hc->block = block_location(bucket->next); - } - return hc->key; - } - hc->index++; - } - hc->index = 0; - hc->block = block_location(bucket->next); - } - - return hc->block; -} - -static char * -hash_cursor_next_key(struct _IBEXCursor *idc, int *keylenptr) -{ - /* TODO: this could be made slightly mroe efficient going to the structs direct. - but i'm lazy today */ - return idc->index->klass->get_key(idc->index, idc->klass->next(idc), keylenptr); -} - -/* debug */ -void ibex_hash_dump(struct _IBEXIndex *index); -static void ibex_hash_dump_rec(struct _IBEXIndex *index, int *words, int *wordslen); - -void ibex_hash_dump(struct _IBEXIndex *index) -{ - int words = 0, wordslen=0; - - ibex_hash_dump_rec(index, &words, &wordslen); - - printf("Total words = %d, bytes = %d, ave length = %f\n", words, wordslen, (double)wordslen/(double)words); -} - - -static void -ibex_hash_dump_rec(struct _IBEXIndex *index, int *words, int *wordslen) -{ - int i; - struct _hashtableblock *table; - struct _hashblock *bucket; - struct _hashroot *hashroot; - blockid_t hashtable; - hashid_t hashbucket; - extern void ibex_diskarray_dump(struct _memcache *blocks, blockid_t head, blockid_t tail); - - - printf("Walking hash tree:\n"); - hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); - for (i=0;i<hashroot->size;i++) { - printf("Hash table chain: %d\n", i); - hashtable = hashroot->table[i / (BLOCK_SIZE/sizeof(blockid_t))]; - table = (struct _hashtableblock *)ibex_block_read(index->blocks, hashtable); - hashbucket = table->buckets[i % (BLOCK_SIZE/sizeof(blockid_t))]; - while (hashbucket) { - int len; - - *words = *words + 1; - - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); - printf(" bucket %d: [used %d]", hashbucket, bucket->used); - if (HASH_INDEX(hashbucket) == 0) { - len = ARRAY_LEN(bucket->hb_keydata) - - bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset; - } else { - len = bucket->hb_keys[HASH_INDEX(hashbucket)-1].keyoffset - - bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset; - } - printf("'%.*s' = %d next=%d\n", len, &bucket->hb_keydata[bucket->hb_keys[HASH_INDEX(hashbucket)].keyoffset], - bucket->hb_keys[HASH_INDEX(hashbucket)].root, - bucket->hb_keys[HASH_INDEX(hashbucket)].next); - - *wordslen = *wordslen + len; - - ibex_diskarray_dump(index->blocks, - bucket->hb_keys[HASH_INDEX(hashbucket)].root << BLOCK_BITS, - bucket->hb_keys[HASH_INDEX(hashbucket)].tail); - - hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next; - } - /* make sure its still in the cache */ - hashroot = (struct _hashroot *)ibex_block_read(index->blocks, index->root); - } - - hashbucket = hashroot->free; - printf("Dumping free lists ..\n"); - while (hashbucket) { - printf(" %d", hashbucket); - bucket = (struct _hashblock *)ibex_block_read(index->blocks, HASH_BLOCK(hashbucket)); - hashbucket = bucket->hb_keys[HASH_INDEX(hashbucket)].next; - } - printf("\n"); -} - -#if 0 -int main(int argc, char **argv) -{ - struct _memcache *bc; - struct _IBEXIndex *hash; - int i; - - bc = ibex_block_cache_open("index.db", O_CREAT|O_RDWR, 0600); - hash = ibex_hash_class.create(bc, 1024); - for (i=0;i<10000;i++) { - char key[16]; - sprintf(key, "key %d", i); - ibex_hash_class.insert(hash, key, strlen(key)); - } - - for (i=500;i<1000;i++) { - char key[16]; - sprintf(key, "key %d", i); - ibex_hash_class.remove(hash, key, strlen(key)); - } - - for (i=500;i<1000;i++) { - char key[16]; - sprintf(key, "key %d", i); - ibex_hash_class.insert(hash, key, strlen(key)); - } - - - ibex_hash_dump(hash); - - for (i=0;i<2000;i++) { - char key[16], *lookup; - hashid_t keyid; - blockid_t root, tail; - - sprintf(key, "key %d", i); - keyid = ibex_hash_class.find(hash, key, strlen(key)); - lookup = ibex_hash_class.get_key(hash, keyid, 0); - root = ibex_hash_class.get_data(hash, keyid, &tail); - printf("key %s = %d = '%s' root:%d tail:%d \n", key, keyid, lookup, root, tail); - g_free(lookup); - } - - ibex_hash_class.close(hash); - ibex_block_cache_close(bc); - return 0; -} - -#endif diff --git a/libibex/ibex.h b/libibex/ibex.h deleted file mode 100644 index 03635e275d..0000000000 --- a/libibex/ibex.h +++ /dev/null @@ -1,106 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -#ifndef IBEX_H -#define IBEX_H - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <glib.h> - -struct ibex; -typedef struct ibex ibex; - -/* All functions that can fail set errno and return NULL or -1 on - * failure. - */ - -/* Open the named ibex index file. If CREATE is true, create the file - * if it doesn't already exist. - */ -ibex *ibex_open (char *file, int flags, int mode); - -/* Write the ibex to disk. */ -int ibex_write (ibex *ib); - -/* only save if ibex has changed. */ -int ibex_save (ibex *ib); - -/* Write the ibex to disk if it has changed, and free all memory - * associated with it. - */ -int ibex_close (ibex *ib); - - - -/* Index the named file. (If the FILENAME is already in the index, - * remove the old copy. - */ -int ibex_index_file (ibex *ib, char *filename); - -/* Index LEN bytes off FD, using NAME as the filename in the index. - * (If NAME already exists in the index, this adds more data to it.) - */ -int ibex_index_fd (ibex *ib, char *name, int fd, size_t len); - -/* Like ibex_index_fd, but with a buffer rather than a file descriptor. - * The buffer does not need to be '\0'-terminated. If UNREAD is not - * NULL, then the indexer won't assume that the buffer ends on a word - * boundary, and will return (in UNREAD) the number of bytes from the - * end of the buffer that it didn't use, if any. - */ -int ibex_index_buffer (ibex *ib, char *name, char *buffer, - size_t len, size_t *unread); - -/* Remove entries for a given file from the index. (Most of the removal - * isn't actually done until the file is written out to disk, so this - * is very fast.) - */ -void ibex_unindex (ibex *ib, char *name); - -/* Rename a file in the index. (This is also fast.) */ -void ibex_rename (ibex *ib, char *oldfilename, char *newfilename); - - - -/* Find a word in the index. Returns an array of strings: the caller - * should free the array, but should not free or modify the strings. - */ -GPtrArray *ibex_find (ibex *ib, char *word); - -/* Find if a word is contained in a specific name reference. - */ -gboolean ibex_find_name (ibex *ib, char *name, char *word); - -/* has a file been indexed? - */ -gboolean ibex_contains_name(ibex *ib, char *name); - -/* Return all the files containing all of the words in the given - * array. Returned data is like with ibex_find. - */ -GPtrArray *ibex_find_all (ibex *ib, GPtrArray *words); - -/* Debug function, dumps the whole index tree, showing removed files as well */ -void ibex_dump_all (ibex *ib); - -#endif /* ! IBEX_H */ - diff --git a/libibex/ibex_block.c b/libibex/ibex_block.c deleted file mode 100644 index faeee232ac..0000000000 --- a/libibex/ibex_block.c +++ /dev/null @@ -1,345 +0,0 @@ - -#include <glib.h> - -#include <stdio.h> -#include <unicode.h> -#include <ctype.h> -#include <string.h> -#include <errno.h> - -#include "ibex_internal.h" - -#define d(x) - -static signed char utf8_trans[] = { - 'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I', - 'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U', - 'U', 'Y', -3, -4, 'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e', - 'e', 'e', 'i', 'i', 'i', 'i', -6, 'n', 'o', 'o', 'o', 'o', 'o', '/', - 'o', 'u', 'u', 'u', 'u', 'y', -7, 'y', 'A', 'a', 'A', 'a', 'A', 'a', - 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e', - 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', - 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', - 'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', - 'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11, - 'O', 'o', 'O', 'o', 'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r', - 'S', 'r', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', - 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w', - 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's' -}; - -static char *utf8_long_trans[] = { - "AE", "TH", "TH", "ss", "ae", "th", "th", "IJ", "ij", - "NG", "ng", "OE", "oe" -}; - -/* This is a bit weird. It takes pointers to the start and end (actually - * just past the end) of a UTF-8-encoded word, and a buffer at least 1 - * byte longer than the length of the word. It copies the word into the - * buffer in all lowercase without accents, and splits up ligatures. - * (Since any ligature would be a multi-byte character in UTF-8, splitting - * them into two US-ASCII characters won't overrun the buffer.) - * - * It is not safe to call this routine with bad UTF-8. - */ -static void -ibex_normalise_word(char *start, char *end, char *buf) -{ - unsigned char *s, *d; - unicode_char_t uc; - - s = (unsigned char *)start; - d = (unsigned char *)buf; - while (s < (unsigned char *)end) { - if (*s < 0x80) { - /* US-ASCII character: copy unless it's - * an apostrophe. - */ - if (*s != '\'') - *d++ = tolower (*s); - s++; - } else { - char *next = unicode_get_utf8 (s, &uc); - if (uc >= 0xc0 && uc < 0xc0 + sizeof (utf8_trans)) { - signed char ch = utf8_trans[uc - 0xc0]; - if (ch > 0) - *d++ = tolower (ch); - else { - *d++ = tolower (utf8_long_trans[-ch - 1][0]); - *d++ = tolower (utf8_long_trans[-ch - 1][1]); - } - s = next; - } else { - while (s < (unsigned char *)next) - *d++ = *s++; - } - } - } - *d = '\0'; -} - -enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE }; - -/* This incorporates parts of libunicode, because there's no way to - * force libunicode to not read past a certain point. - */ -static int -utf8_category (char *sp, char **snp, char *send) -{ - unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp; - unsigned char *end = (unsigned char *)send; - - if (isascii (*p)) { - *np = p + 1; - if (isalpha (*p) || *p == '\'') - return IBEX_ALPHA; - return IBEX_NONALPHA; - } else { - unicode_char_t uc; - int more; - - if ((*p & 0xe0) == 0xc0) { - more = 1; - uc = *p & 0x1f; - } else if ((*p & 0xf0) == 0xe0) { - more = 2; - uc = *p & 0x0f; - } else if ((*p & 0xf8) == 0xf0) { - more = 3; - uc = *p & 0x07; - } else if ((*p & 0xfc) == 0xf8) { - more = 4; - uc = *p & 0x03; - } else if ((*p & 0xfe) == 0xfc) { - more = 5; - uc = *p & 0x01; - } else - return IBEX_INVALID; - - if (p + more > end) - return IBEX_INCOMPLETE; - - while (more--) { - if ((*++p & 0xc0) != 0x80) - return IBEX_INVALID; - uc <<= 6; - uc |= *p & 0x3f; - } - - *np = p + 1; - if (unicode_isalpha (uc)) - return IBEX_ALPHA; - else - return IBEX_NONALPHA; - } -} - -/** - * ibex_index_buffer: the lowest-level ibex indexing interface - * @ib: an ibex - * @name: the name of the file being indexed - * @buffer: a buffer containing data from the file - * @len: the length of @buffer - * @unread: an output argument containing the number of unread bytes - * - * This routine indexes up to @len bytes from @buffer into @ib. - * If @unread is NULL, the indexer assumes that the buffer ends on a - * word boundary, and will index all the way to the end of the - * buffer. If @unread is not NULL, and the buffer ends with an - * alphabetic character, the indexer will assume that the buffer has - * been cut off in the middle of a word, and return the number of - * un-indexed bytes at the end of the buffer in *@unread. The caller - * should then read in more data through whatever means it has - * and pass in the unread bytes from the original buffer, followed - * by the new data, on its next call. - * - * Return value: 0 on success, -1 on failure. - **/ -int -ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread) -{ - char *p, *q, *nq, *end, *word; - int wordsiz, cat = 0; - GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal); - GPtrArray *wordlist = g_ptr_array_new(); - int i, ret=-1; - - if (unread) - *unread = 0; - - end = buffer + len; - wordsiz = 20; - word = g_malloc (wordsiz); - - p = buffer; - while (p < end) { - while (p < end) { - cat = utf8_category (p, &q, end); - if (cat != IBEX_NONALPHA) - break; - p = q; - } - if (p == end) { - goto done; - } else if (cat == IBEX_INVALID) { - goto error; - } else if (cat == IBEX_INCOMPLETE) - q = end; - - while (q < end) { - cat = utf8_category (q, &nq, end); - if (cat != IBEX_ALPHA) - break; - q = nq; - } - if (cat == IBEX_INVALID || - (cat == IBEX_INCOMPLETE && !unread)) { - goto error; - } else if (cat == IBEX_INCOMPLETE || (q == end && unread)) { - *unread = end - p; - goto done; - } - - if (wordsiz < q - p + 1) { - wordsiz = q - p + 1; - word = g_realloc (word, wordsiz); - } - ibex_normalise_word (p, q, word); - if (word[0]) { - if (g_hash_table_lookup(words, word) == 0) { - char *newword = g_strdup(word); - g_ptr_array_add(wordlist, newword); - g_hash_table_insert(words, newword, name); - } - } - p = q; - } -done: - IBEX_LOCK(ib); - - d(printf("name %s count %d size %d\n", name, wordlist->len, len)); - if (!ib->predone) { - ib->words->klass->index_pre(ib->words); - ib->predone = TRUE; - } - ib->words->klass->add_list(ib->words, name, wordlist); - - IBEX_UNLOCK(ib); - - ret = 0; -error: - for (i=0;i<wordlist->len;i++) - g_free(wordlist->pdata[i]); - g_ptr_array_free(wordlist, TRUE); - g_hash_table_destroy(words); - g_free (word); - return ret; -} - - -ibex *ibex_open (char *file, int flags, int mode) -{ - ibex *ib; - - ib = g_malloc0(sizeof(*ib)); - ib->blocks = ibex_block_cache_open(file, flags, mode); - if (ib->blocks == 0) { - g_warning("create: Error occured?: %s\n", strerror(errno)); - g_free(ib); - return NULL; - } - /* FIXME: the blockcache or the wordindex needs to manage the other one */ - ib->words = ib->blocks->words; - -#ifdef ENABLE_THREADS - ib->lock = g_mutex_new(); -#endif - return ib; -} - -int ibex_save (ibex *ib) -{ - d(printf("syncing database\n")); - - IBEX_LOCK(ib); - - if (ib->predone) { - ib->words->klass->index_post(ib->words); - ib->predone = FALSE; - } - ib->words->klass->sync(ib->words); - /* FIXME: some return */ - ibex_block_cache_sync(ib->blocks); - - IBEX_UNLOCK(ib); - - return 0; -} - -int ibex_close (ibex *ib) -{ - int ret = 0; - - d(printf("closing database\n")); - - if (ib->predone) { - ib->words->klass->index_post(ib->words); - ib->predone = FALSE; - } - - ib->words->klass->close(ib->words); - ibex_block_cache_close(ib->blocks); -#ifdef ENABLE_THREADS - g_mutex_free(ib->lock); -#endif - g_free(ib); - return ret; -} - -void ibex_unindex (ibex *ib, char *name) -{ - d(printf("trying to unindex '%s'\n", name)); - IBEX_LOCK(ib); - ib->words->klass->unindex_name(ib->words, name); - IBEX_UNLOCK(ib); -} - -GPtrArray *ibex_find (ibex *ib, char *word) -{ - char *normal; - int len; - GPtrArray *ret; - - len = strlen(word); - normal = alloca(len+1); - ibex_normalise_word(word, word+len, normal); - IBEX_LOCK(ib); - ret = ib->words->klass->find(ib->words, normal); - IBEX_UNLOCK(ib); - return ret; -} - -gboolean ibex_find_name (ibex *ib, char *name, char *word) -{ - char *normal; - int len; - gboolean ret; - - len = strlen(word); - normal = alloca(len+1); - ibex_normalise_word(word, word+len, normal); - IBEX_LOCK(ib); - ret = ib->words->klass->find_name(ib->words, name, normal); - IBEX_UNLOCK(ib); - return ret; -} - -gboolean ibex_contains_name(ibex *ib, char *name) -{ - gboolean ret; - - IBEX_LOCK(ib); - ret = ib->words->klass->contains_name(ib->words, name); - IBEX_UNLOCK(ib); - return ret; -} diff --git a/libibex/ibex_db.c b/libibex/ibex_db.c deleted file mode 100644 index 1ef0b3d1bc..0000000000 --- a/libibex/ibex_db.c +++ /dev/null @@ -1,512 +0,0 @@ - -#include <glib.h> - -#include <db.h> -#include <stdio.h> -#include <unicode.h> -#include <ctype.h> - -#include "ibex_internal.h" - -#define d(x) - -/* - Uses 2 databases: - - names - HASH - lists which files are stored - words - BTREE - index words to files - -*/ - -static int -db_delete_name(DB *db, const char *name) -{ - DBC *cursor; - int err, namelen; - DBT key, data; - - printf("called to delete name %s\n", name); - return 0; - - err = db->cursor(db, NULL, &cursor, 0); - if (err != 0) { - printf("Error occured?: %s\n", db_strerror(err)); - return err; - } - - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - namelen = strlen(name); - - err = cursor->c_get(cursor, &key, &data, DB_FIRST); - while (err == 0) { - if (data.size == namelen && memcmp(data.data, name, namelen) == 0) { - d(printf("Found match, removing it\n")); - cursor->c_del(cursor, 0); - } else { - data.size = namelen; - data.data = (void *)name; - if (cursor->c_get(cursor, &key, &data, DB_GET_BOTH) == 0) { - d(printf("seek to match, removing it\n")); - cursor->c_del(cursor, 0); - } else { - d(printf("seek to match, not found\n")); - } - } - err = cursor->c_get(cursor, &key, &data, DB_NEXT_NODUP); - } - - cursor->c_close(cursor); - - return 0; -} - -static int -db_index_words(DB *db, char *name, char **words) -{ - DBT key, data; - int err = 0; - char *word; - - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - data.data = name; - data.size = strlen(name); - for (;(word=*words);words++) { - /* normalise word first ... */ - key.data = word; - key.size = strlen(word); - - err = db->put(db, NULL, &key, &data, 0); - if (err != 0) - printf("Error occured?: %s\n", db_strerror(err)); - } - - return err; -} - -static int -db_index_word(DB *db, char *name, char *word) -{ - DBT key, data; - int err = 0; - - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - data.data = name; - data.size = strlen(name); - key.data = word; - key.size = strlen(word); - - err = db->put(db, NULL, &key, &data, 0); - if (err != 0) - printf("Error occured?: %s\n", db_strerror(err)); - - return err; -} - -static GPtrArray * -db_find(DB *db, char *word) -{ - DBT key, data; - int err; - DBC *cursor; - GPtrArray *matches = g_ptr_array_new(); - - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - - err = db->cursor(db, NULL, &cursor, 0); - if (err != 0) { - printf("Error occured?: %s\n", db_strerror(err)); - return matches; - } - - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - - key.size = strlen(word); - key.data = word; - - err = cursor->c_get(cursor, &key, &data, DB_SET); - while (err == 0) { - char *name; - name = g_malloc(data.size+1); - memcpy(name, data.data, data.size); - name[data.size] = '\0'; - g_ptr_array_add(matches, name); - err = cursor->c_get(cursor, &key, &data, DB_NEXT_DUP); - } - if (err != DB_NOTFOUND) { - printf("Error occured?: %s\n", db_strerror(err)); - /* what to do? doesn't matter too much its just a search ... */ - } - - cursor->c_close(cursor); - - return matches; -} - -/* check that db contains key @word, optionally with contents @name */ -static gboolean -db_contains_word(DB *db, char *name, char *word) -{ - DBT key, data; - - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - - if (name != NULL) { - data.size = strlen(name); - data.data = name; - } - key.size = strlen(word); - key.data = word; - - return db->get(db, NULL, &key, &data, name?DB_GET_BOTH:DB_SET) == 0; -} - -static int -db_delete_word(DB *db, char *word) -{ - DBT key; - - memset(&key, 0, sizeof(key)); - key.size = strlen(word); - key.data = word; - return db->del(db, NULL, &key, 0); -} - - -static signed char utf8_trans[] = { - 'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I', - 'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U', - 'U', 'Y', -3, -4, 'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e', - 'e', 'e', 'i', 'i', 'i', 'i', -6, 'n', 'o', 'o', 'o', 'o', 'o', '/', - 'o', 'u', 'u', 'u', 'u', 'y', -7, 'y', 'A', 'a', 'A', 'a', 'A', 'a', - 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e', - 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', - 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', - 'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', - 'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11, - 'O', 'o', 'O', 'o', 'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r', - 'S', 'r', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', - 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w', - 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's' -}; - -static char *utf8_long_trans[] = { - "AE", "TH", "TH", "ss", "ae", "th", "th", "IJ", "ij", - "NG", "ng", "OE", "oe" -}; - -/* This is a bit weird. It takes pointers to the start and end (actually - * just past the end) of a UTF-8-encoded word, and a buffer at least 1 - * byte longer than the length of the word. It copies the word into the - * buffer in all lowercase without accents, and splits up ligatures. - * (Since any ligature would be a multi-byte character in UTF-8, splitting - * them into two US-ASCII characters won't overrun the buffer.) - * - * It is not safe to call this routine with bad UTF-8. - */ -static void -ibex_normalise_word(char *start, char *end, char *buf) -{ - unsigned char *s, *d; - unicode_char_t uc; - - s = (unsigned char *)start; - d = (unsigned char *)buf; - while (s < (unsigned char *)end) { - if (*s < 0x80) { - /* US-ASCII character: copy unless it's - * an apostrophe. - */ - if (*s != '\'') - *d++ = tolower (*s); - s++; - } else { - char *next = unicode_get_utf8 (s, &uc); - if (uc >= 0xc0 && uc < 0xc0 + sizeof (utf8_trans)) { - signed char ch = utf8_trans[uc - 0xc0]; - if (ch > 0) - *d++ = tolower (ch); - else { - *d++ = tolower (utf8_long_trans[-ch - 1][0]); - *d++ = tolower (utf8_long_trans[-ch - 1][1]); - } - s = next; - } else { - while (s < (unsigned char *)next) - *d++ = *s++; - } - } - } - *d = '\0'; -} - -enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE }; - -/* This incorporates parts of libunicode, because there's no way to - * force libunicode to not read past a certain point. - */ -static int -utf8_category (char *sp, char **snp, char *send) -{ - unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp; - unsigned char *end = (unsigned char *)send; - - if (isascii (*p)) { - *np = p + 1; - if (isalpha (*p) || *p == '\'') - return IBEX_ALPHA; - return IBEX_NONALPHA; - } else { - unicode_char_t uc; - int more; - - if ((*p & 0xe0) == 0xc0) { - more = 1; - uc = *p & 0x1f; - } else if ((*p & 0xf0) == 0xe0) { - more = 2; - uc = *p & 0x0f; - } else if ((*p & 0xf8) == 0xf0) { - more = 3; - uc = *p & 0x07; - } else if ((*p & 0xfc) == 0xf8) { - more = 4; - uc = *p & 0x03; - } else if ((*p & 0xfe) == 0xfc) { - more = 5; - uc = *p & 0x01; - } else - return IBEX_INVALID; - - if (p + more > end) - return IBEX_INCOMPLETE; - - while (more--) { - if ((*++p & 0xc0) != 0x80) - return IBEX_INVALID; - uc <<= 6; - uc |= *p & 0x3f; - } - - *np = p + 1; - if (unicode_isalpha (uc)) - return IBEX_ALPHA; - else - return IBEX_NONALPHA; - } -} - -static void -do_insert_words(char *key, char *name, ibex *ib) -{ - db_index_word(ib->words, name, key); - g_free(key); -} - -static void -do_free_words(char *key, char *name, void *data) -{ - g_free(key); -} - -/** - * ibex_index_buffer: the lowest-level ibex indexing interface - * @ib: an ibex - * @name: the name of the file being indexed - * @buffer: a buffer containing data from the file - * @len: the length of @buffer - * @unread: an output argument containing the number of unread bytes - * - * This routine indexes up to @len bytes from @buffer into @ib. - * If @unread is NULL, the indexer assumes that the buffer ends on a - * word boundary, and will index all the way to the end of the - * buffer. If @unread is not NULL, and the buffer ends with an - * alphabetic character, the indexer will assume that the buffer has - * been cut off in the middle of a word, and return the number of - * un-indexed bytes at the end of the buffer in *@unread. The caller - * should then read in more data through whatever means it has - * and pass in the unread bytes from the original buffer, followed - * by the new data, on its next call. - * - * Return value: 0 on success, -1 on failure. - **/ -int -ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unread) -{ - char *p, *q, *nq, *end, *word; - int wordsiz, cat; - GHashTable *words = g_hash_table_new(g_str_hash, g_str_equal); - - if (unread) - *unread = 0; - - end = buffer + len; - wordsiz = 20; - word = g_malloc (wordsiz); - - p = buffer; - while (p < end) { - while (p < end) { - cat = utf8_category (p, &q, end); - if (cat != IBEX_NONALPHA) - break; - p = q; - } - if (p == end) { - goto done; - } else if (cat == IBEX_INVALID) { - goto error; - } else if (cat == IBEX_INCOMPLETE) - q = end; - - while (q < end) { - cat = utf8_category (q, &nq, end); - if (cat != IBEX_ALPHA) - break; - q = nq; - } - if (cat == IBEX_INVALID || - (cat == IBEX_INCOMPLETE && !unread)) { - goto error; - } else if (cat == IBEX_INCOMPLETE || (q == end && unread)) { - *unread = end - p; - goto done; - } - - if (wordsiz < q - p + 1) { - wordsiz = q - p + 1; - word = g_realloc (word, wordsiz); - } - ibex_normalise_word (p, q, word); - if (word[0]) { - if (g_hash_table_lookup(words, word) == 0) { - g_hash_table_insert(words, g_strdup(word), name); - } - } - p = q; - } -done: - /* FIXME: do this and inserts within a transaction */ - if (!db_contains_word(ib->names, NULL, name)) { - printf("adding '%s' to database\n", name); - db_index_word(ib->names, "", name); - } - g_hash_table_foreach(words, (GHFunc)do_insert_words, ib); - g_hash_table_destroy(words); - g_free (word); - return 0; -error: - g_hash_table_foreach(words, (GHFunc)do_free_words, NULL); - g_hash_table_destroy(words); - g_free (word); - return -1; -} - - -ibex *ibex_open (char *file, int flags, int mode) -{ - ibex *ib; - u_int32_t dbflags = 0; - int err; - - ib = g_malloc0(sizeof(*ib)); - err = db_create(&ib->words, NULL, 0); - if (err != 0) { - g_warning("create: Error occured?: %s\n", db_strerror(err)); - g_free(ib); - return NULL; - } - - err = ib->words->set_flags(ib->words, DB_DUP); - if (err != 0) { - g_warning("set flags: Error occured?: %s\n", db_strerror(err)); - ib->words->close(ib->words, 0); - g_free(ib); - return NULL; - } - - if (flags & O_CREAT) - dbflags |= DB_CREATE; - if (flags & O_EXCL) - dbflags |= DB_EXCL; - if (flags & O_RDONLY) - dbflags |= DB_RDONLY; - - /* 1MB cache size? */ - ib->words->set_cachesize(ib->words, 0, 1000000, 0); - - err = ib->words->open(ib->words, file, "words", DB_BTREE, dbflags, mode); - if (err != 0) { - printf("open: Error occured?: %s\n", db_strerror(err)); - ib->words->close(ib->words, 0); - g_free(ib); - return NULL; - } - - /* FIXME: check returns */ - err = db_create(&ib->names, NULL, 0); - err = ib->names->open(ib->names, file, "names", DB_HASH, dbflags, mode); - - return ib; -} - -int ibex_save (ibex *ib) -{ - printf("syncing database\n"); - ib->names->sync(ib->names, 0); - return ib->words->sync(ib->words, 0); -} - -int ibex_close (ibex *ib) -{ - int ret; - - printf("closing database\n"); - - ret = ib->names->close(ib->names, 0); - ret = ib->words->close(ib->words, 0); - g_free(ib); - return ret; -} - -void ibex_unindex (ibex *ib, char *name) -{ - printf("trying to unindex '%s'\n", name); - if (db_contains_word(ib->names, NULL, name)) { - /* FIXME: do within transaction? */ - db_delete_name(ib->words, name); - db_delete_word(ib->names, name); - } -} - -GPtrArray *ibex_find (ibex *ib, char *word) -{ - char *normal; - int len; - - len = strlen(word); - normal = alloca(len+1); - ibex_normalise_word(word, word+len, normal); - return db_find(ib->words, normal); -} - -gboolean ibex_find_name (ibex *ib, char *name, char *word) -{ - char *normal; - int len; - - len = strlen(word); - normal = alloca(len+1); - ibex_normalise_word(word, word+len, normal); - return db_contains_word(ib->words, name, normal); -} - -gboolean ibex_contains_name(ibex *ib, char *name) -{ - return db_contains_word(ib->names, NULL, name); -} diff --git a/libibex/ibex_internal.h b/libibex/ibex_internal.h deleted file mode 100644 index f2212799c6..0000000000 --- a/libibex/ibex_internal.h +++ /dev/null @@ -1,51 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -#include "config.h" - -#include <glib.h> - -#include "ibex.h" -#include "block.h" -#include "wordindex.h" - -struct ibex { - char *path; - struct _memcache *blocks; - struct _IBEXWord *words; - int predone; - - /* sigh i hate glib's mutex stuff too */ -#ifdef ENABLE_THREADS - GMutex *lock; -#endif - -}; - -#ifdef ENABLE_THREADS -/*#define IBEX_LOCK(ib) (printf(__FILE__ "%d: %s: locking ibex\n", __LINE__, __FUNCTION__), g_mutex_lock(ib->lock)) - #define IBEX_UNLOCK(ib) (printf(__FILE__ "%d: %s: unlocking ibex\n", __LINE__, __FUNCTION__), g_mutex_unlock(ib->lock))*/ -#define IBEX_LOCK(ib) (g_mutex_lock(ib->lock)) -#define IBEX_UNLOCK(ib) (g_mutex_unlock(ib->lock)) -#else -#define IBEX_LOCK(ib) -#define IBEX_UNLOCK(ib) -#endif - diff --git a/libibex/index.h b/libibex/index.h deleted file mode 100644 index 442cba8e25..0000000000 --- a/libibex/index.h +++ /dev/null @@ -1,103 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -#ifndef _INDEX_H -#define _INDEX_H - -/* an indexing 'class' maps a key to 1 piece of info */ - -/*#define INDEX_STAT*/ - -struct _IBEXCursor { - struct _IBEXCursorClass *klass; - struct _IBEXIndex *index; -}; - -struct _IBEXCursorClass { - void (*close)(struct _IBEXCursor *); - - guint32 (*next)(struct _IBEXCursor *); - char *(*next_key)(struct _IBEXCursor *, int *keylenptr); -}; - -struct _IBEXIndex { - struct _IBEXIndexClass *klass; - struct _memcache *blocks; - blockid_t root; /* root block of ondisk index data */ -#ifdef INDEX_STAT - int lookups; /* how many lookups */ - int lookup_total; /* how many blocks loaded for all lookups (hash chain depth) */ -#endif -}; - -struct _IBEXIndexClass { - - struct _IBEXIndex *(*create)(struct _memcache *bc, int size); - struct _IBEXIndex *(*open)(struct _memcache *bc, blockid_t root); - - int (*sync)(struct _IBEXIndex *); - int (*close)(struct _IBEXIndex *); - - /* lookup a key in the index, returns the keyid of this item, or 0 if not found */ - guint32 (*find)(struct _IBEXIndex *, const char *key, int keylen); - - /* remove a key from the index */ - void (*remove)(struct _IBEXIndex *, const char *key, int keylen); - - /* insert a new key into the index, the keyid is returned */ - guint32 (*insert)(struct _IBEXIndex *, const char *key, int keylen); - - /* get the key contents/key length from the keyid */ - char *(*get_key)(struct _IBEXIndex *, guint32 keyid, int *keylenptr); - - /* set the key contents based on the keyid */ - void (*set_data)(struct _IBEXIndex *, guint32 keyid, blockid_t datablock, blockid_t tail); - - /* get the key contents based on the keyid */ - blockid_t (*get_data)(struct _IBEXIndex *, guint32 keyid, blockid_t *tail); - - /* get a cursor for iterating over all contents */ - struct _IBEXCursor *(*get_cursor)(struct _IBEXIndex *); -}; - -/* a storage class, stores lists of lists of id's */ - -struct _IBEXStore { - struct _IBEXStoreClass *klass; - struct _memcache *blocks; -}; - -struct _IBEXStoreClass { - struct _IBEXStore *(*create)(struct _memcache *bc); - int (*sync)(struct _IBEXStore *store); - int (*close)(struct _IBEXStore *store); - - blockid_t (*add)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); - blockid_t (*add_list)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, GArray *data); - blockid_t (*remove)(struct _IBEXStore *store, blockid_t *head, blockid_t *tail, nameid_t data); - void (*free)(struct _IBEXStore *store, blockid_t head, blockid_t tail); - - gboolean (*find)(struct _IBEXStore *store, blockid_t head, blockid_t tail, nameid_t data); - GArray *(*get)(struct _IBEXStore *store, blockid_t head, blockid_t tail); -}; - -#endif diff --git a/libibex/lookup.c b/libibex/lookup.c deleted file mode 100644 index 2d01dbf850..0000000000 --- a/libibex/lookup.c +++ /dev/null @@ -1,83 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -/* lookup.c: a simple client, part 2 */ - -#include <errno.h> -#include <stdio.h> -#include <string.h> -#include <unistd.h> - -#include "ibex.h" - -extern int optind; -extern char *optarg; - -static void -usage (void) -{ - fprintf (stderr, "Usage: lookup [-f indexfile] word ...\n"); - exit (1); -} - -int -main (int argc, char **argv) -{ - ibex *ib; - GPtrArray *ans, *words; - int opt, i; - char *file = "INDEX"; - - while ((opt = getopt (argc, argv, "f:")) != -1) { - switch (opt) { - case 'f': - file = optarg; - break; - - default: - usage (); - break; - } - } - argc -= optind; - argv += optind; - - if (argc == 0) - usage (); - - ib = ibex_open (file, O_RDWR|O_CREAT, 0600); - if (!ib) { - printf ("Couldn't open %s: %s\n", file, strerror (errno)); - exit (1); - } - - words = g_ptr_array_new (); - while (argc--) - g_ptr_array_add (words, argv[argc]); - - ans = ibex_find_all (ib, words); - if (ans) { - for (i = 0; i < ans->len; i++) - printf ("%s\n", (char *)g_ptr_array_index (ans, i)); - exit (0); - } else { - printf ("Nope.\n"); - exit (1); - } -} diff --git a/libibex/mkindex.c b/libibex/mkindex.c deleted file mode 100644 index 151dcecb2d..0000000000 --- a/libibex/mkindex.c +++ /dev/null @@ -1,84 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ -/* - * Copyright (C) 2000 Helix Code Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -/* mkindex.c: a simple client, part 1 */ - -#include <errno.h> -#include <stdio.h> -#include <string.h> -#include <unistd.h> - -#include "ibex.h" - -extern int optind; -extern char *optarg; - -static void -usage (void) -{ - fprintf (stderr, "Usage: mkindex [-f indexfile] file ...\n"); - exit (1); -} - -int -main (int argc, char **argv) -{ - ibex *ib; - int opt; - char *file = "INDEX"; - - while ((opt = getopt (argc, argv, "f:")) != -1) { - switch (opt) { - case 'f': - file = optarg; - break; - - default: - usage (); - break; - } - } - argc -= optind; - argv += optind; - - if (argc == 0) - usage (); - - ib = ibex_open (file, O_CREAT|O_RDWR, 0600); - if (!ib) { - fprintf (stderr, "Couldn't open index file %s: %s\n", - file, strerror (errno)); - exit (1); - } - - while (argc--) { - if (ibex_index_file (ib, argv[argc]) == -1) { - fprintf (stderr, "Couldn't index %s: %s\n", - argv[argc], strerror (errno)); - exit (1); - } - } - - if (ibex_close (ib) != 0) { - fprintf (stderr, "Failed to write index file %s: %s\n", - file, strerror (errno)); - exit (1); - } - exit (0); -} diff --git a/libibex/testindex.c b/libibex/testindex.c deleted file mode 100644 index a3b6a9ce03..0000000000 --- a/libibex/testindex.c +++ /dev/null @@ -1,200 +0,0 @@ -/* Test code for libibex */ - -#include <stdio.h> -#include <glib.h> -#include <errno.h> -#include <string.h> -#include "ibex_internal.h" - -#ifdef ENABLE_THREADS -#include <pthread.h> -#endif - -void word_index_mem_dump_info(struct _IBEXWord *idx); - -/* - The following is a routine to generate a Gaussian distribution - of pseudo random numbers, to make the results a little more - meaningful -*/ - -/* boxmuller.c Implements the Polar form of the Box-Muller - Transformation - - (c) Copyright 1994, Everett F. Carter Jr. - Permission is granted by the author to use - this software for any application provided this - copyright notice is preserved. - -*/ - -#include <stdlib.h> -#include <math.h> - -#define ranf() ((float)rand()/(float)RAND_MAX) - -static float box_muller(float m, float s) /* normal random variate generator */ -{ /* mean m, standard deviation s */ - float x1, x2, w, y1; - static float y2; - static int use_last = 0; - - if (use_last) /* use value from previous call */ - { - y1 = y2; - use_last = 0; - } - else - { - do { - x1 = 2.0 * ranf() - 1.0; - x2 = 2.0 * ranf() - 1.0; - w = x1 * x1 + x2 * x2; - } while ( w >= 1.0 ); - - w = sqrt( (-2.0 * log( w ) ) / w ); - y1 = x1 * w; - y2 = x2 * w; - use_last = 1; - } - - return( m + y1 * s ); -} - -/* gets a word from words, using m and s as distribution values */ -static char *getword(GPtrArray *words, float m, float s) -{ - int index; - - do { - index = (int)box_muller(m, s); - } while (index<0 || index>=words->len); - - return words->pdata[index]; -} - -#ifdef ENABLE_THREADS -int do_read_words; - -static void * -read_words(void *in) -{ - ibex *ib = in; - GPtrArray *a; - int lastlen = 0; - int i; - - while (do_read_words) { - a = ibex_find(ib, "joneses"); - if (a->len != lastlen) { - printf("Found %d joneses!\n", a->len); - lastlen = a->len; - } - for (i=0;i<a->len;i++) - g_free(a->pdata[i]); - g_ptr_array_free(a, TRUE); - } -} -#endif - -int main(int argc, char **argv) -{ - int i, j; - GPtrArray *words = g_ptr_array_new(); - char line[256]; - int len; - FILE *file; - float m, s; - ibex *ib; - GString *buffer = g_string_new(""); - int files; - char *dict; - -#ifdef ENABLE_THREADS - pthread_t id; - - g_thread_init(0); -#endif - - srand(0xABADF00D); - - files = 8000; - dict = "/usr/dict/words"; - - /* read words into an array */ - file = fopen(dict, "r"); - if (file == NULL) { - fprintf(stderr, "Cannot open word file: %s: %s\n", dict, strerror(errno)); - return 1; - } - while (fgets(line, sizeof(line), file) != NULL) { - len = strlen(line); - if (len>0 && line[len-1]=='\n') { - line[len-1]=0; - } - g_ptr_array_add(words, g_strdup(line)); - } - fclose(file); - - fprintf(stderr, "Read %d words\n", words->len); - - /* *shrug* arbitrary values really */ - m = words->len/2; - /* well, the average vocabulary of a mailbox is about 10K words */ - s = 1000.0; - - printf("mean is %f, s is %f\n", m, s); - - /* open ibex file */ - ib = ibex_open("test.ibex", O_RDWR|O_CREAT, 0600); - if (ib == NULL) { - perror("Creating ibex file\n"); - return 1; - } - -#ifdef ENABLE_THREADS - do_read_words = 1; - pthread_create(&id, 0, read_words, ib); -#endif - printf("Adding %d files\n", files); - - /* simulate adding new words to a bunch of files */ - for (j=0;j<files;j++) { - /* always new name */ - char *name = words->pdata[j % words->len]; - /* something like 60 words in a typical message, say */ - int count = (int)box_muller(60.0, 20.0); - - if (j%1000 == 0) - word_index_mem_dump_info(ib->words); - - /* cache the name info */ - ibex_contains_name(ib, name); - - /*printf("Adding %d words to '%s'\n", count, name);*/ - - g_string_truncate(buffer, 0); - - /* build up the word buffer */ - for (i=0;i<count;i++) { - if (i>0) - g_string_append_c(buffer, ' '); - g_string_append(buffer, getword(words, m, s)); - } - - /* and index it */ - ibex_index_buffer(ib, name, buffer->str, buffer->len, NULL); - } - - word_index_mem_dump_info(ib->words); - -#ifdef ENABLE_THREADS - do_read_words = 0; - pthread_join(id, 0); -#endif - - ibex_close(ib); - - return 0; -} - diff --git a/libibex/wordindex.c b/libibex/wordindex.c deleted file mode 100644 index a8592cbcef..0000000000 --- a/libibex/wordindex.c +++ /dev/null @@ -1,646 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* code to manage a word index */ -/* includes a cache for word index writes, - but not for name index writes (currently), or any reads. - -Note the word cache is only needed during indexing of lots -of words, and could then be discarded (:flush()). - -*/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> - -#include <glib.h> - -#include "block.h" -#include "index.h" -#include "wordindex.h" - -#define d(x) - -/*#define WORDCACHE_SIZE (256)*/ -#define WORDCACHE_SIZE (4096) - -extern struct _IBEXStoreClass ibex_diskarray_class; -extern struct _IBEXIndexClass ibex_hash_class; - -/* need 2 types of hash key? - one that just stores the wordid / wordblock - and one that stores the filecount/files? -*/ - - -#define CACHE_FILE_COUNT (62) - -struct _wordcache { - struct _wordcache *next; - struct _wordcache *prev; - nameid_t wordid; /* disk wordid */ - blockid_t wordblock; /* head of disk list */ - blockid_t wordtail; /* and the tail data */ - short filecount; /* how many valid items in files[] */ - short filealloc; /* how much allocated space in files[] */ - union { - nameid_t *files; /* memory cache of files */ - nameid_t file0; /* if filecount == 1 && filealloc == 0, store directly */ - } file; - char word[1]; /* actual word follows */ -}; - -static void unindex_name(struct _IBEXWord *, const char *name); /* unindex all entries for name */ -static gboolean contains_name(struct _IBEXWord *, const char *name); /* index contains data for name */ -static GPtrArray *find(struct _IBEXWord *, const char *word); /* returns all matches for word */ -static gboolean find_name(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */ -static void add(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name (slow) */ -static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */ -static int word_sync(struct _IBEXWord *idx); -static int word_flush(struct _IBEXWord *idx); -static int word_close(struct _IBEXWord *idx); -static void word_index_pre(struct _IBEXWord *idx); -static void word_index_post(struct _IBEXWord *idx); - -struct _IBEXWordClass ibex_word_index_class = { - word_sync, word_flush, word_close, - word_index_pre, word_index_post, - unindex_name, contains_name, - find, find_name, - add, add_list -}; - -/* this interface isn't the best, but it'll do for now */ -struct _IBEXWord * -ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot) -{ - struct _IBEXWord *idx; - - idx = g_malloc0(sizeof(*idx)); - idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal); - ibex_list_new(&idx->wordnodes); - idx->wordcount = 0; - idx->klass = &ibex_word_index_class; - - /* we use the same block array storage for both indexes at the moment */ - idx->wordstore = ibex_diskarray_class.create(bc); - idx->namestore = idx->wordstore; - - /* but not the same indexes! */ - if (*wordroot) { - printf("opening wordindex root = %d\n", *wordroot); - idx->wordindex = ibex_hash_class.open(bc, *wordroot); - } else { - idx->wordindex = ibex_hash_class.create(bc, 2048); - *wordroot = idx->wordindex->root; - printf("creating wordindex root = %d\n", *wordroot); - } - if (*nameroot) { - printf("opening nameindex root = %d\n", *nameroot); - idx->nameindex = ibex_hash_class.open(bc, *nameroot); - } else { - idx->nameindex = ibex_hash_class.create(bc, 2048); - *nameroot = idx->nameindex->root; - printf("creating nameindex root = %d\n", *nameroot); - } - return idx; -} - -#if d(!)0 -static void -cache_sanity(struct _wordcache *head) -{ - while (head->next) { - g_assert(head->filecount <= head->filealloc); - g_assert(strlen(head->word) != 0); - head = head->next; - } -} -#endif - -static void word_index_pre(struct _IBEXWord *idx) -{ -} - -static void word_index_post(struct _IBEXWord *idx) -{ -} - -/* unindex all entries for name */ -static void unindex_name(struct _IBEXWord *idx, const char *name) -{ - GArray *words; - int i; - nameid_t nameid, wordid; - blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail; - char *word; - struct _wordcache *cache; - - d(printf("unindexing %s\n", name)); - - /* lookup the hash key */ - nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); - /* get the block for this key */ - nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); - /* and the data for this block */ - words = idx->namestore->klass->get(idx->namestore, nameblock, nametail); - /* walk it ... */ - for (i=0;i<words->len;i++) { - /* get the word */ - wordid = g_array_index(words, nameid_t, i); - d(printf(" word %d\n", wordid)); - /* get the data block */ - wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); - /* clear this name from it */ - newblock = wordblock; - newtail = wordtail; - idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid); - if (newblock != wordblock || newtail != wordtail) - idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail); - - /* now check the cache as well */ - word = idx->nameindex->klass->get_key(idx->wordindex, wordid, NULL); - if (word) { - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache) { - /* its there, update our head/tail pointers */ - cache->wordblock = newblock; - cache->wordtail = newtail; - - /* now check that we have a data entry in it */ - if (cache->filealloc == 0 && cache->filecount == 1) { - if (cache->file.file0 == nameid) { - cache->filecount = 0; - } - } else { - int j; - - for (j=0;j<cache->filecount;j++) { - if (cache->file.files[j] == nameid) { - cache->file.files[j] = cache->file.files[cache->filecount-1]; - cache->filecount--; - break; - } - } - } - } - g_free(word); - } - } - g_array_free(words, TRUE); - - /* and remove name data and itself */ - idx->namestore->klass->free(idx->namestore, nameblock, nametail); - idx->nameindex->klass->remove(idx->nameindex, name, strlen(name)); -} - -/* index contains (any) data for name */ -static gboolean contains_name(struct _IBEXWord *idx, const char *name) -{ - return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0; -} - -/* returns all matches for word */ -static GPtrArray *find(struct _IBEXWord *idx, const char *word) -{ - nameid_t wordid, nameid; - GPtrArray *res; - GArray *names; - int i; - char *new; - struct _wordcache *cache; - blockid_t wordblock, wordtail; - - res = g_ptr_array_new(); - - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache) { - /* freshen cache entry if we touch it */ - ibex_list_remove((struct _listnode *)cache); - ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); - wordid = cache->wordid; - wordblock = cache->wordblock; - wordtail = cache->wordtail; - } else { - /* lookup the hash key */ - wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); - /* get the block for this key */ - wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); - } - /* and the data for this block */ - names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail); - /* .. including any memory-only data */ - if (cache) { - if (cache->filealloc == 0 && cache->filecount == 1) - g_array_append_val(names, cache->file.file0); - else - g_array_append_vals(names, cache->file.files, cache->filecount); - } - - /* walk it ... converting id's back to strings */ - g_ptr_array_set_size(res, names->len); - for (i=0;i<names->len;i++) { - nameid = g_array_index(names, nameid_t, i); - new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL); - res->pdata[i] = new; - } - g_array_free(names, TRUE); - return res; -} - -/* find if name contains word */ -static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word) -{ - nameid_t wordid, nameid; - blockid_t nameblock, nametail; - struct _wordcache *cache; - int i; - - /* lookup the hash key for the name */ - nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); - /* get the block for this name */ - nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); - - /* check if there is an in-memory cache for this word, check its file there first */ - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache) { - /* freshen cache entry if we touch it */ - ibex_list_remove((struct _listnode *)cache); - ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); - if (cache->filecount == 1 && cache->filealloc == 0) { - if (cache->file.file0 == nameid) - return TRUE; - } else { - for (i=0;i<cache->filecount;i++) { - if (cache->file.files[i] == nameid) - return TRUE; - } - } - /* not there? well we can use the wordid anyway */ - wordid = cache->wordid; - } else { - /* lookup the hash key for word */ - wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); - } - - /* see if wordid is in nameblock */ - return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid); -} - -/* cache helper functions */ -/* flush a cache entry to disk, and empty it out */ -static void -sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache) -{ - GArray array; /* just use this as a header */ - blockid_t oldblock, oldtail; - - d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount)); - if (cache->filecount == 1 && cache->filealloc == 0) - array.data = (char *)&cache->file.file0; - else - array.data = (char *)cache->file.files; - array.len = cache->filecount; - oldblock = cache->wordblock; - oldtail = cache->wordtail; - idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array); - if (oldblock != cache->wordblock || oldtail != cache->wordtail) { - idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail); - } - cache->filecount = 0; -} - -/* create a new key in an index, returning its id and head block */ -static void -add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail) -{ - /* initialise cache entry - id of word entry and head block */ - *wordid = wordindex->klass->find(wordindex, word, strlen(word)); - if (*wordid == 0) { - *wordid = wordindex->klass->insert(wordindex, word, strlen(word)); - *wordblock = 0; - *wordtail = 0; - } else { - *wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail); - } -} - -/* create a new key in a cached index (only word cache so far), flushing old keys - if too much space is being used */ -static struct _wordcache * -add_index_cache(struct _IBEXWord *idx, const char *word) -{ - struct _wordcache *cache; - - d(printf("adding %s to cache\n", word)); - - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache == 0) { - /* see if we have to flush off the last entry */ - if (idx->wordcount >= WORDCACHE_SIZE) { - struct _wordcache *mincache; - int min, count=0; - /* remove last entry, and flush it */ - cache = (struct _wordcache *)idx->wordnodes.tailpred; - mincache = cache; - min = mincache->filecount; - - d(printf("flushing word from cache %s\n", cache->word)); - /* instead of just using the last entry, we try and find an entry with - with only 1 item (failing that, the smallest in the range we look at) */ - /* this could probably benefit greatly from a more sophisticated aging algorithm */ - while (cache->next && count < 100) { - if (cache->filecount == 1) { - mincache = cache; - break; - } - if (cache->filecount > 0 && cache->filecount < min) { - mincache = cache; - min = cache->filecount; - } - cache = cache->next; - count++; - } - ibex_list_remove((struct _listnode *)mincache); - g_hash_table_remove(idx->wordcache, mincache->word); - sync_cache_entry(idx, mincache); - if (mincache->filealloc) - g_free(mincache->file.files); - g_free(mincache); - idx->wordcount--; - } - cache = g_malloc0(sizeof(*cache)+strlen(word)); - /* initialise cache entry - id of word entry and head block */ - add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail); - /* other fields */ - strcpy(cache->word, word); - cache->filecount = 0; - g_hash_table_insert(idx->wordcache, cache->word, cache); - ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); - idx->wordcount++; - } else { - /* move cache bucket ot the head of the cache list */ - ibex_list_remove((struct _listnode *)cache); - ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); - } - return cache; -} - -/* adds a single word to name (slow) */ -static void add(struct _IBEXWord *idx, const char *name, const char *word) -{ - nameid_t nameid; - blockid_t nameblock, newblock, nametail, newtail; - struct _wordcache *cache; - - g_error("Dont use wordindex::add()"); - abort(); - - cache = add_index_cache(idx, word); - - /* get the nameid and block start for this name */ - add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); - - /* check for repeats of the last name - dont add anything */ - if (cache->filecount == 1 && cache->filealloc == 0) { - if (cache->file.file0 == nameid) - return; - } else { - if (cache->file.files[cache->filecount] == nameid) - return; - } - - /* see if we are setting the first, drop it in the union */ - if (cache->filecount == 0 && cache->filealloc == 0) { - cache->file.file0 = nameid; - } else if (cache->filecount == 1 && cache->filealloc == 0) { - nameid_t saveid; - /* we need to allocate space for words */ - saveid = cache->file.file0; - cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); - /* this could possibly grow as needed, but i wont for now */ - cache->filealloc = CACHE_FILE_COUNT; - cache->file.files[0] = saveid; - cache->file.files[1] = nameid; - } else { - cache->file.files[cache->filecount] = nameid; - } - - cache->filecount++; - - /* if we are full, force a flush now */ - if (cache->filealloc && cache->filecount >= cache->filealloc) { - sync_cache_entry(idx, cache); - } - - newtail = nametail; - newblock = nameblock; - idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid); - if (newblock != nameblock || newtail != nametail) { - idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); - } -} - -/* adds a bunch of words to a given name */ -static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words) -{ - int i; - GArray *data = g_array_new(0, 0, sizeof(nameid_t)); - blockid_t nameblock, newblock, nametail, newtail; - nameid_t nameid; - struct _wordcache *cache; - - d(printf("Adding words to name %s\n", name)); - - d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); - - /* get the nameid and block start for this name */ - add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); - - d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); - - for (i=0;i<words->len;i++) { - char *word = words->pdata[i]; - - cache = add_index_cache(idx, word); - - /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/ - - /* check for duplicates; doesn't catch duplicates over an overflow boundary. Watch me care. */ - if (cache->filecount == 0 - /* the 1 item case */ - || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid) - /* the normal case */ - || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) { - - /* see if we are setting the first, drop it in the union */ - if (cache->filecount == 0 && cache->filealloc == 0) { - cache->file.file0 = nameid; - } else if (cache->filecount == 1 && cache->filealloc == 0) { - nameid_t saveid; - /* we need to allocate space for words */ - saveid = cache->file.file0; - cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); - /* this could possibly grow as needed, but i wont for now */ - cache->filealloc = CACHE_FILE_COUNT; - cache->file.files[0] = saveid; - cache->file.files[1] = nameid; - } else { - cache->file.files[cache->filecount] = nameid; - } - - cache->filecount++; - - /* if we are full, force a flush now */ - if (cache->filealloc && cache->filecount >= cache->filealloc) { - sync_cache_entry(idx, cache); - } - - /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/ - - /* and append this wordid for this name in memory */ - g_array_append_val(data, cache->wordid); - } - - /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/ - } - - d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); - - /* and append these word id's in one go */ - newblock = nameblock; - newtail = nametail; - idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data); - if (newblock != nameblock || newtail != nametail) { - idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); - } - - d(cache_sanity((struct _wordcache *)idx->wordnodes.head)); - - g_array_free(data, TRUE); -} - -/* sync any in-memory data to disk */ -static int -word_sync(struct _IBEXWord *idx) -{ - /* we just flush also, save memory */ - word_flush(idx); - -#if 0 - struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head; - - while (cache->next) { - sync_cache_entry(idx, cache); - cache = cache->next; - } - - /*ibex_hash_dump(idx->wordindex);*/ - /*ibex_hash_dump(idx->nameindex);*/ -#endif - return 0; -} - -/* sync and flush any in-memory data to disk and free it */ -static int -word_flush(struct _IBEXWord *idx) -{ - struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next; - extern int block_log; - int count= 0; - int used=0, wasted=0; - - block_log = 0; - - next = cache->next; - while (next) { - count++; - used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t)); - if (cache->filealloc) - wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t); - else - wasted += (1-cache->filecount) * sizeof(nameid_t); - - /*printf("syncing word %s\n", cache->word);*/ - sync_cache_entry(idx, cache); - g_hash_table_remove(idx->wordcache, cache->word); - if (cache->filealloc) - g_free(cache->file.files); - g_free(cache); - cache = next; - next = cache->next; - } - - printf("sync cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted); - - block_log = 0; - ibex_list_new(&idx->wordnodes); - idx->wordcount = 0; - return 0; -} - -static int word_close(struct _IBEXWord *idx) -{ - struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next; - extern int block_log; - int count= 0; - int used=0, wasted=0; - - block_log = 0; - - next = cache->next; - while (next) { - count++; - used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t)); - if (cache->filealloc) - wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t); - else - wasted += (1-cache->filecount) * sizeof(nameid_t); - - /*printf("closing word %s\n", cache->word);*/ - sync_cache_entry(idx, cache); - if (cache->filealloc) - g_free(cache->file.files); - g_free(cache); - cache = next; - next = cache->next; - } - block_log = 0; - - printf("cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted); - - idx->namestore->klass->close(idx->namestore); - idx->nameindex->klass->close(idx->nameindex); - /*same as namestore: - idx->wordstore->klass->close(idx->wordstore);*/ - idx->wordindex->klass->close(idx->wordindex); - g_hash_table_destroy(idx->wordcache); - g_free(idx); - - return 0; -} diff --git a/libibex/wordindex.h b/libibex/wordindex.h deleted file mode 100644 index 3a60accb70..0000000000 --- a/libibex/wordindex.h +++ /dev/null @@ -1,74 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -#ifndef _WORDINDEX_H -#define _WORDINDEX_H - -#include <glib.h> - -#include "block.h" -#include "index.h" - -struct _IBEXWord; - -/* not used yet */ -typedef void (*IBEXNormaliseFunc)(char *source, int len, char *dest); - -struct _IBEXWordClass { - int (*sync)(struct _IBEXWord *); - int (*flush)(struct _IBEXWord *); - int (*close)(struct _IBEXWord *); - - void (*index_pre)(struct _IBEXWord *); /* get ready for doing a lot of indexing. may be a nop */ - void (*index_post)(struct _IBEXWord *); - - void (*unindex_name)(struct _IBEXWord *, const char *name); /* unindex all entries for name */ - gboolean (*contains_name)(struct _IBEXWord *, const char *name); /* index contains data for name */ - GPtrArray *(*find)(struct _IBEXWord *, const char *word); /* returns all matches for word */ - gboolean (*find_name)(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */ - void (*add)(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name */ - void (*add_list)(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */ -}; - -struct _IBEXWord { - struct _IBEXWordClass *klass; - struct _IBEXStore *wordstore; - struct _IBEXIndex *wordindex; - struct _IBEXStore *namestore; - struct _IBEXIndex *nameindex; - - /* word caching info (should probably be modularised) */ - GHashTable *wordcache; /* word->struct _wordcache mapping */ - struct _list wordnodes; /* LRU list of wordcache structures */ - int wordcount; /* how much space used in cache */ - int precount; - GHashTable *namecache; /* a list of names (only), cached for quick reference */ - int nameinit; -}; - - -struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); - -/* alternate implemenation */ -struct _IBEXWord *ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); - -#endif /* !_WORDINDEX_H */ diff --git a/libibex/wordindexmem.c b/libibex/wordindexmem.c deleted file mode 100644 index 479e5b0343..0000000000 --- a/libibex/wordindexmem.c +++ /dev/null @@ -1,889 +0,0 @@ -/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- - * - * Copyright (C) 2000 Helix Code, Inc. - * - * Authors: Michael Zucchi <notzed@helixcode.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public License - * as published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with the Gnome Library; see the file COPYING.LIB. If not, - * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ - -/* this is the same as wordindex.c, but it doesn't have an LRU cache - for word names. it has a lookup tab le that is only loaded if - index-pre is called, otherwise it always hits disk */ - -/* code to manage a word index */ -/* includes a cache for word index writes, - but not for name index writes (currently), or any reads. - -Note the word cache is only needed during indexing of lots -of words, and could then be discarded (:flush()). - -*/ - -#include <stdio.h> - -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> - -#include <glib.h> - -#include "block.h" -#include "index.h" -#include "wordindex.h" - -/*#define MALLOC_CHECK*/ - -#ifdef MALLOC_CHECK -#include <mcheck.h> -#endif - -#define d(x) - -/*#define WORDCACHE_SIZE (256)*/ -#define WORDCACHE_SIZE (4096) - -extern struct _IBEXStoreClass ibex_diskarray_class; -extern struct _IBEXIndexClass ibex_hash_class; - -/* need 2 types of hash key? - one that just stores the wordid / wordblock - and one that stores the filecount/files? -*/ - - -#define CACHE_FILE_COUNT (62) - -struct _wordcache { - nameid_t wordid; /* disk wordid */ - blockid_t wordblock; /* head of disk list */ - blockid_t wordtail; /* and the tail data */ - short filecount; /* how many valid items in files[] */ - short filealloc; /* how much allocated space in files[] */ - union { - nameid_t *files; /* memory cache of files */ - nameid_t file0; /* if filecount == 1 && filealloc == 0, store directly */ - } file; - char word[1]; /* actual word follows */ -}; - -static void unindex_name(struct _IBEXWord *, const char *name); /* unindex all entries for name */ -static gboolean contains_name(struct _IBEXWord *, const char *name); /* index contains data for name */ -static GPtrArray *find(struct _IBEXWord *, const char *word); /* returns all matches for word */ -static gboolean find_name(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */ -static void add(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name (slow) */ -static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */ -static int word_sync(struct _IBEXWord *idx); -static int word_flush(struct _IBEXWord *idx); -static int word_close(struct _IBEXWord *idx); -static void word_index_pre(struct _IBEXWord *idx); -static void word_index_post(struct _IBEXWord *idx); - -static void sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache); - -struct _IBEXWordClass ibex_word_index_mem_class = { - word_sync, word_flush, word_close, - word_index_pre, word_index_post, - unindex_name, contains_name, - find, find_name, - add, add_list -}; - -#ifdef MALLOC_CHECK -static void -checkmem(void *p) -{ - if (p) { - int status = mprobe(p); - - switch (status) { - case MCHECK_HEAD: - printf("Memory underrun at %p\n", p); - abort(); - case MCHECK_TAIL: - printf("Memory overrun at %p\n", p); - abort(); - case MCHECK_FREE: - printf("Double free %p\n", p); - abort(); - } - } -} -#endif - -/* this interface isn't the best, but it'll do for now */ -struct _IBEXWord * -ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot) -{ - struct _IBEXWord *idx; - - idx = g_malloc0(sizeof(*idx)); - idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal); - ibex_list_new(&idx->wordnodes); - idx->wordcount = 0; - idx->precount = 0; - idx->namecache = g_hash_table_new(g_str_hash, g_str_equal); - idx->nameinit = 0; - idx->klass = &ibex_word_index_mem_class; - - /* we use the same block array storage for both indexes at the moment */ - idx->wordstore = ibex_diskarray_class.create(bc); - idx->namestore = idx->wordstore; - - /* but not the same indexes! */ - if (*wordroot) { - d(printf("opening wordindex root = %d\n", *wordroot)); - idx->wordindex = ibex_hash_class.open(bc, *wordroot); - } else { - idx->wordindex = ibex_hash_class.create(bc, 2048); - *wordroot = idx->wordindex->root; - d(printf("creating wordindex root = %d\n", *wordroot)); - } - if (*nameroot) { - d(printf("opening nameindex root = %d\n", *nameroot)); - idx->nameindex = ibex_hash_class.open(bc, *nameroot); - } else { - idx->nameindex = ibex_hash_class.create(bc, 2048); - *nameroot = idx->nameindex->root; - d(printf("creating nameindex root = %d\n", *nameroot)); - } - return idx; -} - -#if (d(!)0) || defined(MALLOC_CHECK) -static void -node_sanity(char *key, struct _wordcache *node, void *data) -{ - g_assert(node->filecount <= node->filealloc || (node->filecount == 1 && node->filealloc == 0)); - g_assert(strlen(node->word) != 0); -#ifdef MALLOC_CHECK - checkmem(node); - if (node->filealloc) - checkmem(node->file.files); -#endif -} - -static void -cache_sanity(struct _IBEXWord *idx) -{ -#ifdef MALLOC_CHECK - checkmem(idx); -#endif - g_hash_table_foreach(idx->wordcache, (GHFunc)node_sanity, idx); -} -#endif - -static void word_index_pre(struct _IBEXWord *idx) -{ - struct _IBEXCursor *idc; - struct _wordcache *cache; - nameid_t wordid; - char *key; - int len; - - idx->precount ++; - if (idx->precount > 1) - return; - - /* want to load all words into the cache lookup table */ - d(printf("pre-loading all word info into memory\n")); - idc = idx->wordindex->klass->get_cursor(idx->wordindex); - while ( (wordid = idc->klass->next(idc)) ) { - key = idc->index->klass->get_key(idc->index, wordid, &len); - /*d(printf("Adding word %s\n", key));*/ - cache = g_malloc0(sizeof(*cache) + strlen(key)); - strcpy(cache->word, key); - g_free(key); - cache->wordid = wordid; - cache->wordblock = idc->index->klass->get_data(idc->index, wordid, &cache->wordtail); - cache->filecount = 0; - cache->filealloc = 0; - g_hash_table_insert(idx->wordcache, cache->word, cache); - idx->wordcount++; - } - -#ifdef MALLOC_CHECK - cache_sanity(idx); -#endif - - idc->klass->close(idc); - - d(printf("done\n")); -} - -static gboolean -sync_free_value(void *key, void *value, void *data) -{ - struct _wordcache *cache = (struct _wordcache *)value; - struct _IBEXWord *idx = (struct _IBEXWord *)data; - - sync_cache_entry(idx, cache); - if (cache->filealloc) - g_free(cache->file.files); - g_free(cache); - - return TRUE; -} - -#if d(!)0 -static void -sync_value(void *key, void *value, void *data) -{ - struct _wordcache *cache = (struct _wordcache *)value; - struct _IBEXWord *idx = (struct _IBEXWord *)data; - - sync_cache_entry(idx, cache); -} -#endif - -static void word_index_post(struct _IBEXWord *idx) -{ - idx->precount--; - if (idx->precount > 0) - return; - idx->precount = 0; - -#ifdef MALLOC_CHECK - cache_sanity(idx); -#endif - - g_hash_table_foreach_remove(idx->wordcache, sync_free_value, idx); - idx->wordcount = 0; -} - -/* unindex all entries for name */ -static void unindex_name(struct _IBEXWord *idx, const char *name) -{ - GArray *words; - int i; - nameid_t nameid, wordid; - blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail; - char *word; - struct _wordcache *cache; - - d(printf("unindexing %s\n", name)); - - /* if we have a namecache, check that to see if we need to remove that item, or there is no work here */ - if (idx->nameinit) { - char *oldkey; - gboolean oldval; - - if (g_hash_table_lookup_extended(idx->namecache, name, (void *)&oldkey, (void *)&oldval)) { - g_hash_table_remove(idx->namecache, oldkey); - g_free(oldkey); - } else { - return; - } - } - - /* lookup the hash key */ - nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); - /* get the block for this key */ - nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); - /* and the data for this block */ - words = idx->namestore->klass->get(idx->namestore, nameblock, nametail); - /* walk it ... */ - for (i=0;i<words->len;i++) { - /* get the word */ - wordid = g_array_index(words, nameid_t, i); - d(printf(" word %d\n", wordid)); - /* get the data block */ - wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); - /* clear this name from it */ - newblock = wordblock; - newtail = wordtail; - idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid); - if (newblock != wordblock || newtail != wordtail) - idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail); - - /* now check the cache as well */ - word = idx->nameindex->klass->get_key(idx->wordindex, wordid, NULL); - if (word) { - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache) { - /* its there, update our head/tail pointers */ - cache->wordblock = newblock; - cache->wordtail = newtail; - - /* now check that we have a data entry in it */ - if (cache->filealloc == 0 && cache->filecount == 1) { - if (cache->file.file0 == nameid) { - cache->filecount = 0; - } - } else { - int j; - - for (j=0;j<cache->filecount;j++) { - if (cache->file.files[j] == nameid) { - cache->file.files[j] = cache->file.files[cache->filecount-1]; - cache->filecount--; - break; - } - } - } - } - g_free(word); - } - } - g_array_free(words, TRUE); - - /* and remove name data and itself */ - idx->namestore->klass->free(idx->namestore, nameblock, nametail); - idx->nameindex->klass->remove(idx->nameindex, name, strlen(name)); -} - -/* index contains (any) data for name */ -static gboolean contains_name(struct _IBEXWord *idx, const char *name) -{ - struct _IBEXCursor *idc; - nameid_t wordid; - char *key; - int len; - - /* load all the names into memory, since we're *usually* about to do a lot of these */ - - /* Note that because of the (poor) hash algorithm, this is >> faster than - looking up every key in turn. Basically because all keys are stored packed - in the same list, not in buckets of keys for the same hash (among other reasons) */ - - if (!idx->nameinit) { - d(printf("pre-loading all name info into memory\n")); - idc = idx->nameindex->klass->get_cursor(idx->nameindex); - while ( (wordid = idc->klass->next(idc)) ) { - key = idc->index->klass->get_key(idc->index, wordid, &len); - g_hash_table_insert(idx->namecache, key, (void *)TRUE); - } - idc->klass->close(idc); - idx->nameinit = TRUE; - } - - return (gboolean)g_hash_table_lookup(idx->namecache, name); - /*return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0;*/ -} - -/* returns all matches for word */ -static GPtrArray *find(struct _IBEXWord *idx, const char *word) -{ - nameid_t wordid, nameid; - GPtrArray *res; - GArray *names; - int i; - char *new; - struct _wordcache *cache; - blockid_t wordblock, wordtail; - - res = g_ptr_array_new(); - - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache) { -#if 0 - /* freshen cache entry if we touch it */ - ibex_list_remove((struct _listnode *)cache); - ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); -#endif - wordid = cache->wordid; - wordblock = cache->wordblock; - wordtail = cache->wordtail; - } else { - /* lookup the hash key */ - wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); - /* get the block for this key */ - wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); - } - /* and the data for this block */ - names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail); - /* .. including any memory-only data */ - if (cache) { - if (cache->filealloc == 0 && cache->filecount == 1) - g_array_append_val(names, cache->file.file0); - else - g_array_append_vals(names, cache->file.files, cache->filecount); - } - - /* walk it ... converting id's back to strings */ - g_ptr_array_set_size(res, names->len); - for (i=0;i<names->len;i++) { - nameid = g_array_index(names, nameid_t, i); - new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL); - res->pdata[i] = new; - } - g_array_free(names, TRUE); - return res; -} - -/* find if name contains word */ -static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word) -{ - nameid_t wordid, nameid; - blockid_t nameblock, nametail; - struct _wordcache *cache; - int i; - - /* if we have the namelist in memory, quick-check that */ - if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL) - return FALSE; - - /* lookup the hash key for the name */ - nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); - /* get the block for this name */ - nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); - - /* check if there is an in-memory cache for this word, check its file there first */ - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache) { -#if 0 - /* freshen cache entry if we touch it */ - ibex_list_remove((struct _listnode *)cache); - ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); -#endif - if (cache->filecount == 1 && cache->filealloc == 0) { - if (cache->file.file0 == nameid) - return TRUE; - } else { - for (i=0;i<cache->filecount;i++) { - if (cache->file.files[i] == nameid) - return TRUE; - } - } - /* not there? well we can use the wordid anyway */ - wordid = cache->wordid; - } else { - /* lookup the hash key for word */ - wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); - } - - /* see if wordid is in nameblock */ - return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid); -} - -/* cache helper functions */ -/* flush a cache entry to disk, and empty it out */ -static void -sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache) -{ - GArray array; /* just use this as a header */ - blockid_t oldblock, oldtail; - - if (cache->filecount == 0) - return; - - d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount)); - if (cache->filecount == 1 && cache->filealloc == 0) - array.data = (char *)&cache->file.file0; - else - array.data = (char *)cache->file.files; - array.len = cache->filecount; - oldblock = cache->wordblock; - oldtail = cache->wordtail; - idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array); - if (oldblock != cache->wordblock || oldtail != cache->wordtail) { - idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail); - } - cache->filecount = 0; -} - -/* create a new key in an index, returning its id and head block */ -static void -add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail) -{ - /* initialise cache entry - id of word entry and head block */ - *wordid = wordindex->klass->find(wordindex, word, strlen(word)); - - if (*wordid == 0) { - *wordid = wordindex->klass->insert(wordindex, word, strlen(word)); - *wordblock = 0; - *wordtail = 0; - } else { - *wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail); - } -} - -/* create a new key in a cached index (only word cache so far), flushing old keys - if too much space is being used */ -static struct _wordcache * -add_index_cache(struct _IBEXWord *idx, const char *word) -{ - struct _wordcache *cache; - - cache = g_hash_table_lookup(idx->wordcache, word); - if (cache == 0) { - /*d(printf("adding %s to cache\n", word));*/ - -#if 0 - /* see if we have to flush off the last entry */ - if (idx->wordcount >= WORDCACHE_SIZE) { - struct _wordcache *mincache; - int min, count=0; - /* remove last entry, and flush it */ - cache = (struct _wordcache *)idx->wordnodes.tailpred; - mincache = cache; - min = mincache->filecount; - - d(printf("flushing word from cache %s\n", cache->word)); - /* instead of just using the last entry, we try and find an entry with - with only 1 item (failing that, the smallest in the range we look at) */ - /* this could probably benefit greatly from a more sophisticated aging algorithm */ - while (cache->next && count < 100) { - if (cache->filecount == 1) { - mincache = cache; - break; - } - if (cache->filecount > 0 && cache->filecount < min) { - mincache = cache; - min = cache->filecount; - } - cache = cache->next; - count++; - } - ibex_list_remove((struct _listnode *)mincache); - g_hash_table_remove(idx->wordcache, mincache->word); - sync_cache_entry(idx, mincache); - if (mincache->filealloc) - g_free(mincache->file.files); - g_free(mincache); - idx->wordcount--; - } -#endif - cache = g_malloc0(sizeof(*cache)+strlen(word)); - /* if we're in an index state, we can assume we dont have it if we dont have it in memory */ - if (idx->precount == 0) { - /* initialise cache entry - id of word entry and head block */ - add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail); - } else { - cache->wordid = idx->wordindex->klass->insert(idx->wordindex, word, strlen(word)); - } - /* other fields */ - strcpy(cache->word, word); - cache->filecount = 0; - g_hash_table_insert(idx->wordcache, cache->word, cache); -#if 0 - ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); -#endif - idx->wordcount++; - } else { - /*d(printf("already have %s in cache\n", word));*/ -#if 0 - /* move cache bucket ot the head of the cache list */ - ibex_list_remove((struct _listnode *)cache); - ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); -#endif - } - return cache; -} - -/* adds a single word to name (slow) */ -static void add(struct _IBEXWord *idx, const char *name, const char *word) -{ - nameid_t nameid; - blockid_t nameblock, newblock, nametail, newtail; - struct _wordcache *cache; - - g_error("Dont use wordindex::add()"); - abort(); - - cache = add_index_cache(idx, word); - - /* get the nameid and block start for this name */ - add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); - - /* check for repeats of the last name - dont add anything */ - if (cache->filecount == 1 && cache->filealloc == 0) { - if (cache->file.file0 == nameid) - return; - } else { - if (cache->file.files[cache->filecount] == nameid) - return; - } - - /* see if we are setting the first, drop it in the union */ - if (cache->filecount == 0 && cache->filealloc == 0) { - cache->file.file0 = nameid; - } else if (cache->filecount == 1 && cache->filealloc == 0) { - nameid_t saveid; - /* we need to allocate space for words */ - saveid = cache->file.file0; - cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); - /* this could possibly grow as needed, but i wont for now */ - cache->filealloc = CACHE_FILE_COUNT; - cache->file.files[0] = saveid; - cache->file.files[1] = nameid; - } else { - cache->file.files[cache->filecount] = nameid; - } - - cache->filecount++; - - /* if we are full, force a flush now */ - if (cache->filealloc && cache->filecount >= cache->filealloc) { - sync_cache_entry(idx, cache); - } - - newtail = nametail; - newblock = nameblock; - idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid); - if (newblock != nameblock || newtail != nametail) { - idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); - } -} - -/* adds a bunch of words to a given name */ -static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words) -{ - int i; - GArray *data = g_array_new(0, 0, sizeof(nameid_t)); - blockid_t nameblock, newblock, nametail, newtail; - nameid_t nameid; - struct _wordcache *cache; - - d(printf("Adding words to name %s\n", name)); - - d(cache_sanity(idx)); - - /* make sure we keep the namecache in sync, if it is active */ - if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL) { - g_hash_table_insert(idx->namecache, g_strdup(name), (void *)TRUE); - /* we know we dont have it in the disk hash either, so we insert anew (saves a lookup) */ - nameid = idx->nameindex->klass->insert(idx->nameindex, name, strlen(name)); - nameblock = 0; - nametail = 0; - } else { - /* get the nameid and block start for this name */ - add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); - } - - d(cache_sanity(idx)); - - for (i=0;i<words->len;i++) { - char *word = words->pdata[i]; - - cache = add_index_cache(idx, word); - - /*d(cache_sanity(idx));*/ - - /* check for duplicates; doesn't catch duplicates over an overflow boundary. Watch me care. */ - if (cache->filecount == 0 - /* the 1 item case */ - || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid) - /* the normal case */ - || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) { - - /* see if we are setting the first, drop it in the union */ - if (cache->filecount == 0 && cache->filealloc == 0) { - cache->file.file0 = nameid; - } else if (cache->filecount == 1 && cache->filealloc == 0) { - nameid_t saveid; - /* we need to allocate space for words */ - saveid = cache->file.file0; - cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); - /* this could possibly grow as needed, but i wont for now */ - cache->filealloc = CACHE_FILE_COUNT; - cache->file.files[0] = saveid; - cache->file.files[1] = nameid; - } else { - cache->file.files[cache->filecount] = nameid; - } - - cache->filecount++; - - /* if we are full, force a flush now */ - if (cache->filealloc && cache->filecount >= cache->filealloc) { - sync_cache_entry(idx, cache); - } - - /*d(cache_sanity(idx));*/ - - /* and append this wordid for this name in memory */ - g_array_append_val(data, cache->wordid); - } - - /*d(cache_sanity(idx));*/ - } - - d(cache_sanity(idx)); - - /* and append these word id's in one go */ - newblock = nameblock; - newtail = nametail; - idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data); - if (newblock != nameblock || newtail != nametail) { - idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); - } - - d(cache_sanity(idx)); - - g_array_free(data, TRUE); -} - -/* sync any in-memory data to disk */ -static int -word_sync(struct _IBEXWord *idx) -{ - /* we just flush also, save memory */ - word_flush(idx); - -#if 0 - struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head; - - while (cache->next) { - sync_cache_entry(idx, cache); - cache = cache->next; - } - - /*ibex_hash_dump(idx->wordindex);*/ - /*ibex_hash_dump(idx->nameindex);*/ -#endif - return 0; -} - -static gboolean -free_key(void *key, void *value, void *data) -{ - g_free(key); - - return TRUE; -} - -/* sync and flush any in-memory data to disk and free it */ -static int -word_flush(struct _IBEXWord *idx) -{ - d(cache_sanity(idx)); - - g_hash_table_foreach_remove(idx->wordcache, sync_free_value, idx); - idx->wordcount = 0; - if (idx->nameinit) { - g_hash_table_foreach_remove(idx->namecache, free_key, NULL); - idx->nameinit = FALSE; - } - return 0; -} - -static int word_close(struct _IBEXWord *idx) -{ - idx->klass->flush(idx); - - idx->namestore->klass->close(idx->namestore); - idx->nameindex->klass->close(idx->nameindex); - /*same as namestore: - idx->wordstore->klass->close(idx->wordstore);*/ - idx->wordindex->klass->close(idx->wordindex); - g_hash_table_destroy(idx->wordcache); - g_hash_table_destroy(idx->namecache); - g_free(idx); - - return 0; -} - -/* debugging/tuning function */ - -struct _stats { - int memcache; /* total memory used by cache entries */ - int memfile; /* total mem ysed by file data */ - int memfileused; /* actual memory used by file data */ - int memword; /* total mem used by words */ - int file1; /* total file entries with only 1 entry */ - int total; -}; - -static void -get_info(void *key, void *value, void *data) -{ - struct _wordcache *cache = (struct _wordcache *)value; - struct _stats *stats = (struct _stats *)data; - - /* round up to probable alignment, + malloc overheads */ - stats->memcache += ((sizeof(struct _wordcache) + strlen(cache->word) + 4 + 3) & ~3); - if (cache->filealloc > 0) { - /* size of file array data */ - stats->memcache += sizeof(nameid_t) * cache->filealloc + 4; - /* actual used memory */ - stats->memfile += sizeof(nameid_t) * cache->filealloc; - stats->memfileused += sizeof(nameid_t) * cache->filecount; - } - if (cache->filecount == 1 && cache->filealloc == 0) - stats->file1++; - - stats->memword += strlen(cache->word); - stats->total++; -} - -static char * -num(int num) -{ - int n; - static char buf[256]; - char *p = buf; - char type = 0; - - n = num; - if (n>1000000) { - p+= sprintf(p, "%d ", n/1000000); - n -= (n/1000000)*1000000; - type = 'M'; - } - if (n>1000) { - if (num>1000000) - p+= sprintf(p, "%03d ", n/1000); - else - p+= sprintf(p, "%d ", n/1000); - n -= (n/1000)*1000; - if (type == 0) - type = 'K'; - } - if (num > 1000) - p += sprintf(p, "%03d", n); - else - p += sprintf(p, "%d", n); - - n = num; - switch (type) { - case 'M': - p += sprintf(p, ", %d.%02dM", n/1024/1024, n*100/1024/1024); - break; - case 'K': - p += sprintf(p, ", %d.%02dK", n/1024, n*100/1024); - break; - case 0: - break; - } - - return buf; -} - -void word_index_mem_dump_info(struct _IBEXWord *idx); - -void word_index_mem_dump_info(struct _IBEXWord *idx) -{ - struct _stats stats = { 0 }; - int useful; - - g_hash_table_foreach(idx->wordcache, get_info, &stats); - - useful = stats.total * sizeof(struct _wordcache) + stats.memword + stats.memfile; - - printf("Word Index Stats:\n"); - printf("Total word count: %d\n", stats.total); - printf("Total memory used: %s\n", num(stats.memcache)); - printf("Total useful memory: %s\n", num(useful)); - printf("Total malloc/alignment overhead: %s\n", num(stats.memcache - useful)); - printf("Total buffer overhead: %s\n", num(stats.memfile - stats.memfileused)); - printf("Space taken by words: %s\n", num(stats.memword + stats.total)); - printf("Number of 1-word entries: %s\n", num(stats.file1)); - if (stats.memcache > 0) - printf("%% unused space: %d %%\n", (stats.memfile - stats.memfileused) * 100 / stats.memcache); -} - |