librsync  2.3.2
hashtable.h
Go to the documentation of this file.
1 /*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2  *
3  * hashtable.h -- a generic open addressing hashtable.
4  *
5  * Copyright (C) 2003 by Donovan Baarda <abo@minkirri.apana.org.au>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation; either version 2.1 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20 #ifndef _HASHTABLE_H_
21 # define _HASHTABLE_H_
22 
23 # include <assert.h>
24 # include <stdlib.h>
25 # include <stdbool.h>
26 
27 /** \file hashtable.h
28  * A generic open addressing hashtable.
29  *
30  * This is a minimal hashtable containing pointers to arbitrary entries with
31  * configurable hashtable size and support for custom hash() and cmp() methods.
32  * The cmp() method can either be a simple comparison between two keys, or can
33  * be against a special match object containing additional mutable state. This
34  * allows for things like deferred and cached evaluation of costly comparison
35  * data. The hash() function doesn't need to avoid clustering behaviour.
36  *
37  * It uses open addressing with quadratic probing for collisions. The
38  * MurmurHash3 finalization function is optionally used on the hash() output to
39  * avoid clustering and can be disabled by setting HASHTABLE_NMIX32. There is
40  * no support for removing entries, only adding them. Multiple entries with the
41  * same key can be added, and you can use a fancy cmp() function to find
42  * particular entries by more than just their key. There is an iterator for
43  * iterating through all entries in the hashtable. There are optional
44  * NAME_find() find/match/hashcmp/entrycmp stats counters that can be disabled
45  * by defining HASHTABLE_NSTATS. There is an optional simple k=1 bloom filter
46  * for speed that can be disabled by defining HASHTABLE_NBLOOM.
47  *
48  * The types and methods of the hashtable and its contents are specified by
49  * using \#define parameters set to their basenames (the prefixes for the *_t
50  * type and *_func() methods) before doing \#include "hashtable.h". This
51  * produces static inline type-safe methods that are either application
52  * optimized for speed or wrappers around void* implementation methods for
53  * compactness.
54  *
55  * \param ENTRY - the entry type basename.
56  *
57  * \param KEY - optional key type basename (default: ENTRY).
58  *
59  * \param MATCH - optional match type basename (default: KEY).
60  *
61  * \param NAME - optional hashtable type basename (default: ENTRY_hashtable).
62  *
63  * Example: \code
64  * typedef ... mykey_t;
65  * int mykey_hash(mykey_t const *e);
66  * int mykey_cmp(mykey_t *e, mykey_t const *o);
67  *
68  * typedef struct myentry {
69  * mykey_t key; // Inherit from mykey_t.
70  * ...extra entry value data...
71  * } myentry_t;
72  * void myentry_init(myentry_t *e, ...);
73  *
74  * #define ENTRY myentry
75  * #define KEY mykey
76  * #include "hashtable.h"
77  *
78  * hashtable_t *t;
79  * myentry_t entries[300];
80  * mykey_t k;
81  * myentry_t *e;
82  *
83  * t = myentry_hashtable_new(300);
84  * myentry_init(&entries[5], ...);
85  * myentry_hashtable_add(t, &entries[5]);
86  * k = ...;
87  * e = myentry_hashtable_find(t, &k);
88  *
89  * int i;
90  * for (e = myentry_hashtable_iter(t, &i); e != NULL;
91  * e = myentry_hashtable_next(t, &i))
92  * ...
93  *
94  * myentry_hashtable_free(t);
95  * \endcode
96  *
97  * The mykey_hash() and mykey_cmp() fuctions will typically take pointers to
98  * mykey/myentry instances the same as the pointers stored in the hashtable.
99  * However it is also possible for them to take "match objects" that are a
100  * "subclass" of the entry type that contain additional state for complicated
101  * comparision operations.
102  *
103  * Example: \code
104  * typedef struct mymatch {
105  * mykey_t key; // Inherit from mykey_t;
106  * ...extra match criteria and state data...
107  * } mymatch_t;
108  * int mymatch_cmp(mymatch_t *m, myentry_t const *e);
109  *
110  * #define ENTRY myentry
111  * #define KEY mykey
112  * #define MATCH mymatch
113  * #include "hashtable.h"
114  *
115  * ...
116  * mymatch_t m;
117  *
118  * t = myentry_hashtable_new(300);
119  * ...
120  * m = ...;
121  * e = myentry_hashtable_find(t, &m);
122  * \endcode
123  *
124  * The mymatch_cmp() function is only called for finding hashtable entries and
125  * can mutate the mymatch_t object for doing things like deferred and cached
126  * evaluation of expensive match data. It can also access the whole myentry_t
127  * object to match against more than just the key. */
128 
129 /** The hashtable type. */
130 typedef struct hashtable {
131  int size; /**< Size of allocated hashtable. */
132  int count; /**< Number of entries in hashtable. */
133  unsigned tmask; /**< Mask to get the hashtable index. */
134 # ifndef HASHTABLE_NBLOOM
135  unsigned bshift; /**< Shift to get the bloomfilter index. */
136 # endif
137 # ifndef HASHTABLE_NSTATS
138  /* The following are for accumulating NAME_find() stats. */
139  long find_count; /**< The count of finds tried. */
140  long match_count; /**< The count of matches found. */
141  long hashcmp_count; /**< The count of hash compares done. */
142  long entrycmp_count; /**< The count of entry compares done. */
143 # endif
144 # ifndef HASHTABLE_NBLOOM
145  unsigned char *kbloom; /**< Bloom filter of hash keys with k=1. */
146 # endif
147  void **etable; /**< Table of pointers to entries. */
148  unsigned ktable[]; /**< Table of hash keys. */
150 
151 /* void* implementations for the type-safe static inline wrappers below. */
152 hashtable_t *_hashtable_new(int size);
153 void _hashtable_free(hashtable_t *t);
154 
155 # ifndef HASHTABLE_NBLOOM
156 static inline void hashtable_setbloom(hashtable_t *t, unsigned const h)
157 {
158  /* Use upper bits for a "different hash". */
159  unsigned const i = h >> t->bshift;
160  t->kbloom[i / 8] |= (unsigned char)(1 << (i % 8));
161 }
162 
163 static inline bool hashtable_getbloom(hashtable_t *t, unsigned const h)
164 {
165  /* Use upper bits for a "different hash". */
166  unsigned const i = h >> t->bshift;
167  return (t->kbloom[i / 8] >> (i % 8)) & 1;
168 }
169 # endif
170 
171 /** MurmurHash3 finalization mix function. */
172 static inline unsigned mix32(unsigned h)
173 {
174  h ^= h >> 16;
175  h *= 0x85ebca6b;
176  h ^= h >> 13;
177  h *= 0xc2b2ae35;
178  h ^= h >> 16;
179  return h;
180 }
181 
182 /** Ensure hash's are never zero. */
183 static inline unsigned nozero(unsigned h)
184 {
185  return h ? h : (unsigned)-1;
186 }
187 
188 #endif /* _HASHTABLE_H_ */
189 
190 /* If ENTRY is defined, define type-dependent static inline methods. */
191 #ifdef ENTRY
192 
193 # define _JOIN2(x, y) x##y
194 # define _JOIN(x, y) _JOIN2(x, y)
195 
196 # ifndef KEY
197 # define KEY ENTRY
198 # endif
199 
200 # ifndef MATCH
201 # define MATCH KEY
202 # endif
203 
204 # ifndef NAME
205 # define NAME _JOIN(ENTRY, _hashtable)
206 # endif
207 
208 # define ENTRY_t _JOIN(ENTRY, _t) /**< The entry type. */
209 # define KEY_t _JOIN(KEY, _t) /**< The key type. */
210 # define MATCH_t _JOIN(MATCH, _t) /**< The match type. */
211 # define KEY_hash _JOIN(KEY, _hash) /**< The key hash(k) method. */
212 # define MATCH_cmp _JOIN(MATCH, _cmp) /**< The match cmp(m, e) method. */
213 /* The names for all the hashtable methods. */
214 # define NAME_new _JOIN(NAME, _new)
215 # define NAME_free _JOIN(NAME, _free)
216 # define NAME_stats_init _JOIN(NAME, _stats_init)
217 # define NAME_add _JOIN(NAME, _add)
218 # define NAME_find _JOIN(NAME, _find)
219 # define NAME_iter _JOIN(NAME, _iter)
220 # define NAME_next _JOIN(NAME, _next)
221 
222 /* Modified hash() with/without mix32() and reserving zero for empty buckets. */
223 # ifdef HASHTABLE_NMIX32
224 # define _KEY_HASH(k) nozero(KEY_hash((KEY_t *)k))
225 # else
226 # define _KEY_HASH(k) nozero(mix32(KEY_hash((KEY_t *)k)))
227 # endif
228 
229 /* Loop macro for probing table t for key hash hk, iterating with index i and
230  entry hash h, terminating at an empty bucket. */
231 # define _for_probe(t, hk, i, h) \
232  unsigned const *const ktable = t->ktable;\
233  unsigned const tmask = t->tmask;\
234  unsigned i, s, h;\
235  for (i = hk & tmask, s = 0; (h = ktable[i]); i = (i + ++s) & tmask)
236 
237 /* Conditional macro for incrementing stats counters. */
238 # ifndef HASHTABLE_NSTATS
239 # define _stats_inc(c) (c++)
240 # else
241 # define _stats_inc(c)
242 # endif
243 
244 /** Allocate and initialize a hashtable instance.
245  *
246  * The provided size is used as an indication of the number of entries you wish
247  * to add, but the allocated size will probably be larger depending on the
248  * implementation to enable optimisations or avoid degraded performance. It may
249  * be possible to fill the table beyond the requested size, but performance can
250  * start to degrade badly if it is over filled.
251  *
252  * \param size - The desired minimum size of the hash table.
253  *
254  * \return The initialized hashtable instance or NULL if it failed. */
255 static inline hashtable_t *NAME_new(int size)
256 {
257  return _hashtable_new(size);
258 }
259 
260 /** Destroy and free a hashtable instance.
261  *
262  * This will free the hashtable, but will not free the entries in the
263  * hashtable. If you want to free the entries too, use a hashtable iterator to
264  * free the the entries first.
265  *
266  * \param *t - The hashtable to destroy and free. */
267 static inline void NAME_free(hashtable_t *t)
268 {
269  _hashtable_free(t);
270 }
271 
272 /** Initialize hashtable stats counters.
273  *
274  * This will reset all the stats counters for the hashtable,
275  *
276  * \param *t - The hashtable to initializ stats for. */
277 static inline void NAME_stats_init(hashtable_t *t)
278 {
279 # ifndef HASHTABLE_NSTATS
280  t->find_count = t->match_count = t->hashcmp_count = t->entrycmp_count = 0;
281 # endif
282 }
283 
284 /** Add an entry to a hashtable.
285  *
286  * This doesn't use MATCH_cmp() or do any checks for existing copies or
287  * instances, so it will add duplicates. If you want to avoid adding
288  * duplicates, use NAME_find() to check for existing entries first.
289  *
290  * \param *t - The hashtable to add to.
291  *
292  * \param *e - The entry object to add.
293  *
294  * \return The added entry, or NULL if the table is full. */
295 static inline ENTRY_t *NAME_add(hashtable_t *t, ENTRY_t *e)
296 {
297  unsigned he = _KEY_HASH(e);
298 
299  assert(e != NULL);
300  if (t->count + 1 == t->size)
301  return NULL;
302 # ifndef HASHTABLE_NBLOOM
303  hashtable_setbloom(t, he);
304 # endif
305  _for_probe(t, he, i, h);
306  t->count++;
307  t->ktable[i] = he;
308  return t->etable[i] = e;
309 }
310 
311 /** Find an entry in a hashtable.
312  *
313  * Uses MATCH_cmp() to find the first matching entry in the table in the same
314  * hash() bucket.
315  *
316  * \param *t - The hashtable to search.
317  *
318  * \param *m - The key or match object to search for.
319  *
320  * \return The first found entry, or NULL if nothing was found. */
321 static inline ENTRY_t *NAME_find(hashtable_t *t, MATCH_t *m)
322 {
323  assert(m != NULL);
324  unsigned hm = _KEY_HASH(m);
325  ENTRY_t *e;
326 
327  _stats_inc(t->find_count);
328 # ifndef HASHTABLE_NBLOOM
329  if (!hashtable_getbloom(t, hm))
330  return NULL;
331 # endif
332  _for_probe(t, hm, i, he) {
333  _stats_inc(t->hashcmp_count);
334  if (hm == he) {
335  _stats_inc(t->entrycmp_count);
336  if (!MATCH_cmp(m, e = t->etable[i])) {
337  _stats_inc(t->match_count);
338  return e;
339  }
340  }
341  }
342  /* Also count the compare for the empty bucket. */
343  _stats_inc(t->hashcmp_count);
344  return NULL;
345 }
346 
347 static inline ENTRY_t *NAME_next(hashtable_t *t, int *i);
348 
349 /** Initialize a iteration and return the first entry.
350  *
351  * This works together with NAME_next() for iterating through all entries in a
352  * hashtable.
353  *
354  * Example: \code
355  * for (e = NAME_iter(t, &i); e != NULL; e = NAME_next(t, &i))
356  * ...
357  * \endcode
358  *
359  * \param *t - the hashtable to iterate over.
360  *
361  * \param *i - the int iterator index to initialize.
362  *
363  * \return The first entry or NULL if the hashtable is empty. */
364 static inline ENTRY_t *NAME_iter(hashtable_t *t, int *i)
365 {
366  assert(t != NULL);
367  assert(i != NULL);
368  *i = 0;
369  return NAME_next(t, i);
370 }
371 
372 /** Get the next entry from a hashtable iterator or NULL when finished.
373  *
374  * This works together with NAME_iter() for iterating through all entries in a
375  * hashtable.
376  *
377  * \param *t - the hashtable to iterate over.
378  *
379  * \param *i - the int iterator index to use.
380  *
381  * \return The next entry or NULL if the iterator is finished. */
382 static inline ENTRY_t *NAME_next(hashtable_t *t, int *i)
383 {
384  assert(t != NULL);
385  assert(i != NULL);
386  ENTRY_t *e = NULL;
387 
388  while ((*i < t->size) && !(e = t->etable[(*i)++])) ;
389  return e;
390 }
391 
392 # undef ENTRY
393 # undef KEY
394 # undef MATCH
395 # undef NAME
396 # undef ENTRY_t
397 # undef KEY_t
398 # undef MATCH_t
399 # undef KEY_hash
400 # undef MATCH_cmp
401 # undef NAME_new
402 # undef NAME_free
403 # undef NAME_stats_init
404 # undef NAME_add
405 # undef NAME_find
406 # undef NAME_iter
407 # undef NAME_next
408 # undef _KEY_HASH
409 #endif /* ENTRY */
#define ENTRY_t
The entry type.
Definition: hashtable.h:208
static void NAME_stats_init(hashtable_t *t)
Initialize hashtable stats counters.
Definition: hashtable.h:277
static ENTRY_t * NAME_next(hashtable_t *t, int *i)
Get the next entry from a hashtable iterator or NULL when finished.
Definition: hashtable.h:382
static ENTRY_t * NAME_add(hashtable_t *t, ENTRY_t *e)
Add an entry to a hashtable.
Definition: hashtable.h:295
#define MATCH_cmp
The match cmp(m, e) method.
Definition: hashtable.h:212
struct hashtable hashtable_t
The hashtable type.
#define MATCH_t
The match type.
Definition: hashtable.h:210
static void NAME_free(hashtable_t *t)
Destroy and free a hashtable instance.
Definition: hashtable.h:267
static ENTRY_t * NAME_iter(hashtable_t *t, int *i)
Initialize a iteration and return the first entry.
Definition: hashtable.h:364
static ENTRY_t * NAME_find(hashtable_t *t, MATCH_t *m)
Find an entry in a hashtable.
Definition: hashtable.h:321
static unsigned mix32(unsigned h)
MurmurHash3 finalization mix function.
Definition: hashtable.h:172
static unsigned nozero(unsigned h)
Ensure hash's are never zero.
Definition: hashtable.h:183
static hashtable_t * NAME_new(int size)
Allocate and initialize a hashtable instance.
Definition: hashtable.h:255
The hashtable type.
Definition: hashtable.h:130
long find_count
The count of finds tried.
Definition: hashtable.h:139
unsigned ktable[]
Table of hash keys.
Definition: hashtable.h:148
long match_count
The count of matches found.
Definition: hashtable.h:140
int count
Number of entries in hashtable.
Definition: hashtable.h:132
unsigned char * kbloom
Bloom filter of hash keys with k=1.
Definition: hashtable.h:145
long entrycmp_count
The count of entry compares done.
Definition: hashtable.h:142
int size
Size of allocated hashtable.
Definition: hashtable.h:131
unsigned tmask
Mask to get the hashtable index.
Definition: hashtable.h:133
void ** etable
Table of pointers to entries.
Definition: hashtable.h:147
long hashcmp_count
The count of hash compares done.
Definition: hashtable.h:141
unsigned bshift
Shift to get the bloomfilter index.
Definition: hashtable.h:135