/++
   A dense hashtable is a particular implementation of
   a hashtable: one that is meant to minimize memory allocation.
   It does this by using an array to store all the data.  We
   steal a value from the key space to indicate "empty" array
   elements (i.e., indices where no item lives) and another to indicate
   "deleted" elements.

   (Note it is possible to change the value of the deleted key
   on the fly; you can even remove it, though after that point
   the hashtable is insert-only until you set it again.  The empty
   value however can't be changed.)

   To minimize allocation and pointer overhead, we use internal
   probing, in which the hashtable is a single table, and collisions
   are resolved by trying to insert again in another bucket.  The
   most cache-efficient internal probing schemes are linear probing
   (which suffers, alas, from clumping) and quadratic probing, which
   is what we implement by default.

   Type requirements: value_type is required to be Copy Constructible
   and Default Constructible.  It is not required to be (and commonly
   isn't) Assignable.

   You probably shouldn't use this code directly.  Use dense_hash_map<>
   or dense_hash_set<> instead.

   You can change the following below:

   - HT_OCCUPANCY_PCT -- how full before we double size

   - HT_EMPTY_PCT -- how empty before we halve size

   - HT_MIN_BUCKETS -- default smallest bucket size

   You can also change enlarge_factor (which defaults to
   HT_OCCUPANCY_PCT), and shrink_factor (which defaults to
   HT_EMPTY_PCT) with set_resizing_parameters().

   How to decide what values to use?
   shrink_factor's default of .4 * OCCUPANCY_PCT is probably good.
   HT_MIN_BUCKETS is probably unnecessary since you can specify
   (indirectly) the starting number of buckets at construct-time.
   For enlarge_factor, you can use this chart to trade off expected
   lookup time against the space taken up.  By default, this code
   uses quadratic probing (the C++ original lets you switch to linear
   via JUMP_; this port hard-codes quadratic).

   From http://www.augustana.ca/~mohrj/courses/1999.fall/csc210/lecture_notes/hashing.html

   ---
   NUMBER OF PROBES / LOOKUP        Successful            Unsuccessful
   Quadratic collision resolution   1 - ln(1-L) - L/2     1/(1-L) - L - ln(1-L)
   Linear collision resolution      [1+1/(1-L)]/2         [1+1/(1-L)^2]/2

   -- enlarge_factor --             0.10  0.50  0.60  0.75  0.80  0.90  0.99
   QUADRATIC COLLISION RES.
      probes/successful lookup      1.05  1.44  1.62  2.01  2.21  2.85  5.11
      probes/unsuccessful lookup    1.11  2.19  2.82  4.64  5.81  11.4  103.6
   LINEAR COLLISION RES.
      probes/successful lookup      1.06  1.5   1.75  2.5   3.0   5.5   50.5
      probes/unsuccessful lookup    1.12  2.5   3.6   8.5   13.0  50.0  5000.0
   ---
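
   As a worked check of the chart (with load factor L = enlarge_factor):
   at the default L = 0.50, ln(1-L) = -0.693, so quadratic probing costs
   about 1 + 0.693 - 0.25 = 1.44 probes per successful lookup and about
   2 - 0.5 + 0.693 = 2.19 per unsuccessful lookup, matching the 0.50
   column above.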

   Copyright: (c) 2005, Google Inc.

   License:
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.
       * Neither the name of Google Inc. nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/
module s2.util.container.dense_hash_table;

import std.algorithm;
import std.array;
import std.range;
import std.typecons;

// Helper function that infers the callable template types from the provided
// arguments; Value and Key must still be specified explicitly.
auto denseHashTable(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey)(
    size_t expected_max_items_in_table,
    HashFcn hashFcn,
    ExtractKey extractKey,
    SetKey setKey,
    EqualKey equalKey) {
  return new DenseHashTable!(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey)(
      expected_max_items_in_table,
      hashFcn,
      extractKey,
      setKey,
      equalKey);
}
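
// A minimal usage sketch (not part of the original API surface): an int-keyed
// "set" where each Value is its own Key.  The sentinel values passed to
// setEmptyKey()/setDeletedKey() are arbitrary choices that must never be
// inserted as real elements.
unittest {
  auto ht = denseHashTable!(int, int)(
      0,                                           // expected max items (0 = default size)
      (int key) => cast(size_t) key,               // HashFcn
      (int value) => value,                        // ExtractKey
      (ref int value, int key) { value = key; },   // SetKey
      (int a, int b) => a == b);                   // EqualKey
  ht.setEmptyKey(-1);    // required before first use
  ht.setDeletedKey(-2);  // required before erase() may be used
  ht.insert(42);
  assert(ht.find(42) != ht.end());
  assert(ht.find(7) == ht.end());
  assert(ht.size() == 1);
}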

/**
   Hashtable class, used to implement the hashed associative containers
   hash_set and hash_map.

   Params:
     Value = What is stored in the table (each bucket is a Value).
     Key = Something in a 1-to-1 correspondence to a Value, that can be used
       to search for a Value in the table (find() takes a Key).
     HashFcn = A callable type which, when given a Key, returns an integer;
       the more unique the better.
     ExtractKey = A callable type which, when given a Value, returns the
       unique Key associated with it.
     SetKey = A callable type which, when given a ref Value and a Key,
       modifies the value such that ExtractKey(value) == key.  We guarantee
       this is only called with key == deleted_key or key == empty_key.
     EqualKey = A callable type which, when given two Keys, says whether they
       are the same (that is, if they are both associated with the same Value).
*/
class DenseHashTable(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey) {
public:
  // ACCESSOR FUNCTIONS for the things we templatize on, basically.
  // These wrappers also validate at compile time that the given callables
  // have the right signatures.
  size_t hash(in Key key) const {
    return _keyValInfo.hashFcn(key);
  }

  Key getKey(in Value v) const {
    return _keyValInfo.extractKey(v);
  }

  void setKey(ref Value v, Key k) const {
    _keyValInfo.setKey(v, k);
  }

  bool equalKey(in Key a, in Key b) const {
    return _keyValInfo.equalKey(a, b);
  }

  alias KeyType = Key;
  alias ValueType = Value;

  alias ThisT = DenseHashTable!(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey);
  alias Iterator = DenseHashTableIterator!(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey);

  /**
     How full we let the table get before we resize.  Knuth says .8 is
     good -- higher causes us to probe too much, though it saves memory.

     However, we go with .5, getting better performance at the cost of
     more space (a trade-off densehashtable explicitly chooses to make).
     Feel free to play around with different values, though, via
     max_load_factor() and/or set_resizing_parameters().
  */
  enum size_t HT_OCCUPANCY_PCT = 50;

  /**
     How empty we let the table get before we resize lower, by default.
     (0.0 means never resize lower.)
     It should be less than OCCUPANCY_PCT / 2 or we thrash resizing.
  */
  enum size_t HT_EMPTY_PCT = cast(size_t)(0.4 * HT_OCCUPANCY_PCT);

  /**
     Minimum size we're willing to let hashtables be.
     Must be a power of two, and at least 4.
     Note, however, that for a given hashtable, the initial size is a
     function of the first constructor arg, and may be > HT_MIN_BUCKETS.
  */
  enum size_t HT_MIN_BUCKETS = 4;

  /**
     By default, if you don't specify a hashtable size at
     construction-time, we use this size.  Must be a power of two, and
     at least HT_MIN_BUCKETS.
  */
  enum size_t HT_DEFAULT_STARTING_BUCKETS = 32;

  // ITERATOR FUNCTIONS
  Iterator begin() {
    return Iterator(this, _table[0 .. _numBuckets], true);
  }

  Iterator end() {
    return Iterator(this, _table[_numBuckets .. _numBuckets], true);
  }

  // Accessor function for statistics gathering.
  int numTableCopies() const {
    return _settings.numHtCopies;
  }

private:
  void destroyBuckets(size_t first, size_t last) {
    for ( ; first != last; ++first)
      _table[first] = Value.init;
  }

  // DELETE HELPER FUNCTIONS
  // This lets the user describe a key that will indicate deleted
  // table entries.  This key should be an "impossible" entry --
  // if you try to insert it for real, you won't be able to retrieve it!
  // (NB: while you pass in an entire value, only the key part is looked
  // at.  This is just because I don't know how to assign just a key.)
  void squashDeleted() {
    if (_numDeleted) {
      auto tmp = new DenseHashTable(this);  // copying will get rid of deleted entries
      swap(tmp);                            // now we are tmp
    }
    assert(_numDeleted == 0);
  }

  // Test if the given key is the deleted indicator.  Requires
  // num_deleted > 0, for correctness of read(), and because that
  // guarantees that key_info.delkey is valid.
  bool testDeletedKey(in Key key) const {
    assert(_numDeleted > 0);
    return equalKey(_keyValInfo.delKey, key);
  }

public:
  void setDeletedKey(in Key key) {
    // The empty indicator (if specified) and the deleted indicator
    // must be different.
    assert(!_settings.useEmpty || !equalKey(key, getKey(_keyValInfo.emptyValue)),
        "The deleted-key must be distinct from the empty-key.");
    // It's only safe to change what "deleted" means if we purge deleted guys.
    squashDeleted();
    _settings.useDeleted = true;
    _keyValInfo.delKey = key;
  }

  void clearDeletedKey() {
    squashDeleted();
    _settings.useDeleted = false;
  }

  Key deletedKey() const {
    assert(_settings.useDeleted, "Must set deleted key before calling deletedKey().");
    return _keyValInfo.delKey;
  }
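
  // Illustrative lifecycle of the deleted key (a sketch; assumes int keys and
  // that -1 is never inserted as a real key):
  //   ht.setDeletedKey(-1);  // enables erase(); erased slots get key -1
  //   ht.erase(k);           // marks k's bucket deleted -- no compaction
  //   ht.clearDeletedKey();  // purges deleted slots; table is insert-only again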

  // These are public so the iterators can use them.
  // True if the item at position bucknum is the "deleted" marker.
  bool testDeleted(size_t bucknum) const {
    // Invariant: !use_deleted() implies num_deleted is 0.
    assert(_settings.useDeleted || _numDeleted == 0);
    return _numDeleted > 0 && testDeletedKey(getKey(_table[bucknum]));
  }

  bool testDeleted(in Iterator it) const {
    // Invariant: !use_deleted() implies num_deleted is 0.
    assert(_settings.useDeleted || _numDeleted == 0);
    return _numDeleted > 0 && testDeletedKey(getKey(*it));
  }

private:
  void checkUseDeleted(string caller) {
    assert(_settings.useDeleted, caller);
  }

  // Set it so testDeleted is true.  Returns true if the object wasn't previously deleted.
  bool setDeleted(Iterator it) {
    checkUseDeleted("set_deleted()");
    bool retval = !testDeleted(it);
    // *it gives a reference to the bucket's value.
    setKey(*it, _keyValInfo.delKey);
    return retval;
  }

  // Set it so testDeleted is false.  Returns true if the object was previously deleted.
  bool clearDeleted(Iterator it) {
    checkUseDeleted("clear_deleted()");
    // Happens automatically when we assign something else in its place.
    return testDeleted(it);
  }

  // EMPTY HELPER FUNCTIONS
  // This lets the user describe a key that will indicate empty (unused)
  // table entries.  This key should be an "impossible" entry --
  // if you try to insert it for real, you won't be able to retrieve it!
  // (NB: while you pass in an entire value, only the key part is looked
  // at.  This is just because I don't know how to assign just a key.)
public:
  // These are public so the iterators can use them.
  // True if the item at position bucknum is the "empty" marker.
  bool testEmpty(size_t bucknum) const {
    assert(_settings.useEmpty, "HashTable must call setEmptyKey() before use!");
    return equalKey(getKey(_keyValInfo.emptyValue), getKey(_table[bucknum]));
  }

  bool testEmpty(in Iterator it) const {
    assert(_settings.useEmpty, "HashTable must call setEmptyKey() before use!");
    return equalKey(getKey(_keyValInfo.emptyValue), getKey(*it));
  }

private:
  void fillRangeWithEmpty(size_t table_start, size_t table_end) {
    _table[table_start .. table_end] = _keyValInfo.emptyValue;
  }

public:
  // TODO(csilvers): change all callers of this to pass in a key instead,
  //                 and take a const key_type instead of const value_type.
  void setEmptyKey(Value val) {
    // Once you set the empty key, you can't change it.
    assert(!_settings.useEmpty, "Calling setEmptyKey() multiple times, which is invalid!");
    // The deleted indicator (if specified) and the empty indicator
    // must be different.
    assert(!_settings.useDeleted || !equalKey(getKey(val), _keyValInfo.delKey),
        "setEmptyKey() must be called with a key distinct from the deleted-key!");
    _settings.useEmpty = true;
    _keyValInfo.emptyValue = val;

    assert(_table is null);  // must set before first use
    // num_buckets was set in the constructor even though the table was not initialized.
    _table = new Value[_numBuckets];
    assert(_table);
    fillRangeWithEmpty(0, _numBuckets);
  }

  Key emptyKey() const {
    assert(_settings.useEmpty);
    return getKey(_keyValInfo.emptyValue);
  }

  // FUNCTIONS CONCERNING SIZE
public:
  size_t size() const {
    return _numElements - _numDeleted;
  }

  size_t maxSize() const {
    return size_t.max;
  }

  bool empty() const {
    return size() == 0;
  }

  size_t bucketCount() const {
    return _numBuckets;
  }

  size_t maxBucketCount() const {
    return maxSize();
  }

  size_t nonemptyBucketCount() const {
    return _numElements;
  }

  // These are tr1 methods.  Their idea of 'bucket' doesn't map well to
  // what we do.  We just say every bucket has 0 or 1 items in it.
  size_t bucketSize(size_t i) const {
    return (testEmpty(i) || testDeleted(i)) ? 0 : 1;
  }

private:
  // Because of the above, size_t(-1) is never legal; use it for errors.
  enum size_t ILLEGAL_BUCKET = cast(size_t) -1;

  // Used after a string of deletes.  Returns true if we actually shrank.
  // TODO(csilvers): take a delta so we can take into account inserts
  //                 done after shrinking.  Maybe make part of the Settings class?
  bool maybeShrink()
  in {
    assert(_numElements >= _numDeleted);
    assert((bucketCount() & (bucketCount() - 1)) == 0);  // is a power of two
    assert(bucketCount() >= HT_MIN_BUCKETS);
  } do {
    bool retval = false;

    // If you construct a hashtable with < HT_DEFAULT_STARTING_BUCKETS,
    // we'll never shrink until you get relatively big, and we'll never
    // shrink below HT_DEFAULT_STARTING_BUCKETS.  Otherwise, something
    // like "dense_hash_set<int> x; x.insert(4); x.erase(4);" will
    // shrink us down to HT_MIN_BUCKETS buckets, which is too small.
    const size_t num_remain = _numElements - _numDeleted;
    const size_t shrink_threshold = _settings.shrinkThreshold;
    if (shrink_threshold > 0 && num_remain < shrink_threshold
        && bucketCount() > HT_DEFAULT_STARTING_BUCKETS) {
      const float shrink_factor = _settings.shrinkFactor;
      size_t sz = bucketCount() / 2;  // find how much we should shrink
      while (sz > HT_DEFAULT_STARTING_BUCKETS && num_remain < sz * shrink_factor) {
        sz /= 2;  // stay a power of 2
      }
      auto tmp = new DenseHashTable(this, sz);  // do the actual resizing
      swap(tmp);                                // now we are tmp
      retval = true;
    }
    _settings.considerShrink = false;  // because we just considered it
    return retval;
  }
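
  // Worked example of the thresholds (a sketch, using the defaults of
  // HT_OCCUPANCY_PCT = 50 and HT_EMPTY_PCT = 20): with 64 buckets,
  // enlargeThreshold is 64 * 0.5 = 32 and shrinkThreshold is 64 * 0.2 = 12,
  // so the table doubles when a 33rd element would be added, and is
  // considered for shrinking when fewer than 12 live elements remain.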

  // We'll let you resize a hashtable -- though this makes us copy all!
  // When you resize, you say, "make it big enough for this many more elements".
  // Returns true if we actually resized, false if size was already ok.
  bool resizeDelta(size_t delta) {
    bool did_resize = false;
    if (_settings.considerShrink) {  // see if lots of deletes happened
      if (maybeShrink())
        did_resize = true;
    }
    if (_numElements >= size_t.max - delta) {
      throw new Exception("resize overflow");
    }
    if (bucketCount() >= HT_MIN_BUCKETS
        && (_numElements + delta) <= _settings.enlargeThreshold)
      return did_resize;  // we're ok as we are

    // Sometimes, we need to resize just to get rid of all the
    // "deleted" buckets that are clogging up the hashtable.  So when
    // deciding whether to resize, count the deleted buckets (which
    // are currently taking up room).  But later, when we decide what
    // size to resize to, *don't* count deleted buckets, since they
    // get discarded during the resize.
    size_t needed_size = _settings.minBuckets(_numElements + delta, 0);
    if (needed_size <= bucketCount())  // we have enough buckets
      return did_resize;

    size_t resize_to =
        _settings.minBuckets(_numElements - _numDeleted + delta, bucketCount());

    // When num_deleted is large, we may still grow but we do not want to
    // over expand.  So we reduce needed_size by a portion of num_deleted
    // (the exact portion does not matter).  This is especially helpful
    // when min_load_factor is zero (no shrink at all) to avoid doubling
    // the bucket count to infinity.  See also test ResizeWithoutShrink.
    needed_size = _settings.minBuckets(_numElements - _numDeleted / 4 + delta, 0);
    if (resize_to < needed_size  // may double resize_to
        && resize_to < size_t.max / 2) {
      // This situation means that we have enough deleted elements that
      // once we purge them, we won't actually have needed to grow.  But
      // we may want to grow anyway: if we just purge one element, say,
      // we'll have to grow anyway the next time we insert.  Might as well
      // grow now, since we're already going through the trouble of
      // copying (in order to purge the deleted elements).
      const size_t target = _settings.shrinkSize(resize_to * 2);
      if (_numElements - _numDeleted + delta >= target) {
        // Good, we won't be below the shrink threshold even if we double.
        resize_to *= 2;
      }
    }
    auto tmp = new DenseHashTable(this, resize_to);
    swap(tmp);  // now we are tmp
    return true;
  }

  // We require the table to be non-null and empty before calling these.
  void resizeTable(size_t new_size) {
    _table.length = new_size;
  }

  void resizeTable(size_t old_size, size_t new_size) {
    _table = new Value[new_size];
  }

  // Used to actually do the rehashing when we grow/shrink a hashtable.
  void copyFrom(/*in*/ DenseHashTable ht, size_t min_buckets_wanted) {
    clearToSize(_settings.minBuckets(ht.size(), min_buckets_wanted));

    // We use a normal iterator to get non-deleted buckets from ht.
    // We could use insert() here, but since we know there are
    // no duplicates and no deleted items, we can be more efficient.
    assert((bucketCount() & (bucketCount() - 1)) == 0);  // a power of two
    for (Iterator it = ht.begin(); it != ht.end(); ++it) {
      size_t num_probes = 0;  // how many times we've probed
      size_t bucknum;
      const size_t bucket_count_minus_one = bucketCount() - 1;
      for (bucknum = hash(getKey(*it)) & bucket_count_minus_one;
           !testEmpty(bucknum);  // not empty
           bucknum = (bucknum + num_probes) & bucket_count_minus_one) {
        ++num_probes;
        assert(num_probes < bucketCount(),
            "Hashtable is full: an error in key_equal<> or hash<>");
      }
      _table[bucknum] = *it;  // copies the value to here
      _numElements++;
    }
    _settings.numHtCopies++;
  }

  // Required by the spec for hashed associative container.
public:
  // Though the docs say this should be num_buckets, I think it's much
  // more useful as num_elements.  As a special feature, calling with
  // req_elements==0 will cause us to shrink if we can, saving space.
  void resize(size_t req_elements) {  // resize to this or larger
    if (_settings.considerShrink || req_elements == 0)
      maybeShrink();
    if (req_elements > _numElements)
      resizeDelta(req_elements - _numElements);
  }
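
  // For instance (a sketch using the default enlarge factor of 0.5):
  // resize(1000) on a fresh table must find the smallest power of two
  // whose enlarge threshold exceeds 1000; 1024 * 0.5 = 512 is too small,
  // so the table ends up with 2048 buckets.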

  // Get and change the value of shrink_factor and enlarge_factor.  The
  // description at the beginning of this file explains how to choose
  // the values.  Setting the shrink parameter to 0.0 ensures that the
  // table never shrinks.
  void getResizingParameters(out float shrink, out float grow) const {
    shrink = _settings.shrinkFactor;
    grow = _settings.enlargeFactor;
  }

  void setResizingParameters(float shrink, float grow) {
    _settings.setResizingParameters(shrink, grow);
    _settings.resetThresholds(bucketCount());
  }

  // CONSTRUCTORS -- as required by the specs, we take a size,
  // but also let you specify a hashfunction, key comparator,
  // and key extractor.  We also define a copy constructor.
  // DESTRUCTOR -- needs to free the table.
  this(size_t expected_max_items_in_table,
      HashFcn hashFcn,
      ExtractKey extractKey,
      SetKey setKey,
      EqualKey equalKey) {
    // Create the settings first: minBuckets() below reads the enlarge factor.
    _settings = new Settings();

    _numDeleted = 0;
    _numElements = 0;
    _numBuckets = expected_max_items_in_table == 0
        ? HT_DEFAULT_STARTING_BUCKETS
        : _settings.minBuckets(expected_max_items_in_table, 0);
    _table = null;

    // Keep track of callable entities (functions, delegates, functors) used to work with keys.
    _keyValInfo = new KeyValInfo();
    _keyValInfo.hashFcn = hashFcn;
    _keyValInfo.extractKey = extractKey;
    _keyValInfo.setKey = setKey;
    _keyValInfo.equalKey = equalKey;

    // The table is null until the empty value is set.  However, we set
    // num_buckets here so we know how much space to allocate once it is set.
    _settings.resetThresholds(bucketCount());
  }

  // As a convenience for resize(), we allow an optional second argument
  // which lets you make this new hashtable a different size than ht.
  this(this DenseHashTableT)(
      DenseHashTableT ht, size_t min_buckets_wanted = HT_DEFAULT_STARTING_BUCKETS) {
    // Clone the settings rather than sharing them with ht: each table keeps
    // its own thresholds, recomputed below from its own bucket count.
    _settings = new Settings();
    _settings.enlargeFactor = ht._settings.enlargeFactor;
    _settings.shrinkFactor = ht._settings.shrinkFactor;
    _settings.useEmpty = ht._settings.useEmpty;
    _settings.useDeleted = ht._settings.useDeleted;
    _settings.numHtCopies = ht._settings.numHtCopies;

    _keyValInfo = ht._keyValInfo;
    _numDeleted = 0;
    _numElements = 0;
    _numBuckets = 0;
    _table = null;

    if (!ht._settings.useEmpty) {
      // If use_empty isn't set, copyFrom will crash, so we do our own copying.
      assert(ht.empty());
      _numBuckets = _settings.minBuckets(ht.size(), min_buckets_wanted);
      _settings.resetThresholds(bucketCount());
      return;
    }
    _settings.resetThresholds(bucketCount());
    copyFrom(ht, min_buckets_wanted);  // copyFrom() ignores deleted entries
  }

  ~this() {
    if (_table) {
      destroyBuckets(0, _numBuckets);
      //_keyValInfo.deallocate(table, _numBuckets);
    }
  }

  // Many STL algorithms use swap instead of copy constructors.
  void swap(DenseHashTable ht) {
    .swap(_settings, ht._settings);
    .swap(_keyValInfo, ht._keyValInfo);
    .swap(_numDeleted, ht._numDeleted);
    .swap(_numElements, ht._numElements);
    .swap(_numBuckets, ht._numBuckets);
    .swap(_table, ht._table);
    _settings.resetThresholds(bucketCount());  // also resets considerShrink
    ht._settings.resetThresholds(ht.bucketCount());
  }
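
  // The resize idiom used throughout this class (a sketch): build a
  // right-sized copy, which rehashes only the live entries, then swap
  // this table's innards with it and discard the copy:
  //   auto tmp = new DenseHashTable(this, new_size);
  //   swap(tmp);  // now we are tmp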

private:
  void clearToSize(size_t new_num_buckets) {
    if (!_table) {
      // TODO: Use a custom allocator here.
      _table = new Value[new_num_buckets];
    } else {
      destroyBuckets(0, _numBuckets);
      if (new_num_buckets != _numBuckets) {  // resize, if necessary
        resizeTable(_numBuckets, new_num_buckets);
      }
    }
    assert(_table);
    fillRangeWithEmpty(0, new_num_buckets);
    _numElements = 0;
    _numDeleted = 0;
    _numBuckets = new_num_buckets;  // our new size
    _settings.resetThresholds(bucketCount());
  }

public:
  // It's always nice to be able to clear a table without deallocating it.
  void clear() {
    // If the table is already empty, and the number of buckets is
    // already as we desire, there's nothing to do.
    const size_t new_num_buckets = _settings.minBuckets(0, 0);
    if (_numElements == 0 && new_num_buckets == _numBuckets) {
      return;
    }
    clearToSize(new_num_buckets);
  }

  // Clear the table without resizing it.
  // Mimics the stl_hashtable's behaviour when clear()-ing in that it
  // does not modify the bucket count.
  void clearNoResize() {
    if (_numElements > 0) {
      assert(_table);
      destroyBuckets(0, _numBuckets);
      fillRangeWithEmpty(0, _numBuckets);
    }
    // don't consider shrinking before another erase()
    _settings.resetThresholds(bucketCount());
    _numElements = 0;
    _numDeleted = 0;
  }

  // LOOKUP ROUTINES
private:
  struct Pair(T1, T2) {
    T1 first;
    T2 second;
  }

  // Returns a pair of positions: 1st where the object is, 2nd where
  // it would go if you wanted to insert it.  1st is ILLEGAL_BUCKET
  // if object is not found; 2nd is ILLEGAL_BUCKET if it is.
  // Note: because of deletions where-to-insert is not trivial: it's the
  // first deleted bucket we see, as long as we don't find the key later.
  Pair!(size_t, size_t) findPosition(in Key key) const {
    size_t num_probes = 0;  // how many times we've probed
    const size_t bucket_count_minus_one = bucketCount() - 1;
    size_t bucknum = hash(key) & bucket_count_minus_one;
    size_t insert_pos = ILLEGAL_BUCKET;  // where we would insert
    while (true) {  // probe until something happens
      if (testEmpty(bucknum)) {  // bucket is empty
        if (insert_pos == ILLEGAL_BUCKET)  // found no prior place to insert
          return Pair!(size_t, size_t)(ILLEGAL_BUCKET, bucknum);
        else
          return Pair!(size_t, size_t)(ILLEGAL_BUCKET, insert_pos);
      } else if (testDeleted(bucknum)) {  // keep searching, but mark to insert
        if (insert_pos == ILLEGAL_BUCKET)
          insert_pos = bucknum;
      } else if (equalKey(key, getKey(_table[bucknum]))) {
        return Pair!(size_t, size_t)(bucknum, ILLEGAL_BUCKET);
      }
      ++num_probes;  // we're doing another probe
      bucknum = (bucknum + num_probes) & bucket_count_minus_one;
      assert(num_probes < bucketCount(), "Hashtable is full: an error in key_equal<> or hash<>");
    }
  }
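
  // Worked example of the probe sequence (a sketch): the step size grows by
  // one each probe, so with bucketCount() == 8 and hash(key) & 7 == 3, the
  // buckets visited are 3, 4, 6, 1, 5, 2, 0, 7 -- the triangular-number
  // offsets 0, 1, 3, 6, 10, ... taken mod 8.  With a power-of-two table this
  // sequence visits every bucket exactly once before repeating.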

public:
  Iterator find(in Key key) {
    if (size() == 0) return end();
    auto pos = findPosition(key);
    if (pos.first == ILLEGAL_BUCKET)  // alas, not there
      return end();
    else
      return Iterator(this, _table[pos.first .. _numBuckets], false);
  }

  // This is a tr1 method: the bucket a given key is in, or what bucket
  // it would be put in, if it were to be inserted.  Shrug.
  size_t bucket(in Key key) const {
    auto pos = findPosition(key);
    return pos.first == ILLEGAL_BUCKET ? pos.second : pos.first;
  }

  // Counts how many elements have key key.  For maps, it's either 0 or 1.
  size_t count(in Key key) const {
    auto pos = findPosition(key);
    return pos.first == ILLEGAL_BUCKET ? 0 : 1;
  }

  // Likewise, equal_range doesn't really make sense for us.  Oh well.
  Pair!(Iterator, Iterator) equalRange(in Key key) {
    Iterator pos = find(key);  // either an iterator or end
    if (pos == end()) {
      return Pair!(Iterator, Iterator)(pos, pos);
    } else {
      Iterator startpos = pos++;
      return Pair!(Iterator, Iterator)(startpos, pos);
    }
  }

  // INSERTION ROUTINES
private:
  // Private method used by insertNoResize and findOrInsert.
  Iterator insertAt(Value obj, size_t pos) {
    if (size() >= maxSize()) {
      throw new Exception("insert overflow");
    }
    if (testDeleted(pos)) {  // just replace if it's been deleted
      auto delpos = Iterator(this, _table[pos .. _numBuckets], false);
      clearDeleted(delpos);
      assert(_numDeleted > 0);
      --_numDeleted;  // used to be, now it isn't
    } else {
      ++_numElements;  // replacing an empty bucket
    }
    _table[pos] = obj;
    return Iterator(this, _table[pos .. _numBuckets], false);
  }

  // If you know *this is big enough to hold obj, use this routine.
  Pair!(Iterator, bool) insertNoResize(Value obj) {
    // First, double-check we're not inserting delKey or emptyValue.
    assert(!_settings.useEmpty || !equalKey(getKey(obj), getKey(_keyValInfo.emptyValue)),
        "Inserting the empty key");
    assert(!_settings.useDeleted || !equalKey(getKey(obj), _keyValInfo.delKey),
        "Inserting the deleted key");
    const Pair!(size_t, size_t) pos = findPosition(getKey(obj));
    if (pos.first != ILLEGAL_BUCKET) {  // object was already there
      return Pair!(Iterator, bool)(
          Iterator(this, _table[pos.first .. _numBuckets], false),
          false);  // false: we didn't insert
    } else {  // pos.second says where to put it
      return Pair!(Iterator, bool)(insertAt(obj, pos.second), true);
    }
  }

public:
  // This is the normal insert routine, used by the outside world.
  Pair!(Iterator, bool) insert(Value obj) {
    resizeDelta(1);  // adding an object, grow if need be
    return insertNoResize(obj);
  }

  // Specializations of insert(it, it) depending on the power of the iterator:
  void insert(Iterator f, Iterator l) {
    for (; f != l; ++f)
      insert(*f);
  }

  // defaultValue is a function that takes a key and returns a Value
  // representing the default value to be inserted if none is found.
  Value findOrInsert(Value function(Key) defaultValue)(in Key key) {
    // First, double-check we're not inserting emptyKey or delKey.
    assert(!_settings.useEmpty || !equalKey(key, getKey(_keyValInfo.emptyValue)),
        "Inserting the empty key");
    assert(!_settings.useDeleted || !equalKey(key, _keyValInfo.delKey),
        "Inserting the deleted key");
    Pair!(size_t, size_t) pos = findPosition(key);
    if (pos.first != ILLEGAL_BUCKET) {  // object was already there
      return _table[pos.first];
    } else if (resizeDelta(1)) {  // needed to rehash to make room
      // Since we resized, we can't use pos, so recalculate where to insert.
      return *(insertNoResize(defaultValue(key)).first);
    } else {  // no need to rehash, insert right here
      return *insertAt(defaultValue(key), pos.second);
    }
  }
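
  // Usage sketch (assumes the int-keyed table from the module example):
  //   int v = ht.findOrInsert!((int k) => k)(5);  // inserts 5 if absent
  // The template argument must be a capture-free function literal, since
  // defaultValue is a function-pointer template parameter.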

  // DELETION ROUTINES
  size_t erase(in Key key) {
    // First, double-check we're not trying to erase delKey or emptyValue.
    assert(!_settings.useEmpty || !equalKey(key, getKey(_keyValInfo.emptyValue)),
        "Erasing the empty key");
    assert(!_settings.useDeleted || !equalKey(key, _keyValInfo.delKey),
        "Erasing the deleted key");
    Iterator pos = find(key);
    if (pos != end()) {
      assert(!testDeleted(pos));  // or find() shouldn't have returned it
      setDeleted(pos);
      ++_numDeleted;
      _settings.considerShrink = true;  // will think about shrink after next insert
      return 1;  // because we deleted one thing
    } else {
      return 0;  // because we deleted nothing
    }
  }

  // Erases the item at the given position by marking its bucket deleted.
  void erase(Iterator pos) {
    if (pos == end()) return;  // sanity check
    if (setDeleted(pos)) {  // true if object has been newly deleted
      ++_numDeleted;
      _settings.considerShrink = true;  // will think about shrink after next insert
    }
  }

  void erase(Iterator f, Iterator l) {
    for ( ; f != l; ++f) {
      if (setDeleted(f))  // should always be true
        ++_numDeleted;
    }
    _settings.considerShrink = true;  // will think about shrink after next insert
  }

  // COMPARISON
  override
  bool opEquals(in Object o) {
    ThisT ht = cast(ThisT) o;
    if (ht is null) {
      return false;  // o is not a hashtable of the same type
    } else if (size() != ht.size()) {
      return false;
    } else if (this is ht) {
      return true;
    } else {
      // Iterate through the elements in "this" and see if the
      // corresponding element is in ht.
      for (Iterator it = begin(); it != end(); ++it) {
        Iterator it2 = ht.find(getKey(*it));
        if (it2 == ht.end() || *it != *it2) {
          return false;
        }
      }
      return true;
    }
  }

  // I/O
  // We support reading and writing hashtables to disk.  Alas, since
  // I don't know how to write a hasher or key_equal, you have to make
  // sure everything but the table is the same.  We compact before writing.
private:
  // Every time the disk format changes, this should probably change too.
  alias MagicNumberType = ulong;
  enum MagicNumberType MAGIC_NUMBER = 0x13578642;

public:
  // I/O -- this is an add-on for writing hash table to disk
  //
  // INPUT and OUTPUT must be either a FILE, *or* a C++ stream
  //    (istream, ostream, etc) *or* a class providing
  //    Read(void*, size_t) and Write(const void*, size_t)
  //    (respectively), which writes a buffer into a stream
  //    (which the INPUT/OUTPUT instance presumably owns).

  // typedef sparsehash_internal::pod_serializer<value_type> NopointerSerializer;

  // TODO: Convert if serialization logic is needed.
  // ValueSerializer: a functor.  operator()(OUTPUT*, const value_type&)
  // template <typename ValueSerializer, typename OUTPUT>
  // bool serialize(ValueSerializer serializer, OUTPUT *fp) {
  //   squash_deleted();   // so we don't have to worry about delkey
  //   if ( !sparsehash_internal::write_bigendian_number(fp, MAGIC_NUMBER, 4) )
  //     return false;
  //   if ( !sparsehash_internal::write_bigendian_number(fp, num_buckets, 8) )
  //     return false;
  //   if ( !sparsehash_internal::write_bigendian_number(fp, num_elements, 8) )
  //     return false;
  //   // Now write a bitmap of non-empty buckets.
  //   for ( size_t i = 0; i < num_buckets; i += 8 ) {
  //     unsigned char bits = 0;
  //     for ( int bit = 0; bit < 8; ++bit ) {
  //       if ( i + bit < num_buckets && !test_empty(i + bit) )
  //         bits |= (1 << bit);
  //     }
  //     if ( !sparsehash_internal::write_data(fp, &bits, sizeof(bits)) )
  //       return false;
  //     for ( int bit = 0; bit < 8; ++bit ) {
  //       if ( bits & (1 << bit) ) {
  //         if ( !serializer(fp, table[i + bit]) ) return false;
  //       }
  //     }
  //   }
  //   return true;
  // }

  // TODO: Convert if serialization logic is needed.
  // INPUT: anything we've written an overload of read_data() for.
  // ValueSerializer: a functor.  operator()(INPUT*, value_type*)
  // template <typename ValueSerializer, typename INPUT>
  // bool unserialize(ValueSerializer serializer, INPUT *fp) {
  //   assert(settings.use_empty() && "empty_key not set for read");
  //
  //   clear();            // just to be consistent
  //   MagicNumberType magic_read;
  //   if ( !sparsehash_internal::read_bigendian_number(fp, &magic_read, 4) )
  //     return false;
  //   if ( magic_read != MAGIC_NUMBER ) {
  //     return false;
  //   }
  //   size_t new_num_buckets;
  //   if ( !sparsehash_internal::read_bigendian_number(fp, &new_num_buckets, 8) )
  //     return false;
  //   clear_to_size(new_num_buckets);
  //   if ( !sparsehash_internal::read_bigendian_number(fp, &num_elements, 8) )
  //     return false;
  //
  //   // Read the bitmap of non-empty buckets.
  //   for (size_t i = 0; i < num_buckets; i += 8) {
  //     unsigned char bits;
  //     if ( !sparsehash_internal::read_data(fp, &bits, sizeof(bits)) )
  //       return false;
  //     for ( int bit = 0; bit < 8; ++bit ) {
  //       if ( i + bit < num_buckets && (bits & (1 << bit)) ) {  // not empty
  //         if ( !serializer(fp, &table[i + bit]) ) return false;
  //       }
  //     }
  //   }
  //   return true;
  // }

private:
  // Package functors with another class to eliminate memory needed for zero-size functors.
  static class Settings {
    size_t enlargeThreshold = 0;  // table.size() * enlarge_factor
    size_t shrinkThreshold = 0;   // table.size() * shrink_factor
    float enlargeFactor = HT_OCCUPANCY_PCT / 100.0f;  // how full before resize
    float shrinkFactor = HT_EMPTY_PCT / 100.0f;       // how empty before resize
    // considerShrink = true if we should try to shrink before next insert
    bool considerShrink = false;
    bool useEmpty = false;    // used only by densehashtable, not sparsehashtable
    bool useDeleted = false;  // false until delkey has been set
    // numHtCopies is a counter incremented every Copy/Move
    uint numHtCopies = 0;

    size_t enlargeSize(size_t x) const {
      return cast(size_t)(x * enlargeFactor);
    }

    size_t shrinkSize(size_t x) const {
      return cast(size_t)(x * shrinkFactor);
    }

    // Reset the enlarge and shrink thresholds.
    void resetThresholds(size_t num_buckets) {
      enlargeThreshold = enlargeSize(num_buckets);
      shrinkThreshold = shrinkSize(num_buckets);
      // whatever caused us to reset already considered
      considerShrink = false;
    }

    // Caller is responsible for calling resetThresholds() right after
    // setResizingParameters().
    void setResizingParameters(float shrink, float grow) {
      assert(shrink >= 0.0);
      assert(grow <= 1.0);
      if (shrink > grow / 2.0f)
        shrink = grow / 2.0f;  // otherwise we thrash hashtable size
      shrinkFactor = shrink;
      enlargeFactor = grow;
    }

    // This is the smallest size a hashtable can be without being too crowded.
    // If you like, you can give a min #buckets as well as a min #elts.
    size_t minBuckets(size_t num_elts, size_t min_buckets_wanted) {
      float enlarge = enlargeFactor;
      size_t sz = HT_MIN_BUCKETS;  // min buckets allowed
      while (sz < min_buckets_wanted || num_elts >= cast(size_t)(sz * enlarge)) {
        // This just prevents overflowing size_t, since sz can exceed
        // max_size() here.
        if (cast(size_t)(sz * 2) < sz) {
          throw new Exception("resize overflow");  // protect against overflow
        }
        sz *= 2;
      }
      return sz;
    }
  }

  // Since classes are passed by reference, large numbers of hashtables can
  // share the same key/value handling functions without duplicating them.
  static class KeyValInfo {
    Value emptyValue;
    Key delKey;
    HashFcn hashFcn;
    ExtractKey extractKey;
    SetKey setKey;
    EqualKey equalKey;
  }

private:
  // Actual data
  Settings _settings;
  KeyValInfo _keyValInfo;

  size_t _numDeleted;  // how many occupied buckets are marked deleted
  size_t _numElements;
  size_t _numBuckets;
  Value[] _table;
}

/**
 * A basic iterator type for finding entries and iterating.
 * We're just an array, but we need to skip over empty and deleted elements.
 *
 * TODO(vnayar): Convert DenseHashTable to be range based after S2Builder is modified to use it.
 */
struct DenseHashTableIterator(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey) {
public:
  alias Iterator = DenseHashTableIterator!(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey);

  // "Real" constructor and default constructor.
  this(
      in DenseHashTable!(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey) h,
      Value[] data, bool advance) {
    _ht = h;
    _data = data;
    if (advance) advancePastEmptyAndDeleted();
  }

  // Happy dereferencer.
  ref inout(Value) opUnary(string op)() inout
  if (op == "*")
  in {
    assert(!_data.empty(), "Iterator is already at end!");
  } do {
    // return _data.front(); -- Bug: See https://issues.dlang.org/show_bug.cgi?id=19518
    return _data[0];
  }

  // Arithmetic.  The only hard part is making sure that
  // we're not on an empty or marked-deleted array element.
  void advancePastEmptyAndDeleted() {
    while (!_data.empty() && (_ht.testEmpty(this) || _ht.testDeleted(this)))
      _data.popFront();
  }

  DenseHashTableIterator opUnary(string op)() if (op == "++")
  in {
    assert(!_data.empty());
  } do {
    _data.popFront();
    advancePastEmptyAndDeleted();
    return this;
  }

  // Comparison: two iterators are equal when they refer to the same position
  // in the same table, i.e. their remaining slices are identical.
  bool opEquals(this ThisT)(ThisT o) const {
    return _data is o._data;
  }

  // The actual data.
  Rebindable!(const DenseHashTable!(Value, Key, HashFcn, ExtractKey, SetKey, EqualKey)) _ht;
  Value[] _data;
}
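
// An end-to-end sketch of erase plus iteration (assumed semantics, mirroring
// the module example above): erased slots keep their bucket but are marked
// with the deleted key, and iterators transparently skip them.
unittest {
  auto ht = denseHashTable!(int, int)(
      0,
      (int key) => cast(size_t) key,
      (int value) => value,
      (ref int value, int key) { value = key; },
      (int a, int b) => a == b);
  ht.setEmptyKey(-1);
  ht.setDeletedKey(-2);
  foreach (v; [10, 20, 30]) ht.insert(v);
  assert(ht.erase(20) == 1);  // one element marked deleted
  assert(ht.size() == 2);

  size_t seen = 0;
  for (auto it = ht.begin(); it != ht.end(); ++it) {
    assert(*it != 20);  // deleted entries are skipped
    ++seen;
  }
  assert(seen == 2);
}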