1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS-IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 16 // Original Author: ericv@google.com (Eric Veach) 17 // Converted to D: madric@gmail.com (Vijay Nayar) 18 19 module s2.sequence_lexicon; 20 21 import s2.util.container.dense_hash_set; 22 23 import std.range; 24 25 // SequenceLexicon is a class for compactly representing sequences of values 26 // (e.g., tuples). It automatically eliminates duplicates, and maps the 27 // remaining sequences to sequentially increasing integer ids. See also 28 // ValueLexicon and IdSetLexicon. 29 // 30 // Each distinct sequence is mapped to a 32-bit integer. The space used for 31 // each sequence is approximately 11 bytes plus the memory needed to represent 32 // the sequence elements. For example, a sequence of three "double"s would 33 // need about 11 + 3*8 = 35 bytes. Note also that sequences are referred to 34 // using 32-bit ids rather than 64-bit pointers. 35 // 36 // This class has the same thread-safety properties as "string": const methods 37 // are thread safe, and non-const methods are not thread safe. 38 // 39 // Example usage: 40 // 41 // SequenceLexicon<string> lexicon; 42 // vector<string> pets {"cat", "dog", "parrot"}; 43 // uint32 pets_id = lexicon.Add(pets); 44 // CHECK_EQ(pets_id, lexicon.Add(pets)); 45 // string values; 46 // for (const auto& pet : lexicon.sequence(pets_id)) { 47 // values += pet; 48 // } 49 // CHECK_EQ("catdogparrot", values); 50 // 51 class SequenceLexicon(T) { 52 public: 53 this() { 54 _idSet = new IdSet(0, new IdHashFunctor(), new IdKeyEqualFunctor()); 55 _idSet.setEmptyKey(EMPTY_KEY); 56 _begins ~= 0; 57 } 58 59 this(SequenceLexicon x) { 60 _values = x._values.dup; 61 _begins = x._begins.dup; 62 _idSet = new IdSet( 63 x._idSet.begin(), x._idSet.end(), EMPTY_KEY, 64 0, new IdHashFunctor(), new IdKeyEqualFunctor()); 65 } 66 67 // Clears all data from the lexicon. 68 void clear() { 69 _values.length = 0; 70 _begins.length = 0; 71 _idSet.clear(); 72 _begins ~= 0; 73 } 74 75 // Add the given sequence of values to the lexicon if it is not already 76 // present, and return its integer id. Ids are assigned sequentially 77 // starting from zero. "begin" and "end" are forward iterators over a 78 // sequence of values of type T. 79 //uint add(FwdIterator)(FwdIterator begin, FwdIterator end) { 80 uint add(Range)(Range r) 81 if (isInputRange!Range && is(typeof(r.front) : T)) { 82 // Add all new T values to _values. 83 foreach (elem; r) { 84 _values ~= elem; 85 } 86 // Add a new "begin" which is the end of the current sequence and start of the next. 87 _begins ~= cast(uint) _values.length; 88 // ids start from zero, and by now, there's the initial begin of 0 and the one just added. 89 // Thus to start with 0, we subtract two from the length. 90 uint id = cast(uint) _begins.length - 2; 91 // Equality is redefined so be if the id maps to the same sequence, they are the same. 92 // This also means that when no new values are added, the id maps to an empty sequence, 93 // and all these are considered to be the same. 94 auto result = _idSet.insert(id); 95 if (result.second) { 96 // If the insert worked, keep it. 97 return id; 98 } else { 99 // If the insert did not work, take back the id and remove the new values. 100 _begins.popBack(); 101 _values.length = _begins.back(); 102 return *result.first; 103 } 104 } 105 106 // Return the number of value sequences in the lexicon. 107 size_t size() const { 108 return _begins.length - 1; 109 } 110 111 // A representation of a sequence of values. 112 alias Sequence = T[]; 113 114 // Return the value sequence with the given id. This method can be used 115 // with range-based for loops as follows: 116 // for (const auto& value : lexicon.sequence(id)) { ... } 117 const(Sequence) sequence(uint id) const { 118 return _values[_begins[id] .. _begins[id + 1]]; 119 } 120 121 private: 122 // Choose kEmptyKey to be the last key that will ever be generated. 123 enum uint EMPTY_KEY = uint.max; 124 125 class IdHashFunctor { 126 size_t opCall(in uint id) const { 127 import s2.util.hash.mix; 128 129 HashMix mix; 130 foreach (value; this.outer.sequence(id)) { 131 mix.mix(typeid(uint).getHash(&value)); 132 } 133 return mix.get(); 134 } 135 } 136 137 class IdKeyEqualFunctor { 138 bool opCall(in uint id1, in uint id2) const { 139 import std.algorithm : equal; 140 141 if (id1 == id2) return true; 142 if (id1 == EMPTY_KEY || id2 == EMPTY_KEY) { 143 return false; 144 } 145 auto seq1 = this.outer.sequence(id1); 146 auto seq2 = this.outer.sequence(id2); 147 return seq1.length == seq2.length && equal(seq1, seq2); 148 } 149 } 150 151 alias IdSet = DenseHashSet!(uint, IdHashFunctor, IdKeyEqualFunctor); 152 153 T[] _values; 154 uint[] _begins; 155 IdSet _idSet; 156 }