1 // Copyright 2016 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS-IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 
16 // Original Author: ericv@google.com (Eric Veach)
17 // Converted to D:  madric@gmail.com (Vijay Nayar)
18 
19 module s2.sequence_lexicon;
20 
21 import s2.util.container.dense_hash_set;
22 
23 import std.range;
24 
25 // SequenceLexicon is a class for compactly representing sequences of values
26 // (e.g., tuples).  It automatically eliminates duplicates, and maps the
27 // remaining sequences to sequentially increasing integer ids.  See also
28 // ValueLexicon and IdSetLexicon.
29 //
30 // Each distinct sequence is mapped to a 32-bit integer.  The space used for
31 // each sequence is approximately 11 bytes plus the memory needed to represent
32 // the sequence elements.  For example, a sequence of three "double"s would
33 // need about 11 + 3*8 = 35 bytes.  Note also that sequences are referred to
34 // using 32-bit ids rather than 64-bit pointers.
35 //
36 // This class has the same thread-safety properties as "string": const methods
37 // are thread safe, and non-const methods are not thread safe.
38 //
39 // Example usage:
40 //
41 //   SequenceLexicon<string> lexicon;
42 //   vector<string> pets {"cat", "dog", "parrot"};
43 //   uint32 pets_id = lexicon.Add(pets);
44 //   CHECK_EQ(pets_id, lexicon.Add(pets));
45 //   string values;
46 //   for (const auto& pet : lexicon.sequence(pets_id)) {
47 //     values += pet;
48 //   }
49 //   CHECK_EQ("catdogparrot", values);
50 //
51 class SequenceLexicon(T) {
52 public:
53   this() {
54     _idSet = new IdSet(0, new IdHashFunctor(), new IdKeyEqualFunctor());
55     _idSet.setEmptyKey(EMPTY_KEY);
56     _begins ~= 0;
57   }
58 
59   this(SequenceLexicon x) {
60     _values = x._values.dup;
61     _begins = x._begins.dup;
62     _idSet = new IdSet(
63         x._idSet.begin(), x._idSet.end(), EMPTY_KEY,
64         0, new IdHashFunctor(), new IdKeyEqualFunctor());
65   }
66 
67   // Clears all data from the lexicon.
68   void clear() {
69     _values.length = 0;
70     _begins.length = 0;
71     _idSet.clear();
72     _begins ~= 0;
73   }
74 
75   // Add the given sequence of values to the lexicon if it is not already
76   // present, and return its integer id.  Ids are assigned sequentially
77   // starting from zero.  "begin" and "end" are forward iterators over a
78   // sequence of values of type T.
79   //uint add(FwdIterator)(FwdIterator begin, FwdIterator end) {
80   uint add(Range)(Range r)
81   if (isInputRange!Range && is(typeof(r.front) : T)) {
82     // Add all new T values to _values.
83     foreach (elem; r) {
84       _values ~= elem;
85     }
86     // Add a new "begin" which is the end of the current sequence and start of the next.
87     _begins ~= cast(uint) _values.length;
88     // ids start from zero, and by now, there's the initial begin of 0 and the one just added.
89     // Thus to start with 0, we subtract two from the length.
90     uint id = cast(uint) _begins.length - 2;
91     // Equality is redefined so be if the id maps to the same sequence, they are the same.
92     // This also means that when no new values are added, the id maps to an empty sequence,
93     // and all these are considered to be the same.
94     auto result = _idSet.insert(id);
95     if (result.second) {
96       // If the insert worked, keep it.
97       return id;
98     } else {
99       // If the insert did not work, take back the id and remove the new values.
100       _begins.popBack();
101       _values.length = _begins.back();
102       return *result.first;
103     }
104   }
105 
106   // Return the number of value sequences in the lexicon.
107   size_t size() const {
108     return _begins.length - 1;
109   }
110 
111   // A representation of a sequence of values.
112   alias Sequence = T[];
113 
114   // Return the value sequence with the given id.  This method can be used
115   // with range-based for loops as follows:
116   //   for (const auto& value : lexicon.sequence(id)) { ... }
117   const(Sequence) sequence(uint id) const {
118     return _values[_begins[id] .. _begins[id + 1]];
119   }
120 
121 private:
122   // Choose kEmptyKey to be the last key that will ever be generated.
123   enum uint EMPTY_KEY = uint.max;
124 
125   class IdHashFunctor {
126     size_t opCall(in uint id) const {
127       import s2.util.hash.mix;
128 
129       HashMix mix;
130       foreach (value; this.outer.sequence(id)) {
131         mix.mix(typeid(uint).getHash(&value));
132       }
133       return mix.get();
134     }
135   }
136 
137   class IdKeyEqualFunctor {
138     bool opCall(in uint id1, in uint id2) const {
139       import std.algorithm : equal;
140 
141       if (id1 == id2) return true;
142       if (id1 == EMPTY_KEY || id2 == EMPTY_KEY) {
143         return false;
144       }
145       auto seq1 = this.outer.sequence(id1);
146       auto seq2 = this.outer.sequence(id2);
147       return seq1.length == seq2.length && equal(seq1, seq2);
148     }
149   }
150 
151   alias IdSet = DenseHashSet!(uint, IdHashFunctor, IdKeyEqualFunctor);
152 
153   T[] _values;
154   uint[] _begins;
155   IdSet _idSet;
156 }