1 // Copyright 2016 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS-IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 
16 // Original Author: ericv@google.com (Eric Veach)
17 // Converted to D:  madric@gmail.com (Vijay Nayar)
18 
19 module s2.id_set_lexicon;
20 
21 import s2.sequence_lexicon;
22 
23 import std.range;
24 import std.exception;
25 
26 /**
27  * IdSetLexicon is a class for compactly representing sets of non-negative
28  * integers such as array indices ("id sets").  It is especially suitable when
29  * either (1) there are many duplicate sets, or (2) there are many singleton
30  * or empty sets.  See also ValueLexicon and SequenceLexicon.
31  *
32  * Each distinct id set is mapped to a 32-bit integer.  Empty and singleton
33  * sets take up no additional space whatsoever; the set itself is represented
34  * by the unique id assigned to the set. Sets of size 2 or more occupy about
35  * 11 bytes per set plus 4 bytes per element (as compared to 24 bytes per set
36  * plus 4 bytes per element for std::vector).  Duplicate sets are
37  * automatically eliminated.  Note also that id sets are referred to using
38  * 32-bit integers rather than 64-bit pointers.
39  *
40  * This class is especially useful in conjunction with ValueLexicon<T>.  For
41  * example, suppose that you want to label objects with a set of strings.  You
42  * could use a ValueLexicon<string> to map the strings to "label ids" (32-bit
43  * integers), and then use IdSetLexicon to map each set of labels to a "label
44  * set id".  Each reference to that label set then takes up only 4 bytes.
45  *
46  * Example usage:
47  *
48  *   ValueLexicon<string> labels_;
49  *   IdSetLexicon label_sets_;
50  *
51  *   int32 GetLabelSet(const vector<string>& label_strings) {
52  *     vector<int32> label_ids;
53  *     for (const auto& str : label_strings) {
54  *       label_ids.push_back(labels_.Add(str));
55  *     }
56  *     return label_sets_.Add(label_ids);
57  *   }
58  *
59  *   int label_set_id = GetLabelSet(...);
60  *   for (auto id : label_sets_.id_set(label_set_id)) {
61  *     LOG(INFO) << id;
62  *   }
63  *
64  * This class is similar to SequenceLexicon, except:
65  *
66  * 1. Empty and singleton sets are represented implicitly; they use no space.
67  * 2. Sets are represented rather than sequences; the ordering of values is
68  *    not important and duplicates are removed.
69  * 3. The values must be 32-bit non-negative integers (only).
70  */
71 class IdSetLexicon {
72 public:
73   this() {
74     _idSets = new SequenceLexicon!int();
75   }
76 
77   // IdSetLexicon is movable and copyable.
78   this(IdSetLexicon x) {
79     _idSets = new SequenceLexicon!int(x._idSets);
80   }
81 
82   // Clears all data from the lexicon.
83   void clear() {
84     _idSets.clear();
85   }
86 
87   // Add the given set of integers to the lexicon if it is not already
88   // present, and return the unique id for this set.  "begin" and "end" are
89   // forward iterators over a sequence of values that can be converted to
90   // non-negative 32-bit integers.  The values are automatically sorted and
91   // duplicates are removed.  Returns a signed integer representing this set.
92   //
93   // REQUIRES: All values in [begin, end) are non-negative 32-bit integers.
94   int add(ForwardRange)(ForwardRange fr)
95   if (isForwardRange!ForwardRange && is(typeof(fr.front) : int)) {
96     _tmp.length = 0;
97     foreach (v; fr) {
98       enforce(v >= 0);
99       enforce(v <=  int.max);
100       _tmp ~= v;
101     }
102     return addInternal(_tmp);
103   }
104 
105   // Convenience method that returns the unique id for a singleton set.
106   // Note that because singleton sets take up no space, this method is
107   // const.  Equivalent to calling Add(&id, &id + 1).
108   int addSingleton(int id) const
109   in {
110     assert(id >= 0);
111     assert(id <= int.max);
112   } do {
113     // Singleton sets are represented by their element.
114     return id;
115   }
116 
117   // Convenience method that returns the unique id for the empty set.  Note
118   // that because the empty set takes up no space and has a fixed id, this
119   // method is static.  Equivalent to calling Add() with an empty container.
120   static int emptySetId() {
121     return EMPTY_SET_ID;
122   }
123 
124   // Iterator type; please treat this as an opaque forward iterator.
125   alias Iterator = const(int)*;
126 
127   // Represents a set of integers stored in the IdSetLexicon.
128   alias IdSet = int[];
129 
130   // Return the set of integers corresponding to an id returned by Add().
131   const(IdSet) idSet(int set_id) const {
132     if (set_id >= 0) {
133       return [set_id];
134     } else if (set_id == EMPTY_SET_ID) {
135       return [];
136     } else {
137       auto seq = _idSets.sequence(~set_id);
138       enforce(seq.length != 0);
139       return seq;
140     }
141   }
142 
143 
144 private:
145   // Choose kEmptySetId to be the last id that will ever be generated.
146   // (Non-negative ids are reserved for singleton sets.)
147   enum int EMPTY_SET_ID = int.min;
148 
149   int addInternal(int[] ids) {
150     import std.algorithm : sort, uniq;
151 
152     if (ids.empty()) {
153       // Empty sets have a special id chosen not to conflict with other ids.
154       return EMPTY_SET_ID;
155     } else if (ids.length == 1) {
156       // Singleton sets are represented by their element.
157       return ids[0];
158     } else {
159       // Canonicalize the set by sorting and removing duplicates.
160       ids = ids.sort.uniq.array;
161       // Non-singleton sets are represented by the bitwise complement of the id
162       // returned by SequenceLexicon.
163       return ~_idSets.add(ids);
164     }
165   }
166 
167   SequenceLexicon!int _idSets;
168   int[] _tmp;  // temporary storage used during Add()
169 }