1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS-IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 16 // Original Author: ericv@google.com (Eric Veach) 17 // Converted to D: madric@gmail.com (Vijay Nayar) 18 19 module s2.id_set_lexicon; 20 21 import s2.sequence_lexicon; 22 23 import std.range; 24 import std.exception; 25 26 /** 27 * IdSetLexicon is a class for compactly representing sets of non-negative 28 * integers such as array indices ("id sets"). It is especially suitable when 29 * either (1) there are many duplicate sets, or (2) there are many singleton 30 * or empty sets. See also ValueLexicon and SequenceLexicon. 31 * 32 * Each distinct id set is mapped to a 32-bit integer. Empty and singleton 33 * sets take up no additional space whatsoever; the set itself is represented 34 * by the unique id assigned to the set. Sets of size 2 or more occupy about 35 * 11 bytes per set plus 4 bytes per element (as compared to 24 bytes per set 36 * plus 4 bytes per element for std::vector). Duplicate sets are 37 * automatically eliminated. Note also that id sets are referred to using 38 * 32-bit integers rather than 64-bit pointers. 39 * 40 * This class is especially useful in conjunction with ValueLexicon<T>. For 41 * example, suppose that you want to label objects with a set of strings. You 42 * could use a ValueLexicon<string> to map the strings to "label ids" (32-bit 43 * integers), and then use IdSetLexicon to map each set of labels to a "label 44 * set id". Each reference to that label set then takes up only 4 bytes. 45 * 46 * Example usage: 47 * 48 * ValueLexicon<string> labels_; 49 * IdSetLexicon label_sets_; 50 * 51 * int32 GetLabelSet(const vector<string>& label_strings) { 52 * vector<int32> label_ids; 53 * for (const auto& str : label_strings) { 54 * label_ids.push_back(labels_.Add(str)); 55 * } 56 * return label_sets_.Add(label_ids); 57 * } 58 * 59 * int label_set_id = GetLabelSet(...); 60 * for (auto id : label_sets_.id_set(label_set_id)) { 61 * LOG(INFO) << id; 62 * } 63 * 64 * This class is similar to SequenceLexicon, except: 65 * 66 * 1. Empty and singleton sets are represented implicitly; they use no space. 67 * 2. Sets are represented rather than sequences; the ordering of values is 68 * not important and duplicates are removed. 69 * 3. The values must be 32-bit non-negative integers (only). 70 */ 71 class IdSetLexicon { 72 public: 73 this() { 74 _idSets = new SequenceLexicon!int(); 75 } 76 77 // IdSetLexicon is movable and copyable. 78 this(IdSetLexicon x) { 79 _idSets = new SequenceLexicon!int(x._idSets); 80 } 81 82 // Clears all data from the lexicon. 83 void clear() { 84 _idSets.clear(); 85 } 86 87 // Add the given set of integers to the lexicon if it is not already 88 // present, and return the unique id for this set. "begin" and "end" are 89 // forward iterators over a sequence of values that can be converted to 90 // non-negative 32-bit integers. The values are automatically sorted and 91 // duplicates are removed. Returns a signed integer representing this set. 92 // 93 // REQUIRES: All values in [begin, end) are non-negative 32-bit integers. 94 int add(ForwardRange)(ForwardRange fr) 95 if (isForwardRange!ForwardRange && is(typeof(fr.front) : int)) { 96 _tmp.length = 0; 97 foreach (v; fr) { 98 enforce(v >= 0); 99 enforce(v <= int.max); 100 _tmp ~= v; 101 } 102 return addInternal(_tmp); 103 } 104 105 // Convenience method that returns the unique id for a singleton set. 106 // Note that because singleton sets take up no space, this method is 107 // const. Equivalent to calling Add(&id, &id + 1). 108 int addSingleton(int id) const 109 in { 110 assert(id >= 0); 111 assert(id <= int.max); 112 } do { 113 // Singleton sets are represented by their element. 114 return id; 115 } 116 117 // Convenience method that returns the unique id for the empty set. Note 118 // that because the empty set takes up no space and has a fixed id, this 119 // method is static. Equivalent to calling Add() with an empty container. 120 static int emptySetId() { 121 return EMPTY_SET_ID; 122 } 123 124 // Iterator type; please treat this as an opaque forward iterator. 125 alias Iterator = const(int)*; 126 127 // Represents a set of integers stored in the IdSetLexicon. 128 alias IdSet = int[]; 129 130 // Return the set of integers corresponding to an id returned by Add(). 131 const(IdSet) idSet(int set_id) const { 132 if (set_id >= 0) { 133 return [set_id]; 134 } else if (set_id == EMPTY_SET_ID) { 135 return []; 136 } else { 137 auto seq = _idSets.sequence(~set_id); 138 enforce(seq.length != 0); 139 return seq; 140 } 141 } 142 143 144 private: 145 // Choose kEmptySetId to be the last id that will ever be generated. 146 // (Non-negative ids are reserved for singleton sets.) 147 enum int EMPTY_SET_ID = int.min; 148 149 int addInternal(int[] ids) { 150 import std.algorithm : sort, uniq; 151 152 if (ids.empty()) { 153 // Empty sets have a special id chosen not to conflict with other ids. 154 return EMPTY_SET_ID; 155 } else if (ids.length == 1) { 156 // Singleton sets are represented by their element. 157 return ids[0]; 158 } else { 159 // Canonicalize the set by sorting and removing duplicates. 160 ids = ids.sort.uniq.array; 161 // Non-singleton sets are represented by the bitwise complement of the id 162 // returned by SequenceLexicon. 163 return ~_idSets.add(ids); 164 } 165 } 166 167 SequenceLexicon!int _idSets; 168 int[] _tmp; // temporary storage used during Add() 169 }