1 // Copyright 2020 Google Inc. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package s2 16 17 import ( 18 "encoding/binary" 19 "hash/adler32" 20 "math" 21 "sort" 22 ) 23 24 // TODO(roberts): If any of these are worth making public, change the 25 // method signatures and type names. 26 27 // emptySetID represents the last ID that will ever be generated. 28 // (Non-negative IDs are reserved for singleton sets.) 29 var emptySetID = int32(math.MinInt32) 30 31 // idSetLexicon compactly represents a set of non-negative 32 // integers such as array indices ("ID sets"). It is especially suitable when 33 // either (1) there are many duplicate sets, or (2) there are many singleton 34 // or empty sets. See also sequenceLexicon. 35 // 36 // Each distinct ID set is mapped to a 32-bit integer. Empty and singleton 37 // sets take up no additional space; the set itself is represented 38 // by the unique ID assigned to the set. Duplicate sets are automatically 39 // eliminated. Note also that ID sets are referred to using 32-bit integers 40 // rather than pointers. 41 type idSetLexicon struct { 42 idSets *sequenceLexicon 43 } 44 45 func newIDSetLexicon() *idSetLexicon { 46 return &idSetLexicon{ 47 idSets: newSequenceLexicon(), 48 } 49 } 50 51 // add adds the given set of integers to the lexicon if it is not already 52 // present, and return the unique ID for this set. The values are automatically 53 // sorted and duplicates are removed. 54 // 55 // The primary difference between this and sequenceLexicon are: 56 // 1. Empty and singleton sets are represented implicitly; they use no space. 57 // 2. Sets are represented rather than sequences; the ordering of values is 58 // 59 // not important and duplicates are removed. 60 // 61 // 3. The values must be 32-bit non-negative integers only. 62 func (l *idSetLexicon) add(ids ...int32) int32 { 63 // Empty sets have a special ID chosen not to conflict with other IDs. 64 if len(ids) == 0 { 65 return emptySetID 66 } 67 68 // Singleton sets are represented by their element. 69 if len(ids) == 1 { 70 return ids[0] 71 } 72 73 // Canonicalize the set by sorting and removing duplicates. 74 // 75 // Creates a new slice in order to not alter the supplied values. 76 set := uniqueInt32s(ids) 77 78 // Non-singleton sets are represented by the bitwise complement of the ID 79 // returned by the sequenceLexicon 80 return ^l.idSets.add(set) 81 } 82 83 // idSet returns the set of integers corresponding to an ID returned by add. 84 func (l *idSetLexicon) idSet(setID int32) []int32 { 85 if setID >= 0 { 86 return []int32{setID} 87 } 88 if setID == emptySetID { 89 return []int32{} 90 } 91 92 return l.idSets.sequence(^setID) 93 } 94 95 func (l *idSetLexicon) clear() { 96 l.idSets.clear() 97 } 98 99 // sequenceLexicon compactly represents a sequence of values (e.g., tuples). 100 // It automatically eliminates duplicates slices, and maps the remaining 101 // sequences to sequentially increasing integer IDs. See also idSetLexicon. 102 // 103 // Each distinct sequence is mapped to a 32-bit integer. 104 type sequenceLexicon struct { 105 values []int32 106 begins []uint32 107 108 // idSet is a mapping of a sequence hash to sequence index in the lexicon. 109 idSet map[uint32]int32 110 } 111 112 func newSequenceLexicon() *sequenceLexicon { 113 return &sequenceLexicon{ 114 begins: []uint32{0}, 115 idSet: make(map[uint32]int32), 116 } 117 } 118 119 // clears all data from the lexicon. 120 func (l *sequenceLexicon) clear() { 121 l.values = nil 122 l.begins = []uint32{0} 123 l.idSet = make(map[uint32]int32) 124 } 125 126 // add adds the given value to the lexicon if it is not already present, and 127 // returns its ID. IDs are assigned sequentially starting from zero. 128 func (l *sequenceLexicon) add(ids []int32) int32 { 129 if id, ok := l.idSet[hashSet(ids)]; ok { 130 return id 131 } 132 for _, v := range ids { 133 l.values = append(l.values, v) 134 } 135 l.begins = append(l.begins, uint32(len(l.values))) 136 137 id := int32(len(l.begins)) - 2 138 l.idSet[hashSet(ids)] = id 139 140 return id 141 } 142 143 // sequence returns the original sequence of values for the given ID. 144 func (l *sequenceLexicon) sequence(id int32) []int32 { 145 return l.values[l.begins[id]:l.begins[id+1]] 146 } 147 148 // size reports the number of value sequences in the lexicon. 149 func (l *sequenceLexicon) size() int { 150 // Subtract one because the list of begins starts out with the first element set to 0. 151 return len(l.begins) - 1 152 } 153 154 // hash returns a hash of this sequence of int32s. 155 func hashSet(s []int32) uint32 { 156 // TODO(roberts): We just need a way to nicely hash all the values down to 157 // a 32-bit value. To ensure no unnecessary dependencies we use the core 158 // library types available to do this. Is there a better option? 159 a := adler32.New() 160 binary.Write(a, binary.LittleEndian, s) 161 return a.Sum32() 162 } 163 164 // uniqueInt32s returns the sorted and uniqued set of int32s from the input. 165 func uniqueInt32s(in []int32) []int32 { 166 var vals []int32 167 m := make(map[int32]bool) 168 for _, i := range in { 169 if m[i] { 170 continue 171 } 172 m[i] = true 173 vals = append(vals, i) 174 } 175 sort.Slice(vals, func(i, j int) bool { return vals[i] < vals[j] }) 176 return vals 177 } 178