lexicon.go

Documentation: github.com/golang/geo/s2

     1  // Copyright 2020 Google Inc. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package s2
    16  
    17  import (
    18  	"encoding/binary"
    19  	"hash/adler32"
    20  	"math"
    21  	"sort"
    22  )
    23  
    24  // TODO(roberts): If any of these are worth making public, change the
    25  // method signatures and type names.
    26  
    27  // emptySetID represents the last ID that will ever be generated.
    28  // (Non-negative IDs are reserved for singleton sets.)
    29  var emptySetID = int32(math.MinInt32)
    30  
    31  // idSetLexicon compactly represents a set of non-negative
    32  // integers such as array indices ("ID sets"). It is especially suitable when
    33  // either (1) there are many duplicate sets, or (2) there are many singleton
    34  // or empty sets. See also sequenceLexicon.
    35  //
    36  // Each distinct ID set is mapped to a 32-bit integer. Empty and singleton
    37  // sets take up no additional space; the set itself is represented
    38  // by the unique ID assigned to the set. Duplicate sets are automatically
    39  // eliminated. Note also that ID sets are referred to using 32-bit integers
    40  // rather than pointers.
    41  type idSetLexicon struct {
    42  	idSets *sequenceLexicon
    43  }
    44  
    45  func newIDSetLexicon() *idSetLexicon {
    46  	return &idSetLexicon{
    47  		idSets: newSequenceLexicon(),
    48  	}
    49  }
    50  
    51  // add adds the given set of integers to the lexicon if it is not already
    52  // present, and return the unique ID for this set. The values are automatically
    53  // sorted and duplicates are removed.
    54  //
    55  // The primary difference between this and sequenceLexicon are:
    56  // 1. Empty and singleton sets are represented implicitly; they use no space.
    57  // 2. Sets are represented rather than sequences; the ordering of values is
    58  //
    59  //	not important and duplicates are removed.
    60  //
    61  // 3. The values must be 32-bit non-negative integers only.
    62  func (l *idSetLexicon) add(ids ...int32) int32 {
    63  	// Empty sets have a special ID chosen not to conflict with other IDs.
    64  	if len(ids) == 0 {
    65  		return emptySetID
    66  	}
    67  
    68  	// Singleton sets are represented by their element.
    69  	if len(ids) == 1 {
    70  		return ids[0]
    71  	}
    72  
    73  	// Canonicalize the set by sorting and removing duplicates.
    74  	//
    75  	// Creates a new slice in order to not alter the supplied values.
    76  	set := uniqueInt32s(ids)
    77  
    78  	// Non-singleton sets are represented by the bitwise complement of the ID
    79  	// returned by the sequenceLexicon
    80  	return ^l.idSets.add(set)
    81  }
    82  
    83  // idSet returns the set of integers corresponding to an ID returned by add.
    84  func (l *idSetLexicon) idSet(setID int32) []int32 {
    85  	if setID >= 0 {
    86  		return []int32{setID}
    87  	}
    88  	if setID == emptySetID {
    89  		return []int32{}
    90  	}
    91  
    92  	return l.idSets.sequence(^setID)
    93  }
    94  
    95  func (l *idSetLexicon) clear() {
    96  	l.idSets.clear()
    97  }
    98  
    99  // sequenceLexicon compactly represents a sequence of values (e.g., tuples).
   100  // It automatically eliminates duplicates slices, and maps the remaining
   101  // sequences to sequentially increasing integer IDs. See also idSetLexicon.
   102  //
   103  // Each distinct sequence is mapped to a 32-bit integer.
   104  type sequenceLexicon struct {
   105  	values []int32
   106  	begins []uint32
   107  
   108  	// idSet is a mapping of a sequence hash to sequence index in the lexicon.
   109  	idSet map[uint32]int32
   110  }
   111  
   112  func newSequenceLexicon() *sequenceLexicon {
   113  	return &sequenceLexicon{
   114  		begins: []uint32{0},
   115  		idSet:  make(map[uint32]int32),
   116  	}
   117  }
   118  
   119  // clears all data from the lexicon.
   120  func (l *sequenceLexicon) clear() {
   121  	l.values = nil
   122  	l.begins = []uint32{0}
   123  	l.idSet = make(map[uint32]int32)
   124  }
   125  
   126  // add adds the given value to the lexicon if it is not already present, and
   127  // returns its ID. IDs are assigned sequentially starting from zero.
   128  func (l *sequenceLexicon) add(ids []int32) int32 {
   129  	if id, ok := l.idSet[hashSet(ids)]; ok {
   130  		return id
   131  	}
   132  	for _, v := range ids {
   133  		l.values = append(l.values, v)
   134  	}
   135  	l.begins = append(l.begins, uint32(len(l.values)))
   136  
   137  	id := int32(len(l.begins)) - 2
   138  	l.idSet[hashSet(ids)] = id
   139  
   140  	return id
   141  }
   142  
   143  // sequence returns the original sequence of values for the given ID.
   144  func (l *sequenceLexicon) sequence(id int32) []int32 {
   145  	return l.values[l.begins[id]:l.begins[id+1]]
   146  }
   147  
   148  // size reports the number of value sequences in the lexicon.
   149  func (l *sequenceLexicon) size() int {
   150  	// Subtract one because the list of begins starts out with the first element set to 0.
   151  	return len(l.begins) - 1
   152  }
   153  
   154  // hash returns a hash of this sequence of int32s.
   155  func hashSet(s []int32) uint32 {
   156  	// TODO(roberts): We just need a way to nicely hash all the values down to
   157  	// a 32-bit value. To ensure no unnecessary dependencies we use the core
   158  	// library types available to do this. Is there a better option?
   159  	a := adler32.New()
   160  	binary.Write(a, binary.LittleEndian, s)
   161  	return a.Sum32()
   162  }
   163  
   164  // uniqueInt32s returns the sorted and uniqued set of int32s from the input.
   165  func uniqueInt32s(in []int32) []int32 {
   166  	var vals []int32
   167  	m := make(map[int32]bool)
   168  	for _, i := range in {
   169  		if m[i] {
   170  			continue
   171  		}
   172  		m[i] = true
   173  		vals = append(vals, i)
   174  	}
   175  	sort.Slice(vals, func(i, j int) bool { return vals[i] < vals[j] })
   176  	return vals
   177  }
   178
View as plain text