1 // Copyright 2021 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package fuzzy 6 7 import ( 8 "bytes" 9 "fmt" 10 "log" 11 "unicode" 12 ) 13 14 // SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols 15 // of the form: 16 // 17 // example.com/path/to/package.object.field 18 // 19 // Knowing that we are matching symbols like this allows us to make the 20 // following optimizations: 21 // - We can incorporate right-to-left relevance directly into the score 22 // calculation. 23 // - We can match from right to left, discarding leading bytes if the input is 24 // too long. 25 // - We just take the right-most match without losing too much precision. This 26 // allows us to use an O(n) algorithm. 27 // - We can operate directly on chunked strings; in many cases we will 28 // be storing the package path and/or package name separately from the 29 // symbol or identifiers, so doing this avoids allocating strings. 30 // - We can return the index of the right-most match, allowing us to trim 31 // irrelevant qualification. 32 type SymbolMatcher struct { 33 // Using buffers of length 256 is both a reasonable size for most qualified 34 // symbols, and makes it easy to avoid bounds checks by using uint8 indexes. 35 pattern [256]rune 36 patternLen uint8 37 inputBuffer [256]rune // avoid allocating when considering chunks 38 roles [256]uint32 // which roles does a rune play (word start, etc.) 39 segments [256]uint8 // how many segments from the right is each rune 40 } 41 42 // Rune roles. 43 const ( 44 segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.') 45 wordStart // input rune starts a word, per camel-case naming rules 46 separator // input rune is a separator ('/' or '.') 47 upper // input rune is an upper case letter 48 ) 49 50 // NewSymbolMatcher creates a SymbolMatcher that may be used to match the given 51 // search pattern. 52 // 53 // Currently this matcher only accepts case-insensitive fuzzy patterns. 54 // 55 // An empty pattern matches no input. 56 func NewSymbolMatcher(pattern string) *SymbolMatcher { 57 m := &SymbolMatcher{} 58 for _, p := range pattern { 59 m.pattern[m.patternLen] = unicode.ToLower(p) 60 m.patternLen++ 61 if m.patternLen == 255 || int(m.patternLen) == len(pattern) { 62 // break at 255 so that we can represent patternLen with a uint8. 63 break 64 } 65 } 66 return m 67 } 68 69 // Match searches for the right-most match of the search pattern within the 70 // symbol represented by concatenating the given chunks. 71 // 72 // If a match is found, the first result holds the absolute byte offset within 73 // all chunks for the start of the symbol. In other words, the index of the 74 // match within strings.Join(chunks, ""). 75 // 76 // The second return value will be the score of the match, which is always 77 // between 0 and 1, inclusive. A score of 0 indicates no match. 78 // 79 // If no match is found, Match returns (-1, 0). 80 func (m *SymbolMatcher) Match(chunks []string) (int, float64) { 81 // Explicit behavior for an empty pattern. 82 // 83 // As a minor optimization, this also avoids nilness checks later on, since 84 // the compiler can prove that m != nil. 85 if m.patternLen == 0 { 86 return -1, 0 87 } 88 89 // Matching implements a heavily optimized linear scoring algorithm on the 90 // input. This is not guaranteed to produce the highest score, but works well 91 // enough, particularly due to the right-to-left significance of qualified 92 // symbols. 93 // 94 // Matching proceeds in three passes through the input: 95 // - The first pass populates the input buffer and collects rune roles. 96 // - The second pass proceeds right-to-left to find the right-most match. 97 // - The third pass proceeds left-to-right from the start of the right-most 98 // match, to find the most *compact* match, and computes the score of this 99 // match. 100 // 101 // See below for more details of each pass, as well as the scoring algorithm. 102 103 // First pass: populate the input buffer out of the provided chunks 104 // (lower-casing in the process), and collect rune roles. 105 // 106 // We could also check for a forward match here, but since we'd have to write 107 // the entire input anyway this has negligible impact on performance. 108 var ( 109 inputLen = uint8(0) 110 modifiers = wordStart | segmentStart 111 ) 112 113 input: 114 for _, chunk := range chunks { 115 for _, r := range chunk { 116 if r == '.' || r == '/' { 117 modifiers |= separator 118 } 119 // optimization: avoid calls to unicode.ToLower, which can't be inlined. 120 l := r 121 if r <= unicode.MaxASCII { 122 if 'A' <= r && r <= 'Z' { 123 l = r + 'a' - 'A' 124 } 125 } else { 126 l = unicode.ToLower(r) 127 } 128 if l != r { 129 modifiers |= upper 130 131 // If the current rune is capitalized *and the preceding rune was not*, 132 // mark this as a word start. This avoids spuriously high ranking of 133 // non-camelcase naming schemas, such as the 134 // yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of 135 // golang/go#60201. 136 if inputLen == 0 || m.roles[inputLen-1]&upper == 0 { 137 modifiers |= wordStart 138 } 139 } 140 m.inputBuffer[inputLen] = l 141 m.roles[inputLen] = modifiers 142 inputLen++ 143 if m.roles[inputLen-1]&separator != 0 { 144 modifiers = wordStart | segmentStart 145 } else { 146 modifiers = 0 147 } 148 // TODO: we should prefer the right-most input if it overflows, rather 149 // than the left-most as we're doing here. 150 if inputLen == 255 { 151 break input 152 } 153 } 154 } 155 156 // Second pass: find the right-most match, and count segments from the 157 // right. 158 var ( 159 pi = uint8(m.patternLen - 1) // pattern index 160 p = m.pattern[pi] // pattern rune 161 start = -1 // start offset of match 162 rseg = uint8(0) // effective "depth" from the right of the current rune in consideration 163 ) 164 const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes. 165 166 for ii := inputLen - 1; ; ii-- { 167 r := m.inputBuffer[ii] 168 if rseg < maxSeg && m.roles[ii]&separator != 0 { 169 rseg++ 170 } 171 m.segments[ii] = rseg 172 if p == r { 173 if pi == 0 { 174 // TODO(rfindley): BUG: the docstring for Match says that it returns an 175 // absolute byte offset, but clearly it is returning a rune offset here. 176 start = int(ii) 177 break 178 } 179 pi-- 180 p = m.pattern[pi] 181 } 182 // Don't check ii >= 0 in the loop condition: ii is a uint8. 183 if ii == 0 { 184 break 185 } 186 } 187 188 if start < 0 { 189 // no match: skip scoring 190 return -1, 0 191 } 192 193 // Third pass: find the shortest match and compute the score. 194 195 // Score is the average score for each rune. 196 // 197 // A rune score is the multiple of: 198 // 1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the 199 // rune starts a mid-segment word, else 0.6. 200 // 201 // Runes preceded by a matching rune are treated the same as the start 202 // of a mid-segment word (with a 0.9 score), so that sequential or exact 203 // matches are preferred. We call this a sequential bonus. 204 // 205 // For the final rune match, this sequential bonus is reduced to 0.8 if 206 // the next rune in the input is a mid-segment word, or 0.7 if the next 207 // rune in the input is not a word or segment start. This ensures that 208 // we favor whole-word or whole-segment matches over prefix matches. 209 // 210 // 2. 1.0 if the rune is part of the last segment, otherwise 211 // 1.0-0.1*<segments from the right>, with a max segment count of 3. 212 // Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very 213 // early in a qualified symbol name) still scores higher than _f_o_o_ (a 214 // completely split match). 215 // 216 // This is a naive algorithm, but it is fast. There's lots of prior art here 217 // that could be leveraged. For example, we could explicitly consider 218 // rune distance, and exact matches of words or segments. 219 // 220 // Also note that this might not actually find the highest scoring match, as 221 // doing so could require a non-linear algorithm, depending on how the score 222 // is calculated. 223 224 // debugging support 225 const debug = false // enable to log debugging information 226 var ( 227 runeScores []float64 228 runeIdxs []int 229 ) 230 231 pi = 0 232 p = m.pattern[pi] 233 234 const ( 235 segStartScore = 1.0 // base score of runes starting a segment 236 wordScore = 0.9 // base score of runes starting or continuing a word 237 noStreak = 0.6 238 perSegment = 0.1 // we count at most 3 segments above 239 ) 240 241 totScore := 0.0 242 lastMatch := uint8(255) 243 for ii := uint8(start); ii < inputLen; ii++ { 244 r := m.inputBuffer[ii] 245 if r == p { 246 pi++ 247 finalRune := pi >= m.patternLen 248 p = m.pattern[pi] 249 250 baseScore := noStreak 251 252 // Calculate the sequence bonus based on preceding matches. 253 // 254 // We do this first as it is overridden by role scoring below. 255 if lastMatch == ii-1 { 256 baseScore = wordScore 257 // Reduce the sequence bonus for the final rune of the pattern based on 258 // whether it borders a new segment or word. 259 if finalRune { 260 switch { 261 case ii == inputLen-1 || m.roles[ii+1]&separator != 0: 262 // Full segment: no reduction 263 case m.roles[ii+1]&wordStart != 0: 264 baseScore = wordScore - 0.1 265 default: 266 baseScore = wordScore - 0.2 267 } 268 } 269 } 270 lastMatch = ii 271 272 // Calculate the rune's role score. If the rune starts a segment or word, 273 // this overrides the sequence score, as the rune starts a new sequence. 274 switch { 275 case m.roles[ii]&segmentStart != 0: 276 baseScore = segStartScore 277 case m.roles[ii]&wordStart != 0: 278 baseScore = wordScore 279 } 280 281 // Apply the segment-depth penalty (segments from the right). 282 runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment) 283 if debug { 284 runeScores = append(runeScores, runeScore) 285 runeIdxs = append(runeIdxs, int(ii)) 286 } 287 totScore += runeScore 288 if finalRune { 289 break 290 } 291 } 292 } 293 294 if debug { 295 // Format rune roles and scores in line: 296 // fo[o:.52].[b:1]a[r:.6] 297 var summary bytes.Buffer 298 last := 0 299 for i, idx := range runeIdxs { 300 summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes 301 fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i]) 302 last = idx + 1 303 } 304 summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes 305 log.Println(summary.String()) 306 } 307 308 return start, totScore / float64(m.patternLen) 309 } 310