// Copyright 2021 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package fuzzy import ( "bytes" "fmt" "log" "unicode" ) // SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols // of the form: // // example.com/path/to/package.object.field // // Knowing that we are matching symbols like this allows us to make the // following optimizations: // - We can incorporate right-to-left relevance directly into the score // calculation. // - We can match from right to left, discarding leading bytes if the input is // too long. // - We just take the right-most match without losing too much precision. This // allows us to use an O(n) algorithm. // - We can operate directly on chunked strings; in many cases we will // be storing the package path and/or package name separately from the // symbol or identifiers, so doing this avoids allocating strings. // - We can return the index of the right-most match, allowing us to trim // irrelevant qualification. type SymbolMatcher struct { // Using buffers of length 256 is both a reasonable size for most qualified // symbols, and makes it easy to avoid bounds checks by using uint8 indexes. pattern [256]rune patternLen uint8 inputBuffer [256]rune // avoid allocating when considering chunks roles [256]uint32 // which roles does a rune play (word start, etc.) segments [256]uint8 // how many segments from the right is each rune } // Rune roles. const ( segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.') wordStart // input rune starts a word, per camel-case naming rules separator // input rune is a separator ('/' or '.') upper // input rune is an upper case letter ) // NewSymbolMatcher creates a SymbolMatcher that may be used to match the given // search pattern. // // Currently this matcher only accepts case-insensitive fuzzy patterns. // // An empty pattern matches no input. func NewSymbolMatcher(pattern string) *SymbolMatcher { m := &SymbolMatcher{} for _, p := range pattern { m.pattern[m.patternLen] = unicode.ToLower(p) m.patternLen++ if m.patternLen == 255 || int(m.patternLen) == len(pattern) { // break at 255 so that we can represent patternLen with a uint8. break } } return m } // Match searches for the right-most match of the search pattern within the // symbol represented by concatenating the given chunks. // // If a match is found, the first result holds the absolute byte offset within // all chunks for the start of the symbol. In other words, the index of the // match within strings.Join(chunks, ""). // // The second return value will be the score of the match, which is always // between 0 and 1, inclusive. A score of 0 indicates no match. // // If no match is found, Match returns (-1, 0). func (m *SymbolMatcher) Match(chunks []string) (int, float64) { // Explicit behavior for an empty pattern. // // As a minor optimization, this also avoids nilness checks later on, since // the compiler can prove that m != nil. if m.patternLen == 0 { return -1, 0 } // Matching implements a heavily optimized linear scoring algorithm on the // input. This is not guaranteed to produce the highest score, but works well // enough, particularly due to the right-to-left significance of qualified // symbols. // // Matching proceeds in three passes through the input: // - The first pass populates the input buffer and collects rune roles. // - The second pass proceeds right-to-left to find the right-most match. // - The third pass proceeds left-to-right from the start of the right-most // match, to find the most *compact* match, and computes the score of this // match. // // See below for more details of each pass, as well as the scoring algorithm. // First pass: populate the input buffer out of the provided chunks // (lower-casing in the process), and collect rune roles. // // We could also check for a forward match here, but since we'd have to write // the entire input anyway this has negligible impact on performance. var ( inputLen = uint8(0) modifiers = wordStart | segmentStart ) input: for _, chunk := range chunks { for _, r := range chunk { if r == '.' || r == '/' { modifiers |= separator } // optimization: avoid calls to unicode.ToLower, which can't be inlined. l := r if r <= unicode.MaxASCII { if 'A' <= r && r <= 'Z' { l = r + 'a' - 'A' } } else { l = unicode.ToLower(r) } if l != r { modifiers |= upper // If the current rune is capitalized *and the preceding rune was not*, // mark this as a word start. This avoids spuriously high ranking of // non-camelcase naming schemas, such as the // yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of // golang/go#60201. if inputLen == 0 || m.roles[inputLen-1]&upper == 0 { modifiers |= wordStart } } m.inputBuffer[inputLen] = l m.roles[inputLen] = modifiers inputLen++ if m.roles[inputLen-1]&separator != 0 { modifiers = wordStart | segmentStart } else { modifiers = 0 } // TODO: we should prefer the right-most input if it overflows, rather // than the left-most as we're doing here. if inputLen == 255 { break input } } } // Second pass: find the right-most match, and count segments from the // right. var ( pi = uint8(m.patternLen - 1) // pattern index p = m.pattern[pi] // pattern rune start = -1 // start offset of match rseg = uint8(0) // effective "depth" from the right of the current rune in consideration ) const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes. for ii := inputLen - 1; ; ii-- { r := m.inputBuffer[ii] if rseg < maxSeg && m.roles[ii]&separator != 0 { rseg++ } m.segments[ii] = rseg if p == r { if pi == 0 { // TODO(rfindley): BUG: the docstring for Match says that it returns an // absolute byte offset, but clearly it is returning a rune offset here. start = int(ii) break } pi-- p = m.pattern[pi] } // Don't check ii >= 0 in the loop condition: ii is a uint8. if ii == 0 { break } } if start < 0 { // no match: skip scoring return -1, 0 } // Third pass: find the shortest match and compute the score. // Score is the average score for each rune. // // A rune score is the multiple of: // 1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the // rune starts a mid-segment word, else 0.6. // // Runes preceded by a matching rune are treated the same as the start // of a mid-segment word (with a 0.9 score), so that sequential or exact // matches are preferred. We call this a sequential bonus. // // For the final rune match, this sequential bonus is reduced to 0.8 if // the next rune in the input is a mid-segment word, or 0.7 if the next // rune in the input is not a word or segment start. This ensures that // we favor whole-word or whole-segment matches over prefix matches. // // 2. 1.0 if the rune is part of the last segment, otherwise // 1.0-0.1*, with a max segment count of 3. // Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very // early in a qualified symbol name) still scores higher than _f_o_o_ (a // completely split match). // // This is a naive algorithm, but it is fast. There's lots of prior art here // that could be leveraged. For example, we could explicitly consider // rune distance, and exact matches of words or segments. // // Also note that this might not actually find the highest scoring match, as // doing so could require a non-linear algorithm, depending on how the score // is calculated. // debugging support const debug = false // enable to log debugging information var ( runeScores []float64 runeIdxs []int ) pi = 0 p = m.pattern[pi] const ( segStartScore = 1.0 // base score of runes starting a segment wordScore = 0.9 // base score of runes starting or continuing a word noStreak = 0.6 perSegment = 0.1 // we count at most 3 segments above ) totScore := 0.0 lastMatch := uint8(255) for ii := uint8(start); ii < inputLen; ii++ { r := m.inputBuffer[ii] if r == p { pi++ finalRune := pi >= m.patternLen p = m.pattern[pi] baseScore := noStreak // Calculate the sequence bonus based on preceding matches. // // We do this first as it is overridden by role scoring below. if lastMatch == ii-1 { baseScore = wordScore // Reduce the sequence bonus for the final rune of the pattern based on // whether it borders a new segment or word. if finalRune { switch { case ii == inputLen-1 || m.roles[ii+1]&separator != 0: // Full segment: no reduction case m.roles[ii+1]&wordStart != 0: baseScore = wordScore - 0.1 default: baseScore = wordScore - 0.2 } } } lastMatch = ii // Calculate the rune's role score. If the rune starts a segment or word, // this overrides the sequence score, as the rune starts a new sequence. switch { case m.roles[ii]&segmentStart != 0: baseScore = segStartScore case m.roles[ii]&wordStart != 0: baseScore = wordScore } // Apply the segment-depth penalty (segments from the right). runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment) if debug { runeScores = append(runeScores, runeScore) runeIdxs = append(runeIdxs, int(ii)) } totScore += runeScore if finalRune { break } } } if debug { // Format rune roles and scores in line: // fo[o:.52].[b:1]a[r:.6] var summary bytes.Buffer last := 0 for i, idx := range runeIdxs { summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i]) last = idx + 1 } summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes log.Println(summary.String()) } return start, totScore / float64(m.patternLen) }