symbol.go

Documentation: golang.org/x/tools/internal/fuzzy

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package fuzzy
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"log"
    11  	"unicode"
    12  )
    13  
    14  // SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
    15  // of the form:
    16  //
    17  //	example.com/path/to/package.object.field
    18  //
    19  // Knowing that we are matching symbols like this allows us to make the
    20  // following optimizations:
    21  //   - We can incorporate right-to-left relevance directly into the score
    22  //     calculation.
    23  //   - We can match from right to left, discarding leading bytes if the input is
    24  //     too long.
    25  //   - We just take the right-most match without losing too much precision. This
    26  //     allows us to use an O(n) algorithm.
    27  //   - We can operate directly on chunked strings; in many cases we will
    28  //     be storing the package path and/or package name separately from the
    29  //     symbol or identifiers, so doing this avoids allocating strings.
    30  //   - We can return the index of the right-most match, allowing us to trim
    31  //     irrelevant qualification.
    32  type SymbolMatcher struct {
    33  	// Using buffers of length 256 is both a reasonable size for most qualified
    34  	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
    35  	pattern     [256]rune
    36  	patternLen  uint8
    37  	inputBuffer [256]rune   // avoid allocating when considering chunks
    38  	roles       [256]uint32 // which roles does a rune play (word start, etc.)
    39  	segments    [256]uint8  // how many segments from the right is each rune
    40  }
    41  
    42  // Rune roles.
    43  const (
    44  	segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
    45  	wordStart                       // input rune starts a word, per camel-case naming rules
    46  	separator                       // input rune is a separator ('/' or '.')
    47  	upper                           // input rune is an upper case letter
    48  )
    49  
    50  // NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
    51  // search pattern.
    52  //
    53  // Currently this matcher only accepts case-insensitive fuzzy patterns.
    54  //
    55  // An empty pattern matches no input.
    56  func NewSymbolMatcher(pattern string) *SymbolMatcher {
    57  	m := &SymbolMatcher{}
    58  	for _, p := range pattern {
    59  		m.pattern[m.patternLen] = unicode.ToLower(p)
    60  		m.patternLen++
    61  		if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
    62  			// break at 255 so that we can represent patternLen with a uint8.
    63  			break
    64  		}
    65  	}
    66  	return m
    67  }
    68  
    69  // Match searches for the right-most match of the search pattern within the
    70  // symbol represented by concatenating the given chunks.
    71  //
    72  // If a match is found, the first result holds the absolute byte offset within
    73  // all chunks for the start of the symbol. In other words, the index of the
    74  // match within strings.Join(chunks, "").
    75  //
    76  // The second return value will be the score of the match, which is always
    77  // between 0 and 1, inclusive. A score of 0 indicates no match.
    78  //
    79  // If no match is found, Match returns (-1, 0).
    80  func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
    81  	// Explicit behavior for an empty pattern.
    82  	//
    83  	// As a minor optimization, this also avoids nilness checks later on, since
    84  	// the compiler can prove that m != nil.
    85  	if m.patternLen == 0 {
    86  		return -1, 0
    87  	}
    88  
    89  	// Matching implements a heavily optimized linear scoring algorithm on the
    90  	// input. This is not guaranteed to produce the highest score, but works well
    91  	// enough, particularly due to the right-to-left significance of qualified
    92  	// symbols.
    93  	//
    94  	// Matching proceeds in three passes through the input:
    95  	//  - The first pass populates the input buffer and collects rune roles.
    96  	//  - The second pass proceeds right-to-left to find the right-most match.
    97  	//  - The third pass proceeds left-to-right from the start of the right-most
    98  	//    match, to find the most *compact* match, and computes the score of this
    99  	//    match.
   100  	//
   101  	// See below for more details of each pass, as well as the scoring algorithm.
   102  
   103  	// First pass: populate the input buffer out of the provided chunks
   104  	// (lower-casing in the process), and collect rune roles.
   105  	//
   106  	// We could also check for a forward match here, but since we'd have to write
   107  	// the entire input anyway this has negligible impact on performance.
   108  	var (
   109  		inputLen  = uint8(0)
   110  		modifiers = wordStart | segmentStart
   111  	)
   112  
   113  input:
   114  	for _, chunk := range chunks {
   115  		for _, r := range chunk {
   116  			if r == '.' || r == '/' {
   117  				modifiers |= separator
   118  			}
   119  			// optimization: avoid calls to unicode.ToLower, which can't be inlined.
   120  			l := r
   121  			if r <= unicode.MaxASCII {
   122  				if 'A' <= r && r <= 'Z' {
   123  					l = r + 'a' - 'A'
   124  				}
   125  			} else {
   126  				l = unicode.ToLower(r)
   127  			}
   128  			if l != r {
   129  				modifiers |= upper
   130  
   131  				// If the current rune is capitalized *and the preceding rune was not*,
   132  				// mark this as a word start. This avoids spuriously high ranking of
   133  				// non-camelcase naming schemas, such as the
   134  				// yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
   135  				// golang/go#60201.
   136  				if inputLen == 0 || m.roles[inputLen-1]&upper == 0 {
   137  					modifiers |= wordStart
   138  				}
   139  			}
   140  			m.inputBuffer[inputLen] = l
   141  			m.roles[inputLen] = modifiers
   142  			inputLen++
   143  			if m.roles[inputLen-1]&separator != 0 {
   144  				modifiers = wordStart | segmentStart
   145  			} else {
   146  				modifiers = 0
   147  			}
   148  			// TODO: we should prefer the right-most input if it overflows, rather
   149  			//       than the left-most as we're doing here.
   150  			if inputLen == 255 {
   151  				break input
   152  			}
   153  		}
   154  	}
   155  
   156  	// Second pass: find the right-most match, and count segments from the
   157  	// right.
   158  	var (
   159  		pi    = uint8(m.patternLen - 1) // pattern index
   160  		p     = m.pattern[pi]           // pattern rune
   161  		start = -1                      // start offset of match
   162  		rseg  = uint8(0)                // effective "depth" from the right of the current rune in consideration
   163  	)
   164  	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
   165  
   166  	for ii := inputLen - 1; ; ii-- {
   167  		r := m.inputBuffer[ii]
   168  		if rseg < maxSeg && m.roles[ii]&separator != 0 {
   169  			rseg++
   170  		}
   171  		m.segments[ii] = rseg
   172  		if p == r {
   173  			if pi == 0 {
   174  				// TODO(rfindley): BUG: the docstring for Match says that it returns an
   175  				// absolute byte offset, but clearly it is returning a rune offset here.
   176  				start = int(ii)
   177  				break
   178  			}
   179  			pi--
   180  			p = m.pattern[pi]
   181  		}
   182  		// Don't check ii >= 0 in the loop condition: ii is a uint8.
   183  		if ii == 0 {
   184  			break
   185  		}
   186  	}
   187  
   188  	if start < 0 {
   189  		// no match: skip scoring
   190  		return -1, 0
   191  	}
   192  
   193  	// Third pass: find the shortest match and compute the score.
   194  
   195  	// Score is the average score for each rune.
   196  	//
   197  	// A rune score is the multiple of:
   198  	//   1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
   199  	//      rune starts a mid-segment word, else 0.6.
   200  	//
   201  	//      Runes preceded by a matching rune are treated the same as the start
   202  	//      of a mid-segment word (with a 0.9 score), so that sequential or exact
   203  	//      matches are preferred. We call this a sequential bonus.
   204  	//
   205  	//      For the final rune match, this sequential bonus is reduced to 0.8 if
   206  	//      the next rune in the input is a mid-segment word, or 0.7 if the next
   207  	//      rune in the input is not a word or segment start. This ensures that
   208  	//      we favor whole-word or whole-segment matches over prefix matches.
   209  	//
   210  	//   2. 1.0 if the rune is part of the last segment, otherwise
   211  	//      1.0-0.1*<segments from the right>, with a max segment count of 3.
   212  	//      Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
   213  	//      early in a qualified symbol name) still scores higher than _f_o_o_ (a
   214  	//      completely split match).
   215  	//
   216  	// This is a naive algorithm, but it is fast. There's lots of prior art here
   217  	// that could be leveraged. For example, we could explicitly consider
   218  	// rune distance, and exact matches of words or segments.
   219  	//
   220  	// Also note that this might not actually find the highest scoring match, as
   221  	// doing so could require a non-linear algorithm, depending on how the score
   222  	// is calculated.
   223  
   224  	// debugging support
   225  	const debug = false // enable to log debugging information
   226  	var (
   227  		runeScores []float64
   228  		runeIdxs   []int
   229  	)
   230  
   231  	pi = 0
   232  	p = m.pattern[pi]
   233  
   234  	const (
   235  		segStartScore = 1.0 // base score of runes starting a segment
   236  		wordScore     = 0.9 // base score of runes starting or continuing a word
   237  		noStreak      = 0.6
   238  		perSegment    = 0.1 // we count at most 3 segments above
   239  	)
   240  
   241  	totScore := 0.0
   242  	lastMatch := uint8(255)
   243  	for ii := uint8(start); ii < inputLen; ii++ {
   244  		r := m.inputBuffer[ii]
   245  		if r == p {
   246  			pi++
   247  			finalRune := pi >= m.patternLen
   248  			p = m.pattern[pi]
   249  
   250  			baseScore := noStreak
   251  
   252  			// Calculate the sequence bonus based on preceding matches.
   253  			//
   254  			// We do this first as it is overridden by role scoring below.
   255  			if lastMatch == ii-1 {
   256  				baseScore = wordScore
   257  				// Reduce the sequence bonus for the final rune of the pattern based on
   258  				// whether it borders a new segment or word.
   259  				if finalRune {
   260  					switch {
   261  					case ii == inputLen-1 || m.roles[ii+1]&separator != 0:
   262  						// Full segment: no reduction
   263  					case m.roles[ii+1]&wordStart != 0:
   264  						baseScore = wordScore - 0.1
   265  					default:
   266  						baseScore = wordScore - 0.2
   267  					}
   268  				}
   269  			}
   270  			lastMatch = ii
   271  
   272  			// Calculate the rune's role score. If the rune starts a segment or word,
   273  			// this overrides the sequence score, as the rune starts a new sequence.
   274  			switch {
   275  			case m.roles[ii]&segmentStart != 0:
   276  				baseScore = segStartScore
   277  			case m.roles[ii]&wordStart != 0:
   278  				baseScore = wordScore
   279  			}
   280  
   281  			// Apply the segment-depth penalty (segments from the right).
   282  			runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment)
   283  			if debug {
   284  				runeScores = append(runeScores, runeScore)
   285  				runeIdxs = append(runeIdxs, int(ii))
   286  			}
   287  			totScore += runeScore
   288  			if finalRune {
   289  				break
   290  			}
   291  		}
   292  	}
   293  
   294  	if debug {
   295  		// Format rune roles and scores in line:
   296  		// fo[o:.52].[b:1]a[r:.6]
   297  		var summary bytes.Buffer
   298  		last := 0
   299  		for i, idx := range runeIdxs {
   300  			summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes
   301  			fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i])
   302  			last = idx + 1
   303  		}
   304  		summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes
   305  		log.Println(summary.String())
   306  	}
   307  
   308  	return start, totScore / float64(m.patternLen)
   309  }
   310
View as plain text