1 // Package levenshtein is a Go implementation to calculate Levenshtein Distance. 2 // 3 // Implementation taken from 4 // https://gist.github.com/andrei-m/982927#gistcomment-1931258 5 package levenshtein 6 7 import "unicode/utf8" 8 9 // minLengthThreshold is the length of the string beyond which 10 // an allocation will be made. Strings smaller than this will be 11 // zero alloc. 12 const minLengthThreshold = 32 13 14 // ComputeDistance computes the levenshtein distance between the two 15 // strings passed as an argument. The return value is the levenshtein distance 16 // 17 // Works on runes (Unicode code points) but does not normalize 18 // the input strings. See https://blog.golang.org/normalization 19 // and the golang.org/x/text/unicode/norm package. 20 func ComputeDistance(a, b string) int { 21 if len(a) == 0 { 22 return utf8.RuneCountInString(b) 23 } 24 25 if len(b) == 0 { 26 return utf8.RuneCountInString(a) 27 } 28 29 if a == b { 30 return 0 31 } 32 33 // We need to convert to []rune if the strings are non-ASCII. 34 // This could be avoided by using utf8.RuneCountInString 35 // and then doing some juggling with rune indices, 36 // but leads to far more bounds checks. It is a reasonable trade-off. 37 s1 := []rune(a) 38 s2 := []rune(b) 39 40 // swap to save some memory O(min(a,b)) instead of O(a) 41 if len(s1) > len(s2) { 42 s1, s2 = s2, s1 43 } 44 lenS1 := len(s1) 45 lenS2 := len(s2) 46 47 // Init the row. 48 var x []uint16 49 if lenS1+1 > minLengthThreshold { 50 x = make([]uint16, lenS1+1) 51 } else { 52 // We make a small optimization here for small strings. 53 // Because a slice of constant length is effectively an array, 54 // it does not allocate. So we can re-slice it to the right length 55 // as long as it is below a desired threshold. 56 x = make([]uint16, minLengthThreshold) 57 x = x[:lenS1+1] 58 } 59 60 // we start from 1 because index 0 is already 0. 61 for i := 1; i < len(x); i++ { 62 x[i] = uint16(i) 63 } 64 65 // make a dummy bounds check to prevent the 2 bounds check down below. 66 // The one inside the loop is particularly costly. 67 _ = x[lenS1] 68 // fill in the rest 69 for i := 1; i <= lenS2; i++ { 70 prev := uint16(i) 71 for j := 1; j <= lenS1; j++ { 72 current := x[j-1] // match 73 if s2[i-1] != s1[j-1] { 74 current = min(min(x[j-1]+1, prev+1), x[j]+1) 75 } 76 x[j-1] = prev 77 prev = current 78 } 79 x[lenS1] = prev 80 } 81 return int(x[lenS1]) 82 } 83 84 func min(a, b uint16) uint16 { 85 if a < b { 86 return a 87 } 88 return b 89 } 90