...

Source file src/github.com/xrash/smetrics/jaro.go

Documentation: github.com/xrash/smetrics

     1  package smetrics
     2  
     3  import (
     4  	"math"
     5  )
     6  
     7  // The Jaro distance. The result is 1 for equal strings, and 0 for completely different strings.
     8  func Jaro(a, b string) float64 {
     9  	// If both strings are zero-length, they are completely equal,
    10  	// therefore return 1.
    11  	if len(a) == 0 && len(b) == 0 {
    12  		return 1
    13  	}
    14  
    15  	// If one string is zero-length, strings are completely different,
    16  	// therefore return 0.
    17  	if len(a) == 0 || len(b) == 0 {
    18  		return 0
    19  	}
    20  
    21  	// Define the necessary variables for the algorithm.
    22  	la := float64(len(a))
    23  	lb := float64(len(b))
    24  	matchRange := int(math.Max(0, math.Floor(math.Max(la, lb)/2.0)-1))
    25  	matchesA := make([]bool, len(a))
    26  	matchesB := make([]bool, len(b))
    27  	var matches float64 = 0
    28  
    29  	// Step 1: Matches
    30  	// Loop through each character of the first string,
    31  	// looking for a matching character in the second string.
    32  	for i := 0; i < len(a); i++ {
    33  		start := int(math.Max(0, float64(i-matchRange)))
    34  		end := int(math.Min(lb-1, float64(i+matchRange)))
    35  
    36  		for j := start; j <= end; j++ {
    37  			if matchesB[j] {
    38  				continue
    39  			}
    40  
    41  			if a[i] == b[j] {
    42  				matchesA[i] = true
    43  				matchesB[j] = true
    44  				matches++
    45  				break
    46  			}
    47  		}
    48  	}
    49  
    50  	// If there are no matches, strings are completely different,
    51  	// therefore return 0.
    52  	if matches == 0 {
    53  		return 0
    54  	}
    55  
    56  	// Step 2: Transpositions
    57  	// Loop through the matches' arrays, looking for
    58  	// unaligned matches. Count the number of unaligned matches.
    59  	unaligned := 0
    60  	j := 0
    61  	for i := 0; i < len(a); i++ {
    62  		if !matchesA[i] {
    63  			continue
    64  		}
    65  
    66  		for !matchesB[j] {
    67  			j++
    68  		}
    69  
    70  		if a[i] != b[j] {
    71  			unaligned++
    72  		}
    73  
    74  		j++
    75  	}
    76  
    77  	// The number of unaligned matches divided by two, is the number of _transpositions_.
    78  	transpositions := math.Floor(float64(unaligned / 2))
    79  
    80  	// Jaro distance is the average between these three numbers:
    81  	// 1. matches / length of string A
    82  	// 2. matches / length of string B
    83  	// 3. (matches - transpositions/matches)
    84  	// So, all that divided by three is the final result.
    85  	return ((matches / la) + (matches / lb) + ((matches - transpositions) / matches)) / 3.0
    86  }
    87  

View as plain text