...

Source file src/github.com/xrash/smetrics/ukkonen.go

Documentation: github.com/xrash/smetrics

     1  package smetrics
     2  
     3  import (
     4  	"math"
     5  )
     6  
     7  // The Ukkonen algorithm for calculating the Levenshtein distance. The algorithm is described in http://www.cs.helsinki.fi/u/ukkonen/InfCont85.PDF, or in docs/InfCont85.PDF. It runs on O(t . min(m, n)) where t is the actual distance between strings a and b. It needs O(min(t, m, n)) space. This function might be preferred over WagnerFischer() for *very* similar strings. But test it out yourself.
     8  // The first two parameters are the two strings to be compared. The last three parameters are the insertion cost, the deletion cost and the substitution cost. These are normally defined as 1, 1 and 2 respectively.
     9  func Ukkonen(a, b string, icost, dcost, scost int) int {
    10  	var lowerCost int
    11  
    12  	if icost < dcost && icost < scost {
    13  		lowerCost = icost
    14  	} else if dcost < scost {
    15  		lowerCost = dcost
    16  	} else {
    17  		lowerCost = scost
    18  	}
    19  
    20  	infinite := math.MaxInt32 / 2
    21  
    22  	var r []int
    23  	var k, kprime, p, t int
    24  	var ins, del, sub int
    25  
    26  	if len(a) > len(b) {
    27  		t = (len(a) - len(b) + 1) * lowerCost
    28  	} else {
    29  		t = (len(b) - len(a) + 1) * lowerCost
    30  	}
    31  
    32  	for {
    33  		if (t / lowerCost) < (len(b) - len(a)) {
    34  			continue
    35  		}
    36  
    37  		// This is the right damn thing since the original Ukkonen
    38  		// paper minimizes the expression result only, but the uncommented version
    39  		// doesn't need to deal with floats so it's faster.
    40  		// p = int(math.Floor(0.5*((float64(t)/float64(lowerCost)) - float64(len(b) - len(a)))))
    41  		p = ((t / lowerCost) - (len(b) - len(a))) / 2
    42  
    43  		k = -p
    44  		kprime = k
    45  
    46  		rowlength := (len(b) - len(a)) + (2 * p)
    47  
    48  		r = make([]int, rowlength+2)
    49  
    50  		for i := 0; i < rowlength+2; i++ {
    51  			r[i] = infinite
    52  		}
    53  
    54  		for i := 0; i <= len(a); i++ {
    55  			for j := 0; j <= rowlength; j++ {
    56  				if i == j+k && i == 0 {
    57  					r[j] = 0
    58  				} else {
    59  					if j-1 < 0 {
    60  						ins = infinite
    61  					} else {
    62  						ins = r[j-1] + icost
    63  					}
    64  
    65  					del = r[j+1] + dcost
    66  					sub = r[j] + scost
    67  
    68  					if i-1 < 0 || i-1 >= len(a) || j+k-1 >= len(b) || j+k-1 < 0 {
    69  						sub = infinite
    70  					} else if a[i-1] == b[j+k-1] {
    71  						sub = r[j]
    72  					}
    73  
    74  					if ins < del && ins < sub {
    75  						r[j] = ins
    76  					} else if del < sub {
    77  						r[j] = del
    78  					} else {
    79  						r[j] = sub
    80  					}
    81  				}
    82  			}
    83  			k++
    84  		}
    85  
    86  		if r[(len(b)-len(a))+(2*p)+kprime] <= t {
    87  			break
    88  		} else {
    89  			t *= 2
    90  		}
    91  	}
    92  
    93  	return r[(len(b)-len(a))+(2*p)+kprime]
    94  }
    95  

View as plain text