...

Source file src/github.com/huandu/xstrings/translate.go

Documentation: github.com/huandu/xstrings

     1  // Copyright 2015 Huan Du. All rights reserved.
     2  // Licensed under the MIT license that can be found in the LICENSE file.
     3  
     4  package xstrings
     5  
     6  import (
     7  	"unicode"
     8  	"unicode/utf8"
     9  )
    10  
    11  type runeRangeMap struct {
    12  	FromLo rune // Lower bound of range map.
    13  	FromHi rune // An inclusive higher bound of range map.
    14  	ToLo   rune
    15  	ToHi   rune
    16  }
    17  
    18  type runeDict struct {
    19  	Dict [unicode.MaxASCII + 1]rune
    20  }
    21  
    22  type runeMap map[rune]rune
    23  
    24  // Translator can translate string with pre-compiled from and to patterns.
    25  // If a from/to pattern pair needs to be used more than once, it's recommended
    26  // to create a Translator and reuse it.
    27  type Translator struct {
    28  	quickDict  *runeDict       // A quick dictionary to look up rune by index. Only available for latin runes.
    29  	runeMap    runeMap         // Rune map for translation.
    30  	ranges     []*runeRangeMap // Ranges of runes.
    31  	mappedRune rune            // If mappedRune >= 0, all matched runes are translated to the mappedRune.
    32  	reverted   bool            // If to pattern is empty, all matched characters will be deleted.
    33  	hasPattern bool
    34  }
    35  
    36  // NewTranslator creates new Translator through a from/to pattern pair.
    37  func NewTranslator(from, to string) *Translator {
    38  	tr := &Translator{}
    39  
    40  	if from == "" {
    41  		return tr
    42  	}
    43  
    44  	reverted := from[0] == '^'
    45  	deletion := len(to) == 0
    46  
    47  	if reverted {
    48  		from = from[1:]
    49  	}
    50  
    51  	var fromStart, fromEnd, fromRangeStep rune
    52  	var toStart, toEnd, toRangeStep rune
    53  	var fromRangeSize, toRangeSize rune
    54  	var singleRunes []rune
    55  
    56  	// Update the to rune range.
    57  	updateRange := func() {
    58  		// No more rune to read in the to rune pattern.
    59  		if toEnd == utf8.RuneError {
    60  			return
    61  		}
    62  
    63  		if toRangeStep == 0 {
    64  			to, toStart, toEnd, toRangeStep = nextRuneRange(to, toEnd)
    65  			return
    66  		}
    67  
    68  		// Current range is not empty. Consume 1 rune from start.
    69  		if toStart != toEnd {
    70  			toStart += toRangeStep
    71  			return
    72  		}
    73  
    74  		// No more rune. Repeat the last rune.
    75  		if to == "" {
    76  			toEnd = utf8.RuneError
    77  			return
    78  		}
    79  
    80  		// Both start and end are used. Read two more runes from the to pattern.
    81  		to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
    82  	}
    83  
    84  	if deletion {
    85  		toStart = utf8.RuneError
    86  		toEnd = utf8.RuneError
    87  	} else {
    88  		// If from pattern is reverted, only the last rune in the to pattern will be used.
    89  		if reverted {
    90  			var size int
    91  
    92  			for len(to) > 0 {
    93  				toStart, size = utf8.DecodeRuneInString(to)
    94  				to = to[size:]
    95  			}
    96  
    97  			toEnd = utf8.RuneError
    98  		} else {
    99  			to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
   100  		}
   101  	}
   102  
   103  	fromEnd = utf8.RuneError
   104  
   105  	for len(from) > 0 {
   106  		from, fromStart, fromEnd, fromRangeStep = nextRuneRange(from, fromEnd)
   107  
   108  		// fromStart is a single character. Just map it with a rune in the to pattern.
   109  		if fromRangeStep == 0 {
   110  			singleRunes = tr.addRune(fromStart, toStart, singleRunes)
   111  			updateRange()
   112  			continue
   113  		}
   114  
   115  		for toEnd != utf8.RuneError && fromStart != fromEnd {
   116  			// If mapped rune is a single character instead of a range, simply shift first
   117  			// rune in the range.
   118  			if toRangeStep == 0 {
   119  				singleRunes = tr.addRune(fromStart, toStart, singleRunes)
   120  				updateRange()
   121  				fromStart += fromRangeStep
   122  				continue
   123  			}
   124  
   125  			fromRangeSize = (fromEnd - fromStart) * fromRangeStep
   126  			toRangeSize = (toEnd - toStart) * toRangeStep
   127  
   128  			// Not enough runes in the to pattern. Need to read more.
   129  			if fromRangeSize > toRangeSize {
   130  				fromStart, toStart = tr.addRuneRange(fromStart, fromStart+toRangeSize*fromRangeStep, toStart, toEnd, singleRunes)
   131  				fromStart += fromRangeStep
   132  				updateRange()
   133  
   134  				// Edge case: If fromRangeSize == toRangeSize + 1, the last fromStart value needs be considered
   135  				// as a single rune.
   136  				if fromStart == fromEnd {
   137  					singleRunes = tr.addRune(fromStart, toStart, singleRunes)
   138  					updateRange()
   139  				}
   140  
   141  				continue
   142  			}
   143  
   144  			fromStart, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart+fromRangeSize*toRangeStep, singleRunes)
   145  			updateRange()
   146  			break
   147  		}
   148  
   149  		if fromStart == fromEnd {
   150  			fromEnd = utf8.RuneError
   151  			continue
   152  		}
   153  
   154  		_, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart, singleRunes)
   155  		fromEnd = utf8.RuneError
   156  	}
   157  
   158  	if fromEnd != utf8.RuneError {
   159  		tr.addRune(fromEnd, toStart, singleRunes)
   160  	}
   161  
   162  	tr.reverted = reverted
   163  	tr.mappedRune = -1
   164  	tr.hasPattern = true
   165  
   166  	// Translate RuneError only if in deletion or reverted mode.
   167  	if deletion || reverted {
   168  		tr.mappedRune = toStart
   169  	}
   170  
   171  	return tr
   172  }
   173  
   174  func (tr *Translator) addRune(from, to rune, singleRunes []rune) []rune {
   175  	if from <= unicode.MaxASCII {
   176  		if tr.quickDict == nil {
   177  			tr.quickDict = &runeDict{}
   178  		}
   179  
   180  		tr.quickDict.Dict[from] = to
   181  	} else {
   182  		if tr.runeMap == nil {
   183  			tr.runeMap = make(runeMap)
   184  		}
   185  
   186  		tr.runeMap[from] = to
   187  	}
   188  
   189  	singleRunes = append(singleRunes, from)
   190  	return singleRunes
   191  }
   192  
   193  func (tr *Translator) addRuneRange(fromLo, fromHi, toLo, toHi rune, singleRunes []rune) (rune, rune) {
   194  	var r rune
   195  	var rrm *runeRangeMap
   196  
   197  	if fromLo < fromHi {
   198  		rrm = &runeRangeMap{
   199  			FromLo: fromLo,
   200  			FromHi: fromHi,
   201  			ToLo:   toLo,
   202  			ToHi:   toHi,
   203  		}
   204  	} else {
   205  		rrm = &runeRangeMap{
   206  			FromLo: fromHi,
   207  			FromHi: fromLo,
   208  			ToLo:   toHi,
   209  			ToHi:   toLo,
   210  		}
   211  	}
   212  
   213  	// If there is any single rune conflicts with this rune range, clear single rune record.
   214  	for _, r = range singleRunes {
   215  		if rrm.FromLo <= r && r <= rrm.FromHi {
   216  			if r <= unicode.MaxASCII {
   217  				tr.quickDict.Dict[r] = 0
   218  			} else {
   219  				delete(tr.runeMap, r)
   220  			}
   221  		}
   222  	}
   223  
   224  	tr.ranges = append(tr.ranges, rrm)
   225  	return fromHi, toHi
   226  }
   227  
   228  func nextRuneRange(str string, last rune) (remaining string, start, end rune, rangeStep rune) {
   229  	var r rune
   230  	var size int
   231  
   232  	remaining = str
   233  	escaping := false
   234  	isRange := false
   235  
   236  	for len(remaining) > 0 {
   237  		r, size = utf8.DecodeRuneInString(remaining)
   238  		remaining = remaining[size:]
   239  
   240  		// Parse special characters.
   241  		if !escaping {
   242  			if r == '\\' {
   243  				escaping = true
   244  				continue
   245  			}
   246  
   247  			if r == '-' {
   248  				// Ignore slash at beginning of string.
   249  				if last == utf8.RuneError {
   250  					continue
   251  				}
   252  
   253  				start = last
   254  				isRange = true
   255  				continue
   256  			}
   257  		}
   258  
   259  		escaping = false
   260  
   261  		if last != utf8.RuneError {
   262  			// This is a range which start and end are the same.
   263  			// Considier it as a normal character.
   264  			if isRange && last == r {
   265  				isRange = false
   266  				continue
   267  			}
   268  
   269  			start = last
   270  			end = r
   271  
   272  			if isRange {
   273  				if start < end {
   274  					rangeStep = 1
   275  				} else {
   276  					rangeStep = -1
   277  				}
   278  			}
   279  
   280  			return
   281  		}
   282  
   283  		last = r
   284  	}
   285  
   286  	start = last
   287  	end = utf8.RuneError
   288  	return
   289  }
   290  
   291  // Translate str with a from/to pattern pair.
   292  //
   293  // See comment in Translate function for usage and samples.
   294  func (tr *Translator) Translate(str string) string {
   295  	if !tr.hasPattern || str == "" {
   296  		return str
   297  	}
   298  
   299  	var r rune
   300  	var size int
   301  	var needTr bool
   302  
   303  	orig := str
   304  
   305  	var output *stringBuilder
   306  
   307  	for len(str) > 0 {
   308  		r, size = utf8.DecodeRuneInString(str)
   309  		r, needTr = tr.TranslateRune(r)
   310  
   311  		if needTr && output == nil {
   312  			output = allocBuffer(orig, str)
   313  		}
   314  
   315  		if r != utf8.RuneError && output != nil {
   316  			output.WriteRune(r)
   317  		}
   318  
   319  		str = str[size:]
   320  	}
   321  
   322  	// No character is translated.
   323  	if output == nil {
   324  		return orig
   325  	}
   326  
   327  	return output.String()
   328  }
   329  
   330  // TranslateRune return translated rune and true if r matches the from pattern.
   331  // If r doesn't match the pattern, original r is returned and translated is false.
   332  func (tr *Translator) TranslateRune(r rune) (result rune, translated bool) {
   333  	switch {
   334  	case tr.quickDict != nil:
   335  		if r <= unicode.MaxASCII {
   336  			result = tr.quickDict.Dict[r]
   337  
   338  			if result != 0 {
   339  				translated = true
   340  
   341  				if tr.mappedRune >= 0 {
   342  					result = tr.mappedRune
   343  				}
   344  
   345  				break
   346  			}
   347  		}
   348  
   349  		fallthrough
   350  
   351  	case tr.runeMap != nil:
   352  		var ok bool
   353  
   354  		if result, ok = tr.runeMap[r]; ok {
   355  			translated = true
   356  
   357  			if tr.mappedRune >= 0 {
   358  				result = tr.mappedRune
   359  			}
   360  
   361  			break
   362  		}
   363  
   364  		fallthrough
   365  
   366  	default:
   367  		var rrm *runeRangeMap
   368  		ranges := tr.ranges
   369  
   370  		for i := len(ranges) - 1; i >= 0; i-- {
   371  			rrm = ranges[i]
   372  
   373  			if rrm.FromLo <= r && r <= rrm.FromHi {
   374  				translated = true
   375  
   376  				if tr.mappedRune >= 0 {
   377  					result = tr.mappedRune
   378  					break
   379  				}
   380  
   381  				if rrm.ToLo < rrm.ToHi {
   382  					result = rrm.ToLo + r - rrm.FromLo
   383  				} else if rrm.ToLo > rrm.ToHi {
   384  					// ToHi can be smaller than ToLo if range is from higher to lower.
   385  					result = rrm.ToLo - r + rrm.FromLo
   386  				} else {
   387  					result = rrm.ToLo
   388  				}
   389  
   390  				break
   391  			}
   392  		}
   393  	}
   394  
   395  	if tr.reverted {
   396  		if !translated {
   397  			result = tr.mappedRune
   398  		}
   399  
   400  		translated = !translated
   401  	}
   402  
   403  	if !translated {
   404  		result = r
   405  	}
   406  
   407  	return
   408  }
   409  
   410  // HasPattern returns true if Translator has one pattern at least.
   411  func (tr *Translator) HasPattern() bool {
   412  	return tr.hasPattern
   413  }
   414  
   415  // Translate str with the characters defined in from replaced by characters defined in to.
   416  //
   417  // From and to are patterns representing a set of characters. Pattern is defined as following.
   418  //
   419  // Special characters:
   420  //
   421  //  1. '-' means a range of runes, e.g.
   422  //     "a-z" means all characters from 'a' to 'z' inclusive;
   423  //     "z-a" means all characters from 'z' to 'a' inclusive.
   424  //  2. '^' as first character means a set of all runes excepted listed, e.g.
   425  //     "^a-z" means all characters except 'a' to 'z' inclusive.
   426  //  3. '\' escapes special characters.
   427  //
   428  // Normal character represents itself, e.g. "abc" is a set including 'a', 'b' and 'c'.
   429  //
   430  // Translate will try to find a 1:1 mapping from from to to.
   431  // If to is smaller than from, last rune in to will be used to map "out of range" characters in from.
   432  //
   433  // Note that '^' only works in the from pattern. It will be considered as a normal character in the to pattern.
   434  //
   435  // If the to pattern is an empty string, Translate works exactly the same as Delete.
   436  //
   437  // Samples:
   438  //
   439  //	Translate("hello", "aeiou", "12345")    => "h2ll4"
   440  //	Translate("hello", "a-z", "A-Z")        => "HELLO"
   441  //	Translate("hello", "z-a", "a-z")        => "svool"
   442  //	Translate("hello", "aeiou", "*")        => "h*ll*"
   443  //	Translate("hello", "^l", "*")           => "**ll*"
   444  //	Translate("hello ^ world", `\^lo`, "*") => "he*** * w*r*d"
   445  func Translate(str, from, to string) string {
   446  	tr := NewTranslator(from, to)
   447  	return tr.Translate(str)
   448  }
   449  
   450  // Delete runes in str matching the pattern.
   451  // Pattern is defined in Translate function.
   452  //
   453  // Samples:
   454  //
   455  //	Delete("hello", "aeiou") => "hll"
   456  //	Delete("hello", "a-k")   => "llo"
   457  //	Delete("hello", "^a-k")  => "he"
   458  func Delete(str, pattern string) string {
   459  	tr := NewTranslator(pattern, "")
   460  	return tr.Translate(str)
   461  }
   462  
   463  // Count how many runes in str match the pattern.
   464  // Pattern is defined in Translate function.
   465  //
   466  // Samples:
   467  //
   468  //	Count("hello", "aeiou") => 3
   469  //	Count("hello", "a-k")   => 3
   470  //	Count("hello", "^a-k")  => 2
   471  func Count(str, pattern string) int {
   472  	if pattern == "" || str == "" {
   473  		return 0
   474  	}
   475  
   476  	var r rune
   477  	var size int
   478  	var matched bool
   479  
   480  	tr := NewTranslator(pattern, "")
   481  	cnt := 0
   482  
   483  	for len(str) > 0 {
   484  		r, size = utf8.DecodeRuneInString(str)
   485  		str = str[size:]
   486  
   487  		if _, matched = tr.TranslateRune(r); matched {
   488  			cnt++
   489  		}
   490  	}
   491  
   492  	return cnt
   493  }
   494  
   495  // Squeeze deletes adjacent repeated runes in str.
   496  // If pattern is not empty, only runes matching the pattern will be squeezed.
   497  //
   498  // Samples:
   499  //
   500  //	Squeeze("hello", "")             => "helo"
   501  //	Squeeze("hello", "m-z")          => "hello"
   502  //	Squeeze("hello   world", " ")    => "hello world"
   503  func Squeeze(str, pattern string) string {
   504  	var last, r rune
   505  	var size int
   506  	var skipSqueeze, matched bool
   507  	var tr *Translator
   508  	var output *stringBuilder
   509  
   510  	orig := str
   511  	last = -1
   512  
   513  	if len(pattern) > 0 {
   514  		tr = NewTranslator(pattern, "")
   515  	}
   516  
   517  	for len(str) > 0 {
   518  		r, size = utf8.DecodeRuneInString(str)
   519  
   520  		// Need to squeeze the str.
   521  		if last == r && !skipSqueeze {
   522  			if tr != nil {
   523  				if _, matched = tr.TranslateRune(r); !matched {
   524  					skipSqueeze = true
   525  				}
   526  			}
   527  
   528  			if output == nil {
   529  				output = allocBuffer(orig, str)
   530  			}
   531  
   532  			if skipSqueeze {
   533  				output.WriteRune(r)
   534  			}
   535  		} else {
   536  			if output != nil {
   537  				output.WriteRune(r)
   538  			}
   539  
   540  			last = r
   541  			skipSqueeze = false
   542  		}
   543  
   544  		str = str[size:]
   545  	}
   546  
   547  	if output == nil {
   548  		return orig
   549  	}
   550  
   551  	return output.String()
   552  }
   553  

View as plain text