...

Source file src/github.com/rivo/uniseg/wordrules.go

Documentation: github.com/rivo/uniseg

     1  package uniseg
     2  
     3  import "unicode/utf8"
     4  
     5  // The states of the word break parser.
     6  const (
     7  	wbAny = iota
     8  	wbCR
     9  	wbLF
    10  	wbNewline
    11  	wbWSegSpace
    12  	wbHebrewLetter
    13  	wbALetter
    14  	wbWB7
    15  	wbWB7c
    16  	wbNumeric
    17  	wbWB11
    18  	wbKatakana
    19  	wbExtendNumLet
    20  	wbOddRI
    21  	wbEvenRI
    22  	wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
    23  )
    24  
    25  // wbTransitions implements the word break parser's state transitions. It's
    26  // anologous to [grTransitions], see comments there for details.
    27  //
    28  // Unicode version 15.0.0.
    29  func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
    30  	switch uint64(state) | uint64(prop)<<32 {
    31  	// WB3b.
    32  	case wbAny | prNewline<<32:
    33  		return wbNewline, true, 32
    34  	case wbAny | prCR<<32:
    35  		return wbCR, true, 32
    36  	case wbAny | prLF<<32:
    37  		return wbLF, true, 32
    38  
    39  	// WB3a.
    40  	case wbNewline | prAny<<32:
    41  		return wbAny, true, 31
    42  	case wbCR | prAny<<32:
    43  		return wbAny, true, 31
    44  	case wbLF | prAny<<32:
    45  		return wbAny, true, 31
    46  
    47  	// WB3.
    48  	case wbCR | prLF<<32:
    49  		return wbLF, false, 30
    50  
    51  	// WB3d.
    52  	case wbAny | prWSegSpace<<32:
    53  		return wbWSegSpace, true, 9990
    54  	case wbWSegSpace | prWSegSpace<<32:
    55  		return wbWSegSpace, false, 34
    56  
    57  	// WB5.
    58  	case wbAny | prALetter<<32:
    59  		return wbALetter, true, 9990
    60  	case wbAny | prHebrewLetter<<32:
    61  		return wbHebrewLetter, true, 9990
    62  	case wbALetter | prALetter<<32:
    63  		return wbALetter, false, 50
    64  	case wbALetter | prHebrewLetter<<32:
    65  		return wbHebrewLetter, false, 50
    66  	case wbHebrewLetter | prALetter<<32:
    67  		return wbALetter, false, 50
    68  	case wbHebrewLetter | prHebrewLetter<<32:
    69  		return wbHebrewLetter, false, 50
    70  
    71  	// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
    72  	case wbWB7 | prALetter<<32:
    73  		return wbALetter, false, 70
    74  	case wbWB7 | prHebrewLetter<<32:
    75  		return wbHebrewLetter, false, 70
    76  
    77  	// WB7a.
    78  	case wbHebrewLetter | prSingleQuote<<32:
    79  		return wbAny, false, 71
    80  
    81  	// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
    82  	case wbWB7c | prHebrewLetter<<32:
    83  		return wbHebrewLetter, false, 73
    84  
    85  	// WB8.
    86  	case wbAny | prNumeric<<32:
    87  		return wbNumeric, true, 9990
    88  	case wbNumeric | prNumeric<<32:
    89  		return wbNumeric, false, 80
    90  
    91  	// WB9.
    92  	case wbALetter | prNumeric<<32:
    93  		return wbNumeric, false, 90
    94  	case wbHebrewLetter | prNumeric<<32:
    95  		return wbNumeric, false, 90
    96  
    97  	// WB10.
    98  	case wbNumeric | prALetter<<32:
    99  		return wbALetter, false, 100
   100  	case wbNumeric | prHebrewLetter<<32:
   101  		return wbHebrewLetter, false, 100
   102  
   103  	// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
   104  	case wbWB11 | prNumeric<<32:
   105  		return wbNumeric, false, 110
   106  
   107  	// WB13.
   108  	case wbAny | prKatakana<<32:
   109  		return wbKatakana, true, 9990
   110  	case wbKatakana | prKatakana<<32:
   111  		return wbKatakana, false, 130
   112  
   113  	// WB13a.
   114  	case wbAny | prExtendNumLet<<32:
   115  		return wbExtendNumLet, true, 9990
   116  	case wbALetter | prExtendNumLet<<32:
   117  		return wbExtendNumLet, false, 131
   118  	case wbHebrewLetter | prExtendNumLet<<32:
   119  		return wbExtendNumLet, false, 131
   120  	case wbNumeric | prExtendNumLet<<32:
   121  		return wbExtendNumLet, false, 131
   122  	case wbKatakana | prExtendNumLet<<32:
   123  		return wbExtendNumLet, false, 131
   124  	case wbExtendNumLet | prExtendNumLet<<32:
   125  		return wbExtendNumLet, false, 131
   126  
   127  	// WB13b.
   128  	case wbExtendNumLet | prALetter<<32:
   129  		return wbALetter, false, 132
   130  	case wbExtendNumLet | prHebrewLetter<<32:
   131  		return wbHebrewLetter, false, 132
   132  	case wbExtendNumLet | prNumeric<<32:
   133  		return wbNumeric, false, 132
   134  	case wbExtendNumLet | prKatakana<<32:
   135  		return wbKatakana, false, 132
   136  
   137  	default:
   138  		return -1, false, -1
   139  	}
   140  }
   141  
   142  // transitionWordBreakState determines the new state of the word break parser
   143  // given the current state and the next code point. It also returns whether a
   144  // word boundary was detected. If more than one code point is needed to
   145  // determine the new state, the byte slice or the string starting after rune "r"
   146  // can be used (whichever is not nil or empty) for further lookups.
   147  func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
   148  	// Determine the property of the next character.
   149  	nextProperty := property(workBreakCodePoints, r)
   150  
   151  	// "Replacing Ignore Rules".
   152  	if nextProperty == prZWJ {
   153  		// WB4 (for zero-width joiners).
   154  		if state == wbNewline || state == wbCR || state == wbLF {
   155  			return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
   156  		}
   157  		if state < 0 {
   158  			return wbAny | wbZWJBit, false
   159  		}
   160  		return state | wbZWJBit, false
   161  	} else if nextProperty == prExtend || nextProperty == prFormat {
   162  		// WB4 (for Extend and Format).
   163  		if state == wbNewline || state == wbCR || state == wbLF {
   164  			return wbAny, true // Make sure we don't apply WB4 to WB3a.
   165  		}
   166  		if state == wbWSegSpace || state == wbAny|wbZWJBit {
   167  			return wbAny, false // We don't break but this is also not WB3d or WB3c.
   168  		}
   169  		if state < 0 {
   170  			return wbAny, false
   171  		}
   172  		return state, false
   173  	} else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
   174  		// WB3c.
   175  		return wbAny, false
   176  	}
   177  	if state >= 0 {
   178  		state = state &^ wbZWJBit
   179  	}
   180  
   181  	// Find the applicable transition in the table.
   182  	var rule int
   183  	newState, wordBreak, rule = wbTransitions(state, nextProperty)
   184  	if newState < 0 {
   185  		// No specific transition found. Try the less specific ones.
   186  		anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
   187  		anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
   188  		if anyPropState >= 0 && anyStateState >= 0 {
   189  			// Both apply. We'll use a mix (see comments for grTransitions).
   190  			newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
   191  			if anyPropRule < anyStateRule {
   192  				wordBreak, rule = anyPropWordBreak, anyPropRule
   193  			}
   194  		} else if anyPropState >= 0 {
   195  			// We only have a specific state.
   196  			newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
   197  			// This branch will probably never be reached because okAnyState will
   198  			// always be true given the current transition map. But we keep it here
   199  			// for future modifications to the transition map where this may not be
   200  			// true anymore.
   201  		} else if anyStateState >= 0 {
   202  			// We only have a specific property.
   203  			newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
   204  		} else {
   205  			// No known transition. WB999: Any รท Any.
   206  			newState, wordBreak, rule = wbAny, true, 9990
   207  		}
   208  	}
   209  
   210  	// For those rules that need to look up runes further in the string, we
   211  	// determine the property after nextProperty, skipping over Format, Extend,
   212  	// and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
   213  	// be determined (because the text ends or the rune is faulty).
   214  	farProperty := -1
   215  	if rule > 60 &&
   216  		(state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
   217  		(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
   218  			nextProperty == prDoubleQuote || // WB7b.
   219  			nextProperty == prMidNum) { // WB12.
   220  		for {
   221  			var (
   222  				r      rune
   223  				length int
   224  			)
   225  			if b != nil { // Byte slice version.
   226  				r, length = utf8.DecodeRune(b)
   227  				b = b[length:]
   228  			} else { // String version.
   229  				r, length = utf8.DecodeRuneInString(str)
   230  				str = str[length:]
   231  			}
   232  			if r == utf8.RuneError {
   233  				break
   234  			}
   235  			prop := property(workBreakCodePoints, r)
   236  			if prop == prExtend || prop == prFormat || prop == prZWJ {
   237  				continue
   238  			}
   239  			farProperty = prop
   240  			break
   241  		}
   242  	}
   243  
   244  	// WB6.
   245  	if rule > 60 &&
   246  		(state == wbALetter || state == wbHebrewLetter) &&
   247  		(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
   248  		(farProperty == prALetter || farProperty == prHebrewLetter) {
   249  		return wbWB7, false
   250  	}
   251  
   252  	// WB7b.
   253  	if rule > 72 &&
   254  		state == wbHebrewLetter &&
   255  		nextProperty == prDoubleQuote &&
   256  		farProperty == prHebrewLetter {
   257  		return wbWB7c, false
   258  	}
   259  
   260  	// WB12.
   261  	if rule > 120 &&
   262  		state == wbNumeric &&
   263  		(nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
   264  		farProperty == prNumeric {
   265  		return wbWB11, false
   266  	}
   267  
   268  	// WB15 and WB16.
   269  	if newState == wbAny && nextProperty == prRegionalIndicator {
   270  		if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
   271  			// Transition into the first RI.
   272  			return wbOddRI, true
   273  		}
   274  		if state == wbOddRI {
   275  			// Don't break pairs of Regional Indicators.
   276  			return wbEvenRI, false
   277  		}
   278  		return wbOddRI, true // We can break after a pair.
   279  	}
   280  
   281  	return
   282  }
   283  

View as plain text