...

Source file src/github.com/rivo/uniseg/sentencerules.go

Documentation: github.com/rivo/uniseg

     1  package uniseg
     2  
     3  import "unicode/utf8"
     4  
// The states of the sentence break parser. Each state records how much of a
// sentence break rule's left-hand side has been matched so far (see
// sbTransitions for the transitions between them).
const (
	// sbAny is the default state: no rule-specific context is pending.
	sbAny = iota
	// sbCR is entered after a CR; a directly following LF must not be split
	// off (SB3).
	sbCR
	// sbParaSep is entered after a paragraph separator; the parser always
	// breaks after it (SB4).
	sbParaSep
	// sbATerm is entered after an ATerm that was not preceded by an upper- or
	// lowercase letter.
	sbATerm
	// sbUpper is entered after an uppercase letter.
	sbUpper
	// sbLower is entered after a lowercase letter.
	sbLower
	// sbSB7 is entered after "(Upper | Lower) ATerm", the left-hand side of
	// SB7.
	sbSB7
	// sbSB8Close is entered after "ATerm Close+".
	sbSB8Close
	// sbSB8Sp is entered after "ATerm Close* Sp+".
	sbSB8Sp
	// sbSTerm is entered after an STerm.
	sbSTerm
	// sbSB8aClose is entered after "STerm Close+".
	sbSB8aClose
	// sbSB8aSp is entered after "STerm Close* Sp+".
	sbSB8aSp
)
    20  
    21  // sbTransitions implements the sentence break parser's state transitions. It's
    22  // anologous to [grTransitions], see comments there for details.
    23  //
    24  // Unicode version 15.0.0.
    25  func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
    26  	switch uint64(state) | uint64(prop)<<32 {
    27  	// SB3.
    28  	case sbAny | prCR<<32:
    29  		return sbCR, false, 9990
    30  	case sbCR | prLF<<32:
    31  		return sbParaSep, false, 30
    32  
    33  	// SB4.
    34  	case sbAny | prSep<<32:
    35  		return sbParaSep, false, 9990
    36  	case sbAny | prLF<<32:
    37  		return sbParaSep, false, 9990
    38  	case sbParaSep | prAny<<32:
    39  		return sbAny, true, 40
    40  	case sbCR | prAny<<32:
    41  		return sbAny, true, 40
    42  
    43  	// SB6.
    44  	case sbAny | prATerm<<32:
    45  		return sbATerm, false, 9990
    46  	case sbATerm | prNumeric<<32:
    47  		return sbAny, false, 60
    48  	case sbSB7 | prNumeric<<32:
    49  		return sbAny, false, 60 // Because ATerm also appears in SB7.
    50  
    51  	// SB7.
    52  	case sbAny | prUpper<<32:
    53  		return sbUpper, false, 9990
    54  	case sbAny | prLower<<32:
    55  		return sbLower, false, 9990
    56  	case sbUpper | prATerm<<32:
    57  		return sbSB7, false, 70
    58  	case sbLower | prATerm<<32:
    59  		return sbSB7, false, 70
    60  	case sbSB7 | prUpper<<32:
    61  		return sbUpper, false, 70
    62  
    63  	// SB8a.
    64  	case sbAny | prSTerm<<32:
    65  		return sbSTerm, false, 9990
    66  	case sbATerm | prSContinue<<32:
    67  		return sbAny, false, 81
    68  	case sbATerm | prATerm<<32:
    69  		return sbATerm, false, 81
    70  	case sbATerm | prSTerm<<32:
    71  		return sbSTerm, false, 81
    72  	case sbSB7 | prSContinue<<32:
    73  		return sbAny, false, 81
    74  	case sbSB7 | prATerm<<32:
    75  		return sbATerm, false, 81
    76  	case sbSB7 | prSTerm<<32:
    77  		return sbSTerm, false, 81
    78  	case sbSB8Close | prSContinue<<32:
    79  		return sbAny, false, 81
    80  	case sbSB8Close | prATerm<<32:
    81  		return sbATerm, false, 81
    82  	case sbSB8Close | prSTerm<<32:
    83  		return sbSTerm, false, 81
    84  	case sbSB8Sp | prSContinue<<32:
    85  		return sbAny, false, 81
    86  	case sbSB8Sp | prATerm<<32:
    87  		return sbATerm, false, 81
    88  	case sbSB8Sp | prSTerm<<32:
    89  		return sbSTerm, false, 81
    90  	case sbSTerm | prSContinue<<32:
    91  		return sbAny, false, 81
    92  	case sbSTerm | prATerm<<32:
    93  		return sbATerm, false, 81
    94  	case sbSTerm | prSTerm<<32:
    95  		return sbSTerm, false, 81
    96  	case sbSB8aClose | prSContinue<<32:
    97  		return sbAny, false, 81
    98  	case sbSB8aClose | prATerm<<32:
    99  		return sbATerm, false, 81
   100  	case sbSB8aClose | prSTerm<<32:
   101  		return sbSTerm, false, 81
   102  	case sbSB8aSp | prSContinue<<32:
   103  		return sbAny, false, 81
   104  	case sbSB8aSp | prATerm<<32:
   105  		return sbATerm, false, 81
   106  	case sbSB8aSp | prSTerm<<32:
   107  		return sbSTerm, false, 81
   108  
   109  	// SB9.
   110  	case sbATerm | prClose<<32:
   111  		return sbSB8Close, false, 90
   112  	case sbSB7 | prClose<<32:
   113  		return sbSB8Close, false, 90
   114  	case sbSB8Close | prClose<<32:
   115  		return sbSB8Close, false, 90
   116  	case sbATerm | prSp<<32:
   117  		return sbSB8Sp, false, 90
   118  	case sbSB7 | prSp<<32:
   119  		return sbSB8Sp, false, 90
   120  	case sbSB8Close | prSp<<32:
   121  		return sbSB8Sp, false, 90
   122  	case sbSTerm | prClose<<32:
   123  		return sbSB8aClose, false, 90
   124  	case sbSB8aClose | prClose<<32:
   125  		return sbSB8aClose, false, 90
   126  	case sbSTerm | prSp<<32:
   127  		return sbSB8aSp, false, 90
   128  	case sbSB8aClose | prSp<<32:
   129  		return sbSB8aSp, false, 90
   130  	case sbATerm | prSep<<32:
   131  		return sbParaSep, false, 90
   132  	case sbATerm | prCR<<32:
   133  		return sbParaSep, false, 90
   134  	case sbATerm | prLF<<32:
   135  		return sbParaSep, false, 90
   136  	case sbSB7 | prSep<<32:
   137  		return sbParaSep, false, 90
   138  	case sbSB7 | prCR<<32:
   139  		return sbParaSep, false, 90
   140  	case sbSB7 | prLF<<32:
   141  		return sbParaSep, false, 90
   142  	case sbSB8Close | prSep<<32:
   143  		return sbParaSep, false, 90
   144  	case sbSB8Close | prCR<<32:
   145  		return sbParaSep, false, 90
   146  	case sbSB8Close | prLF<<32:
   147  		return sbParaSep, false, 90
   148  	case sbSTerm | prSep<<32:
   149  		return sbParaSep, false, 90
   150  	case sbSTerm | prCR<<32:
   151  		return sbParaSep, false, 90
   152  	case sbSTerm | prLF<<32:
   153  		return sbParaSep, false, 90
   154  	case sbSB8aClose | prSep<<32:
   155  		return sbParaSep, false, 90
   156  	case sbSB8aClose | prCR<<32:
   157  		return sbParaSep, false, 90
   158  	case sbSB8aClose | prLF<<32:
   159  		return sbParaSep, false, 90
   160  
   161  	// SB10.
   162  	case sbSB8Sp | prSp<<32:
   163  		return sbSB8Sp, false, 100
   164  	case sbSB8aSp | prSp<<32:
   165  		return sbSB8aSp, false, 100
   166  	case sbSB8Sp | prSep<<32:
   167  		return sbParaSep, false, 100
   168  	case sbSB8Sp | prCR<<32:
   169  		return sbParaSep, false, 100
   170  	case sbSB8Sp | prLF<<32:
   171  		return sbParaSep, false, 100
   172  
   173  	// SB11.
   174  	case sbATerm | prAny<<32:
   175  		return sbAny, true, 110
   176  	case sbSB7 | prAny<<32:
   177  		return sbAny, true, 110
   178  	case sbSB8Close | prAny<<32:
   179  		return sbAny, true, 110
   180  	case sbSB8Sp | prAny<<32:
   181  		return sbAny, true, 110
   182  	case sbSTerm | prAny<<32:
   183  		return sbAny, true, 110
   184  	case sbSB8aClose | prAny<<32:
   185  		return sbAny, true, 110
   186  	case sbSB8aSp | prAny<<32:
   187  		return sbAny, true, 110
   188  	// We'll always break after ParaSep due to SB4.
   189  
   190  	default:
   191  		return -1, false, -1
   192  	}
   193  }
   194  
   195  // transitionSentenceBreakState determines the new state of the sentence break
   196  // parser given the current state and the next code point. It also returns
   197  // whether a sentence boundary was detected. If more than one code point is
   198  // needed to determine the new state, the byte slice or the string starting
   199  // after rune "r" can be used (whichever is not nil or empty) for further
   200  // lookups.
   201  func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
   202  	// Determine the property of the next character.
   203  	nextProperty := property(sentenceBreakCodePoints, r)
   204  
   205  	// SB5 (Replacing Ignore Rules).
   206  	if nextProperty == prExtend || nextProperty == prFormat {
   207  		if state == sbParaSep || state == sbCR {
   208  			return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
   209  		}
   210  		if state < 0 {
   211  			return sbAny, true // SB1.
   212  		}
   213  		return state, false
   214  	}
   215  
   216  	// Find the applicable transition in the table.
   217  	var rule int
   218  	newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
   219  	if newState < 0 {
   220  		// No specific transition found. Try the less specific ones.
   221  		anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
   222  		anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
   223  		if anyPropState >= 0 && anyStateState >= 0 {
   224  			// Both apply. We'll use a mix (see comments for grTransitions).
   225  			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
   226  			if anyPropRule < anyStateRule {
   227  				sentenceBreak, rule = anyPropProp, anyPropRule
   228  			}
   229  		} else if anyPropState >= 0 {
   230  			// We only have a specific state.
   231  			newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
   232  			// This branch will probably never be reached because okAnyState will
   233  			// always be true given the current transition map. But we keep it here
   234  			// for future modifications to the transition map where this may not be
   235  			// true anymore.
   236  		} else if anyStateState >= 0 {
   237  			// We only have a specific property.
   238  			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
   239  		} else {
   240  			// No known transition. SB999: Any × Any.
   241  			newState, sentenceBreak, rule = sbAny, false, 9990
   242  		}
   243  	}
   244  
   245  	// SB8.
   246  	if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
   247  		// Check the right side of the rule.
   248  		var length int
   249  		for nextProperty != prOLetter &&
   250  			nextProperty != prUpper &&
   251  			nextProperty != prLower &&
   252  			nextProperty != prSep &&
   253  			nextProperty != prCR &&
   254  			nextProperty != prLF &&
   255  			nextProperty != prATerm &&
   256  			nextProperty != prSTerm {
   257  			// Move on to the next rune.
   258  			if b != nil { // Byte slice version.
   259  				r, length = utf8.DecodeRune(b)
   260  				b = b[length:]
   261  			} else { // String version.
   262  				r, length = utf8.DecodeRuneInString(str)
   263  				str = str[length:]
   264  			}
   265  			if r == utf8.RuneError {
   266  				break
   267  			}
   268  			nextProperty = property(sentenceBreakCodePoints, r)
   269  		}
   270  		if nextProperty == prLower {
   271  			return sbLower, false
   272  		}
   273  	}
   274  
   275  	return
   276  }
   277  

View as plain text