...

Source file src/github.com/rivo/uniseg/graphemerules.go

Documentation: github.com/rivo/uniseg

     1  package uniseg
     2  
     3  // The states of the grapheme cluster parser.
     4  const (
     5  	grAny = iota
     6  	grCR
     7  	grControlLF
     8  	grL
     9  	grLVV
    10  	grLVTT
    11  	grPrepend
    12  	grExtendedPictographic
    13  	grExtendedPictographicZWJ
    14  	grRIOdd
    15  	grRIEven
    16  )
    17  
    18  // The grapheme cluster parser's breaking instructions.
    19  const (
    20  	grNoBoundary = iota
    21  	grBoundary
    22  )
    23  
    24  // grTransitions implements the grapheme cluster parser's state transitions.
    25  // Maps state and property to a new state, a breaking instruction, and rule
    26  // number. The breaking instruction always refers to the boundary between the
    27  // last and next code point. Returns negative values if no transition is found.
    28  //
    29  // This function is used as follows:
    30  //
    31  //  1. Find specific state + specific property. Stop if found.
    32  //  2. Find specific state + any property.
    33  //  3. Find any state + specific property.
    34  //  4. If only (2) or (3) (but not both) was found, stop.
    35  //  5. If both (2) and (3) were found, use state from (3) and breaking instruction
    36  //     from the transition with the lower rule number, prefer (3) if rule numbers
    37  //     are equal. Stop.
    38  //  6. Assume grAny and grBoundary.
    39  //
    40  // Unicode version 15.0.0.
    41  func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
    42  	// It turns out that using a big switch statement is much faster than using
    43  	// a map.
    44  
    45  	switch uint64(state) | uint64(prop)<<32 {
    46  	// GB5
    47  	case grAny | prCR<<32:
    48  		return grCR, grBoundary, 50
    49  	case grAny | prLF<<32:
    50  		return grControlLF, grBoundary, 50
    51  	case grAny | prControl<<32:
    52  		return grControlLF, grBoundary, 50
    53  
    54  	// GB4
    55  	case grCR | prAny<<32:
    56  		return grAny, grBoundary, 40
    57  	case grControlLF | prAny<<32:
    58  		return grAny, grBoundary, 40
    59  
    60  	// GB3
    61  	case grCR | prLF<<32:
    62  		return grControlLF, grNoBoundary, 30
    63  
    64  	// GB6
    65  	case grAny | prL<<32:
    66  		return grL, grBoundary, 9990
    67  	case grL | prL<<32:
    68  		return grL, grNoBoundary, 60
    69  	case grL | prV<<32:
    70  		return grLVV, grNoBoundary, 60
    71  	case grL | prLV<<32:
    72  		return grLVV, grNoBoundary, 60
    73  	case grL | prLVT<<32:
    74  		return grLVTT, grNoBoundary, 60
    75  
    76  	// GB7
    77  	case grAny | prLV<<32:
    78  		return grLVV, grBoundary, 9990
    79  	case grAny | prV<<32:
    80  		return grLVV, grBoundary, 9990
    81  	case grLVV | prV<<32:
    82  		return grLVV, grNoBoundary, 70
    83  	case grLVV | prT<<32:
    84  		return grLVTT, grNoBoundary, 70
    85  
    86  	// GB8
    87  	case grAny | prLVT<<32:
    88  		return grLVTT, grBoundary, 9990
    89  	case grAny | prT<<32:
    90  		return grLVTT, grBoundary, 9990
    91  	case grLVTT | prT<<32:
    92  		return grLVTT, grNoBoundary, 80
    93  
    94  	// GB9
    95  	case grAny | prExtend<<32:
    96  		return grAny, grNoBoundary, 90
    97  	case grAny | prZWJ<<32:
    98  		return grAny, grNoBoundary, 90
    99  
   100  	// GB9a
   101  	case grAny | prSpacingMark<<32:
   102  		return grAny, grNoBoundary, 91
   103  
   104  	// GB9b
   105  	case grAny | prPrepend<<32:
   106  		return grPrepend, grBoundary, 9990
   107  	case grPrepend | prAny<<32:
   108  		return grAny, grNoBoundary, 92
   109  
   110  	// GB11
   111  	case grAny | prExtendedPictographic<<32:
   112  		return grExtendedPictographic, grBoundary, 9990
   113  	case grExtendedPictographic | prExtend<<32:
   114  		return grExtendedPictographic, grNoBoundary, 110
   115  	case grExtendedPictographic | prZWJ<<32:
   116  		return grExtendedPictographicZWJ, grNoBoundary, 110
   117  	case grExtendedPictographicZWJ | prExtendedPictographic<<32:
   118  		return grExtendedPictographic, grNoBoundary, 110
   119  
   120  	// GB12 / GB13
   121  	case grAny | prRegionalIndicator<<32:
   122  		return grRIOdd, grBoundary, 9990
   123  	case grRIOdd | prRegionalIndicator<<32:
   124  		return grRIEven, grNoBoundary, 120
   125  	case grRIEven | prRegionalIndicator<<32:
   126  		return grRIOdd, grBoundary, 120
   127  	default:
   128  		return -1, -1, -1
   129  	}
   130  }
   131  
   132  // transitionGraphemeState determines the new state of the grapheme cluster
   133  // parser given the current state and the next code point. It also returns the
   134  // code point's grapheme property (the value mapped by the [graphemeCodePoints]
   135  // table) and whether a cluster boundary was detected.
   136  func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
   137  	// Determine the property of the next character.
   138  	prop = propertyGraphemes(r)
   139  
   140  	// Find the applicable transition.
   141  	nextState, nextProp, _ := grTransitions(state, prop)
   142  	if nextState >= 0 {
   143  		// We have a specific transition. We'll use it.
   144  		return nextState, prop, nextProp == grBoundary
   145  	}
   146  
   147  	// No specific transition found. Try the less specific ones.
   148  	anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
   149  	anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
   150  	if anyPropState >= 0 && anyStateState >= 0 {
   151  		// Both apply. We'll use a mix (see comments for grTransitions).
   152  		newState = anyStateState
   153  		boundary = anyStateProp == grBoundary
   154  		if anyPropRule < anyStateRule {
   155  			boundary = anyPropProp == grBoundary
   156  		}
   157  		return
   158  	}
   159  
   160  	if anyPropState >= 0 {
   161  		// We only have a specific state.
   162  		return anyPropState, prop, anyPropProp == grBoundary
   163  		// This branch will probably never be reached because okAnyState will
   164  		// always be true given the current transition map. But we keep it here
   165  		// for future modifications to the transition map where this may not be
   166  		// true anymore.
   167  	}
   168  
   169  	if anyStateState >= 0 {
   170  		// We only have a specific property.
   171  		return anyStateState, prop, anyStateProp == grBoundary
   172  	}
   173  
   174  	// No known transition. GB999: Any รท Any.
   175  	return grAny, prop, true
   176  }
   177  

View as plain text