step.go

Documentation: github.com/rivo/uniseg

     1  package uniseg
     2  
     3  import "unicode/utf8"
     4  
     5  // The bit masks used to extract boundary information returned by [Step].
     6  const (
     7  	MaskLine     = 3
     8  	MaskWord     = 4
     9  	MaskSentence = 8
    10  )
    11  
    12  // The number of bits to shift the boundary information returned by [Step] to
    13  // obtain the monospace width of the grapheme cluster.
    14  const ShiftWidth = 4
    15  
    16  // The bit positions by which boundary flags are shifted by the [Step] function.
    17  // These must correspond to the Mask constants.
    18  const (
    19  	shiftWord     = 2
    20  	shiftSentence = 3
    21  	// shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits.
    22  )
    23  
    24  // The bit positions by which states are shifted by the [Step] function. These
    25  // values must ensure state values defined for each of the boundary algorithms
    26  // don't overlap (and that they all still fit in a single int). These must
    27  // correspond to the Mask constants.
    28  const (
    29  	shiftWordState     = 4
    30  	shiftSentenceState = 9
    31  	shiftLineState     = 13
    32  	shiftPropState     = 21 // No mask as these are always the remaining bits.
    33  )
    34  
    35  // The bit mask used to extract the state returned by the [Step] function, after
    36  // shifting. These values must correspond to the shift constants.
    37  const (
    38  	maskGraphemeState = 0xf
    39  	maskWordState     = 0x1f
    40  	maskSentenceState = 0xf
    41  	maskLineState     = 0xff
    42  )
    43  
    44  // Step returns the first grapheme cluster (user-perceived character) found in
    45  // the given byte slice. It also returns information about the boundary between
    46  // that grapheme cluster and the one following it as well as the monospace width
    47  // of the grapheme cluster. There are three types of boundary information: word
    48  // boundaries, sentence boundaries, and line breaks. This function is therefore
    49  // a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and
    50  // [FirstLineSegment].
    51  //
    52  // The "boundaries" return value can be evaluated as follows:
    53  //
    54  //   - boundaries&MaskWord != 0: The boundary is a word boundary.
    55  //   - boundaries&MaskWord == 0: The boundary is not a word boundary.
    56  //   - boundaries&MaskSentence != 0: The boundary is a sentence boundary.
    57  //   - boundaries&MaskSentence == 0: The boundary is not a sentence boundary.
    58  //   - boundaries&MaskLine == LineDontBreak: You must not break the line at the
    59  //     boundary.
    60  //   - boundaries&MaskLine == LineMustBreak: You must break the line at the
    61  //     boundary.
    62  //   - boundaries&MaskLine == LineCanBreak: You may or may not break the line at
    63  //     the boundary.
    64  //   - boundaries >> ShiftWidth: The width of the grapheme cluster for most
    65  //     monospace fonts where a value of 1 represents one character cell.
    66  //
    67  // This function can be called continuously to extract all grapheme clusters
    68  // from a byte slice, as illustrated in the examples below.
    69  //
    70  // If you don't know which state to pass, for example when calling the function
    71  // for the first time, you must pass -1. For consecutive calls, pass the state
    72  // and rest slice returned by the previous call.
    73  //
    74  // The "rest" slice is the sub-slice of the original byte slice "b" starting
    75  // after the last byte of the identified grapheme cluster. If the length of the
    76  // "rest" slice is 0, the entire byte slice "b" has been processed. The
    77  // "cluster" byte slice is the sub-slice of the input slice containing the
    78  // first identified grapheme cluster.
    79  //
    80  // Given an empty byte slice "b", the function returns nil values.
    81  //
    82  // While slightly less convenient than using the Graphemes class, this function
    83  // has much better performance and makes no allocations. It lends itself well to
    84  // large byte slices.
    85  //
    86  // Note that in accordance with [UAX #14 LB3], the final segment will end with
    87  // a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
    88  // to ignore this by checking if the length of the "rest" slice is 0 and calling
    89  // [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune.
    90  //
    91  // [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
    92  func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
    93  	// An empty byte slice returns nothing.
    94  	if len(b) == 0 {
    95  		return
    96  	}
    97  
    98  	// Extract the first rune.
    99  	r, length := utf8.DecodeRune(b)
   100  	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
   101  		var prop int
   102  		if state < 0 {
   103  			prop = propertyGraphemes(r)
   104  		} else {
   105  			prop = state >> shiftPropState
   106  		}
   107  		return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
   108  	}
   109  
   110  	// If we don't know the state, determine it now.
   111  	var graphemeState, wordState, sentenceState, lineState, firstProp int
   112  	remainder := b[length:]
   113  	if state < 0 {
   114  		graphemeState, firstProp, _ = transitionGraphemeState(state, r)
   115  		wordState, _ = transitionWordBreakState(state, r, remainder, "")
   116  		sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "")
   117  		lineState, _ = transitionLineBreakState(state, r, remainder, "")
   118  	} else {
   119  		graphemeState = state & maskGraphemeState
   120  		wordState = (state >> shiftWordState) & maskWordState
   121  		sentenceState = (state >> shiftSentenceState) & maskSentenceState
   122  		lineState = (state >> shiftLineState) & maskLineState
   123  		firstProp = state >> shiftPropState
   124  	}
   125  
   126  	// Transition until we find a grapheme cluster boundary.
   127  	width := runeWidth(r, firstProp)
   128  	for {
   129  		var (
   130  			graphemeBoundary, wordBoundary, sentenceBoundary bool
   131  			lineBreak, prop                                  int
   132  		)
   133  
   134  		r, l := utf8.DecodeRune(remainder)
   135  		remainder = b[length+l:]
   136  
   137  		graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
   138  		wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "")
   139  		sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "")
   140  		lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "")
   141  
   142  		if graphemeBoundary {
   143  			boundary := lineBreak | (width << ShiftWidth)
   144  			if wordBoundary {
   145  				boundary |= 1 << shiftWord
   146  			}
   147  			if sentenceBoundary {
   148  				boundary |= 1 << shiftSentence
   149  			}
   150  			return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
   151  		}
   152  
   153  		if firstProp == prExtendedPictographic {
   154  			if r == vs15 {
   155  				width = 1
   156  			} else if r == vs16 {
   157  				width = 2
   158  			}
   159  		} else if firstProp != prRegionalIndicator && firstProp != prL {
   160  			width += runeWidth(r, prop)
   161  		}
   162  
   163  		length += l
   164  		if len(b) <= length {
   165  			return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
   166  		}
   167  	}
   168  }
   169  
   170  // StepString is like [Step] but its input and outputs are strings.
   171  func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) {
   172  	// An empty byte slice returns nothing.
   173  	if len(str) == 0 {
   174  		return
   175  	}
   176  
   177  	// Extract the first rune.
   178  	r, length := utf8.DecodeRuneInString(str)
   179  	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
   180  		prop := propertyGraphemes(r)
   181  		return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
   182  	}
   183  
   184  	// If we don't know the state, determine it now.
   185  	var graphemeState, wordState, sentenceState, lineState, firstProp int
   186  	remainder := str[length:]
   187  	if state < 0 {
   188  		graphemeState, firstProp, _ = transitionGraphemeState(state, r)
   189  		wordState, _ = transitionWordBreakState(state, r, nil, remainder)
   190  		sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder)
   191  		lineState, _ = transitionLineBreakState(state, r, nil, remainder)
   192  	} else {
   193  		graphemeState = state & maskGraphemeState
   194  		wordState = (state >> shiftWordState) & maskWordState
   195  		sentenceState = (state >> shiftSentenceState) & maskSentenceState
   196  		lineState = (state >> shiftLineState) & maskLineState
   197  		firstProp = state >> shiftPropState
   198  	}
   199  
   200  	// Transition until we find a grapheme cluster boundary.
   201  	width := runeWidth(r, firstProp)
   202  	for {
   203  		var (
   204  			graphemeBoundary, wordBoundary, sentenceBoundary bool
   205  			lineBreak, prop                                  int
   206  		)
   207  
   208  		r, l := utf8.DecodeRuneInString(remainder)
   209  		remainder = str[length+l:]
   210  
   211  		graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
   212  		wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder)
   213  		sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder)
   214  		lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder)
   215  
   216  		if graphemeBoundary {
   217  			boundary := lineBreak | (width << ShiftWidth)
   218  			if wordBoundary {
   219  				boundary |= 1 << shiftWord
   220  			}
   221  			if sentenceBoundary {
   222  				boundary |= 1 << shiftSentence
   223  			}
   224  			return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
   225  		}
   226  
   227  		if firstProp == prExtendedPictographic {
   228  			if r == vs15 {
   229  				width = 1
   230  			} else if r == vs16 {
   231  				width = 2
   232  			}
   233  		} else if firstProp != prRegionalIndicator && firstProp != prL {
   234  			width += runeWidth(r, prop)
   235  		}
   236  
   237  		length += l
   238  		if len(str) <= length {
   239  			return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
   240  		}
   241  	}
   242  }
   243
View as plain text