...

Source file src/github.com/rivo/uniseg/grapheme.go

Documentation: github.com/rivo/uniseg

     1  package uniseg
     2  
     3  import "unicode/utf8"
     4  
     5  // Graphemes implements an iterator over Unicode grapheme clusters, or
     6  // user-perceived characters. While iterating, it also provides information
     7  // about word boundaries, sentence boundaries, line breaks, and monospace
     8  // character widths.
     9  //
    10  // After constructing the class via [NewGraphemes] for a given string "str",
    11  // [Graphemes.Next] is called for every grapheme cluster in a loop until it
    12  // returns false. Inside the loop, information about the grapheme cluster as
    13  // well as boundary information and character width is available via the various
    14  // methods (see examples below).
    15  //
    16  // This class basically wraps the [StepString] parser and provides a convenient
    17  // interface to it. If you are only interested in some parts of this package's
    18  // functionality, using the specialized functions starting with "First" is
    19  // almost always faster.
    20  type Graphemes struct {
    21  	// The original string.
    22  	original string
    23  
    24  	// The remaining string to be parsed.
    25  	remaining string
    26  
    27  	// The current grapheme cluster.
    28  	cluster string
    29  
    30  	// The byte offset of the current grapheme cluster relative to the original
    31  	// string.
    32  	offset int
    33  
    34  	// The current boundary information of the [Step] parser.
    35  	boundaries int
    36  
    37  	// The current state of the [Step] parser.
    38  	state int
    39  }
    40  
    41  // NewGraphemes returns a new grapheme cluster iterator.
    42  func NewGraphemes(str string) *Graphemes {
    43  	return &Graphemes{
    44  		original:  str,
    45  		remaining: str,
    46  		state:     -1,
    47  	}
    48  }
    49  
    50  // Next advances the iterator by one grapheme cluster and returns false if no
    51  // clusters are left. This function must be called before the first cluster is
    52  // accessed.
    53  func (g *Graphemes) Next() bool {
    54  	if len(g.remaining) == 0 {
    55  		// We're already past the end.
    56  		g.state = -2
    57  		g.cluster = ""
    58  		return false
    59  	}
    60  	g.offset += len(g.cluster)
    61  	g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
    62  	return true
    63  }
    64  
    65  // Runes returns a slice of runes (code points) which corresponds to the current
    66  // grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
    67  // has not yet been called, nil is returned.
    68  func (g *Graphemes) Runes() []rune {
    69  	if g.state < 0 {
    70  		return nil
    71  	}
    72  	return []rune(g.cluster)
    73  }
    74  
    75  // Str returns a substring of the original string which corresponds to the
    76  // current grapheme cluster. If the iterator is already past the end or
    77  // [Graphemes.Next] has not yet been called, an empty string is returned.
    78  func (g *Graphemes) Str() string {
    79  	return g.cluster
    80  }
    81  
    82  // Bytes returns a byte slice which corresponds to the current grapheme cluster.
    83  // If the iterator is already past the end or [Graphemes.Next] has not yet been
    84  // called, nil is returned.
    85  func (g *Graphemes) Bytes() []byte {
    86  	if g.state < 0 {
    87  		return nil
    88  	}
    89  	return []byte(g.cluster)
    90  }
    91  
    92  // Positions returns the interval of the current grapheme cluster as byte
    93  // positions into the original string. The first returned value "from" indexes
    94  // the first byte and the second returned value "to" indexes the first byte that
    95  // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
    96  // the original string "str". If [Graphemes.Next] has not yet been called, both
    97  // values are 0. If the iterator is already past the end, both values are 1.
    98  func (g *Graphemes) Positions() (int, int) {
    99  	if g.state == -1 {
   100  		return 0, 0
   101  	} else if g.state == -2 {
   102  		return 1, 1
   103  	}
   104  	return g.offset, g.offset + len(g.cluster)
   105  }
   106  
   107  // IsWordBoundary returns true if a word ends after the current grapheme
   108  // cluster.
   109  func (g *Graphemes) IsWordBoundary() bool {
   110  	if g.state < 0 {
   111  		return true
   112  	}
   113  	return g.boundaries&MaskWord != 0
   114  }
   115  
   116  // IsSentenceBoundary returns true if a sentence ends after the current
   117  // grapheme cluster.
   118  func (g *Graphemes) IsSentenceBoundary() bool {
   119  	if g.state < 0 {
   120  		return true
   121  	}
   122  	return g.boundaries&MaskSentence != 0
   123  }
   124  
   125  // LineBreak returns whether the line can be broken after the current grapheme
   126  // cluster. A value of [LineDontBreak] means the line may not be broken, a value
   127  // of [LineMustBreak] means the line must be broken, and a value of
   128  // [LineCanBreak] means the line may or may not be broken.
   129  func (g *Graphemes) LineBreak() int {
   130  	if g.state == -1 {
   131  		return LineDontBreak
   132  	}
   133  	if g.state == -2 {
   134  		return LineMustBreak
   135  	}
   136  	return g.boundaries & MaskLine
   137  }
   138  
   139  // Width returns the monospace width of the current grapheme cluster.
   140  func (g *Graphemes) Width() int {
   141  	if g.state < 0 {
   142  		return 0
   143  	}
   144  	return g.boundaries >> ShiftWidth
   145  }
   146  
   147  // Reset puts the iterator into its initial state such that the next call to
   148  // [Graphemes.Next] sets it to the first grapheme cluster again.
   149  func (g *Graphemes) Reset() {
   150  	g.state = -1
   151  	g.offset = 0
   152  	g.cluster = ""
   153  	g.remaining = g.original
   154  }
   155  
   156  // GraphemeClusterCount returns the number of user-perceived characters
   157  // (grapheme clusters) for the given string.
   158  func GraphemeClusterCount(s string) (n int) {
   159  	state := -1
   160  	for len(s) > 0 {
   161  		_, s, _, state = FirstGraphemeClusterInString(s, state)
   162  		n++
   163  	}
   164  	return
   165  }
   166  
   167  // ReverseString reverses the given string while observing grapheme cluster
   168  // boundaries.
   169  func ReverseString(s string) string {
   170  	str := []byte(s)
   171  	reversed := make([]byte, len(str))
   172  	state := -1
   173  	index := len(str)
   174  	for len(str) > 0 {
   175  		var cluster []byte
   176  		cluster, str, _, state = FirstGraphemeCluster(str, state)
   177  		index -= len(cluster)
   178  		copy(reversed[index:], cluster)
   179  		if index <= len(str)/2 {
   180  			break
   181  		}
   182  	}
   183  	return string(reversed)
   184  }
   185  
   186  // The number of bits the grapheme property must be shifted to make place for
   187  // grapheme states.
   188  const shiftGraphemePropState = 4
   189  
   190  // FirstGraphemeCluster returns the first grapheme cluster found in the given
   191  // byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
   192  // Cluster Boundaries]. This function can be called continuously to extract all
   193  // grapheme clusters from a byte slice, as illustrated in the example below.
   194  //
   195  // If you don't know the current state, for example when calling the function
   196  // for the first time, you must pass -1. For consecutive calls, pass the state
   197  // and rest slice returned by the previous call.
   198  //
   199  // The "rest" slice is the sub-slice of the original byte slice "b" starting
   200  // after the last byte of the identified grapheme cluster. If the length of the
   201  // "rest" slice is 0, the entire byte slice "b" has been processed. The
   202  // "cluster" byte slice is the sub-slice of the input slice containing the
   203  // identified grapheme cluster.
   204  //
   205  // The returned width is the width of the grapheme cluster for most monospace
   206  // fonts where a value of 1 represents one character cell.
   207  //
   208  // Given an empty byte slice "b", the function returns nil values.
   209  //
   210  // While slightly less convenient than using the Graphemes class, this function
   211  // has much better performance and makes no allocations. It lends itself well to
   212  // large byte slices.
   213  //
   214  // [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
   215  func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
   216  	// An empty byte slice returns nothing.
   217  	if len(b) == 0 {
   218  		return
   219  	}
   220  
   221  	// Extract the first rune.
   222  	r, length := utf8.DecodeRune(b)
   223  	if len(b) <= length { // If we're already past the end, there is nothing else to parse.
   224  		var prop int
   225  		if state < 0 {
   226  			prop = propertyGraphemes(r)
   227  		} else {
   228  			prop = state >> shiftGraphemePropState
   229  		}
   230  		return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
   231  	}
   232  
   233  	// If we don't know the state, determine it now.
   234  	var firstProp int
   235  	if state < 0 {
   236  		state, firstProp, _ = transitionGraphemeState(state, r)
   237  	} else {
   238  		firstProp = state >> shiftGraphemePropState
   239  	}
   240  	width += runeWidth(r, firstProp)
   241  
   242  	// Transition until we find a boundary.
   243  	for {
   244  		var (
   245  			prop     int
   246  			boundary bool
   247  		)
   248  
   249  		r, l := utf8.DecodeRune(b[length:])
   250  		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
   251  
   252  		if boundary {
   253  			return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
   254  		}
   255  
   256  		if firstProp == prExtendedPictographic {
   257  			if r == vs15 {
   258  				width = 1
   259  			} else if r == vs16 {
   260  				width = 2
   261  			}
   262  		} else if firstProp != prRegionalIndicator && firstProp != prL {
   263  			width += runeWidth(r, prop)
   264  		}
   265  
   266  		length += l
   267  		if len(b) <= length {
   268  			return b, nil, width, grAny | (prop << shiftGraphemePropState)
   269  		}
   270  	}
   271  }
   272  
   273  // FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
   274  // outputs are strings.
   275  func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
   276  	// An empty string returns nothing.
   277  	if len(str) == 0 {
   278  		return
   279  	}
   280  
   281  	// Extract the first rune.
   282  	r, length := utf8.DecodeRuneInString(str)
   283  	if len(str) <= length { // If we're already past the end, there is nothing else to parse.
   284  		var prop int
   285  		if state < 0 {
   286  			prop = propertyGraphemes(r)
   287  		} else {
   288  			prop = state >> shiftGraphemePropState
   289  		}
   290  		return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
   291  	}
   292  
   293  	// If we don't know the state, determine it now.
   294  	var firstProp int
   295  	if state < 0 {
   296  		state, firstProp, _ = transitionGraphemeState(state, r)
   297  	} else {
   298  		firstProp = state >> shiftGraphemePropState
   299  	}
   300  	width += runeWidth(r, firstProp)
   301  
   302  	// Transition until we find a boundary.
   303  	for {
   304  		var (
   305  			prop     int
   306  			boundary bool
   307  		)
   308  
   309  		r, l := utf8.DecodeRuneInString(str[length:])
   310  		state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
   311  
   312  		if boundary {
   313  			return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
   314  		}
   315  
   316  		if firstProp == prExtendedPictographic {
   317  			if r == vs15 {
   318  				width = 1
   319  			} else if r == vs16 {
   320  				width = 2
   321  			}
   322  		} else if firstProp != prRegionalIndicator && firstProp != prL {
   323  			width += runeWidth(r, prop)
   324  		}
   325  
   326  		length += l
   327  		if len(str) <= length {
   328  			return str, "", width, grAny | (prop << shiftGraphemePropState)
   329  		}
   330  	}
   331  }
   332  

View as plain text