...

Source file src/github.com/huandu/xstrings/count.go

Documentation: github.com/huandu/xstrings

     1  // Copyright 2015 Huan Du. All rights reserved.
     2  // Licensed under the MIT license that can be found in the LICENSE file.
     3  
     4  package xstrings
     5  
     6  import (
     7  	"unicode"
     8  	"unicode/utf8"
     9  )
    10  
    11  // Len returns str's utf8 rune length.
    12  func Len(str string) int {
    13  	return utf8.RuneCountInString(str)
    14  }
    15  
    16  // WordCount returns number of words in a string.
    17  //
    18  // Word is defined as a locale dependent string containing alphabetic characters,
    19  // which may also contain but not start with `'` and `-` characters.
    20  func WordCount(str string) int {
    21  	var r rune
    22  	var size, n int
    23  
    24  	inWord := false
    25  
    26  	for len(str) > 0 {
    27  		r, size = utf8.DecodeRuneInString(str)
    28  
    29  		switch {
    30  		case isAlphabet(r):
    31  			if !inWord {
    32  				inWord = true
    33  				n++
    34  			}
    35  
    36  		case inWord && (r == '\'' || r == '-'):
    37  			// Still in word.
    38  
    39  		default:
    40  			inWord = false
    41  		}
    42  
    43  		str = str[size:]
    44  	}
    45  
    46  	return n
    47  }
    48  
    49  const minCJKCharacter = '\u3400'
    50  
    51  // Checks r is a letter but not CJK character.
    52  func isAlphabet(r rune) bool {
    53  	if !unicode.IsLetter(r) {
    54  		return false
    55  	}
    56  
    57  	switch {
    58  	// Quick check for non-CJK character.
    59  	case r < minCJKCharacter:
    60  		return true
    61  
    62  	// Common CJK characters.
    63  	case r >= '\u4E00' && r <= '\u9FCC':
    64  		return false
    65  
    66  	// Rare CJK characters.
    67  	case r >= '\u3400' && r <= '\u4D85':
    68  		return false
    69  
    70  	// Rare and historic CJK characters.
    71  	case r >= '\U00020000' && r <= '\U0002B81D':
    72  		return false
    73  	}
    74  
    75  	return true
    76  }
    77  
    78  // Width returns string width in monotype font.
    79  // Multi-byte characters are usually twice the width of single byte characters.
    80  //
    81  // Algorithm comes from `mb_strwidth` in PHP.
    82  // http://php.net/manual/en/function.mb-strwidth.php
    83  func Width(str string) int {
    84  	var r rune
    85  	var size, n int
    86  
    87  	for len(str) > 0 {
    88  		r, size = utf8.DecodeRuneInString(str)
    89  		n += RuneWidth(r)
    90  		str = str[size:]
    91  	}
    92  
    93  	return n
    94  }
    95  
    96  // RuneWidth returns character width in monotype font.
    97  // Multi-byte characters are usually twice the width of single byte characters.
    98  //
    99  // Algorithm comes from `mb_strwidth` in PHP.
   100  // http://php.net/manual/en/function.mb-strwidth.php
   101  func RuneWidth(r rune) int {
   102  	switch {
   103  	case r == utf8.RuneError || r < '\x20':
   104  		return 0
   105  
   106  	case '\x20' <= r && r < '\u2000':
   107  		return 1
   108  
   109  	case '\u2000' <= r && r < '\uFF61':
   110  		return 2
   111  
   112  	case '\uFF61' <= r && r < '\uFFA0':
   113  		return 1
   114  
   115  	case '\uFFA0' <= r:
   116  		return 2
   117  	}
   118  
   119  	return 0
   120  }
   121  

View as plain text