step_test.go

Documentation: github.com/rivo/uniseg

     1  package uniseg
     2  
     3  import (
     4  	"testing"
     5  )
     6  
     7  // Test official Grapheme Cluster Unicode test cases for grapheme clusters using
     8  // the [Step] function.
     9  func TestStepBytesGrapheme(t *testing.T) {
    10  	for testNum, testCase := range graphemeBreakTestCases {
    11  		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
    12  		testNum,
    13  		strings.TrimSpace(testCase.original),
    14  		testCase.expected,
    15  		decomposed(testCase.original),
    16  		[]rune(testCase.original))*/
    17  		b := []byte(testCase.original)
    18  		state := -1
    19  		var (
    20  			index int
    21  			c     []byte
    22  		)
    23  	GraphemeLoop:
    24  		for len(b) > 0 {
    25  			c, b, _, state = Step(b, state)
    26  
    27  			if index >= len(testCase.expected) {
    28  				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
    29  					testNum,
    30  					testCase.original,
    31  					len(testCase.expected))
    32  				break
    33  			}
    34  
    35  			cluster := []rune(string(c))
    36  			if len(cluster) != len(testCase.expected[index]) {
    37  				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
    38  					testNum,
    39  					testCase.original,
    40  					index,
    41  					len(cluster),
    42  					cluster,
    43  					len(testCase.expected[index]),
    44  					testCase.expected[index])
    45  				break
    46  			}
    47  			for i, r := range cluster {
    48  				if r != testCase.expected[index][i] {
    49  					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
    50  						testNum,
    51  						testCase.original,
    52  						index,
    53  						cluster,
    54  						testCase.expected[index])
    55  					break GraphemeLoop
    56  				}
    57  			}
    58  
    59  			index++
    60  		}
    61  		if index < len(testCase.expected) {
    62  			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
    63  				testNum,
    64  				testCase.original,
    65  				index,
    66  				len(testCase.expected))
    67  		}
    68  	}
    69  	cluster, rest, boundaries, newState := Step([]byte{}, -1)
    70  	if len(cluster) > 0 {
    71  		t.Errorf(`Expected cluster to be empty byte slice, got %q`, cluster)
    72  	}
    73  	if len(rest) > 0 {
    74  		t.Errorf(`Expected rest to be empty byte slice, got %q`, rest)
    75  	}
    76  	if boundaries != 0 {
    77  		t.Errorf(`Expected width to be 0, got %d`, boundaries)
    78  	}
    79  	if newState != 0 {
    80  		t.Errorf(`Expected newState to be 0, got %d`, newState)
    81  	}
    82  }
    83  
    84  // Test official word boundaries Unicode test cases for grapheme clusters using
    85  // the [Step] function.
    86  func TestStepBytesWord(t *testing.T) {
    87  	for testNum, testCase := range wordBreakTestCases {
    88  		if testNum == 1700 {
    89  			// This test case reveals an inconsistency in the Unicode rule set,
    90  			// namely the handling of ZWJ within two RI graphemes. (Grapheme
    91  			// rules will restart the RI count, word rules will ignore the ZWJ.)
    92  			// An error has been reported.
    93  			continue
    94  		}
    95  		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
    96  		testNum,
    97  		strings.TrimSpace(testCase.original),
    98  		testCase.expected,
    99  		decomposed(testCase.original),
   100  		[]rune(testCase.original))*/
   101  		b := []byte(testCase.original)
   102  		state := -1
   103  		var (
   104  			index, boundaries int
   105  			c                 []byte
   106  			growingCluster    []rune
   107  		)
   108  	GraphemeLoop:
   109  		for len(b) > 0 {
   110  			c, b, boundaries, state = Step(b, state)
   111  
   112  			if index >= len(testCase.expected) {
   113  				t.Errorf(`Test case %d %q failed: More words returned than expected %d`,
   114  					testNum,
   115  					testCase.original,
   116  					len(testCase.expected))
   117  				break
   118  			}
   119  
   120  			growingCluster = append(growingCluster, []rune(string(c))...)
   121  			if boundaries&MaskWord == 0 {
   122  				continue
   123  			}
   124  			cluster := growingCluster
   125  			growingCluster = nil
   126  			if len(cluster) != len(testCase.expected[index]) {
   127  				t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`,
   128  					testNum,
   129  					testCase.original,
   130  					index,
   131  					len(cluster),
   132  					cluster,
   133  					len(testCase.expected[index]),
   134  					testCase.expected[index])
   135  				break
   136  			}
   137  			for i, r := range cluster {
   138  				if r != testCase.expected[index][i] {
   139  					t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`,
   140  						testNum,
   141  						testCase.original,
   142  						index,
   143  						cluster,
   144  						testCase.expected[index])
   145  					break GraphemeLoop
   146  				}
   147  			}
   148  
   149  			index++
   150  		}
   151  		if index < len(testCase.expected) {
   152  			t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`,
   153  				testNum,
   154  				testCase.original,
   155  				index,
   156  				len(testCase.expected))
   157  		}
   158  	}
   159  }
   160  
   161  // Test official sentence boundaries Unicode test cases for grapheme clusters
   162  // using the [Step] function.
   163  func TestStepBytesSentence(t *testing.T) {
   164  	for testNum, testCase := range sentenceBreakTestCases {
   165  		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
   166  		testNum,
   167  		strings.TrimSpace(testCase.original),
   168  		testCase.expected,
   169  		decomposed(testCase.original),
   170  		[]rune(testCase.original))*/
   171  		b := []byte(testCase.original)
   172  		state := -1
   173  		var (
   174  			index, boundaries int
   175  			c                 []byte
   176  			growingCluster    []rune
   177  		)
   178  	GraphemeLoop:
   179  		for len(b) > 0 {
   180  			c, b, boundaries, state = Step(b, state)
   181  
   182  			if index >= len(testCase.expected) {
   183  				t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`,
   184  					testNum,
   185  					testCase.original,
   186  					len(testCase.expected))
   187  				break
   188  			}
   189  
   190  			growingCluster = append(growingCluster, []rune(string(c))...)
   191  			if boundaries&MaskSentence == 0 {
   192  				continue
   193  			}
   194  			cluster := growingCluster
   195  			growingCluster = nil
   196  			if len(cluster) != len(testCase.expected[index]) {
   197  				t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`,
   198  					testNum,
   199  					testCase.original,
   200  					index,
   201  					len(cluster),
   202  					cluster,
   203  					len(testCase.expected[index]),
   204  					testCase.expected[index])
   205  				break
   206  			}
   207  			for i, r := range cluster {
   208  				if r != testCase.expected[index][i] {
   209  					t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`,
   210  						testNum,
   211  						testCase.original,
   212  						index,
   213  						cluster,
   214  						testCase.expected[index])
   215  					break GraphemeLoop
   216  				}
   217  			}
   218  
   219  			index++
   220  		}
   221  		if index < len(testCase.expected) {
   222  			t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`,
   223  				testNum,
   224  				testCase.original,
   225  				index,
   226  				len(testCase.expected))
   227  		}
   228  	}
   229  }
   230  
   231  // We don't test the [Step] function for UAX #14 line breaking because the rules
   232  // aren't really compatible. Specifically emoji modifiers and zero-width joiners
   233  // are kept together by the grapheme cluster rules while line breaking rules
   234  // will allow them to be broken apart. The handling of this limitation is
   235  // outlined in Section 8.2 Example 6 of UAX #14.
   236  
   237  // Test official Grapheme Cluster Unicode test cases for grapheme clusters using
   238  // the StepString() function.
   239  func TestStepStringGrapheme(t *testing.T) {
   240  	for testNum, testCase := range graphemeBreakTestCases {
   241  		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
   242  		testNum,
   243  		strings.TrimSpace(testCase.original),
   244  		testCase.expected,
   245  		decomposed(testCase.original),
   246  		[]rune(testCase.original))*/
   247  		str := testCase.original
   248  		state := -1
   249  		var (
   250  			index int
   251  			c     string
   252  		)
   253  	GraphemeLoop:
   254  		for len(str) > 0 {
   255  			c, str, _, state = StepString(str, state)
   256  
   257  			if index >= len(testCase.expected) {
   258  				t.Errorf(`Test case %d %q failed: More grapheme clusters returned than expected %d`,
   259  					testNum,
   260  					testCase.original,
   261  					len(testCase.expected))
   262  				break
   263  			}
   264  
   265  			cluster := []rune(c)
   266  			if len(cluster) != len(testCase.expected[index]) {
   267  				t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d has %d codepoints %x, %d expected %x`,
   268  					testNum,
   269  					testCase.original,
   270  					index,
   271  					len(cluster),
   272  					cluster,
   273  					len(testCase.expected[index]),
   274  					testCase.expected[index])
   275  				break
   276  			}
   277  			for i, r := range cluster {
   278  				if r != testCase.expected[index][i] {
   279  					t.Errorf(`Test case %d %q failed: Grapheme cluster at index %d is %x, expected %x`,
   280  						testNum,
   281  						testCase.original,
   282  						index,
   283  						cluster,
   284  						testCase.expected[index])
   285  					break GraphemeLoop
   286  				}
   287  			}
   288  
   289  			index++
   290  		}
   291  		if index < len(testCase.expected) {
   292  			t.Errorf(`Test case %d %q failed: Fewer grapheme clusters returned (%d) than expected (%d)`,
   293  				testNum,
   294  				testCase.original,
   295  				index,
   296  				len(testCase.expected))
   297  		}
   298  	}
   299  	cluster, rest, boundaries, newState := StepString("", -1)
   300  	if len(cluster) > 0 {
   301  		t.Errorf(`Expected cluster to be empty string, got %q`, cluster)
   302  	}
   303  	if len(rest) > 0 {
   304  		t.Errorf(`Expected rest to be empty string, got %q`, rest)
   305  	}
   306  	if boundaries != 0 {
   307  		t.Errorf(`Expected width to be 0, got %d`, boundaries)
   308  	}
   309  	if newState != 0 {
   310  		t.Errorf(`Expected newState to be 0, got %d`, newState)
   311  	}
   312  }
   313  
   314  // Test official word boundaries Unicode test cases for grapheme clusters using
   315  // the StepString() function.
   316  func TestStepStringWord(t *testing.T) {
   317  	for testNum, testCase := range wordBreakTestCases {
   318  		if testNum == 1700 {
   319  			// This test case reveals an inconsistency in the Unicode rule set,
   320  			// namely the handling of ZWJ within two RI graphemes. (Grapheme
   321  			// rules will restart the RI count, word rules will ignore the ZWJ.)
   322  			// An error has been reported.
   323  			continue
   324  		}
   325  		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
   326  		testNum,
   327  		strings.TrimSpace(testCase.original),
   328  		testCase.expected,
   329  		decomposed(testCase.original),
   330  		[]rune(testCase.original))*/
   331  		str := testCase.original
   332  		state := -1
   333  		var (
   334  			index, boundaries int
   335  			c                 string
   336  			growingCluster    []rune
   337  		)
   338  	GraphemeLoop:
   339  		for len(str) > 0 {
   340  			c, str, boundaries, state = StepString(str, state)
   341  
   342  			if index >= len(testCase.expected) {
   343  				t.Errorf(`Test case %d %q failed: More words returned than expected %d`,
   344  					testNum,
   345  					testCase.original,
   346  					len(testCase.expected))
   347  				break
   348  			}
   349  
   350  			growingCluster = append(growingCluster, []rune(c)...)
   351  			if boundaries&MaskWord == 0 {
   352  				continue
   353  			}
   354  			cluster := growingCluster
   355  			growingCluster = nil
   356  			if len(cluster) != len(testCase.expected[index]) {
   357  				t.Errorf(`Test case %d %q failed: Word at index %d has %d codepoints %x, %d expected %x`,
   358  					testNum,
   359  					testCase.original,
   360  					index,
   361  					len(cluster),
   362  					cluster,
   363  					len(testCase.expected[index]),
   364  					testCase.expected[index])
   365  				break
   366  			}
   367  			for i, r := range cluster {
   368  				if r != testCase.expected[index][i] {
   369  					t.Errorf(`Test case %d %q failed: Word at index %d is %x, expected %x`,
   370  						testNum,
   371  						testCase.original,
   372  						index,
   373  						cluster,
   374  						testCase.expected[index])
   375  					break GraphemeLoop
   376  				}
   377  			}
   378  
   379  			index++
   380  		}
   381  		if index < len(testCase.expected) {
   382  			t.Errorf(`Test case %d %q failed: Fewer words returned (%d) than expected (%d)`,
   383  				testNum,
   384  				testCase.original,
   385  				index,
   386  				len(testCase.expected))
   387  		}
   388  	}
   389  }
   390  
   391  // Test official sentence boundaries Unicode test cases for grapheme clusters
   392  // using the StepString() function.
   393  func TestStepStringSentence(t *testing.T) {
   394  	for testNum, testCase := range sentenceBreakTestCases {
   395  		/*t.Logf(`Test case %d %q: Expecting %x, getting %x, code points %x"`,
   396  		testNum,
   397  		strings.TrimSpace(testCase.original),
   398  		testCase.expected,
   399  		decomposed(testCase.original),
   400  		[]rune(testCase.original))*/
   401  		str := testCase.original
   402  		state := -1
   403  		var (
   404  			index, boundaries int
   405  			c                 string
   406  			growingCluster    []rune
   407  		)
   408  	GraphemeLoop:
   409  		for len(str) > 0 {
   410  			c, str, boundaries, state = StepString(str, state)
   411  
   412  			if index >= len(testCase.expected) {
   413  				t.Errorf(`Test case %d %q failed: More sentences returned than expected %d`,
   414  					testNum,
   415  					testCase.original,
   416  					len(testCase.expected))
   417  				break
   418  			}
   419  
   420  			growingCluster = append(growingCluster, []rune(c)...)
   421  			if boundaries&MaskSentence == 0 {
   422  				continue
   423  			}
   424  			cluster := growingCluster
   425  			growingCluster = nil
   426  			if len(cluster) != len(testCase.expected[index]) {
   427  				t.Errorf(`Test case %d %q failed: Sentence at index %d has %d codepoints %x, %d expected %x`,
   428  					testNum,
   429  					testCase.original,
   430  					index,
   431  					len(cluster),
   432  					cluster,
   433  					len(testCase.expected[index]),
   434  					testCase.expected[index])
   435  				break
   436  			}
   437  			for i, r := range cluster {
   438  				if r != testCase.expected[index][i] {
   439  					t.Errorf(`Test case %d %q failed: Sentence at index %d is %x, expected %x`,
   440  						testNum,
   441  						testCase.original,
   442  						index,
   443  						cluster,
   444  						testCase.expected[index])
   445  					break GraphemeLoop
   446  				}
   447  			}
   448  
   449  			index++
   450  		}
   451  		if index < len(testCase.expected) {
   452  			t.Errorf(`Test case %d %q failed: Fewer sentences returned (%d) than expected (%d)`,
   453  				testNum,
   454  				testCase.original,
   455  				index,
   456  				len(testCase.expected))
   457  		}
   458  	}
   459  }
   460  
   461  // Benchmark the use of the [Step] function.
   462  func BenchmarkStepBytes(b *testing.B) {
   463  	for i := 0; i < b.N; i++ {
   464  		var c []byte
   465  		state := -1
   466  		str := benchmarkBytes
   467  		for len(str) > 0 {
   468  			c, str, _, state = Step(str, state)
   469  			resultRunes = []rune(string(c))
   470  		}
   471  	}
   472  }
   473  
   474  // Benchmark the use of the StepString() function.
   475  func BenchmarkStepString(b *testing.B) {
   476  	for i := 0; i < b.N; i++ {
   477  		var c string
   478  		state := -1
   479  		str := benchmarkStr
   480  		for len(str) > 0 {
   481  			c, str, _, state = StepString(str, state)
   482  			resultRunes = []rune(c)
   483  		}
   484  	}
   485  }
   486  
   487  // Fuzz the StepString function.
   488  func FuzzStepString(f *testing.F) {
   489  	for _, tc := range graphemeBreakTestCases {
   490  		f.Add(tc.original)
   491  	}
   492  	f.Fuzz(func(t *testing.T, orig string) {
   493  		var (
   494  			c          string
   495  			b          []byte
   496  			boundaries int
   497  		)
   498  		str := orig
   499  		state := -1
   500  		for len(str) > 0 {
   501  			c, str, boundaries, state = StepString(str, state)
   502  			b = append(b, []byte(c)...)
   503  		}
   504  
   505  		// Check if the constructed string is the same as the original.
   506  		if string(b) != orig {
   507  			t.Errorf("Fuzzing failed: %q != %q", string(b), orig)
   508  		}
   509  
   510  		// For all other checks, we need to have a non-empty string.
   511  		if orig == "" {
   512  			return
   513  		}
   514  
   515  		// Check end boundaries.
   516  		if boundaries&MaskWord == 0 {
   517  			t.Errorf("String %q does not end on a word boundary (final boundary = %x)", orig, state)
   518  		}
   519  		if boundaries&MaskSentence == 0 {
   520  			t.Errorf("String %q does not end on a sentence boundary (final boundary = %x)", orig, state)
   521  		}
   522  		if boundaries&MaskLine != LineMustBreak {
   523  			t.Errorf("String %q does not end with a mandatory line break (final boundary = %x)", orig, state)
   524  		}
   525  
   526  		// Note: If you have ideas for more useful checks we could add here,
   527  		// please submit them here:
   528  		// https://github.com/rivo/uniseg/issues
   529  	})
   530  }
   531
View as plain text