normalize_test.go

Documentation: golang.org/x/text/unicode/norm

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package norm
     6  
     7  import (
     8  	"bytes"
     9  	"flag"
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"os"
    14  	"os/exec"
    15  	"path/filepath"
    16  	"runtime"
    17  	"strings"
    18  	"testing"
    19  	"unicode/utf8"
    20  
    21  	"golang.org/x/text/transform"
    22  )
    23  
    24  var (
    25  	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
    26  )
    27  
    28  // pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
    29  func pc(s string) []byte {
    30  	b := bytes.NewBuffer(make([]byte, 0, len(s)))
    31  	for i := 0; i < len(s); {
    32  		r, sz := utf8.DecodeRuneInString(s[i:])
    33  		n := 0
    34  		if sz == 1 {
    35  			// Special-case one-byte case to handle repetition for invalid UTF-8.
    36  			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
    37  			}
    38  		} else {
    39  			for _, r2 := range s[i:] {
    40  				if r2 != r {
    41  					break
    42  				}
    43  				n++
    44  			}
    45  		}
    46  		b.WriteString(s[i : i+sz])
    47  		if n > 1 {
    48  			fmt.Fprintf(b, "{%d}", n)
    49  		}
    50  		i += sz * n
    51  	}
    52  	return b.Bytes()
    53  }
    54  
    55  // pidx finds the index from which two strings start to differ, plus context.
    56  // It returns the index and ellipsis if the index is greater than 0.
    57  func pidx(a, b string) (i int, prefix string) {
    58  	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
    59  	}
    60  	if i < 8 {
    61  		return 0, ""
    62  	}
    63  	i -= 3 // ensure taking at least one full rune before the difference.
    64  	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
    65  	}
    66  	return i, "..."
    67  }
    68  
// PositionTest is one table entry for runPosTests. input is handed to the
// function under test, pos is the integer it is expected to return, and buffer
// is the byte output it is expected to return (compared as a string).
type PositionTest struct {
	input  string
	pos    int
	buffer string // expected contents of reorderBuffer, if applicable
}
    74  
// positionFunc is the shape of the functions driven by runPosTests: given the
// shared reorderBuffer and the test input, it returns a position and an
// output buffer that are compared with PositionTest.pos and
// PositionTest.buffer.
type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
    76  
    77  func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
    78  	rb := reorderBuffer{}
    79  	rb.init(f, nil)
    80  	for i, test := range tests {
    81  		rb.reset()
    82  		rb.src = inputString(test.input)
    83  		rb.nsrc = len(test.input)
    84  		pos, out := fn(&rb, test.input)
    85  		if pos != test.pos {
    86  			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
    87  		}
    88  		if outs := string(out); outs != test.buffer {
    89  			k, pfx := pidx(outs, test.buffer)
    90  			t.Errorf("%s:%d: buffer \nwas  %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
    91  		}
    92  	}
    93  }
    94  
    95  func grave(n int) string {
    96  	return rep(0x0300, n)
    97  }
    98  
    99  func rep(r rune, n int) string {
   100  	return strings.Repeat(string(r), n)
   101  }
   102  
// segSize is a local alias for maxByteBufferSize; test tables use it as a
// repeat count for ASCII runs.
const segSize = maxByteBufferSize

// cgj is a short name for GraphemeJoiner. It appears in expected outputs
// after long runs of combining characters.
var cgj = GraphemeJoiner
   106  
// decomposeSegmentTests lists {input, pos, buffer} cases for
// TestDecomposeSegment: pos and buffer are the values decomposeSegmentF is
// expected to return for input.
var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC2", 0, ""},
	{"\xC0", 1, "\xC0"},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{grave(31), 60, grave(30) + cgj},
	{"a" + grave(31), 61, "a" + grave(30) + cgj},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
	{"\u1100\u1161", 6, "\u1100\u1161"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	// Sequence of decomposing characters that are starters and modifiers.
	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},

	{grave(30), 60, grave(30)},
	// U+FF9E is a starter, but decomposes to U+3099, which is not.
	{grave(30) + "\uff9e", 60, grave(30) + cgj},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}
   147  
   148  func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
   149  	rb.initString(NFD, s)
   150  	rb.setFlusher(nil, appendFlush)
   151  	p := decomposeSegment(rb, 0, true)
   152  	return p, rb.out
   153  }
   154  
   155  func TestDecomposeSegment(t *testing.T) {
   156  	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
   157  }
   158  
// firstBoundaryTests lists {input, pos, buffer} cases for TestFirstBoundary.
// pos is the expected result of FirstBoundary and FirstBoundaryInString under
// NFC. buffer is always empty because the adapters return a nil buffer.
var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	{grave(maxNonStarters), 60, ""},
	{grave(maxNonStarters + 1), 60, ""},
}
   181  
   182  func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   183  	return rb.f.form.FirstBoundary([]byte(s)), nil
   184  }
   185  
   186  func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
   187  	return rb.f.form.FirstBoundaryInString(s), nil
   188  }
   189  
   190  func TestFirstBoundary(t *testing.T) {
   191  	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
   192  	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
   193  }
   194  
   195  func TestNextBoundary(t *testing.T) {
   196  	testCases := []struct {
   197  		input string
   198  		atEOF bool
   199  		want  int
   200  	}{
   201  		// no boundary
   202  		{"", true, 0},
   203  		{"", false, -1},
   204  		{"\u0300", true, 2},
   205  		{"\u0300", false, -1},
   206  		{"\x80\x80", true, 1},
   207  		{"\x80\x80", false, 1},
   208  		// illegal runes
   209  		{"\xff", false, 1},
   210  		{"\u0300\xff", false, 2},
   211  		{"\u0300\xc0\x80\x80", false, 2},
   212  		{"\xc2\x80\x80", false, 2},
   213  		{"\xc2", false, -1},
   214  		{"\xc2", true, 1},
   215  		{"a\u0300\xc2", false, -1},
   216  		{"a\u0300\xc2", true, 3},
   217  		// boundaries
   218  		{"a", true, 1},
   219  		{"a", false, -1},
   220  		{"aa", false, 1},
   221  		{"\u0300", true, 2},
   222  		{"\u0300", false, -1},
   223  		{"\u0300a", false, 2},
   224  		// Hangul
   225  		{"\u1103\u1161", true, 6},
   226  		{"\u1103\u1161", false, -1},
   227  		{"\u110B\u1173\u11B7", false, -1},
   228  		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
   229  		{"\u1161\u110B\u1173\u11B7", false, 3},
   230  		{"\u1173\u11B7\u1103\u1161", false, 6},
   231  		// too many combining characters.
   232  		{grave(maxNonStarters - 1), false, -1},
   233  		{grave(maxNonStarters), false, 60},
   234  		{grave(maxNonStarters + 1), false, 60},
   235  	}
   236  
   237  	for _, tc := range testCases {
   238  		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
   239  			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
   240  		}
   241  		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
   242  			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
   243  		}
   244  	}
   245  }
   246  
// decomposeToLastTests lists {input, pos, buffer} cases for
// TestDecomposeToLastBoundary. As produced by decomposeToLast, pos is the
// length of rb.out after decomposeToLastBoundary and buffer is what flushing
// the reorder buffer yields.
var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\u00E0", 3, "a\u0300"},
	// multisegment decompositions (flushes leading segments)
	{"a\u0300\uFDC0", 7, "\u064A"},
	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
	{"\uFDC0" + grave(31), 5, grave(30)},
	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
	// Overflow
	{"\u00E0" + grave(29), 0, "a" + grave(30)},
	{"\u00E0" + grave(30), 2, grave(30)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// maximized buffer
	{"a" + grave(30), 0, "a" + grave(30)},
	// Buffer overflow
	{"a" + grave(31), 3, grave(30)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}
   304  
   305  func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
   306  	rb.setFlusher([]byte(s), appendFlush)
   307  	decomposeToLastBoundary(rb)
   308  	buf := rb.flush(nil)
   309  	return len(rb.out), buf
   310  }
   311  
   312  func TestDecomposeToLastBoundary(t *testing.T) {
   313  	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
   314  }
   315  
// lastBoundaryTests lists {input, pos, buffer} cases for TestLastBoundary.
// pos is the expected result of LastBoundary under NFC. buffer is always
// empty because lastBoundaryF returns a nil buffer.
var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C2", 0, ""},
	{"a\u00C2", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{grave(maxNonStarters - 1), -1, ""},
	// May still be preceded with a non-starter.
	{grave(maxNonStarters), -1, ""},
	// May still need to insert a cgj after the last combiner.
	{grave(maxNonStarters + 1), 2, ""},
	{grave(maxNonStarters + 2), 4, ""},

	{"a" + grave(maxNonStarters-1), 0, ""},
	{"a" + grave(maxNonStarters), 0, ""},
	// May still need to insert a cgj after the last combiner.
	{"a" + grave(maxNonStarters+1), 3, ""},
	{"a" + grave(maxNonStarters+2), 5, ""},
}
   368  
   369  func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
   370  	return rb.f.form.LastBoundary([]byte(s)), nil
   371  }
   372  
   373  func TestLastBoundary(t *testing.T) {
   374  	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
   375  }
   376  
// spanTest is one table entry for runSpanTests. input and atEOF are passed to
// Span (as bytes) and SpanString; n and err are the results both calls are
// expected to return, with err compared using ==.
type spanTest struct {
	input string
	atEOF bool
	n     int
	err   error
}
   383  
// quickSpanTests holds the Span cases that TestSpan runs for both NFD and
// NFC. Each entry is {input, atEOF, n, err}.
var quickSpanTests = []spanTest{
	{"", true, 0, nil},
	// starters
	{"a", true, 1, nil},
	{"abc", true, 3, nil},
	{"\u043Eb", true, 3, nil},
	// incomplete last rune.
	{"\xCC", true, 1, nil},
	{"\xCC", false, 0, transform.ErrShortSrc},
	{"a\xCC", true, 2, nil},
	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
	// incorrectly ordered combining characters
	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
	// have a maximum number of combining characters.
	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},

	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
}
   414  
// quickSpanNFDTests holds the Span cases that TestSpan runs for NFD only.
// Each entry is {input, atEOF, n, err}.
var quickSpanNFDTests = []spanTest{
	// needs decomposing
	{"\u00C0", true, 0, transform.ErrEndOfSpan},
	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
	// correctly ordered combining characters
	{"\u0300", true, 2, nil},
	{"ab\u0300", true, 4, nil},
	{"ab\u0300cd", true, 6, nil},
	{"\u0300cd", true, 4, nil},
	{"\u0316\u0300", true, 4, nil},
	{"ab\u0316\u0300", true, 6, nil},
	{"ab\u0316\u0300cd", true, 8, nil},
	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 6, nil},
	{"\u043E\u0308b", true, 5, nil},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 0, transform.ErrEndOfSpan},
}
   436  
// quickSpanNFCTests holds the Span cases that TestSpan runs for NFC only.
// Each entry is {input, atEOF, n, err}.
var quickSpanNFCTests = []spanTest{
	// okay composed
	{"\u00C0", true, 2, nil},
	{"abc\u00C0", true, 5, nil},
	// correctly ordered combining characters
	// TODO: b may combine with modifiers, which is why this fails. We could
	// make a more precise test that actually checks whether the last
	// character combines. Probably not worth it.
	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
	{"\u00C0\u035D", true, 4, nil},
	// we do not special case leading combining characters
	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
	{"\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
	// Hangul
	{"같은", true, 6, nil},
	{"같은", false, 3, transform.ErrShortSrc},
	// We return the start of the violating segment in case of overflow.
	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
	{grave(30), true, 0, transform.ErrEndOfSpan},
}
   465  
   466  func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
   467  	for i, tc := range testCases {
   468  		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
   469  		ok := t.Run(s, func(t *testing.T) {
   470  			n, err := f.Span([]byte(tc.input), tc.atEOF)
   471  			if n != tc.n || err != tc.err {
   472  				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
   473  			}
   474  		})
   475  		if !ok {
   476  			continue // Don't do the String variant if the Bytes variant failed.
   477  		}
   478  		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
   479  		t.Run(s, func(t *testing.T) {
   480  			n, err := f.SpanString(tc.input, tc.atEOF)
   481  			if n != tc.n || err != tc.err {
   482  				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
   483  			}
   484  		})
   485  	}
   486  }
   487  
   488  func TestSpan(t *testing.T) {
   489  	runSpanTests(t, "NFD", NFD, quickSpanTests)
   490  	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
   491  	runSpanTests(t, "NFC", NFC, quickSpanTests)
   492  	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
   493  }
   494  
// isNormalTests holds IsNormal cases that TestIsNormal runs for every form.
// pos is 1 when the input is expected to be reported as normalized and 0
// otherwise; buffer is unused.
var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
// isNormalNFDTests holds IsNormal cases for the decomposing forms: TestIsNormal
// runs them for NFD and NFKD. pos is 1 for normalized input and 0 otherwise;
// buffer is unused.
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
// isNormalNFCTests holds IsNormal cases for the composing forms: TestIsNormal
// runs them for NFC and NFKC. pos is 1 for normalized input and 0 otherwise;
// buffer is unused.
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// needs composing ('a' combines with U+0300)
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}
   546  
// isNormalNFKXTests holds IsNormal cases that TestIsNormal runs only for the
// compatibility forms NFKD and NFKC.
var isNormalNFKXTests = []PositionTest{
	// Special case.
	{"\u00BC", 0, ""},
}
   551  
   552  func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
   553  	if rb.f.form.IsNormal([]byte(s)) {
   554  		return 1, nil
   555  	}
   556  	return 0, nil
   557  }
   558  
   559  func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
   560  	if rb.f.form.IsNormalString(s) {
   561  		return 1, nil
   562  	}
   563  	return 0, nil
   564  }
   565  
   566  func TestIsNormal(t *testing.T) {
   567  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
   568  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
   569  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
   570  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
   571  	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
   572  	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
   573  	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
   574  	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
   575  	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
   576  	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
   577  }
   578  
   579  func TestIsNormalString(t *testing.T) {
   580  	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
   581  	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
   582  	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
   583  	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
   584  }
   585  
// AppendTest is one table entry for runAppendTests. left seeds the output
// buffer, right is the string handed to the append function, and out is the
// expected content of the returned buffer.
type AppendTest struct {
	left  string
	right string
	out   string
}
   591  
// appendFunc is the shape of the functions driven by runAppendTests. It is
// called with a form, a buffer already holding the left part, and the right
// part as s; the returned bytes are compared with the expected output.
type appendFunc func(f Form, out []byte, s string) []byte
   593  
   594  var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
   595  
   596  func runNormTests(t *testing.T, name string, fn appendFunc) {
   597  	for f := NFC; f <= NFKD; f++ {
   598  		runAppendTests(t, name, f, fn, normTests[f])
   599  	}
   600  }
   601  
   602  func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
   603  	for i, test := range tests {
   604  		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
   605  			id := pc(test.left + test.right)
   606  			if *testn >= 0 && i != *testn {
   607  				return
   608  			}
   609  			t.Run("fn", func(t *testing.T) {
   610  				out := []byte(test.left)
   611  				have := string(fn(f, out, test.right))
   612  				if len(have) != len(test.out) {
   613  					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
   614  				}
   615  				if have != test.out {
   616  					k, pf := pidx(have, test.out)
   617  					t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
   618  				}
   619  			})
   620  
   621  			// Bootstrap by normalizing input. Ensures that the various variants
   622  			// behave the same.
   623  			for g := NFC; g <= NFKD; g++ {
   624  				if f == g {
   625  					continue
   626  				}
   627  				t.Run(fstr[g], func(t *testing.T) {
   628  					want := g.String(test.left + test.right)
   629  					have := string(fn(g, g.AppendString(nil, test.left), test.right))
   630  					if len(have) != len(want) {
   631  						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
   632  					}
   633  					if have != want {
   634  						k, pf := pidx(have, want)
   635  						t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
   636  					}
   637  				})
   638  			}
   639  		})
   640  	}
   641  }
   642  
// normTests holds one append table per form. runNormTests indexes it with a
// Form value, so the order of the entries must match the order of the form
// constants.
var normTests = [][]AppendTest{
	appendTestsNFC,
	appendTestsNFD,
	appendTestsNFKC,
	appendTestsNFKD,
}
   649  
// appendTestsNFC lists {left, right, out} append cases for NFC: out is the
// expected result of appending right to a buffer that already holds left.
var appendTestsNFC = []AppendTest{
	{"", ascii, ascii},
	{"", txt_all, txt_all},
	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},

	// Tests designed for Iter.
	{ // ordering of non-composing combining characters
		"",
		"\u0305\u0316",
		"\u0316\u0305",
	},
	{ // segment overflow
		"",
		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
	},

	{ // Combine across non-blocking non-starters.
		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
		"", "a\u0327\u0325", "\u1e01\u0327",
	},

	{ // Jamo V+T does not combine.
		"",
		"\u1161\u11a8",
		"\u1161\u11a8",
	},

	// Stability tests: see https://www.unicode.org/review/pr-29.html.
	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
	{"", "\u0b47\u0b3e", "\u0b4b"},
	{"", "\u1100\u1161", "\uac00"},

	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
	{ // 0d4a starts a new segment.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
	},

	{ // Split combining characters.
		// TODO: don't insert CGJ before starters.
		"",
		"\u0d46" + strings.Repeat("\u0d3e", 31),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ // Split combining characters.
		"",
		"\u0d4a" + strings.Repeat("\u0d3e", 30),
		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
	},

	{ //  https://golang.org/issues/20079
		"",
		"\xeb\u0344",
		"\xeb\u0308\u0301",
	},

	{ //  https://golang.org/issues/20079
		"",
		"\uac00" + strings.Repeat("\u0300", 30),
		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
	},

	{ //  https://golang.org/issues/20079
		"",
		"\xeb" + strings.Repeat("\u0300", 31),
		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
	},
}
   724  
// appendTestsNFD is the NFD slot of normTests. It is currently empty, so the
// direct "fn" check runs no NFD cases.
var appendTestsNFD = []AppendTest{
	// TODO: Move some of the tests here.
}
   728  
// appendTestsNFKC lists {left, right, out} append cases for NFKC: out is the
// expected result of appending right to a buffer that already holds left.
var appendTestsNFKC = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{"", grave(34), grave(30) + cgj + grave(4)},
	{"", grave(36), grave(30) + cgj + grave(6)},
	{grave(29), grave(5), grave(30) + cgj + grave(4)},
	{grave(30), grave(4), grave(30) + cgj + grave(4)},
	{grave(30), grave(3), grave(30) + cgj + grave(3)},
	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
	// - First rune has a trailing non-starter.
	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
	//   inserted even when FF9E starts a new segment.
	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
	// - Many non-starter decompositions in a row causing overflow.
	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},

	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},

	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},

	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},

	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
	// large input.
	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
	{"", "\u0041\u0307\u0304", "\u01E0"},
}
   826  
// appendTestsNFKD is a table of AppendTest cases. Every entry in this table
// leaves the first field empty; the second field is presumably the text fed
// to the operation under test and the third the expected result (AppendTest
// is declared outside this section — confirm the field meanings there).
// NOTE(review): the name suggests the table is run under NFKD; the consumer
// is not visible here.
//
// Most cases deal with long runs of combining marks: the expected strings
// break such runs up with cgj after at most 30 marks. The per-entry comments
// name the specific situation being exercised.
var appendTestsNFKD = []AppendTest{
	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},

	{ // segment overflow on unchanged character
		"",
		"a" + grave(64) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
	},
	{ // segment overflow on unchanged character + start value
		"",
		"a" + grave(98) + "\u0316",
		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
	},
	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
		"",
		"a" + grave(59) + "\u0340",
		"a" + grave(30) + cgj + grave(30),
	},
	{ // segment overflow on non-starter decomposition
		"",
		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after ASCII overflow
		"",
		rep('a', segSize) + grave(32) + "\u0320",
		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
	},
	{ // Jamo overflow
		"",
		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul
		"",
		"\uac00",
		"\u1100\u1161",
	},
	{ // Hangul overflow
		"",
		"\uac00" + grave(32) + "\u0320",
		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		"\uac00\uac00" + grave(32) + "\u0320",
		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // Hangul overflow in Hangul mode.
		"",
		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
	},
	{ // start value after cc=0
		"",
		"您您" + grave(34) + "\u0320",
		"您您" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{ // start value after normalization
		"",
		"\u0300\u0320a" + grave(34) + "\u0320",
		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
	},
	{
		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
		"",
		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
	},
}
   897  
   898  func TestAppend(t *testing.T) {
   899  	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
   900  		return f.Append(out, []byte(s)...)
   901  	})
   902  }
   903  
   904  func TestAppendString(t *testing.T) {
   905  	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
   906  		return f.AppendString(out, s)
   907  	})
   908  }
   909  
   910  func TestBytes(t *testing.T) {
   911  	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
   912  		buf := []byte{}
   913  		buf = append(buf, out...)
   914  		buf = append(buf, s...)
   915  		return f.Bytes(buf)
   916  	})
   917  }
   918  
   919  func TestString(t *testing.T) {
   920  	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
   921  		outs := string(out) + s
   922  		return []byte(f.String(outs))
   923  	})
   924  }
   925  
   926  func runNM(code string) (string, error) {
   927  	// Write the file.
   928  	tmpdir, err := os.MkdirTemp(os.TempDir(), "normalize_test")
   929  	if err != nil {
   930  		return "", fmt.Errorf("failed to create tmpdir: %v", err)
   931  	}
   932  	defer os.RemoveAll(tmpdir)
   933  	goTool := filepath.Join(runtime.GOROOT(), "bin", "go")
   934  	filename := filepath.Join(tmpdir, "main.go")
   935  	if err := os.WriteFile(filename, []byte(code), 0644); err != nil {
   936  		return "", fmt.Errorf("failed to write main.go: %v", err)
   937  	}
   938  	outputFile := filepath.Join(tmpdir, "main")
   939  
   940  	// Build the binary.
   941  	out, err := exec.Command(goTool, "build", "-o", outputFile, filename).CombinedOutput()
   942  	if err != nil {
   943  		return "", fmt.Errorf("failed to execute command: %v", err)
   944  	}
   945  
   946  	// Get the symbols.
   947  	out, err = exec.Command(goTool, "tool", "nm", outputFile).CombinedOutput()
   948  	return string(out), err
   949  }
   950  
   951  func TestLinking(t *testing.T) {
   952  	const prog = `
   953  	package main
   954  	import "fmt"
   955  	import "golang.org/x/text/unicode/norm"
   956  	func main() { fmt.Println(norm.%s) }
   957  	`
   958  
   959  	baseline, errB := runNM(fmt.Sprintf(prog, "MaxSegmentSize"))
   960  	withTables, errT := runNM(fmt.Sprintf(prog, `NFC.String("")`))
   961  	if errB != nil || errT != nil {
   962  		t.Skipf("TestLinking failed: %v and %v", errB, errT)
   963  	}
   964  
   965  	symbols := []string{"norm.formTable", "norm.nfkcValues", "norm.decomps"}
   966  	for _, symbol := range symbols {
   967  		if strings.Contains(baseline, symbol) {
   968  			t.Errorf("found: %q unexpectedly", symbol)
   969  		}
   970  		if !strings.Contains(withTables, symbol) {
   971  			t.Errorf("didn't find: %q unexpectedly", symbol)
   972  		}
   973  	}
   974  }
   975  
   976  func appendBench(f Form, in []byte) func() {
   977  	buf := make([]byte, 0, 4*len(in))
   978  	return func() {
   979  		f.Append(buf, in...)
   980  	}
   981  }
   982  
   983  func bytesBench(f Form, in []byte) func() {
   984  	return func() {
   985  		f.Bytes(in)
   986  	}
   987  }
   988  
   989  func iterBench(f Form, in []byte) func() {
   990  	iter := Iter{}
   991  	return func() {
   992  		iter.Init(f, in)
   993  		for !iter.Done() {
   994  			iter.Next()
   995  		}
   996  	}
   997  }
   998  
   999  func transformBench(f Form, in []byte) func() {
  1000  	buf := make([]byte, 4*len(in))
  1001  	return func() {
  1002  		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
  1003  			log.Panic(n, len(in), err)
  1004  		}
  1005  	}
  1006  }
  1007  
  1008  func readerBench(f Form, in []byte) func() {
  1009  	buf := make([]byte, 4*len(in))
  1010  	return func() {
  1011  		r := f.Reader(bytes.NewReader(in))
  1012  		var err error
  1013  		for err == nil {
  1014  			_, err = r.Read(buf)
  1015  		}
  1016  		if err != io.EOF {
  1017  			panic("")
  1018  		}
  1019  	}
  1020  }
  1021  
  1022  func writerBench(f Form, in []byte) func() {
  1023  	buf := make([]byte, 0, 4*len(in))
  1024  	return func() {
  1025  		r := f.Writer(bytes.NewBuffer(buf))
  1026  		if _, err := r.Write(in); err != nil {
  1027  			panic("")
  1028  		}
  1029  	}
  1030  }
  1031  
  1032  func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
  1033  	bm = append(bm, appendBench(f, in))
  1034  	bm = append(bm, iterBench(f, in))
  1035  	bm = append(bm, transformBench(f, in))
  1036  	bm = append(bm, readerBench(f, in))
  1037  	bm = append(bm, writerBench(f, in))
  1038  	return bm
  1039  }
  1040  
  1041  func doFormBenchmark(b *testing.B, inf, f Form, s string) {
  1042  	b.StopTimer()
  1043  	in := inf.Bytes([]byte(s))
  1044  	bm := appendBenchmarks(nil, f, in)
  1045  	b.SetBytes(int64(len(in) * len(bm)))
  1046  	b.StartTimer()
  1047  	for i := 0; i < b.N; i++ {
  1048  		for _, fn := range bm {
  1049  			fn()
  1050  		}
  1051  	}
  1052  }
  1053  
  1054  func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
  1055  	b.StopTimer()
  1056  	fn := f(NFC, s)
  1057  	b.SetBytes(int64(len(s)))
  1058  	b.StartTimer()
  1059  	for i := 0; i < b.N; i++ {
  1060  		fn()
  1061  	}
  1062  }
  1063  
// Small and ASCII-only inputs shared by the benchmarks below.
var (
	// smallNoChange is a short word whose only non-ASCII letter is ö.
	// NOTE(review): the name implies the operation under test leaves it
	// untouched (doSingle always uses NFC) — confirm the ö is precomposed.
	smallNoChange = []byte("nörmalization")
	// smallChange spells the same word with a capital N and with the ö
	// written as 'o' followed by U+0308.
	smallChange   = []byte("No\u0308rmalization")
	// ascii is one ASCII-only sentence repeated 500 times.
	ascii         = strings.Repeat("There is nothing to change here! ", 500)
)
  1069  
  1070  func lowerBench(f Form, in []byte) func() {
  1071  	// Use package strings instead of bytes as it doesn't allocate memory
  1072  	// if there aren't any changes.
  1073  	s := string(in)
  1074  	return func() {
  1075  		strings.ToLower(s)
  1076  	}
  1077  }
  1078  
// BenchmarkLowerCaseNoChange times strings.ToLower (through lowerBench and
// doSingle) on smallNoChange.
func BenchmarkLowerCaseNoChange(b *testing.B) {
	doSingle(b, lowerBench, smallNoChange)
}
// BenchmarkLowerCaseChange times strings.ToLower (through lowerBench and
// doSingle) on smallChange.
func BenchmarkLowerCaseChange(b *testing.B) {
	doSingle(b, lowerBench, smallChange)
}
  1085  
  1086  func quickSpanBench(f Form, in []byte) func() {
  1087  	return func() {
  1088  		f.QuickSpan(in)
  1089  	}
  1090  }
  1091  
// BenchmarkQuickSpanChangeNFC times NFC.QuickSpan (through quickSpanBench and
// doSingle) on smallNoChange.
// NOTE(review): the name says "Change" but the input is smallNoChange —
// confirm which of the two is intended.
func BenchmarkQuickSpanChangeNFC(b *testing.B) {
	doSingle(b, quickSpanBench, smallNoChange)
}
  1095  
// BenchmarkBytesNoChangeNFC times NFC.Bytes (through bytesBench and doSingle)
// on smallNoChange.
func BenchmarkBytesNoChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallNoChange)
}
// BenchmarkBytesChangeNFC times NFC.Bytes (through bytesBench and doSingle)
// on smallChange.
func BenchmarkBytesChangeNFC(b *testing.B) {
	doSingle(b, bytesBench, smallChange)
}
  1102  
// BenchmarkAppendNoChangeNFC times NFC.Append (through appendBench and
// doSingle) on smallNoChange.
func BenchmarkAppendNoChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallNoChange)
}
// BenchmarkAppendChangeNFC times NFC.Append (through appendBench and
// doSingle) on smallChange.
func BenchmarkAppendChangeNFC(b *testing.B) {
	doSingle(b, appendBench, smallChange)
}
// BenchmarkAppendLargeNFC times NFC.Append (through appendBench and doSingle)
// on txt_all_bytes, the concatenation of all sample texts.
func BenchmarkAppendLargeNFC(b *testing.B) {
	doSingle(b, appendBench, txt_all_bytes)
}
  1112  
// BenchmarkIterNoChangeNFC times a full Iter pass in NFC (through iterBench
// and doSingle) over smallNoChange.
func BenchmarkIterNoChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallNoChange)
}
// BenchmarkIterChangeNFC times a full Iter pass in NFC (through iterBench and
// doSingle) over smallChange.
func BenchmarkIterChangeNFC(b *testing.B) {
	doSingle(b, iterBench, smallChange)
}
// BenchmarkIterLargeNFC times a full Iter pass in NFC (through iterBench and
// doSingle) over txt_all_bytes, the concatenation of all sample texts.
func BenchmarkIterLargeNFC(b *testing.B) {
	doSingle(b, iterBench, txt_all_bytes)
}
  1122  
// BenchmarkTransformNoChangeNFC times NFC.Transform (through transformBench
// and doSingle) on smallNoChange.
func BenchmarkTransformNoChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallNoChange)
}
// BenchmarkTransformChangeNFC times NFC.Transform (through transformBench and
// doSingle) on smallChange.
func BenchmarkTransformChangeNFC(b *testing.B) {
	doSingle(b, transformBench, smallChange)
}
// BenchmarkTransformLargeNFC times NFC.Transform (through transformBench and
// doSingle) on txt_all_bytes, the concatenation of all sample texts.
func BenchmarkTransformLargeNFC(b *testing.B) {
	doSingle(b, transformBench, txt_all_bytes)
}
  1132  
// BenchmarkNormalizeAsciiNFC runs doFormBenchmark on the ascii text: the
// input is first passed through NFC.Bytes and then processed in NFC.
func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
// BenchmarkNormalizeAsciiNFD runs doFormBenchmark on the ascii text: the
// input is first passed through NFC.Bytes and then processed in NFD.
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
// BenchmarkNormalizeAsciiNFKC runs doFormBenchmark on the ascii text: the
// input is first passed through NFC.Bytes and then processed in NFKC.
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
// BenchmarkNormalizeAsciiNFKD runs doFormBenchmark on the ascii text: the
// input is first passed through NFC.Bytes and then processed in NFKD.
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}
  1145  
// BenchmarkNormalizeNFC2NFC runs doFormBenchmark on txt_all: the input is
// first converted with NFC.Bytes and then processed in NFC.
func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
// BenchmarkNormalizeNFC2NFD runs doFormBenchmark on txt_all: the input is
// first converted with NFC.Bytes and then processed in NFD.
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
// BenchmarkNormalizeNFD2NFC runs doFormBenchmark on txt_all: the input is
// first converted with NFD.Bytes and then processed in NFC.
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
// BenchmarkNormalizeNFD2NFD runs doFormBenchmark on txt_all: the input is
// first converted with NFD.Bytes and then processed in NFD.
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}
  1158  
// BenchmarkNormalizeHangulNFC2NFC runs doFormBenchmark on the Korean sample
// txt_kr: the input is first converted with NFC.Bytes and then processed in
// NFC.
//
// Hangul is often special-cased, so we test it separately.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
// BenchmarkNormalizeHangulNFC2NFD runs doFormBenchmark on the Korean sample
// txt_kr: the input is first converted with NFC.Bytes and then processed in
// NFD.
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
// BenchmarkNormalizeHangulNFD2NFC runs doFormBenchmark on the Korean sample
// txt_kr: the input is first converted with NFD.Bytes and then processed in
// NFC.
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
// BenchmarkNormalizeHangulNFD2NFD runs doFormBenchmark on the Korean sample
// txt_kr: the input is first converted with NFD.Bytes and then processed in
// NFD.
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}
  1172  
// forms lists the four normalization forms that doTextBenchmark iterates
// over, in the order NFC, NFD, NFKC, NFKD.
var forms = []Form{NFC, NFD, NFKC, NFKD}
  1174  
  1175  func doTextBenchmark(b *testing.B, s string) {
  1176  	b.StopTimer()
  1177  	in := []byte(s)
  1178  	bm := []func(){}
  1179  	for _, f := range forms {
  1180  		bm = appendBenchmarks(bm, f, in)
  1181  	}
  1182  	b.SetBytes(int64(len(s) * len(bm)))
  1183  	b.StartTimer()
  1184  	for i := 0; i < b.N; i++ {
  1185  		for _, f := range bm {
  1186  			f()
  1187  		}
  1188  	}
  1189  }
  1190  
// BenchmarkCanonicalOrdering runs doTextBenchmark (all four forms) on
// txt_canon.
func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
// BenchmarkExtendedLatin runs doTextBenchmark (all four forms) on the
// Vietnamese sample txt_vn.
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
// BenchmarkMiscTwoByteUtf8 runs doTextBenchmark (all four forms) on
// twoByteUtf8, the concatenation of txt_ru, txt_gr, txt_ar and txt_il.
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
// BenchmarkMiscThreeByteUtf8 runs doTextBenchmark (all four forms) on
// threeByteUtf8, which is the Thai sample txt_th.
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
// BenchmarkHangul runs doTextBenchmark (all four forms) on the Korean sample
// txt_kr.
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
// BenchmarkJapanese runs doTextBenchmark (all four forms) on the Japanese
// sample txt_jp.
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
// BenchmarkChinese runs doTextBenchmark (all four forms) on the Chinese
// sample txt_cn.
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
// BenchmarkOverflow runs doTextBenchmark (all four forms) on overflow, a
// string made of several thousand consecutive combining marks.
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}
  1215  
  1216  var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
  1217  
  1218  // Tests sampled from the Canonical ordering tests (Part 2) of
  1219  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
  1220  const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
  1221  \u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
  1222  \u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
  1223  \u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062 
  1224  \u0061\u059A\u0316\u302A\u0339       \u0061\u0341\u0315\u0300\u05AE\u0062
  1225  \u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
  1226  \u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
  1227  \u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
  1228  \u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
  1229  \u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
  1230  \u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
  1231  \u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
  1232  \u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
  1233  \u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
  1234  \u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
  1235  \u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
  1236  \u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
  1237  \u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
  1238  
// txt_vn is a Vietnamese sample text. It is the input of
// BenchmarkExtendedLatin and the first component of txt_all.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả. 
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ 
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc 
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không 
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
  1250  
// txt_ru is a Russian sample text and the first component of twoByteUtf8.
// Note that its last line is in Greek, not Russian.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`
  1258  
// txt_gr is a Greek sample text and a component of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`
  1267  
// txt_ar is an Arabic sample text and a component of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`
  1275  
// txt_il is a Hebrew sample text and the last component of twoByteUtf8.
// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
  1282  
// twoByteUtf8 concatenates the Russian, Greek, Arabic and Hebrew samples,
// in that order, without any separator. It is the input of
// BenchmarkMiscTwoByteUtf8 and a component of txt_all.
// NOTE(review): the name presumably refers to the letters of these scripts
// taking two bytes each in UTF-8; the texts also contain ASCII and other
// characters.
const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
  1284  
// txt_kr is a Korean sample text. It is the input of the Hangul benchmarks
// and the last component of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다). 
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`
  1292  
// txt_th is a Thai sample text; threeByteUtf8 is defined as this constant.
// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
  1300  
// threeByteUtf8 is the Thai sample txt_th under another name. It is the
// input of BenchmarkMiscThreeByteUtf8 and a component of txt_all.
// NOTE(review): the name presumably refers to Thai letters taking three
// bytes each in UTF-8.
const threeByteUtf8 = txt_th
  1302  
// txt_jp is a Japanese sample text. It is the input of BenchmarkJapanese and
// a component of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`
  1309  
// txt_cn is a Chinese sample text. It is the input of BenchmarkChinese and
// the first component of txt_cjk.
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由： 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件：
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作，
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
  1317  
// txt_cjk concatenates the Chinese, Japanese and Korean samples, in that
// order, without any separator.
const txt_cjk = txt_cn + txt_jp + txt_kr

// txt_all concatenates every sample text except txt_canon: Vietnamese, the
// twoByteUtf8 group, the threeByteUtf8 group and the CJK group. It is the
// input of the BenchmarkNormalizeNFx2NFy benchmarks.
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
  1320  
// txt_all_bytes is txt_all as a byte slice, converted once at package
// initialization. It is the input of the Benchmark*LargeNFC benchmarks.
var txt_all_bytes = []byte(txt_all)
  1322
View as plain text