...

Source file src/golang.org/x/net/html/parse_test.go

Documentation: golang.org/x/net/html

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package html
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"os"
    14  	"path/filepath"
    15  	"runtime"
    16  	"sort"
    17  	"strings"
    18  	"testing"
    19  
    20  	"golang.org/x/net/html/atom"
    21  )
    22  
    23  type testAttrs struct {
    24  	text, want, context string
    25  	scripting           bool
    26  }
    27  
    28  // readParseTest reads a single test case from r.
    29  func readParseTest(r *bufio.Reader) (*testAttrs, error) {
    30  	ta := &testAttrs{scripting: true}
    31  	line, err := r.ReadSlice('\n')
    32  	if err != nil {
    33  		return nil, err
    34  	}
    35  	var b []byte
    36  
    37  	// Read the HTML.
    38  	if string(line) != "#data\n" {
    39  		return nil, fmt.Errorf(`got %q want "#data\n"`, line)
    40  	}
    41  	for {
    42  		line, err = r.ReadSlice('\n')
    43  		if err != nil {
    44  			return nil, err
    45  		}
    46  		if line[0] == '#' {
    47  			break
    48  		}
    49  		b = append(b, line...)
    50  	}
    51  	ta.text = strings.TrimSuffix(string(b), "\n")
    52  	b = b[:0]
    53  
    54  	// Skip the error list.
    55  	if string(line) != "#errors\n" {
    56  		return nil, fmt.Errorf(`got %q want "#errors\n"`, line)
    57  	}
    58  	for {
    59  		line, err = r.ReadSlice('\n')
    60  		if err != nil {
    61  			return nil, err
    62  		}
    63  		if line[0] == '#' {
    64  			break
    65  		}
    66  	}
    67  
    68  	// Skip the new-errors list.
    69  	if string(line) == "#new-errors\n" {
    70  		for {
    71  			line, err = r.ReadSlice('\n')
    72  			if err != nil {
    73  				return nil, err
    74  			}
    75  			if line[0] == '#' {
    76  				break
    77  			}
    78  		}
    79  	}
    80  
    81  	if ls := string(line); strings.HasPrefix(ls, "#script-") {
    82  		switch {
    83  		case strings.HasSuffix(ls, "-on\n"):
    84  			ta.scripting = true
    85  		case strings.HasSuffix(ls, "-off\n"):
    86  			ta.scripting = false
    87  		default:
    88  			return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line)
    89  		}
    90  		for {
    91  			line, err = r.ReadSlice('\n')
    92  			if err != nil {
    93  				return nil, err
    94  			}
    95  			if line[0] == '#' {
    96  				break
    97  			}
    98  		}
    99  	}
   100  
   101  	if string(line) == "#document-fragment\n" {
   102  		line, err = r.ReadSlice('\n')
   103  		if err != nil {
   104  			return nil, err
   105  		}
   106  		ta.context = strings.TrimSpace(string(line))
   107  		line, err = r.ReadSlice('\n')
   108  		if err != nil {
   109  			return nil, err
   110  		}
   111  	}
   112  
   113  	// Read the dump of what the parse tree should be.
   114  	if string(line) != "#document\n" {
   115  		return nil, fmt.Errorf(`got %q want "#document\n"`, line)
   116  	}
   117  	inQuote := false
   118  	for {
   119  		line, err = r.ReadSlice('\n')
   120  		if err != nil && err != io.EOF {
   121  			return nil, err
   122  		}
   123  		trimmed := bytes.Trim(line, "| \n")
   124  		if len(trimmed) > 0 {
   125  			if line[0] == '|' && trimmed[0] == '"' {
   126  				inQuote = true
   127  			}
   128  			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
   129  				inQuote = false
   130  			}
   131  		}
   132  		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
   133  			break
   134  		}
   135  		b = append(b, line...)
   136  	}
   137  	ta.want = string(b)
   138  	return ta, nil
   139  }
   140  
   141  func dumpIndent(w io.Writer, level int) {
   142  	io.WriteString(w, "| ")
   143  	for i := 0; i < level; i++ {
   144  		io.WriteString(w, "  ")
   145  	}
   146  }
   147  
   148  type sortedAttributes []Attribute
   149  
   150  func (a sortedAttributes) Len() int {
   151  	return len(a)
   152  }
   153  
   154  func (a sortedAttributes) Less(i, j int) bool {
   155  	if a[i].Namespace != a[j].Namespace {
   156  		return a[i].Namespace < a[j].Namespace
   157  	}
   158  	return a[i].Key < a[j].Key
   159  }
   160  
   161  func (a sortedAttributes) Swap(i, j int) {
   162  	a[i], a[j] = a[j], a[i]
   163  }
   164  
   165  func dumpLevel(w io.Writer, n *Node, level int) error {
   166  	dumpIndent(w, level)
   167  	level++
   168  	switch n.Type {
   169  	case ErrorNode:
   170  		return errors.New("unexpected ErrorNode")
   171  	case DocumentNode:
   172  		return errors.New("unexpected DocumentNode")
   173  	case ElementNode:
   174  		if n.Namespace != "" {
   175  			fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
   176  		} else {
   177  			fmt.Fprintf(w, "<%s>", n.Data)
   178  		}
   179  		attr := sortedAttributes(n.Attr)
   180  		sort.Sort(attr)
   181  		for _, a := range attr {
   182  			io.WriteString(w, "\n")
   183  			dumpIndent(w, level)
   184  			if a.Namespace != "" {
   185  				fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
   186  			} else {
   187  				fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
   188  			}
   189  		}
   190  		if n.Namespace == "" && n.DataAtom == atom.Template {
   191  			io.WriteString(w, "\n")
   192  			dumpIndent(w, level)
   193  			level++
   194  			io.WriteString(w, "content")
   195  		}
   196  	case TextNode:
   197  		fmt.Fprintf(w, `"%s"`, n.Data)
   198  	case CommentNode:
   199  		fmt.Fprintf(w, "<!-- %s -->", n.Data)
   200  	case DoctypeNode:
   201  		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
   202  		if n.Attr != nil {
   203  			var p, s string
   204  			for _, a := range n.Attr {
   205  				switch a.Key {
   206  				case "public":
   207  					p = a.Val
   208  				case "system":
   209  					s = a.Val
   210  				}
   211  			}
   212  			if p != "" || s != "" {
   213  				fmt.Fprintf(w, ` "%s"`, p)
   214  				fmt.Fprintf(w, ` "%s"`, s)
   215  			}
   216  		}
   217  		io.WriteString(w, ">")
   218  	case scopeMarkerNode:
   219  		return errors.New("unexpected scopeMarkerNode")
   220  	default:
   221  		return errors.New("unknown node type")
   222  	}
   223  	io.WriteString(w, "\n")
   224  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   225  		if err := dumpLevel(w, c, level); err != nil {
   226  			return err
   227  		}
   228  	}
   229  	return nil
   230  }
   231  
   232  func dump(n *Node) (string, error) {
   233  	if n == nil || n.FirstChild == nil {
   234  		return "", nil
   235  	}
   236  	var b bytes.Buffer
   237  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   238  		if err := dumpLevel(&b, c, 0); err != nil {
   239  			return "", err
   240  		}
   241  	}
   242  	return b.String(), nil
   243  }
   244  
   245  var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
   246  
   247  func TestParser(t *testing.T) {
   248  	for _, testDataDir := range testDataDirs {
   249  		testFiles, err := filepath.Glob(testDataDir + "*.dat")
   250  		if err != nil {
   251  			t.Fatal(err)
   252  		}
   253  		for _, tf := range testFiles {
   254  			f, err := os.Open(tf)
   255  			if err != nil {
   256  				t.Fatal(err)
   257  			}
   258  			defer f.Close()
   259  			r := bufio.NewReader(f)
   260  
   261  			for i := 0; ; i++ {
   262  				ta, err := readParseTest(r)
   263  				if err == io.EOF {
   264  					break
   265  				}
   266  				if err != nil {
   267  					t.Fatal(err)
   268  				}
   269  				if parseTestBlacklist[ta.text] {
   270  					continue
   271  				}
   272  
   273  				err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
   274  
   275  				if err != nil {
   276  					t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
   277  				}
   278  			}
   279  		}
   280  	}
   281  }
   282  
   283  // Issue 16318
   284  func TestParserWithoutScripting(t *testing.T) {
   285  	text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
   286  	want := `| <html>
   287  |   <head>
   288  |     <noscript>
   289  |   <body>
   290  |     <img>
   291  |       src="https://golang.org/doc/gopher/frontpage.png"
   292  |     <p>
   293  |       <img>
   294  |         src="https://golang.org/doc/gopher/doc.png"
   295  `
   296  
   297  	if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
   298  		t.Errorf("test with scripting is disabled, %q, %s", text, err)
   299  	}
   300  }
   301  
   302  // testParseCase tests one test case from the test files. If the test does not
   303  // pass, it returns an error that explains the failure.
   304  // text is the HTML to be parsed, want is a dump of the correct parse tree,
   305  // and context is the name of the context node, if any.
   306  func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
   307  	defer func() {
   308  		if x := recover(); x != nil {
   309  			switch e := x.(type) {
   310  			case error:
   311  				err = e
   312  			default:
   313  				err = fmt.Errorf("%v", e)
   314  			}
   315  		}
   316  	}()
   317  
   318  	var doc *Node
   319  	if context == "" {
   320  		doc, err = ParseWithOptions(strings.NewReader(text), opts...)
   321  		if err != nil {
   322  			return err
   323  		}
   324  	} else {
   325  		namespace := ""
   326  		if i := strings.IndexByte(context, ' '); i >= 0 {
   327  			namespace, context = context[:i], context[i+1:]
   328  		}
   329  		contextNode := &Node{
   330  			Data:      context,
   331  			DataAtom:  atom.Lookup([]byte(context)),
   332  			Namespace: namespace,
   333  			Type:      ElementNode,
   334  		}
   335  		nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
   336  		if err != nil {
   337  			return err
   338  		}
   339  		doc = &Node{
   340  			Type: DocumentNode,
   341  		}
   342  		for _, n := range nodes {
   343  			doc.AppendChild(n)
   344  		}
   345  	}
   346  
   347  	if err := checkTreeConsistency(doc); err != nil {
   348  		return err
   349  	}
   350  
   351  	got, err := dump(doc)
   352  	if err != nil {
   353  		return err
   354  	}
   355  	// Compare the parsed tree to the #document section.
   356  	if got != want {
   357  		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
   358  	}
   359  
   360  	if renderTestBlacklist[text] || context != "" {
   361  		return nil
   362  	}
   363  
   364  	// Check that rendering and re-parsing results in an identical tree.
   365  	pr, pw := io.Pipe()
   366  	go func() {
   367  		pw.CloseWithError(Render(pw, doc))
   368  	}()
   369  	doc1, err := ParseWithOptions(pr, opts...)
   370  	if err != nil {
   371  		return err
   372  	}
   373  	got1, err := dump(doc1)
   374  	if err != nil {
   375  		return err
   376  	}
   377  	if got != got1 {
   378  		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
   379  	}
   380  
   381  	return nil
   382  }
   383  
// parseTestBlacklist is the set of test inputs, keyed by their exact HTML
// text, that the parse test skips. These inputs are simply skipped because
// the test would otherwise fail on them.
var parseTestBlacklist = map[string]bool{
	// See the a.Template TODO in inHeadIM.
	`<math><template><mo><template>`:                                     true,
	`<template><svg><foo><template><foreignObject><div></template><div>`: true,
}
   391  
// renderTestBlacklist is the set of test inputs, keyed by their exact HTML
// text, that are excluded from the render test.
//
// Some test inputs result in parse trees that are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree, so these inputs are
// only checked against the expected parse tree.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// The same thing with a <p>:
	`<p><table></p>`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`:                                     true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
	`<template><a><table><a>`:                                                 true,
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`:                                   true,
	`<!doctype html><table><plaintext></plaintext>`:            true,
	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `:          true,
	`<!doctype html><script><!--<script <`:         true,
	`<!doctype html><script><!--<script <a`:        true,
	`<!doctype html><script><!--<script </`:        true,
	`<!doctype html><script><!--<script </s`:       true,
	`<!doctype html><script><!--<script </script`:  true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`:         true,
	`<!doctype html><script><!--<script -a`:        true,
	`<!doctype html><script><!--<script -<`:        true,
	`<!doctype html><script><!--<script --`:        true,
	`<!doctype html><script><!--<script --a`:       true,
	`<!doctype html><script><!--<script --<`:       true,
	`<script><!--<script `:                         true,
	`<script><!--<script <a`:                       true,
	`<script><!--<script </script`:                 true,
	`<script><!--<script </scripta`:                true,
	`<script><!--<script -`:                        true,
	`<script><!--<script -a`:                       true,
	`<script><!--<script --`:                       true,
	`<script><!--<script --a`:                      true,
	`<script><!--<script <`:                        true,
	`<script><!--<script </`:                       true,
	`<script><!--<script </s`:                      true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`:                       true,
	`<table><math><select><mi><select></table>`:               true,
	`<!doctype html><table><colgroup><plaintext></plaintext>`: true,
	`<!doctype html><svg><plaintext>a</plaintext>b`:           true,
}
   449  
   450  func TestNodeConsistency(t *testing.T) {
   451  	// inconsistentNode is a Node whose DataAtom and Data do not agree.
   452  	inconsistentNode := &Node{
   453  		Type:     ElementNode,
   454  		DataAtom: atom.Frameset,
   455  		Data:     "table",
   456  	}
   457  	if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
   458  		t.Errorf("got nil error, want non-nil")
   459  	}
   460  }
   461  
   462  func TestParseFragmentWithNilContext(t *testing.T) {
   463  	// This shouldn't panic.
   464  	ParseFragment(strings.NewReader("<p>hello</p>"), nil)
   465  }
   466  
   467  func TestParseFragmentForeignContentTemplates(t *testing.T) {
   468  	srcs := []string{
   469  		"<math><html><template><mn><template></template></template>",
   470  		"<math><math><head><mi><template>",
   471  		"<svg><head><title><select><input>",
   472  	}
   473  	for _, src := range srcs {
   474  		// The next line shouldn't infinite-loop.
   475  		ParseFragment(strings.NewReader(src), nil)
   476  	}
   477  }
   478  
   479  func BenchmarkParser(b *testing.B) {
   480  	buf, err := os.ReadFile("testdata/go1.html")
   481  	if err != nil {
   482  		b.Fatalf("could not read testdata/go1.html: %v", err)
   483  	}
   484  	b.SetBytes(int64(len(buf)))
   485  	runtime.GC()
   486  	b.ReportAllocs()
   487  	b.ResetTimer()
   488  	for i := 0; i < b.N; i++ {
   489  		Parse(bytes.NewBuffer(buf))
   490  	}
   491  }
   492  

View as plain text