1
2
3
4
5 package html
6
7 import (
8 "bufio"
9 "bytes"
10 "errors"
11 "fmt"
12 "io"
13 "os"
14 "path/filepath"
15 "runtime"
16 "sort"
17 "strings"
18 "testing"
19
20 "golang.org/x/net/html/atom"
21 )
22
// testAttrs holds a single test case read from a .dat test file by
// readParseTest.
type testAttrs struct {
	// text is the body of the #data section with its final newline removed.
	text string
	// want is the raw body of the #document section.
	want string
	// context is the trimmed line that follows #document-fragment, or ""
	// when the test case has no such section.
	context string
	// scripting is set by a #script-on / #script-off section and stays true
	// when neither is present.
	scripting bool
}

// readUntilHeader consumes lines from r up to and including the next line
// that starts with '#', and returns that line. When body is non-nil, the
// lines skipped on the way are appended to *body.
//
// The returned slice comes straight from ReadSlice, so it is only valid until
// the next read from r.
func readUntilHeader(r *bufio.Reader, body *[]byte) ([]byte, error) {
	for {
		line, err := r.ReadSlice('\n')
		if err != nil {
			return nil, err
		}
		// A nil error from ReadSlice means line ends in '\n', so it is never
		// empty and indexing line[0] is safe.
		if line[0] == '#' {
			return line, nil
		}
		if body != nil {
			*body = append(*body, line...)
		}
	}
}

// readParseTest reads a single test case from r.
//
// The sections are expected in this order: #data, #errors, optionally
// #new-errors, optionally #script-on or #script-off, optionally
// #document-fragment (followed by one context line), and finally #document.
// The #document section ends at the first blank line that is not inside a
// quoted text node, or at the end of the input.
//
// It returns io.EOF (from the underlying reader) when r is exhausted before a
// new test case starts, and a descriptive error when a section header is not
// the expected one.
func readParseTest(r *bufio.Reader) (*testAttrs, error) {
	ta := &testAttrs{scripting: true}

	// #data: everything up to the next header is the input text.
	line, err := r.ReadSlice('\n')
	if err != nil {
		return nil, err
	}
	if string(line) != "#data\n" {
		return nil, fmt.Errorf(`got %q want "#data\n"`, line)
	}
	var b []byte
	if line, err = readUntilHeader(r, &b); err != nil {
		return nil, err
	}
	ta.text = strings.TrimSuffix(string(b), "\n")
	b = b[:0]

	// #errors: the listed errors are skipped, not checked.
	if string(line) != "#errors\n" {
		return nil, fmt.Errorf(`got %q want "#errors\n"`, line)
	}
	if line, err = readUntilHeader(r, nil); err != nil {
		return nil, err
	}

	// #new-errors (optional): skipped as well.
	if string(line) == "#new-errors\n" {
		if line, err = readUntilHeader(r, nil); err != nil {
			return nil, err
		}
	}

	// #script-on / #script-off (optional).
	if ls := string(line); strings.HasPrefix(ls, "#script-") {
		switch {
		case strings.HasSuffix(ls, "-on\n"):
			ta.scripting = true
		case strings.HasSuffix(ls, "-off\n"):
			ta.scripting = false
		default:
			return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line)
		}
		if line, err = readUntilHeader(r, nil); err != nil {
			return nil, err
		}
	}

	// #document-fragment (optional): exactly one context line follows.
	if string(line) == "#document-fragment\n" {
		line, err = r.ReadSlice('\n')
		if err != nil {
			return nil, err
		}
		ta.context = strings.TrimSpace(string(line))
		line, err = r.ReadSlice('\n')
		if err != nil {
			return nil, err
		}
	}

	// #document: the expected tree dump.
	if string(line) != "#document\n" {
		return nil, fmt.Errorf(`got %q want "#document\n"`, line)
	}
	inQuote := false
	for {
		line, err = r.ReadSlice('\n')
		// io.EOF is tolerated here: the last test case in a file need not be
		// followed by a blank line.
		if err != nil && err != io.EOF {
			return nil, err
		}
		trimmed := bytes.Trim(line, "| \n")
		if len(trimmed) > 0 {
			// A dump line whose content starts with '"' opens a text node.
			if line[0] == '|' && trimmed[0] == '"' {
				inQuote = true
			}
			// A trailing '"' closes it, unless this line is nothing but the
			// opening quote itself.
			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
				inQuote = false
			}
		}
		// Stop at end of input, or at a blank line outside a quoted text
		// node (blank lines inside one belong to the text).
		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
			break
		}
		b = append(b, line...)
	}
	ta.want = string(b)
	return ta, nil
}
140
// dumpIndent writes the prefix that starts every dumped line: "| " followed
// by one padding string per nesting level. A level of zero or less yields
// just "| ". Write errors are ignored.
//
// NOTE(review): the padding literal below is a single space; html5lib-style
// tree dumps conventionally indent two spaces per level, so confirm that the
// literal was not whitespace-collapsed.
func dumpIndent(w io.Writer, level int) {
	io.WriteString(w, "| ")
	for remaining := level; remaining > 0; remaining-- {
		io.WriteString(w, " ")
	}
}
147
148 type sortedAttributes []Attribute
149
150 func (a sortedAttributes) Len() int {
151 return len(a)
152 }
153
154 func (a sortedAttributes) Less(i, j int) bool {
155 if a[i].Namespace != a[j].Namespace {
156 return a[i].Namespace < a[j].Namespace
157 }
158 return a[i].Key < a[j].Key
159 }
160
161 func (a sortedAttributes) Swap(i, j int) {
162 a[i], a[j] = a[j], a[i]
163 }
164
165 func dumpLevel(w io.Writer, n *Node, level int) error {
166 dumpIndent(w, level)
167 level++
168 switch n.Type {
169 case ErrorNode:
170 return errors.New("unexpected ErrorNode")
171 case DocumentNode:
172 return errors.New("unexpected DocumentNode")
173 case ElementNode:
174 if n.Namespace != "" {
175 fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
176 } else {
177 fmt.Fprintf(w, "<%s>", n.Data)
178 }
179 attr := sortedAttributes(n.Attr)
180 sort.Sort(attr)
181 for _, a := range attr {
182 io.WriteString(w, "\n")
183 dumpIndent(w, level)
184 if a.Namespace != "" {
185 fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
186 } else {
187 fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
188 }
189 }
190 if n.Namespace == "" && n.DataAtom == atom.Template {
191 io.WriteString(w, "\n")
192 dumpIndent(w, level)
193 level++
194 io.WriteString(w, "content")
195 }
196 case TextNode:
197 fmt.Fprintf(w, `"%s"`, n.Data)
198 case CommentNode:
199 fmt.Fprintf(w, "<!-- %s -->", n.Data)
200 case DoctypeNode:
201 fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
202 if n.Attr != nil {
203 var p, s string
204 for _, a := range n.Attr {
205 switch a.Key {
206 case "public":
207 p = a.Val
208 case "system":
209 s = a.Val
210 }
211 }
212 if p != "" || s != "" {
213 fmt.Fprintf(w, ` "%s"`, p)
214 fmt.Fprintf(w, ` "%s"`, s)
215 }
216 }
217 io.WriteString(w, ">")
218 case scopeMarkerNode:
219 return errors.New("unexpected scopeMarkerNode")
220 default:
221 return errors.New("unknown node type")
222 }
223 io.WriteString(w, "\n")
224 for c := n.FirstChild; c != nil; c = c.NextSibling {
225 if err := dumpLevel(w, c, level); err != nil {
226 return err
227 }
228 }
229 return nil
230 }
231
232 func dump(n *Node) (string, error) {
233 if n == nil || n.FirstChild == nil {
234 return "", nil
235 }
236 var b bytes.Buffer
237 for c := n.FirstChild; c != nil; c = c.NextSibling {
238 if err := dumpLevel(&b, c, 0); err != nil {
239 return "", err
240 }
241 }
242 return b.String(), nil
243 }
244
// testDataDirs lists the directories that TestParser scans for *.dat files.
// Each entry ends in a slash because the glob pattern is formed by plain
// string concatenation.
var testDataDirs = []string{
	"testdata/webkit/",
	"testdata/go/",
}
246
247 func TestParser(t *testing.T) {
248 for _, testDataDir := range testDataDirs {
249 testFiles, err := filepath.Glob(testDataDir + "*.dat")
250 if err != nil {
251 t.Fatal(err)
252 }
253 for _, tf := range testFiles {
254 f, err := os.Open(tf)
255 if err != nil {
256 t.Fatal(err)
257 }
258 defer f.Close()
259 r := bufio.NewReader(f)
260
261 for i := 0; ; i++ {
262 ta, err := readParseTest(r)
263 if err == io.EOF {
264 break
265 }
266 if err != nil {
267 t.Fatal(err)
268 }
269 if parseTestBlacklist[ta.text] {
270 continue
271 }
272
273 err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
274
275 if err != nil {
276 t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
277 }
278 }
279 }
280 }
281 }
282
283
284 func TestParserWithoutScripting(t *testing.T) {
285 text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
286 want := `| <html>
287 | <head>
288 | <noscript>
289 | <body>
290 | <img>
291 | src="https://golang.org/doc/gopher/frontpage.png"
292 | <p>
293 | <img>
294 | src="https://golang.org/doc/gopher/doc.png"
295 `
296
297 if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
298 t.Errorf("test with scripting is disabled, %q, %s", text, err)
299 }
300 }
301
302
303
304
305
306 func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
307 defer func() {
308 if x := recover(); x != nil {
309 switch e := x.(type) {
310 case error:
311 err = e
312 default:
313 err = fmt.Errorf("%v", e)
314 }
315 }
316 }()
317
318 var doc *Node
319 if context == "" {
320 doc, err = ParseWithOptions(strings.NewReader(text), opts...)
321 if err != nil {
322 return err
323 }
324 } else {
325 namespace := ""
326 if i := strings.IndexByte(context, ' '); i >= 0 {
327 namespace, context = context[:i], context[i+1:]
328 }
329 contextNode := &Node{
330 Data: context,
331 DataAtom: atom.Lookup([]byte(context)),
332 Namespace: namespace,
333 Type: ElementNode,
334 }
335 nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
336 if err != nil {
337 return err
338 }
339 doc = &Node{
340 Type: DocumentNode,
341 }
342 for _, n := range nodes {
343 doc.AppendChild(n)
344 }
345 }
346
347 if err := checkTreeConsistency(doc); err != nil {
348 return err
349 }
350
351 got, err := dump(doc)
352 if err != nil {
353 return err
354 }
355
356 if got != want {
357 return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
358 }
359
360 if renderTestBlacklist[text] || context != "" {
361 return nil
362 }
363
364
365 pr, pw := io.Pipe()
366 go func() {
367 pw.CloseWithError(Render(pw, doc))
368 }()
369 doc1, err := ParseWithOptions(pr, opts...)
370 if err != nil {
371 return err
372 }
373 got1, err := dump(doc1)
374 if err != nil {
375 return err
376 }
377 if got != got1 {
378 return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
379 }
380
381 return nil
382 }
383
384
385
// parseTestBlacklist is the set of test inputs, keyed by the exact #data
// text, that TestParser skips entirely.
var parseTestBlacklist = map[string]bool{
	// Nested <template> elements inside foreign (MathML / SVG) content.
	`<template><svg><foo><template><foreignObject><div></template><div>`: true,
	`<math><template><mo><template>`:                                     true,
}
391
392
393
394
395
// renderTestBlacklist is the set of test inputs, keyed by the exact input
// text, for which testParseCase skips the render-and-reparse comparison. The
// parse itself is still checked against the expected dump.
var renderTestBlacklist = map[string]bool{
	// <a> elements combined with tables.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`:        true,
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`:                                     true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
	`<template><a><table><a>`:                                                 true,

	// <p> and <nobr> combined with tables.
	`<p><table></p>`: true,
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,

	// <plaintext> inside table structure.
	`<table><plaintext><td>`:                                   true,
	`<!doctype html><table><plaintext></plaintext>`:            true,
	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,

	// Forms interleaved with a table.
	`<!doctype html><form><table></form><form></table></form>`: true,

	// Unterminated <script> elements containing "<!--<script", with a doctype.
	`<!doctype html><script><!--<script `:          true,
	`<!doctype html><script><!--<script <`:         true,
	`<!doctype html><script><!--<script <a`:        true,
	`<!doctype html><script><!--<script </`:        true,
	`<!doctype html><script><!--<script </s`:       true,
	`<!doctype html><script><!--<script </script`:  true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`:         true,
	`<!doctype html><script><!--<script -a`:        true,
	`<!doctype html><script><!--<script -<`:        true,
	`<!doctype html><script><!--<script --`:        true,
	`<!doctype html><script><!--<script --a`:       true,
	`<!doctype html><script><!--<script --<`:       true,

	// The same kind of unterminated <script> inputs without a doctype.
	`<script><!--<script `:          true,
	`<script><!--<script <`:         true,
	`<script><!--<script <a`:        true,
	`<script><!--<script </`:        true,
	`<script><!--<script </s`:       true,
	`<script><!--<script </script`:  true,
	`<script><!--<script </scripta`: true,
	`<script><!--<script -`:         true,
	`<script><!--<script -a`:        true,
	`<script><!--<script --`:        true,
	`<script><!--<script --a`:       true,

	// Miscellaneous <plaintext> and foreign-content cases.
	`<!doctype html><p><a><plaintext>b`:                       true,
	`<table><math><select><mi><select></table>`:               true,
	`<!doctype html><table><colgroup><plaintext></plaintext>`: true,
	`<!doctype html><svg><plaintext>a</plaintext>b`:           true,
}
449
450 func TestNodeConsistency(t *testing.T) {
451
452 inconsistentNode := &Node{
453 Type: ElementNode,
454 DataAtom: atom.Frameset,
455 Data: "table",
456 }
457 if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
458 t.Errorf("got nil error, want non-nil")
459 }
460 }
461
462 func TestParseFragmentWithNilContext(t *testing.T) {
463
464 ParseFragment(strings.NewReader("<p>hello</p>"), nil)
465 }
466
467 func TestParseFragmentForeignContentTemplates(t *testing.T) {
468 srcs := []string{
469 "<math><html><template><mn><template></template></template>",
470 "<math><math><head><mi><template>",
471 "<svg><head><title><select><input>",
472 }
473 for _, src := range srcs {
474
475 ParseFragment(strings.NewReader(src), nil)
476 }
477 }
478
479 func BenchmarkParser(b *testing.B) {
480 buf, err := os.ReadFile("testdata/go1.html")
481 if err != nil {
482 b.Fatalf("could not read testdata/go1.html: %v", err)
483 }
484 b.SetBytes(int64(len(buf)))
485 runtime.GC()
486 b.ReportAllocs()
487 b.ResetTimer()
488 for i := 0; i < b.N; i++ {
489 Parse(bytes.NewBuffer(buf))
490 }
491 }
492
View as plain text