// reparentXML wraps the given reader (which is assumed to contain valid XML)
// in a synthetic <fake-root> element and returns a reader over the result.
//
// This is useful when the original XML document does not have a single root
// node, which is required by the XML specification. Additionally, Go's XML
// parser will silently drop all nodes after the first one it encounters,
// which can lead to data loss from a parser perspective. Wrapping the input
// also enables the ingestion of blank XML files, which would normally cause a
// parsing error.
//
// A nil reader is treated as an empty document rather than being passed on to
// io.MultiReader.
func reparentXML(reader io.Reader) io.Reader {
	if reader == nil {
		// io.MultiReader would call Read on the nil interface and panic;
		// substitute an empty reader so the result is just the fake root.
		reader = bytes.NewReader(nil)
	}

	return io.MultiReader(
		bytes.NewReader([]byte("<fake-root>")),
		reader,
		bytes.NewReader([]byte("</fake-root>")),
	)
}
// extractContent parses the raw contents of an XML node, and returns it in a
// more consumable form.
//
// This function deals with two distinct classes of node data: encoded
// entities and CDATA sections. Encoded entities are normal (html escaped)
// text that you typically find between tags, and are unescaped like so:
//   • "Hello, world!" → "Hello, world!"
//   • "I &lt;/3 XML"  → "I </3 XML"
// CDATA sections are a special way to embed data that would normally require
// escaping, without escaping it. Their payload is copied verbatim, like so:
//   • "<![CDATA[Hello, world!]]>"  → "Hello, world!"
//   • "<![CDATA[I </3 XML]]>"      → "I </3 XML"
//   • "<![CDATA[I &lt;/3 XML]]>"   → "I &lt;/3 XML"
//
// Multiple interleaved instances of either kind are allowed, and are decoded
// into one piece of normalized text, like so:
//   • "I &lt;/3 XML <![CDATA[a lot]]>. You probably <![CDATA[</3 XML]]> too."
//     → "I </3 XML a lot. You probably </3 XML too."
//
// An error is returned when a "<![CDATA[" has no closing "]]>", or when a
// "]]>" appears in any text segment outside of a CDATA section. These should
// cause proper XML unmarshalling errors first, if encountered in an actual
// XML document.
func extractContent(data []byte) ([]byte, error) {
	var (
		cdataStart = []byte("<![CDATA[")
		cdataEnd   = []byte("]]>")
		output     []byte
	)

	for {
		// Everything up to the next "<![CDATA[" (or the end of the data, if
		// there is none) is ordinary escaped text.
		start := bytes.Index(data, cdataStart)

		text := data
		if start != -1 {
			text = data[:start]
		}

		// A "]]>" outside of a CDATA section has no matching start tag. This
		// is checked for every text segment, not just the final one.
		if bytes.Contains(text, cdataEnd) {
			return nil, errors.New("unmatched CDATA end tag")
		}

		output = append(output, html.UnescapeString(string(text))...)

		if start == -1 {
			// No further CDATA sections; all data has been consumed.
			return output, nil
		}

		// Discard the text segment and the "<![CDATA[" prefix, then locate
		// the end of the section.
		data = data[start+len(cdataStart):]

		end := bytes.Index(data, cdataEnd)
		if end == -1 {
			return nil, errors.New("unmatched CDATA start tag")
		}

		// CDATA payload is taken verbatim, with no unescaping. Discard the
		// payload and the "]]>" suffix before scanning for more text.
		output = append(output, data[:end]...)
		data = data[end+len(cdataEnd):]
	}
}
104 func parse(reader io.Reader) ([]xmlNode, error) { 105 var ( 106 dec = xml.NewDecoder(reparentXML(reader)) 107 root xmlNode 108 ) 109 110 if err := dec.Decode(&root); err != nil { 111 return nil, err 112 } 113 114 return root.Nodes, nil 115 } 116