parse.go

Documentation: github.com/joshdk/go-junit

     1  // Copyright Josh Komoroske. All rights reserved.
     2  // Use of this source code is governed by the MIT license,
     3  // a copy of which can be found in the LICENSE.txt file.
     4  
     5  package junit
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/xml"
    10  	"errors"
    11  	"html"
    12  	"io"
    13  )
    14  
    15  // reparentXML will wrap the given reader (which is assumed to be valid XML),
    16  // in a fake root nodeAlias.
    17  //
    18  // This action is useful in the event that the original XML document does not
    19  // have a single root nodeAlias, which is required by the XML specification.
    20  // Additionally, Go's XML parser will silently drop all nodes after the first
    21  // that is encountered, which can lead to data loss from a parser perspective.
    22  // This function also enables the ingestion of blank XML files, which would
    23  // normally cause a parsing error.
    24  func reparentXML(reader io.Reader) io.Reader {
    25  	return io.MultiReader(
    26  		bytes.NewReader([]byte("<fake-root>")),
    27  		reader,
    28  		bytes.NewReader([]byte("</fake-root>")),
    29  	)
    30  }
    31  
    32  // extractContent parses the raw contents from an XML node, and returns it in a
    33  // more consumable form.
    34  //
    35  // This function deals with two distinct classes of node data; Encoded entities
    36  // and CDATA tags. These Encoded entities are normal (html escaped) text that
    37  // you typically find between tags like so:
    38  //   • "Hello, world!"  →  "Hello, world!"
    39  //   • "I &lt;/3 XML"   →  "I </3 XML"
    40  // CDATA tags are a special way to embed data that would normally require
    41  // escaping, without escaping it, like so:
    42  //   • "<![CDATA[Hello, world!]]>"  →  "Hello, world!"
    43  //   • "<![CDATA[I &lt;/3 XML]]>"   →  "I &lt;/3 XML"
    44  //   • "<![CDATA[I </3 XML]]>"      →  "I </3 XML"
    45  //
    46  // This function specifically allows multiple interleaved instances of either
    47  // encoded entities or cdata, and will decode them into one piece of normalized
    48  // text, like so:
    49  //   • "I &lt;/3 XML <![CDATA[a lot]]>. You probably <![CDATA[</3 XML]]> too."  →  "I </3 XML a lot. You probably </3 XML too."
    50  //      └─────┬─────┘         └─┬─┘   └──────┬──────┘         └──┬──┘   └─┬─┘
    51  //      "I </3 XML "            │     ". You probably "          │      " too."
    52  //                          "a lot"                         "</3 XML"
    53  //
    54  // Errors are returned only when there are unmatched CDATA tags, although these
    55  // should cause proper XML unmarshalling errors first, if encountered in an
    56  // actual XML document.
    57  func extractContent(data []byte) ([]byte, error) {
    58  	var (
    59  		cdataStart = []byte("<![CDATA[")
    60  		cdataEnd   = []byte("]]>")
    61  		mode       int
    62  		output     []byte
    63  	)
    64  
    65  	for {
    66  		if mode == 0 {
    67  			offset := bytes.Index(data, cdataStart)
    68  			if offset == -1 {
    69  				// The string "<![CDATA[" does not appear in the data. Unescape all remaining data and finish
    70  				if bytes.Contains(data, cdataEnd) {
    71  					// The string "]]>" appears in the data. This is an error!
    72  					return nil, errors.New("unmatched CDATA end tag")
    73  				}
    74  
    75  				output = append(output, html.UnescapeString(string(data))...)
    76  				break
    77  			}
    78  
    79  			// The string "<![CDATA[" appears at some offset. Unescape up to that offset. Discard "<![CDATA[" prefix.
    80  			output = append(output, html.UnescapeString(string(data[:offset]))...)
    81  			data = data[offset:]
    82  			data = data[9:]
    83  			mode = 1
    84  		} else if mode == 1 {
    85  			offset := bytes.Index(data, cdataEnd)
    86  			if offset == -1 {
    87  				// The string "]]>" does not appear in the data. This is an error!
    88  				return nil, errors.New("unmatched CDATA start tag")
    89  			}
    90  
    91  			// The string "]]>" appears at some offset. Read up to that offset. Discard "]]>" prefix.
    92  			output = append(output, data[:offset]...)
    93  			data = data[offset:]
    94  			data = data[3:]
    95  			mode = 0
    96  		}
    97  	}
    98  
    99  	return output, nil
   100  }
   101  
   102  // parse unmarshalls the given XML data into a graph of nodes, and then returns
   103  // a slice of all top-level nodes.
   104  func parse(reader io.Reader) ([]xmlNode, error) {
   105  	var (
   106  		dec  = xml.NewDecoder(reparentXML(reader))
   107  		root xmlNode
   108  	)
   109  
   110  	if err := dec.Decode(&root); err != nil {
   111  		return nil, err
   112  	}
   113  
   114  	return root.Nodes, nil
   115  }
   116
View as plain text