1 package goquery 2 3 import ( 4 "errors" 5 "io" 6 "net/http" 7 "net/url" 8 9 "github.com/andybalholm/cascadia" 10 "golang.org/x/net/html" 11 ) 12 13 // Document represents an HTML document to be manipulated. Unlike jQuery, which 14 // is loaded as part of a DOM document, and thus acts upon its containing 15 // document, GoQuery doesn't know which HTML document to act upon. So it needs 16 // to be told, and that's what the Document class is for. It holds the root 17 // document node to manipulate, and can make selections on this document. 18 type Document struct { 19 *Selection 20 Url *url.URL 21 rootNode *html.Node 22 } 23 24 // NewDocumentFromNode is a Document constructor that takes a root html Node 25 // as argument. 26 func NewDocumentFromNode(root *html.Node) *Document { 27 return newDocument(root, nil) 28 } 29 30 // NewDocument is a Document constructor that takes a string URL as argument. 31 // It loads the specified document, parses it, and stores the root Document 32 // node, ready to be manipulated. 33 // 34 // Deprecated: Use the net/http standard library package to make the request 35 // and validate the response before calling goquery.NewDocumentFromReader 36 // with the response's body. 37 func NewDocument(url string) (*Document, error) { 38 // Load the URL 39 res, e := http.Get(url) 40 if e != nil { 41 return nil, e 42 } 43 return NewDocumentFromResponse(res) 44 } 45 46 // NewDocumentFromReader returns a Document from an io.Reader. 47 // It returns an error as second value if the reader's data cannot be parsed 48 // as html. It does not check if the reader is also an io.Closer, the 49 // provided reader is never closed by this call. It is the responsibility 50 // of the caller to close it if required. 51 func NewDocumentFromReader(r io.Reader) (*Document, error) { 52 root, e := html.Parse(r) 53 if e != nil { 54 return nil, e 55 } 56 return newDocument(root, nil), nil 57 } 58 59 // NewDocumentFromResponse is another Document constructor that takes an http response as argument. 60 // It loads the specified response's document, parses it, and stores the root Document 61 // node, ready to be manipulated. The response's body is closed on return. 62 // 63 // Deprecated: Use goquery.NewDocumentFromReader with the response's body. 64 func NewDocumentFromResponse(res *http.Response) (*Document, error) { 65 if res == nil { 66 return nil, errors.New("Response is nil") 67 } 68 defer res.Body.Close() 69 if res.Request == nil { 70 return nil, errors.New("Response.Request is nil") 71 } 72 73 // Parse the HTML into nodes 74 root, e := html.Parse(res.Body) 75 if e != nil { 76 return nil, e 77 } 78 79 // Create and fill the document 80 return newDocument(root, res.Request.URL), nil 81 } 82 83 // CloneDocument creates a deep-clone of a document. 84 func CloneDocument(doc *Document) *Document { 85 return newDocument(cloneNode(doc.rootNode), doc.Url) 86 } 87 88 // Private constructor, make sure all fields are correctly filled. 89 func newDocument(root *html.Node, url *url.URL) *Document { 90 // Create and fill the document 91 d := &Document{nil, url, root} 92 d.Selection = newSingleSelection(root, d) 93 return d 94 } 95 96 // Selection represents a collection of nodes matching some criteria. The 97 // initial Selection can be created by using Document.Find, and then 98 // manipulated using the jQuery-like chainable syntax and methods. 99 type Selection struct { 100 Nodes []*html.Node 101 document *Document 102 prevSel *Selection 103 } 104 105 // Helper constructor to create an empty selection 106 func newEmptySelection(doc *Document) *Selection { 107 return &Selection{nil, doc, nil} 108 } 109 110 // Helper constructor to create a selection of only one node 111 func newSingleSelection(node *html.Node, doc *Document) *Selection { 112 return &Selection{[]*html.Node{node}, doc, nil} 113 } 114 115 // Matcher is an interface that defines the methods to match 116 // HTML nodes against a compiled selector string. Cascadia's 117 // Selector implements this interface. 118 type Matcher interface { 119 Match(*html.Node) bool 120 MatchAll(*html.Node) []*html.Node 121 Filter([]*html.Node) []*html.Node 122 } 123 124 // Single compiles a selector string to a Matcher that stops after the first 125 // match is found. 126 // 127 // By default, Selection.Find and other functions that accept a selector string 128 // to select nodes will use all matches corresponding to that selector. By 129 // using the Matcher returned by Single, at most the first match will be 130 // selected. 131 // 132 // For example, those two statements are semantically equivalent: 133 // 134 // sel1 := doc.Find("a").First() 135 // sel2 := doc.FindMatcher(goquery.Single("a")) 136 // 137 // The one using Single is optimized to be potentially much faster on large 138 // documents. 139 // 140 // Only the behaviour of the MatchAll method of the Matcher interface is 141 // altered compared to standard Matchers. This means that the single-selection 142 // property of the Matcher only applies for Selection methods where the Matcher 143 // is used to select nodes, not to filter or check if a node matches the 144 // Matcher - in those cases, the behaviour of the Matcher is unchanged (e.g. 145 // FilterMatcher(Single("div")) will still result in a Selection with multiple 146 // "div"s if there were many "div"s in the Selection to begin with). 147 func Single(selector string) Matcher { 148 return singleMatcher{compileMatcher(selector)} 149 } 150 151 // SingleMatcher returns a Matcher matches the same nodes as m, but that stops 152 // after the first match is found. 153 // 154 // See the documentation of function Single for more details. 155 func SingleMatcher(m Matcher) Matcher { 156 if _, ok := m.(singleMatcher); ok { 157 // m is already a singleMatcher 158 return m 159 } 160 return singleMatcher{m} 161 } 162 163 // compileMatcher compiles the selector string s and returns 164 // the corresponding Matcher. If s is an invalid selector string, 165 // it returns a Matcher that fails all matches. 166 func compileMatcher(s string) Matcher { 167 cs, err := cascadia.Compile(s) 168 if err != nil { 169 return invalidMatcher{} 170 } 171 return cs 172 } 173 174 type singleMatcher struct { 175 Matcher 176 } 177 178 func (m singleMatcher) MatchAll(n *html.Node) []*html.Node { 179 // Optimized version - stops finding at the first match (cascadia-compiled 180 // matchers all use this code path). 181 if mm, ok := m.Matcher.(interface{ MatchFirst(*html.Node) *html.Node }); ok { 182 node := mm.MatchFirst(n) 183 if node == nil { 184 return nil 185 } 186 return []*html.Node{node} 187 } 188 189 // Fallback version, for e.g. test mocks that don't provide the MatchFirst 190 // method. 191 nodes := m.Matcher.MatchAll(n) 192 if len(nodes) > 0 { 193 return nodes[:1:1] 194 } 195 return nil 196 } 197 198 // invalidMatcher is a Matcher that always fails to match. 199 type invalidMatcher struct{} 200 201 func (invalidMatcher) Match(n *html.Node) bool { return false } 202 func (invalidMatcher) MatchAll(n *html.Node) []*html.Node { return nil } 203 func (invalidMatcher) Filter(ns []*html.Node) []*html.Node { return nil } 204