1 package parser
2
3 import (
4 "bytes"
5 "regexp"
6 "strings"
7
8 "github.com/yuin/goldmark/ast"
9 "github.com/yuin/goldmark/text"
10 "github.com/yuin/goldmark/util"
11 )
12
// allowedBlockTags is the set of lower-case tag names that Open accepts as
// the start of a type 6 HTML block. Every stored value is true, so a lookup
// can be used directly as a membership test.
var allowedBlockTags = func() map[string]bool {
	names := []string{
		"address", "article", "aside",
		"base", "basefont", "blockquote", "body",
		"caption", "center", "col", "colgroup",
		"dd", "details", "dialog", "dir", "div", "dl", "dt",
		"fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
		"h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
		"iframe",
		"legend", "li", "link",
		"main", "menu", "menuitem", "meta",
		"nav", "noframes",
		"ol", "optgroup", "option",
		"p", "param",
		"section", "source", "summary",
		"table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
		"ul",
	}
	set := make(map[string]bool, len(names))
	for _, name := range names {
		set[name] = true
	}
	return set
}()
78
// Start patterns and end markers for HTML block kinds 1 through 6. All start
// patterns allow up to three leading spaces before the opening '<'.
var (
	// Kind 1: a script, pre, style or textarea opening tag (case-insensitive);
	// the block ends on a line containing any of the matching closing tags.
	htmlBlockType1OpenRegexp  = regexp.MustCompile(`(?i)^[ ]{0,3}<(script|pre|style|textarea)(?:\s.*|>.*|/>.*|)(?:\r\n|\n)?$`)
	htmlBlockType1CloseRegexp = regexp.MustCompile(`(?i)^.*</(?:script|pre|style|textarea)>.*`)

	// Kind 2: starts with "<!--", ends with "-->".
	htmlBlockType2OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<!\-\-`)
	htmlBlockType2Close      = []byte("-->")

	// Kind 3: starts with "<?", ends with "?>".
	htmlBlockType3OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\?`)
	htmlBlockType3Close      = []byte("?>")

	// Kind 4: starts with "<!" followed by upper-case ASCII letters, ends
	// with ">".
	htmlBlockType4OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<![A-Z]+.*(?:\r\n|\n)?$`)
	htmlBlockType4Close      = []byte(">")

	// Kind 5: starts with "<![CDATA[", ends with "]]>".
	htmlBlockType5OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\!\[CDATA\[`)
	htmlBlockType5Close      = []byte("]]>")

	// Kind 6: the beginning of an opening or closing tag. Capture group 1 is
	// the tag name, which Open looks up in allowedBlockTags.
	htmlBlockType6Regexp = regexp.MustCompile(`^[ ]{0,3}<(?:/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(?:[ ].*|>.*|/>.*|)(?:\r\n|\n)?$`)
)
95
96 var htmlBlockType7Regexp = regexp.MustCompile(`^[ ]{0,3}<(/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(` + attributePattern + `*)[ ]*(?:>|/>)[ ]*(?:\r\n|\n)?$`)
97
// htmlBlockParser parses raw HTML blocks. It has no fields, so a single
// value can be shared by every caller.
type htmlBlockParser struct{}

// defaultHTMLBlockParser is the shared instance returned by
// NewHTMLBlockParser.
var defaultHTMLBlockParser = new(htmlBlockParser)
102
103
104
// NewHTMLBlockParser returns a BlockParser that parses HTML blocks.
// Every call returns the same package-level defaultHTMLBlockParser instance.
func NewHTMLBlockParser() BlockParser {
	return defaultHTMLBlockParser
}
108
109 func (b *htmlBlockParser) Trigger() []byte {
110 return []byte{'<'}
111 }
112
113 func (b *htmlBlockParser) Open(parent ast.Node, reader text.Reader, pc Context) (ast.Node, State) {
114 var node *ast.HTMLBlock
115 line, segment := reader.PeekLine()
116 last := pc.LastOpenedBlock().Node
117 if pos := pc.BlockOffset(); pos < 0 || line[pos] != '<' {
118 return nil, NoChildren
119 }
120
121 if m := htmlBlockType1OpenRegexp.FindSubmatchIndex(line); m != nil {
122 node = ast.NewHTMLBlock(ast.HTMLBlockType1)
123 } else if htmlBlockType2OpenRegexp.Match(line) {
124 node = ast.NewHTMLBlock(ast.HTMLBlockType2)
125 } else if htmlBlockType3OpenRegexp.Match(line) {
126 node = ast.NewHTMLBlock(ast.HTMLBlockType3)
127 } else if htmlBlockType4OpenRegexp.Match(line) {
128 node = ast.NewHTMLBlock(ast.HTMLBlockType4)
129 } else if htmlBlockType5OpenRegexp.Match(line) {
130 node = ast.NewHTMLBlock(ast.HTMLBlockType5)
131 } else if match := htmlBlockType7Regexp.FindSubmatchIndex(line); match != nil {
132 isCloseTag := match[2] > -1 && bytes.Equal(line[match[2]:match[3]], []byte("/"))
133 hasAttr := match[6] != match[7]
134 tagName := strings.ToLower(string(line[match[4]:match[5]]))
135 _, ok := allowedBlockTags[tagName]
136 if ok {
137 node = ast.NewHTMLBlock(ast.HTMLBlockType6)
138 } else if tagName != "script" && tagName != "style" &&
139 tagName != "pre" && !ast.IsParagraph(last) && !(isCloseTag && hasAttr) {
140 node = ast.NewHTMLBlock(ast.HTMLBlockType7)
141 }
142 }
143 if node == nil {
144 if match := htmlBlockType6Regexp.FindSubmatchIndex(line); match != nil {
145 tagName := string(line[match[2]:match[3]])
146 _, ok := allowedBlockTags[strings.ToLower(tagName)]
147 if ok {
148 node = ast.NewHTMLBlock(ast.HTMLBlockType6)
149 }
150 }
151 }
152 if node != nil {
153 reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
154 node.Lines().Append(segment)
155 return node, NoChildren
156 }
157 return nil, NoChildren
158 }
159
160 func (b *htmlBlockParser) Continue(node ast.Node, reader text.Reader, pc Context) State {
161 htmlBlock := node.(*ast.HTMLBlock)
162 lines := htmlBlock.Lines()
163 line, segment := reader.PeekLine()
164 var closurePattern []byte
165
166 switch htmlBlock.HTMLBlockType {
167 case ast.HTMLBlockType1:
168 if lines.Len() == 1 {
169 firstLine := lines.At(0)
170 if htmlBlockType1CloseRegexp.Match(firstLine.Value(reader.Source())) {
171 return Close
172 }
173 }
174 if htmlBlockType1CloseRegexp.Match(line) {
175 htmlBlock.ClosureLine = segment
176 reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
177 return Close
178 }
179 case ast.HTMLBlockType2:
180 closurePattern = htmlBlockType2Close
181 fallthrough
182 case ast.HTMLBlockType3:
183 if closurePattern == nil {
184 closurePattern = htmlBlockType3Close
185 }
186 fallthrough
187 case ast.HTMLBlockType4:
188 if closurePattern == nil {
189 closurePattern = htmlBlockType4Close
190 }
191 fallthrough
192 case ast.HTMLBlockType5:
193 if closurePattern == nil {
194 closurePattern = htmlBlockType5Close
195 }
196
197 if lines.Len() == 1 {
198 firstLine := lines.At(0)
199 if bytes.Contains(firstLine.Value(reader.Source()), closurePattern) {
200 return Close
201 }
202 }
203 if bytes.Contains(line, closurePattern) {
204 htmlBlock.ClosureLine = segment
205 reader.Advance(segment.Len())
206 return Close
207 }
208
209 case ast.HTMLBlockType6, ast.HTMLBlockType7:
210 if util.IsBlank(line) {
211 return Close
212 }
213 }
214 node.Lines().Append(segment)
215 reader.Advance(segment.Len() - util.TrimRightSpaceLength(line))
216 return Continue | NoChildren
217 }
218
// Close is a no-op: this parser does nothing when an HTML block is closed.
// None of the arguments are used.
func (b *htmlBlockParser) Close(node ast.Node, reader text.Reader, pc Context) {

}
222
// CanInterruptParagraph always reports true. Open separately refuses to
// start a type 7 block while the last opened block is a paragraph.
func (b *htmlBlockParser) CanInterruptParagraph() bool {
	return true
}
226
// CanAcceptIndentedLine always reports false.
// NOTE(review): presumably this keeps the parser from being offered lines
// indented far enough to form an indented code block — confirm against the
// BlockParser contract.
func (b *htmlBlockParser) CanAcceptIndentedLine() bool {
	return false
}
230
View as plain text