1 package charset
2
import (
	"bytes"
	"encoding/xml"
	"io"
	"strings"
	"unicode/utf8"

	"golang.org/x/net/html"
)
11
// Byte classes stored in textChars.
const (
	// F is the class given to control bytes that the table does not treat as
	// text; both ascii and latin reject it.
	F = iota
	// T is the class that ascii requires for every byte; latin accepts it too.
	T
	// I is the class given to bytes 0xA0-0xFF; latin accepts it, ascii does
	// not.
	I
	// X is the class given to bytes 0x80-0x9F other than 0x85; both ascii and
	// latin reject it.
	X
)
18
19 var (
20 boms = []struct {
21 bom []byte
22 enc string
23 }{
24 {[]byte{0xEF, 0xBB, 0xBF}, "utf-8"},
25 {[]byte{0x00, 0x00, 0xFE, 0xFF}, "utf-32be"},
26 {[]byte{0xFF, 0xFE, 0x00, 0x00}, "utf-32le"},
27 {[]byte{0xFE, 0xFF}, "utf-16be"},
28 {[]byte{0xFF, 0xFE}, "utf-16le"},
29 }
30
31
32 textChars = [256]byte{
33
34 F, F, F, F, F, F, F, T, T, T, T, T, T, T, F, F,
35
36 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,
37 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
38 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
39 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
40 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
41 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,
42 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,
43
44 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,
45 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,
46 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
47 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
48 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
49 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
50 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
51 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,
52 }
53 )
54
55
56 func FromBOM(content []byte) string {
57 for _, b := range boms {
58 if bytes.HasPrefix(content, b.bom) {
59 return b.enc
60 }
61 }
62 return ""
63 }
64
65
66
67 func FromPlain(content []byte) string {
68 if len(content) == 0 {
69 return ""
70 }
71 if cset := FromBOM(content); cset != "" {
72 return cset
73 }
74 origContent := content
75
76
77 for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
78 b := content[i]
79 if b < 0x80 {
80 break
81 }
82 if utf8.RuneStart(b) {
83 content = content[:i]
84 break
85 }
86 }
87 hasHighBit := false
88 for _, c := range content {
89 if c >= 0x80 {
90 hasHighBit = true
91 break
92 }
93 }
94 if hasHighBit && utf8.Valid(content) {
95 return "utf-8"
96 }
97
98
99 if ascii(origContent) {
100 return "utf-8"
101 }
102
103 return latin(origContent)
104 }
105
106 func latin(content []byte) string {
107 hasControlBytes := false
108 for _, b := range content {
109 t := textChars[b]
110 if t != T && t != I {
111 return ""
112 }
113 if b >= 0x80 && b <= 0x9F {
114 hasControlBytes = true
115 }
116 }
117
118
119
120 if hasControlBytes {
121 return "windows-1252"
122 }
123 return "iso-8859-1"
124 }
125
126 func ascii(content []byte) bool {
127 for _, b := range content {
128 if textChars[b] != T {
129 return false
130 }
131 }
132 return true
133 }
134
135
136
137
138 func FromXML(content []byte) string {
139 if cset := fromXML(content); cset != "" {
140 return cset
141 }
142 return FromPlain(content)
143 }
144 func fromXML(content []byte) string {
145 content = trimLWS(content)
146 dec := xml.NewDecoder(bytes.NewReader(content))
147 rawT, err := dec.RawToken()
148 if err != nil {
149 return ""
150 }
151
152 t, ok := rawT.(xml.ProcInst)
153 if !ok {
154 return ""
155 }
156
157 return strings.ToLower(xmlEncoding(string(t.Inst)))
158 }
159
160
161
162
163
164 func FromHTML(content []byte) string {
165 if cset := FromBOM(content); cset != "" {
166 return cset
167 }
168 if cset := fromHTML(content); cset != "" {
169 return cset
170 }
171 return FromPlain(content)
172 }
173
174 func fromHTML(content []byte) string {
175 z := html.NewTokenizer(bytes.NewReader(content))
176 for {
177 switch z.Next() {
178 case html.ErrorToken:
179 return ""
180
181 case html.StartTagToken, html.SelfClosingTagToken:
182 tagName, hasAttr := z.TagName()
183 if !bytes.Equal(tagName, []byte("meta")) {
184 continue
185 }
186 attrList := make(map[string]bool)
187 gotPragma := false
188
189 const (
190 dontKnow = iota
191 doNeedPragma
192 doNotNeedPragma
193 )
194 needPragma := dontKnow
195
196 name := ""
197 for hasAttr {
198 var key, val []byte
199 key, val, hasAttr = z.TagAttr()
200 ks := string(key)
201 if attrList[ks] {
202 continue
203 }
204 attrList[ks] = true
205 for i, c := range val {
206 if 'A' <= c && c <= 'Z' {
207 val[i] = c + 0x20
208 }
209 }
210
211 switch ks {
212 case "http-equiv":
213 if bytes.Equal(val, []byte("content-type")) {
214 gotPragma = true
215 }
216
217 case "content":
218 name = fromMetaElement(string(val))
219 if name != "" {
220 needPragma = doNeedPragma
221 }
222
223 case "charset":
224 name = string(val)
225 needPragma = doNotNeedPragma
226 }
227 }
228
229 if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
230 continue
231 }
232
233 if strings.HasPrefix(name, "utf-16") {
234 name = "utf-8"
235 }
236
237 return name
238 }
239 }
240 }
241
// fromMetaElement extracts the value following "charset" in s, the text of a
// meta element's content attribute.
//
// It looks for "charset" followed by optional whitespace, "=", and optional
// whitespace. An occurrence of "charset" without the "=" is skipped and the
// search resumes after it. A value opening with a single or double quote runs
// to the matching quote ("" if the quote is never closed); an unquoted value
// runs to the next ';' or whitespace, or to the end of s. It returns "" when
// no value is found.
func fromMetaElement(s string) string {
	const (
		keyword    = "charset"
		whitespace = " \t\n\f\r"
	)

	rest := s
	for len(rest) > 0 {
		at := strings.Index(rest, keyword)
		if at < 0 {
			return ""
		}
		rest = strings.TrimLeft(rest[at+len(keyword):], whitespace)
		if len(rest) == 0 || rest[0] != '=' {
			// Not an assignment; look for a later "charset".
			continue
		}

		value := strings.TrimLeft(rest[1:], whitespace)
		if len(value) == 0 {
			return ""
		}

		switch quote := value[0]; quote {
		case '"', '\'':
			quoted := value[1:]
			end := strings.IndexByte(quoted, quote)
			if end < 0 {
				return ""
			}
			return quoted[:end]
		}

		if end := strings.IndexAny(value, "; \t\n\f\r"); end >= 0 {
			return value[:end]
		}
		return value
	}
	return ""
}
275
// xmlEncoding returns the quoted value that directly follows the first
// "encoding=" in s. The value may be wrapped in single or double quotes. It
// returns "" when "encoding=" is absent, is not immediately followed by a
// quote, or the quote is never closed.
func xmlEncoding(s string) string {
	const key = "encoding="

	start := strings.Index(s, key)
	if start < 0 {
		return ""
	}
	rest := s[start+len(key):]
	if len(rest) == 0 {
		return ""
	}

	quote := rest[0]
	if quote != '"' && quote != '\'' {
		return ""
	}

	value := rest[1:]
	end := strings.IndexByte(value, quote)
	if end < 0 {
		return ""
	}
	return value[:end]
}
295
296
297
298
299 func trimLWS(in []byte) []byte {
300 firstNonWS := 0
301 for ; firstNonWS < len(in) && isWS(in[firstNonWS]); firstNonWS++ {
302 }
303
304 return in[firstNonWS:]
305 }
306
// isWS reports whether b is a tab, line feed, form feed, carriage return or
// space.
func isWS(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\f', '\r':
		return true
	}
	return false
}
310
View as plain text