...
1
2
3
4 package utfbom
5
6 import (
7 "errors"
8 "io"
9 )
10
11
// Encoding identifies the Unicode encoding announced by a byte order mark.
type Encoding int

// Encodings that BOM detection can report. The zero value is Unknown.
const (
	// Unknown means no recognised BOM was found.
	Unknown = Encoding(iota)

	// UTF8 corresponds to the three-byte BOM EF BB BF.
	UTF8

	// UTF16BigEndian corresponds to the two-byte BOM FE FF.
	UTF16BigEndian

	// UTF16LittleEndian corresponds to the two-byte BOM FF FE.
	UTF16LittleEndian

	// UTF32BigEndian corresponds to the four-byte BOM 00 00 FE FF.
	UTF32BigEndian

	// UTF32LittleEndian corresponds to the four-byte BOM FF FE 00 00.
	UTF32LittleEndian
)

// String returns the name of the encoding constant. Unknown and any value
// outside the declared range both yield "Unknown".
func (e Encoding) String() string {
	// Indexed by constant value; slot 0 (Unknown) is left empty on purpose
	// because it is handled by the fallback below.
	names := [...]string{
		UTF8:              "UTF8",
		UTF16BigEndian:    "UTF16BigEndian",
		UTF16LittleEndian: "UTF16LittleEndian",
		UTF32BigEndian:    "UTF32BigEndian",
		UTF32LittleEndian: "UTF32LittleEndian",
	}
	if e > Unknown && int(e) < len(names) {
		return names[e]
	}
	return "Unknown"
}
52
// maxConsecutiveEmptyReads is the number of back-to-back reads returning zero
// bytes that readBOM tolerates before it gives up with io.ErrNoProgress.
const maxConsecutiveEmptyReads = 100
54
55
56
57
58 func Skip(rd io.Reader) (*Reader, Encoding) {
59
60 b, ok := rd.(*Reader)
61 if ok {
62 return b, Unknown
63 }
64
65 enc, left, err := detectUtf(rd)
66 return &Reader{
67 rd: rd,
68 buf: left,
69 err: err,
70 }, enc
71 }
72
73
74 func SkipOnly(rd io.Reader) *Reader {
75 r, _ := Skip(rd)
76 return r
77 }
78
79
80
// Reader is an io.Reader that first serves the bytes left over from BOM
// detection and then reads from the wrapped reader.
type Reader struct {
	// rd is the wrapped reader that supplies data once buf is drained.
	rd io.Reader
	// buf holds bytes read during BOM detection that have not been handed
	// to the caller yet; it is nil when nothing is pending.
	buf []byte
	// err is the error captured during BOM detection; Read reports it once,
	// after buf has been drained, and then clears it.
	err error
}
86
87
88
89 func (r *Reader) Read(p []byte) (n int, err error) {
90 if len(p) == 0 {
91 return 0, nil
92 }
93
94 if r.buf == nil {
95 if r.err != nil {
96 return 0, r.readErr()
97 }
98
99 return r.rd.Read(p)
100 }
101
102
103 n = copy(p, r.buf)
104 r.buf = nilIfEmpty(r.buf[n:])
105 return n, nil
106 }
107
108 func (r *Reader) readErr() error {
109 err := r.err
110 r.err = nil
111 return err
112 }
113
// errNegativeRead is the value readBOM panics with when the underlying
// reader's Read returns a negative byte count.
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
115
116 func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
117 buf, err = readBOM(rd)
118
119 if len(buf) >= 4 {
120 if isUTF32BigEndianBOM4(buf) {
121 return UTF32BigEndian, nilIfEmpty(buf[4:]), err
122 }
123 if isUTF32LittleEndianBOM4(buf) {
124 return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
125 }
126 }
127
128 if len(buf) > 2 && isUTF8BOM3(buf) {
129 return UTF8, nilIfEmpty(buf[3:]), err
130 }
131
132 if (err != nil && err != io.EOF) || (len(buf) < 2) {
133 return Unknown, nilIfEmpty(buf), err
134 }
135
136 if isUTF16BigEndianBOM2(buf) {
137 return UTF16BigEndian, nilIfEmpty(buf[2:]), err
138 }
139 if isUTF16LittleEndianBOM2(buf) {
140 return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
141 }
142
143 return Unknown, nilIfEmpty(buf), err
144 }
145
146 func readBOM(rd io.Reader) (buf []byte, err error) {
147 const maxBOMSize = 4
148 var bom [maxBOMSize]byte
149
150
151 for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
152 if n, err = rd.Read(bom[len(buf):]); n < 0 {
153 panic(errNegativeRead)
154 }
155 if n > 0 {
156 nEmpty = 0
157 } else {
158 nEmpty++
159 if nEmpty >= maxConsecutiveEmptyReads {
160 err = io.ErrNoProgress
161 }
162 }
163 }
164 return
165 }
166
// isUTF32BigEndianBOM4 reports whether buf starts with the UTF-32 big-endian
// BOM (00 00 FE FF). Bytes are compared in order and the comparison stops at
// the first mismatch; the caller is expected to pass at least four bytes.
func isUTF32BigEndianBOM4(buf []byte) bool {
	for i, want := range [4]byte{0x00, 0x00, 0xFE, 0xFF} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
170
// isUTF32LittleEndianBOM4 reports whether buf starts with the UTF-32
// little-endian BOM (FF FE 00 00). Bytes are compared in order and the
// comparison stops at the first mismatch; the caller is expected to pass at
// least four bytes.
func isUTF32LittleEndianBOM4(buf []byte) bool {
	for i, want := range [4]byte{0xFF, 0xFE, 0x00, 0x00} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
174
// isUTF8BOM3 reports whether buf starts with the UTF-8 BOM (EF BB BF). Bytes
// are compared in order and the comparison stops at the first mismatch; the
// caller is expected to pass at least three bytes.
func isUTF8BOM3(buf []byte) bool {
	for i, want := range [3]byte{0xEF, 0xBB, 0xBF} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
178
// isUTF16BigEndianBOM2 reports whether buf starts with the UTF-16 big-endian
// BOM (FE FF). The second byte is only inspected when the first one matches;
// the caller is expected to pass at least two bytes.
func isUTF16BigEndianBOM2(buf []byte) bool {
	if buf[0] != 0xFE {
		return false
	}
	return buf[1] == 0xFF
}
182
// isUTF16LittleEndianBOM2 reports whether buf starts with the UTF-16
// little-endian BOM (FF FE). The second byte is only inspected when the first
// one matches; the caller is expected to pass at least two bytes.
func isUTF16LittleEndianBOM2(buf []byte) bool {
	if buf[0] != 0xFF {
		return false
	}
	return buf[1] == 0xFE
}
186
// nilIfEmpty returns buf unchanged when it contains at least one byte and nil
// otherwise, so that "nothing buffered" is always represented by a nil slice.
func nilIfEmpty(buf []byte) (res []byte) {
	if len(buf) == 0 {
		return nil
	}
	return buf
}
193
View as plain text