1 // Copyright 2015 Garrett D'Amore 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use file except in compliance with the License. 5 // You may obtain a copy of the license at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package encoding 16 17 import ( 18 "sync" 19 "unicode/utf8" 20 21 "golang.org/x/text/encoding" 22 "golang.org/x/text/transform" 23 ) 24 25 const ( 26 // RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'. 27 RuneError = '\uFFFD' 28 29 // RuneSelf is the rune below which UTF-8 and the Unicode values are 30 // identical. Its also the limit for ASCII. 31 RuneSelf = 0x80 32 33 // ASCIISub is the ASCII substitution character. 34 ASCIISub = '\x1a' 35 ) 36 37 // Charmap is a structure for setting up encodings for 8-bit character sets, 38 // for transforming between UTF8 and that other character set. It has some 39 // ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a 40 // different implementation. This implementation uses maps, and supports 41 // user-defined maps. 42 // 43 // We do assume that a character map has a reasonable substitution character, 44 // and that valid encodings are stable (exactly a 1:1 map) and stateless 45 // (that is there is no shift character or anything like that.) Hence this 46 // approach will not work for many East Asian character sets. 47 // 48 // Measurement shows little or no measurable difference in the performance of 49 // the two approaches. The difference was down to a couple of nsec/op, and 50 // no consistent pattern as to which ran faster. With the conversion to 51 // UTF-8 the code takes about 25 nsec/op. The conversion in the reverse 52 // direction takes about 100 nsec/op. (The larger cost for conversion 53 // from UTF-8 is most likely due to the need to convert the UTF-8 byte stream 54 // to a rune before conversion. 55 // 56 type Charmap struct { 57 transform.NopResetter 58 bytes map[rune]byte 59 runes [256][]byte 60 once sync.Once 61 62 // The map between bytes and runes. To indicate that a specific 63 // byte value is invalid for a charcter set, use the rune 64 // utf8.RuneError. Values that are absent from this map will 65 // be assumed to have the identity mapping -- that is the default 66 // is to assume ISO8859-1, where all 8-bit characters have the same 67 // numeric value as their Unicode runes. (Not to be confused with 68 // the UTF-8 values, which *will* be different for non-ASCII runes.) 69 // 70 // If no values less than RuneSelf are changed (or have non-identity 71 // mappings), then the character set is assumed to be an ASCII 72 // superset, and certain assumptions and optimizations become 73 // available for ASCII bytes. 74 Map map[byte]rune 75 76 // The ReplacementChar is the byte value to use for substitution. 77 // It should normally be ASCIISub for ASCII encodings. This may be 78 // unset (left to zero) for mappings that are strictly ASCII supersets. 79 // In that case ASCIISub will be assumed instead. 80 ReplacementChar byte 81 } 82 83 type cmapDecoder struct { 84 transform.NopResetter 85 runes [256][]byte 86 } 87 88 type cmapEncoder struct { 89 transform.NopResetter 90 bytes map[rune]byte 91 replace byte 92 } 93 94 // Init initializes internal values of a character map. This should 95 // be done early, to minimize the cost of allocation of transforms 96 // later. It is not strictly necessary however, as the allocation 97 // functions will arrange to call it if it has not already been done. 98 func (c *Charmap) Init() { 99 c.once.Do(c.initialize) 100 } 101 102 func (c *Charmap) initialize() { 103 c.bytes = make(map[rune]byte) 104 ascii := true 105 106 for i := 0; i < 256; i++ { 107 r, ok := c.Map[byte(i)] 108 if !ok { 109 r = rune(i) 110 } 111 if r < 128 && r != rune(i) { 112 ascii = false 113 } 114 if r != RuneError { 115 c.bytes[r] = byte(i) 116 } 117 utf := make([]byte, utf8.RuneLen(r)) 118 utf8.EncodeRune(utf, r) 119 c.runes[i] = utf 120 } 121 if ascii && c.ReplacementChar == '\x00' { 122 c.ReplacementChar = ASCIISub 123 } 124 } 125 126 // NewDecoder returns a Decoder the converts from the 8-bit 127 // character set to UTF-8. Unknown mappings, if any, are mapped 128 // to '\uFFFD'. 129 func (c *Charmap) NewDecoder() *encoding.Decoder { 130 c.Init() 131 return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}} 132 } 133 134 // NewEncoder returns a Transformer that converts from UTF8 to the 135 // 8-bit character set. Unknown mappings are mapped to 0x1A. 136 func (c *Charmap) NewEncoder() *encoding.Encoder { 137 c.Init() 138 return &encoding.Encoder{ 139 Transformer: &cmapEncoder{ 140 bytes: c.bytes, 141 replace: c.ReplacementChar, 142 }, 143 } 144 } 145 146 func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) { 147 var e error 148 var ndst, nsrc int 149 150 for _, c := range src { 151 b := d.runes[c] 152 l := len(b) 153 154 if ndst+l > len(dst) { 155 e = transform.ErrShortDst 156 break 157 } 158 for i := 0; i < l; i++ { 159 dst[ndst] = b[i] 160 ndst++ 161 } 162 nsrc++ 163 } 164 return ndst, nsrc, e 165 } 166 167 func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) { 168 var e error 169 var ndst, nsrc int 170 for nsrc < len(src) { 171 if ndst >= len(dst) { 172 e = transform.ErrShortDst 173 break 174 } 175 176 r, sz := utf8.DecodeRune(src[nsrc:]) 177 if r == utf8.RuneError && sz == 1 { 178 // If its inconclusive due to insufficient data in 179 // in the source, report it 180 if !atEOF && !utf8.FullRune(src[nsrc:]) { 181 e = transform.ErrShortSrc 182 break 183 } 184 } 185 186 if c, ok := d.bytes[r]; ok { 187 dst[ndst] = c 188 } else { 189 dst[ndst] = d.replace 190 } 191 nsrc += sz 192 ndst++ 193 } 194 195 return ndst, nsrc, e 196 } 197