charmap.go

Documentation: github.com/gdamore/encoding

     1  // Copyright 2015 Garrett D'Amore
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use file except in compliance with the License.
     5  // You may obtain a copy of the license at
     6  //
     7  //    http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package encoding
    16  
    17  import (
    18  	"sync"
    19  	"unicode/utf8"
    20  
    21  	"golang.org/x/text/encoding"
    22  	"golang.org/x/text/transform"
    23  )
    24  
const (
	// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
	RuneError = '\uFFFD'

	// RuneSelf is the rune below which UTF-8 and the Unicode values are
	// identical.  It is also the limit for ASCII.
	RuneSelf = 0x80

	// ASCIISub is the ASCII substitution character.
	ASCIISub = '\x1a'
)
    36  
// Charmap is a structure for setting up encodings for 8-bit character sets,
// for transforming between UTF-8 and that other character set.  It has some
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
// different implementation.  This implementation uses maps, and supports
// user-defined maps.
//
// We do assume that a character map has a reasonable substitution character,
// and that valid encodings are stable (exactly a 1:1 map) and stateless
// (that is, there is no shift character or anything like that).  Hence this
// approach will not work for many East Asian character sets.
//
// Measurement shows little or no measurable difference in the performance of
// the two approaches.  The difference was down to a couple of nsec/op, and
// no consistent pattern as to which ran faster.  With the conversion to
// UTF-8 the code takes about 25 nsec/op.  The conversion in the reverse
// direction takes about 100 nsec/op.  (The larger cost for conversion
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
// to a rune before conversion.)
//
// A Charmap contains a sync.Once, so it should be used through a pointer
// and not copied once Init, NewDecoder, or NewEncoder has been called.
type Charmap struct {
	transform.NopResetter

	// Lookup tables derived from Map by initialize, which runs at most
	// once under the control of once.  bytes maps a rune to the byte that
	// encodes it, and runes holds, for every byte value, the UTF-8
	// encoding of the rune that the byte decodes to.
	bytes map[rune]byte
	runes [256][]byte
	once  sync.Once

	// The map between bytes and runes.  To indicate that a specific
	// byte value is invalid for a character set, use the rune
	// utf8.RuneError.  Values that are absent from this map will
	// be assumed to have the identity mapping -- that is, the default
	// is to assume ISO8859-1, where all 8-bit characters have the same
	// numeric value as their Unicode runes.  (Not to be confused with
	// the UTF-8 values, which *will* be different for non-ASCII runes.)
	//
	// If no values less than RuneSelf are changed (or have non-identity
	// mappings), then the character set is assumed to be an ASCII
	// superset, and certain assumptions and optimizations become
	// available for ASCII bytes.
	Map map[byte]rune

	// The ReplacementChar is the byte value to use for substitution.
	// It should normally be ASCIISub for ASCII encodings.  This may be
	// unset (left to zero) for mappings that are strictly ASCII supersets.
	// In that case ASCIISub will be assumed instead.
	ReplacementChar byte
}
    82  
// cmapDecoder is the transformer that Charmap.NewDecoder wraps in an
// encoding.Decoder.  It converts bytes of the 8-bit character set to UTF-8
// by table lookup.
type cmapDecoder struct {
	transform.NopResetter
	// runes holds, for each input byte value, the UTF-8 byte sequence
	// that is written to the output for that byte.
	runes [256][]byte
}
    87  
// cmapEncoder is the transformer that Charmap.NewEncoder wraps in an
// encoding.Encoder.  It converts UTF-8 to the 8-bit character set.
type cmapEncoder struct {
	transform.NopResetter
	// bytes maps a decoded rune to its output byte; replace is the byte
	// written for any rune that has no entry in bytes.
	bytes   map[rune]byte
	replace byte
}
    93  
    94  // Init initializes internal values of a character map.  This should
    95  // be done early, to minimize the cost of allocation of transforms
    96  // later.  It is not strictly necessary however, as the allocation
    97  // functions will arrange to call it if it has not already been done.
    98  func (c *Charmap) Init() {
    99  	c.once.Do(c.initialize)
   100  }
   101  
   102  func (c *Charmap) initialize() {
   103  	c.bytes = make(map[rune]byte)
   104  	ascii := true
   105  
   106  	for i := 0; i < 256; i++ {
   107  		r, ok := c.Map[byte(i)]
   108  		if !ok {
   109  			r = rune(i)
   110  		}
   111  		if r < 128 && r != rune(i) {
   112  			ascii = false
   113  		}
   114  		if r != RuneError {
   115  			c.bytes[r] = byte(i)
   116  		}
   117  		utf := make([]byte, utf8.RuneLen(r))
   118  		utf8.EncodeRune(utf, r)
   119  		c.runes[i] = utf
   120  	}
   121  	if ascii && c.ReplacementChar == '\x00' {
   122  		c.ReplacementChar = ASCIISub
   123  	}
   124  }
   125  
   126  // NewDecoder returns a Decoder the converts from the 8-bit
   127  // character set to UTF-8.  Unknown mappings, if any, are mapped
   128  // to '\uFFFD'.
   129  func (c *Charmap) NewDecoder() *encoding.Decoder {
   130  	c.Init()
   131  	return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
   132  }
   133  
   134  // NewEncoder returns a Transformer that converts from UTF8 to the
   135  // 8-bit character set.  Unknown mappings are mapped to 0x1A.
   136  func (c *Charmap) NewEncoder() *encoding.Encoder {
   137  	c.Init()
   138  	return &encoding.Encoder{
   139  		Transformer: &cmapEncoder{
   140  			bytes:   c.bytes,
   141  			replace: c.ReplacementChar,
   142  		},
   143  	}
   144  }
   145  
   146  func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
   147  	var e error
   148  	var ndst, nsrc int
   149  
   150  	for _, c := range src {
   151  		b := d.runes[c]
   152  		l := len(b)
   153  
   154  		if ndst+l > len(dst) {
   155  			e = transform.ErrShortDst
   156  			break
   157  		}
   158  		for i := 0; i < l; i++ {
   159  			dst[ndst] = b[i]
   160  			ndst++
   161  		}
   162  		nsrc++
   163  	}
   164  	return ndst, nsrc, e
   165  }
   166  
   167  func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
   168  	var e error
   169  	var ndst, nsrc int
   170  	for nsrc < len(src) {
   171  		if ndst >= len(dst) {
   172  			e = transform.ErrShortDst
   173  			break
   174  		}
   175  
   176  		r, sz := utf8.DecodeRune(src[nsrc:])
   177  		if r == utf8.RuneError && sz == 1 {
   178  			// If its inconclusive due to insufficient data in
   179  			// in the source, report it
   180  			if !atEOF && !utf8.FullRune(src[nsrc:]) {
   181  				e = transform.ErrShortSrc
   182  				break
   183  			}
   184  		}
   185  
   186  		if c, ok := d.bytes[r]; ok {
   187  			dst[ndst] = c
   188  		} else {
   189  			dst[ndst] = d.replace
   190  		}
   191  		nsrc += sz
   192  		ndst++
   193  	}
   194  
   195  	return ndst, nsrc, e
   196  }
   197
View as plain text