...

Source file src/github.com/dimchansky/utfbom/utfbom.go

Documentation: github.com/dimchansky/utfbom

     1  // Package utfbom implements the detection of the BOM (Unicode Byte Order Mark) and removing as necessary.
     2  // It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader
     3  // interface but provides automatic BOM checking and removing as necessary.
     4  package utfbom
     5  
     6  import (
     7  	"errors"
     8  	"io"
     9  )
    10  
// Encoding identifies the UTF encoding detected from a BOM.
// It is a distinct defined type with underlying type int (not a Go type alias),
// and it carries the String method declared in this package.
type Encoding int
    13  
// Constants identifying the detected UTF encodings.
// Values are assigned by iota in declaration order, so Unknown is the zero value of Encoding.
const (
	// Unknown is returned when no BOM was detected.
	Unknown Encoding = iota

	// UTF8 is UTF-8; BOM bytes: EF BB BF.
	UTF8

	// UTF16BigEndian is UTF-16, big-endian; BOM bytes: FE FF.
	UTF16BigEndian

	// UTF16LittleEndian is UTF-16, little-endian; BOM bytes: FF FE.
	UTF16LittleEndian

	// UTF32BigEndian is UTF-32, big-endian; BOM bytes: 00 00 FE FF.
	UTF32BigEndian

	// UTF32LittleEndian is UTF-32, little-endian; BOM bytes: FF FE 00 00.
	UTF32LittleEndian
)
    34  
    35  // String returns a user-friendly string representation of the encoding. Satisfies fmt.Stringer interface.
    36  func (e Encoding) String() string {
    37  	switch e {
    38  	case UTF8:
    39  		return "UTF8"
    40  	case UTF16BigEndian:
    41  		return "UTF16BigEndian"
    42  	case UTF16LittleEndian:
    43  		return "UTF16LittleEndian"
    44  	case UTF32BigEndian:
    45  		return "UTF32BigEndian"
    46  	case UTF32LittleEndian:
    47  		return "UTF32LittleEndian"
    48  	default:
    49  		return "Unknown"
    50  	}
    51  }
    52  
// maxConsecutiveEmptyReads is the number of consecutive zero-byte reads that
// readBOM tolerates before it gives up with io.ErrNoProgress.
const maxConsecutiveEmptyReads = 100
    54  
    55  // Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
    56  // It also returns the encoding detected by the BOM.
    57  // If the detected encoding is not needed, you can call the SkipOnly function.
    58  func Skip(rd io.Reader) (*Reader, Encoding) {
    59  	// Is it already a Reader?
    60  	b, ok := rd.(*Reader)
    61  	if ok {
    62  		return b, Unknown
    63  	}
    64  
    65  	enc, left, err := detectUtf(rd)
    66  	return &Reader{
    67  		rd:  rd,
    68  		buf: left,
    69  		err: err,
    70  	}, enc
    71  }
    72  
    73  // SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
    74  func SkipOnly(rd io.Reader) *Reader {
    75  	r, _ := Skip(rd)
    76  	return r
    77  }
    78  
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
// Values are created by Skip or SkipOnly, which perform the BOM detection up front.
type Reader struct {
	rd  io.Reader // reader provided by the client; Read forwards to it once buf and err are exhausted
	buf []byte    // bytes read during BOM detection that follow the BOM and have not been returned yet; nil when empty
	err error     // error recorded during BOM detection; returned by Read once after buf is drained, then cleared
}
    86  
    87  // Read is an implementation of io.Reader interface.
    88  // The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
    89  func (r *Reader) Read(p []byte) (n int, err error) {
    90  	if len(p) == 0 {
    91  		return 0, nil
    92  	}
    93  
    94  	if r.buf == nil {
    95  		if r.err != nil {
    96  			return 0, r.readErr()
    97  		}
    98  
    99  		return r.rd.Read(p)
   100  	}
   101  
   102  	// copy as much as we can
   103  	n = copy(p, r.buf)
   104  	r.buf = nilIfEmpty(r.buf[n:])
   105  	return n, nil
   106  }
   107  
   108  func (r *Reader) readErr() error {
   109  	err := r.err
   110  	r.err = nil
   111  	return err
   112  }
   113  
// errNegativeRead is the value readBOM panics with when the wrapped reader's
// Read method reports a negative byte count.
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
   115  
   116  func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
   117  	buf, err = readBOM(rd)
   118  
   119  	if len(buf) >= 4 {
   120  		if isUTF32BigEndianBOM4(buf) {
   121  			return UTF32BigEndian, nilIfEmpty(buf[4:]), err
   122  		}
   123  		if isUTF32LittleEndianBOM4(buf) {
   124  			return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
   125  		}
   126  	}
   127  
   128  	if len(buf) > 2 && isUTF8BOM3(buf) {
   129  		return UTF8, nilIfEmpty(buf[3:]), err
   130  	}
   131  
   132  	if (err != nil && err != io.EOF) || (len(buf) < 2) {
   133  		return Unknown, nilIfEmpty(buf), err
   134  	}
   135  
   136  	if isUTF16BigEndianBOM2(buf) {
   137  		return UTF16BigEndian, nilIfEmpty(buf[2:]), err
   138  	}
   139  	if isUTF16LittleEndianBOM2(buf) {
   140  		return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
   141  	}
   142  
   143  	return Unknown, nilIfEmpty(buf), err
   144  }
   145  
   146  func readBOM(rd io.Reader) (buf []byte, err error) {
   147  	const maxBOMSize = 4
   148  	var bom [maxBOMSize]byte // used to read BOM
   149  
   150  	// read as many bytes as possible
   151  	for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
   152  		if n, err = rd.Read(bom[len(buf):]); n < 0 {
   153  			panic(errNegativeRead)
   154  		}
   155  		if n > 0 {
   156  			nEmpty = 0
   157  		} else {
   158  			nEmpty++
   159  			if nEmpty >= maxConsecutiveEmptyReads {
   160  				err = io.ErrNoProgress
   161  			}
   162  		}
   163  	}
   164  	return
   165  }
   166  
   167  func isUTF32BigEndianBOM4(buf []byte) bool {
   168  	return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF
   169  }
   170  
   171  func isUTF32LittleEndianBOM4(buf []byte) bool {
   172  	return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00
   173  }
   174  
   175  func isUTF8BOM3(buf []byte) bool {
   176  	return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF
   177  }
   178  
   179  func isUTF16BigEndianBOM2(buf []byte) bool {
   180  	return buf[0] == 0xFE && buf[1] == 0xFF
   181  }
   182  
   183  func isUTF16LittleEndianBOM2(buf []byte) bool {
   184  	return buf[0] == 0xFF && buf[1] == 0xFE
   185  }
   186  
   187  func nilIfEmpty(buf []byte) (res []byte) {
   188  	if len(buf) > 0 {
   189  		res = buf
   190  	}
   191  	return
   192  }
   193  

View as plain text