token_reader_default.go

Documentation: github.com/launchdarkly/go-jsonstream/v3/jreader

     1  //go:build !launchdarkly_easyjson
     2  // +build !launchdarkly_easyjson
     3  
     4  package jreader
     5  
     6  // This file defines the default implementation of the low-level JSON tokenizer. If the launchdarkly_easyjson
     7  // build tag is enabled, we use the easyjson adapter in token_reader_easyjson.go instead. These have the same
     8  // methods so the Reader code does not need to know which implementation we're using; however, we don't
     9  // actually define an interface for these, because calling the methods through an interface would limit
    10  // performance.
    11  
    12  import (
    13  	"bytes"
    14  	"io"
    15  	"strconv"
    16  	"unicode"
    17  	"unicode/utf8"
    18  )
    19  
    20  var (
    21  	tokenNull  = []byte("null")  //nolint:gochecknoglobals
    22  	tokenTrue  = []byte("true")  //nolint:gochecknoglobals
    23  	tokenFalse = []byte("false") //nolint:gochecknoglobals
    24  )
    25  
// token is a single lexical element produced by tokenReader.next: either a JSON scalar value or a
// one-character delimiter. Only the value field that corresponds to kind is populated.
type token struct {
	kind        tokenKind // which sort of token this is
	boolValue   bool      // the value, when kind is boolToken
	numberValue float64   // the value, when kind is numberToken
	stringValue []byte    // the unescaped content, when kind is stringToken; may alias the input buffer
	delimiter   byte      // the character, when kind is delimiterToken: one of [ ] { } : ,
}
    33  
    34  type tokenKind int
    35  
    36  const (
    37  	nullToken      tokenKind = iota
    38  	boolToken      tokenKind = iota
    39  	numberToken    tokenKind = iota
    40  	stringToken    tokenKind = iota
    41  	delimiterToken tokenKind = iota
    42  )
    43  
    44  func (t token) valueKind() ValueKind {
    45  	if t.kind == delimiterToken {
    46  		if t.delimiter == '[' {
    47  			return ArrayValue
    48  		}
    49  		if t.delimiter == '{' {
    50  			return ObjectValue
    51  		}
    52  	}
    53  	return valueKindFromTokenKind(t.kind)
    54  }
    55  
    56  func (t token) description() string {
    57  	if t.kind == delimiterToken && t.delimiter != '[' && t.delimiter != '{' {
    58  		return "'" + string(t.delimiter) + "'"
    59  	}
    60  	return t.valueKind().String()
    61  }
    62  
// tokenReader is the default low-level JSON tokenizer. It scans an in-memory byte slice and supports
// a single token of lookahead via putBack.
type tokenReader struct {
	data        []byte // the complete input being tokenized
	pos         int    // index in data of the next byte to be read
	len         int    // len(data), captured by newTokenReader
	hasUnread   bool   // true while unreadToken holds a token that was put back
	unreadToken token  // the put-back token, returned by the next call to next
	lastPos     int    // offset in data of the most recent non-whitespace byte that started a token
}
    71  
    72  func newTokenReader(data []byte) tokenReader {
    73  	tr := tokenReader{
    74  		data: data,
    75  		pos:  0,
    76  		len:  len(data),
    77  	}
    78  	return tr
    79  }
    80  
    81  // EOF returns true if we are at the end of the input (not counting whitespace).
    82  func (r *tokenReader) EOF() bool {
    83  	if r.hasUnread {
    84  		return false
    85  	}
    86  	_, ok := r.skipWhitespaceAndReadByte()
    87  	if !ok {
    88  		return true
    89  	}
    90  	r.unreadByte()
    91  	return false
    92  }
    93  
// LastPos returns the byte offset within the input where we most recently started parsing a token,
// i.e. the offset of the last non-whitespace byte returned by skipWhitespaceAndReadByte.
func (r *tokenReader) LastPos() int {
	return r.lastPos
}
    98  
    99  func (r *tokenReader) getPos() int {
   100  	if r.hasUnread {
   101  		return r.lastPos
   102  	}
   103  	return r.pos
   104  }
   105  
   106  // Null returns (true, nil) if the next token is a null (consuming the token); (false, nil) if the next
   107  // token is not a null (not consuming the token); or (false, error) if the next token is not a valid
   108  // JSON value.
   109  //
   110  // This and all other tokenReader methods skip transparently past whitespace between tokens.
   111  func (r *tokenReader) Null() (bool, error) {
   112  	t, err := r.next()
   113  	if err != nil {
   114  		return false, err
   115  	}
   116  	if t.kind == nullToken {
   117  		return true, nil
   118  	}
   119  	r.putBack(t)
   120  	if t.kind == delimiterToken && t.delimiter != '[' && t.delimiter != '{' {
   121  		return false, SyntaxError{Message: errMsgUnexpectedChar, Value: string(t.delimiter), Offset: r.getPos()}
   122  	}
   123  	return false, nil
   124  }
   125  
   126  // Bool requires that the next token is a JSON boolean, returning its value if successful (consuming
   127  // the token), or an error if the next token is anything other than a JSON boolean.
   128  //
   129  // This and all other tokenReader methods skip transparently past whitespace between tokens.
   130  func (r *tokenReader) Bool() (bool, error) {
   131  	t, err := r.consumeScalar(boolToken)
   132  	return t.boolValue, err
   133  }
   134  
   135  // Bool requires that the next token is a JSON number, returning its value if successful (consuming
   136  // the token), or an error if the next token is anything other than a JSON number.
   137  //
   138  // This and all other tokenReader methods skip transparently past whitespace between tokens.
   139  func (r *tokenReader) Number() (float64, error) {
   140  	t, err := r.consumeScalar(numberToken)
   141  	return t.numberValue, err
   142  }
   143  
   144  // String requires that the next token is a JSON string, returning its value if successful (consuming
   145  // the token), or an error if the next token is anything other than a JSON string.
   146  //
   147  // This and all other tokenReader methods skip transparently past whitespace between tokens.
   148  func (r *tokenReader) String() (string, error) {
   149  	t, err := r.consumeScalar(stringToken)
   150  	return string(t.stringValue), err
   151  }
   152  
   153  // PropertyName requires that the next token is a JSON string and the token after that is a colon,
   154  // returning the string as a byte slice if successful, or an error otherwise.
   155  //
   156  // Returning the string as a byte slice avoids the overhead of allocating a string, since normally
   157  // the names of properties will not be retained as strings but are only compared to constants while
   158  // parsing an object.
   159  //
   160  // This and all other tokenReader methods skip transparently past whitespace between tokens.
   161  func (r *tokenReader) PropertyName() ([]byte, error) {
   162  	t, err := r.consumeScalar(stringToken)
   163  	if err != nil {
   164  		return nil, err
   165  	}
   166  	b, ok := r.skipWhitespaceAndReadByte()
   167  	if !ok {
   168  		return nil, io.EOF
   169  	}
   170  	if b != ':' {
   171  		r.unreadByte()
   172  		return nil, r.syntaxErrorOnNextToken(errMsgExpectedColon)
   173  	}
   174  	return t.stringValue, nil
   175  }
   176  
   177  // Delimiter checks whether the next token is the specified ASCII delimiter character. If so, it
   178  // returns (true, nil) and consumes the token. If it is a delimiter, but not the same one, it
   179  // returns (false, nil) and does not consume the token. For anything else, it returns an error.
   180  //
   181  // This and all other tokenReader methods skip transparently past whitespace between tokens.
   182  func (r *tokenReader) Delimiter(delimiter byte) (bool, error) {
   183  	if r.hasUnread {
   184  		if r.unreadToken.kind == delimiterToken && r.unreadToken.delimiter == delimiter {
   185  			r.hasUnread = false
   186  			return true, nil
   187  		}
   188  		return false, nil
   189  	}
   190  	b, ok := r.skipWhitespaceAndReadByte()
   191  	if !ok {
   192  		return false, nil
   193  	}
   194  	if b == delimiter {
   195  		return true, nil
   196  	}
   197  	r.unreadByte() // we'll back up and try to parse a token, to see if it's valid JSON or not
   198  	token, err := r.next()
   199  	if err != nil {
   200  		return false, err // it was malformed JSON
   201  	}
   202  	r.putBack(token) // it was valid JSON, we just haven't hit that delimiter
   203  	return false, nil
   204  }
   205  
   206  // EndDelimiterOrComma checks whether the next token is the specified ASCII delimiter character
   207  // or a comma. If it is the specified delimiter, it returns (true, nil) and consumes the token.
   208  // If it is a comma, it returns (false, nil) and consumes the token. For anything else, it
   209  // returns an error. The delimiter parameter will always be either '}' or ']'.
   210  func (r *tokenReader) EndDelimiterOrComma(delimiter byte) (bool, error) {
   211  	if r.hasUnread {
   212  		if r.unreadToken.kind == delimiterToken &&
   213  			(r.unreadToken.delimiter == delimiter || r.unreadToken.delimiter == ',') {
   214  			r.hasUnread = false
   215  			return r.unreadToken.delimiter == delimiter, nil
   216  		}
   217  		return false, SyntaxError{Message: badArrayOrObjectItemMessage(delimiter == '}'),
   218  			Value: r.unreadToken.description(), Offset: r.lastPos}
   219  	}
   220  	b, ok := r.skipWhitespaceAndReadByte()
   221  	if !ok {
   222  		return false, io.EOF
   223  	}
   224  	if b == delimiter || b == ',' {
   225  		return b == delimiter, nil
   226  	}
   227  	r.unreadByte()
   228  	t, err := r.next()
   229  	if err != nil {
   230  		return false, err
   231  	}
   232  	return false, SyntaxError{Message: badArrayOrObjectItemMessage(delimiter == '}'),
   233  		Value: t.description(), Offset: r.lastPos}
   234  }
   235  
   236  func badArrayOrObjectItemMessage(isObject bool) string {
   237  	if isObject {
   238  		return errMsgBadObjectItem
   239  	}
   240  	return errMsgBadArrayItem
   241  }
   242  
   243  // Any checks whether the next token is either a valid JSON scalar value or the opening delimiter of
   244  // an array or object value. If so, it returns (AnyValue, nil) and consumes the token; if not, it
   245  // returns an error. Unlike Reader.Any(), for array and object values it does not create an
   246  // ArrayState or ObjectState.
   247  func (r *tokenReader) Any() (AnyValue, error) {
   248  	t, err := r.next()
   249  	if err != nil {
   250  		return AnyValue{}, err
   251  	}
   252  	switch t.kind {
   253  	case boolToken:
   254  		return AnyValue{Kind: BoolValue, Bool: t.boolValue}, nil
   255  	case numberToken:
   256  		return AnyValue{Kind: NumberValue, Number: t.numberValue}, nil
   257  	case stringToken:
   258  		return AnyValue{Kind: StringValue, String: string(t.stringValue)}, nil
   259  	case delimiterToken:
   260  		if t.delimiter == '[' {
   261  			return AnyValue{Kind: ArrayValue}, nil
   262  		}
   263  		if t.delimiter == '{' {
   264  			return AnyValue{Kind: ObjectValue}, nil
   265  		}
   266  		return AnyValue{},
   267  			SyntaxError{Message: errMsgUnexpectedChar, Value: string(t.delimiter), Offset: r.lastPos}
   268  	default:
   269  		return AnyValue{Kind: NullValue}, nil
   270  	}
   271  }
   272  
   273  // Attempts to parse and consume the next token, ignoring whitespace. A token is either a valid JSON scalar
   274  // value or an ASCII delimiter character. If a token was previously unread using putBack, it consumes that
   275  // instead.
   276  func (r *tokenReader) next() (token, error) {
   277  	if r.hasUnread {
   278  		r.hasUnread = false
   279  		return r.unreadToken, nil
   280  	}
   281  	b, ok := r.skipWhitespaceAndReadByte()
   282  	if !ok {
   283  		return token{}, io.EOF
   284  	}
   285  
   286  	switch {
   287  	// We can get away with reading bytes instead of runes because the JSON spec doesn't allow multi-byte
   288  	// characters except within a string literal.
   289  	case b >= 'a' && b <= 'z':
   290  		n := r.consumeASCIILowercaseAlphabeticChars() + 1
   291  		id := r.data[r.lastPos : r.lastPos+n]
   292  		if b == 'f' && bytes.Equal(id, tokenFalse) {
   293  			return token{kind: boolToken, boolValue: false}, nil
   294  		}
   295  		if b == 't' && bytes.Equal(id, tokenTrue) {
   296  			return token{kind: boolToken, boolValue: true}, nil
   297  		}
   298  		if b == 'n' && bytes.Equal(id, tokenNull) {
   299  			return token{kind: nullToken}, nil
   300  		}
   301  		return token{}, SyntaxError{Message: errMsgUnexpectedSymbol, Value: string(id), Offset: r.lastPos}
   302  	case (b >= '0' && b <= '9') || b == '-':
   303  		if n, ok := r.readNumber(b); ok {
   304  			return token{kind: numberToken, numberValue: n}, nil
   305  		}
   306  		return token{}, SyntaxError{Message: errMsgInvalidNumber, Offset: r.lastPos}
   307  	case b == '"':
   308  		s, err := r.readString()
   309  		if err != nil {
   310  			return token{}, err
   311  		}
   312  		return token{kind: stringToken, stringValue: s}, nil
   313  	case b == '[', b == ']', b == '{', b == '}', b == ':', b == ',':
   314  		return token{kind: delimiterToken, delimiter: b}, nil
   315  	}
   316  
   317  	return token{}, SyntaxError{Message: errMsgUnexpectedChar, Value: string(b), Offset: r.lastPos}
   318  }
   319  
// putBack stores a token so that the following call to next returns it instead of reading from the
// input. Only one token is held at a time; a second putBack overwrites the first.
func (r *tokenReader) putBack(token token) {
	r.unreadToken = token
	r.hasUnread = true
}
   324  
   325  func (r *tokenReader) consumeScalar(kind tokenKind) (token, error) {
   326  	t, err := r.next()
   327  	if err != nil {
   328  		return token{}, err
   329  	}
   330  	if t.kind == kind {
   331  		return t, nil
   332  	}
   333  	if t.kind == delimiterToken && t.delimiter != '[' && t.delimiter != '{' {
   334  		return token{}, SyntaxError{Message: errMsgUnexpectedChar, Value: string(t.delimiter), Offset: r.LastPos()}
   335  	}
   336  	return token{}, TypeError{Expected: valueKindFromTokenKind(kind),
   337  		Actual: t.valueKind(), Offset: r.LastPos()}
   338  }
   339  
   340  func (r *tokenReader) readByte() (byte, bool) {
   341  	if r.pos >= r.len {
   342  		return 0, false
   343  	}
   344  	b := r.data[r.pos]
   345  	r.pos++
   346  	return b, true
   347  }
   348  
// unreadByte moves the read position back by one byte. It performs no bounds check, so it must only
// be called after a successful readByte.
func (r *tokenReader) unreadByte() {
	r.pos--
}
   352  
   353  func (r *tokenReader) skipWhitespaceAndReadByte() (byte, bool) {
   354  	for {
   355  		ch, ok := r.readByte()
   356  		if !ok {
   357  			return 0, false
   358  		}
   359  		if !unicode.IsSpace(rune(ch)) {
   360  			r.lastPos = r.pos - 1
   361  			return ch, true
   362  		}
   363  	}
   364  }
   365  
   366  func (r *tokenReader) consumeASCIILowercaseAlphabeticChars() int {
   367  	n := 0
   368  	for {
   369  		ch, ok := r.readByte()
   370  		if !ok {
   371  			break
   372  		}
   373  		if ch < 'a' || ch > 'z' {
   374  			r.unreadByte()
   375  			break
   376  		}
   377  		n++
   378  	}
   379  	return n
   380  }
   381  
   382  func (r *tokenReader) readNumber(first byte) (float64, bool) { //nolint:unparam
   383  	startPos := r.lastPos
   384  	isFloat := false
   385  	var ch byte
   386  	var ok bool
   387  	for {
   388  		ch, ok = r.readByte()
   389  		if !ok {
   390  			break
   391  		}
   392  		if (ch < '0' || ch > '9') && !(ch == '.' && !isFloat) {
   393  			break
   394  		}
   395  		if ch == '.' {
   396  			isFloat = true
   397  		}
   398  	}
   399  	hasExponent := false
   400  	if ch == 'e' || ch == 'E' {
   401  		// exponent must match this regex: [eE][-+]?[0-9]+
   402  		ch, ok = r.readByte()
   403  		if !ok {
   404  			return 0, false
   405  		}
   406  		if ch == '+' || ch == '-' { //nolint:gocritic
   407  		} else if ch >= '0' && ch <= '9' {
   408  			r.unreadByte()
   409  		} else {
   410  			return 0, false
   411  		}
   412  		for {
   413  			ch, ok = r.readByte()
   414  			if !ok {
   415  				break
   416  			}
   417  			if ch < '0' || ch > '9' {
   418  				r.unreadByte()
   419  				break
   420  			}
   421  			hasExponent = true
   422  		}
   423  		if !hasExponent {
   424  			return 0, false
   425  		}
   426  		isFloat = true
   427  	} else { //nolint:gocritic
   428  		if ok {
   429  			r.unreadByte()
   430  		}
   431  	}
   432  	chars := r.data[startPos:r.pos]
   433  	if isFloat {
   434  		// Unfortunately, strconv.ParseFloat requires a string - there is no []byte equivalent. This means we can't
   435  		// avoid a heap allocation here. Easyjson works around this by creating an unsafe string that points directly
   436  		// at the existing bytes, but in our default implementation we can't use unsafe.
   437  		n, err := strconv.ParseFloat(string(chars), 64)
   438  		return n, err == nil
   439  	} else { //nolint:revive
   440  		n, ok := parseIntFromBytes(chars)
   441  		return float64(n), ok
   442  	}
   443  }
   444  
   445  func (r *tokenReader) readString() ([]byte, error) {
   446  	startPos := r.pos // the opening quote mark has already been read
   447  	var chars []byte
   448  	haveEscaped := false
   449  	var reader bytes.Reader // bytes.Reader understands multi-byte characters
   450  	reader.Reset(r.data)
   451  	_, _ = reader.Seek(int64(r.pos), io.SeekStart)
   452  
   453  	for {
   454  		ch, _, err := reader.ReadRune()
   455  		if err != nil {
   456  			return nil, r.syntaxErrorOnLastToken(errMsgInvalidString)
   457  		}
   458  		if ch == '"' {
   459  			break
   460  		}
   461  		if ch != '\\' {
   462  			if haveEscaped {
   463  				chars = appendRune(chars, ch)
   464  			}
   465  			continue
   466  		}
   467  		if !haveEscaped {
   468  			pos := (r.len - reader.Len()) - 1 // don't include the backslash we just read
   469  			chars = make([]byte, pos-startPos, pos-startPos+20)
   470  			if pos > startPos {
   471  				copy(chars, r.data[startPos:pos])
   472  			}
   473  			haveEscaped = true
   474  		}
   475  		ch, _, err = reader.ReadRune()
   476  		if err != nil {
   477  			return nil, r.syntaxErrorOnLastToken(errMsgInvalidString)
   478  		}
   479  		switch ch {
   480  		case '"', '\\', '/':
   481  			chars = appendRune(chars, ch)
   482  		case 'b':
   483  			chars = appendRune(chars, '\b')
   484  		case 'f':
   485  			chars = appendRune(chars, '\f')
   486  		case 'n':
   487  			chars = appendRune(chars, '\n')
   488  		case 'r':
   489  			chars = appendRune(chars, '\r')
   490  		case 't':
   491  			chars = appendRune(chars, '\t')
   492  		case 'u':
   493  			if ch, ok := readHexChar(&reader); ok {
   494  				chars = appendRune(chars, ch)
   495  			} else {
   496  				return nil, r.syntaxErrorOnLastToken(errMsgInvalidString)
   497  			}
   498  		default:
   499  			return nil, r.syntaxErrorOnLastToken(errMsgInvalidString)
   500  		}
   501  	}
   502  	r.pos = r.len - reader.Len()
   503  	if haveEscaped {
   504  		if len(chars) == 0 {
   505  			return nil, nil
   506  		}
   507  		return chars, nil
   508  	} else { //nolint:revive
   509  		pos := r.pos - 1
   510  		if pos <= startPos {
   511  			return nil, nil
   512  		}
   513  		return r.data[startPos:pos], nil
   514  	}
   515  }
   516  
   517  func readHexChar(reader *bytes.Reader) (rune, bool) {
   518  	var digits [4]byte
   519  	for i := 0; i < 4; i++ {
   520  		ch, err := reader.ReadByte()
   521  		if err != nil || !((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')) {
   522  			return 0, false
   523  		}
   524  		digits[i] = ch
   525  	}
   526  	n, _ := strconv.ParseUint(string(digits[:]), 16, 32)
   527  	return rune(n), true
   528  }
   529  
// syntaxErrorOnLastToken builds a SyntaxError with the given message whose offset is the start of
// the token most recently begun (LastPos). No Value is attached.
func (r *tokenReader) syntaxErrorOnLastToken(msg string) error { //nolint:unparam
	return SyntaxError{Message: msg, Offset: r.LastPos()}
}
   533  
   534  func (r *tokenReader) syntaxErrorOnNextToken(msg string) error {
   535  	t, err := r.next()
   536  	if err != nil {
   537  		return err
   538  	}
   539  	return SyntaxError{Message: msg, Value: t.description(), Offset: r.LastPos()}
   540  }
   541  
   542  // This is faster than creating a string to pass to strconv.Atoi.
   543  func parseIntFromBytes(chars []byte) (int64, bool) {
   544  	negate := false
   545  	p := 0
   546  	var ret int64
   547  	if len(chars) == 0 {
   548  		return 0, false
   549  	}
   550  	if chars[0] == '-' {
   551  		negate = true
   552  		p++
   553  		if p == len(chars) {
   554  			return 0, false
   555  		}
   556  	}
   557  	for p < len(chars) {
   558  		ret = ret*10 + int64(chars[p]-'0')
   559  		p++
   560  	}
   561  	if negate {
   562  		ret = -ret
   563  	}
   564  	return ret, true
   565  }
   566  
   567  func appendRune(out []byte, ch rune) []byte {
   568  	var encodedRune [10]byte
   569  	n := utf8.EncodeRune(encodedRune[0:10], ch)
   570  	return append(out, encodedRune[0:n]...)
   571  }
   572  
   573  func valueKindFromTokenKind(k tokenKind) ValueKind {
   574  	switch k {
   575  	case nullToken:
   576  		return NullValue
   577  	case boolToken:
   578  		return BoolValue
   579  	case numberToken:
   580  		return NumberValue
   581  	case stringToken:
   582  		return StringValue
   583  	}
   584  	return -1
   585  }
   586
View as plain text