lex.go

Documentation: github.com/BurntSushi/toml

     1  package toml
     2  
     3  import (
     4  	"fmt"
     5  	"reflect"
     6  	"runtime"
     7  	"strings"
     8  	"unicode"
     9  	"unicode/utf8"
    10  )
    11  
// itemType identifies the kind of an item emitted by the lexer.
type itemType int

// The item types. itemError is the zero value of itemType.
const (
	// itemError marks an item that carries a lexing error in its err field.
	itemError itemType = iota
	itemNIL            // used in the parser to indicate no type
	// itemEOF is emitted once the end of the input has been reached.
	itemEOF
	// itemText is emitted for a bare name (one part of a key or table name).
	itemText
	// String items carry the text between the quotes; the quotes themselves
	// are not part of the value.
	itemString
	itemRawString
	itemMultilineString
	itemRawMultilineString
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	// Start/end markers emitted around tables, arrays of tables, keys and
	// inline tables.
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
    39  
// eof is the rune value returned by next() once the whole input has been
// consumed.
const eof = 0

// stateFn is one state of the lexer's state machine. It receives the lexer
// and returns the state to run next; the error helpers and the EOF paths in
// this file return nil.
type stateFn func(lx *lexer) stateFn
    43  
    44  func (p Position) String() string {
    45  	return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len)
    46  }
    47  
// lexer holds the state of the TOML lexer: the input being scanned, the
// position within it, and the state machine that produces items.
type lexer struct {
	// input is the text being lexed.
	input    string
	// start is the byte offset in input where the pending item begins.
	start    int
	// pos is the byte offset in input of the next rune to be read.
	pos      int
	// line is the current line number; lex() starts it at 1.
	line     int
	// state is the state function that nextItem runs next.
	state    stateFn
	// items queues emitted items until nextItem hands them out.
	items    chan item
	// tomlNext is set by lex(). In this file it gates the \e and \x escapes
	// and newlines and trailing commas in inline tables, and it is passed on
	// to isBareKeyChar.
	tomlNext bool

	// Allow for backing up up to 4 runes. This is necessary because TOML
	// contains 3-rune tokens (""" and ''').
	prevWidths [4]int
	nprev      int  // how many of prevWidths are in use
	atEOF      bool // If we emit an eof, we can still back up, but it is not OK to call next again.

	// A stack of state functions used to maintain context.
	//
	// The idea is to reuse parts of the state machine in various places. For
	// example, values can appear at the top level or within arbitrarily nested
	// arrays. The last state on the stack is used after a value has been lexed.
	// Similarly for comments.
	stack []stateFn
}
    71  
// item is a single token produced by the lexer and handed out by nextItem.
type item struct {
	typ itemType // kind of item
	val string   // pending input text of the item; not set on the error items built in this file
	err error    // the error carried by an itemError item
	pos Position // line, start offset and length of the item in the input
}
    78  
    79  func (lx *lexer) nextItem() item {
    80  	for {
    81  		select {
    82  		case item := <-lx.items:
    83  			return item
    84  		default:
    85  			lx.state = lx.state(lx)
    86  			//fmt.Printf("     STATE %-24s  current: %-10s	stack: %s\n", lx.state, lx.current(), lx.stack)
    87  		}
    88  	}
    89  }
    90  
    91  func lex(input string, tomlNext bool) *lexer {
    92  	lx := &lexer{
    93  		input:    input,
    94  		state:    lexTop,
    95  		items:    make(chan item, 10),
    96  		stack:    make([]stateFn, 0, 10),
    97  		line:     1,
    98  		tomlNext: tomlNext,
    99  	}
   100  	return lx
   101  }
   102  
   103  func (lx *lexer) push(state stateFn) {
   104  	lx.stack = append(lx.stack, state)
   105  }
   106  
   107  func (lx *lexer) pop() stateFn {
   108  	if len(lx.stack) == 0 {
   109  		return lx.errorf("BUG in lexer: no states to pop")
   110  	}
   111  	last := lx.stack[len(lx.stack)-1]
   112  	lx.stack = lx.stack[0 : len(lx.stack)-1]
   113  	return last
   114  }
   115  
   116  func (lx *lexer) current() string {
   117  	return lx.input[lx.start:lx.pos]
   118  }
   119  
   120  func (lx lexer) getPos() Position {
   121  	p := Position{
   122  		Line:  lx.line,
   123  		Start: lx.start,
   124  		Len:   lx.pos - lx.start,
   125  	}
   126  	if p.Len <= 0 {
   127  		p.Len = 1
   128  	}
   129  	return p
   130  }
   131  
   132  func (lx *lexer) emit(typ itemType) {
   133  	// Needed for multiline strings ending with an incomplete UTF-8 sequence.
   134  	if lx.start > lx.pos {
   135  		lx.error(errLexUTF8{lx.input[lx.pos]})
   136  		return
   137  	}
   138  	lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()}
   139  	lx.start = lx.pos
   140  }
   141  
   142  func (lx *lexer) emitTrim(typ itemType) {
   143  	lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())}
   144  	lx.start = lx.pos
   145  }
   146  
   147  func (lx *lexer) next() (r rune) {
   148  	if lx.atEOF {
   149  		panic("BUG in lexer: next called after EOF")
   150  	}
   151  	if lx.pos >= len(lx.input) {
   152  		lx.atEOF = true
   153  		return eof
   154  	}
   155  
   156  	if lx.input[lx.pos] == '\n' {
   157  		lx.line++
   158  	}
   159  	lx.prevWidths[3] = lx.prevWidths[2]
   160  	lx.prevWidths[2] = lx.prevWidths[1]
   161  	lx.prevWidths[1] = lx.prevWidths[0]
   162  	if lx.nprev < 4 {
   163  		lx.nprev++
   164  	}
   165  
   166  	r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
   167  	if r == utf8.RuneError {
   168  		lx.error(errLexUTF8{lx.input[lx.pos]})
   169  		return utf8.RuneError
   170  	}
   171  
   172  	// Note: don't use peek() here, as this calls next().
   173  	if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) {
   174  		lx.errorControlChar(r)
   175  		return utf8.RuneError
   176  	}
   177  
   178  	lx.prevWidths[0] = w
   179  	lx.pos += w
   180  	return r
   181  }
   182  
   183  // ignore skips over the pending input before this point.
   184  func (lx *lexer) ignore() {
   185  	lx.start = lx.pos
   186  }
   187  
   188  // backup steps back one rune. Can be called 4 times between calls to next.
   189  func (lx *lexer) backup() {
   190  	if lx.atEOF {
   191  		lx.atEOF = false
   192  		return
   193  	}
   194  	if lx.nprev < 1 {
   195  		panic("BUG in lexer: backed up too far")
   196  	}
   197  	w := lx.prevWidths[0]
   198  	lx.prevWidths[0] = lx.prevWidths[1]
   199  	lx.prevWidths[1] = lx.prevWidths[2]
   200  	lx.prevWidths[2] = lx.prevWidths[3]
   201  	lx.nprev--
   202  
   203  	lx.pos -= w
   204  	if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
   205  		lx.line--
   206  	}
   207  }
   208  
   209  // accept consumes the next rune if it's equal to `valid`.
   210  func (lx *lexer) accept(valid rune) bool {
   211  	if lx.next() == valid {
   212  		return true
   213  	}
   214  	lx.backup()
   215  	return false
   216  }
   217  
   218  // peek returns but does not consume the next rune in the input.
   219  func (lx *lexer) peek() rune {
   220  	r := lx.next()
   221  	lx.backup()
   222  	return r
   223  }
   224  
   225  // skip ignores all input that matches the given predicate.
   226  func (lx *lexer) skip(pred func(rune) bool) {
   227  	for {
   228  		r := lx.next()
   229  		if pred(r) {
   230  			continue
   231  		}
   232  		lx.backup()
   233  		lx.ignore()
   234  		return
   235  	}
   236  }
   237  
   238  // error stops all lexing by emitting an error and returning `nil`.
   239  //
   240  // Note that any value that is a character is escaped if it's a special
   241  // character (newlines, tabs, etc.).
   242  func (lx *lexer) error(err error) stateFn {
   243  	if lx.atEOF {
   244  		return lx.errorPrevLine(err)
   245  	}
   246  	lx.items <- item{typ: itemError, pos: lx.getPos(), err: err}
   247  	return nil
   248  }
   249  
   250  // errorfPrevline is like error(), but sets the position to the last column of
   251  // the previous line.
   252  //
   253  // This is so that unexpected EOF or NL errors don't show on a new blank line.
   254  func (lx *lexer) errorPrevLine(err error) stateFn {
   255  	pos := lx.getPos()
   256  	pos.Line--
   257  	pos.Len = 1
   258  	pos.Start = lx.pos - 1
   259  	lx.items <- item{typ: itemError, pos: pos, err: err}
   260  	return nil
   261  }
   262  
   263  // errorPos is like error(), but allows explicitly setting the position.
   264  func (lx *lexer) errorPos(start, length int, err error) stateFn {
   265  	pos := lx.getPos()
   266  	pos.Start = start
   267  	pos.Len = length
   268  	lx.items <- item{typ: itemError, pos: pos, err: err}
   269  	return nil
   270  }
   271  
   272  // errorf is like error, and creates a new error.
   273  func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
   274  	if lx.atEOF {
   275  		pos := lx.getPos()
   276  		pos.Line--
   277  		pos.Len = 1
   278  		pos.Start = lx.pos - 1
   279  		lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)}
   280  		return nil
   281  	}
   282  	lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)}
   283  	return nil
   284  }
   285  
   286  func (lx *lexer) errorControlChar(cc rune) stateFn {
   287  	return lx.errorPos(lx.pos-1, 1, errLexControl{cc})
   288  }
   289  
   290  // lexTop consumes elements at the top level of TOML data.
   291  func lexTop(lx *lexer) stateFn {
   292  	r := lx.next()
   293  	if isWhitespace(r) || isNL(r) {
   294  		return lexSkip(lx, lexTop)
   295  	}
   296  	switch r {
   297  	case '#':
   298  		lx.push(lexTop)
   299  		return lexCommentStart
   300  	case '[':
   301  		return lexTableStart
   302  	case eof:
   303  		if lx.pos > lx.start {
   304  			return lx.errorf("unexpected EOF")
   305  		}
   306  		lx.emit(itemEOF)
   307  		return nil
   308  	}
   309  
   310  	// At this point, the only valid item can be a key, so we back up
   311  	// and let the key lexer do the rest.
   312  	lx.backup()
   313  	lx.push(lexTopEnd)
   314  	return lexKeyStart
   315  }
   316  
   317  // lexTopEnd is entered whenever a top-level item has been consumed. (A value
   318  // or a table.) It must see only whitespace, and will turn back to lexTop
   319  // upon a newline. If it sees EOF, it will quit the lexer successfully.
   320  func lexTopEnd(lx *lexer) stateFn {
   321  	r := lx.next()
   322  	switch {
   323  	case r == '#':
   324  		// a comment will read to a newline for us.
   325  		lx.push(lexTop)
   326  		return lexCommentStart
   327  	case isWhitespace(r):
   328  		return lexTopEnd
   329  	case isNL(r):
   330  		lx.ignore()
   331  		return lexTop
   332  	case r == eof:
   333  		lx.emit(itemEOF)
   334  		return nil
   335  	}
   336  	return lx.errorf(
   337  		"expected a top-level item to end with a newline, comment, or EOF, but got %q instead",
   338  		r)
   339  }
   340  
   341  // lexTable lexes the beginning of a table. Namely, it makes sure that
   342  // it starts with a character other than '.' and ']'.
   343  // It assumes that '[' has already been consumed.
   344  // It also handles the case that this is an item in an array of tables.
   345  // e.g., '[[name]]'.
   346  func lexTableStart(lx *lexer) stateFn {
   347  	if lx.peek() == '[' {
   348  		lx.next()
   349  		lx.emit(itemArrayTableStart)
   350  		lx.push(lexArrayTableEnd)
   351  	} else {
   352  		lx.emit(itemTableStart)
   353  		lx.push(lexTableEnd)
   354  	}
   355  	return lexTableNameStart
   356  }
   357  
   358  func lexTableEnd(lx *lexer) stateFn {
   359  	lx.emit(itemTableEnd)
   360  	return lexTopEnd
   361  }
   362  
   363  func lexArrayTableEnd(lx *lexer) stateFn {
   364  	if r := lx.next(); r != ']' {
   365  		return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r)
   366  	}
   367  	lx.emit(itemArrayTableEnd)
   368  	return lexTopEnd
   369  }
   370  
   371  func lexTableNameStart(lx *lexer) stateFn {
   372  	lx.skip(isWhitespace)
   373  	switch r := lx.peek(); {
   374  	case r == ']' || r == eof:
   375  		return lx.errorf("unexpected end of table name (table names cannot be empty)")
   376  	case r == '.':
   377  		return lx.errorf("unexpected table separator (table names cannot be empty)")
   378  	case r == '"' || r == '\'':
   379  		lx.ignore()
   380  		lx.push(lexTableNameEnd)
   381  		return lexQuotedName
   382  	default:
   383  		lx.push(lexTableNameEnd)
   384  		return lexBareName
   385  	}
   386  }
   387  
   388  // lexTableNameEnd reads the end of a piece of a table name, optionally
   389  // consuming whitespace.
   390  func lexTableNameEnd(lx *lexer) stateFn {
   391  	lx.skip(isWhitespace)
   392  	switch r := lx.next(); {
   393  	case isWhitespace(r):
   394  		return lexTableNameEnd
   395  	case r == '.':
   396  		lx.ignore()
   397  		return lexTableNameStart
   398  	case r == ']':
   399  		return lx.pop()
   400  	default:
   401  		return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
   402  	}
   403  }
   404  
   405  // lexBareName lexes one part of a key or table.
   406  //
   407  // It assumes that at least one valid character for the table has already been
   408  // read.
   409  //
   410  // Lexes only one part, e.g. only 'a' inside 'a.b'.
   411  func lexBareName(lx *lexer) stateFn {
   412  	r := lx.next()
   413  	if isBareKeyChar(r, lx.tomlNext) {
   414  		return lexBareName
   415  	}
   416  	lx.backup()
   417  	lx.emit(itemText)
   418  	return lx.pop()
   419  }
   420  
   421  // lexBareName lexes one part of a key or table.
   422  //
   423  // It assumes that at least one valid character for the table has already been
   424  // read.
   425  //
   426  // Lexes only one part, e.g. only '"a"' inside '"a".b'.
   427  func lexQuotedName(lx *lexer) stateFn {
   428  	r := lx.next()
   429  	switch {
   430  	case isWhitespace(r):
   431  		return lexSkip(lx, lexValue)
   432  	case r == '"':
   433  		lx.ignore() // ignore the '"'
   434  		return lexString
   435  	case r == '\'':
   436  		lx.ignore() // ignore the "'"
   437  		return lexRawString
   438  	case r == eof:
   439  		return lx.errorf("unexpected EOF; expected value")
   440  	default:
   441  		return lx.errorf("expected value but found %q instead", r)
   442  	}
   443  }
   444  
   445  // lexKeyStart consumes all key parts until a '='.
   446  func lexKeyStart(lx *lexer) stateFn {
   447  	lx.skip(isWhitespace)
   448  	switch r := lx.peek(); {
   449  	case r == '=' || r == eof:
   450  		return lx.errorf("unexpected '=': key name appears blank")
   451  	case r == '.':
   452  		return lx.errorf("unexpected '.': keys cannot start with a '.'")
   453  	case r == '"' || r == '\'':
   454  		lx.ignore()
   455  		fallthrough
   456  	default: // Bare key
   457  		lx.emit(itemKeyStart)
   458  		return lexKeyNameStart
   459  	}
   460  }
   461  
   462  func lexKeyNameStart(lx *lexer) stateFn {
   463  	lx.skip(isWhitespace)
   464  	switch r := lx.peek(); {
   465  	case r == '=' || r == eof:
   466  		return lx.errorf("unexpected '='")
   467  	case r == '.':
   468  		return lx.errorf("unexpected '.'")
   469  	case r == '"' || r == '\'':
   470  		lx.ignore()
   471  		lx.push(lexKeyEnd)
   472  		return lexQuotedName
   473  	default:
   474  		lx.push(lexKeyEnd)
   475  		return lexBareName
   476  	}
   477  }
   478  
   479  // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
   480  // separator).
   481  func lexKeyEnd(lx *lexer) stateFn {
   482  	lx.skip(isWhitespace)
   483  	switch r := lx.next(); {
   484  	case isWhitespace(r):
   485  		return lexSkip(lx, lexKeyEnd)
   486  	case r == eof:
   487  		return lx.errorf("unexpected EOF; expected key separator '='")
   488  	case r == '.':
   489  		lx.ignore()
   490  		return lexKeyNameStart
   491  	case r == '=':
   492  		lx.emit(itemKeyEnd)
   493  		return lexSkip(lx, lexValue)
   494  	default:
   495  		return lx.errorf("expected '.' or '=', but got %q instead", r)
   496  	}
   497  }
   498  
   499  // lexValue starts the consumption of a value anywhere a value is expected.
   500  // lexValue will ignore whitespace.
   501  // After a value is lexed, the last state on the next is popped and returned.
   502  func lexValue(lx *lexer) stateFn {
   503  	// We allow whitespace to precede a value, but NOT newlines.
   504  	// In array syntax, the array states are responsible for ignoring newlines.
   505  	r := lx.next()
   506  	switch {
   507  	case isWhitespace(r):
   508  		return lexSkip(lx, lexValue)
   509  	case isDigit(r):
   510  		lx.backup() // avoid an extra state and use the same as above
   511  		return lexNumberOrDateStart
   512  	}
   513  	switch r {
   514  	case '[':
   515  		lx.ignore()
   516  		lx.emit(itemArray)
   517  		return lexArrayValue
   518  	case '{':
   519  		lx.ignore()
   520  		lx.emit(itemInlineTableStart)
   521  		return lexInlineTableValue
   522  	case '"':
   523  		if lx.accept('"') {
   524  			if lx.accept('"') {
   525  				lx.ignore() // Ignore """
   526  				return lexMultilineString
   527  			}
   528  			lx.backup()
   529  		}
   530  		lx.ignore() // ignore the '"'
   531  		return lexString
   532  	case '\'':
   533  		if lx.accept('\'') {
   534  			if lx.accept('\'') {
   535  				lx.ignore() // Ignore """
   536  				return lexMultilineRawString
   537  			}
   538  			lx.backup()
   539  		}
   540  		lx.ignore() // ignore the "'"
   541  		return lexRawString
   542  	case '.': // special error case, be kind to users
   543  		return lx.errorf("floats must start with a digit, not '.'")
   544  	case 'i', 'n':
   545  		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
   546  			lx.emit(itemFloat)
   547  			return lx.pop()
   548  		}
   549  	case '-', '+':
   550  		return lexDecimalNumberStart
   551  	}
   552  	if unicode.IsLetter(r) {
   553  		// Be permissive here; lexBool will give a nice error if the
   554  		// user wrote something like
   555  		//   x = foo
   556  		// (i.e. not 'true' or 'false' but is something else word-like.)
   557  		lx.backup()
   558  		return lexBool
   559  	}
   560  	if r == eof {
   561  		return lx.errorf("unexpected EOF; expected value")
   562  	}
   563  	return lx.errorf("expected value but found %q instead", r)
   564  }
   565  
   566  // lexArrayValue consumes one value in an array. It assumes that '[' or ','
   567  // have already been consumed. All whitespace and newlines are ignored.
   568  func lexArrayValue(lx *lexer) stateFn {
   569  	r := lx.next()
   570  	switch {
   571  	case isWhitespace(r) || isNL(r):
   572  		return lexSkip(lx, lexArrayValue)
   573  	case r == '#':
   574  		lx.push(lexArrayValue)
   575  		return lexCommentStart
   576  	case r == ',':
   577  		return lx.errorf("unexpected comma")
   578  	case r == ']':
   579  		return lexArrayEnd
   580  	}
   581  
   582  	lx.backup()
   583  	lx.push(lexArrayValueEnd)
   584  	return lexValue
   585  }
   586  
   587  // lexArrayValueEnd consumes everything between the end of an array value and
   588  // the next value (or the end of the array): it ignores whitespace and newlines
   589  // and expects either a ',' or a ']'.
   590  func lexArrayValueEnd(lx *lexer) stateFn {
   591  	switch r := lx.next(); {
   592  	case isWhitespace(r) || isNL(r):
   593  		return lexSkip(lx, lexArrayValueEnd)
   594  	case r == '#':
   595  		lx.push(lexArrayValueEnd)
   596  		return lexCommentStart
   597  	case r == ',':
   598  		lx.ignore()
   599  		return lexArrayValue // move on to the next value
   600  	case r == ']':
   601  		return lexArrayEnd
   602  	default:
   603  		return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r))
   604  	}
   605  }
   606  
   607  // lexArrayEnd finishes the lexing of an array.
   608  // It assumes that a ']' has just been consumed.
   609  func lexArrayEnd(lx *lexer) stateFn {
   610  	lx.ignore()
   611  	lx.emit(itemArrayEnd)
   612  	return lx.pop()
   613  }
   614  
   615  // lexInlineTableValue consumes one key/value pair in an inline table.
   616  // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
   617  func lexInlineTableValue(lx *lexer) stateFn {
   618  	r := lx.next()
   619  	switch {
   620  	case isWhitespace(r):
   621  		return lexSkip(lx, lexInlineTableValue)
   622  	case isNL(r):
   623  		if lx.tomlNext {
   624  			return lexSkip(lx, lexInlineTableValue)
   625  		}
   626  		return lx.errorPrevLine(errLexInlineTableNL{})
   627  	case r == '#':
   628  		lx.push(lexInlineTableValue)
   629  		return lexCommentStart
   630  	case r == ',':
   631  		return lx.errorf("unexpected comma")
   632  	case r == '}':
   633  		return lexInlineTableEnd
   634  	}
   635  	lx.backup()
   636  	lx.push(lexInlineTableValueEnd)
   637  	return lexKeyStart
   638  }
   639  
   640  // lexInlineTableValueEnd consumes everything between the end of an inline table
   641  // key/value pair and the next pair (or the end of the table):
   642  // it ignores whitespace and expects either a ',' or a '}'.
   643  func lexInlineTableValueEnd(lx *lexer) stateFn {
   644  	switch r := lx.next(); {
   645  	case isWhitespace(r):
   646  		return lexSkip(lx, lexInlineTableValueEnd)
   647  	case isNL(r):
   648  		if lx.tomlNext {
   649  			return lexSkip(lx, lexInlineTableValueEnd)
   650  		}
   651  		return lx.errorPrevLine(errLexInlineTableNL{})
   652  	case r == '#':
   653  		lx.push(lexInlineTableValueEnd)
   654  		return lexCommentStart
   655  	case r == ',':
   656  		lx.ignore()
   657  		lx.skip(isWhitespace)
   658  		if lx.peek() == '}' {
   659  			if lx.tomlNext {
   660  				return lexInlineTableValueEnd
   661  			}
   662  			return lx.errorf("trailing comma not allowed in inline tables")
   663  		}
   664  		return lexInlineTableValue
   665  	case r == '}':
   666  		return lexInlineTableEnd
   667  	default:
   668  		return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r))
   669  	}
   670  }
   671  
   672  func runeOrEOF(r rune) string {
   673  	if r == eof {
   674  		return "end of file"
   675  	}
   676  	return "'" + string(r) + "'"
   677  }
   678  
   679  // lexInlineTableEnd finishes the lexing of an inline table.
   680  // It assumes that a '}' has just been consumed.
   681  func lexInlineTableEnd(lx *lexer) stateFn {
   682  	lx.ignore()
   683  	lx.emit(itemInlineTableEnd)
   684  	return lx.pop()
   685  }
   686  
   687  // lexString consumes the inner contents of a string. It assumes that the
   688  // beginning '"' has already been consumed and ignored.
   689  func lexString(lx *lexer) stateFn {
   690  	r := lx.next()
   691  	switch {
   692  	case r == eof:
   693  		return lx.errorf(`unexpected EOF; expected '"'`)
   694  	case isNL(r):
   695  		return lx.errorPrevLine(errLexStringNL{})
   696  	case r == '\\':
   697  		lx.push(lexString)
   698  		return lexStringEscape
   699  	case r == '"':
   700  		lx.backup()
   701  		lx.emit(itemString)
   702  		lx.next()
   703  		lx.ignore()
   704  		return lx.pop()
   705  	}
   706  	return lexString
   707  }
   708  
   709  // lexMultilineString consumes the inner contents of a string. It assumes that
   710  // the beginning '"""' has already been consumed and ignored.
   711  func lexMultilineString(lx *lexer) stateFn {
   712  	r := lx.next()
   713  	switch r {
   714  	default:
   715  		return lexMultilineString
   716  	case eof:
   717  		return lx.errorf(`unexpected EOF; expected '"""'`)
   718  	case '\\':
   719  		return lexMultilineStringEscape
   720  	case '"':
   721  		/// Found " → try to read two more "".
   722  		if lx.accept('"') {
   723  			if lx.accept('"') {
   724  				/// Peek ahead: the string can contain " and "", including at the
   725  				/// end: """str"""""
   726  				/// 6 or more at the end, however, is an error.
   727  				if lx.peek() == '"' {
   728  					/// Check if we already lexed 5 's; if so we have 6 now, and
   729  					/// that's just too many man!
   730  					///
   731  					/// Second check is for the edge case:
   732  					///
   733  					///            two quotes allowed.
   734  					///            vv
   735  					///   """lol \""""""
   736  					///          ^^  ^^^---- closing three
   737  					///     escaped
   738  					///
   739  					/// But ugly, but it works
   740  					if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
   741  						return lx.errorf(`unexpected '""""""'`)
   742  					}
   743  					lx.backup()
   744  					lx.backup()
   745  					return lexMultilineString
   746  				}
   747  
   748  				lx.backup() /// backup: don't include the """ in the item.
   749  				lx.backup()
   750  				lx.backup()
   751  				lx.emit(itemMultilineString)
   752  				lx.next() /// Read over ''' again and discard it.
   753  				lx.next()
   754  				lx.next()
   755  				lx.ignore()
   756  				return lx.pop()
   757  			}
   758  			lx.backup()
   759  		}
   760  		return lexMultilineString
   761  	}
   762  }
   763  
   764  // lexRawString consumes a raw string. Nothing can be escaped in such a string.
   765  // It assumes that the beginning "'" has already been consumed and ignored.
   766  func lexRawString(lx *lexer) stateFn {
   767  	r := lx.next()
   768  	switch {
   769  	default:
   770  		return lexRawString
   771  	case r == eof:
   772  		return lx.errorf(`unexpected EOF; expected "'"`)
   773  	case isNL(r):
   774  		return lx.errorPrevLine(errLexStringNL{})
   775  	case r == '\'':
   776  		lx.backup()
   777  		lx.emit(itemRawString)
   778  		lx.next()
   779  		lx.ignore()
   780  		return lx.pop()
   781  	}
   782  }
   783  
   784  // lexMultilineRawString consumes a raw string. Nothing can be escaped in such a
   785  // string. It assumes that the beginning triple-' has already been consumed and
   786  // ignored.
   787  func lexMultilineRawString(lx *lexer) stateFn {
   788  	r := lx.next()
   789  	switch r {
   790  	default:
   791  		return lexMultilineRawString
   792  	case eof:
   793  		return lx.errorf(`unexpected EOF; expected "'''"`)
   794  	case '\'':
   795  		/// Found ' → try to read two more ''.
   796  		if lx.accept('\'') {
   797  			if lx.accept('\'') {
   798  				/// Peek ahead: the string can contain ' and '', including at the
   799  				/// end: '''str'''''
   800  				/// 6 or more at the end, however, is an error.
   801  				if lx.peek() == '\'' {
   802  					/// Check if we already lexed 5 's; if so we have 6 now, and
   803  					/// that's just too many man!
   804  					if strings.HasSuffix(lx.current(), "'''''") {
   805  						return lx.errorf(`unexpected "''''''"`)
   806  					}
   807  					lx.backup()
   808  					lx.backup()
   809  					return lexMultilineRawString
   810  				}
   811  
   812  				lx.backup() /// backup: don't include the ''' in the item.
   813  				lx.backup()
   814  				lx.backup()
   815  				lx.emit(itemRawMultilineString)
   816  				lx.next() /// Read over ''' again and discard it.
   817  				lx.next()
   818  				lx.next()
   819  				lx.ignore()
   820  				return lx.pop()
   821  			}
   822  			lx.backup()
   823  		}
   824  		return lexMultilineRawString
   825  	}
   826  }
   827  
   828  // lexMultilineStringEscape consumes an escaped character. It assumes that the
   829  // preceding '\\' has already been consumed.
   830  func lexMultilineStringEscape(lx *lexer) stateFn {
   831  	if isNL(lx.next()) { /// \ escaping newline.
   832  		return lexMultilineString
   833  	}
   834  	lx.backup()
   835  	lx.push(lexMultilineString)
   836  	return lexStringEscape(lx)
   837  }
   838  
   839  func lexStringEscape(lx *lexer) stateFn {
   840  	r := lx.next()
   841  	switch r {
   842  	case 'e':
   843  		if !lx.tomlNext {
   844  			return lx.error(errLexEscape{r})
   845  		}
   846  		fallthrough
   847  	case 'b':
   848  		fallthrough
   849  	case 't':
   850  		fallthrough
   851  	case 'n':
   852  		fallthrough
   853  	case 'f':
   854  		fallthrough
   855  	case 'r':
   856  		fallthrough
   857  	case '"':
   858  		fallthrough
   859  	case ' ', '\t':
   860  		// Inside """ .. """ strings you can use \ to escape newlines, and any
   861  		// amount of whitespace can be between the \ and \n.
   862  		fallthrough
   863  	case '\\':
   864  		return lx.pop()
   865  	case 'x':
   866  		if !lx.tomlNext {
   867  			return lx.error(errLexEscape{r})
   868  		}
   869  		return lexHexEscape
   870  	case 'u':
   871  		return lexShortUnicodeEscape
   872  	case 'U':
   873  		return lexLongUnicodeEscape
   874  	}
   875  	return lx.error(errLexEscape{r})
   876  }
   877  
   878  func lexHexEscape(lx *lexer) stateFn {
   879  	var r rune
   880  	for i := 0; i < 2; i++ {
   881  		r = lx.next()
   882  		if !isHexadecimal(r) {
   883  			return lx.errorf(
   884  				`expected two hexadecimal digits after '\x', but got %q instead`,
   885  				lx.current())
   886  		}
   887  	}
   888  	return lx.pop()
   889  }
   890  
   891  func lexShortUnicodeEscape(lx *lexer) stateFn {
   892  	var r rune
   893  	for i := 0; i < 4; i++ {
   894  		r = lx.next()
   895  		if !isHexadecimal(r) {
   896  			return lx.errorf(
   897  				`expected four hexadecimal digits after '\u', but got %q instead`,
   898  				lx.current())
   899  		}
   900  	}
   901  	return lx.pop()
   902  }
   903  
   904  func lexLongUnicodeEscape(lx *lexer) stateFn {
   905  	var r rune
   906  	for i := 0; i < 8; i++ {
   907  		r = lx.next()
   908  		if !isHexadecimal(r) {
   909  			return lx.errorf(
   910  				`expected eight hexadecimal digits after '\U', but got %q instead`,
   911  				lx.current())
   912  		}
   913  	}
   914  	return lx.pop()
   915  }
   916  
   917  // lexNumberOrDateStart processes the first character of a value which begins
   918  // with a digit. It exists to catch values starting with '0', so that
   919  // lexBaseNumberOrDate can differentiate base prefixed integers from other
   920  // types.
   921  func lexNumberOrDateStart(lx *lexer) stateFn {
   922  	r := lx.next()
   923  	switch r {
   924  	case '0':
   925  		return lexBaseNumberOrDate
   926  	}
   927  
   928  	if !isDigit(r) {
   929  		// The only way to reach this state is if the value starts
   930  		// with a digit, so specifically treat anything else as an
   931  		// error.
   932  		return lx.errorf("expected a digit but got %q", r)
   933  	}
   934  
   935  	return lexNumberOrDate
   936  }
   937  
   938  // lexNumberOrDate consumes either an integer, float or datetime.
   939  func lexNumberOrDate(lx *lexer) stateFn {
   940  	r := lx.next()
   941  	if isDigit(r) {
   942  		return lexNumberOrDate
   943  	}
   944  	switch r {
   945  	case '-', ':':
   946  		return lexDatetime
   947  	case '_':
   948  		return lexDecimalNumber
   949  	case '.', 'e', 'E':
   950  		return lexFloat
   951  	}
   952  
   953  	lx.backup()
   954  	lx.emit(itemInteger)
   955  	return lx.pop()
   956  }
   957  
   958  // lexDatetime consumes a Datetime, to a first approximation.
   959  // The parser validates that it matches one of the accepted formats.
   960  func lexDatetime(lx *lexer) stateFn {
   961  	r := lx.next()
   962  	if isDigit(r) {
   963  		return lexDatetime
   964  	}
   965  	switch r {
   966  	case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
   967  		return lexDatetime
   968  	}
   969  
   970  	lx.backup()
   971  	lx.emitTrim(itemDatetime)
   972  	return lx.pop()
   973  }
   974  
   975  // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
   976  func lexHexInteger(lx *lexer) stateFn {
   977  	r := lx.next()
   978  	if isHexadecimal(r) {
   979  		return lexHexInteger
   980  	}
   981  	switch r {
   982  	case '_':
   983  		return lexHexInteger
   984  	}
   985  
   986  	lx.backup()
   987  	lx.emit(itemInteger)
   988  	return lx.pop()
   989  }
   990  
   991  // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
   992  func lexOctalInteger(lx *lexer) stateFn {
   993  	r := lx.next()
   994  	if isOctal(r) {
   995  		return lexOctalInteger
   996  	}
   997  	switch r {
   998  	case '_':
   999  		return lexOctalInteger
  1000  	}
  1001  
  1002  	lx.backup()
  1003  	lx.emit(itemInteger)
  1004  	return lx.pop()
  1005  }
  1006  
  1007  // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
  1008  func lexBinaryInteger(lx *lexer) stateFn {
  1009  	r := lx.next()
  1010  	if isBinary(r) {
  1011  		return lexBinaryInteger
  1012  	}
  1013  	switch r {
  1014  	case '_':
  1015  		return lexBinaryInteger
  1016  	}
  1017  
  1018  	lx.backup()
  1019  	lx.emit(itemInteger)
  1020  	return lx.pop()
  1021  }
  1022  
  1023  // lexDecimalNumber consumes a decimal float or integer.
  1024  func lexDecimalNumber(lx *lexer) stateFn {
  1025  	r := lx.next()
  1026  	if isDigit(r) {
  1027  		return lexDecimalNumber
  1028  	}
  1029  	switch r {
  1030  	case '.', 'e', 'E':
  1031  		return lexFloat
  1032  	case '_':
  1033  		return lexDecimalNumber
  1034  	}
  1035  
  1036  	lx.backup()
  1037  	lx.emit(itemInteger)
  1038  	return lx.pop()
  1039  }
  1040  
  1041  // lexDecimalNumber consumes the first digit of a number beginning with a sign.
  1042  // It assumes the sign has already been consumed. Values which start with a sign
  1043  // are only allowed to be decimal integers or floats.
  1044  //
  1045  // The special "nan" and "inf" values are also recognized.
  1046  func lexDecimalNumberStart(lx *lexer) stateFn {
  1047  	r := lx.next()
  1048  
  1049  	// Special error cases to give users better error messages
  1050  	switch r {
  1051  	case 'i':
  1052  		if !lx.accept('n') || !lx.accept('f') {
  1053  			return lx.errorf("invalid float: '%s'", lx.current())
  1054  		}
  1055  		lx.emit(itemFloat)
  1056  		return lx.pop()
  1057  	case 'n':
  1058  		if !lx.accept('a') || !lx.accept('n') {
  1059  			return lx.errorf("invalid float: '%s'", lx.current())
  1060  		}
  1061  		lx.emit(itemFloat)
  1062  		return lx.pop()
  1063  	case '0':
  1064  		p := lx.peek()
  1065  		switch p {
  1066  		case 'b', 'o', 'x':
  1067  			return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
  1068  		}
  1069  	case '.':
  1070  		return lx.errorf("floats must start with a digit, not '.'")
  1071  	}
  1072  
  1073  	if isDigit(r) {
  1074  		return lexDecimalNumber
  1075  	}
  1076  
  1077  	return lx.errorf("expected a digit but got %q", r)
  1078  }
  1079  
  1080  // lexBaseNumberOrDate differentiates between the possible values which
  1081  // start with '0'. It assumes that before reaching this state, the initial '0'
  1082  // has been consumed.
  1083  func lexBaseNumberOrDate(lx *lexer) stateFn {
  1084  	r := lx.next()
  1085  	// Note: All datetimes start with at least two digits, so we don't
  1086  	// handle date characters (':', '-', etc.) here.
  1087  	if isDigit(r) {
  1088  		return lexNumberOrDate
  1089  	}
  1090  	switch r {
  1091  	case '_':
  1092  		// Can only be decimal, because there can't be an underscore
  1093  		// between the '0' and the base designator, and dates can't
  1094  		// contain underscores.
  1095  		return lexDecimalNumber
  1096  	case '.', 'e', 'E':
  1097  		return lexFloat
  1098  	case 'b':
  1099  		r = lx.peek()
  1100  		if !isBinary(r) {
  1101  			lx.errorf("not a binary number: '%s%c'", lx.current(), r)
  1102  		}
  1103  		return lexBinaryInteger
  1104  	case 'o':
  1105  		r = lx.peek()
  1106  		if !isOctal(r) {
  1107  			lx.errorf("not an octal number: '%s%c'", lx.current(), r)
  1108  		}
  1109  		return lexOctalInteger
  1110  	case 'x':
  1111  		r = lx.peek()
  1112  		if !isHexadecimal(r) {
  1113  			lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
  1114  		}
  1115  		return lexHexInteger
  1116  	}
  1117  
  1118  	lx.backup()
  1119  	lx.emit(itemInteger)
  1120  	return lx.pop()
  1121  }
  1122  
  1123  // lexFloat consumes the elements of a float. It allows any sequence of
  1124  // float-like characters, so floats emitted by the lexer are only a first
  1125  // approximation and must be validated by the parser.
  1126  func lexFloat(lx *lexer) stateFn {
  1127  	r := lx.next()
  1128  	if isDigit(r) {
  1129  		return lexFloat
  1130  	}
  1131  	switch r {
  1132  	case '_', '.', '-', '+', 'e', 'E':
  1133  		return lexFloat
  1134  	}
  1135  
  1136  	lx.backup()
  1137  	lx.emit(itemFloat)
  1138  	return lx.pop()
  1139  }
  1140  
  1141  // lexBool consumes a bool string: 'true' or 'false.
  1142  func lexBool(lx *lexer) stateFn {
  1143  	var rs []rune
  1144  	for {
  1145  		r := lx.next()
  1146  		if !unicode.IsLetter(r) {
  1147  			lx.backup()
  1148  			break
  1149  		}
  1150  		rs = append(rs, r)
  1151  	}
  1152  	s := string(rs)
  1153  	switch s {
  1154  	case "true", "false":
  1155  		lx.emit(itemBool)
  1156  		return lx.pop()
  1157  	}
  1158  	return lx.errorf("expected value but found %q instead", s)
  1159  }
  1160  
  1161  // lexCommentStart begins the lexing of a comment. It will emit
  1162  // itemCommentStart and consume no characters, passing control to lexComment.
  1163  func lexCommentStart(lx *lexer) stateFn {
  1164  	lx.ignore()
  1165  	lx.emit(itemCommentStart)
  1166  	return lexComment
  1167  }
  1168  
  1169  // lexComment lexes an entire comment. It assumes that '#' has been consumed.
  1170  // It will consume *up to* the first newline character, and pass control
  1171  // back to the last state on the stack.
  1172  func lexComment(lx *lexer) stateFn {
  1173  	switch r := lx.next(); {
  1174  	case isNL(r) || r == eof:
  1175  		lx.backup()
  1176  		lx.emit(itemText)
  1177  		return lx.pop()
  1178  	default:
  1179  		return lexComment
  1180  	}
  1181  }
  1182  
  1183  // lexSkip ignores all slurped input and moves on to the next state.
  1184  func lexSkip(lx *lexer, nextState stateFn) stateFn {
  1185  	lx.ignore()
  1186  	return nextState
  1187  }
  1188  
  1189  func (s stateFn) String() string {
  1190  	name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
  1191  	if i := strings.LastIndexByte(name, '.'); i > -1 {
  1192  		name = name[i+1:]
  1193  	}
  1194  	if s == nil {
  1195  		name = "<nil>"
  1196  	}
  1197  	return name + "()"
  1198  }
  1199  
  1200  func (itype itemType) String() string {
  1201  	switch itype {
  1202  	case itemError:
  1203  		return "Error"
  1204  	case itemNIL:
  1205  		return "NIL"
  1206  	case itemEOF:
  1207  		return "EOF"
  1208  	case itemText:
  1209  		return "Text"
  1210  	case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
  1211  		return "String"
  1212  	case itemBool:
  1213  		return "Bool"
  1214  	case itemInteger:
  1215  		return "Integer"
  1216  	case itemFloat:
  1217  		return "Float"
  1218  	case itemDatetime:
  1219  		return "DateTime"
  1220  	case itemTableStart:
  1221  		return "TableStart"
  1222  	case itemTableEnd:
  1223  		return "TableEnd"
  1224  	case itemKeyStart:
  1225  		return "KeyStart"
  1226  	case itemKeyEnd:
  1227  		return "KeyEnd"
  1228  	case itemArray:
  1229  		return "Array"
  1230  	case itemArrayEnd:
  1231  		return "ArrayEnd"
  1232  	case itemCommentStart:
  1233  		return "CommentStart"
  1234  	case itemInlineTableStart:
  1235  		return "InlineTableStart"
  1236  	case itemInlineTableEnd:
  1237  		return "InlineTableEnd"
  1238  	}
  1239  	panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
  1240  }
  1241  
  1242  func (item item) String() string {
  1243  	return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
  1244  }
  1245  
  1246  func isWhitespace(r rune) bool { return r == '\t' || r == ' ' }
  1247  func isNL(r rune) bool         { return r == '\n' || r == '\r' }
  1248  func isControl(r rune) bool { // Control characters except \t, \r, \n
  1249  	switch r {
  1250  	case '\t', '\r', '\n':
  1251  		return false
  1252  	default:
  1253  		return (r >= 0x00 && r <= 0x1f) || r == 0x7f
  1254  	}
  1255  }
  1256  func isDigit(r rune) bool  { return r >= '0' && r <= '9' }
  1257  func isBinary(r rune) bool { return r == '0' || r == '1' }
  1258  func isOctal(r rune) bool  { return r >= '0' && r <= '7' }
  1259  func isHexadecimal(r rune) bool {
  1260  	return (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f') || (r >= 'A' && r <= 'F')
  1261  }
  1262  
  1263  func isBareKeyChar(r rune, tomlNext bool) bool {
  1264  	if tomlNext {
  1265  		return (r >= 'A' && r <= 'Z') ||
  1266  			(r >= 'a' && r <= 'z') ||
  1267  			(r >= '0' && r <= '9') ||
  1268  			r == '_' || r == '-' ||
  1269  			r == 0xb2 || r == 0xb3 || r == 0xb9 || (r >= 0xbc && r <= 0xbe) ||
  1270  			(r >= 0xc0 && r <= 0xd6) || (r >= 0xd8 && r <= 0xf6) || (r >= 0xf8 && r <= 0x037d) ||
  1271  			(r >= 0x037f && r <= 0x1fff) ||
  1272  			(r >= 0x200c && r <= 0x200d) || (r >= 0x203f && r <= 0x2040) ||
  1273  			(r >= 0x2070 && r <= 0x218f) || (r >= 0x2460 && r <= 0x24ff) ||
  1274  			(r >= 0x2c00 && r <= 0x2fef) || (r >= 0x3001 && r <= 0xd7ff) ||
  1275  			(r >= 0xf900 && r <= 0xfdcf) || (r >= 0xfdf0 && r <= 0xfffd) ||
  1276  			(r >= 0x10000 && r <= 0xeffff)
  1277  	}
  1278  
  1279  	return (r >= 'A' && r <= 'Z') ||
  1280  		(r >= 'a' && r <= 'z') ||
  1281  		(r >= '0' && r <= '9') ||
  1282  		r == '_' || r == '-'
  1283  }
  1284
View as plain text