...

Source file src/go.starlark.net/syntax/scan.go

Documentation: go.starlark.net/syntax

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  // A lexical scanner for Starlark.
     8  
     9  import (
    10  	"fmt"
    11  	"io"
    12  	"io/ioutil"
    13  	"log"
    14  	"math/big"
    15  	"os"
    16  	"strconv"
    17  	"strings"
    18  	"unicode"
    19  	"unicode/utf8"
    20  )
    21  
    22  // A Token represents a Starlark lexical token.
    23  type Token int8
    24  
    25  const (
    26  	ILLEGAL Token = iota
    27  	EOF
    28  
    29  	NEWLINE
    30  	INDENT
    31  	OUTDENT
    32  
    33  	// Tokens with values
    34  	IDENT  // x
    35  	INT    // 123
    36  	FLOAT  // 1.23e45
    37  	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
    38  	BYTES  // b"foo", etc
    39  
    40  	// Punctuation
    41  	PLUS          // +
    42  	MINUS         // -
    43  	STAR          // *
    44  	SLASH         // /
    45  	SLASHSLASH    // //
    46  	PERCENT       // %
    47  	AMP           // &
    48  	PIPE          // |
    49  	CIRCUMFLEX    // ^
    50  	LTLT          // <<
    51  	GTGT          // >>
    52  	TILDE         // ~
    53  	DOT           // .
    54  	COMMA         // ,
    55  	EQ            // =
    56  	SEMI          // ;
    57  	COLON         // :
    58  	LPAREN        // (
    59  	RPAREN        // )
    60  	LBRACK        // [
    61  	RBRACK        // ]
    62  	LBRACE        // {
    63  	RBRACE        // }
    64  	LT            // <
    65  	GT            // >
    66  	GE            // >=
    67  	LE            // <=
    68  	EQL           // ==
    69  	NEQ           // !=
    70  	PLUS_EQ       // +=    (keep order consistent with PLUS..GTGT)
    71  	MINUS_EQ      // -=
    72  	STAR_EQ       // *=
    73  	SLASH_EQ      // /=
    74  	SLASHSLASH_EQ // //=
    75  	PERCENT_EQ    // %=
    76  	AMP_EQ        // &=
    77  	PIPE_EQ       // |=
    78  	CIRCUMFLEX_EQ // ^=
    79  	LTLT_EQ       // <<=
    80  	GTGT_EQ       // >>=
    81  	STARSTAR      // **
    82  
    83  	// Keywords
    84  	AND
    85  	BREAK
    86  	CONTINUE
    87  	DEF
    88  	ELIF
    89  	ELSE
    90  	FOR
    91  	IF
    92  	IN
    93  	LAMBDA
    94  	LOAD
    95  	NOT
    96  	NOT_IN // synthesized by parser from NOT IN
    97  	OR
    98  	PASS
    99  	RETURN
   100  	WHILE
   101  
   102  	maxToken
   103  )
   104  
   105  func (tok Token) String() string { return tokenNames[tok] }
   106  
   107  // GoString is like String but quotes punctuation tokens.
   108  // Use Sprintf("%#v", tok) when constructing error messages.
   109  func (tok Token) GoString() string {
   110  	if tok >= PLUS && tok <= STARSTAR {
   111  		return "'" + tokenNames[tok] + "'"
   112  	}
   113  	return tokenNames[tok]
   114  }
   115  
   116  var tokenNames = [...]string{
   117  	ILLEGAL:       "illegal token",
   118  	EOF:           "end of file",
   119  	NEWLINE:       "newline",
   120  	INDENT:        "indent",
   121  	OUTDENT:       "outdent",
   122  	IDENT:         "identifier",
   123  	INT:           "int literal",
   124  	FLOAT:         "float literal",
   125  	STRING:        "string literal",
   126  	PLUS:          "+",
   127  	MINUS:         "-",
   128  	STAR:          "*",
   129  	SLASH:         "/",
   130  	SLASHSLASH:    "//",
   131  	PERCENT:       "%",
   132  	AMP:           "&",
   133  	PIPE:          "|",
   134  	CIRCUMFLEX:    "^",
   135  	LTLT:          "<<",
   136  	GTGT:          ">>",
   137  	TILDE:         "~",
   138  	DOT:           ".",
   139  	COMMA:         ",",
   140  	EQ:            "=",
   141  	SEMI:          ";",
   142  	COLON:         ":",
   143  	LPAREN:        "(",
   144  	RPAREN:        ")",
   145  	LBRACK:        "[",
   146  	RBRACK:        "]",
   147  	LBRACE:        "{",
   148  	RBRACE:        "}",
   149  	LT:            "<",
   150  	GT:            ">",
   151  	GE:            ">=",
   152  	LE:            "<=",
   153  	EQL:           "==",
   154  	NEQ:           "!=",
   155  	PLUS_EQ:       "+=",
   156  	MINUS_EQ:      "-=",
   157  	STAR_EQ:       "*=",
   158  	SLASH_EQ:      "/=",
   159  	SLASHSLASH_EQ: "//=",
   160  	PERCENT_EQ:    "%=",
   161  	AMP_EQ:        "&=",
   162  	PIPE_EQ:       "|=",
   163  	CIRCUMFLEX_EQ: "^=",
   164  	LTLT_EQ:       "<<=",
   165  	GTGT_EQ:       ">>=",
   166  	STARSTAR:      "**",
   167  	AND:           "and",
   168  	BREAK:         "break",
   169  	CONTINUE:      "continue",
   170  	DEF:           "def",
   171  	ELIF:          "elif",
   172  	ELSE:          "else",
   173  	FOR:           "for",
   174  	IF:            "if",
   175  	IN:            "in",
   176  	LAMBDA:        "lambda",
   177  	LOAD:          "load",
   178  	NOT:           "not",
   179  	NOT_IN:        "not in",
   180  	OR:            "or",
   181  	PASS:          "pass",
   182  	RETURN:        "return",
   183  	WHILE:         "while",
   184  }
   185  
   186  // A FilePortion describes the content of a portion of a file.
   187  // Callers may provide a FilePortion for the src argument of Parse
   188  // when the desired initial line and column numbers are not (1, 1),
   189  // such as when an expression is parsed from within larger file.
   190  type FilePortion struct {
   191  	Content             []byte
   192  	FirstLine, FirstCol int32
   193  }
   194  
   195  // A Position describes the location of a rune of input.
   196  type Position struct {
   197  	file *string // filename (indirect for compactness)
   198  	Line int32   // 1-based line number; 0 if line unknown
   199  	Col  int32   // 1-based column (rune) number; 0 if column unknown
   200  }
   201  
   202  // IsValid reports whether the position is valid.
   203  func (p Position) IsValid() bool { return p.file != nil }
   204  
   205  // Filename returns the name of the file containing this position.
   206  func (p Position) Filename() string {
   207  	if p.file != nil {
   208  		return *p.file
   209  	}
   210  	return "<invalid>"
   211  }
   212  
   213  // MakePosition returns position with the specified components.
   214  func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} }
   215  
   216  // add returns the position at the end of s, assuming it starts at p.
   217  func (p Position) add(s string) Position {
   218  	if n := strings.Count(s, "\n"); n > 0 {
   219  		p.Line += int32(n)
   220  		s = s[strings.LastIndex(s, "\n")+1:]
   221  		p.Col = 1
   222  	}
   223  	p.Col += int32(utf8.RuneCountInString(s))
   224  	return p
   225  }
   226  
   227  func (p Position) String() string {
   228  	file := p.Filename()
   229  	if p.Line > 0 {
   230  		if p.Col > 0 {
   231  			return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col)
   232  		}
   233  		return fmt.Sprintf("%s:%d", file, p.Line)
   234  	}
   235  	return file
   236  }
   237  
   238  func (p Position) isBefore(q Position) bool {
   239  	if p.Line != q.Line {
   240  		return p.Line < q.Line
   241  	}
   242  	return p.Col < q.Col
   243  }
   244  
   245  // An scanner represents a single input file being parsed.
   246  type scanner struct {
   247  	rest           []byte    // rest of input (in REPL, a line of input)
   248  	token          []byte    // token being scanned
   249  	pos            Position  // current input position
   250  	depth          int       // nesting of [ ] { } ( )
   251  	indentstk      []int     // stack of indentation levels
   252  	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
   253  	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
   254  	keepComments   bool      // accumulate comments in slice
   255  	lineComments   []Comment // list of full line comments (if keepComments)
   256  	suffixComments []Comment // list of suffix comments (if keepComments)
   257  
   258  	readline func() ([]byte, error) // read next line of input (REPL only)
   259  }
   260  
   261  func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
   262  	var firstLine, firstCol int32 = 1, 1
   263  	if portion, ok := src.(FilePortion); ok {
   264  		firstLine, firstCol = portion.FirstLine, portion.FirstCol
   265  	}
   266  	sc := &scanner{
   267  		pos:          MakePosition(&filename, firstLine, firstCol),
   268  		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
   269  		lineStart:    true,
   270  		keepComments: keepComments,
   271  	}
   272  	sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
   273  	if sc.readline == nil {
   274  		data, err := readSource(filename, src)
   275  		if err != nil {
   276  			return nil, err
   277  		}
   278  		sc.rest = data
   279  	}
   280  	return sc, nil
   281  }
   282  
   283  func readSource(filename string, src interface{}) ([]byte, error) {
   284  	switch src := src.(type) {
   285  	case string:
   286  		return []byte(src), nil
   287  	case []byte:
   288  		return src, nil
   289  	case io.Reader:
   290  		data, err := ioutil.ReadAll(src)
   291  		if err != nil {
   292  			err = &os.PathError{Op: "read", Path: filename, Err: err}
   293  			return nil, err
   294  		}
   295  		return data, nil
   296  	case FilePortion:
   297  		return src.Content, nil
   298  	case nil:
   299  		return ioutil.ReadFile(filename)
   300  	default:
   301  		return nil, fmt.Errorf("invalid source: %T", src)
   302  	}
   303  }
   304  
   305  // An Error describes the nature and position of a scanner or parser error.
   306  type Error struct {
   307  	Pos Position
   308  	Msg string
   309  }
   310  
   311  func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }
   312  
   313  // errorf is called to report an error.
   314  // errorf does not return: it panics.
   315  func (sc *scanner) error(pos Position, s string) {
   316  	panic(Error{pos, s})
   317  }
   318  
   319  func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
   320  	sc.error(pos, fmt.Sprintf(format, args...))
   321  }
   322  
   323  func (sc *scanner) recover(err *error) {
   324  	// The scanner and parser panic both for routine errors like
   325  	// syntax errors and for programmer bugs like array index
   326  	// errors.  Turn both into error returns.  Catching bug panics
   327  	// is especially important when processing many files.
   328  	switch e := recover().(type) {
   329  	case nil:
   330  		// no panic
   331  	case Error:
   332  		*err = e
   333  	default:
   334  		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
   335  		if debug {
   336  			log.Fatal(*err)
   337  		}
   338  	}
   339  }
   340  
   341  // eof reports whether the input has reached end of file.
   342  func (sc *scanner) eof() bool {
   343  	return len(sc.rest) == 0 && !sc.readLine()
   344  }
   345  
   346  // readLine attempts to read another line of input.
   347  // Precondition: len(sc.rest)==0.
   348  func (sc *scanner) readLine() bool {
   349  	if sc.readline != nil {
   350  		var err error
   351  		sc.rest, err = sc.readline()
   352  		if err != nil {
   353  			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
   354  		}
   355  		return len(sc.rest) > 0
   356  	}
   357  	return false
   358  }
   359  
   360  // peekRune returns the next rune in the input without consuming it.
   361  // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
   362  func (sc *scanner) peekRune() rune {
   363  	// TODO(adonovan): opt: measure and perhaps inline eof.
   364  	if sc.eof() {
   365  		return 0
   366  	}
   367  
   368  	// fast path: ASCII
   369  	if b := sc.rest[0]; b < utf8.RuneSelf {
   370  		if b == '\r' {
   371  			return '\n'
   372  		}
   373  		return rune(b)
   374  	}
   375  
   376  	r, _ := utf8.DecodeRune(sc.rest)
   377  	return r
   378  }
   379  
   380  // readRune consumes and returns the next rune in the input.
   381  // Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
   382  func (sc *scanner) readRune() rune {
   383  	// eof() has been inlined here, both to avoid a call
   384  	// and to establish len(rest)>0 to avoid a bounds check.
   385  	if len(sc.rest) == 0 {
   386  		if !sc.readLine() {
   387  			sc.error(sc.pos, "internal scanner error: readRune at EOF")
   388  		}
   389  		// Redundant, but eliminates the bounds-check below.
   390  		if len(sc.rest) == 0 {
   391  			return 0
   392  		}
   393  	}
   394  
   395  	// fast path: ASCII
   396  	if b := sc.rest[0]; b < utf8.RuneSelf {
   397  		r := rune(b)
   398  		sc.rest = sc.rest[1:]
   399  		if r == '\r' {
   400  			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
   401  				sc.rest = sc.rest[1:]
   402  			}
   403  			r = '\n'
   404  		}
   405  		if r == '\n' {
   406  			sc.pos.Line++
   407  			sc.pos.Col = 1
   408  		} else {
   409  			sc.pos.Col++
   410  		}
   411  		return r
   412  	}
   413  
   414  	r, size := utf8.DecodeRune(sc.rest)
   415  	sc.rest = sc.rest[size:]
   416  	sc.pos.Col++
   417  	return r
   418  }
   419  
   420  // tokenValue records the position and value associated with each token.
   421  type tokenValue struct {
   422  	raw    string   // raw text of token
   423  	int    int64    // decoded int
   424  	bigInt *big.Int // decoded integers > int64
   425  	float  float64  // decoded float
   426  	string string   // decoded string or bytes
   427  	pos    Position // start position of token
   428  }
   429  
   430  // startToken marks the beginning of the next input token.
   431  // It must be followed by a call to endToken once the token has
   432  // been consumed using readRune.
   433  func (sc *scanner) startToken(val *tokenValue) {
   434  	sc.token = sc.rest
   435  	val.raw = ""
   436  	val.pos = sc.pos
   437  }
   438  
   439  // endToken marks the end of an input token.
   440  // It records the actual token string in val.raw if the caller
   441  // has not done that already.
   442  func (sc *scanner) endToken(val *tokenValue) {
   443  	if val.raw == "" {
   444  		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
   445  	}
   446  }
   447  
   448  // nextToken is called by the parser to obtain the next input token.
   449  // It returns the token value and sets val to the data associated with
   450  // the token.
   451  //
   452  // For all our input tokens, the associated data is val.pos (the
   453  // position where the token begins), val.raw (the input string
   454  // corresponding to the token).  For string and int tokens, the string
   455  // and int fields additionally contain the token's interpreted value.
        //
        // Spaces, comments, backslash-newline continuations and (outside the
        // REPL) blank lines are skipped; while sc.depth > 0, i.e. inside
        // brackets, newlines are skipped too.  A change of indentation at the
        // start of a line is reported as INDENT or OUTDENT tokens, one per
        // call.  Errors are reported through sc.error and sc.errorf, which
        // panic.
   456  func (sc *scanner) nextToken(val *tokenValue) Token {
   457  
   458  	// The following distribution of tokens guides case ordering:
   459  	//
   460  	//      COMMA          27   %
   461  	//      STRING         23   %
   462  	//      IDENT          15   %
   463  	//      EQL            11   %
   464  	//      LBRACK          5.5 %
   465  	//      RBRACK          5.5 %
   466  	//      NEWLINE         3   %
   467  	//      LPAREN          2.9 %
   468  	//      RPAREN          2.9 %
   469  	//      INT             2   %
   470  	//      others        < 1   %
   471  	//
   472  	// Although NEWLINE tokens are infrequent, and lineStart is
   473  	// usually (~97%) false on entry, skipped newlines account for
   474  	// about 50% of all iterations of the 'start' loop.
   475  
   476  start:
   477  	var c rune
   478  
   479  	// Deal with leading spaces and indentation.
        	// blank records that the line holds nothing but spaces, tabs
        	// and possibly a comment.  savedLineStart remembers whether
        	// this iteration began at the start of a line; the
        	// end-of-file case below needs it after sc.lineStart has
        	// been cleared.
   480  	blank := false
   481  	savedLineStart := sc.lineStart
   482  	if sc.lineStart {
   483  		sc.lineStart = false
   484  		col := 0
   485  		for {
   486  			c = sc.peekRune()
   487  			if c == ' ' {
   488  				col++
   489  				sc.readRune()
   490  			} else if c == '\t' {
        				// A tab advances col by the distance from
        				// the current input column to the next
        				// multiple of 8.
        				// NOTE(review): the distance is computed from
        				// sc.pos.Col, which counts an earlier tab as
        				// one column, not from col -- confirm intended.
   491  				const tab = 8
   492  				col += int(tab - (sc.pos.Col-1)%tab)
   493  				sc.readRune()
   494  			} else {
   495  				break
   496  			}
   497  		}
   498  
   499  		// The third clause matches EOF.
   500  		if c == '#' || c == '\n' || c == 0 {
   501  			blank = true
   502  		}
   503  
   504  		// Compute indentation level for non-blank lines not
   505  		// inside an expression.  This is not the common case.
   506  		if !blank && sc.depth == 0 {
   507  			cur := sc.indentstk[len(sc.indentstk)-1]
   508  			if col > cur {
   509  				// indent
   510  				sc.dents++
   511  				sc.indentstk = append(sc.indentstk, col)
   512  			} else if col < cur {
   513  				// outdent(s)
        				// Pop every level deeper than col, queueing
        				// one OUTDENT per level; col must then equal
        				// the level left on top of the stack.
   514  				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
   515  					sc.dents--
   516  					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
   517  				}
   518  				if col != sc.indentstk[len(sc.indentstk)-1] {
   519  					sc.error(sc.pos, "unindent does not match any outer indentation level")
   520  				}
   521  			}
   522  		}
   523  	}
   524  
   525  	// Return saved indentation tokens.
        	// One token is returned per call: sc.dents counts pending
        	// INDENTs when positive and pending OUTDENTs when negative.
        	// No input is consumed, so val.raw is empty.
   526  	if sc.dents != 0 {
   527  		sc.startToken(val)
   528  		sc.endToken(val)
   529  		if sc.dents < 0 {
   530  			sc.dents++
   531  			return OUTDENT
   532  		} else {
   533  			sc.dents--
   534  			return INDENT
   535  		}
   536  	}
   537  
   538  	// start of line proper
   539  	c = sc.peekRune()
   540  
   541  	// Skip spaces.
   542  	for c == ' ' || c == '\t' {
   543  		sc.readRune()
   544  		c = sc.peekRune()
   545  	}
   546  
   547  	// comment
   548  	if c == '#' {
        		// When comments are kept, val serves as scratch space to
        		// capture the comment's position and text.
   549  		if sc.keepComments {
   550  			sc.startToken(val)
   551  		}
   552  		// Consume up to, but not including, the newline.
   553  		for c != 0 && c != '\n' {
   554  			sc.readRune()
   555  			c = sc.peekRune()
   556  		}
   557  		if sc.keepComments {
   558  			sc.endToken(val)
        			// A comment on an otherwise blank line is a line
        			// comment; one that follows code is a suffix comment.
   559  			if blank {
   560  				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
   561  			} else {
   562  				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
   563  			}
   564  		}
   565  	}
   566  
   567  	// newline
   568  	if c == '\n' {
   569  		sc.lineStart = true
   570  
   571  		// Ignore newlines within expressions (common case).
   572  		if sc.depth > 0 {
   573  			sc.readRune()
   574  			goto start
   575  		}
   576  
   577  		// Ignore blank lines, except in the REPL,
   578  		// where they emit OUTDENTs and NEWLINE.
   579  		if blank {
   580  			if sc.readline == nil {
   581  				sc.readRune()
   582  				goto start
   583  			} else if len(sc.indentstk) > 1 {
        				// REPL: queue an OUTDENT for every open
        				// indentation level.  The newline is not
        				// consumed here; it is seen again once the
        				// OUTDENTs have been returned.
   584  				sc.dents = 1 - len(sc.indentstk)
   585  				sc.indentstk = sc.indentstk[:1]
   586  				goto start
   587  			}
   588  		}
   589  
   590  		// At top-level (not in an expression).
   591  		sc.startToken(val)
   592  		sc.readRune()
   593  		val.raw = "\n"
   594  		return NEWLINE
   595  	}
   596  
   597  	// end of file
   598  	if c == 0 {
   599  		// Emit OUTDENTs for unfinished indentation,
   600  		// preceded by a NEWLINE if we haven't just emitted one.
   601  		if len(sc.indentstk) > 1 {
   602  			if savedLineStart {
   603  				sc.dents = 1 - len(sc.indentstk)
   604  				sc.indentstk = sc.indentstk[:1]
   605  				goto start
   606  			} else {
        				// Synthesize the missing NEWLINE.  Setting
        				// lineStart makes the next call take the
        				// savedLineStart branch above and queue the
        				// OUTDENTs.
   607  				sc.lineStart = true
   608  				sc.startToken(val)
   609  				val.raw = "\n"
   610  				return NEWLINE
   611  			}
   612  		}
   613  
   614  		sc.startToken(val)
   615  		sc.endToken(val)
   616  		return EOF
   617  	}
   618  
   619  	// line continuation
        	// A backslash must be immediately followed by a newline;
        	// both are consumed and scanning resumes on the next line.
   620  	if c == '\\' {
   621  		sc.readRune()
   622  		if sc.peekRune() != '\n' {
   623  			sc.errorf(sc.pos, "stray backslash in program")
   624  		}
   625  		sc.readRune()
   626  		goto start
   627  	}
   628  
   629  	// start of the next token
   630  	sc.startToken(val)
   631  
   632  	// comma (common case)
   633  	if c == ',' {
   634  		sc.readRune()
   635  		sc.endToken(val)
   636  		return COMMA
   637  	}
   638  
   639  	// string literal
   640  	if c == '"' || c == '\'' {
   641  		return sc.scanString(val, c)
   642  	}
   643  
   644  	// identifier or keyword
   645  	if isIdentStart(c) {
        		// An r, b or rb immediately followed by a quote is the
        		// prefix of a string literal, not an identifier.  The
        		// prefix is consumed here and scanString copies it into
        		// the raw token text.
   646  		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
   647  			//  r"..."
   648  			//  b"..."
   649  			sc.readRune()
   650  			c = sc.peekRune()
   651  			return sc.scanString(val, c)
   652  		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
   653  			// rb"..."
   654  			sc.readRune()
   655  			sc.readRune()
   656  			c = sc.peekRune()
   657  			return sc.scanString(val, c)
   658  		}
   659  
   660  		for isIdent(c) {
   661  			sc.readRune()
   662  			c = sc.peekRune()
   663  		}
   664  		sc.endToken(val)
        		// Keywords and reserved words are looked up in
        		// keywordToken and returned as their own token.
   665  		if k, ok := keywordToken[val.raw]; ok {
   666  			return k
   667  		}
   668  
   669  		return IDENT
   670  	}
   671  
   672  	// brackets
        	// sc.depth tracks bracket nesting; while it is positive,
        	// newlines and indentation are ignored (see above).
   673  	switch c {
   674  	case '[', '(', '{':
   675  		sc.depth++
   676  		sc.readRune()
   677  		sc.endToken(val)
   678  		switch c {
   679  		case '[':
   680  			return LBRACK
   681  		case '(':
   682  			return LPAREN
   683  		case '{':
   684  			return LBRACE
   685  		}
   686  		panic("unreachable")
   687  
   688  	case ']', ')', '}':
        		// A closing bracket with no open bracket is an error.
   689  		if sc.depth == 0 {
   690  			sc.errorf(sc.pos, "unexpected %q", c)
   691  		} else {
   692  			sc.depth--
   693  		}
   694  		sc.readRune()
   695  		sc.endToken(val)
   696  		switch c {
   697  		case ']':
   698  			return RBRACK
   699  		case ')':
   700  			return RPAREN
   701  		case '}':
   702  			return RBRACE
   703  		}
   704  		panic("unreachable")
   705  	}
   706  
   707  	// int or float literal, or period
   708  	if isdigit(c) || c == '.' {
   709  		return sc.scanNumber(val, c)
   710  	}
   711  
   712  	// other punctuation
        	// The deferred endToken sets val.raw on every return path
        	// below, after the operator's runes have been consumed.
   713  	defer sc.endToken(val)
   714  	switch c {
   715  	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
   716  		start := sc.pos
   717  		sc.readRune()
   718  		if sc.peekRune() == '=' {
   719  			sc.readRune()
   720  			switch c {
   721  			case '<':
   722  				return LE
   723  			case '>':
   724  				return GE
   725  			case '=':
   726  				return EQL
   727  			case '!':
   728  				return NEQ
   729  			case '+':
   730  				return PLUS_EQ
   731  			case '-':
   732  				return MINUS_EQ
   733  			case '/':
   734  				return SLASH_EQ
   735  			case '%':
   736  				return PERCENT_EQ
   737  			case '&':
   738  				return AMP_EQ
   739  			case '|':
   740  				return PIPE_EQ
   741  			case '^':
   742  				return CIRCUMFLEX_EQ
   743  			}
   744  		}
        		// Not followed by '=': a one-rune operator, or the
        		// start of <<, >> or //, each optionally followed by '='.
   745  		switch c {
   746  		case '=':
   747  			return EQ
   748  		case '<':
   749  			if sc.peekRune() == '<' {
   750  				sc.readRune()
   751  				if sc.peekRune() == '=' {
   752  					sc.readRune()
   753  					return LTLT_EQ
   754  				} else {
   755  					return LTLT
   756  				}
   757  			}
   758  			return LT
   759  		case '>':
   760  			if sc.peekRune() == '>' {
   761  				sc.readRune()
   762  				if sc.peekRune() == '=' {
   763  					sc.readRune()
   764  					return GTGT_EQ
   765  				} else {
   766  					return GTGT
   767  				}
   768  			}
   769  			return GT
   770  		case '!':
        			// '!' is valid only as part of "!="; sc.error panics,
        			// so control does not continue past this case.
   771  			sc.error(start, "unexpected input character '!'")
   772  		case '+':
   773  			return PLUS
   774  		case '-':
   775  			return MINUS
   776  		case '/':
   777  			if sc.peekRune() == '/' {
   778  				sc.readRune()
   779  				if sc.peekRune() == '=' {
   780  					sc.readRune()
   781  					return SLASHSLASH_EQ
   782  				} else {
   783  					return SLASHSLASH
   784  				}
   785  			}
   786  			return SLASH
   787  		case '%':
   788  			return PERCENT
   789  		case '&':
   790  			return AMP
   791  		case '|':
   792  			return PIPE
   793  		case '^':
   794  			return CIRCUMFLEX
   795  		}
   796  		panic("unreachable")
   797  
   798  	case ':', ';', '~': // single-char tokens (except comma)
   799  		sc.readRune()
   800  		switch c {
   801  		case ':':
   802  			return COLON
   803  		case ';':
   804  			return SEMI
   805  		case '~':
   806  			return TILDE
   807  		}
   808  		panic("unreachable")
   809  
   810  	case '*': // possibly followed by '*' or '='
   811  		sc.readRune()
   812  		switch sc.peekRune() {
   813  		case '*':
   814  			sc.readRune()
   815  			return STARSTAR
   816  		case '=':
   817  			sc.readRune()
   818  			return STAR_EQ
   819  		}
   820  		return STAR
   821  	}
   822  
        	// Any other rune cannot begin a token.
   823  	sc.errorf(sc.pos, "unexpected input character %#q", c)
   824  	panic("unreachable")
   825  }
   826  
   827  func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
   828  	start := sc.pos
   829  	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
   830  	sc.readRune()
   831  
   832  	// String literals may contain escaped or unescaped newlines,
   833  	// causing them to span multiple lines (gulps) of REPL input;
   834  	// they are the only such token. Thus we cannot call endToken,
   835  	// as it assumes sc.rest is unchanged since startToken.
   836  	// Instead, buffer the token here.
   837  	// TODO(adonovan): opt: buffer only if we encounter a newline.
   838  	raw := new(strings.Builder)
   839  
   840  	// Copy the prefix, e.g. r' or " (see startToken).
   841  	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])
   842  
   843  	if !triple {
   844  		// single-quoted string literal
   845  		for {
   846  			if sc.eof() {
   847  				sc.error(val.pos, "unexpected EOF in string")
   848  			}
   849  			c := sc.readRune()
   850  			raw.WriteRune(c)
   851  			if c == quote {
   852  				break
   853  			}
   854  			if c == '\n' {
   855  				sc.error(val.pos, "unexpected newline in string")
   856  			}
   857  			if c == '\\' {
   858  				if sc.eof() {
   859  					sc.error(val.pos, "unexpected EOF in string")
   860  				}
   861  				c = sc.readRune()
   862  				raw.WriteRune(c)
   863  			}
   864  		}
   865  	} else {
   866  		// triple-quoted string literal
   867  		sc.readRune()
   868  		raw.WriteRune(quote)
   869  		sc.readRune()
   870  		raw.WriteRune(quote)
   871  
   872  		quoteCount := 0
   873  		for {
   874  			if sc.eof() {
   875  				sc.error(val.pos, "unexpected EOF in string")
   876  			}
   877  			c := sc.readRune()
   878  			raw.WriteRune(c)
   879  			if c == quote {
   880  				quoteCount++
   881  				if quoteCount == 3 {
   882  					break
   883  				}
   884  			} else {
   885  				quoteCount = 0
   886  			}
   887  			if c == '\\' {
   888  				if sc.eof() {
   889  					sc.error(val.pos, "unexpected EOF in string")
   890  				}
   891  				c = sc.readRune()
   892  				raw.WriteRune(c)
   893  			}
   894  		}
   895  	}
   896  	val.raw = raw.String()
   897  
   898  	s, _, isByte, err := unquote(val.raw)
   899  	if err != nil {
   900  		sc.error(start, err.Error())
   901  	}
   902  	val.string = s
   903  	if isByte {
   904  		return BYTES
   905  	} else {
   906  		return STRING
   907  	}
   908  }
   909  
   910  func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
   911  	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
   912  	//
   913  	// Python features not supported:
   914  	// - integer literals of >64 bits of precision
   915  	// - 123L or 123l long suffix
   916  	// - traditional octal: 0755
   917  	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
   918  
   919  	start := sc.pos
   920  	fraction, exponent := false, false
   921  
   922  	if c == '.' {
   923  		// dot or start of fraction
   924  		sc.readRune()
   925  		c = sc.peekRune()
   926  		if !isdigit(c) {
   927  			sc.endToken(val)
   928  			return DOT
   929  		}
   930  		fraction = true
   931  	} else if c == '0' {
   932  		// hex, octal, binary or float
   933  		sc.readRune()
   934  		c = sc.peekRune()
   935  
   936  		if c == '.' {
   937  			fraction = true
   938  		} else if c == 'x' || c == 'X' {
   939  			// hex
   940  			sc.readRune()
   941  			c = sc.peekRune()
   942  			if !isxdigit(c) {
   943  				sc.error(start, "invalid hex literal")
   944  			}
   945  			for isxdigit(c) {
   946  				sc.readRune()
   947  				c = sc.peekRune()
   948  			}
   949  		} else if c == 'o' || c == 'O' {
   950  			// octal
   951  			sc.readRune()
   952  			c = sc.peekRune()
   953  			if !isodigit(c) {
   954  				sc.error(sc.pos, "invalid octal literal")
   955  			}
   956  			for isodigit(c) {
   957  				sc.readRune()
   958  				c = sc.peekRune()
   959  			}
   960  		} else if c == 'b' || c == 'B' {
   961  			// binary
   962  			sc.readRune()
   963  			c = sc.peekRune()
   964  			if !isbdigit(c) {
   965  				sc.error(sc.pos, "invalid binary literal")
   966  			}
   967  			for isbdigit(c) {
   968  				sc.readRune()
   969  				c = sc.peekRune()
   970  			}
   971  		} else {
   972  			// float (or obsolete octal "0755")
   973  			allzeros, octal := true, true
   974  			for isdigit(c) {
   975  				if c != '0' {
   976  					allzeros = false
   977  				}
   978  				if c > '7' {
   979  					octal = false
   980  				}
   981  				sc.readRune()
   982  				c = sc.peekRune()
   983  			}
   984  			if c == '.' {
   985  				fraction = true
   986  			} else if c == 'e' || c == 'E' {
   987  				exponent = true
   988  			} else if octal && !allzeros {
   989  				sc.endToken(val)
   990  				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
   991  			}
   992  		}
   993  	} else {
   994  		// decimal
   995  		for isdigit(c) {
   996  			sc.readRune()
   997  			c = sc.peekRune()
   998  		}
   999  
  1000  		if c == '.' {
  1001  			fraction = true
  1002  		} else if c == 'e' || c == 'E' {
  1003  			exponent = true
  1004  		}
  1005  	}
  1006  
  1007  	if fraction {
  1008  		sc.readRune() // consume '.'
  1009  		c = sc.peekRune()
  1010  		for isdigit(c) {
  1011  			sc.readRune()
  1012  			c = sc.peekRune()
  1013  		}
  1014  
  1015  		if c == 'e' || c == 'E' {
  1016  			exponent = true
  1017  		}
  1018  	}
  1019  
  1020  	if exponent {
  1021  		sc.readRune() // consume [eE]
  1022  		c = sc.peekRune()
  1023  		if c == '+' || c == '-' {
  1024  			sc.readRune()
  1025  			c = sc.peekRune()
  1026  			if !isdigit(c) {
  1027  				sc.error(sc.pos, "invalid float literal")
  1028  			}
  1029  		}
  1030  		for isdigit(c) {
  1031  			sc.readRune()
  1032  			c = sc.peekRune()
  1033  		}
  1034  	}
  1035  
  1036  	sc.endToken(val)
  1037  	if fraction || exponent {
  1038  		var err error
  1039  		val.float, err = strconv.ParseFloat(val.raw, 64)
  1040  		if err != nil {
  1041  			sc.error(sc.pos, "invalid float literal")
  1042  		}
  1043  		return FLOAT
  1044  	} else {
  1045  		var err error
  1046  		s := val.raw
  1047  		val.bigInt = nil
  1048  		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
  1049  			val.int, err = strconv.ParseInt(s[2:], 8, 64)
  1050  		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
  1051  			val.int, err = strconv.ParseInt(s[2:], 2, 64)
  1052  		} else {
  1053  			val.int, err = strconv.ParseInt(s, 0, 64)
  1054  			if err != nil {
  1055  				num := new(big.Int)
  1056  				var ok bool
  1057  				val.bigInt, ok = num.SetString(s, 0)
  1058  				if ok {
  1059  					err = nil
  1060  				}
  1061  			}
  1062  		}
  1063  		if err != nil {
  1064  			sc.error(start, "invalid int literal")
  1065  		}
  1066  		return INT
  1067  	}
  1068  }
  1069  
  1070  // isIdent reports whether c is an identifier rune.
  1071  func isIdent(c rune) bool {
  1072  	return isdigit(c) || isIdentStart(c)
  1073  }
  1074  
  1075  func isIdentStart(c rune) bool {
  1076  	return 'a' <= c && c <= 'z' ||
  1077  		'A' <= c && c <= 'Z' ||
  1078  		c == '_' ||
  1079  		unicode.IsLetter(c)
  1080  }
  1081  
  1082  func isdigit(c rune) bool  { return '0' <= c && c <= '9' }
  1083  func isodigit(c rune) bool { return '0' <= c && c <= '7' }
  1084  func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' }
  1085  func isbdigit(c rune) bool { return '0' == c || c == '1' }
  1086  
  1087  // keywordToken records the special tokens for
  1088  // strings that should not be treated as ordinary identifiers.
  1089  var keywordToken = map[string]Token{
  1090  	"and":      AND,
  1091  	"break":    BREAK,
  1092  	"continue": CONTINUE,
  1093  	"def":      DEF,
  1094  	"elif":     ELIF,
  1095  	"else":     ELSE,
  1096  	"for":      FOR,
  1097  	"if":       IF,
  1098  	"in":       IN,
  1099  	"lambda":   LAMBDA,
  1100  	"load":     LOAD,
  1101  	"not":      NOT,
  1102  	"or":       OR,
  1103  	"pass":     PASS,
  1104  	"return":   RETURN,
  1105  	"while":    WHILE,
  1106  
  1107  	// reserved words:
  1108  	"as": ILLEGAL,
  1109  	// "assert":   ILLEGAL, // heavily used by our tests
  1110  	"class":    ILLEGAL,
  1111  	"del":      ILLEGAL,
  1112  	"except":   ILLEGAL,
  1113  	"finally":  ILLEGAL,
  1114  	"from":     ILLEGAL,
  1115  	"global":   ILLEGAL,
  1116  	"import":   ILLEGAL,
  1117  	"is":       ILLEGAL,
  1118  	"nonlocal": ILLEGAL,
  1119  	"raise":    ILLEGAL,
  1120  	"try":      ILLEGAL,
  1121  	"with":     ILLEGAL,
  1122  	"yield":    ILLEGAL,
  1123  }
  1124  

View as plain text