scanner.go

Documentation: github.com/hashicorp/hcl/json/scanner

     1  package scanner
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"os"
     7  	"unicode"
     8  	"unicode/utf8"
     9  
    10  	"github.com/hashicorp/hcl/json/token"
    11  )
    12  
    13  // eof represents a marker rune for the end of the reader.
    14  const eof = rune(0)
    15  
    16  // Scanner defines a lexical scanner
    17  type Scanner struct {
    18  	buf *bytes.Buffer // Source buffer for advancing and scanning
    19  	src []byte        // Source buffer for immutable access
    20  
    21  	// Source Position
    22  	srcPos  token.Pos // current position
    23  	prevPos token.Pos // previous position, used for peek() method
    24  
    25  	lastCharLen int // length of last character in bytes
    26  	lastLineLen int // length of last line in characters (for correct column reporting)
    27  
    28  	tokStart int // token text start position
    29  	tokEnd   int // token text end  position
    30  
    31  	// Error is called for each error encountered. If no Error
    32  	// function is set, the error is reported to os.Stderr.
    33  	Error func(pos token.Pos, msg string)
    34  
    35  	// ErrorCount is incremented by one for each error encountered.
    36  	ErrorCount int
    37  
    38  	// tokPos is the start position of most recently scanned token; set by
    39  	// Scan. The Filename field is always left untouched by the Scanner.  If
    40  	// an error is reported (via Error) and Position is invalid, the scanner is
    41  	// not inside a token.
    42  	tokPos token.Pos
    43  }
    44  
    45  // New creates and initializes a new instance of Scanner using src as
    46  // its source content.
    47  func New(src []byte) *Scanner {
    48  	// even though we accept a src, we read from a io.Reader compatible type
    49  	// (*bytes.Buffer). So in the future we might easily change it to streaming
    50  	// read.
    51  	b := bytes.NewBuffer(src)
    52  	s := &Scanner{
    53  		buf: b,
    54  		src: src,
    55  	}
    56  
    57  	// srcPosition always starts with 1
    58  	s.srcPos.Line = 1
    59  	return s
    60  }
    61  
    62  // next reads the next rune from the bufferred reader. Returns the rune(0) if
    63  // an error occurs (or io.EOF is returned).
    64  func (s *Scanner) next() rune {
    65  	ch, size, err := s.buf.ReadRune()
    66  	if err != nil {
    67  		// advance for error reporting
    68  		s.srcPos.Column++
    69  		s.srcPos.Offset += size
    70  		s.lastCharLen = size
    71  		return eof
    72  	}
    73  
    74  	if ch == utf8.RuneError && size == 1 {
    75  		s.srcPos.Column++
    76  		s.srcPos.Offset += size
    77  		s.lastCharLen = size
    78  		s.err("illegal UTF-8 encoding")
    79  		return ch
    80  	}
    81  
    82  	// remember last position
    83  	s.prevPos = s.srcPos
    84  
    85  	s.srcPos.Column++
    86  	s.lastCharLen = size
    87  	s.srcPos.Offset += size
    88  
    89  	if ch == '\n' {
    90  		s.srcPos.Line++
    91  		s.lastLineLen = s.srcPos.Column
    92  		s.srcPos.Column = 0
    93  	}
    94  
    95  	// debug
    96  	// fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column)
    97  	return ch
    98  }
    99  
   100  // unread unreads the previous read Rune and updates the source position
   101  func (s *Scanner) unread() {
   102  	if err := s.buf.UnreadRune(); err != nil {
   103  		panic(err) // this is user fault, we should catch it
   104  	}
   105  	s.srcPos = s.prevPos // put back last position
   106  }
   107  
   108  // peek returns the next rune without advancing the reader.
   109  func (s *Scanner) peek() rune {
   110  	peek, _, err := s.buf.ReadRune()
   111  	if err != nil {
   112  		return eof
   113  	}
   114  
   115  	s.buf.UnreadRune()
   116  	return peek
   117  }
   118  
   119  // Scan scans the next token and returns the token.
   120  func (s *Scanner) Scan() token.Token {
   121  	ch := s.next()
   122  
   123  	// skip white space
   124  	for isWhitespace(ch) {
   125  		ch = s.next()
   126  	}
   127  
   128  	var tok token.Type
   129  
   130  	// token text markings
   131  	s.tokStart = s.srcPos.Offset - s.lastCharLen
   132  
   133  	// token position, initial next() is moving the offset by one(size of rune
   134  	// actually), though we are interested with the starting point
   135  	s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen
   136  	if s.srcPos.Column > 0 {
   137  		// common case: last character was not a '\n'
   138  		s.tokPos.Line = s.srcPos.Line
   139  		s.tokPos.Column = s.srcPos.Column
   140  	} else {
   141  		// last character was a '\n'
   142  		// (we cannot be at the beginning of the source
   143  		// since we have called next() at least once)
   144  		s.tokPos.Line = s.srcPos.Line - 1
   145  		s.tokPos.Column = s.lastLineLen
   146  	}
   147  
   148  	switch {
   149  	case isLetter(ch):
   150  		lit := s.scanIdentifier()
   151  		if lit == "true" || lit == "false" {
   152  			tok = token.BOOL
   153  		} else if lit == "null" {
   154  			tok = token.NULL
   155  		} else {
   156  			s.err("illegal char")
   157  		}
   158  	case isDecimal(ch):
   159  		tok = s.scanNumber(ch)
   160  	default:
   161  		switch ch {
   162  		case eof:
   163  			tok = token.EOF
   164  		case '"':
   165  			tok = token.STRING
   166  			s.scanString()
   167  		case '.':
   168  			tok = token.PERIOD
   169  			ch = s.peek()
   170  			if isDecimal(ch) {
   171  				tok = token.FLOAT
   172  				ch = s.scanMantissa(ch)
   173  				ch = s.scanExponent(ch)
   174  			}
   175  		case '[':
   176  			tok = token.LBRACK
   177  		case ']':
   178  			tok = token.RBRACK
   179  		case '{':
   180  			tok = token.LBRACE
   181  		case '}':
   182  			tok = token.RBRACE
   183  		case ',':
   184  			tok = token.COMMA
   185  		case ':':
   186  			tok = token.COLON
   187  		case '-':
   188  			if isDecimal(s.peek()) {
   189  				ch := s.next()
   190  				tok = s.scanNumber(ch)
   191  			} else {
   192  				s.err("illegal char")
   193  			}
   194  		default:
   195  			s.err("illegal char: " + string(ch))
   196  		}
   197  	}
   198  
   199  	// finish token ending
   200  	s.tokEnd = s.srcPos.Offset
   201  
   202  	// create token literal
   203  	var tokenText string
   204  	if s.tokStart >= 0 {
   205  		tokenText = string(s.src[s.tokStart:s.tokEnd])
   206  	}
   207  	s.tokStart = s.tokEnd // ensure idempotency of tokenText() call
   208  
   209  	return token.Token{
   210  		Type: tok,
   211  		Pos:  s.tokPos,
   212  		Text: tokenText,
   213  	}
   214  }
   215  
   216  // scanNumber scans a HCL number definition starting with the given rune
   217  func (s *Scanner) scanNumber(ch rune) token.Type {
   218  	zero := ch == '0'
   219  	pos := s.srcPos
   220  
   221  	s.scanMantissa(ch)
   222  	ch = s.next() // seek forward
   223  	if ch == 'e' || ch == 'E' {
   224  		ch = s.scanExponent(ch)
   225  		return token.FLOAT
   226  	}
   227  
   228  	if ch == '.' {
   229  		ch = s.scanFraction(ch)
   230  		if ch == 'e' || ch == 'E' {
   231  			ch = s.next()
   232  			ch = s.scanExponent(ch)
   233  		}
   234  		return token.FLOAT
   235  	}
   236  
   237  	if ch != eof {
   238  		s.unread()
   239  	}
   240  
   241  	// If we have a larger number and this is zero, error
   242  	if zero && pos != s.srcPos {
   243  		s.err("numbers cannot start with 0")
   244  	}
   245  
   246  	return token.NUMBER
   247  }
   248  
   249  // scanMantissa scans the mantissa beginning from the rune. It returns the next
   250  // non decimal rune. It's used to determine wheter it's a fraction or exponent.
   251  func (s *Scanner) scanMantissa(ch rune) rune {
   252  	scanned := false
   253  	for isDecimal(ch) {
   254  		ch = s.next()
   255  		scanned = true
   256  	}
   257  
   258  	if scanned && ch != eof {
   259  		s.unread()
   260  	}
   261  	return ch
   262  }
   263  
   264  // scanFraction scans the fraction after the '.' rune
   265  func (s *Scanner) scanFraction(ch rune) rune {
   266  	if ch == '.' {
   267  		ch = s.peek() // we peek just to see if we can move forward
   268  		ch = s.scanMantissa(ch)
   269  	}
   270  	return ch
   271  }
   272  
   273  // scanExponent scans the remaining parts of an exponent after the 'e' or 'E'
   274  // rune.
   275  func (s *Scanner) scanExponent(ch rune) rune {
   276  	if ch == 'e' || ch == 'E' {
   277  		ch = s.next()
   278  		if ch == '-' || ch == '+' {
   279  			ch = s.next()
   280  		}
   281  		ch = s.scanMantissa(ch)
   282  	}
   283  	return ch
   284  }
   285  
   286  // scanString scans a quoted string
   287  func (s *Scanner) scanString() {
   288  	braces := 0
   289  	for {
   290  		// '"' opening already consumed
   291  		// read character after quote
   292  		ch := s.next()
   293  
   294  		if ch == '\n' || ch < 0 || ch == eof {
   295  			s.err("literal not terminated")
   296  			return
   297  		}
   298  
   299  		if ch == '"' {
   300  			break
   301  		}
   302  
   303  		// If we're going into a ${} then we can ignore quotes for awhile
   304  		if braces == 0 && ch == '$' && s.peek() == '{' {
   305  			braces++
   306  			s.next()
   307  		} else if braces > 0 && ch == '{' {
   308  			braces++
   309  		}
   310  		if braces > 0 && ch == '}' {
   311  			braces--
   312  		}
   313  
   314  		if ch == '\\' {
   315  			s.scanEscape()
   316  		}
   317  	}
   318  
   319  	return
   320  }
   321  
   322  // scanEscape scans an escape sequence
   323  func (s *Scanner) scanEscape() rune {
   324  	// http://en.cppreference.com/w/cpp/language/escape
   325  	ch := s.next() // read character after '/'
   326  	switch ch {
   327  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
   328  		// nothing to do
   329  	case '0', '1', '2', '3', '4', '5', '6', '7':
   330  		// octal notation
   331  		ch = s.scanDigits(ch, 8, 3)
   332  	case 'x':
   333  		// hexademical notation
   334  		ch = s.scanDigits(s.next(), 16, 2)
   335  	case 'u':
   336  		// universal character name
   337  		ch = s.scanDigits(s.next(), 16, 4)
   338  	case 'U':
   339  		// universal character name
   340  		ch = s.scanDigits(s.next(), 16, 8)
   341  	default:
   342  		s.err("illegal char escape")
   343  	}
   344  	return ch
   345  }
   346  
   347  // scanDigits scans a rune with the given base for n times. For example an
   348  // octal notation \184 would yield in scanDigits(ch, 8, 3)
   349  func (s *Scanner) scanDigits(ch rune, base, n int) rune {
   350  	for n > 0 && digitVal(ch) < base {
   351  		ch = s.next()
   352  		n--
   353  	}
   354  	if n > 0 {
   355  		s.err("illegal char escape")
   356  	}
   357  
   358  	// we scanned all digits, put the last non digit char back
   359  	s.unread()
   360  	return ch
   361  }
   362  
   363  // scanIdentifier scans an identifier and returns the literal string
   364  func (s *Scanner) scanIdentifier() string {
   365  	offs := s.srcPos.Offset - s.lastCharLen
   366  	ch := s.next()
   367  	for isLetter(ch) || isDigit(ch) || ch == '-' {
   368  		ch = s.next()
   369  	}
   370  
   371  	if ch != eof {
   372  		s.unread() // we got identifier, put back latest char
   373  	}
   374  
   375  	return string(s.src[offs:s.srcPos.Offset])
   376  }
   377  
   378  // recentPosition returns the position of the character immediately after the
   379  // character or token returned by the last call to Scan.
   380  func (s *Scanner) recentPosition() (pos token.Pos) {
   381  	pos.Offset = s.srcPos.Offset - s.lastCharLen
   382  	switch {
   383  	case s.srcPos.Column > 0:
   384  		// common case: last character was not a '\n'
   385  		pos.Line = s.srcPos.Line
   386  		pos.Column = s.srcPos.Column
   387  	case s.lastLineLen > 0:
   388  		// last character was a '\n'
   389  		// (we cannot be at the beginning of the source
   390  		// since we have called next() at least once)
   391  		pos.Line = s.srcPos.Line - 1
   392  		pos.Column = s.lastLineLen
   393  	default:
   394  		// at the beginning of the source
   395  		pos.Line = 1
   396  		pos.Column = 1
   397  	}
   398  	return
   399  }
   400  
   401  // err prints the error of any scanning to s.Error function. If the function is
   402  // not defined, by default it prints them to os.Stderr
   403  func (s *Scanner) err(msg string) {
   404  	s.ErrorCount++
   405  	pos := s.recentPosition()
   406  
   407  	if s.Error != nil {
   408  		s.Error(pos, msg)
   409  		return
   410  	}
   411  
   412  	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
   413  }
   414  
   415  // isHexadecimal returns true if the given rune is a letter
   416  func isLetter(ch rune) bool {
   417  	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
   418  }
   419  
   420  // isHexadecimal returns true if the given rune is a decimal digit
   421  func isDigit(ch rune) bool {
   422  	return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
   423  }
   424  
   425  // isHexadecimal returns true if the given rune is a decimal number
   426  func isDecimal(ch rune) bool {
   427  	return '0' <= ch && ch <= '9'
   428  }
   429  
   430  // isHexadecimal returns true if the given rune is an hexadecimal number
   431  func isHexadecimal(ch rune) bool {
   432  	return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
   433  }
   434  
   435  // isWhitespace returns true if the rune is a space, tab, newline or carriage return
   436  func isWhitespace(ch rune) bool {
   437  	return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
   438  }
   439  
   440  // digitVal returns the integer value of a given octal,decimal or hexadecimal rune
   441  func digitVal(ch rune) int {
   442  	switch {
   443  	case '0' <= ch && ch <= '9':
   444  		return int(ch - '0')
   445  	case 'a' <= ch && ch <= 'f':
   446  		return int(ch - 'a' + 10)
   447  	case 'A' <= ch && ch <= 'F':
   448  		return int(ch - 'A' + 10)
   449  	}
   450  	return 16 // larger than any legal digit val
   451  }
   452
View as plain text