lexer.go

Documentation: github.com/vektah/gqlparser/lexer

     1  package lexer
     2  
     3  import (
     4  	"bytes"
     5  	"unicode/utf8"
     6  
     7  	"github.com/vektah/gqlparser/ast"
     8  	"github.com/vektah/gqlparser/gqlerror"
     9  )
    10  
    11  // Lexer turns graphql request and schema strings into tokens
    12  type Lexer struct {
    13  	*ast.Source
    14  	// An offset into the string in bytes
    15  	start int
    16  	// An offset into the string in runes
    17  	startRunes int
    18  	// An offset into the string in bytes
    19  	end int
    20  	// An offset into the string in runes
    21  	endRunes int
    22  	// the current line number
    23  	line int
    24  	// An offset into the string in rune
    25  	lineStartRunes int
    26  }
    27  
    28  func New(src *ast.Source) Lexer {
    29  	return Lexer{
    30  		Source: src,
    31  		line:   1,
    32  	}
    33  }
    34  
    35  // take one rune from input and advance end
    36  func (s *Lexer) peek() (rune, int) {
    37  	return utf8.DecodeRuneInString(s.Input[s.end:])
    38  }
    39  
    40  func (s *Lexer) makeToken(kind Type) (Token, *gqlerror.Error) {
    41  	return s.makeValueToken(kind, s.Input[s.start:s.end])
    42  }
    43  
    44  func (s *Lexer) makeValueToken(kind Type, value string) (Token, *gqlerror.Error) {
    45  	return Token{
    46  		Kind:  kind,
    47  		Value: value,
    48  		Pos: ast.Position{
    49  			Start:  s.startRunes,
    50  			End:    s.endRunes,
    51  			Line:   s.line,
    52  			Column: s.startRunes - s.lineStartRunes + 1,
    53  			Src:    s.Source,
    54  		},
    55  	}, nil
    56  }
    57  
    58  func (s *Lexer) makeError(format string, args ...interface{}) (Token, *gqlerror.Error) {
    59  	column := s.endRunes - s.lineStartRunes + 1
    60  	return Token{
    61  		Kind: Invalid,
    62  		Pos: ast.Position{
    63  			Start:  s.startRunes,
    64  			End:    s.endRunes,
    65  			Line:   s.line,
    66  			Column: column,
    67  			Src:    s.Source,
    68  		},
    69  	}, gqlerror.ErrorLocf(s.Source.Name, s.line, column, format, args...)
    70  }
    71  
    72  // ReadToken gets the next token from the source starting at the given position.
    73  //
    74  // This skips over whitespace and comments until it finds the next lexable
    75  // token, then lexes punctuators immediately or calls the appropriate helper
    76  // function for more complicated tokens.
    77  func (s *Lexer) ReadToken() (token Token, err *gqlerror.Error) {
    78  
    79  	s.ws()
    80  	s.start = s.end
    81  	s.startRunes = s.endRunes
    82  
    83  	if s.end >= len(s.Input) {
    84  		return s.makeToken(EOF)
    85  	}
    86  	r := s.Input[s.start]
    87  	s.end++
    88  	s.endRunes++
    89  	switch r {
    90  	case '!':
    91  		return s.makeValueToken(Bang, "")
    92  
    93  	case '$':
    94  		return s.makeValueToken(Dollar, "")
    95  	case '&':
    96  		return s.makeValueToken(Amp, "")
    97  	case '(':
    98  		return s.makeValueToken(ParenL, "")
    99  	case ')':
   100  		return s.makeValueToken(ParenR, "")
   101  	case '.':
   102  		if len(s.Input) > s.start+2 && s.Input[s.start:s.start+3] == "..." {
   103  			s.end += 2
   104  			s.endRunes += 2
   105  			return s.makeValueToken(Spread, "")
   106  		}
   107  	case ':':
   108  		return s.makeValueToken(Colon, "")
   109  	case '=':
   110  		return s.makeValueToken(Equals, "")
   111  	case '@':
   112  		return s.makeValueToken(At, "")
   113  	case '[':
   114  		return s.makeValueToken(BracketL, "")
   115  	case ']':
   116  		return s.makeValueToken(BracketR, "")
   117  	case '{':
   118  		return s.makeValueToken(BraceL, "")
   119  	case '}':
   120  		return s.makeValueToken(BraceR, "")
   121  	case '|':
   122  		return s.makeValueToken(Pipe, "")
   123  	case '#':
   124  		s.readComment()
   125  		return s.ReadToken()
   126  
   127  	case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z':
   128  		return s.readName()
   129  
   130  	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   131  		return s.readNumber()
   132  
   133  	case '"':
   134  		if len(s.Input) > s.start+2 && s.Input[s.start:s.start+3] == `"""` {
   135  			return s.readBlockString()
   136  		}
   137  
   138  		return s.readString()
   139  	}
   140  
   141  	s.end--
   142  	s.endRunes--
   143  
   144  	if r < 0x0020 && r != 0x0009 && r != 0x000a && r != 0x000d {
   145  		return s.makeError(`Cannot contain the invalid character "\u%04d"`, r)
   146  	}
   147  
   148  	if r == '\'' {
   149  		return s.makeError(`Unexpected single quote character ('), did you mean to use a double quote (")?`)
   150  	}
   151  
   152  	return s.makeError(`Cannot parse the unexpected character "%s".`, string(r))
   153  }
   154  
   155  // ws reads from body starting at startPosition until it finds a non-whitespace
   156  // or commented character, and updates the token end to include all whitespace
   157  func (s *Lexer) ws() {
   158  	for s.end < len(s.Input) {
   159  		switch s.Input[s.end] {
   160  		case '\t', ' ', ',':
   161  			s.end++
   162  			s.endRunes++
   163  		case '\n':
   164  			s.end++
   165  			s.endRunes++
   166  			s.line++
   167  			s.lineStartRunes = s.endRunes
   168  		case '\r':
   169  			s.end++
   170  			s.endRunes++
   171  			s.line++
   172  			s.lineStartRunes = s.endRunes
   173  			// skip the following newline if its there
   174  			if s.end < len(s.Input) && s.Input[s.end] == '\n' {
   175  				s.end++
   176  				s.endRunes++
   177  			}
   178  			// byte order mark, given ws is hot path we aren't relying on the unicode package here.
   179  		case 0xef:
   180  			if s.end+2 < len(s.Input) && s.Input[s.end+1] == 0xBB && s.Input[s.end+2] == 0xBF {
   181  				s.end += 3
   182  				s.endRunes++
   183  			} else {
   184  				return
   185  			}
   186  		default:
   187  			return
   188  		}
   189  	}
   190  }
   191  
   192  // readComment from the input
   193  //
   194  // #[\u0009\u0020-\uFFFF]*
   195  func (s *Lexer) readComment() (Token, *gqlerror.Error) {
   196  	for s.end < len(s.Input) {
   197  		r, w := s.peek()
   198  
   199  		// SourceCharacter but not LineTerminator
   200  		if r > 0x001f || r == '\t' {
   201  			s.end += w
   202  			s.endRunes++
   203  		} else {
   204  			break
   205  		}
   206  	}
   207  
   208  	return s.makeToken(Comment)
   209  }
   210  
   211  // readNumber from the input, either a float
   212  // or an int depending on whether a decimal point appears.
   213  //
   214  // Int:   -?(0|[1-9][0-9]*)
   215  // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
   216  func (s *Lexer) readNumber() (Token, *gqlerror.Error) {
   217  	float := false
   218  
   219  	// backup to the first digit
   220  	s.end--
   221  	s.endRunes--
   222  
   223  	s.acceptByte('-')
   224  
   225  	if s.acceptByte('0') {
   226  		if consumed := s.acceptDigits(); consumed != 0 {
   227  			s.end -= consumed
   228  			s.endRunes -= consumed
   229  			return s.makeError("Invalid number, unexpected digit after 0: %s.", s.describeNext())
   230  		}
   231  	} else {
   232  		if consumed := s.acceptDigits(); consumed == 0 {
   233  			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
   234  		}
   235  	}
   236  
   237  	if s.acceptByte('.') {
   238  		float = true
   239  
   240  		if consumed := s.acceptDigits(); consumed == 0 {
   241  			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
   242  		}
   243  	}
   244  
   245  	if s.acceptByte('e', 'E') {
   246  		float = true
   247  
   248  		s.acceptByte('-', '+')
   249  
   250  		if consumed := s.acceptDigits(); consumed == 0 {
   251  			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
   252  		}
   253  	}
   254  
   255  	if float {
   256  		return s.makeToken(Float)
   257  	} else {
   258  		return s.makeToken(Int)
   259  	}
   260  }
   261  
   262  // acceptByte if it matches any of given bytes, returning true if it found anything
   263  func (s *Lexer) acceptByte(bytes ...uint8) bool {
   264  	if s.end >= len(s.Input) {
   265  		return false
   266  	}
   267  
   268  	for _, accepted := range bytes {
   269  		if s.Input[s.end] == accepted {
   270  			s.end++
   271  			s.endRunes++
   272  			return true
   273  		}
   274  	}
   275  	return false
   276  }
   277  
   278  // acceptDigits from the input, returning the number of digits it found
   279  func (s *Lexer) acceptDigits() int {
   280  	consumed := 0
   281  	for s.end < len(s.Input) && s.Input[s.end] >= '0' && s.Input[s.end] <= '9' {
   282  		s.end++
   283  		s.endRunes++
   284  		consumed++
   285  	}
   286  
   287  	return consumed
   288  }
   289  
   290  // describeNext peeks at the input and returns a human readable string. This should will alloc
   291  // and should only be used in errors
   292  func (s *Lexer) describeNext() string {
   293  	if s.end < len(s.Input) {
   294  		return `"` + string(s.Input[s.end]) + `"`
   295  	}
   296  	return "<EOF>"
   297  }
   298  
   299  // readString from the input
   300  //
   301  // "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
   302  func (s *Lexer) readString() (Token, *gqlerror.Error) {
   303  	inputLen := len(s.Input)
   304  
   305  	// this buffer is lazily created only if there are escape characters.
   306  	var buf *bytes.Buffer
   307  
   308  	// skip the opening quote
   309  	s.start++
   310  	s.startRunes++
   311  
   312  	for s.end < inputLen {
   313  		r := s.Input[s.end]
   314  		if r == '\n' || r == '\r' {
   315  			break
   316  		}
   317  		if r < 0x0020 && r != '\t' {
   318  			return s.makeError(`Invalid character within String: "\u%04d".`, r)
   319  		}
   320  		switch r {
   321  		default:
   322  			var char = rune(r)
   323  			var w = 1
   324  
   325  			// skip unicode overhead if we are in the ascii range
   326  			if r >= 127 {
   327  				char, w = utf8.DecodeRuneInString(s.Input[s.end:])
   328  			}
   329  			s.end += w
   330  			s.endRunes++
   331  
   332  			if buf != nil {
   333  				buf.WriteRune(char)
   334  			}
   335  
   336  		case '"':
   337  			t, err := s.makeToken(String)
   338  			// the token should not include the quotes in its value, but should cover them in its position
   339  			t.Pos.Start--
   340  			t.Pos.End++
   341  
   342  			if buf != nil {
   343  				t.Value = buf.String()
   344  			}
   345  
   346  			// skip the close quote
   347  			s.end++
   348  			s.endRunes++
   349  
   350  			return t, err
   351  
   352  		case '\\':
   353  			if s.end+1 >= inputLen {
   354  				s.end++
   355  				s.endRunes++
   356  				return s.makeError(`Invalid character escape sequence.`)
   357  			}
   358  
   359  			if buf == nil {
   360  				buf = bytes.NewBufferString(s.Input[s.start:s.end])
   361  			}
   362  
   363  			escape := s.Input[s.end+1]
   364  
   365  			if escape == 'u' {
   366  				if s.end+6 >= inputLen {
   367  					s.end++
   368  					s.endRunes++
   369  					return s.makeError("Invalid character escape sequence: \\%s.", s.Input[s.end:])
   370  				}
   371  
   372  				r, ok := unhex(s.Input[s.end+2 : s.end+6])
   373  				if !ok {
   374  					s.end++
   375  					s.endRunes++
   376  					return s.makeError("Invalid character escape sequence: \\%s.", s.Input[s.end:s.end+5])
   377  				}
   378  				buf.WriteRune(r)
   379  				s.end += 6
   380  				s.endRunes += 6
   381  			} else {
   382  				switch escape {
   383  				case '"', '/', '\\':
   384  					buf.WriteByte(escape)
   385  				case 'b':
   386  					buf.WriteByte('\b')
   387  				case 'f':
   388  					buf.WriteByte('\f')
   389  				case 'n':
   390  					buf.WriteByte('\n')
   391  				case 'r':
   392  					buf.WriteByte('\r')
   393  				case 't':
   394  					buf.WriteByte('\t')
   395  				default:
   396  					s.end += 1
   397  					s.endRunes += 1
   398  					return s.makeError("Invalid character escape sequence: \\%s.", string(escape))
   399  				}
   400  				s.end += 2
   401  				s.endRunes += 2
   402  			}
   403  		}
   404  	}
   405  
   406  	return s.makeError("Unterminated string.")
   407  }
   408  
   409  // readBlockString from the input
   410  //
   411  // """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
   412  func (s *Lexer) readBlockString() (Token, *gqlerror.Error) {
   413  	inputLen := len(s.Input)
   414  
   415  	var buf bytes.Buffer
   416  
   417  	// skip the opening quote
   418  	s.start += 3
   419  	s.startRunes += 3
   420  	s.end += 2
   421  	s.endRunes += 2
   422  
   423  	for s.end < inputLen {
   424  		r := s.Input[s.end]
   425  
   426  		// Closing triple quote (""")
   427  		if r == '"' && s.end+3 <= inputLen && s.Input[s.end:s.end+3] == `"""` {
   428  			t, err := s.makeValueToken(BlockString, blockStringValue(buf.String()))
   429  
   430  			// the token should not include the quotes in its value, but should cover them in its position
   431  			t.Pos.Start -= 3
   432  			t.Pos.End += 3
   433  
   434  			// skip the close quote
   435  			s.end += 3
   436  			s.endRunes += 3
   437  
   438  			return t, err
   439  		}
   440  
   441  		// SourceCharacter
   442  		if r < 0x0020 && r != '\t' && r != '\n' && r != '\r' {
   443  			return s.makeError(`Invalid character within String: "\u%04d".`, r)
   444  		}
   445  
   446  		if r == '\\' && s.end+4 <= inputLen && s.Input[s.end:s.end+4] == `\"""` {
   447  			buf.WriteString(`"""`)
   448  			s.end += 4
   449  			s.endRunes += 4
   450  		} else if r == '\r' {
   451  			if s.end+1 < inputLen && s.Input[s.end+1] == '\n' {
   452  				s.end++
   453  				s.endRunes++
   454  			}
   455  
   456  			buf.WriteByte('\n')
   457  			s.end++
   458  			s.endRunes++
   459  		} else {
   460  			var char = rune(r)
   461  			var w = 1
   462  
   463  			// skip unicode overhead if we are in the ascii range
   464  			if r >= 127 {
   465  				char, w = utf8.DecodeRuneInString(s.Input[s.end:])
   466  			}
   467  			s.end += w
   468  			s.endRunes++
   469  			buf.WriteRune(char)
   470  		}
   471  	}
   472  
   473  	return s.makeError("Unterminated string.")
   474  }
   475  
   476  func unhex(b string) (v rune, ok bool) {
   477  	for _, c := range b {
   478  		v <<= 4
   479  		switch {
   480  		case '0' <= c && c <= '9':
   481  			v |= c - '0'
   482  		case 'a' <= c && c <= 'f':
   483  			v |= c - 'a' + 10
   484  		case 'A' <= c && c <= 'F':
   485  			v |= c - 'A' + 10
   486  		default:
   487  			return 0, false
   488  		}
   489  	}
   490  
   491  	return v, true
   492  }
   493  
   494  // readName from the input
   495  //
   496  // [_A-Za-z][_0-9A-Za-z]*
   497  func (s *Lexer) readName() (Token, *gqlerror.Error) {
   498  	for s.end < len(s.Input) {
   499  		r, w := s.peek()
   500  
   501  		if (r >= '0' && r <= '9') || (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || r == '_' {
   502  			s.end += w
   503  			s.endRunes++
   504  		} else {
   505  			break
   506  		}
   507  	}
   508  
   509  	return s.makeToken(Name)
   510  }
   511
View as plain text