lexer.go

Documentation: github.com/vektah/gqlparser/v2/lexer

     1  package lexer
     2  
     3  import (
     4  	"bytes"
     5  	"unicode/utf8"
     6  
     7  	"github.com/vektah/gqlparser/v2/ast"
     8  	"github.com/vektah/gqlparser/v2/gqlerror"
     9  )
    10  
// Lexer turns graphql request and schema strings into tokens.
//
// Every position is tracked twice: as a byte offset, which is what the lexer
// uses to index and slice Input, and as a rune offset, which is what ends up
// in the positions reported on tokens and errors.
type Lexer struct {
	*ast.Source
	// Byte offset into Input at which the current token starts.
	start int
	// Rune offset into Input at which the current token starts.
	startRunes int
	// Byte offset into Input of the next byte that has not been consumed yet.
	end int
	// Rune offset into Input matching end.
	endRunes int
	// The current line number; New initialises it to 1.
	line int
	// Rune offset into Input at which the current line starts; used to
	// derive column numbers.
	lineStartRunes int
}
    27  
    28  func New(src *ast.Source) Lexer {
    29  	return Lexer{
    30  		Source: src,
    31  		line:   1,
    32  	}
    33  }
    34  
    35  // take one rune from input and advance end
    36  func (s *Lexer) peek() (rune, int) {
    37  	return utf8.DecodeRuneInString(s.Input[s.end:])
    38  }
    39  
    40  func (s *Lexer) makeToken(kind Type) (Token, error) {
    41  	return s.makeValueToken(kind, s.Input[s.start:s.end])
    42  }
    43  
    44  func (s *Lexer) makeValueToken(kind Type, value string) (Token, error) {
    45  	return Token{
    46  		Kind:  kind,
    47  		Value: value,
    48  		Pos: ast.Position{
    49  			Start:  s.startRunes,
    50  			End:    s.endRunes,
    51  			Line:   s.line,
    52  			Column: s.startRunes - s.lineStartRunes + 1,
    53  			Src:    s.Source,
    54  		},
    55  	}, nil
    56  }
    57  
    58  func (s *Lexer) makeError(format string, args ...interface{}) (Token, *gqlerror.Error) {
    59  	column := s.endRunes - s.lineStartRunes + 1
    60  	return Token{
    61  		Kind: Invalid,
    62  		Pos: ast.Position{
    63  			Start:  s.startRunes,
    64  			End:    s.endRunes,
    65  			Line:   s.line,
    66  			Column: column,
    67  			Src:    s.Source,
    68  		},
    69  	}, gqlerror.ErrorLocf(s.Source.Name, s.line, column, format, args...)
    70  }
    71  
    72  // ReadToken gets the next token from the source starting at the given position.
    73  //
    74  // This skips over whitespace and comments until it finds the next lexable
    75  // token, then lexes punctuators immediately or calls the appropriate helper
    76  // function for more complicated tokens.
    77  func (s *Lexer) ReadToken() (Token, error) {
    78  	s.ws()
    79  	s.start = s.end
    80  	s.startRunes = s.endRunes
    81  
    82  	if s.end >= len(s.Input) {
    83  		return s.makeToken(EOF)
    84  	}
    85  	r := s.Input[s.start]
    86  	s.end++
    87  	s.endRunes++
    88  	switch r {
    89  	case '!':
    90  		return s.makeValueToken(Bang, "")
    91  
    92  	case '$':
    93  		return s.makeValueToken(Dollar, "")
    94  	case '&':
    95  		return s.makeValueToken(Amp, "")
    96  	case '(':
    97  		return s.makeValueToken(ParenL, "")
    98  	case ')':
    99  		return s.makeValueToken(ParenR, "")
   100  	case '.':
   101  		if len(s.Input) > s.start+2 && s.Input[s.start:s.start+3] == "..." {
   102  			s.end += 2
   103  			s.endRunes += 2
   104  			return s.makeValueToken(Spread, "")
   105  		}
   106  	case ':':
   107  		return s.makeValueToken(Colon, "")
   108  	case '=':
   109  		return s.makeValueToken(Equals, "")
   110  	case '@':
   111  		return s.makeValueToken(At, "")
   112  	case '[':
   113  		return s.makeValueToken(BracketL, "")
   114  	case ']':
   115  		return s.makeValueToken(BracketR, "")
   116  	case '{':
   117  		return s.makeValueToken(BraceL, "")
   118  	case '}':
   119  		return s.makeValueToken(BraceR, "")
   120  	case '|':
   121  		return s.makeValueToken(Pipe, "")
   122  	case '#':
   123  		return s.readComment()
   124  
   125  	case '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z':
   126  		return s.readName()
   127  
   128  	case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
   129  		return s.readNumber()
   130  
   131  	case '"':
   132  		if len(s.Input) > s.start+2 && s.Input[s.start:s.start+3] == `"""` {
   133  			return s.readBlockString()
   134  		}
   135  
   136  		return s.readString()
   137  	}
   138  
   139  	s.end--
   140  	s.endRunes--
   141  
   142  	if r < 0x0020 && r != 0x0009 && r != 0x000a && r != 0x000d {
   143  		return s.makeError(`Cannot contain the invalid character "\u%04d"`, r)
   144  	}
   145  
   146  	if r == '\'' {
   147  		return s.makeError(`Unexpected single quote character ('), did you mean to use a double quote (")?`)
   148  	}
   149  
   150  	return s.makeError(`Cannot parse the unexpected character "%s".`, string(r))
   151  }
   152  
   153  // ws reads from body starting at startPosition until it finds a non-whitespace
   154  // or commented character, and updates the token end to include all whitespace
   155  func (s *Lexer) ws() {
   156  	for s.end < len(s.Input) {
   157  		switch s.Input[s.end] {
   158  		case '\t', ' ', ',':
   159  			s.end++
   160  			s.endRunes++
   161  		case '\n':
   162  			s.end++
   163  			s.endRunes++
   164  			s.line++
   165  			s.lineStartRunes = s.endRunes
   166  		case '\r':
   167  			s.end++
   168  			s.endRunes++
   169  			s.line++
   170  			s.lineStartRunes = s.endRunes
   171  			// skip the following newline if its there
   172  			if s.end < len(s.Input) && s.Input[s.end] == '\n' {
   173  				s.end++
   174  				s.endRunes++
   175  			}
   176  			// byte order mark, given ws is hot path we aren't relying on the unicode package here.
   177  		case 0xef:
   178  			if s.end+2 < len(s.Input) && s.Input[s.end+1] == 0xBB && s.Input[s.end+2] == 0xBF {
   179  				s.end += 3
   180  				s.endRunes++
   181  			} else {
   182  				return
   183  			}
   184  		default:
   185  			return
   186  		}
   187  	}
   188  }
   189  
   190  // readComment from the input
   191  //
   192  // #[\u0009\u0020-\uFFFF]*
   193  func (s *Lexer) readComment() (Token, error) {
   194  	for s.end < len(s.Input) {
   195  		r, w := s.peek()
   196  
   197  		// SourceCharacter but not LineTerminator
   198  		if r > 0x001f || r == '\t' {
   199  			s.end += w
   200  			s.endRunes++
   201  		} else {
   202  			break
   203  		}
   204  	}
   205  
   206  	return s.makeToken(Comment)
   207  }
   208  
   209  // readNumber from the input, either a float
   210  // or an int depending on whether a decimal point appears.
   211  //
   212  // Int:   -?(0|[1-9][0-9]*)
   213  // Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?
   214  func (s *Lexer) readNumber() (Token, error) {
   215  	float := false
   216  
   217  	// backup to the first digit
   218  	s.end--
   219  	s.endRunes--
   220  
   221  	s.acceptByte('-')
   222  
   223  	if s.acceptByte('0') {
   224  		if consumed := s.acceptDigits(); consumed != 0 {
   225  			s.end -= consumed
   226  			s.endRunes -= consumed
   227  			return s.makeError("Invalid number, unexpected digit after 0: %s.", s.describeNext())
   228  		}
   229  	} else {
   230  		if consumed := s.acceptDigits(); consumed == 0 {
   231  			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
   232  		}
   233  	}
   234  
   235  	if s.acceptByte('.') {
   236  		float = true
   237  
   238  		if consumed := s.acceptDigits(); consumed == 0 {
   239  			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
   240  		}
   241  	}
   242  
   243  	if s.acceptByte('e', 'E') {
   244  		float = true
   245  
   246  		s.acceptByte('-', '+')
   247  
   248  		if consumed := s.acceptDigits(); consumed == 0 {
   249  			return s.makeError("Invalid number, expected digit but got: %s.", s.describeNext())
   250  		}
   251  	}
   252  
   253  	if float {
   254  		return s.makeToken(Float)
   255  	}
   256  	return s.makeToken(Int)
   257  
   258  }
   259  
   260  // acceptByte if it matches any of given bytes, returning true if it found anything
   261  func (s *Lexer) acceptByte(bytes ...uint8) bool {
   262  	if s.end >= len(s.Input) {
   263  		return false
   264  	}
   265  
   266  	for _, accepted := range bytes {
   267  		if s.Input[s.end] == accepted {
   268  			s.end++
   269  			s.endRunes++
   270  			return true
   271  		}
   272  	}
   273  	return false
   274  }
   275  
   276  // acceptDigits from the input, returning the number of digits it found
   277  func (s *Lexer) acceptDigits() int {
   278  	consumed := 0
   279  	for s.end < len(s.Input) && s.Input[s.end] >= '0' && s.Input[s.end] <= '9' {
   280  		s.end++
   281  		s.endRunes++
   282  		consumed++
   283  	}
   284  
   285  	return consumed
   286  }
   287  
   288  // describeNext peeks at the input and returns a human readable string. This should will alloc
   289  // and should only be used in errors
   290  func (s *Lexer) describeNext() string {
   291  	if s.end < len(s.Input) {
   292  		return `"` + string(s.Input[s.end]) + `"`
   293  	}
   294  	return "<EOF>"
   295  }
   296  
   297  // readString from the input
   298  //
   299  // "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*"
   300  func (s *Lexer) readString() (Token, error) {
   301  	inputLen := len(s.Input)
   302  
   303  	// this buffer is lazily created only if there are escape characters.
   304  	var buf *bytes.Buffer
   305  
   306  	// skip the opening quote
   307  	s.start++
   308  	s.startRunes++
   309  
   310  	for s.end < inputLen {
   311  		r := s.Input[s.end]
   312  		if r == '\n' || r == '\r' {
   313  			break
   314  		}
   315  		if r < 0x0020 && r != '\t' {
   316  			return s.makeError(`Invalid character within String: "\u%04d".`, r)
   317  		}
   318  		switch r {
   319  		default:
   320  			var char = rune(r)
   321  			var w = 1
   322  
   323  			// skip unicode overhead if we are in the ascii range
   324  			if r >= 127 {
   325  				char, w = utf8.DecodeRuneInString(s.Input[s.end:])
   326  			}
   327  			s.end += w
   328  			s.endRunes++
   329  
   330  			if buf != nil {
   331  				buf.WriteRune(char)
   332  			}
   333  
   334  		case '"':
   335  			t, err := s.makeToken(String)
   336  			// the token should not include the quotes in its value, but should cover them in its position
   337  			t.Pos.Start--
   338  			t.Pos.End++
   339  
   340  			if buf != nil {
   341  				t.Value = buf.String()
   342  			}
   343  
   344  			// skip the close quote
   345  			s.end++
   346  			s.endRunes++
   347  
   348  			return t, err
   349  
   350  		case '\\':
   351  			if s.end+1 >= inputLen {
   352  				s.end++
   353  				s.endRunes++
   354  				return s.makeError(`Invalid character escape sequence.`)
   355  			}
   356  
   357  			if buf == nil {
   358  				buf = bytes.NewBufferString(s.Input[s.start:s.end])
   359  			}
   360  
   361  			escape := s.Input[s.end+1]
   362  
   363  			if escape == 'u' {
   364  				if s.end+6 >= inputLen {
   365  					s.end++
   366  					s.endRunes++
   367  					return s.makeError("Invalid character escape sequence: \\%s.", s.Input[s.end:])
   368  				}
   369  
   370  				r, ok := unhex(s.Input[s.end+2 : s.end+6])
   371  				if !ok {
   372  					s.end++
   373  					s.endRunes++
   374  					return s.makeError("Invalid character escape sequence: \\%s.", s.Input[s.end:s.end+5])
   375  				}
   376  				buf.WriteRune(r)
   377  				s.end += 6
   378  				s.endRunes += 6
   379  			} else {
   380  				switch escape {
   381  				case '"', '/', '\\':
   382  					buf.WriteByte(escape)
   383  				case 'b':
   384  					buf.WriteByte('\b')
   385  				case 'f':
   386  					buf.WriteByte('\f')
   387  				case 'n':
   388  					buf.WriteByte('\n')
   389  				case 'r':
   390  					buf.WriteByte('\r')
   391  				case 't':
   392  					buf.WriteByte('\t')
   393  				default:
   394  					s.end++
   395  					s.endRunes++
   396  					return s.makeError("Invalid character escape sequence: \\%s.", string(escape))
   397  				}
   398  				s.end += 2
   399  				s.endRunes += 2
   400  			}
   401  		}
   402  	}
   403  
   404  	return s.makeError("Unterminated string.")
   405  }
   406  
   407  // readBlockString from the input
   408  //
   409  // """("?"?(\\"""|\\(?!=""")|[^"\\]))*"""
   410  func (s *Lexer) readBlockString() (Token, error) {
   411  	inputLen := len(s.Input)
   412  
   413  	var buf bytes.Buffer
   414  
   415  	// skip the opening quote
   416  	s.start += 3
   417  	s.startRunes += 3
   418  	s.end += 2
   419  	s.endRunes += 2
   420  
   421  	for s.end < inputLen {
   422  		r := s.Input[s.end]
   423  
   424  		// Closing triple quote (""")
   425  		if r == '"' && s.end+3 <= inputLen && s.Input[s.end:s.end+3] == `"""` {
   426  			t, err := s.makeValueToken(BlockString, blockStringValue(buf.String()))
   427  
   428  			// the token should not include the quotes in its value, but should cover them in its position
   429  			t.Pos.Start -= 3
   430  			t.Pos.End += 3
   431  
   432  			// skip the close quote
   433  			s.end += 3
   434  			s.endRunes += 3
   435  			return t, err
   436  		}
   437  
   438  		// SourceCharacter
   439  		if r < 0x0020 && r != '\t' && r != '\n' && r != '\r' {
   440  			return s.makeError(`Invalid character within String: "\u%04d".`, r)
   441  		}
   442  
   443  		if r == '\\' && s.end+4 <= inputLen && s.Input[s.end:s.end+4] == `\"""` {
   444  			buf.WriteString(`"""`)
   445  			s.end += 4
   446  			s.endRunes += 4
   447  		} else if r == '\r' {
   448  			if s.end+1 < inputLen && s.Input[s.end+1] == '\n' {
   449  				s.end++
   450  				s.endRunes++
   451  			}
   452  
   453  			buf.WriteByte('\n')
   454  			s.end++
   455  			s.endRunes++
   456  			s.line++
   457  			s.lineStartRunes = s.endRunes
   458  		} else {
   459  			var char = rune(r)
   460  			var w = 1
   461  
   462  			// skip unicode overhead if we are in the ascii range
   463  			if r >= 127 {
   464  				char, w = utf8.DecodeRuneInString(s.Input[s.end:])
   465  			}
   466  			s.end += w
   467  			s.endRunes++
   468  			buf.WriteRune(char)
   469  			if r == '\n' {
   470  				s.line++
   471  				s.lineStartRunes = s.endRunes
   472  			}
   473  		}
   474  	}
   475  
   476  	return s.makeError("Unterminated string.")
   477  }
   478  
   479  func unhex(b string) (v rune, ok bool) {
   480  	for _, c := range b {
   481  		v <<= 4
   482  		switch {
   483  		case '0' <= c && c <= '9':
   484  			v |= c - '0'
   485  		case 'a' <= c && c <= 'f':
   486  			v |= c - 'a' + 10
   487  		case 'A' <= c && c <= 'F':
   488  			v |= c - 'A' + 10
   489  		default:
   490  			return 0, false
   491  		}
   492  	}
   493  
   494  	return v, true
   495  }
   496  
   497  // readName from the input
   498  //
   499  // [_A-Za-z][_0-9A-Za-z]*
   500  func (s *Lexer) readName() (Token, error) {
   501  	for s.end < len(s.Input) {
   502  		r, w := s.peek()
   503  
   504  		if (r >= '0' && r <= '9') || (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || r == '_' {
   505  			s.end += w
   506  			s.endRunes++
   507  		} else {
   508  			break
   509  		}
   510  	}
   511  
   512  	return s.makeToken(Name)
   513  }
   514
View as plain text