string.go

Documentation: cuelang.org/go/cue/literal

     1  // Copyright 2019 CUE Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package literal
    16  
    17  import (
    18  	"errors"
    19  	"strings"
    20  	"unicode"
    21  	"unicode/utf8"
    22  )
    23  
// Sentinel errors returned by the quote-parsing and unquoting functions in
// this file.
var (
	// errSyntax reports a malformed literal: a missing or misplaced quote, an
	// unknown or truncated escape sequence, or a literal newline in a
	// single-line string.
	errSyntax            = errors.New("invalid syntax")
	// errInvalidWhitespace reports a line of a multiline string that does not
	// start with the whitespace that precedes the closing quotes.
	errInvalidWhitespace = errors.New("invalid string: invalid whitespace")
	// errMissingNewline reports triple quotes that are not directly followed
	// by "\n" or "\r\n".
	errMissingNewline    = errors.New(
		"invalid string: opening quote of multiline string must be followed by newline")
	// errUnmatchedQuote reports closing quotes that do not mirror the opening
	// quotes, or a string body that ends without a terminator.
	errUnmatchedQuote = errors.New("invalid string: unmatched quote")
	// TODO: making this an error is optional according to RFC 4627. But we
	// could make it not an error if this ever results in an issue.
	//
	// errSurrogate reports a surrogate escape that is not part of a valid
	// high/low pair.
	errSurrogate          = errors.New("unmatched surrogate pair")
	// errEscapedLastNewline reports an escaped newline directly before the
	// closing quotes.
	errEscapedLastNewline = errors.New("last newline of multiline string cannot be escaped")
)
    35  
    36  // Unquote interprets s as a single- or double-quoted, single- or multi-line
    37  // string, possibly with custom escape delimiters, returning the string value
    38  // that s quotes.
    39  func Unquote(s string) (string, error) {
    40  	info, nStart, _, err := ParseQuotes(s, s)
    41  	if err != nil {
    42  		return "", err
    43  	}
    44  	s = s[nStart:]
    45  	return info.Unquote(s)
    46  }
    47  
// QuoteInfo describes the type of quotes used for a string.
// Values are produced by ParseQuotes.
type QuoteInfo struct {
	// quote is the opening quote text as found at the start of the literal,
	// including any leading '#' characters.
	quote      string
	// whitespace is the run of whitespace that directly precedes the closing
	// quotes of a multiline string; it is empty for single-line strings.
	whitespace string
	// numHash is the number of '#' characters preceding the opening quote.
	numHash    int
	// multiline is set when the literal opens with three quote characters
	// followed by a newline.
	multiline  bool
	// char is the quote character, either '"' or '\''.
	char       byte
	// numChar is the number of quote characters on each side: 3 for
	// multiline strings and 1 otherwise.
	numChar    byte
}

// IsDouble reports whether the literal uses double quotes.
func (q QuoteInfo) IsDouble() bool {
	return q.char == '"'
}

// IsMulti reports whether a multi-line string was parsed.
func (q QuoteInfo) IsMulti() bool {
	return q.multiline
}

// Whitespace returns prefix whitespace for multiline strings, that is, the
// whitespace found directly before the closing quotes. It is empty for
// single-line strings.
func (q QuoteInfo) Whitespace() string {
	return q.whitespace
}
    72  
    73  // ParseQuotes checks if the opening quotes in start matches the ending quotes
    74  // in end and reports its type as q or an error if they do not matching or are
    75  // invalid. nStart indicates the number of bytes used for the opening quote.
    76  func ParseQuotes(start, end string) (q QuoteInfo, nStart, nEnd int, err error) {
    77  	for i, c := range start {
    78  		if c != '#' {
    79  			break
    80  		}
    81  		q.numHash = i + 1
    82  	}
    83  	s := start[q.numHash:]
    84  	switch s[0] {
    85  	case '"', '\'':
    86  		q.char = s[0]
    87  		if len(s) > 3 && s[1] == s[0] && s[2] == s[0] {
    88  			switch s[3] {
    89  			case '\n':
    90  				q.quote = start[:3+q.numHash]
    91  			case '\r':
    92  				if len(s) > 4 && s[4] == '\n' {
    93  					q.quote = start[:4+q.numHash]
    94  					break
    95  				}
    96  				fallthrough
    97  			default:
    98  				return q, 0, 0, errMissingNewline
    99  			}
   100  			q.multiline = true
   101  			q.numChar = 3
   102  			nStart = len(q.quote) + 1 // add whitespace later
   103  		} else {
   104  			q.quote = start[:1+q.numHash]
   105  			q.numChar = 1
   106  			nStart = len(q.quote)
   107  		}
   108  	default:
   109  		return q, 0, 0, errSyntax
   110  	}
   111  	quote := start[:int(q.numChar)+q.numHash]
   112  	for i := 0; i < len(quote); i++ {
   113  		if j := len(end) - i - 1; j < 0 || quote[i] != end[j] {
   114  			return q, 0, 0, errUnmatchedQuote
   115  		}
   116  	}
   117  	if q.multiline {
   118  		i := len(end) - len(quote)
   119  		for i > 0 {
   120  			r, size := utf8.DecodeLastRuneInString(end[:i])
   121  			if r == '\n' || !unicode.IsSpace(r) {
   122  				break
   123  			}
   124  			i -= size
   125  		}
   126  		q.whitespace = end[i : len(end)-len(quote)]
   127  
   128  		if len(start) > nStart && start[nStart] != '\n' {
   129  			if !strings.HasPrefix(start[nStart:], q.whitespace) {
   130  				return q, 0, 0, errInvalidWhitespace
   131  			}
   132  			nStart += len(q.whitespace)
   133  		}
   134  	}
   135  
   136  	return q, nStart, int(q.numChar) + q.numHash, nil
   137  }
   138  
   139  // Unquote unquotes the given string, which should not contain
   140  // the initial quote character(s). It must be terminated with a quote or an
   141  // interpolation start. Escape sequences are expanded and surrogates
   142  // are replaced with the corresponding non-surrogate code points.
   143  func (q QuoteInfo) Unquote(s string) (string, error) {
   144  	if len(s) > 0 && !q.multiline {
   145  		if contains(s, '\n') || contains(s, '\r') {
   146  			return "", errSyntax
   147  		}
   148  
   149  		// Is it trivial? Avoid allocation.
   150  		if s[len(s)-1] == q.char && q.numHash == 0 {
   151  			if s := s[:len(s)-1]; isSimple(s, rune(q.char)) {
   152  				return s, nil
   153  			}
   154  		}
   155  	}
   156  
   157  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   158  	stripNL := false
   159  	wasEscapedNewline := false
   160  	for len(s) > 0 {
   161  		switch s[0] {
   162  		case '\r':
   163  			s = s[1:]
   164  			wasEscapedNewline = false
   165  			continue
   166  		case '\n':
   167  			var err error
   168  			s, err = skipWhitespaceAfterNewline(s[1:], q)
   169  			if err != nil {
   170  				return "", err
   171  			}
   172  			stripNL = true
   173  			wasEscapedNewline = false
   174  			buf = append(buf, '\n')
   175  			continue
   176  		}
   177  		c, multibyte, ss, err := unquoteChar(s, q)
   178  		if surHigh <= c && c < surEnd {
   179  			if c >= surLow {
   180  				return "", errSurrogate
   181  			}
   182  			var cl rune
   183  			cl, _, ss, err = unquoteChar(ss, q)
   184  			if cl < surLow || surEnd <= cl {
   185  				return "", errSurrogate
   186  			}
   187  			c = 0x10000 + (c-surHigh)*0x400 + (cl - surLow)
   188  		}
   189  
   190  		if err != nil {
   191  			return "", err
   192  		}
   193  
   194  		s = ss
   195  		if c < 0 {
   196  			switch c {
   197  			case escapedNewline:
   198  				var err error
   199  				s, err = skipWhitespaceAfterNewline(s, q)
   200  				if err != nil {
   201  					return "", err
   202  				}
   203  				wasEscapedNewline = true
   204  				continue
   205  			case terminatedByQuote:
   206  				if wasEscapedNewline {
   207  					return "", errEscapedLastNewline
   208  				}
   209  				if stripNL {
   210  					// Strip the last newline, but only if it came from a closing
   211  					// quote.
   212  					buf = buf[:len(buf)-1]
   213  				}
   214  			case terminatedByExpr:
   215  			default:
   216  				panic("unreachable")
   217  			}
   218  			return string(buf), nil
   219  		}
   220  		stripNL = false
   221  		wasEscapedNewline = false
   222  		if !multibyte {
   223  			buf = append(buf, byte(c))
   224  		} else {
   225  			buf = utf8.AppendRune(buf, c)
   226  		}
   227  	}
   228  	// allow unmatched quotes if already checked.
   229  	return "", errUnmatchedQuote
   230  }
   231  
   232  func skipWhitespaceAfterNewline(s string, q QuoteInfo) (string, error) {
   233  	switch {
   234  	case !q.multiline:
   235  		// Can't happen because Unquote does an initial check for literal newlines
   236  		// in the non-multiline case, but be defensive.
   237  		fallthrough
   238  	default:
   239  		return "", errInvalidWhitespace
   240  	case strings.HasPrefix(s, q.whitespace):
   241  		s = s[len(q.whitespace):]
   242  	case strings.HasPrefix(s, "\n"):
   243  	case strings.HasPrefix(s, "\r\n"):
   244  	}
   245  	return s, nil
   246  }
   247  
// Boundaries of the UTF-16 surrogate code point ranges: high surrogates lie
// in [surHigh, surLow) and low surrogates in [surLow, surEnd).
const (
	surHigh = 0xD800
	surLow  = 0xDC00
	surEnd  = 0xE000
)
   253  
   254  func isSimple(s string, quote rune) bool {
   255  	// TODO(perf): check if using a simple DFA to detect surrogate pairs is
   256  	// faster than converting to code points. At the very least there should
   257  	// be an ASCII fast path.
   258  	for _, r := range s {
   259  		if r == quote || r == '\\' {
   260  			return false
   261  		}
   262  		if surHigh <= r && r < surEnd {
   263  			return false
   264  		}
   265  	}
   266  	return true
   267  }
   268  
   269  // contains reports whether the string contains the byte c.
   270  func contains(s string, c byte) bool {
   271  	for i := 0; i < len(s); i++ {
   272  		if s[i] == c {
   273  			return true
   274  		}
   275  	}
   276  	return false
   277  }
   278  
// Negative sentinel values returned by unquoteChar in place of a decoded
// character.
const (
	// terminatedByQuote indicates that the closing quotes were reached.
	terminatedByQuote = rune(-1)
	// terminatedByExpr indicates that the start of an interpolation, \(,
	// was reached.
	terminatedByExpr  = rune(-2)
	// escapedNewline indicates an escaped line terminator, which is omitted
	// from the result.
	escapedNewline    = rune(-3)
)
   284  
   285  // unquoteChar decodes the first character or byte in the escaped string.
   286  // It returns four values:
   287  //
   288  //  1. value, the decoded Unicode code point or byte value if non-negative, or
   289  //     one of the following special values:
   290  //     - terminatedByQuote indicates terminated by quotes
   291  //     - terminatedByExpr means terminated by \(
   292  //     - escapedNewline means that the line-termination character was quoted and should be omitted
   293  //  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   294  //  3. tail, the remainder of the string after the character; and
   295  //  4. an error that will be nil if the character is syntactically valid.
   296  //
   297  // The second argument, kind, specifies the type of literal being parsed
   298  // and therefore which kind of escape sequences are permitted.
   299  // For kind 's' only JSON escapes and \u{ are permitted.
   300  // For kind 'b' also hexadecimal and octal escape sequences are permitted.
   301  //
   302  // The third argument, quote, specifies that an ASCII quoting character that
   303  // is not permitted in the output.
   304  func unquoteChar(s string, info QuoteInfo) (value rune, multibyte bool, tail string, err error) {
   305  	// easy cases
   306  	switch c := s[0]; {
   307  	case c == info.char && info.char != 0:
   308  		for i := 1; byte(i) < info.numChar; i++ {
   309  			if i >= len(s) || s[i] != info.char {
   310  				return rune(info.char), false, s[1:], nil
   311  			}
   312  		}
   313  		for i := 0; i < info.numHash; i++ {
   314  			if i+int(info.numChar) >= len(s) || s[i+int(info.numChar)] != '#' {
   315  				return rune(info.char), false, s[1:], nil
   316  			}
   317  		}
   318  		if ln := int(info.numChar) + info.numHash; len(s) != ln {
   319  			// TODO: terminating quote in middle of string
   320  			return 0, false, s[ln:], errSyntax
   321  		}
   322  		return terminatedByQuote, false, "", nil
   323  	case c >= utf8.RuneSelf:
   324  		// TODO: consider handling surrogate values. These are discarded by
   325  		// DecodeRuneInString. It is technically correct to disallow it, but
   326  		// some JSON parsers allow this anyway.
   327  		r, size := utf8.DecodeRuneInString(s)
   328  		return r, true, s[size:], nil
   329  	case c != '\\':
   330  		return rune(s[0]), false, s[1:], nil
   331  	}
   332  
   333  	if len(s) <= 1+info.numHash {
   334  		return '\\', false, s[1:], nil
   335  	}
   336  	for i := 1; i <= info.numHash && i < len(s); i++ {
   337  		if s[i] != '#' {
   338  			return '\\', false, s[1:], nil
   339  		}
   340  	}
   341  
   342  	c := s[1+info.numHash]
   343  	s = s[2+info.numHash:]
   344  
   345  	switch c {
   346  	case 'a':
   347  		value = '\a'
   348  	case 'b':
   349  		value = '\b'
   350  	case 'f':
   351  		value = '\f'
   352  	case 'n':
   353  		value = '\n'
   354  	case 'r':
   355  		value = '\r'
   356  	case 't':
   357  		value = '\t'
   358  	case 'v':
   359  		value = '\v'
   360  	case '/':
   361  		value = '/'
   362  	case 'x', 'u', 'U':
   363  		n := 0
   364  		switch c {
   365  		case 'x':
   366  			n = 2
   367  		case 'u':
   368  			n = 4
   369  		case 'U':
   370  			n = 8
   371  		}
   372  		var v rune
   373  		if len(s) < n {
   374  			err = errSyntax
   375  			return
   376  		}
   377  		for j := 0; j < n; j++ {
   378  			x, ok := unhex(s[j])
   379  			if !ok {
   380  				err = errSyntax
   381  				return
   382  			}
   383  			v = v<<4 | x
   384  		}
   385  		s = s[n:]
   386  		if c == 'x' {
   387  			if info.char == '"' {
   388  				err = errSyntax
   389  				return
   390  			}
   391  			// single-byte string, possibly not UTF-8
   392  			value = v
   393  			break
   394  		}
   395  		if v > utf8.MaxRune {
   396  			err = errSyntax
   397  			return
   398  		}
   399  		value = v
   400  		multibyte = true
   401  	case '0', '1', '2', '3', '4', '5', '6', '7':
   402  		if info.char == '"' {
   403  			err = errSyntax
   404  			return
   405  		}
   406  		v := rune(c) - '0'
   407  		if len(s) < 2 {
   408  			err = errSyntax
   409  			return
   410  		}
   411  		for j := 0; j < 2; j++ { // one digit already; two more
   412  			x := rune(s[j]) - '0'
   413  			if x < 0 || x > 7 {
   414  				err = errSyntax
   415  				return
   416  			}
   417  			v = (v << 3) | x
   418  		}
   419  		s = s[2:]
   420  		if v > 255 {
   421  			err = errSyntax
   422  			return
   423  		}
   424  		value = v
   425  	case '\\':
   426  		value = '\\'
   427  	case '\'', '"':
   428  		// TODO: should we allow escaping of quotes regardless?
   429  		if c != info.char {
   430  			err = errSyntax
   431  			return
   432  		}
   433  		value = rune(c)
   434  	case '(':
   435  		if s != "" {
   436  			// TODO: terminating quote in middle of string
   437  			return 0, false, s, errSyntax
   438  		}
   439  		value = terminatedByExpr
   440  	case '\r':
   441  		if len(s) == 0 || s[0] != '\n' {
   442  			err = errSyntax
   443  			return
   444  		}
   445  		s = s[1:]
   446  		value = escapedNewline
   447  	case '\n':
   448  		value = escapedNewline
   449  	default:
   450  		err = errSyntax
   451  		return
   452  	}
   453  	tail = s
   454  	return
   455  }
   456  
   457  func unhex(b byte) (v rune, ok bool) {
   458  	c := rune(b)
   459  	switch {
   460  	case '0' <= c && c <= '9':
   461  		return c - '0', true
   462  	case 'a' <= c && c <= 'f':
   463  		return c - 'a' + 10, true
   464  	case 'A' <= c && c <= 'F':
   465  		return c - 'A' + 10, true
   466  	}
   467  	return
   468  }
   469
View as plain text