quote.go

Documentation: github.com/bazelbuild/buildtools/build

     1  /*
     2  Copyright 2016 Google LLC
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      https://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Python quoted strings.
    18  
    19  package build
    20  
    21  import (
    22  	"bytes"
    23  	"fmt"
    24  	"strconv"
    25  	"strings"
    26  )
    27  
    28  // unesc maps single-letter chars following \ to their actual values.
    29  var unesc = [256]byte{
    30  	'a':  '\a',
    31  	'b':  '\b',
    32  	'f':  '\f',
    33  	'n':  '\n',
    34  	'r':  '\r',
    35  	't':  '\t',
    36  	'v':  '\v',
    37  	'\\': '\\',
    38  	'\'': '\'',
    39  	'"':  '"',
    40  }
    41  
    42  // esc maps escape-worthy bytes to the char that should follow \.
    43  var esc = [256]byte{
    44  	'\a': 'a',
    45  	'\b': 'b',
    46  	'\f': 'f',
    47  	'\n': 'n',
    48  	'\r': 'r',
    49  	'\t': 't',
    50  	'\v': 'v',
    51  	'\\': '\\',
    52  	'\'': '\'',
    53  	'"':  '"',
    54  }
    55  
    56  // escapable is a set of all character that may follow an unescaped backslash
    57  // in a string literal
    58  var escapable = [256]bool{
    59  	'\n': true,
    60  	'a':  true,
    61  	'b':  true,
    62  	'f':  true,
    63  	'n':  true,
    64  	'r':  true,
    65  	't':  true,
    66  	'u':  true,
    67  	'U':  true,
    68  	'v':  true,
    69  	'x':  true,
    70  	'\'': true,
    71  	'\\': true,
    72  	'"':  true,
    73  	'0':  true,
    74  	'1':  true,
    75  	'2':  true,
    76  	'3':  true,
    77  	'4':  true,
    78  	'5':  true,
    79  	'6':  true,
    80  	'7':  true,
    81  	'8':  true,
    82  	'9':  true,
    83  }
    84  
    85  // Unquote unquotes the quoted string, returning the actual
    86  // string value, whether the original was triple-quoted, and
    87  // an error describing invalid input.
    88  func Unquote(quoted string) (s string, triple bool, err error) {
    89  	// Check for raw prefix: means don't interpret the inner \.
    90  	raw := false
    91  	if strings.HasPrefix(quoted, "r") {
    92  		raw = true
    93  		quoted = quoted[1:]
    94  	}
    95  
    96  	if len(quoted) < 2 {
    97  		err = fmt.Errorf("string literal too short")
    98  		return
    99  	}
   100  
   101  	if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
   102  		err = fmt.Errorf("string literal has invalid quotes")
   103  	}
   104  
   105  	// Check for triple quoted string.
   106  	quote := quoted[0]
   107  	if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
   108  		triple = true
   109  		quoted = quoted[3 : len(quoted)-3]
   110  	} else {
   111  		quoted = quoted[1 : len(quoted)-1]
   112  	}
   113  
   114  	// Now quoted is the quoted data, but no quotes.
   115  	// If we're in raw mode or there are no escapes, we're done.
   116  	if raw || !strings.Contains(quoted, `\`) {
   117  		s = quoted
   118  		return
   119  	}
   120  
   121  	// Otherwise process quoted string.
   122  	// Each iteration processes one escape sequence along with the
   123  	// plain text leading up to it.
   124  	var buf bytes.Buffer
   125  	for {
   126  		// Remove prefix before escape sequence.
   127  		i := strings.Index(quoted, `\`)
   128  		if i < 0 {
   129  			i = len(quoted)
   130  		}
   131  		buf.WriteString(quoted[:i])
   132  		quoted = quoted[i:]
   133  
   134  		if len(quoted) == 0 {
   135  			break
   136  		}
   137  
   138  		// Process escape sequence.
   139  		if len(quoted) == 1 {
   140  			err = fmt.Errorf(`truncated escape sequence \`)
   141  			return
   142  		}
   143  
   144  		switch quoted[1] {
   145  		default:
   146  			// In Python, if \z (for some byte z) is not a known escape sequence
   147  			// then it appears as literal text in the string.
   148  			buf.WriteString(quoted[:2])
   149  			quoted = quoted[2:]
   150  
   151  		case '\n':
   152  			// Ignore the escape and the line break.
   153  			quoted = quoted[2:]
   154  
   155  		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
   156  			// One-char escape
   157  			buf.WriteByte(unesc[quoted[1]])
   158  			quoted = quoted[2:]
   159  
   160  		case '0', '1', '2', '3', '4', '5', '6', '7':
   161  			// Octal escape, up to 3 digits.
   162  			n := int(quoted[1] - '0')
   163  			quoted = quoted[2:]
   164  			for i := 1; i < 3; i++ {
   165  				if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
   166  					break
   167  				}
   168  				n = n*8 + int(quoted[0]-'0')
   169  				quoted = quoted[1:]
   170  			}
   171  			if n >= 256 {
   172  				// NOTE: Python silently discards the high bit,
   173  				// so that '\541' == '\141' == 'a'.
   174  				// Let's see if we can avoid doing that in BUILD files.
   175  				err = fmt.Errorf(`invalid escape sequence \%03o`, n)
   176  				return
   177  			}
   178  			buf.WriteByte(byte(n))
   179  
   180  		case 'u':
   181  			// Unicode escape, exactly 4 digits for a 16-bit Unicode code point.
   182  			if len(quoted) < 6 {
   183  				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
   184  				return
   185  			}
   186  			n, err1 := strconv.ParseInt(quoted[2:6], 16, 0)
   187  			if err1 != nil {
   188  				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:6])
   189  				return
   190  			}
   191  			if n >= 0xD800 && n <= 0xDFFF {
   192  				err = fmt.Errorf(`invalid dangling surrogate %s`, quoted[:6])
   193  				return
   194  			}
   195  			buf.WriteRune(rune(n))
   196  			quoted = quoted[6:]
   197  
   198  		case 'U':
   199  			// Unicode escape, exactly 8 digits for a 16-bit Unicode code point.
   200  			if len(quoted) < 10 {
   201  				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
   202  				return
   203  			}
   204  			n, err1 := strconv.ParseInt(quoted[2:10], 16, 0)
   205  			if err1 != nil {
   206  				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:10])
   207  				return
   208  			}
   209  			if n >= 0xD800 && n <= 0xDFFF {
   210  				err = fmt.Errorf(`invalid dangling surrogate %s`, quoted[:10])
   211  				return
   212  			}
   213  			if n > 0x10FFFF {
   214  				err = fmt.Errorf(`Unicode value out of range %s`, quoted[:10])
   215  				return
   216  			}
   217  			buf.WriteRune(rune(n))
   218  			quoted = quoted[10:]
   219  
   220  		case 'x':
   221  			// Hexadecimal escape, exactly 2 digits.
   222  			if len(quoted) < 4 {
   223  				err = fmt.Errorf(`truncated escape sequence %s`, quoted)
   224  				return
   225  			}
   226  			n, err1 := strconv.ParseInt(quoted[2:4], 16, 0)
   227  			if err1 != nil {
   228  				err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
   229  				return
   230  			}
   231  			buf.WriteByte(byte(n))
   232  			quoted = quoted[4:]
   233  		}
   234  	}
   235  
   236  	s = buf.String()
   237  	return
   238  }
   239  
   240  // IsCorrectEscaping reports whether a string doesn't contain any incorrectly
   241  // escaped sequences such as "\a".
   242  func IsCorrectEscaping(value string) bool {
   243  	escaped := false
   244  	// This for-loop doesn't correctly check for a backlash at the end of the string literal, but
   245  	// such string can't be parsed anyway, neither by Bazel nor by Buildifier.
   246  	for _, ch := range value {
   247  		if !escaped {
   248  			if ch == '\\' {
   249  				escaped = true
   250  			}
   251  			continue
   252  		}
   253  
   254  		if ok := escapable[ch]; !ok {
   255  			return false
   256  		}
   257  		escaped = false
   258  	}
   259  	return true
   260  }
   261  
   262  // indexByte returns the index of the first instance of b in s, or else -1.
   263  func indexByte(s string, b byte) int {
   264  	for i := 0; i < len(s); i++ {
   265  		if s[i] == b {
   266  			return i
   267  		}
   268  	}
   269  	return -1
   270  }
   271  
// hex is a list of the hexadecimal digits, for use in quoting.
// We always print lower-case hexadecimal.
// quote currently indexes it only with values 0-7, to emit octal digits.
const hex = "0123456789abcdef"
   275  
   276  // quote returns the quoted form of the string value "x".
   277  // If triple is true, quote uses the triple-quoted form """x""".
   278  func quote(unquoted string, triple bool) string {
   279  	q := `"`
   280  	if triple {
   281  		q = `"""`
   282  	}
   283  
   284  	var buf bytes.Buffer
   285  	buf.WriteString(q)
   286  
   287  	for i := 0; i < len(unquoted); i++ {
   288  		c := unquoted[i]
   289  		if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
   290  			// Can pass up to two quotes through, because they are followed by a non-quote byte.
   291  			buf.WriteByte(c)
   292  			if i+1 < len(unquoted) && unquoted[i+1] == '"' {
   293  				buf.WriteByte(c)
   294  				i++
   295  			}
   296  			continue
   297  		}
   298  		if triple && c == '\n' {
   299  			// Can allow newline in triple-quoted string.
   300  			buf.WriteByte(c)
   301  			continue
   302  		}
   303  		if c == '\'' {
   304  			// Can allow ' since we always use ".
   305  			buf.WriteByte(c)
   306  			continue
   307  		}
   308  		if c == '\\' {
   309  			// All backslashes should be escaped
   310  			buf.WriteByte('\\')
   311  			buf.WriteByte('\\')
   312  			continue
   313  		}
   314  		if esc[c] != 0 {
   315  			buf.WriteByte('\\')
   316  			buf.WriteByte(esc[c])
   317  			continue
   318  		}
   319  		if c < 0x20 || c >= 0x80 {
   320  			// BUILD files are supposed to be Latin-1, so escape all control and high bytes.
   321  			// I'd prefer to use \x here, but Blaze does not implement
   322  			// \x in quoted strings (b/7272572).
   323  			buf.WriteByte('\\')
   324  			buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
   325  			buf.WriteByte(hex[(c>>3)&7])
   326  			buf.WriteByte(hex[c&7])
   327  			/*
   328  				buf.WriteByte('\\')
   329  				buf.WriteByte('x')
   330  				buf.WriteByte(hex[c>>4])
   331  				buf.WriteByte(hex[c&0xF])
   332  			*/
   333  			continue
   334  		}
   335  		buf.WriteByte(c)
   336  		continue
   337  	}
   338  
   339  	buf.WriteString(q)
   340  	return buf.String()
   341  }
   342
View as plain text