// Package unquote provides a function to unquote txtpb-formatted quoted string literals. package unquote import ( "errors" "fmt" "strconv" "strings" "unicode/utf8" "github.com/protocolbuffers/txtpbfmt/ast" ) // Unquote returns the value of the string node. // Calling Unquote on non-string node doesn't panic, but is otherwise undefined. func Unquote(n *ast.Node) (string, error) { return unquoteValues(n.Values, unquote) } // Raw returns the raw value of the string node, with string escapes left in place. // Calling UnquoteRaw on non-string node doesn't panic, but is otherwise undefined. func Raw(n *ast.Node) (string, error) { return unquoteValues(n.Values, unquoteRaw) } func unquoteValues(values []*ast.Value, unquoter func(string) (string, error)) (string, error) { var ret strings.Builder for _, v := range values { uq, err := unquoter(v.Value) if err != nil { return "", err } ret.WriteString(uq) } return ret.String(), nil } // Returns the quote rune used in the given string (' or "). Returns an error if the string doesn't // start and end with a matching pair of valid quotes. func quoteRune(s string) (rune, error) { if len(s) < 2 { return 0, errors.New("not a quoted string") } quote := s[0] if quote != '"' && quote != '\'' { return 0, fmt.Errorf("invalid quote character %s", string(quote)) } if s[len(s)-1] != quote { return 0, errors.New("unmatched quote") } return rune(quote), nil } func unquote(s string) (string, error) { quote, err := quoteRune(s) if err != nil { return "", err } return unquoteC(s[1:len(s)-1], quote) } func unquoteRaw(s string) (string, error) { _, err := quoteRune(s) // Trigger validation, which guarantees this is a quote-wrapped string. if err != nil { return "", err } return s[1 : len(s)-1], nil } var ( errBadUTF8 = errors.New("bad UTF-8") ) func unquoteC(s string, quote rune) (string, error) { // Copied from third_party/golang/protobuf/proto/text_parser.go // This is based on C++'s tokenizer.cc. // Despite its name, this is *not* parsing C syntax. // For instance, "\0" is an invalid quoted string. // Avoid allocation in trivial cases. simple := true for _, r := range s { if r == '\\' || r == quote { simple = false break } } if simple { return s, nil } buf := make([]byte, 0, 3*len(s)/2) for len(s) > 0 { r, n := utf8.DecodeRuneInString(s) if r == utf8.RuneError && n == 1 { return "", errBadUTF8 } s = s[n:] if r != '\\' { if r < utf8.RuneSelf { buf = append(buf, byte(r)) } else { buf = append(buf, string(r)...) } continue } ch, tail, err := unescape(s) if err != nil { return "", err } buf = append(buf, ch...) s = tail } return string(buf), nil } func unescape(s string) (ch string, tail string, err error) { // Copied from third_party/golang/protobuf/proto/text_parser.go r, n := utf8.DecodeRuneInString(s) if r == utf8.RuneError && n == 1 { return "", "", errBadUTF8 } s = s[n:] switch r { case 'a': return "\a", s, nil case 'b': return "\b", s, nil case 'f': return "\f", s, nil case 'n': return "\n", s, nil case 'r': return "\r", s, nil case 't': return "\t", s, nil case 'v': return "\v", s, nil case '?': return "?", s, nil // trigraph workaround case '\'', '"', '\\': return string(r), s, nil case '0', '1', '2', '3', '4', '5', '6', '7': if len(s) < 2 { return "", "", fmt.Errorf(`\%c requires 2 following digits`, r) } ss := string(r) + s[:2] s = s[2:] i, err := strconv.ParseUint(ss, 8, 8) if err != nil { return "", "", fmt.Errorf(`\%s contains non-octal digits`, ss) } return string([]byte{byte(i)}), s, nil case 'x', 'X', 'u', 'U': var n int switch r { case 'x', 'X': n = 2 case 'u': n = 4 case 'U': n = 8 } if len(s) < n { return "", "", fmt.Errorf(`\%c requires %d following digits`, r, n) } ss := s[:n] s = s[n:] i, err := strconv.ParseUint(ss, 16, 64) if err != nil { return "", "", fmt.Errorf(`\%c%s contains non-hexadecimal digits`, r, ss) } if r == 'x' || r == 'X' { return string([]byte{byte(i)}), s, nil } if i > utf8.MaxRune { return "", "", fmt.Errorf(`\%c%s is not a valid Unicode code point`, r, ss) } return strconv.FormatUint(i, 10), s, nil } return "", "", fmt.Errorf(`unknown escape \%c`, r) }