util.go

Documentation: github.com/yuin/goldmark/util

     1  // Package util provides utility functions for the goldmark.
     2  package util
     3  
     4  import (
     5  	"bytes"
     6  	"io"
     7  	"net/url"
     8  	"regexp"
     9  	"sort"
    10  	"strconv"
    11  	"unicode"
    12  	"unicode/utf8"
    13  )
    14  
// A CopyOnWriteBuffer is a byte buffer that keeps using the slice it was
// created with until the first mutating call (Write, WriteByte, Append,
// AppendByte and their string variants), at which point it switches to a
// freshly allocated slice.
type CopyOnWriteBuffer struct {
	// buffer is the slice handed to NewCopyOnWriteBuffer until the first
	// mutation, and a newly allocated slice afterwards.
	buffer []byte
	// copied is set once buffer has been replaced by a new allocation.
	copied bool
}
    21  
    22  // NewCopyOnWriteBuffer returns a new CopyOnWriteBuffer.
    23  func NewCopyOnWriteBuffer(buffer []byte) CopyOnWriteBuffer {
    24  	return CopyOnWriteBuffer{
    25  		buffer: buffer,
    26  		copied: false,
    27  	}
    28  }
    29  
    30  // Write writes given bytes to the buffer.
    31  // Write allocate new buffer and clears it at the first time.
    32  func (b *CopyOnWriteBuffer) Write(value []byte) {
    33  	if !b.copied {
    34  		b.buffer = make([]byte, 0, len(b.buffer)+20)
    35  		b.copied = true
    36  	}
    37  	b.buffer = append(b.buffer, value...)
    38  }
    39  
    40  // WriteString writes given string to the buffer.
    41  // WriteString allocate new buffer and clears it at the first time.
    42  func (b *CopyOnWriteBuffer) WriteString(value string) {
    43  	b.Write(StringToReadOnlyBytes(value))
    44  }
    45  
    46  // Append appends given bytes to the buffer.
    47  // Append copy buffer at the first time.
    48  func (b *CopyOnWriteBuffer) Append(value []byte) {
    49  	if !b.copied {
    50  		tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
    51  		copy(tmp, b.buffer)
    52  		b.buffer = tmp
    53  		b.copied = true
    54  	}
    55  	b.buffer = append(b.buffer, value...)
    56  }
    57  
    58  // AppendString appends given string to the buffer.
    59  // AppendString copy buffer at the first time.
    60  func (b *CopyOnWriteBuffer) AppendString(value string) {
    61  	b.Append(StringToReadOnlyBytes(value))
    62  }
    63  
    64  // WriteByte writes the given byte to the buffer.
    65  // WriteByte allocate new buffer and clears it at the first time.
    66  func (b *CopyOnWriteBuffer) WriteByte(c byte) error {
    67  	if !b.copied {
    68  		b.buffer = make([]byte, 0, len(b.buffer)+20)
    69  		b.copied = true
    70  	}
    71  	b.buffer = append(b.buffer, c)
    72  	return nil
    73  }
    74  
    75  // AppendByte appends given bytes to the buffer.
    76  // AppendByte copy buffer at the first time.
    77  func (b *CopyOnWriteBuffer) AppendByte(c byte) {
    78  	if !b.copied {
    79  		tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
    80  		copy(tmp, b.buffer)
    81  		b.buffer = tmp
    82  		b.copied = true
    83  	}
    84  	b.buffer = append(b.buffer, c)
    85  }
    86  
    87  // Bytes returns bytes of this buffer.
    88  func (b *CopyOnWriteBuffer) Bytes() []byte {
    89  	return b.buffer
    90  }
    91  
    92  // IsCopied returns true if buffer has been copied, otherwise false.
    93  func (b *CopyOnWriteBuffer) IsCopied() bool {
    94  	return b.copied
    95  }
    96  
    97  // IsEscapedPunctuation returns true if character at a given index i
    98  // is an escaped punctuation, otherwise false.
    99  func IsEscapedPunctuation(source []byte, i int) bool {
   100  	return source[i] == '\\' && i < len(source)-1 && IsPunct(source[i+1])
   101  }
   102  
   103  // ReadWhile read the given source while pred is true.
   104  func ReadWhile(source []byte, index [2]int, pred func(byte) bool) (int, bool) {
   105  	j := index[0]
   106  	ok := false
   107  	for ; j < index[1]; j++ {
   108  		c1 := source[j]
   109  		if pred(c1) {
   110  			ok = true
   111  			continue
   112  		}
   113  		break
   114  	}
   115  	return j, ok
   116  }
   117  
   118  // IsBlank returns true if the given string is all space characters.
   119  func IsBlank(bs []byte) bool {
   120  	for _, b := range bs {
   121  		if !IsSpace(b) {
   122  			return false
   123  		}
   124  	}
   125  	return true
   126  }
   127  
   128  // VisualizeSpaces visualize invisible space characters.
   129  func VisualizeSpaces(bs []byte) []byte {
   130  	bs = bytes.Replace(bs, []byte(" "), []byte("[SPACE]"), -1)
   131  	bs = bytes.Replace(bs, []byte("\t"), []byte("[TAB]"), -1)
   132  	bs = bytes.Replace(bs, []byte("\n"), []byte("[NEWLINE]\n"), -1)
   133  	bs = bytes.Replace(bs, []byte("\r"), []byte("[CR]"), -1)
   134  	bs = bytes.Replace(bs, []byte("\v"), []byte("[VTAB]"), -1)
   135  	bs = bytes.Replace(bs, []byte("\x00"), []byte("[NUL]"), -1)
   136  	bs = bytes.Replace(bs, []byte("\ufffd"), []byte("[U+FFFD]"), -1)
   137  	return bs
   138  }
   139  
   140  // TabWidth calculates actual width of a tab at the given position.
   141  func TabWidth(currentPos int) int {
   142  	return 4 - currentPos%4
   143  }
   144  
   145  // IndentPosition searches an indent position with the given width for the given line.
   146  // If the line contains tab characters, paddings may be not zero.
   147  // currentPos==0 and width==2:
   148  //
   149  //	position: 0    1
   150  //	          [TAB]aaaa
   151  //	width:    1234 5678
   152  //
   153  // width=2 is in the tab character. In this case, IndentPosition returns
   154  // (pos=1, padding=2).
   155  func IndentPosition(bs []byte, currentPos, width int) (pos, padding int) {
   156  	return IndentPositionPadding(bs, currentPos, 0, width)
   157  }
   158  
   159  // IndentPositionPadding searches an indent position with the given width for the given line.
   160  // This function is mostly same as IndentPosition except this function
   161  // takes account into additional paddings.
   162  func IndentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
   163  	if width == 0 {
   164  		return 0, paddingv
   165  	}
   166  	w := 0
   167  	i := 0
   168  	l := len(bs)
   169  	for ; i < l; i++ {
   170  		if bs[i] == '\t' && w < width {
   171  			w += TabWidth(currentPos + w)
   172  		} else if bs[i] == ' ' && w < width {
   173  			w++
   174  		} else {
   175  			break
   176  		}
   177  	}
   178  	if w >= width {
   179  		return i - paddingv, w - width
   180  	}
   181  	return -1, -1
   182  }
   183  
   184  // DedentPosition dedents lines by the given width.
   185  //
   186  // Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
   187  func DedentPosition(bs []byte, currentPos, width int) (pos, padding int) {
   188  	if width == 0 {
   189  		return 0, 0
   190  	}
   191  	w := 0
   192  	l := len(bs)
   193  	i := 0
   194  	for ; i < l; i++ {
   195  		if bs[i] == '\t' {
   196  			w += TabWidth(currentPos + w)
   197  		} else if bs[i] == ' ' {
   198  			w++
   199  		} else {
   200  			break
   201  		}
   202  	}
   203  	if w >= width {
   204  		return i, w - width
   205  	}
   206  	return i, 0
   207  }
   208  
   209  // DedentPositionPadding dedents lines by the given width.
   210  // This function is mostly same as DedentPosition except this function
   211  // takes account into additional paddings.
   212  //
   213  // Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
   214  func DedentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
   215  	if width == 0 {
   216  		return 0, paddingv
   217  	}
   218  
   219  	w := 0
   220  	i := 0
   221  	l := len(bs)
   222  	for ; i < l; i++ {
   223  		if bs[i] == '\t' {
   224  			w += TabWidth(currentPos + w)
   225  		} else if bs[i] == ' ' {
   226  			w++
   227  		} else {
   228  			break
   229  		}
   230  	}
   231  	if w >= width {
   232  		return i - paddingv, w - width
   233  	}
   234  	return i - paddingv, 0
   235  }
   236  
   237  // IndentWidth calculate an indent width for the given line.
   238  func IndentWidth(bs []byte, currentPos int) (width, pos int) {
   239  	l := len(bs)
   240  	for i := 0; i < l; i++ {
   241  		b := bs[i]
   242  		if b == ' ' {
   243  			width++
   244  			pos++
   245  		} else if b == '\t' {
   246  			width += TabWidth(currentPos + width)
   247  			pos++
   248  		} else {
   249  			break
   250  		}
   251  	}
   252  	return
   253  }
   254  
   255  // FirstNonSpacePosition returns a position line that is a first nonspace
   256  // character.
   257  func FirstNonSpacePosition(bs []byte) int {
   258  	i := 0
   259  	for ; i < len(bs); i++ {
   260  		c := bs[i]
   261  		if c == ' ' || c == '\t' {
   262  			continue
   263  		}
   264  		if c == '\n' {
   265  			return -1
   266  		}
   267  		return i
   268  	}
   269  	return -1
   270  }
   271  
// FindClosure returns the index in bs of the closure byte that matches an
// opener assumed to have been consumed already (the nesting depth starts at
// 1), or -1 if there is none.
// If codeSpan is set true, it ignores characters in code spans.
// If allowNesting is set true, closures corresponding to nested openers are
// skipped; otherwise encountering another opener makes the search fail
// with -1.
// Backslash-escaped punctuation outside of code spans is skipped.
//
// Deprecated: This function cannot handle newlines. Many elements
// can span multiple lines (e.g. link labels).
// Use text.Reader.FindClosure.
func FindClosure(bs []byte, opener, closure byte, codeSpan, allowNesting bool) int {
	i := 0
	// opened is the current nesting depth; the search succeeds when it hits 0.
	opened := 1
	// codeSpanOpener is the length of the backtick run that opened the code
	// span we are currently inside, or 0 when outside of any code span.
	codeSpanOpener := 0
	for i < len(bs) {
		c := bs[i]
		if codeSpan && codeSpanOpener != 0 && c == '`' {
			// Inside a code span: measure this backtick run. The inner loop
			// leaves i on the last backtick of the run (the i++ at the bottom
			// of the outer loop then moves past it).
			codeSpanCloser := 0
			for ; i < len(bs); i++ {
				if bs[i] == '`' {
					codeSpanCloser++
				} else {
					i--
					break
				}
			}
			// Only a run of exactly the same length closes the code span.
			if codeSpanCloser == codeSpanOpener {
				codeSpanOpener = 0
			}
		} else if codeSpanOpener == 0 && c == '\\' && i < len(bs)-1 && IsPunct(bs[i+1]) {
			// Escaped punctuation outside a code span: skip both bytes.
			i += 2
			continue
		} else if codeSpan && codeSpanOpener == 0 && c == '`' {
			// Start of a code span: record the length of the opening run.
			for ; i < len(bs); i++ {
				if bs[i] == '`' {
					codeSpanOpener++
				} else {
					i--
					break
				}
			}
		} else if (codeSpan && codeSpanOpener == 0) || !codeSpan {
			// Ordinary byte outside of any code span.
			if c == closure {
				opened--
				if opened == 0 {
					return i
				}
			} else if c == opener {
				if !allowNesting {
					return -1
				}
				opened++
			}
		}
		i++
	}
	return -1
}
   328  
   329  // TrimLeft trims characters in the given s from head of the source.
   330  // bytes.TrimLeft offers same functionalities, but bytes.TrimLeft
   331  // allocates new buffer for the result.
   332  func TrimLeft(source, b []byte) []byte {
   333  	i := 0
   334  	for ; i < len(source); i++ {
   335  		c := source[i]
   336  		found := false
   337  		for j := 0; j < len(b); j++ {
   338  			if c == b[j] {
   339  				found = true
   340  				break
   341  			}
   342  		}
   343  		if !found {
   344  			break
   345  		}
   346  	}
   347  	return source[i:]
   348  }
   349  
   350  // TrimRight trims characters in the given s from tail of the source.
   351  func TrimRight(source, b []byte) []byte {
   352  	i := len(source) - 1
   353  	for ; i >= 0; i-- {
   354  		c := source[i]
   355  		found := false
   356  		for j := 0; j < len(b); j++ {
   357  			if c == b[j] {
   358  				found = true
   359  				break
   360  			}
   361  		}
   362  		if !found {
   363  			break
   364  		}
   365  	}
   366  	return source[:i+1]
   367  }
   368  
   369  // TrimLeftLength returns a length of leading specified characters.
   370  func TrimLeftLength(source, s []byte) int {
   371  	return len(source) - len(TrimLeft(source, s))
   372  }
   373  
   374  // TrimRightLength returns a length of trailing specified characters.
   375  func TrimRightLength(source, s []byte) int {
   376  	return len(source) - len(TrimRight(source, s))
   377  }
   378  
   379  // TrimLeftSpaceLength returns a length of leading space characters.
   380  func TrimLeftSpaceLength(source []byte) int {
   381  	i := 0
   382  	for ; i < len(source); i++ {
   383  		if !IsSpace(source[i]) {
   384  			break
   385  		}
   386  	}
   387  	return i
   388  }
   389  
   390  // TrimRightSpaceLength returns a length of trailing space characters.
   391  func TrimRightSpaceLength(source []byte) int {
   392  	l := len(source)
   393  	i := l - 1
   394  	for ; i >= 0; i-- {
   395  		if !IsSpace(source[i]) {
   396  			break
   397  		}
   398  	}
   399  	if i < 0 {
   400  		return l
   401  	}
   402  	return l - 1 - i
   403  }
   404  
   405  // TrimLeftSpace returns a subslice of the given string by slicing off all leading
   406  // space characters.
   407  func TrimLeftSpace(source []byte) []byte {
   408  	return TrimLeft(source, spaces)
   409  }
   410  
   411  // TrimRightSpace returns a subslice of the given string by slicing off all trailing
   412  // space characters.
   413  func TrimRightSpace(source []byte) []byte {
   414  	return TrimRight(source, spaces)
   415  }
   416  
   417  // DoFullUnicodeCaseFolding performs full unicode case folding to given bytes.
   418  func DoFullUnicodeCaseFolding(v []byte) []byte {
   419  	var rbuf []byte
   420  	cob := NewCopyOnWriteBuffer(v)
   421  	n := 0
   422  	for i := 0; i < len(v); i++ {
   423  		c := v[i]
   424  		if c < 0xb5 {
   425  			if c >= 0x41 && c <= 0x5a {
   426  				// A-Z to a-z
   427  				cob.Write(v[n:i])
   428  				_ = cob.WriteByte(c + 32)
   429  				n = i + 1
   430  			}
   431  			continue
   432  		}
   433  
   434  		if !utf8.RuneStart(c) {
   435  			continue
   436  		}
   437  		r, length := utf8.DecodeRune(v[i:])
   438  		if r == utf8.RuneError {
   439  			continue
   440  		}
   441  		folded, ok := unicodeCaseFoldings[r]
   442  		if !ok {
   443  			continue
   444  		}
   445  
   446  		cob.Write(v[n:i])
   447  		if rbuf == nil {
   448  			rbuf = make([]byte, 4)
   449  		}
   450  		for _, f := range folded {
   451  			l := utf8.EncodeRune(rbuf, f)
   452  			cob.Write(rbuf[:l])
   453  		}
   454  		i += length - 1
   455  		n = i + 1
   456  	}
   457  	if cob.IsCopied() {
   458  		cob.Write(v[n:])
   459  	}
   460  	return cob.Bytes()
   461  }
   462  
   463  // ReplaceSpaces replaces sequence of spaces with the given repl.
   464  func ReplaceSpaces(source []byte, repl byte) []byte {
   465  	var ret []byte
   466  	start := -1
   467  	for i, c := range source {
   468  		iss := IsSpace(c)
   469  		if start < 0 && iss {
   470  			start = i
   471  			continue
   472  		} else if start >= 0 && iss {
   473  			continue
   474  		} else if start >= 0 {
   475  			if ret == nil {
   476  				ret = make([]byte, 0, len(source))
   477  				ret = append(ret, source[:start]...)
   478  			}
   479  			ret = append(ret, repl)
   480  			start = -1
   481  		}
   482  		if ret != nil {
   483  			ret = append(ret, c)
   484  		}
   485  	}
   486  	if start >= 0 && ret != nil {
   487  		ret = append(ret, repl)
   488  	}
   489  	if ret == nil {
   490  		return source
   491  	}
   492  	return ret
   493  }
   494  
   495  // ToRune decode given bytes start at pos and returns a rune.
   496  func ToRune(source []byte, pos int) rune {
   497  	i := pos
   498  	for ; i >= 0; i-- {
   499  		if utf8.RuneStart(source[i]) {
   500  			break
   501  		}
   502  	}
   503  	r, _ := utf8.DecodeRune(source[i:])
   504  	return r
   505  }
   506  
   507  // ToValidRune returns 0xFFFD if the given rune is invalid, otherwise v.
   508  func ToValidRune(v rune) rune {
   509  	if v == 0 || !utf8.ValidRune(v) {
   510  		return rune(0xFFFD)
   511  	}
   512  	return v
   513  }
   514  
   515  // ToLinkReference converts given bytes into a valid link reference string.
   516  // ToLinkReference performs unicode case folding, trims leading and trailing spaces,  converts into lower
   517  // case and replace spaces with a single space character.
   518  func ToLinkReference(v []byte) string {
   519  	v = TrimLeftSpace(v)
   520  	v = TrimRightSpace(v)
   521  	v = DoFullUnicodeCaseFolding(v)
   522  	return string(ReplaceSpaces(v, ' '))
   523  }
   524  
   525  var htmlEscapeTable = [256][]byte{nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("&quot;"), nil, nil, nil, []byte("&amp;"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, []byte("&lt;"), nil, []byte("&gt;"), nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil} //nolint:golint,lll
   526  
   527  // EscapeHTMLByte returns HTML escaped bytes if the given byte should be escaped,
   528  // otherwise nil.
   529  func EscapeHTMLByte(b byte) []byte {
   530  	return htmlEscapeTable[b]
   531  }
   532  
   533  // EscapeHTML escapes characters that should be escaped in HTML text.
   534  func EscapeHTML(v []byte) []byte {
   535  	cob := NewCopyOnWriteBuffer(v)
   536  	n := 0
   537  	for i := 0; i < len(v); i++ {
   538  		c := v[i]
   539  		escaped := htmlEscapeTable[c]
   540  		if escaped != nil {
   541  			cob.Write(v[n:i])
   542  			cob.Write(escaped)
   543  			n = i + 1
   544  		}
   545  	}
   546  	if cob.IsCopied() {
   547  		cob.Write(v[n:])
   548  	}
   549  	return cob.Bytes()
   550  }
   551  
   552  // UnescapePunctuations unescapes blackslash escaped punctuations.
   553  func UnescapePunctuations(source []byte) []byte {
   554  	cob := NewCopyOnWriteBuffer(source)
   555  	limit := len(source)
   556  	n := 0
   557  	for i := 0; i < limit; {
   558  		c := source[i]
   559  		if i < limit-1 && c == '\\' && IsPunct(source[i+1]) {
   560  			cob.Write(source[n:i])
   561  			_ = cob.WriteByte(source[i+1])
   562  			i += 2
   563  			n = i
   564  			continue
   565  		}
   566  		i++
   567  	}
   568  	if cob.IsCopied() {
   569  		cob.Write(source[n:])
   570  	}
   571  	return cob.Bytes()
   572  }
   573  
   574  // ResolveNumericReferences resolve numeric references like '&#1234;" .
   575  func ResolveNumericReferences(source []byte) []byte {
   576  	cob := NewCopyOnWriteBuffer(source)
   577  	buf := make([]byte, 6)
   578  	limit := len(source)
   579  	var ok bool
   580  	n := 0
   581  	for i := 0; i < limit; i++ {
   582  		if source[i] == '&' {
   583  			pos := i
   584  			next := i + 1
   585  			if next < limit && source[next] == '#' {
   586  				nnext := next + 1
   587  				if nnext < limit {
   588  					nc := source[nnext]
   589  					// code point like #x22;
   590  					if nnext < limit && nc == 'x' || nc == 'X' {
   591  						start := nnext + 1
   592  						i, ok = ReadWhile(source, [2]int{start, limit}, IsHexDecimal)
   593  						if ok && i < limit && source[i] == ';' {
   594  							v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 16, 32)
   595  							cob.Write(source[n:pos])
   596  							n = i + 1
   597  							runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
   598  							cob.Write(buf[:runeSize])
   599  							continue
   600  						}
   601  						// code point like #1234;
   602  					} else if nc >= '0' && nc <= '9' {
   603  						start := nnext
   604  						i, ok = ReadWhile(source, [2]int{start, limit}, IsNumeric)
   605  						if ok && i < limit && i-start < 8 && source[i] == ';' {
   606  							v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 0, 32)
   607  							cob.Write(source[n:pos])
   608  							n = i + 1
   609  							runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
   610  							cob.Write(buf[:runeSize])
   611  							continue
   612  						}
   613  					}
   614  				}
   615  			}
   616  			i = next - 1
   617  		}
   618  	}
   619  	if cob.IsCopied() {
   620  		cob.Write(source[n:])
   621  	}
   622  	return cob.Bytes()
   623  }
   624  
   625  // ResolveEntityNames resolve entity references like '&ouml;" .
   626  func ResolveEntityNames(source []byte) []byte {
   627  	cob := NewCopyOnWriteBuffer(source)
   628  	limit := len(source)
   629  	var ok bool
   630  	n := 0
   631  	for i := 0; i < limit; i++ {
   632  		if source[i] == '&' {
   633  			pos := i
   634  			next := i + 1
   635  			if !(next < limit && source[next] == '#') {
   636  				start := next
   637  				i, ok = ReadWhile(source, [2]int{start, limit}, IsAlphaNumeric)
   638  				if ok && i < limit && source[i] == ';' {
   639  					name := BytesToReadOnlyString(source[start:i])
   640  					entity, ok := LookUpHTML5EntityByName(name)
   641  					if ok {
   642  						cob.Write(source[n:pos])
   643  						n = i + 1
   644  						cob.Write(entity.Characters)
   645  						continue
   646  					}
   647  				}
   648  			}
   649  			i = next - 1
   650  		}
   651  	}
   652  	if cob.IsCopied() {
   653  		cob.Write(source[n:])
   654  	}
   655  	return cob.Bytes()
   656  }
   657  
// htmlSpace is the percent-encoded form that URLEscape writes in place of a
// space character.
var htmlSpace = []byte("%20")
   659  
   660  // URLEscape escape the given URL.
   661  // If resolveReference is set true:
   662  //  1. unescape punctuations
   663  //  2. resolve numeric references
   664  //  3. resolve entity references
   665  //
   666  // URL encoded values (%xx) are kept as is.
   667  func URLEscape(v []byte, resolveReference bool) []byte {
   668  	if resolveReference {
   669  		v = UnescapePunctuations(v)
   670  		v = ResolveNumericReferences(v)
   671  		v = ResolveEntityNames(v)
   672  	}
   673  	cob := NewCopyOnWriteBuffer(v)
   674  	limit := len(v)
   675  	n := 0
   676  
   677  	for i := 0; i < limit; {
   678  		c := v[i]
   679  		if urlEscapeTable[c] == 1 {
   680  			i++
   681  			continue
   682  		}
   683  		if c == '%' && i+2 < limit && IsHexDecimal(v[i+1]) && IsHexDecimal(v[i+1]) {
   684  			i += 3
   685  			continue
   686  		}
   687  		u8len := utf8lenTable[c]
   688  		if u8len == 99 { // invalid utf8 leading byte, skip it
   689  			i++
   690  			continue
   691  		}
   692  		if c == ' ' {
   693  			cob.Write(v[n:i])
   694  			cob.Write(htmlSpace)
   695  			i++
   696  			n = i
   697  			continue
   698  		}
   699  		if int(u8len) > len(v) {
   700  			u8len = int8(len(v) - 1)
   701  		}
   702  		if u8len == 0 {
   703  			i++
   704  			n = i
   705  			continue
   706  		}
   707  		cob.Write(v[n:i])
   708  		stop := i + int(u8len)
   709  		if stop > len(v) {
   710  			i++
   711  			n = i
   712  			continue
   713  		}
   714  		cob.Write(StringToReadOnlyBytes(url.QueryEscape(string(v[i:stop]))))
   715  		i += int(u8len)
   716  		n = i
   717  	}
   718  	if cob.IsCopied() && n < limit {
   719  		cob.Write(v[n:])
   720  	}
   721  	return cob.Bytes()
   722  }
   723  
   724  // FindURLIndex returns a stop index value if the given bytes seem an URL.
   725  // This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
   726  func FindURLIndex(b []byte) int {
   727  	i := 0
   728  	if !(len(b) > 0 && urlTable[b[i]]&7 == 7) {
   729  		return -1
   730  	}
   731  	i++
   732  	for ; i < len(b); i++ {
   733  		c := b[i]
   734  		if urlTable[c]&4 != 4 {
   735  			break
   736  		}
   737  	}
   738  	if i == 1 || i > 33 || i >= len(b) {
   739  		return -1
   740  	}
   741  	if b[i] != ':' {
   742  		return -1
   743  	}
   744  	i++
   745  	for ; i < len(b); i++ {
   746  		c := b[i]
   747  		if urlTable[c]&1 != 1 {
   748  			break
   749  		}
   750  	}
   751  	return i
   752  }
   753  
// emailDomainRegexp matches, anchored at the start of the input, one or more
// dot-separated labels. Each label starts and ends with an ASCII letter or
// digit and may contain hyphens in between (at most 63 bytes per label).
// FindEmailIndex applies it to the part after '@'.
var emailDomainRegexp = regexp.MustCompile(`^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*`) //nolint:golint,lll
   755  
   756  // FindEmailIndex returns a stop index value if the given bytes seem an email address.
   757  func FindEmailIndex(b []byte) int {
   758  	// TODO: eliminate regexps
   759  	i := 0
   760  	for ; i < len(b); i++ {
   761  		c := b[i]
   762  		if emailTable[c]&1 != 1 {
   763  			break
   764  		}
   765  	}
   766  	if i == 0 {
   767  		return -1
   768  	}
   769  	if i >= len(b) || b[i] != '@' {
   770  		return -1
   771  	}
   772  	i++
   773  	if i >= len(b) {
   774  		return -1
   775  	}
   776  	match := emailDomainRegexp.FindSubmatchIndex(b[i:])
   777  	if match == nil {
   778  		return -1
   779  	}
   780  	return i + match[1]
   781  }
   782  
// spaces is the set of bytes trimmed by TrimLeftSpace and TrimRightSpace:
// space, tab, newline, vertical tab, form feed and carriage return.
var spaces = []byte(" \t\n\x0b\x0c\x0d")
   784  
   785  var spaceTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
   786  
   787  var punctTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
   788  
   789  // a-zA-Z0-9, ;/?:@&=+$,-_.!~*'()#
   790  
   791  var urlEscapeTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
   792  
   793  var utf8lenTable = [256]int8{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 99, 99, 99, 99, 99, 99, 99, 99} //nolint:golint,lll
   794  
   795  var urlTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 1, 0, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} //nolint:golint,lll
   796  
   797  var emailTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
   798  
   799  // UTF8Len returns a byte length of the utf-8 character.
   800  func UTF8Len(b byte) int8 {
   801  	return utf8lenTable[b]
   802  }
   803  
   804  // IsPunct returns true if the given character is a punctuation, otherwise false.
   805  func IsPunct(c byte) bool {
   806  	return punctTable[c] == 1
   807  }
   808  
   809  // IsPunctRune returns true if the given rune is a punctuation, otherwise false.
   810  func IsPunctRune(r rune) bool {
   811  	return int32(r) <= 256 && IsPunct(byte(r)) || unicode.IsPunct(r)
   812  }
   813  
   814  // IsSpace returns true if the given character is a space, otherwise false.
   815  func IsSpace(c byte) bool {
   816  	return spaceTable[c] == 1
   817  }
   818  
   819  // IsSpaceRune returns true if the given rune is a space, otherwise false.
   820  func IsSpaceRune(r rune) bool {
   821  	return int32(r) <= 256 && IsSpace(byte(r)) || unicode.IsSpace(r)
   822  }
   823  
   824  // IsNumeric returns true if the given character is a numeric, otherwise false.
   825  func IsNumeric(c byte) bool {
   826  	return c >= '0' && c <= '9'
   827  }
   828  
   829  // IsHexDecimal returns true if the given character is a hexdecimal, otherwise false.
   830  func IsHexDecimal(c byte) bool {
   831  	return c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F'
   832  }
   833  
   834  // IsAlphaNumeric returns true if the given character is a alphabet or a numeric, otherwise false.
   835  func IsAlphaNumeric(c byte) bool {
   836  	return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9'
   837  }
   838  
   839  // A BufWriter is a subset of the bufio.Writer .
   840  type BufWriter interface {
   841  	io.Writer
   842  	Available() int
   843  	Buffered() int
   844  	Flush() error
   845  	WriteByte(c byte) error
   846  	WriteRune(r rune) (size int, err error)
   847  	WriteString(s string) (int, error)
   848  }
   849  
   850  // A PrioritizedValue struct holds pair of an arbitrary value and a priority.
   851  type PrioritizedValue struct {
   852  	// Value is an arbitrary value that you want to prioritize.
   853  	Value interface{}
   854  	// Priority is a priority of the value.
   855  	Priority int
   856  }
   857  
   858  // PrioritizedSlice is a slice of the PrioritizedValues.
   859  type PrioritizedSlice []PrioritizedValue
   860  
   861  // Sort sorts the PrioritizedSlice in ascending order.
   862  func (s PrioritizedSlice) Sort() {
   863  	sort.Slice(s, func(i, j int) bool {
   864  		return s[i].Priority < s[j].Priority
   865  	})
   866  }
   867  
   868  // Remove removes the given value from this slice.
   869  func (s PrioritizedSlice) Remove(v interface{}) PrioritizedSlice {
   870  	i := 0
   871  	found := false
   872  	for ; i < len(s); i++ {
   873  		if s[i].Value == v {
   874  			found = true
   875  			break
   876  		}
   877  	}
   878  	if !found {
   879  		return s
   880  	}
   881  	return append(s[:i], s[i+1:]...)
   882  }
   883  
   884  // Prioritized returns a new PrioritizedValue.
   885  func Prioritized(v interface{}, priority int) PrioritizedValue {
   886  	return PrioritizedValue{v, priority}
   887  }
   888  
   889  func bytesHash(b []byte) uint64 {
   890  	var hash uint64 = 5381
   891  	for _, c := range b {
   892  		hash = ((hash << 5) + hash) + uint64(c)
   893  	}
   894  	return hash
   895  }
   896  
   897  // BytesFilter is a efficient data structure for checking whether bytes exist or not.
   898  // BytesFilter is thread-safe.
   899  type BytesFilter interface {
   900  	// Add adds given bytes to this set.
   901  	Add([]byte)
   902  
   903  	// Contains return true if this set contains given bytes, otherwise false.
   904  	Contains([]byte) bool
   905  
   906  	// Extend copies this filter and adds given bytes to new filter.
   907  	Extend(...[]byte) BytesFilter
   908  }
   909  
   910  type bytesFilter struct {
   911  	chars     [256]uint8
   912  	threshold int
   913  	slots     [][][]byte
   914  }
   915  
   916  // NewBytesFilter returns a new BytesFilter.
   917  func NewBytesFilter(elements ...[]byte) BytesFilter {
   918  	s := &bytesFilter{
   919  		threshold: 3,
   920  		slots:     make([][][]byte, 64),
   921  	}
   922  	for _, element := range elements {
   923  		s.Add(element)
   924  	}
   925  	return s
   926  }
   927  
   928  func (s *bytesFilter) Add(b []byte) {
   929  	l := len(b)
   930  	m := s.threshold
   931  	if l < s.threshold {
   932  		m = l
   933  	}
   934  	for i := 0; i < m; i++ {
   935  		s.chars[b[i]] |= 1 << uint8(i)
   936  	}
   937  	h := bytesHash(b) % uint64(len(s.slots))
   938  	slot := s.slots[h]
   939  	if slot == nil {
   940  		slot = [][]byte{}
   941  	}
   942  	s.slots[h] = append(slot, b)
   943  }
   944  
   945  func (s *bytesFilter) Extend(bs ...[]byte) BytesFilter {
   946  	newFilter := NewBytesFilter().(*bytesFilter)
   947  	newFilter.chars = s.chars
   948  	newFilter.threshold = s.threshold
   949  	for k, v := range s.slots {
   950  		newSlot := make([][]byte, len(v))
   951  		copy(newSlot, v)
   952  		newFilter.slots[k] = v
   953  	}
   954  	for _, b := range bs {
   955  		newFilter.Add(b)
   956  	}
   957  	return newFilter
   958  }
   959  
   960  func (s *bytesFilter) Contains(b []byte) bool {
   961  	l := len(b)
   962  	m := s.threshold
   963  	if l < s.threshold {
   964  		m = l
   965  	}
   966  	for i := 0; i < m; i++ {
   967  		if (s.chars[b[i]] & (1 << uint8(i))) == 0 {
   968  			return false
   969  		}
   970  	}
   971  	h := bytesHash(b) % uint64(len(s.slots))
   972  	slot := s.slots[h]
   973  	if len(slot) == 0 {
   974  		return false
   975  	}
   976  	for _, element := range slot {
   977  		if bytes.Equal(element, b) {
   978  			return true
   979  		}
   980  	}
   981  	return false
   982  }
   983
View as plain text