...

Source file src/github.com/go-openapi/swag/split.go

Documentation: github.com/go-openapi/swag

// Copyright 2015 go-swagger maintainers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package swag

import (
	"bytes"
	"sync"
	"unicode"
	"unicode/utf8"
)

type (
	splitter struct {
		initialisms              []string
		initialismsRunes         [][]rune
		initialismsUpperCased    [][]rune // initialisms cached in their trimmed, upper-cased version
		postSplitInitialismCheck bool
	}

	splitterOption func(*splitter)

	initialismMatch struct {
		body       []rune
		start, end int
		complete   bool
	}
	initialismMatches []initialismMatch
)

type (
	// memory pools of temporary objects.
	//
	// These are used to recycle temporarily allocated objects
	// and relieve the GC from undue pressure.

	matchesPool struct {
		*sync.Pool
	}

	buffersPool struct {
		*sync.Pool
	}

	lexemsPool struct {
		*sync.Pool
	}

	splittersPool struct {
		*sync.Pool
	}
)

var (
	// poolOfMatches holds temporary slices for recycling during the initialism match process
	poolOfMatches = matchesPool{
		Pool: &sync.Pool{
			New: func() any {
				s := make(initialismMatches, 0, maxAllocMatches)

				return &s
			},
		},
	}

	poolOfBuffers = buffersPool{
		Pool: &sync.Pool{
			New: func() any {
				return new(bytes.Buffer)
			},
		},
	}

	poolOfLexems = lexemsPool{
		Pool: &sync.Pool{
			New: func() any {
				s := make([]nameLexem, 0, maxAllocMatches)

				return &s
			},
		},
	}

	poolOfSplitters = splittersPool{
		Pool: &sync.Pool{
			New: func() any {
				s := newSplitter()

				return &s
			},
		},
	}
)

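// For illustration: these pools are meant to be used in matched borrow/redeem
// pairs, as the functions below do. A minimal sketch:
//
//	buf := poolOfBuffers.BorrowBuffer(64) // buffer reset and grown to at least 64 bytes
//	defer poolOfBuffers.RedeemBuffer(buf)
//
// The same pattern applies to poolOfMatches, poolOfLexems and poolOfSplitters.
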
// nameReplaceTable finds a word representation for special characters.
func nameReplaceTable(r rune) (string, bool) {
	switch r {
	case '@':
		return "At ", true
	case '&':
		return "And ", true
	case '|':
		return "Pipe ", true
	case '$':
		return "Dollar ", true
	case '!':
		return "Bang ", true
	case '-':
		return "", true
	case '_':
		return "", true
	default:
		return "", false
	}
}

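// For illustration, a few lookups against the table above:
//
//	nameReplaceTable('@') // "At ", true
//	nameReplaceTable('-') // "", true  (recognized, but replaced by nothing: acts as a separator)
//	nameReplaceTable('x') // "", false (not a special character)
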
// split calls the splitter.
//
// Use newSplitter for more control and options.
func split(str string) []string {
	s := poolOfSplitters.BorrowSplitter()
	lexems := s.split(str)
	result := make([]string, 0, len(*lexems))

	for _, lexem := range *lexems {
		result = append(result, lexem.GetOriginal())
	}
	poolOfLexems.RedeemLexems(lexems)
	poolOfSplitters.RedeemSplitter(s)

	return result
}

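// For illustration, a minimal sketch of calling split from within this package;
// the exact result depends on the configured initialisms:
//
//	parts := split("HTTPServer")
//	// with the default initialisms, parts is expected to be ["HTTP", "Server"]
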
func newSplitter(options ...splitterOption) splitter {
	s := splitter{
		postSplitInitialismCheck: false,
		initialisms:              initialisms,
		initialismsRunes:         initialismsRunes,
		initialismsUpperCased:    initialismsUpperCased,
	}

	for _, option := range options {
		option(&s)
	}

	return s
}

// withPostSplitInitialismCheck allows catching initialisms after the main split process.
func withPostSplitInitialismCheck(s *splitter) {
	s.postSplitInitialismCheck = true
}

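// For illustration, a sketch of enabling the post-split check on a splitter
// borrowed from the pool (BorrowSplitter resets the option, so it must be
// passed on every borrow):
//
//	s := poolOfSplitters.BorrowSplitter(withPostSplitInitialismCheck)
//	lexems := s.split(name)
//	// ... consume *lexems ...
//	poolOfLexems.RedeemLexems(lexems)
//	poolOfSplitters.RedeemSplitter(s)
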
func (p matchesPool) BorrowMatches() *initialismMatches {
	s := p.Get().(*initialismMatches)
	*s = (*s)[:0] // reset slice, keep allocated capacity

	return s
}

func (p buffersPool) BorrowBuffer(size int) *bytes.Buffer {
	s := p.Get().(*bytes.Buffer)
	s.Reset()

	if s.Cap() < size {
		s.Grow(size)
	}

	return s
}

func (p lexemsPool) BorrowLexems() *[]nameLexem {
	s := p.Get().(*[]nameLexem)
	*s = (*s)[:0] // reset slice, keep allocated capacity

	return s
}

func (p splittersPool) BorrowSplitter(options ...splitterOption) *splitter {
	s := p.Get().(*splitter)
	s.postSplitInitialismCheck = false // reset options
	for _, apply := range options {
		apply(s)
	}

	return s
}

func (p matchesPool) RedeemMatches(s *initialismMatches) {
	p.Put(s)
}

func (p buffersPool) RedeemBuffer(s *bytes.Buffer) {
	p.Put(s)
}

func (p lexemsPool) RedeemLexems(s *[]nameLexem) {
	p.Put(s)
}

func (p splittersPool) RedeemSplitter(s *splitter) {
	p.Put(s)
}

func (m initialismMatch) isZero() bool {
	return m.start == 0 && m.end == 0
}

func (s splitter) split(name string) *[]nameLexem {
	nameRunes := []rune(name)
	matches := s.gatherInitialismMatches(nameRunes)
	if matches == nil {
		return poolOfLexems.BorrowLexems()
	}

	return s.mapMatchesToNameLexems(nameRunes, matches)
}

func (s splitter) gatherInitialismMatches(nameRunes []rune) *initialismMatches {
	var matches *initialismMatches

	for currentRunePosition, currentRune := range nameRunes {
		// recycle these allocations as we loop over runes.
		// With this recycling, only 2 slices should be allocated per call,
		// instead of O(n).
		newMatches := poolOfMatches.BorrowMatches()

		// check current initialism matches
		if matches != nil { // skip first iteration
			for _, match := range *matches {
				if keepCompleteMatch := match.complete; keepCompleteMatch {
					*newMatches = append(*newMatches, match)
					continue
				}

				// drop failed match
				currentMatchRune := match.body[currentRunePosition-match.start]
				if currentMatchRune != currentRune {
					continue
				}

				// try to complete ongoing match
				if currentRunePosition-match.start == len(match.body)-1 {
					// we are close; the next step is to check the symbol ahead:
					// if it is a lowercase letter, then it is not the end of the match,
					// but the beginning of the next word

					if currentRunePosition < len(nameRunes)-1 {
						nextRune := nameRunes[currentRunePosition+1]
						if newWord := unicode.IsLower(nextRune); newWord {
							// oh ok, it was the start of a new word
							continue
						}
					}

					match.complete = true
					match.end = currentRunePosition
				}

				*newMatches = append(*newMatches, match)
			}
		}

		// check for new initialism matches
		for i := range s.initialisms {
			initialismRunes := s.initialismsRunes[i]
			if initialismRunes[0] == currentRune {
				*newMatches = append(*newMatches, initialismMatch{
					start:    currentRunePosition,
					body:     initialismRunes,
					complete: false,
				})
			}
		}

		if matches != nil {
			poolOfMatches.RedeemMatches(matches)
		}
		matches = newMatches
	}

	// up to the caller to redeem this last slice
	return matches
}

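// For illustration, a worked trace of gatherInitialismMatches with the default
// initialisms and the input "HTTPServer":
//
//	position 0 ('H'): candidate matches such as "HTTP" and "HTTPS" are opened
//	position 3 ('P'): "HTTP" completes, because the rune ahead ('S') is not lower case
//	position 4 ('S'): the "HTTPS" candidate is dropped, because the rune ahead ('e')
//	                  is lower case, i.e. the start of a new word ("Server")
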
func (s splitter) mapMatchesToNameLexems(nameRunes []rune, matches *initialismMatches) *[]nameLexem {
	nameLexems := poolOfLexems.BorrowLexems()

	var lastAcceptedMatch initialismMatch
	for _, match := range *matches {
		if !match.complete {
			continue
		}

		if firstMatch := lastAcceptedMatch.isZero(); firstMatch {
			s.appendBrokenDownCasualString(nameLexems, nameRunes[:match.start])
			*nameLexems = append(*nameLexems, s.breakInitialism(string(match.body)))

			lastAcceptedMatch = match

			continue
		}

		if overlappedMatch := match.start <= lastAcceptedMatch.end; overlappedMatch {
			continue
		}

		middle := nameRunes[lastAcceptedMatch.end+1 : match.start]
		s.appendBrokenDownCasualString(nameLexems, middle)
		*nameLexems = append(*nameLexems, s.breakInitialism(string(match.body)))

		lastAcceptedMatch = match
	}

	// we have not found any accepted matches
	if lastAcceptedMatch.isZero() {
		*nameLexems = (*nameLexems)[:0]
		s.appendBrokenDownCasualString(nameLexems, nameRunes)
	} else if lastAcceptedMatch.end+1 != len(nameRunes) {
		rest := nameRunes[lastAcceptedMatch.end+1:]
		s.appendBrokenDownCasualString(nameLexems, rest)
	}

	poolOfMatches.RedeemMatches(matches)

	return nameLexems
}

func (s splitter) breakInitialism(original string) nameLexem {
	return newInitialismNameLexem(original, original)
}

func (s splitter) appendBrokenDownCasualString(segments *[]nameLexem, str []rune) {
	currentSegment := poolOfBuffers.BorrowBuffer(len(str)) // unlike strings.Builder, a bytes.Buffer's initial storage can be reused
	defer func() {
		poolOfBuffers.RedeemBuffer(currentSegment)
	}()

	addCasualNameLexem := func(original string) {
		*segments = append(*segments, newCasualNameLexem(original))
	}

	addInitialismNameLexem := func(original, match string) {
		*segments = append(*segments, newInitialismNameLexem(original, match))
	}

	var addNameLexem func(string)
	if s.postSplitInitialismCheck {
		addNameLexem = func(original string) {
			for i := range s.initialisms {
				if isEqualFoldIgnoreSpace(s.initialismsUpperCased[i], original) {
					addInitialismNameLexem(original, s.initialisms[i])

					return
				}
			}

			addCasualNameLexem(original)
		}
	} else {
		addNameLexem = addCasualNameLexem
	}

	for _, rn := range str {
		if replace, found := nameReplaceTable(rn); found {
			if currentSegment.Len() > 0 {
				addNameLexem(currentSegment.String())
				currentSegment.Reset()
			}

			if replace != "" {
				addNameLexem(replace)
			}

			continue
		}

		if !unicode.In(rn, unicode.L, unicode.M, unicode.N, unicode.Pc) {
			if currentSegment.Len() > 0 {
				addNameLexem(currentSegment.String())
				currentSegment.Reset()
			}

			continue
		}

		if unicode.IsUpper(rn) {
			if currentSegment.Len() > 0 {
				addNameLexem(currentSegment.String())
			}
			currentSegment.Reset()
		}

		currentSegment.WriteRune(rn)
	}

	if currentSegment.Len() > 0 {
		addNameLexem(currentSegment.String())
	}
}

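// For illustration, how the function above is expected to break down a plain
// (non-initialism) string: separators from nameReplaceTable are dropped and
// every upper-case rune starts a new segment, e.g.
//
//	"some_value-x" -> "some", "value", "x"
//	"myValue"      -> "my", "Value"
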
// isEqualFoldIgnoreSpace is the same as strings.EqualFold, but
// it ignores leading and trailing blank spaces in the compared
// string.
//
// base is assumed to be composed of upper-cased runes, and to be
// already trimmed.
//
// This code is heavily inspired by strings.EqualFold.
func isEqualFoldIgnoreSpace(base []rune, str string) bool {
	var i, baseIndex int
	// equivalent to b := []byte(str), but without a data copy
	b := hackStringBytes(str)

	for i < len(b) {
		if c := b[i]; c < utf8.RuneSelf {
			// fast path for ASCII
			if c != ' ' && c != '\t' {
				break
			}
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRune(b[i:])
		if !unicode.IsSpace(r) {
			break
		}
		i += size
	}

	if i >= len(b) {
		return len(base) == 0
	}

	for _, baseRune := range base {
		if i >= len(b) {
			break
		}

		if c := b[i]; c < utf8.RuneSelf {
			// single-byte rune case (ASCII)
			if baseRune >= utf8.RuneSelf {
				return false
			}

			baseChar := byte(baseRune)
			if c != baseChar &&
				!('a' <= c && c <= 'z' && c-'a'+'A' == baseChar) {
				return false
			}

			baseIndex++
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRune(b[i:])
		if unicode.ToUpper(r) != baseRune {
			return false
		}
		baseIndex++
		i += size
	}

	if baseIndex != len(base) {
		return false
	}

	// all passed: now we should only have blanks
	for i < len(b) {
		if c := b[i]; c < utf8.RuneSelf {
			// fast path for ASCII
			if c != ' ' && c != '\t' {
				return false
			}
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRune(b[i:])
		if !unicode.IsSpace(r) {
			return false
		}

		i += size
	}

	return true
}
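
// For illustration, expected outcomes given the contract documented above
// (base upper-cased and already trimmed):
//
//	isEqualFoldIgnoreSpace([]rune("HTTP"), "  http ") // true: case-insensitive, surrounding blanks ignored
//	isEqualFoldIgnoreSpace([]rune("HTTP"), "http s")  // false: trailing non-blank content
//	isEqualFoldIgnoreSpace([]rune(""), "   ")         // true: an empty base matches an all-blank string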