...

Source file src/golang.org/x/net/publicsuffix/gen.go

Documentation: golang.org/x/net/publicsuffix

     1  // Copyright 2012 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ignore
     6  
     7  package main
     8  
     9  // This program generates table.go and table_test.go based on the authoritative
    10  // public suffix list at https://publicsuffix.org/list/effective_tld_names.dat
    11  //
    12  // The version is derived from
    13  // https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat
    14  // and a human-readable form is at
    15  // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
    16  //
    17  // To fetch a particular git revision, such as 5c70ccd250, pass
    18  // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
    19  // and -version "an explicit version string".
    20  
    21  import (
    22  	"bufio"
    23  	"bytes"
    24  	"encoding/binary"
    25  	"flag"
    26  	"fmt"
    27  	"go/format"
    28  	"io"
    29  	"net/http"
    30  	"os"
    31  	"regexp"
    32  	"sort"
    33  	"strings"
    34  
    35  	"golang.org/x/net/idna"
    36  )
    37  
    38  const (
    39  	// This must be a multiple of 8 and no greater than 64.
    40  	// Update nodeValue in list.go if this changes.
    41  	nodesBits = 40
    42  
    43  	// These sum of these four values must be no greater than nodesBits.
    44  	nodesBitsChildren   = 10
    45  	nodesBitsICANN      = 1
    46  	nodesBitsTextOffset = 16
    47  	nodesBitsTextLength = 6
    48  
    49  	// These sum of these four values must be no greater than 32.
    50  	childrenBitsWildcard = 1
    51  	childrenBitsNodeType = 2
    52  	childrenBitsHi       = 14
    53  	childrenBitsLo       = 14
    54  )
    55  
    56  var (
    57  	combinedText  string
    58  	maxChildren   int
    59  	maxTextOffset int
    60  	maxTextLength int
    61  	maxHi         uint32
    62  	maxLo         uint32
    63  )
    64  
    65  func max(a, b int) int {
    66  	if a < b {
    67  		return b
    68  	}
    69  	return a
    70  }
    71  
    72  func u32max(a, b uint32) uint32 {
    73  	if a < b {
    74  		return b
    75  	}
    76  	return a
    77  }
    78  
    79  const (
    80  	nodeTypeNormal     = 0
    81  	nodeTypeException  = 1
    82  	nodeTypeParentOnly = 2
    83  	numNodeType        = 3
    84  )
    85  
    86  func nodeTypeStr(n int) string {
    87  	switch n {
    88  	case nodeTypeNormal:
    89  		return "+"
    90  	case nodeTypeException:
    91  		return "!"
    92  	case nodeTypeParentOnly:
    93  		return "o"
    94  	}
    95  	panic("unreachable")
    96  }
    97  
    98  const (
    99  	defaultURL   = "https://publicsuffix.org/list/effective_tld_names.dat"
   100  	gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
   101  )
   102  
   103  var (
   104  	labelEncoding = map[string]uint64{}
   105  	labelsList    = []string{}
   106  	labelsMap     = map[string]bool{}
   107  	rules         = []string{}
   108  	numICANNRules = 0
   109  
   110  	// validSuffixRE is used to check that the entries in the public suffix
   111  	// list are in canonical form (after Punycode encoding). Specifically,
   112  	// capital letters are not allowed.
   113  	validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
   114  
   115  	shaRE  = regexp.MustCompile(`"sha":"([^"]+)"`)
   116  	dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
   117  
   118  	subset  = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
   119  	url     = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
   120  	v       = flag.Bool("v", false, "verbose output (to stderr)")
   121  	version = flag.String("version", "", "the effective_tld_names.dat version")
   122  )
   123  
   124  func main() {
   125  	if err := main1(); err != nil {
   126  		fmt.Fprintln(os.Stderr, err)
   127  		os.Exit(1)
   128  	}
   129  }
   130  
   131  func main1() error {
   132  	flag.Parse()
   133  	if nodesBits > 64 {
   134  		return fmt.Errorf("nodesBits is too large")
   135  	}
   136  	if nodesBits%8 != 0 {
   137  		return fmt.Errorf("nodesBits must be a multiple of 8")
   138  	}
   139  	if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > nodesBits {
   140  		return fmt.Errorf("not enough bits to encode the nodes table")
   141  	}
   142  	if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
   143  		return fmt.Errorf("not enough bits to encode the children table")
   144  	}
   145  	if *version == "" {
   146  		if *url != defaultURL {
   147  			return fmt.Errorf("-version was not specified, and the -url is not the default one")
   148  		}
   149  		sha, date, err := gitCommit()
   150  		if err != nil {
   151  			return err
   152  		}
   153  		*version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
   154  	}
   155  	var r io.Reader = os.Stdin
   156  	if *url != "" {
   157  		res, err := http.Get(*url)
   158  		if err != nil {
   159  			return err
   160  		}
   161  		if res.StatusCode != http.StatusOK {
   162  			return fmt.Errorf("bad GET status for %s: %s", *url, res.Status)
   163  		}
   164  		r = res.Body
   165  		defer res.Body.Close()
   166  	}
   167  
   168  	var root node
   169  	icann := false
   170  	br := bufio.NewReader(r)
   171  	for {
   172  		s, err := br.ReadString('\n')
   173  		if err != nil {
   174  			if err == io.EOF {
   175  				break
   176  			}
   177  			return err
   178  		}
   179  		s = strings.TrimSpace(s)
   180  		if strings.Contains(s, "BEGIN ICANN DOMAINS") {
   181  			if len(rules) != 0 {
   182  				return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`)
   183  			}
   184  			icann = true
   185  			continue
   186  		}
   187  		if strings.Contains(s, "END ICANN DOMAINS") {
   188  			icann, numICANNRules = false, len(rules)
   189  			continue
   190  		}
   191  		if s == "" || strings.HasPrefix(s, "//") {
   192  			continue
   193  		}
   194  		s, err = idna.ToASCII(s)
   195  		if err != nil {
   196  			return err
   197  		}
   198  		if !validSuffixRE.MatchString(s) {
   199  			return fmt.Errorf("bad publicsuffix.org list data: %q", s)
   200  		}
   201  
   202  		if *subset {
   203  			switch {
   204  			case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
   205  			case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
   206  			case s == "ao" || strings.HasSuffix(s, ".ao"):
   207  			case s == "ar" || strings.HasSuffix(s, ".ar"):
   208  			case s == "arpa" || strings.HasSuffix(s, ".arpa"):
   209  			case s == "cy" || strings.HasSuffix(s, ".cy"):
   210  			case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
   211  			case s == "jp":
   212  			case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
   213  			case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
   214  			case s == "om" || strings.HasSuffix(s, ".om"):
   215  			case s == "uk" || strings.HasSuffix(s, ".uk"):
   216  			case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
   217  			case s == "tw" || strings.HasSuffix(s, ".tw"):
   218  			case s == "zw" || strings.HasSuffix(s, ".zw"):
   219  			case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
   220  				// xn--p1ai is Russian-Cyrillic "рф".
   221  			default:
   222  				continue
   223  			}
   224  		}
   225  
   226  		rules = append(rules, s)
   227  
   228  		nt, wildcard := nodeTypeNormal, false
   229  		switch {
   230  		case strings.HasPrefix(s, "*."):
   231  			s, nt = s[2:], nodeTypeParentOnly
   232  			wildcard = true
   233  		case strings.HasPrefix(s, "!"):
   234  			s, nt = s[1:], nodeTypeException
   235  		}
   236  		labels := strings.Split(s, ".")
   237  		for n, i := &root, len(labels)-1; i >= 0; i-- {
   238  			label := labels[i]
   239  			n = n.child(label)
   240  			if i == 0 {
   241  				if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
   242  					n.nodeType = nt
   243  				}
   244  				n.icann = n.icann && icann
   245  				n.wildcard = n.wildcard || wildcard
   246  			}
   247  			labelsMap[label] = true
   248  		}
   249  	}
   250  	labelsList = make([]string, 0, len(labelsMap))
   251  	for label := range labelsMap {
   252  		labelsList = append(labelsList, label)
   253  	}
   254  	sort.Strings(labelsList)
   255  
   256  	combinedText = combineText(labelsList)
   257  	if combinedText == "" {
   258  		return fmt.Errorf("internal error: combineText returned no text")
   259  	}
   260  	for _, label := range labelsList {
   261  		offset, length := strings.Index(combinedText, label), len(label)
   262  		if offset < 0 {
   263  			return fmt.Errorf("internal error: could not find %q in text %q", label, combinedText)
   264  		}
   265  		maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
   266  		if offset >= 1<<nodesBitsTextOffset {
   267  			return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
   268  		}
   269  		if length >= 1<<nodesBitsTextLength {
   270  			return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
   271  		}
   272  		labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
   273  	}
   274  
   275  	if err := root.walk(assignIndexes); err != nil {
   276  		return err
   277  	}
   278  
   279  	if err := generate(printMetadata, &root, "table.go"); err != nil {
   280  		return err
   281  	}
   282  	if err := generateBinaryData(&root, combinedText); err != nil {
   283  		return err
   284  	}
   285  	if err := generate(printTest, &root, "table_test.go"); err != nil {
   286  		return err
   287  	}
   288  	return nil
   289  }
   290  
   291  func generate(p func(io.Writer, *node) error, root *node, filename string) error {
   292  	buf := new(bytes.Buffer)
   293  	if err := p(buf, root); err != nil {
   294  		return err
   295  	}
   296  	b, err := format.Source(buf.Bytes())
   297  	if err != nil {
   298  		return err
   299  	}
   300  	return os.WriteFile(filename, b, 0644)
   301  }
   302  
   303  func gitCommit() (sha, date string, retErr error) {
   304  	res, err := http.Get(gitCommitURL)
   305  	if err != nil {
   306  		return "", "", err
   307  	}
   308  	if res.StatusCode != http.StatusOK {
   309  		return "", "", fmt.Errorf("bad GET status for %s: %s", gitCommitURL, res.Status)
   310  	}
   311  	defer res.Body.Close()
   312  	b, err := io.ReadAll(res.Body)
   313  	if err != nil {
   314  		return "", "", err
   315  	}
   316  	if m := shaRE.FindSubmatch(b); m != nil {
   317  		sha = string(m[1])
   318  	}
   319  	if m := dateRE.FindSubmatch(b); m != nil {
   320  		date = string(m[1])
   321  	}
   322  	if sha == "" || date == "" {
   323  		retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
   324  	}
   325  	return sha, date, retErr
   326  }
   327  
   328  func printTest(w io.Writer, n *node) error {
   329  	fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
   330  	fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules)
   331  	for _, rule := range rules {
   332  		fmt.Fprintf(w, "%q,\n", rule)
   333  	}
   334  	fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
   335  	if err := n.walk(func(n *node) error {
   336  		return printNodeLabel(w, n)
   337  	}); err != nil {
   338  		return err
   339  	}
   340  	fmt.Fprintf(w, "}\n")
   341  	return nil
   342  }
   343  
   344  func generateBinaryData(root *node, combinedText string) error {
   345  	if err := os.WriteFile("data/text", []byte(combinedText), 0666); err != nil {
   346  		return err
   347  	}
   348  
   349  	var nodes []byte
   350  	if err := root.walk(func(n *node) error {
   351  		for _, c := range n.children {
   352  			nodes = appendNodeEncoding(nodes, c)
   353  		}
   354  		return nil
   355  	}); err != nil {
   356  		return err
   357  	}
   358  	if err := os.WriteFile("data/nodes", nodes, 0666); err != nil {
   359  		return err
   360  	}
   361  
   362  	var children []byte
   363  	for _, c := range childrenEncoding {
   364  		children = binary.BigEndian.AppendUint32(children, c)
   365  	}
   366  	if err := os.WriteFile("data/children", children, 0666); err != nil {
   367  		return err
   368  	}
   369  
   370  	return nil
   371  }
   372  
   373  func appendNodeEncoding(b []byte, n *node) []byte {
   374  	encoding := labelEncoding[n.label]
   375  	if n.icann {
   376  		encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
   377  	}
   378  	encoding |= uint64(n.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
   379  	for i := nodesBits - 8; i >= 0; i -= 8 {
   380  		b = append(b, byte((encoding>>i)&0xff))
   381  	}
   382  	return b
   383  }
   384  
   385  func printMetadata(w io.Writer, n *node) error {
   386  	const header = `// generated by go run gen.go; DO NOT EDIT
   387  
   388  package publicsuffix
   389  
   390  import _ "embed"
   391  
   392  const version = %q
   393  
   394  const (
   395  	nodesBits           = %d
   396  	nodesBitsChildren   = %d
   397  	nodesBitsICANN      = %d
   398  	nodesBitsTextOffset = %d
   399  	nodesBitsTextLength = %d
   400  
   401  	childrenBitsWildcard = %d
   402  	childrenBitsNodeType = %d
   403  	childrenBitsHi       = %d
   404  	childrenBitsLo       = %d
   405  )
   406  
   407  const (
   408  	nodeTypeNormal     = %d
   409  	nodeTypeException  = %d
   410  	nodeTypeParentOnly = %d
   411  )
   412  
   413  // numTLD is the number of top level domains.
   414  const numTLD = %d
   415  
   416  // text is the combined text of all labels.
   417  //
   418  //go:embed data/text
   419  var text string
   420  
   421  `
   422  	fmt.Fprintf(w, header, *version,
   423  		nodesBits,
   424  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
   425  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
   426  		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
   427  	fmt.Fprintf(w, `
   428  // nodes is the list of nodes. Each node is represented as a %v-bit integer,
   429  // which encodes the node's children, wildcard bit and node type (as an index
   430  // into the children array), ICANN bit and text.
   431  //
   432  // The layout within the node, from MSB to LSB, is:
   433  //	[%2d bits] unused
   434  //	[%2d bits] children index
   435  //	[%2d bits] ICANN bit
   436  //	[%2d bits] text index
   437  //	[%2d bits] text length
   438  //
   439  //go:embed data/nodes
   440  var nodes uint40String
   441  `,
   442  		nodesBits,
   443  		nodesBits-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
   444  		nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
   445  	fmt.Fprintf(w, `
   446  // children is the list of nodes' children, the parent's wildcard bit and the
   447  // parent's node type. If a node has no children then their children index
   448  // will be in the range [0, 6), depending on the wildcard bit and node type.
   449  //
   450  // The layout within the uint32, from MSB to LSB, is:
   451  //	[%2d bits] unused
   452  //	[%2d bits] wildcard bit
   453  //	[%2d bits] node type
   454  //	[%2d bits] high nodes index (exclusive) of children
   455  //	[%2d bits] low nodes index (inclusive) of children
   456  //
   457  //go:embed data/children
   458  var children uint32String
   459  `,
   460  		32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
   461  		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
   462  
   463  	fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
   464  	fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
   465  	fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
   466  	fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
   467  	fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
   468  	return nil
   469  }
   470  
   471  type node struct {
   472  	label    string
   473  	nodeType int
   474  	icann    bool
   475  	wildcard bool
   476  	// nodesIndex and childrenIndex are the index of this node in the nodes
   477  	// and the index of its children offset/length in the children arrays.
   478  	nodesIndex, childrenIndex int
   479  	// firstChild is the index of this node's first child, or zero if this
   480  	// node has no children.
   481  	firstChild int
   482  	// children are the node's children, in strictly increasing node label order.
   483  	children []*node
   484  }
   485  
   486  func (n *node) walk(f func(*node) error) error {
   487  	if err := f(n); err != nil {
   488  		return err
   489  	}
   490  	for _, c := range n.children {
   491  		if err := c.walk(f); err != nil {
   492  			return err
   493  		}
   494  	}
   495  	return nil
   496  }
   497  
   498  // child returns the child of n with the given label. The child is created if
   499  // it did not exist beforehand.
   500  func (n *node) child(label string) *node {
   501  	for _, c := range n.children {
   502  		if c.label == label {
   503  			return c
   504  		}
   505  	}
   506  	c := &node{
   507  		label:    label,
   508  		nodeType: nodeTypeParentOnly,
   509  		icann:    true,
   510  	}
   511  	n.children = append(n.children, c)
   512  	sort.Sort(byLabel(n.children))
   513  	return c
   514  }
   515  
   516  type byLabel []*node
   517  
   518  func (b byLabel) Len() int           { return len(b) }
   519  func (b byLabel) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
   520  func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
   521  
   522  var nextNodesIndex int
   523  
   524  // childrenEncoding are the encoded entries in the generated children array.
   525  // All these pre-defined entries have no children.
   526  var childrenEncoding = []uint32{
   527  	0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
   528  	1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
   529  	2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
   530  	4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
   531  	5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
   532  	6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
   533  }
   534  
   535  var firstCallToAssignIndexes = true
   536  
   537  func assignIndexes(n *node) error {
   538  	if len(n.children) != 0 {
   539  		// Assign nodesIndex.
   540  		n.firstChild = nextNodesIndex
   541  		for _, c := range n.children {
   542  			c.nodesIndex = nextNodesIndex
   543  			nextNodesIndex++
   544  		}
   545  
   546  		// The root node's children is implicit.
   547  		if firstCallToAssignIndexes {
   548  			firstCallToAssignIndexes = false
   549  			return nil
   550  		}
   551  
   552  		// Assign childrenIndex.
   553  		maxChildren = max(maxChildren, len(childrenEncoding))
   554  		if len(childrenEncoding) >= 1<<nodesBitsChildren {
   555  			return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
   556  		}
   557  		n.childrenIndex = len(childrenEncoding)
   558  		lo := uint32(n.firstChild)
   559  		hi := lo + uint32(len(n.children))
   560  		maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
   561  		if lo >= 1<<childrenBitsLo {
   562  			return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
   563  		}
   564  		if hi >= 1<<childrenBitsHi {
   565  			return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
   566  		}
   567  		enc := hi<<childrenBitsLo | lo
   568  		enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
   569  		if n.wildcard {
   570  			enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
   571  		}
   572  		childrenEncoding = append(childrenEncoding, enc)
   573  	} else {
   574  		n.childrenIndex = n.nodeType
   575  		if n.wildcard {
   576  			n.childrenIndex += numNodeType
   577  		}
   578  	}
   579  	return nil
   580  }
   581  
   582  func printNodeLabel(w io.Writer, n *node) error {
   583  	for _, c := range n.children {
   584  		fmt.Fprintf(w, "%q,\n", c.label)
   585  	}
   586  	return nil
   587  }
   588  
   589  func icannStr(icann bool) string {
   590  	if icann {
   591  		return "I"
   592  	}
   593  	return " "
   594  }
   595  
   596  func wildcardStr(wildcard bool) string {
   597  	if wildcard {
   598  		return "*"
   599  	}
   600  	return " "
   601  }
   602  
   603  // combineText combines all the strings in labelsList to form one giant string.
   604  // Overlapping strings will be merged: "arpa" and "parliament" could yield
   605  // "arparliament".
   606  func combineText(labelsList []string) string {
   607  	beforeLength := 0
   608  	for _, s := range labelsList {
   609  		beforeLength += len(s)
   610  	}
   611  
   612  	text := crush(removeSubstrings(labelsList))
   613  	if *v {
   614  		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
   615  	}
   616  	return text
   617  }
   618  
   619  type byLength []string
   620  
   621  func (s byLength) Len() int           { return len(s) }
   622  func (s byLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
   623  func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
   624  
   625  // removeSubstrings returns a copy of its input with any strings removed
   626  // that are substrings of other provided strings.
   627  func removeSubstrings(input []string) []string {
   628  	// Make a copy of input.
   629  	ss := append(make([]string, 0, len(input)), input...)
   630  	sort.Sort(byLength(ss))
   631  
   632  	for i, shortString := range ss {
   633  		// For each string, only consider strings higher than it in sort order, i.e.
   634  		// of equal length or greater.
   635  		for _, longString := range ss[i+1:] {
   636  			if strings.Contains(longString, shortString) {
   637  				ss[i] = ""
   638  				break
   639  			}
   640  		}
   641  	}
   642  
   643  	// Remove the empty strings.
   644  	sort.Strings(ss)
   645  	for len(ss) > 0 && ss[0] == "" {
   646  		ss = ss[1:]
   647  	}
   648  	return ss
   649  }
   650  
   651  // crush combines a list of strings, taking advantage of overlaps. It returns a
   652  // single string that contains each input string as a substring.
   653  func crush(ss []string) string {
   654  	maxLabelLen := 0
   655  	for _, s := range ss {
   656  		if maxLabelLen < len(s) {
   657  			maxLabelLen = len(s)
   658  		}
   659  	}
   660  
   661  	for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
   662  		prefixes := makePrefixMap(ss, prefixLen)
   663  		for i, s := range ss {
   664  			if len(s) <= prefixLen {
   665  				continue
   666  			}
   667  			mergeLabel(ss, i, prefixLen, prefixes)
   668  		}
   669  	}
   670  
   671  	return strings.Join(ss, "")
   672  }
   673  
   674  // mergeLabel merges the label at ss[i] with the first available matching label
   675  // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
   676  // "prefixLen" characters in the matching label.
   677  // It will merge ss[i] repeatedly until no more matches are available.
   678  // All matching labels merged into ss[i] are replaced by "".
   679  func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
   680  	s := ss[i]
   681  	suffix := s[len(s)-prefixLen:]
   682  	for _, j := range prefixes[suffix] {
   683  		// Empty strings mean "already used." Also avoid merging with self.
   684  		if ss[j] == "" || i == j {
   685  			continue
   686  		}
   687  		if *v {
   688  			fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
   689  				prefixLen, i, j, ss[i], ss[j], suffix)
   690  		}
   691  		ss[i] += ss[j][prefixLen:]
   692  		ss[j] = ""
   693  		// ss[i] has a new suffix, so merge again if possible.
   694  		// Note: we only have to merge again at the same prefix length. Shorter
   695  		// prefix lengths will be handled in the next iteration of crush's for loop.
   696  		// Can there be matches for longer prefix lengths, introduced by the merge?
   697  		// I believe that any such matches would by necessity have been eliminated
   698  		// during substring removal or merged at a higher prefix length. For
   699  		// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
   700  		// would yield "abcde", which could be merged with "bcdef." However, in
   701  		// practice "cde" would already have been elimintated by removeSubstrings.
   702  		mergeLabel(ss, i, prefixLen, prefixes)
   703  		return
   704  	}
   705  }
   706  
   707  // prefixMap maps from a prefix to a list of strings containing that prefix. The
   708  // list of strings is represented as indexes into a slice of strings stored
   709  // elsewhere.
   710  type prefixMap map[string][]int
   711  
   712  // makePrefixMap constructs a prefixMap from a slice of strings.
   713  func makePrefixMap(ss []string, prefixLen int) prefixMap {
   714  	prefixes := make(prefixMap)
   715  	for i, s := range ss {
   716  		// We use < rather than <= because if a label matches on a prefix equal to
   717  		// its full length, that's actually a substring match handled by
   718  		// removeSubstrings.
   719  		if prefixLen < len(s) {
   720  			prefix := s[:prefixLen]
   721  			prefixes[prefix] = append(prefixes[prefix], i)
   722  		}
   723  	}
   724  
   725  	return prefixes
   726  }
   727  

View as plain text