1
2
3
4
5
6
7 package main
8
9
10
11
12
13
14
15
16
17
18
19
20
21 import (
22 "bufio"
23 "bytes"
24 "encoding/binary"
25 "flag"
26 "fmt"
27 "go/format"
28 "io"
29 "net/http"
30 "os"
31 "regexp"
32 "sort"
33 "strings"
34
35 "golang.org/x/net/idna"
36 )
37
38 const (
39
40
41 nodesBits = 40
42
43
44 nodesBitsChildren = 10
45 nodesBitsICANN = 1
46 nodesBitsTextOffset = 16
47 nodesBitsTextLength = 6
48
49
50 childrenBitsWildcard = 1
51 childrenBitsNodeType = 2
52 childrenBitsHi = 14
53 childrenBitsLo = 14
54 )
55
56 var (
57 combinedText string
58 maxChildren int
59 maxTextOffset int
60 maxTextLength int
61 maxHi uint32
62 maxLo uint32
63 )
64
65 func max(a, b int) int {
66 if a < b {
67 return b
68 }
69 return a
70 }
71
72 func u32max(a, b uint32) uint32 {
73 if a < b {
74 return b
75 }
76 return a
77 }
78
79 const (
80 nodeTypeNormal = 0
81 nodeTypeException = 1
82 nodeTypeParentOnly = 2
83 numNodeType = 3
84 )
85
86 func nodeTypeStr(n int) string {
87 switch n {
88 case nodeTypeNormal:
89 return "+"
90 case nodeTypeException:
91 return "!"
92 case nodeTypeParentOnly:
93 return "o"
94 }
95 panic("unreachable")
96 }
97
98 const (
99 defaultURL = "https://publicsuffix.org/list/effective_tld_names.dat"
100 gitCommitURL = "https://api.github.com/repos/publicsuffix/list/commits?path=public_suffix_list.dat"
101 )
102
103 var (
104 labelEncoding = map[string]uint64{}
105 labelsList = []string{}
106 labelsMap = map[string]bool{}
107 rules = []string{}
108 numICANNRules = 0
109
110
111
112
113 validSuffixRE = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
114
115 shaRE = regexp.MustCompile(`"sha":"([^"]+)"`)
116 dateRE = regexp.MustCompile(`"committer":{[^{]+"date":"([^"]+)"`)
117
118 subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
119 url = flag.String("url", defaultURL, "URL of the publicsuffix.org list. If empty, stdin is read instead")
120 v = flag.Bool("v", false, "verbose output (to stderr)")
121 version = flag.String("version", "", "the effective_tld_names.dat version")
122 )
123
124 func main() {
125 if err := main1(); err != nil {
126 fmt.Fprintln(os.Stderr, err)
127 os.Exit(1)
128 }
129 }
130
131 func main1() error {
132 flag.Parse()
133 if nodesBits > 64 {
134 return fmt.Errorf("nodesBits is too large")
135 }
136 if nodesBits%8 != 0 {
137 return fmt.Errorf("nodesBits must be a multiple of 8")
138 }
139 if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > nodesBits {
140 return fmt.Errorf("not enough bits to encode the nodes table")
141 }
142 if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
143 return fmt.Errorf("not enough bits to encode the children table")
144 }
145 if *version == "" {
146 if *url != defaultURL {
147 return fmt.Errorf("-version was not specified, and the -url is not the default one")
148 }
149 sha, date, err := gitCommit()
150 if err != nil {
151 return err
152 }
153 *version = fmt.Sprintf("publicsuffix.org's public_suffix_list.dat, git revision %s (%s)", sha, date)
154 }
155 var r io.Reader = os.Stdin
156 if *url != "" {
157 res, err := http.Get(*url)
158 if err != nil {
159 return err
160 }
161 if res.StatusCode != http.StatusOK {
162 return fmt.Errorf("bad GET status for %s: %s", *url, res.Status)
163 }
164 r = res.Body
165 defer res.Body.Close()
166 }
167
168 var root node
169 icann := false
170 br := bufio.NewReader(r)
171 for {
172 s, err := br.ReadString('\n')
173 if err != nil {
174 if err == io.EOF {
175 break
176 }
177 return err
178 }
179 s = strings.TrimSpace(s)
180 if strings.Contains(s, "BEGIN ICANN DOMAINS") {
181 if len(rules) != 0 {
182 return fmt.Errorf(`expected no rules before "BEGIN ICANN DOMAINS"`)
183 }
184 icann = true
185 continue
186 }
187 if strings.Contains(s, "END ICANN DOMAINS") {
188 icann, numICANNRules = false, len(rules)
189 continue
190 }
191 if s == "" || strings.HasPrefix(s, "//") {
192 continue
193 }
194 s, err = idna.ToASCII(s)
195 if err != nil {
196 return err
197 }
198 if !validSuffixRE.MatchString(s) {
199 return fmt.Errorf("bad publicsuffix.org list data: %q", s)
200 }
201
202 if *subset {
203 switch {
204 case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
205 case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
206 case s == "ao" || strings.HasSuffix(s, ".ao"):
207 case s == "ar" || strings.HasSuffix(s, ".ar"):
208 case s == "arpa" || strings.HasSuffix(s, ".arpa"):
209 case s == "cy" || strings.HasSuffix(s, ".cy"):
210 case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
211 case s == "jp":
212 case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
213 case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
214 case s == "om" || strings.HasSuffix(s, ".om"):
215 case s == "uk" || strings.HasSuffix(s, ".uk"):
216 case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
217 case s == "tw" || strings.HasSuffix(s, ".tw"):
218 case s == "zw" || strings.HasSuffix(s, ".zw"):
219 case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
220
221 default:
222 continue
223 }
224 }
225
226 rules = append(rules, s)
227
228 nt, wildcard := nodeTypeNormal, false
229 switch {
230 case strings.HasPrefix(s, "*."):
231 s, nt = s[2:], nodeTypeParentOnly
232 wildcard = true
233 case strings.HasPrefix(s, "!"):
234 s, nt = s[1:], nodeTypeException
235 }
236 labels := strings.Split(s, ".")
237 for n, i := &root, len(labels)-1; i >= 0; i-- {
238 label := labels[i]
239 n = n.child(label)
240 if i == 0 {
241 if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
242 n.nodeType = nt
243 }
244 n.icann = n.icann && icann
245 n.wildcard = n.wildcard || wildcard
246 }
247 labelsMap[label] = true
248 }
249 }
250 labelsList = make([]string, 0, len(labelsMap))
251 for label := range labelsMap {
252 labelsList = append(labelsList, label)
253 }
254 sort.Strings(labelsList)
255
256 combinedText = combineText(labelsList)
257 if combinedText == "" {
258 return fmt.Errorf("internal error: combineText returned no text")
259 }
260 for _, label := range labelsList {
261 offset, length := strings.Index(combinedText, label), len(label)
262 if offset < 0 {
263 return fmt.Errorf("internal error: could not find %q in text %q", label, combinedText)
264 }
265 maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
266 if offset >= 1<<nodesBitsTextOffset {
267 return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
268 }
269 if length >= 1<<nodesBitsTextLength {
270 return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
271 }
272 labelEncoding[label] = uint64(offset)<<nodesBitsTextLength | uint64(length)
273 }
274
275 if err := root.walk(assignIndexes); err != nil {
276 return err
277 }
278
279 if err := generate(printMetadata, &root, "table.go"); err != nil {
280 return err
281 }
282 if err := generateBinaryData(&root, combinedText); err != nil {
283 return err
284 }
285 if err := generate(printTest, &root, "table_test.go"); err != nil {
286 return err
287 }
288 return nil
289 }
290
291 func generate(p func(io.Writer, *node) error, root *node, filename string) error {
292 buf := new(bytes.Buffer)
293 if err := p(buf, root); err != nil {
294 return err
295 }
296 b, err := format.Source(buf.Bytes())
297 if err != nil {
298 return err
299 }
300 return os.WriteFile(filename, b, 0644)
301 }
302
303 func gitCommit() (sha, date string, retErr error) {
304 res, err := http.Get(gitCommitURL)
305 if err != nil {
306 return "", "", err
307 }
308 if res.StatusCode != http.StatusOK {
309 return "", "", fmt.Errorf("bad GET status for %s: %s", gitCommitURL, res.Status)
310 }
311 defer res.Body.Close()
312 b, err := io.ReadAll(res.Body)
313 if err != nil {
314 return "", "", err
315 }
316 if m := shaRE.FindSubmatch(b); m != nil {
317 sha = string(m[1])
318 }
319 if m := dateRE.FindSubmatch(b); m != nil {
320 date = string(m[1])
321 }
322 if sha == "" || date == "" {
323 retErr = fmt.Errorf("could not find commit SHA and date in %s", gitCommitURL)
324 }
325 return sha, date, retErr
326 }
327
328 func printTest(w io.Writer, n *node) error {
329 fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
330 fmt.Fprintf(w, "package publicsuffix\n\nconst numICANNRules = %d\n\nvar rules = [...]string{\n", numICANNRules)
331 for _, rule := range rules {
332 fmt.Fprintf(w, "%q,\n", rule)
333 }
334 fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
335 if err := n.walk(func(n *node) error {
336 return printNodeLabel(w, n)
337 }); err != nil {
338 return err
339 }
340 fmt.Fprintf(w, "}\n")
341 return nil
342 }
343
344 func generateBinaryData(root *node, combinedText string) error {
345 if err := os.WriteFile("data/text", []byte(combinedText), 0666); err != nil {
346 return err
347 }
348
349 var nodes []byte
350 if err := root.walk(func(n *node) error {
351 for _, c := range n.children {
352 nodes = appendNodeEncoding(nodes, c)
353 }
354 return nil
355 }); err != nil {
356 return err
357 }
358 if err := os.WriteFile("data/nodes", nodes, 0666); err != nil {
359 return err
360 }
361
362 var children []byte
363 for _, c := range childrenEncoding {
364 children = binary.BigEndian.AppendUint32(children, c)
365 }
366 if err := os.WriteFile("data/children", children, 0666); err != nil {
367 return err
368 }
369
370 return nil
371 }
372
373 func appendNodeEncoding(b []byte, n *node) []byte {
374 encoding := labelEncoding[n.label]
375 if n.icann {
376 encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
377 }
378 encoding |= uint64(n.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
379 for i := nodesBits - 8; i >= 0; i -= 8 {
380 b = append(b, byte((encoding>>i)&0xff))
381 }
382 return b
383 }
384
385 func printMetadata(w io.Writer, n *node) error {
386 const header = `// generated by go run gen.go; DO NOT EDIT
387
388 package publicsuffix
389
390 import _ "embed"
391
392 const version = %q
393
394 const (
395 nodesBits = %d
396 nodesBitsChildren = %d
397 nodesBitsICANN = %d
398 nodesBitsTextOffset = %d
399 nodesBitsTextLength = %d
400
401 childrenBitsWildcard = %d
402 childrenBitsNodeType = %d
403 childrenBitsHi = %d
404 childrenBitsLo = %d
405 )
406
407 const (
408 nodeTypeNormal = %d
409 nodeTypeException = %d
410 nodeTypeParentOnly = %d
411 )
412
413 // numTLD is the number of top level domains.
414 const numTLD = %d
415
416 // text is the combined text of all labels.
417 //
418 //go:embed data/text
419 var text string
420
421 `
422 fmt.Fprintf(w, header, *version,
423 nodesBits,
424 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
425 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
426 nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
427 fmt.Fprintf(w, `
428 // nodes is the list of nodes. Each node is represented as a %v-bit integer,
429 // which encodes the node's children, wildcard bit and node type (as an index
430 // into the children array), ICANN bit and text.
431 //
432 // The layout within the node, from MSB to LSB, is:
433 // [%2d bits] unused
434 // [%2d bits] children index
435 // [%2d bits] ICANN bit
436 // [%2d bits] text index
437 // [%2d bits] text length
438 //
439 //go:embed data/nodes
440 var nodes uint40String
441 `,
442 nodesBits,
443 nodesBits-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
444 nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
445 fmt.Fprintf(w, `
446 // children is the list of nodes' children, the parent's wildcard bit and the
447 // parent's node type. If a node has no children then their children index
448 // will be in the range [0, 6), depending on the wildcard bit and node type.
449 //
450 // The layout within the uint32, from MSB to LSB, is:
451 // [%2d bits] unused
452 // [%2d bits] wildcard bit
453 // [%2d bits] node type
454 // [%2d bits] high nodes index (exclusive) of children
455 // [%2d bits] low nodes index (inclusive) of children
456 //
457 //go:embed data/children
458 var children uint32String
459 `,
460 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
461 childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
462
463 fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
464 fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
465 fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
466 fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
467 fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
468 return nil
469 }
470
471 type node struct {
472 label string
473 nodeType int
474 icann bool
475 wildcard bool
476
477
478 nodesIndex, childrenIndex int
479
480
481 firstChild int
482
483 children []*node
484 }
485
486 func (n *node) walk(f func(*node) error) error {
487 if err := f(n); err != nil {
488 return err
489 }
490 for _, c := range n.children {
491 if err := c.walk(f); err != nil {
492 return err
493 }
494 }
495 return nil
496 }
497
498
499
500 func (n *node) child(label string) *node {
501 for _, c := range n.children {
502 if c.label == label {
503 return c
504 }
505 }
506 c := &node{
507 label: label,
508 nodeType: nodeTypeParentOnly,
509 icann: true,
510 }
511 n.children = append(n.children, c)
512 sort.Sort(byLabel(n.children))
513 return c
514 }
515
516 type byLabel []*node
517
518 func (b byLabel) Len() int { return len(b) }
519 func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
520 func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
521
522 var nextNodesIndex int
523
524
525
526 var childrenEncoding = []uint32{
527 0 << (childrenBitsLo + childrenBitsHi),
528 1 << (childrenBitsLo + childrenBitsHi),
529 2 << (childrenBitsLo + childrenBitsHi),
530 4 << (childrenBitsLo + childrenBitsHi),
531 5 << (childrenBitsLo + childrenBitsHi),
532 6 << (childrenBitsLo + childrenBitsHi),
533 }
534
535 var firstCallToAssignIndexes = true
536
537 func assignIndexes(n *node) error {
538 if len(n.children) != 0 {
539
540 n.firstChild = nextNodesIndex
541 for _, c := range n.children {
542 c.nodesIndex = nextNodesIndex
543 nextNodesIndex++
544 }
545
546
547 if firstCallToAssignIndexes {
548 firstCallToAssignIndexes = false
549 return nil
550 }
551
552
553 maxChildren = max(maxChildren, len(childrenEncoding))
554 if len(childrenEncoding) >= 1<<nodesBitsChildren {
555 return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
556 }
557 n.childrenIndex = len(childrenEncoding)
558 lo := uint32(n.firstChild)
559 hi := lo + uint32(len(n.children))
560 maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
561 if lo >= 1<<childrenBitsLo {
562 return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
563 }
564 if hi >= 1<<childrenBitsHi {
565 return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
566 }
567 enc := hi<<childrenBitsLo | lo
568 enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
569 if n.wildcard {
570 enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
571 }
572 childrenEncoding = append(childrenEncoding, enc)
573 } else {
574 n.childrenIndex = n.nodeType
575 if n.wildcard {
576 n.childrenIndex += numNodeType
577 }
578 }
579 return nil
580 }
581
582 func printNodeLabel(w io.Writer, n *node) error {
583 for _, c := range n.children {
584 fmt.Fprintf(w, "%q,\n", c.label)
585 }
586 return nil
587 }
588
589 func icannStr(icann bool) string {
590 if icann {
591 return "I"
592 }
593 return " "
594 }
595
596 func wildcardStr(wildcard bool) string {
597 if wildcard {
598 return "*"
599 }
600 return " "
601 }
602
603
604
605
606 func combineText(labelsList []string) string {
607 beforeLength := 0
608 for _, s := range labelsList {
609 beforeLength += len(s)
610 }
611
612 text := crush(removeSubstrings(labelsList))
613 if *v {
614 fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
615 }
616 return text
617 }
618
619 type byLength []string
620
621 func (s byLength) Len() int { return len(s) }
622 func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
623 func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
624
625
626
627 func removeSubstrings(input []string) []string {
628
629 ss := append(make([]string, 0, len(input)), input...)
630 sort.Sort(byLength(ss))
631
632 for i, shortString := range ss {
633
634
635 for _, longString := range ss[i+1:] {
636 if strings.Contains(longString, shortString) {
637 ss[i] = ""
638 break
639 }
640 }
641 }
642
643
644 sort.Strings(ss)
645 for len(ss) > 0 && ss[0] == "" {
646 ss = ss[1:]
647 }
648 return ss
649 }
650
651
652
653 func crush(ss []string) string {
654 maxLabelLen := 0
655 for _, s := range ss {
656 if maxLabelLen < len(s) {
657 maxLabelLen = len(s)
658 }
659 }
660
661 for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
662 prefixes := makePrefixMap(ss, prefixLen)
663 for i, s := range ss {
664 if len(s) <= prefixLen {
665 continue
666 }
667 mergeLabel(ss, i, prefixLen, prefixes)
668 }
669 }
670
671 return strings.Join(ss, "")
672 }
673
674
675
676
677
678
679 func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
680 s := ss[i]
681 suffix := s[len(s)-prefixLen:]
682 for _, j := range prefixes[suffix] {
683
684 if ss[j] == "" || i == j {
685 continue
686 }
687 if *v {
688 fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
689 prefixLen, i, j, ss[i], ss[j], suffix)
690 }
691 ss[i] += ss[j][prefixLen:]
692 ss[j] = ""
693
694
695
696
697
698
699
700
701
702 mergeLabel(ss, i, prefixLen, prefixes)
703 return
704 }
705 }
706
707
708
709
710 type prefixMap map[string][]int
711
712
713 func makePrefixMap(ss []string, prefixLen int) prefixMap {
714 prefixes := make(prefixMap)
715 for i, s := range ss {
716
717
718
719 if prefixLen < len(s) {
720 prefix := s[:prefixLen]
721 prefixes[prefix] = append(prefixes[prefix], i)
722 }
723 }
724
725 return prefixes
726 }
727
View as plain text