...

Source file src/github.com/rivo/uniseg/gen_properties.go

Documentation: github.com/rivo/uniseg

     1  //go:build generate
     2  
     3  // This program generates a Go property file from Unicode Character
     4  // Database auxiliary data files. The command line arguments are as follows:
     5  //
     6  //  1. The name of the Unicode data file (just the filename, without extension).
     7  //     Can be "-" (to skip) if the emoji flag is included.
     8  //  2. The name of the locally generated Go file.
     9  //  3. The name of the slice mapping code points to properties.
    10  //  4. The name of the generator, for logging purposes.
    11  //  5. (Optional) Flags, comma-separated. The following flags are available:
    12  //     - "emojis=<property>": include the specified emoji properties (e.g.
    13  //     "Extended_Pictographic").
    14  //     - "gencat": include general category properties.
    15  //
    16  //go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
    17  //go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
    18  //go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
    19  //go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
    20  //go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
    21  //go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
    22  package main
    23  
    24  import (
    25  	"bufio"
    26  	"bytes"
    27  	"errors"
    28  	"fmt"
    29  	"go/format"
    30  	"io/ioutil"
    31  	"log"
    32  	"net/http"
    33  	"os"
    34  	"regexp"
    35  	"sort"
    36  	"strconv"
    37  	"strings"
    38  	"time"
    39  )
    40  
    41  // We want to use a specific Unicode version rather than the latest. When the
    42  // package is upgraded to a new version, change these to regenerate the files.
    43  const (
    44  	propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
    45  	emojiURL    = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
    46  )
    47  
    48  // The regular expression for a line containing a code point range property.
    49  var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
    50  
    51  func main() {
    52  	if len(os.Args) < 5 {
    53  		fmt.Println("Not enough arguments, see code for details")
    54  		os.Exit(1)
    55  	}
    56  
    57  	log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
    58  	log.SetFlags(0)
    59  
    60  	// Parse flags.
    61  	flags := make(map[string]string)
    62  	if len(os.Args) >= 6 {
    63  		for _, flag := range strings.Split(os.Args[5], ",") {
    64  			flagFields := strings.Split(flag, "=")
    65  			if len(flagFields) == 1 {
    66  				flags[flagFields[0]] = "yes"
    67  			} else {
    68  				flags[flagFields[0]] = flagFields[1]
    69  			}
    70  		}
    71  	}
    72  
    73  	// Parse the text file and generate Go source code from it.
    74  	_, includeGeneralCategory := flags["gencat"]
    75  	var mainURL string
    76  	if os.Args[1] != "-" {
    77  		mainURL = fmt.Sprintf(propertyURL, os.Args[1])
    78  	}
    79  	src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
    80  	if err != nil {
    81  		log.Fatal(err)
    82  	}
    83  
    84  	// Format the Go code.
    85  	formatted, err := format.Source([]byte(src))
    86  	if err != nil {
    87  		log.Fatal("gofmt:", err)
    88  	}
    89  
    90  	// Save it to the (local) target file.
    91  	log.Print("Writing to ", os.Args[2])
    92  	if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
    93  		log.Fatal(err)
    94  	}
    95  }
    96  
    97  // parse parses the Unicode Properties text files located at the given URLs and
    98  // returns their equivalent Go source code to be used in the uniseg package. If
    99  // "emojiProperty" is not an empty string, emoji code points for that emoji
   100  // property (e.g. "Extended_Pictographic") will be included. In those cases, you
   101  // may pass an empty "propertyURL" to skip parsing the main properties file. If
   102  // "includeGeneralCategory" is true, the Unicode General Category property will
   103  // be extracted from the comments and included in the output.
   104  func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
   105  	if propertyURL == "" && emojiProperty == "" {
   106  		return "", errors.New("no properties to parse")
   107  	}
   108  
   109  	// Temporary buffer to hold properties.
   110  	var properties [][4]string
   111  
   112  	// Open the first URL.
   113  	if propertyURL != "" {
   114  		log.Printf("Parsing %s", propertyURL)
   115  		res, err := http.Get(propertyURL)
   116  		if err != nil {
   117  			return "", err
   118  		}
   119  		in1 := res.Body
   120  		defer in1.Close()
   121  
   122  		// Parse it.
   123  		scanner := bufio.NewScanner(in1)
   124  		num := 0
   125  		for scanner.Scan() {
   126  			num++
   127  			line := strings.TrimSpace(scanner.Text())
   128  
   129  			// Skip comments and empty lines.
   130  			if strings.HasPrefix(line, "#") || line == "" {
   131  				continue
   132  			}
   133  
   134  			// Everything else must be a code point range, a property and a comment.
   135  			from, to, property, comment, err := parseProperty(line)
   136  			if err != nil {
   137  				return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
   138  			}
   139  			properties = append(properties, [4]string{from, to, property, comment})
   140  		}
   141  		if err := scanner.Err(); err != nil {
   142  			return "", err
   143  		}
   144  	}
   145  
   146  	// Open the second URL.
   147  	if emojiProperty != "" {
   148  		log.Printf("Parsing %s", emojiURL)
   149  		res, err := http.Get(emojiURL)
   150  		if err != nil {
   151  			return "", err
   152  		}
   153  		in2 := res.Body
   154  		defer in2.Close()
   155  
   156  		// Parse it.
   157  		scanner := bufio.NewScanner(in2)
   158  		num := 0
   159  		for scanner.Scan() {
   160  			num++
   161  			line := scanner.Text()
   162  
   163  			// Skip comments, empty lines, and everything not containing
   164  			// "Extended_Pictographic".
   165  			if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
   166  				continue
   167  			}
   168  
   169  			// Everything else must be a code point range, a property and a comment.
   170  			from, to, property, comment, err := parseProperty(line)
   171  			if err != nil {
   172  				return "", fmt.Errorf("emojis line %d: %v", num, err)
   173  			}
   174  			properties = append(properties, [4]string{from, to, property, comment})
   175  		}
   176  		if err := scanner.Err(); err != nil {
   177  			return "", err
   178  		}
   179  	}
   180  
   181  	// Avoid overflow during binary search.
   182  	if len(properties) >= 1<<31 {
   183  		return "", errors.New("too many properties")
   184  	}
   185  
   186  	// Sort properties.
   187  	sort.Slice(properties, func(i, j int) bool {
   188  		left, _ := strconv.ParseUint(properties[i][0], 16, 64)
   189  		right, _ := strconv.ParseUint(properties[j][0], 16, 64)
   190  		return left < right
   191  	})
   192  
   193  	// Header.
   194  	var (
   195  		buf          bytes.Buffer
   196  		emojiComment string
   197  	)
   198  	columns := 3
   199  	if includeGeneralCategory {
   200  		columns = 4
   201  	}
   202  	if emojiURL != "" {
   203  		emojiComment = `
   204  // and
   205  // ` + emojiURL + `
   206  // ("Extended_Pictographic" only)`
   207  	}
   208  	buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
   209  
   210  package uniseg
   211  
   212  // ` + os.Args[3] + ` are taken from
   213  // ` + propertyURL + emojiComment + `
   214  // on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
   215  // license agreement.
   216  var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
   217  	`)
   218  
   219  	// Properties.
   220  	for _, prop := range properties {
   221  		if includeGeneralCategory {
   222  			generalCategory := "gc" + prop[3][:2]
   223  			if generalCategory == "gcL&" {
   224  				generalCategory = "gcLC"
   225  			}
   226  			prop[3] = prop[3][3:]
   227  			fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
   228  		} else {
   229  			fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
   230  		}
   231  	}
   232  
   233  	// Tail.
   234  	buf.WriteString("}")
   235  
   236  	return buf.String(), nil
   237  }
   238  
   239  // parseProperty parses a line of the Unicode properties text file containing a
   240  // property for a code point range and returns it along with its comment.
   241  func parseProperty(line string) (from, to, property, comment string, err error) {
   242  	fields := propertyPattern.FindStringSubmatch(line)
   243  	if fields == nil {
   244  		err = errors.New("no property found")
   245  		return
   246  	}
   247  	from = fields[1]
   248  	to = fields[3]
   249  	if to == "" {
   250  		to = from
   251  	}
   252  	property = fields[4]
   253  	comment = fields[5]
   254  	return
   255  }
   256  
   257  // translateProperty translates a property name as used in the Unicode data file
   258  // to a variable used in the Go code.
   259  func translateProperty(prefix, property string) string {
   260  	return prefix + strings.ReplaceAll(property, "_", "")
   261  }
   262  

View as plain text