...

Source file src/github.com/gabriel-vasile/mimetype/internal/magic/text.go

Documentation: github.com/gabriel-vasile/mimetype/internal/magic

     1  package magic
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"strings"
     7  	"time"
     8  
     9  	"github.com/gabriel-vasile/mimetype/internal/charset"
    10  	"github.com/gabriel-vasile/mimetype/internal/json"
    11  )
    12  
// Signature-based detectors for text formats. Every value below is built by a
// helper (markup, xml, ciPrefix, shebang or prefix) from the byte sequences
// or XML signatures passed to it.
var (
	// HTML matches a Hypertext Markup Language file.
	HTML = markup(
		[]byte("<!DOCTYPE HTML"),
		[]byte("<HTML"),
		[]byte("<HEAD"),
		[]byte("<SCRIPT"),
		[]byte("<IFRAME"),
		[]byte("<H1"),
		[]byte("<DIV"),
		[]byte("<FONT"),
		[]byte("<TABLE"),
		[]byte("<A"),
		[]byte("<STYLE"),
		[]byte("<TITLE"),
		[]byte("<B"),
		[]byte("<BODY"),
		[]byte("<BR"),
		[]byte("<P"),
	)
	// XML matches an Extensible Markup Language file.
	XML = markup([]byte("<?XML"))
	// Owl2 matches an Owl ontology file.
	Owl2 = xml(newXMLSig("Ontology", `xmlns="http://www.w3.org/2002/07/owl#"`))
	// Rss matches a Rich Site Summary file.
	Rss = xml(newXMLSig("rss", ""))
	// Atom matches an Atom Syndication Format file.
	Atom = xml(newXMLSig("feed", `xmlns="http://www.w3.org/2005/Atom"`))
	// Kml matches a Keyhole Markup Language file. Four namespace variants are
	// accepted: the OpenGIS 2.2 one and the Google Earth 2.0, 2.1 and 2.2 ones.
	Kml = xml(
		newXMLSig("kml", `xmlns="http://www.opengis.net/kml/2.2"`),
		newXMLSig("kml", `xmlns="http://earth.google.com/kml/2.0"`),
		newXMLSig("kml", `xmlns="http://earth.google.com/kml/2.1"`),
		newXMLSig("kml", `xmlns="http://earth.google.com/kml/2.2"`),
	)
	// Xliff matches an XML Localization Interchange File Format file.
	Xliff = xml(newXMLSig("xliff", `xmlns="urn:oasis:names:tc:xliff:document:1.2"`))
	// Collada matches a COLLAborative Design Activity file.
	Collada = xml(newXMLSig("COLLADA", `xmlns="http://www.collada.org/2005/11/COLLADASchema"`))
	// Gml matches a Geography Markup Language file. The signatures carry an
	// empty first argument and differ only in the xmlns:gml declaration.
	Gml = xml(
		newXMLSig("", `xmlns:gml="http://www.opengis.net/gml"`),
		newXMLSig("", `xmlns:gml="http://www.opengis.net/gml/3.2"`),
		newXMLSig("", `xmlns:gml="http://www.opengis.net/gml/3.3/exr"`),
	)
	// Gpx matches a GPS Exchange Format file.
	Gpx = xml(newXMLSig("gpx", `xmlns="http://www.topografix.com/GPX/1/1"`))
	// Tcx matches a Training Center XML file.
	Tcx = xml(newXMLSig("TrainingCenterDatabase", `xmlns="http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2"`))
	// X3d matches an Extensible 3D Graphics file.
	X3d = xml(newXMLSig("X3D", `xmlns:xsd="http://www.w3.org/2001/XMLSchema-instance"`))
	// Amf matches an Additive Manufacturing XML file.
	Amf = xml(newXMLSig("amf", ""))
	// Threemf matches a 3D Manufacturing Format file.
	Threemf = xml(newXMLSig("model", `xmlns="http://schemas.microsoft.com/3dmanufacturing/core/2015/02"`))
	// Xfdf matches an XML Forms Data Format file.
	Xfdf = xml(newXMLSig("xfdf", `xmlns="http://ns.adobe.com/xfdf/"`))
	// VCard matches a Virtual Contact File. Both LF and CRLF line endings after
	// the BEGIN line are listed.
	VCard = ciPrefix([]byte("BEGIN:VCARD\n"), []byte("BEGIN:VCARD\r\n"))
	// ICalendar matches an iCalendar file. Both LF and CRLF line endings after
	// the BEGIN line are listed.
	ICalendar = ciPrefix([]byte("BEGIN:VCALENDAR\n"), []byte("BEGIN:VCALENDAR\r\n"))
	phpPageF  = ciPrefix(
		// PHP opening tags: the long "<?PHP" form and the short "<?" form
		// followed by a line feed, a carriage return or a space. Php tries
		// this matcher first.
		// NOTE(review): ciPrefix is presumably a case-insensitive prefix
		// match, which would explain the upper-case literal — confirm.
		[]byte("<?PHP"),
		[]byte("<?\n"),
		[]byte("<?\r"),
		[]byte("<? "),
	)
	// phpScriptF is the interpreter-path half of PHP detection; Php falls back
	// to it when phpPageF does not match.
	phpScriptF = shebang(
		[]byte("/usr/local/bin/php"),
		[]byte("/usr/bin/php"),
		[]byte("/usr/bin/env php"),
	)
	// Js matches a JavaScript file.
	Js = shebang(
		[]byte("/bin/node"),
		[]byte("/usr/bin/node"),
		[]byte("/bin/nodejs"),
		[]byte("/usr/bin/nodejs"),
		[]byte("/usr/bin/env node"),
		[]byte("/usr/bin/env nodejs"),
	)
	// Lua matches a Lua programming language file.
	Lua = shebang(
		[]byte("/usr/bin/lua"),
		[]byte("/usr/local/bin/lua"),
		[]byte("/usr/bin/env lua"),
	)
	// Perl matches a Perl programming language file.
	Perl = shebang(
		[]byte("/usr/bin/perl"),
		[]byte("/usr/bin/env perl"),
	)
	// Python matches a Python programming language file.
	Python = shebang(
		[]byte("/usr/bin/python"),
		[]byte("/usr/local/bin/python"),
		[]byte("/usr/bin/env python"),
	)
	// Tcl matches a Tcl programming language file. Paths for the tcl, tclsh
	// and wish interpreters are all listed.
	Tcl = shebang(
		[]byte("/usr/bin/tcl"),
		[]byte("/usr/local/bin/tcl"),
		[]byte("/usr/bin/env tcl"),
		[]byte("/usr/bin/tclsh"),
		[]byte("/usr/local/bin/tclsh"),
		[]byte("/usr/bin/env tclsh"),
		[]byte("/usr/bin/wish"),
		[]byte("/usr/local/bin/wish"),
		[]byte("/usr/bin/env wish"),
	)
	// Rtf matches a Rich Text Format file.
	Rtf = prefix([]byte("{\\rtf1"))
)
   126  
   127  // Text matches a plain text file.
   128  //
   129  // TODO: This function does not parse BOM-less UTF16 and UTF32 files. Not really
   130  // sure it should. Linux file utility also requires a BOM for UTF16 and UTF32.
   131  func Text(raw []byte, limit uint32) bool {
   132  	// First look for BOM.
   133  	if cset := charset.FromBOM(raw); cset != "" {
   134  		return true
   135  	}
   136  	// Binary data bytes as defined here: https://mimesniff.spec.whatwg.org/#binary-data-byte
   137  	for _, b := range raw {
   138  		if b <= 0x08 ||
   139  			b == 0x0B ||
   140  			0x0E <= b && b <= 0x1A ||
   141  			0x1C <= b && b <= 0x1F {
   142  			return false
   143  		}
   144  	}
   145  	return true
   146  }
   147  
   148  // Php matches a PHP: Hypertext Preprocessor file.
   149  func Php(raw []byte, limit uint32) bool {
   150  	if res := phpPageF(raw, limit); res {
   151  		return res
   152  	}
   153  	return phpScriptF(raw, limit)
   154  }
   155  
   156  // JSON matches a JavaScript Object Notation file.
   157  func JSON(raw []byte, limit uint32) bool {
   158  	raw = trimLWS(raw)
   159  	// #175 A single JSON string, number or bool is not considered JSON.
   160  	// JSON objects and arrays are reported as JSON.
   161  	if len(raw) < 2 || (raw[0] != '[' && raw[0] != '{') {
   162  		return false
   163  	}
   164  	parsed, err := json.Scan(raw)
   165  	// If the full file content was provided, check there is no error.
   166  	if limit == 0 || len(raw) < int(limit) {
   167  		return err == nil
   168  	}
   169  
   170  	// If a section of the file was provided, check if all of it was parsed.
   171  	return parsed == len(raw) && len(raw) > 0
   172  }
   173  
   174  // GeoJSON matches a RFC 7946 GeoJSON file.
   175  //
   176  // GeoJSON detection implies searching for key:value pairs like: `"type": "Feature"`
   177  // in the input.
   178  // BUG(gabriel-vasile): The "type" key should be searched for in the root object.
   179  func GeoJSON(raw []byte, limit uint32) bool {
   180  	raw = trimLWS(raw)
   181  	if len(raw) == 0 {
   182  		return false
   183  	}
   184  	// GeoJSON is always a JSON object, not a JSON array or any other JSON value.
   185  	if raw[0] != '{' {
   186  		return false
   187  	}
   188  
   189  	s := []byte(`"type"`)
   190  	si, sl := bytes.Index(raw, s), len(s)
   191  
   192  	if si == -1 {
   193  		return false
   194  	}
   195  
   196  	// If the "type" string is the suffix of the input,
   197  	// there is no need to search for the value of the key.
   198  	if si+sl == len(raw) {
   199  		return false
   200  	}
   201  	// Skip the "type" part.
   202  	raw = raw[si+sl:]
   203  	// Skip any whitespace before the colon.
   204  	raw = trimLWS(raw)
   205  	// Check for colon.
   206  	if len(raw) == 0 || raw[0] != ':' {
   207  		return false
   208  	}
   209  	// Skip any whitespace after the colon.
   210  	raw = trimLWS(raw[1:])
   211  
   212  	geoJSONTypes := [][]byte{
   213  		[]byte(`"Feature"`),
   214  		[]byte(`"FeatureCollection"`),
   215  		[]byte(`"Point"`),
   216  		[]byte(`"LineString"`),
   217  		[]byte(`"Polygon"`),
   218  		[]byte(`"MultiPoint"`),
   219  		[]byte(`"MultiLineString"`),
   220  		[]byte(`"MultiPolygon"`),
   221  		[]byte(`"GeometryCollection"`),
   222  	}
   223  	for _, t := range geoJSONTypes {
   224  		if bytes.HasPrefix(raw, t) {
   225  			return true
   226  		}
   227  	}
   228  
   229  	return false
   230  }
   231  
   232  // NdJSON matches a Newline delimited JSON file. All complete lines from raw
   233  // must be valid JSON documents meaning they contain one of the valid JSON data
   234  // types.
   235  func NdJSON(raw []byte, limit uint32) bool {
   236  	lCount, hasObjOrArr := 0, false
   237  	sc := bufio.NewScanner(dropLastLine(raw, limit))
   238  	for sc.Scan() {
   239  		l := sc.Bytes()
   240  		// Empty lines are allowed in NDJSON.
   241  		if l = trimRWS(trimLWS(l)); len(l) == 0 {
   242  			continue
   243  		}
   244  		_, err := json.Scan(l)
   245  		if err != nil {
   246  			return false
   247  		}
   248  		if l[0] == '[' || l[0] == '{' {
   249  			hasObjOrArr = true
   250  		}
   251  		lCount++
   252  	}
   253  
   254  	return lCount > 1 && hasObjOrArr
   255  }
   256  
   257  // HAR matches a HAR Spec file.
   258  // Spec: http://www.softwareishard.com/blog/har-12-spec/
   259  func HAR(raw []byte, limit uint32) bool {
   260  	s := []byte(`"log"`)
   261  	si, sl := bytes.Index(raw, s), len(s)
   262  
   263  	if si == -1 {
   264  		return false
   265  	}
   266  
   267  	// If the "log" string is the suffix of the input,
   268  	// there is no need to search for the value of the key.
   269  	if si+sl == len(raw) {
   270  		return false
   271  	}
   272  	// Skip the "log" part.
   273  	raw = raw[si+sl:]
   274  	// Skip any whitespace before the colon.
   275  	raw = trimLWS(raw)
   276  	// Check for colon.
   277  	if len(raw) == 0 || raw[0] != ':' {
   278  		return false
   279  	}
   280  	// Skip any whitespace after the colon.
   281  	raw = trimLWS(raw[1:])
   282  
   283  	harJSONTypes := [][]byte{
   284  		[]byte(`"version"`),
   285  		[]byte(`"creator"`),
   286  		[]byte(`"entries"`),
   287  	}
   288  	for _, t := range harJSONTypes {
   289  		si := bytes.Index(raw, t)
   290  		if si > -1 {
   291  			return true
   292  		}
   293  	}
   294  
   295  	return false
   296  }
   297  
   298  // Svg matches a SVG file.
   299  func Svg(raw []byte, limit uint32) bool {
   300  	return bytes.Contains(raw, []byte("<svg"))
   301  }
   302  
   303  // Srt matches a SubRip file.
   304  func Srt(in []byte, _ uint32) bool {
   305  	s := bufio.NewScanner(bytes.NewReader(in))
   306  	if !s.Scan() {
   307  		return false
   308  	}
   309  	// First line must be 1.
   310  	if s.Text() != "1" {
   311  		return false
   312  	}
   313  
   314  	if !s.Scan() {
   315  		return false
   316  	}
   317  	secondLine := s.Text()
   318  	// Timestamp format (e.g: 00:02:16,612 --> 00:02:19,376) limits secondLine
   319  	// length to exactly 29 characters.
   320  	if len(secondLine) != 29 {
   321  		return false
   322  	}
   323  	// Decimal separator of fractional seconds in the timestamps must be a
   324  	// comma, not a period.
   325  	if strings.Contains(secondLine, ".") {
   326  		return false
   327  	}
   328  	// For Go <1.17, comma is not recognised as a decimal separator by `time.Parse`.
   329  	secondLine = strings.ReplaceAll(secondLine, ",", ".")
   330  	// Second line must be a time range.
   331  	ts := strings.Split(secondLine, " --> ")
   332  	if len(ts) != 2 {
   333  		return false
   334  	}
   335  	const layout = "15:04:05.000"
   336  	t0, err := time.Parse(layout, ts[0])
   337  	if err != nil {
   338  		return false
   339  	}
   340  	t1, err := time.Parse(layout, ts[1])
   341  	if err != nil {
   342  		return false
   343  	}
   344  	if t0.After(t1) {
   345  		return false
   346  	}
   347  
   348  	// A third line must exist and not be empty. This is the actual subtitle text.
   349  	return s.Scan() && len(s.Bytes()) != 0
   350  }
   351  
   352  // Vtt matches a Web Video Text Tracks (WebVTT) file. See
   353  // https://www.iana.org/assignments/media-types/text/vtt.
   354  func Vtt(raw []byte, limit uint32) bool {
   355  	// Prefix match.
   356  	prefixes := [][]byte{
   357  		{0xEF, 0xBB, 0xBF, 0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x0A}, // UTF-8 BOM, "WEBVTT" and a line feed
   358  		{0xEF, 0xBB, 0xBF, 0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x0D}, // UTF-8 BOM, "WEBVTT" and a carriage return
   359  		{0xEF, 0xBB, 0xBF, 0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x20}, // UTF-8 BOM, "WEBVTT" and a space
   360  		{0xEF, 0xBB, 0xBF, 0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x09}, // UTF-8 BOM, "WEBVTT" and a horizontal tab
   361  		{0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x0A},                   // "WEBVTT" and a line feed
   362  		{0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x0D},                   // "WEBVTT" and a carriage return
   363  		{0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x20},                   // "WEBVTT" and a space
   364  		{0x57, 0x45, 0x42, 0x56, 0x54, 0x54, 0x09},                   // "WEBVTT" and a horizontal tab
   365  	}
   366  	for _, p := range prefixes {
   367  		if bytes.HasPrefix(raw, p) {
   368  			return true
   369  		}
   370  	}
   371  
   372  	// Exact match.
   373  	return bytes.Equal(raw, []byte{0xEF, 0xBB, 0xBF, 0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) || // UTF-8 BOM and "WEBVTT"
   374  		bytes.Equal(raw, []byte{0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) // "WEBVTT"
   375  }
   376  

View as plain text