...

Source file src/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go

Documentation: github.com/opencontainers/runc/libcontainer/cgroups/devices

     1  // SPDX-License-Identifier: Apache-2.0
     2  /*
     3   * Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
     4   * Copyright (C) 2020 SUSE LLC
     5   *
     6   * Licensed under the Apache License, Version 2.0 (the "License");
     7   * you may not use this file except in compliance with the License.
     8   * You may obtain a copy of the License at
     9   *
    10   *     http://www.apache.org/licenses/LICENSE-2.0
    11   *
    12   * Unless required by applicable law or agreed to in writing, software
    13   * distributed under the License is distributed on an "AS IS" BASIS,
    14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15   * See the License for the specific language governing permissions and
    16   * limitations under the License.
    17   */
    18  
    19  package devices
    20  
    21  import (
    22  	"bufio"
    23  	"fmt"
    24  	"io"
    25  	"sort"
    26  	"strconv"
    27  	"strings"
    28  
    29  	"github.com/opencontainers/runc/libcontainer/devices"
    30  )
    31  
    32  // deviceMeta is a Rule without the Allow or Permissions fields, and no
    33  // wildcard-type support. It's effectively the "match" portion of a metadata
    34  // rule, for the purposes of our emulation.
    35  type deviceMeta struct {
    36  	node  devices.Type
    37  	major int64
    38  	minor int64
    39  }
    40  
    41  // deviceRule is effectively the tuple (deviceMeta, Permissions).
    42  type deviceRule struct {
    43  	meta  deviceMeta
    44  	perms devices.Permissions
    45  }
    46  
    47  // deviceRules is a mapping of device metadata rules to the associated
    48  // permissions in the ruleset.
    49  type deviceRules map[deviceMeta]devices.Permissions
    50  
    51  func (r deviceRules) orderedEntries() []deviceRule {
    52  	var rules []deviceRule
    53  	for meta, perms := range r {
    54  		rules = append(rules, deviceRule{meta: meta, perms: perms})
    55  	}
    56  	sort.Slice(rules, func(i, j int) bool {
    57  		// Sort by (major, minor, type).
    58  		a, b := rules[i].meta, rules[j].meta
    59  		return a.major < b.major ||
    60  			(a.major == b.major && a.minor < b.minor) ||
    61  			(a.major == b.major && a.minor == b.minor && a.node < b.node)
    62  	})
    63  	return rules
    64  }
    65  
    66  type Emulator struct {
    67  	defaultAllow bool
    68  	rules        deviceRules
    69  }
    70  
    71  func (e *Emulator) IsBlacklist() bool {
    72  	return e.defaultAllow
    73  }
    74  
    75  func (e *Emulator) IsAllowAll() bool {
    76  	return e.IsBlacklist() && len(e.rules) == 0
    77  }
    78  
    79  func parseLine(line string) (*deviceRule, error) {
    80  	// Input: node major:minor perms.
    81  	fields := strings.FieldsFunc(line, func(r rune) bool {
    82  		return r == ' ' || r == ':'
    83  	})
    84  	if len(fields) != 4 {
    85  		return nil, fmt.Errorf("malformed devices.list rule %s", line)
    86  	}
    87  
    88  	var (
    89  		rule  deviceRule
    90  		node  = fields[0]
    91  		major = fields[1]
    92  		minor = fields[2]
    93  		perms = fields[3]
    94  	)
    95  
    96  	// Parse the node type.
    97  	switch node {
    98  	case "a":
    99  		// Super-special case -- "a" always means every device with every
   100  		// access mode. In fact, for devices.list this actually indicates that
   101  		// the cgroup is in black-list mode.
   102  		// TODO: Double-check that the entire file is "a *:* rwm".
   103  		return nil, nil
   104  	case "b":
   105  		rule.meta.node = devices.BlockDevice
   106  	case "c":
   107  		rule.meta.node = devices.CharDevice
   108  	default:
   109  		return nil, fmt.Errorf("unknown device type %q", node)
   110  	}
   111  
   112  	// Parse the major number.
   113  	if major == "*" {
   114  		rule.meta.major = devices.Wildcard
   115  	} else {
   116  		val, err := strconv.ParseUint(major, 10, 32)
   117  		if err != nil {
   118  			return nil, fmt.Errorf("invalid major number: %w", err)
   119  		}
   120  		rule.meta.major = int64(val)
   121  	}
   122  
   123  	// Parse the minor number.
   124  	if minor == "*" {
   125  		rule.meta.minor = devices.Wildcard
   126  	} else {
   127  		val, err := strconv.ParseUint(minor, 10, 32)
   128  		if err != nil {
   129  			return nil, fmt.Errorf("invalid minor number: %w", err)
   130  		}
   131  		rule.meta.minor = int64(val)
   132  	}
   133  
   134  	// Parse the access permissions.
   135  	rule.perms = devices.Permissions(perms)
   136  	if !rule.perms.IsValid() || rule.perms.IsEmpty() {
   137  		return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
   138  	}
   139  	return &rule, nil
   140  }
   141  
   142  func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
   143  	if e.rules == nil {
   144  		e.rules = make(map[deviceMeta]devices.Permissions)
   145  	}
   146  
   147  	// Merge with any pre-existing permissions.
   148  	oldPerms := e.rules[rule.meta]
   149  	newPerms := rule.perms.Union(oldPerms)
   150  	e.rules[rule.meta] = newPerms
   151  	return nil
   152  }
   153  
   154  func (e *Emulator) rmRule(rule deviceRule) error {
   155  	// Give an error if any of the permissions requested to be removed are
   156  	// present in a partially-matching wildcard rule, because such rules will
   157  	// be ignored by cgroupv1.
   158  	//
   159  	// This is a diversion from cgroupv1, but is necessary to avoid leading
   160  	// users into a false sense of security. cgroupv1 will silently(!) ignore
   161  	// requests to remove partial exceptions, but we really shouldn't do that.
   162  	//
   163  	// It may seem like we could just "split" wildcard rules which hit this
   164  	// issue, but unfortunately there are 2^32 possible major and minor
   165  	// numbers, which would exhaust kernel memory quickly if we did this. Not
   166  	// to mention it'd be really slow (the kernel side is implemented as a
   167  	// linked-list of exceptions).
   168  	for _, partialMeta := range []deviceMeta{
   169  		{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
   170  		{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
   171  		{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
   172  	} {
   173  		// This wildcard rule is equivalent to the requested rule, so skip it.
   174  		if rule.meta == partialMeta {
   175  			continue
   176  		}
   177  		// Only give an error if the set of permissions overlap.
   178  		partialPerms := e.rules[partialMeta]
   179  		if !partialPerms.Intersection(rule.perms).IsEmpty() {
   180  			return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
   181  		}
   182  	}
   183  
   184  	// Subtract all of the permissions listed from the full match rule. If the
   185  	// rule didn't exist, all of this is a no-op.
   186  	newPerms := e.rules[rule.meta].Difference(rule.perms)
   187  	if newPerms.IsEmpty() {
   188  		delete(e.rules, rule.meta)
   189  	} else {
   190  		e.rules[rule.meta] = newPerms
   191  	}
   192  	// TODO: The actual cgroup code doesn't care if an exception didn't exist
   193  	//       during removal, so not erroring out here is /accurate/ but quite
   194  	//       worrying. Maybe we should do additional validation, but again we
   195  	//       have to worry about backwards-compatibility.
   196  	return nil
   197  }
   198  
   199  func (e *Emulator) allow(rule *deviceRule) error {
   200  	// This cgroup is configured as a black-list. Reset the entire emulator,
   201  	// and put is into black-list mode.
   202  	if rule == nil || rule.meta.node == devices.WildcardDevice {
   203  		*e = Emulator{
   204  			defaultAllow: true,
   205  			rules:        nil,
   206  		}
   207  		return nil
   208  	}
   209  
   210  	var err error
   211  	if e.defaultAllow {
   212  		err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
   213  	} else {
   214  		err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
   215  	}
   216  	return err
   217  }
   218  
   219  func (e *Emulator) deny(rule *deviceRule) error {
   220  	// This cgroup is configured as a white-list. Reset the entire emulator,
   221  	// and put is into white-list mode.
   222  	if rule == nil || rule.meta.node == devices.WildcardDevice {
   223  		*e = Emulator{
   224  			defaultAllow: false,
   225  			rules:        nil,
   226  		}
   227  		return nil
   228  	}
   229  
   230  	var err error
   231  	if e.defaultAllow {
   232  		err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
   233  	} else {
   234  		err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
   235  	}
   236  	return err
   237  }
   238  
   239  func (e *Emulator) Apply(rule devices.Rule) error {
   240  	if !rule.Type.CanCgroup() {
   241  		return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
   242  	}
   243  
   244  	innerRule := &deviceRule{
   245  		meta: deviceMeta{
   246  			node:  rule.Type,
   247  			major: rule.Major,
   248  			minor: rule.Minor,
   249  		},
   250  		perms: rule.Permissions,
   251  	}
   252  	if innerRule.meta.node == devices.WildcardDevice {
   253  		innerRule = nil
   254  	}
   255  
   256  	if rule.Allow {
   257  		return e.allow(innerRule)
   258  	}
   259  
   260  	return e.deny(innerRule)
   261  }
   262  
   263  // EmulatorFromList takes a reader to a "devices.list"-like source, and returns
   264  // a new Emulator that represents the state of the devices cgroup. Note that
   265  // black-list devices cgroups cannot be fully reconstructed, due to limitations
   266  // in the devices cgroup API. Instead, such cgroups are always treated as
   267  // "allow all" cgroups.
   268  func EmulatorFromList(list io.Reader) (*Emulator, error) {
   269  	// Normally cgroups are in black-list mode by default, but the way we
   270  	// figure out the current mode is whether or not devices.list has an
   271  	// allow-all rule. So we default to a white-list, and the existence of an
   272  	// "a *:* rwm" entry will tell us otherwise.
   273  	e := &Emulator{
   274  		defaultAllow: false,
   275  	}
   276  
   277  	// Parse the "devices.list".
   278  	s := bufio.NewScanner(list)
   279  	for s.Scan() {
   280  		line := s.Text()
   281  		deviceRule, err := parseLine(line)
   282  		if err != nil {
   283  			return nil, fmt.Errorf("error parsing line %q: %w", line, err)
   284  		}
   285  		// "devices.list" is an allow list. Note that this means that in
   286  		// black-list mode, we have no idea what rules are in play. As a
   287  		// result, we need to be very careful in Transition().
   288  		if err := e.allow(deviceRule); err != nil {
   289  			return nil, fmt.Errorf("error adding devices.list rule: %w", err)
   290  		}
   291  	}
   292  	if err := s.Err(); err != nil {
   293  		return nil, fmt.Errorf("error reading devices.list lines: %w", err)
   294  	}
   295  	return e, nil
   296  }
   297  
   298  // Transition calculates what is the minimally-disruptive set of rules need to
   299  // be applied to a devices cgroup in order to transition to the given target.
   300  // This means that any already-existing rules will not be applied, and
   301  // disruptive rules (like denying all device access) will only be applied if
   302  // necessary.
   303  //
   304  // This function is the sole reason for all of Emulator -- to allow us
   305  // to figure out how to update a containers' cgroups without causing spurious
   306  // device errors (if possible).
   307  func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
   308  	var transitionRules []*devices.Rule
   309  	oldRules := source.rules
   310  
   311  	// If the default policy doesn't match, we need to include a "disruptive"
   312  	// rule (either allow-all or deny-all) in order to switch the cgroup to the
   313  	// correct default policy.
   314  	//
   315  	// However, due to a limitation in "devices.list" we cannot be sure what
   316  	// deny rules are in place in a black-list cgroup. Thus if the source is a
   317  	// black-list we also have to include a disruptive rule.
   318  	if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
   319  		transitionRules = append(transitionRules, &devices.Rule{
   320  			Type:        'a',
   321  			Major:       -1,
   322  			Minor:       -1,
   323  			Permissions: devices.Permissions("rwm"),
   324  			Allow:       target.defaultAllow,
   325  		})
   326  		// The old rules are only relevant if we aren't starting out with a
   327  		// disruptive rule.
   328  		oldRules = nil
   329  	}
   330  
   331  	// NOTE: We traverse through the rules in a sorted order so we always write
   332  	//       the same set of rules (this is to aid testing).
   333  
   334  	// First, we create inverse rules for any old rules not in the new set.
   335  	// This includes partial-inverse rules for specific permissions. This is a
   336  	// no-op if we added a disruptive rule, since oldRules will be empty.
   337  	for _, rule := range oldRules.orderedEntries() {
   338  		meta, oldPerms := rule.meta, rule.perms
   339  		newPerms := target.rules[meta]
   340  		droppedPerms := oldPerms.Difference(newPerms)
   341  		if !droppedPerms.IsEmpty() {
   342  			transitionRules = append(transitionRules, &devices.Rule{
   343  				Type:        meta.node,
   344  				Major:       meta.major,
   345  				Minor:       meta.minor,
   346  				Permissions: droppedPerms,
   347  				Allow:       target.defaultAllow,
   348  			})
   349  		}
   350  	}
   351  
   352  	// Add any additional rules which weren't in the old set. We happen to
   353  	// filter out rules which are present in both sets, though this isn't
   354  	// strictly necessary.
   355  	for _, rule := range target.rules.orderedEntries() {
   356  		meta, newPerms := rule.meta, rule.perms
   357  		oldPerms := oldRules[meta]
   358  		gainedPerms := newPerms.Difference(oldPerms)
   359  		if !gainedPerms.IsEmpty() {
   360  			transitionRules = append(transitionRules, &devices.Rule{
   361  				Type:        meta.node,
   362  				Major:       meta.major,
   363  				Minor:       meta.minor,
   364  				Permissions: gainedPerms,
   365  				Allow:       !target.defaultAllow,
   366  			})
   367  		}
   368  	}
   369  	return transitionRules, nil
   370  }
   371  
   372  // Rules returns the minimum set of rules necessary to convert a *deny-all*
   373  // cgroup to the emulated filter state (note that this is not the same as a
   374  // default cgroupv1 cgroup -- which is allow-all). This is effectively just a
   375  // wrapper around Transition() with the source emulator being an empty cgroup.
   376  func (e *Emulator) Rules() ([]*devices.Rule, error) {
   377  	defaultCgroup := &Emulator{defaultAllow: false}
   378  	return defaultCgroup.Transition(e)
   379  }
   380  
   381  func wrapErr(err error, text string) error {
   382  	if err == nil {
   383  		return nil
   384  	}
   385  	return fmt.Errorf(text+": %w", err)
   386  }
   387  

View as plain text