
Source file src/github.com/Microsoft/hcsshim/cmd/gcs/main.go

Documentation: github.com/Microsoft/hcsshim/cmd/gcs

//go:build linux
// +build linux

package main

import (
	"flag"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"syscall"
	"time"

	"github.com/cenkalti/backoff/v4"
	"github.com/containerd/cgroups"
	cgroupstats "github.com/containerd/cgroups/stats/v1"
	oci "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"go.opencensus.io/trace"

	"github.com/Microsoft/hcsshim/internal/guest/bridge"
	"github.com/Microsoft/hcsshim/internal/guest/kmsg"
	"github.com/Microsoft/hcsshim/internal/guest/runtime/hcsv2"
	"github.com/Microsoft/hcsshim/internal/guest/runtime/runc"
	"github.com/Microsoft/hcsshim/internal/guest/transport"
	"github.com/Microsoft/hcsshim/internal/guestpath"
	"github.com/Microsoft/hcsshim/internal/log"
	"github.com/Microsoft/hcsshim/internal/oc"
	"github.com/Microsoft/hcsshim/pkg/securitypolicy"
)

func memoryLogFormat(metrics *cgroupstats.Metrics) logrus.Fields {
	return logrus.Fields{
		"memoryUsage":      metrics.Memory.Usage.Usage,
		"memoryUsageMax":   metrics.Memory.Usage.Max,
		"memoryUsageLimit": metrics.Memory.Usage.Limit,
		"swapUsage":        metrics.Memory.Swap.Usage,
		"swapUsageMax":     metrics.Memory.Swap.Max,
		"swapUsageLimit":   metrics.Memory.Swap.Limit,
		"kernelUsage":      metrics.Memory.Kernel.Usage,
		"kernelUsageMax":   metrics.Memory.Kernel.Max,
		"kernelUsageLimit": metrics.Memory.Kernel.Limit,
	}
}

func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, threshold int64, cg cgroups.Cgroup) {
	// Buffer must be >= 8 bytes for eventfd reads
	// http://man7.org/linux/man-pages/man2/eventfd.2.html
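	// Each successful read drains the eventfd and returns an 8-byte counter
	// of the events raised since the previous read.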
	count := 0
	buf := make([]byte, 8)
	for {
		if _, err := efdFile.Read(buf); err != nil {
			logrus.WithError(err).WithField("cgroup", cgName).Error("failed to read from eventfd")
			return
		}

		// Sometimes an event is sent during cgroup teardown, but does not indicate that the
		// threshold was actually crossed. In the teardown case the cgroup.event_control file
		// won't exist anymore, so check that to determine if we should ignore this event.
		_, err := os.Lstat(fmt.Sprintf("/sys/fs/cgroup/memory%s/cgroup.event_control", cgName))
		if os.IsNotExist(err) {
			return
		}

		count++
		msg := "memory usage for cgroup exceeded threshold"
		entry := logrus.WithFields(logrus.Fields{
			"gcsStartTime":   startTime,
			"time":           time.Now(),
			"cgroup":         cgName,
			"thresholdBytes": threshold,
			"count":          count,
		})
		// Sleep for one second in case there is a series of allocations slightly after
		// reaching threshold.
		time.Sleep(time.Second)
		metrics, err := cg.Stat(cgroups.IgnoreNotExist)
		if err != nil {
			// Don't return on a Stat error: Stat fails if any of the cgroup
			// subsystems' Stat calls failed for any reason, and we still want
			// to log that the cgroup threshold/limit was hit.
			entry.WithError(err).Error(msg)
		} else {
			entry.WithFields(memoryLogFormat(metrics)).Warn(msg)
		}
	}
}

// runWithRestartMonitor starts a command with the given args and waits for it
// to exit. Whenever the command exits, it is restarted after an exponential
// backoff delay; a non-nil error from running it is logged with the logrus
// standard logger. This function must be called in a separate goroutine.
func runWithRestartMonitor(arg0 string, args ...string) {
	backoffSettings := backoff.NewExponentialBackOff()
	// Once the retry interval reaches 10 minutes, keep retrying every 10
	// minutes instead of continuing to increase the interval. MaxElapsedTime
	// must be zeroed so NextBackOff never returns backoff.Stop (a negative
	// duration) once the default 15-minute elapsed-time cap is hit.
	backoffSettings.MaxInterval = time.Minute * 10
	backoffSettings.MaxElapsedTime = 0
	for {
		command := exec.Command(arg0, args...)
		if err := command.Run(); err != nil {
			logrus.WithFields(logrus.Fields{
				"error":   err,
				"command": command.Args,
			}).Warn("restart monitor: run command returns error")
		}
		backOffTime := backoffSettings.NextBackOff()
		// since backoffSettings.MaxElapsedTime is set to 0 we will never receive backoff.Stop.
		time.Sleep(backOffTime)
	}
}

// startTimeSyncService starts the `chronyd` daemon to keep the UVM time
// synchronized. We use a PTP device provided by the hypervisor as the source
// of correct time (instead of a network server). We need to create a
// configuration file that points chronyd at that PTP device. The system can
// have multiple PTP devices, so we identify the correct one by verifying that
// its `clock_name` is `hyperv`.
func startTimeSyncService() error {
	ptpClassDir, err := os.Open("/sys/class/ptp")
	if err != nil {
		return errors.Wrap(err, "failed to open PTP class directory")
	}

	ptpDirList, err := ptpClassDir.Readdirnames(-1)
	if err != nil {
		return errors.Wrap(err, "failed to list PTP class directory")
	}

	var ptpDirPath string
	found := false
	// The file ends with a new line
	expectedClockName := "hyperv\n"
	for _, ptpDirPath = range ptpDirList {
		clockNameFilePath := filepath.Join(ptpClassDir.Name(), ptpDirPath, "clock_name")
		buf, err := os.ReadFile(clockNameFilePath)
		if err != nil && !os.IsNotExist(err) {
			return errors.Wrapf(err, "failed to read clock name file at %s", clockNameFilePath)
		}

		if string(buf) == expectedClockName {
			found = true
			break
		}
	}

	if !found {
		return errors.Errorf("no PTP device found with name \"%s\"", expectedClockName)
	}

	// create chronyd config file
	ptpDevPath := filepath.Join("/dev", filepath.Base(ptpDirPath))
	// chronyd config file taken from: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/time-sync
	chronydConfigString := fmt.Sprintf("refclock PHC %s poll 3 dpoll -2 offset 0 stratum 2\nmakestep 0.1 -1\n", ptpDevPath)
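	// For example, with PTP device /dev/ptp0 the generated file contains:
	//   refclock PHC /dev/ptp0 poll 3 dpoll -2 offset 0 stratum 2
	//   makestep 0.1 -1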
	chronydConfPath := "/tmp/chronyd.conf"
	err = os.WriteFile(chronydConfPath, []byte(chronydConfigString), 0644)
	if err != nil {
		return errors.Wrapf(err, "failed to create chronyd conf file %s", chronydConfPath)
	}

	// Start chronyd. Do NOT start chronyd as a daemon: creating a daemon
	// involves double forking, and the restart monitor would attempt to
	// restart chronyd after the first fork's child exits.
	go runWithRestartMonitor("chronyd", "-n", "-f", chronydConfPath)
	return nil
}

func main() {
	startTime := time.Now()
	logLevel := flag.String("loglevel",
		"debug",
		"Logging Level: debug, info, warning, error, fatal, panic.")
	coreDumpLoc := flag.String("core-dump-location",
		"",
		"The location/format where process core dumps will be written to.")
	kmsgLogLevel := flag.Uint("kmsgLogLevel",
		uint(kmsg.Warning),
		"Log all kmsg entries with a priority less than or equal to the supplied level.")
	logFile := flag.String("logfile",
		"",
		"Logging Target: An optional file name/path. Omit for console output.")
	logFormat := flag.String("log-format", "text", "Logging Format: text or json")
	useInOutErr := flag.Bool("use-inouterr",
		false,
		"If true use stdin/stdout for bridge communication and stderr for logging")
	v4 := flag.Bool("v4", false, "enable the v4 protocol support and v2 schema")
	rootMemReserveBytes := flag.Uint64("root-mem-reserve-bytes",
		75*1024*1024, // 75 MiB
		"the amount of memory reserved for the orchestration; the rest will be assigned to containers")
	gcsMemLimitBytes := flag.Uint64("gcs-mem-limit-bytes",
		50*1024*1024, // 50 MiB
		"the maximum amount of memory the gcs can use")
	disableTimeSync := flag.Bool("disable-time-sync",
		false,
		"If true do not run chronyd time synchronization service inside the UVM")
	scrubLogs := flag.Bool("scrub-logs", false, "If true, scrub potentially sensitive information from logging")
	initialPolicyStance := flag.String("initial-policy-stance",
		"allow",
		"Stance: allow, deny.")

	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "\nUsage of %s:\n", os.Args[0])
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "Examples:\n")
		fmt.Fprintf(os.Stderr, "    %s -loglevel=debug -logfile=/run/gcs/gcs.log\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "    %s -loglevel=info -logfile=stdout\n", os.Args[0])
	}

	flag.Parse()

	// If v4 is enabled, turn on opencensus tracing.
	if *v4 {
		trace.ApplyConfig(trace.Config{DefaultSampler: trace.AlwaysSample()})
		trace.RegisterExporter(&oc.LogrusExporter{})
	}

	logrus.AddHook(log.NewHook())

	var logWriter *os.File
	if *logFile != "" {
		logFileHandle, err := os.OpenFile(*logFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
		if err != nil {
			logrus.WithFields(logrus.Fields{
				"path":          *logFile,
				logrus.ErrorKey: err,
			}).Fatal("failed to create log file")
		}
		logWriter = logFileHandle
	} else {
		// logrus uses os.Stderr. see logrus.New()
		logWriter = os.Stderr
	}

	// Set up the initial security policy enforcer for the requested stance.
	var initialEnforcer securitypolicy.SecurityPolicyEnforcer
	switch *initialPolicyStance {
	case "allow":
		initialEnforcer = &securitypolicy.OpenDoorSecurityPolicyEnforcer{}
		logrus.SetOutput(logWriter)
	case "deny":
		initialEnforcer = &securitypolicy.ClosedDoorSecurityPolicyEnforcer{}
		logrus.SetOutput(io.Discard)
	default:
		logrus.WithFields(logrus.Fields{
			"initial-policy-stance": *initialPolicyStance,
		}).Fatal("unknown initial-policy-stance")
	}

	switch *logFormat {
	case "text":
		// retain logrus's default.
	case "json":
		logrus.SetFormatter(&logrus.JSONFormatter{
			TimestampFormat: time.RFC3339Nano, // include ns for accurate comparisons on the host
		})
	default:
		logrus.WithFields(logrus.Fields{
			"log-format": *logFormat,
		}).Fatal("unknown log-format")
	}

	level, err := logrus.ParseLevel(*logLevel)
	if err != nil {
		logrus.Fatal(err)
	}

	logrus.SetLevel(level)

	log.SetScrubbing(*scrubLogs)

	baseLogPath := guestpath.LCOWRootPrefixInUVM

	logrus.Info("GCS started")

	// Set the process core dump location. This is global to all containers
	// since it is a kernel configuration. If no path is specified, core dumps
	// are written to a file named "core" in the working directory of the
	// crashing process.
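	// For example, "-core-dump-location=/coredumps/core.%e.%p" names each dump
	// after the crashing executable and its pid; see core(5) for the full list
	// of format specifiers.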
	if *coreDumpLoc != "" {
		if err := os.WriteFile(
			"/proc/sys/kernel/core_pattern",
			[]byte(*coreDumpLoc),
			0644,
		); err != nil {
			logrus.WithError(err).Fatal("failed to set core dump location")
		}
	}

	// Continuously log /dev/kmsg
	go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel))

	tport := &transport.VsockTransport{}
	rtime, err := runc.NewRuntime(baseLogPath)
	if err != nil {
		logrus.WithError(err).Fatal("failed to initialize new runc runtime")
	}
	mux := bridge.NewBridgeMux()
	b := bridge.Bridge{
		Handler:  mux,
		EnableV4: *v4,
	}
	h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
	b.AssignHandlers(mux, h)

	var bridgeIn io.ReadCloser
	var bridgeOut io.WriteCloser
	if *useInOutErr {
		bridgeIn = os.Stdin
		bridgeOut = os.Stdout
	} else {
		const commandPort uint32 = 0x40000000
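		// 0x40000000 is the vsock port reserved for the GCS bridge; the
		// host-side shim is expected to accept this connection.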
		bridgeCon, err := tport.Dial(commandPort)
		if err != nil {
			logrus.WithFields(logrus.Fields{
				"port":          commandPort,
				logrus.ErrorKey: err,
			}).Fatal("failed to dial host vsock connection")
		}
		bridgeIn = bridgeCon
		bridgeOut = bridgeCon
	}

	// Set up the UVM cgroups. To protect against a workload taking all
	// available memory and causing the GCS to malfunction, we create two
	// cgroups: gcs and containers.

	// Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy
	// support. This needs to be set before we create any cgroups as the write
	// will fail otherwise.
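	// With use_hierarchy enabled, memory charged to a child cgroup is also
	// accounted against its ancestors, so a limit applies to an entire
	// subtree rather than a single cgroup.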
	if err := os.WriteFile("/sys/fs/cgroup/memory/memory.use_hierarchy", []byte("1"), 0644); err != nil {
		logrus.WithError(err).Fatal("failed to enable hierarchy support for root cgroup")
	}

	// The containers cgroup is limited to total RAM minus the root memory
	// reservation (-root-mem-reserve-bytes, 75 MiB by default).
	//
	// The gcs cgroup is not limited, but an event is logged if its memory
	// usage exceeds -gcs-mem-limit-bytes (50 MiB by default).
	sinfo := syscall.Sysinfo_t{}
	if err := syscall.Sysinfo(&sinfo); err != nil {
		logrus.WithError(err).Fatal("failed to get sys info")
	}
	containersLimit := int64(sinfo.Totalram - *rootMemReserveBytes)
	containersControl, err := cgroups.New(cgroups.V1, cgroups.StaticPath("/containers"), &oci.LinuxResources{
		Memory: &oci.LinuxMemory{
			Limit: &containersLimit,
		},
	})
	if err != nil {
		logrus.WithError(err).Fatal("failed to create containers cgroup")
	}
	defer containersControl.Delete() //nolint:errcheck

	gcsControl, err := cgroups.New(cgroups.V1, cgroups.StaticPath("/gcs"), &oci.LinuxResources{})
	if err != nil {
		logrus.WithError(err).Fatal("failed to create gcs cgroup")
	}
	defer gcsControl.Delete() //nolint:errcheck
	if err := gcsControl.Add(cgroups.Process{Pid: os.Getpid()}); err != nil {
		logrus.WithError(err).Fatal("failed to add gcs pid to gcs cgroup")
	}

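	// Ask for an eventfd notification when the gcs cgroup's memory usage
	// crosses the threshold; the second argument selects whether swap is
	// included in the accounting (it is not here).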
	event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false)
	gefd, err := gcsControl.RegisterMemoryEvent(event)
	if err != nil {
		logrus.WithError(err).Fatal("failed to register memory threshold for gcs cgroup")
	}
	gefdFile := os.NewFile(gefd, "gefd")
	defer gefdFile.Close()

	oom, err := containersControl.OOMEventFD()
	if err != nil {
		logrus.WithError(err).Fatal("failed to retrieve the container cgroups oom eventfd")
	}
	oomFile := os.NewFile(oom, "cefd")
	defer oomFile.Close()

	// time synchronization service
	if !(*disableTimeSync) {
		if err = startTimeSyncService(); err != nil {
			logrus.WithError(err).Fatal("failed to start time synchronization service")
		}
	}

	go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl)
	go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl)
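	// ListenAndServe blocks for the lifetime of the GCS, servicing bridge
	// requests from the host over the connection established above.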
	err = b.ListenAndServe(bridgeIn, bridgeOut)
	if err != nil {
		logrus.WithFields(logrus.Fields{
			logrus.ErrorKey: err,
		}).Fatal("failed to serve gcs service")
	}
}
