...

Source file src/k8s.io/kubernetes/test/e2e_node/remote/gce/gce_runner.go

Documentation: k8s.io/kubernetes/test/e2e_node/remote/gce

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package gce
    18  
    19  import (
    20  	"encoding/base64"
    21  	"encoding/json"
    22  	"errors"
    23  	"flag"
    24  	"fmt"
    25  	"os"
    26  	"path/filepath"
    27  	"regexp"
    28  	"sort"
    29  	"strings"
    30  	"time"
    31  
    32  	"k8s.io/kubernetes/test/e2e_node/remote"
    33  
    34  	"github.com/google/uuid"
    35  	"k8s.io/apimachinery/pkg/util/wait"
    36  	"k8s.io/klog/v2"
    37  	"sigs.k8s.io/yaml"
    38  )
    39  
// Compile-time assertion that *GCERunner satisfies the remote.Runner interface.
var _ remote.Runner = (*GCERunner)(nil)

// init registers NewGCERunner with the remote package under the name "gce".
func init() {
	remote.RegisterRunner("gce", NewGCERunner)
}
    45  
    46  // envs is the type used to collect all node envs. The key is the env name,
    47  // and the value is the env value
    48  type envs map[string]string
    49  
    50  // String function of flag.Value
    51  func (e *envs) String() string {
    52  	return fmt.Sprint(*e)
    53  }
    54  
    55  // Set function of flag.Value
    56  func (e *envs) Set(value string) error {
    57  	if value == "" {
    58  		return nil
    59  	}
    60  	kv := strings.SplitN(value, "=", 2)
    61  	if len(kv) != 2 {
    62  		return fmt.Errorf("invalid env string %s", value)
    63  	}
    64  	emap := *e
    65  	emap[kv[0]] = kv[1]
    66  	return nil
    67  }
    68  
    69  // nodeEnvs is the node envs from the flag `node-env`.
    70  var nodeEnvs = make(envs)
    71  
    72  var project = flag.String("project", "", "gce project the hosts live in (gce)")
    73  var zone = flag.String("zone", "", "gce zone that the hosts live in (gce)")
    74  var instanceMetadata = flag.String("instance-metadata", "", "key/value metadata for instances separated by '=' or '<', 'k=v' means the key is 'k' and the value is 'v'; 'k<p' means the key is 'k' and the value is extracted from the local path 'p', e.g. k1=v1,k2<p2  (gce)")
    75  var imageProject = flag.String("image-project", "", "gce project the hosts live in  (gce)")
    76  var instanceType = flag.String("instance-type", "e2-medium", "GCP Machine type to use for test")
    77  var preemptibleInstances = flag.Bool("preemptible-instances", false, "If true, gce instances will be configured to be preemptible  (gce)")
    78  
    79  func init() {
    80  	flag.Var(&nodeEnvs, "node-env", "An environment variable passed to instance as metadata, e.g. when '--node-env=PATH=/usr/bin' is specified, there will be an extra instance metadata 'PATH=/usr/bin'.")
    81  }
    82  
// GCERunner is the remote.Runner implementation registered under the name
// "gce". It holds the runner configuration passed to NewGCERunner (cfg) and
// the resolved set of images to test (gceImages), which Validate populates via
// prepareGceImages.
type GCERunner struct {
	cfg       remote.Config
	gceImages *internalGCEImageConfig
}

const (
	// defaultGCEMachine is the machine type machineType falls back to when
	// neither the image config nor the --instance-type flag provides one.
	defaultGCEMachine = "e2-standard-2"
)
    91  
    92  func NewGCERunner(cfg remote.Config) remote.Runner {
    93  	if cfg.InstanceNamePrefix == "" {
    94  		cfg.InstanceNamePrefix = "tmp-node-e2e-" + uuid.New().String()[:8]
    95  	}
    96  	return &GCERunner{cfg: cfg}
    97  }
    98  
    99  func (g *GCERunner) Validate() error {
   100  	if len(g.cfg.Hosts) == 0 && g.cfg.ImageConfigFile == "" && len(g.cfg.Images) == 0 {
   101  		klog.Fatalf("Must specify one of --image-config-file, --hosts, --images.")
   102  	}
   103  
   104  	_, err := runGCPCommandWithZones("compute", "instances", "list")
   105  	if err != nil {
   106  		klog.Fatalf("While listing GCE instances: %v", err)
   107  	}
   108  
   109  	if g.gceImages, err = g.prepareGceImages(); err != nil {
   110  		klog.Fatalf("While preparing GCE images: %v", err)
   111  	}
   112  	return nil
   113  }
   114  
   115  func (g *GCERunner) StartTests(suite remote.TestSuite, archivePath string, results chan *remote.TestResult) (numTests int) {
   116  	for shortName := range g.gceImages.images {
   117  		imageConfig := g.gceImages.images[shortName]
   118  		numTests++
   119  		fmt.Printf("Initializing e2e tests using image %s/%s/%s.\n", shortName, imageConfig.project, imageConfig.image)
   120  		go func(image *internalGCEImage, junitFileName string) {
   121  			results <- g.testGCEImage(suite, archivePath, image, junitFileName)
   122  		}(&imageConfig, shortName)
   123  	}
   124  	return
   125  }
   126  
// Accelerator describes one kind of accelerator requested for a test
// instance: its type and how many of them. createGCEInstance turns each entry
// into an `--accelerator=count=<Count>,type=<Type>` argument.
type Accelerator struct {
	Type  string `json:"type,omitempty"`
	Count int64  `json:"count,omitempty"`
}

// Resources lists the accelerators requested for an image's test instance.
type Resources struct {
	Accelerators []Accelerator `json:"accelerators,omitempty"`
}
   137  
// internalGCEImage is an internal GCE image representation for E2E node.
// It is the resolved form of a GCEImage (or of an image named via --images):
// image is the concrete image name, project the project owning that image,
// kernelArguments the extra kernel arguments applied after boot, resources the
// requested accelerators, metadata the parsed instance metadata (nil when
// none was given) and machine the machine type override (empty for none).
type internalGCEImage struct {
	image string
	// imageDesc is the description of the image. If empty, the value in the
	// 'image' will be used.
	imageDesc       string
	kernelArguments []string
	project         string
	resources       Resources
	metadata        *gceMetadata
	machine         string
}

// internalGCEImageConfig is the resolved set of images to test, keyed by the
// image's short name (for images given via --images, the image name itself).
type internalGCEImageConfig struct {
	images map[string]internalGCEImage
}
   154  
// GCEImageConfig specifies what images should be run and how for these tests.
// It can be created via the `--images` and `--image-project` flags, or by
// specifying the `--image-config-file` flag, pointing to a json or yaml file
// of the form:
//
//	images:
//	  short-name:
//	    image: gce-image-name
//	    project: gce-image-project
//	    machine: for benchmark only, the machine type (GCE instance) to run test
//	    tests: for benchmark only, a list of ginkgo focus strings to match tests
//
// NOTE(review): GCEImage declares no field for the `tests` key shown above;
// confirm whether that key is still honored anywhere.
//
// TODO(coufon): replace 'image' with 'node' in configurations
// and we plan to support testing custom machines other than GCE by specifying Host
type GCEImageConfig struct {
	Images map[string]GCEImage `json:"images"`
}

// GCEImage contains some information about GCE Image.
//
// Image names the image directly. When Image is empty and ImageRegex and/or
// ImageFamily is set, prepareGceImages looks up the most recently created
// image in Project that matches them. Metadata uses the same comma-separated
// `k=v` / `k<path` syntax as the --instance-metadata flag.
type GCEImage struct {
	Image      string `json:"image,omitempty"`
	ImageRegex string `json:"image_regex,omitempty"`
	// ImageFamily is the image family to use. The latest image from the image family will be used, e.g. cos-81-lts.
	ImageFamily     string    `json:"image_family,omitempty"`
	ImageDesc       string    `json:"image_description,omitempty"`
	KernelArguments []string  `json:"kernel_arguments,omitempty"`
	Project         string    `json:"project"`
	Metadata        string    `json:"metadata"`
	Machine         string    `json:"machine,omitempty"`
	Resources       Resources `json:"resources,omitempty"`
}
   186  
   187  // Returns an image name based on regex and given GCE project.
   188  func (g *GCERunner) getGCEImage(imageRegex, imageFamily string, project string) (string, error) {
   189  	data, err := runGCPCommandNoProject("compute", "images", "list",
   190  		"--format=json", "--project="+project)
   191  	if err != nil {
   192  		return "", fmt.Errorf("failed to list images in project %q: %w", project, err)
   193  	}
   194  	var images []gceImage
   195  	err = json.Unmarshal(data, &images)
   196  	if err != nil {
   197  		return "", fmt.Errorf("failed to parse images: %w", err)
   198  	}
   199  
   200  	imageObjs := []imageObj{}
   201  	imageRe := regexp.MustCompile(imageRegex)
   202  	for _, instance := range images {
   203  		if imageRegex != "" && !imageRe.MatchString(instance.Name) {
   204  			continue
   205  		}
   206  		if imageFamily != "" && instance.Family != imageFamily {
   207  			continue
   208  		}
   209  		creationTime, err := time.Parse(time.RFC3339, instance.CreationTimestamp)
   210  		if err != nil {
   211  			return "", fmt.Errorf("failed to parse instance creation timestamp %q: %w", instance.CreationTimestamp, err)
   212  		}
   213  		io := imageObj{
   214  			creationTime: creationTime,
   215  			name:         instance.Name,
   216  		}
   217  		imageObjs = append(imageObjs, io)
   218  	}
   219  
   220  	// Pick the latest image after sorting.
   221  	sort.Sort(byCreationTime(imageObjs))
   222  	if len(imageObjs) > 0 {
   223  		klog.V(4).Infof("found images %+v based on regex %q and family %q in project %q", imageObjs, imageRegex, imageFamily, project)
   224  		return imageObjs[0].name, nil
   225  	}
   226  	return "", fmt.Errorf("found zero images based on regex %q and family %q in project %q", imageRegex, imageFamily, project)
   227  }
   228  
   229  func (g *GCERunner) prepareGceImages() (*internalGCEImageConfig, error) {
   230  	gceImages := &internalGCEImageConfig{
   231  		images: make(map[string]internalGCEImage),
   232  	}
   233  
   234  	// Parse images from given config file and convert them to internalGCEImage.
   235  	if g.cfg.ImageConfigFile != "" {
   236  		configPath := g.cfg.ImageConfigFile
   237  		if g.cfg.ImageConfigDir != "" {
   238  			configPath = filepath.Join(g.cfg.ImageConfigDir, g.cfg.ImageConfigFile)
   239  		}
   240  
   241  		imageConfigData, err := os.ReadFile(configPath)
   242  		if err != nil {
   243  			return nil, fmt.Errorf("Could not read image config file provided: %w", err)
   244  		}
   245  		// Unmarshal the given image config file. All images for this test run will be organized into a map.
   246  		// shortName->GCEImage, e.g cos-stable->cos-stable-81-12871-103-0.
   247  		externalImageConfig := GCEImageConfig{Images: make(map[string]GCEImage)}
   248  		err = yaml.Unmarshal(imageConfigData, &externalImageConfig)
   249  		if err != nil {
   250  			return nil, fmt.Errorf("Could not parse image config file: %w", err)
   251  		}
   252  
   253  		for shortName, imageConfig := range externalImageConfig.Images {
   254  			var image string
   255  			if (imageConfig.ImageRegex != "" || imageConfig.ImageFamily != "") && imageConfig.Image == "" {
   256  				image, err = g.getGCEImage(imageConfig.ImageRegex, imageConfig.ImageFamily, imageConfig.Project)
   257  				if err != nil {
   258  					return nil, fmt.Errorf("Could not retrieve a image based on image regex %q and family %q: %v",
   259  						imageConfig.ImageRegex, imageConfig.ImageFamily, err)
   260  				}
   261  			} else {
   262  				image = imageConfig.Image
   263  			}
   264  			// Convert the given image into an internalGCEImage.
   265  			metadata := imageConfig.Metadata
   266  			if len(strings.TrimSpace(*instanceMetadata)) > 0 {
   267  				metadata += "," + *instanceMetadata
   268  			}
   269  			gceImage := internalGCEImage{
   270  				image:           image,
   271  				imageDesc:       imageConfig.ImageDesc,
   272  				project:         imageConfig.Project,
   273  				metadata:        g.getImageMetadata(metadata),
   274  				kernelArguments: imageConfig.KernelArguments,
   275  				machine:         imageConfig.Machine,
   276  				resources:       imageConfig.Resources,
   277  			}
   278  			if gceImage.imageDesc == "" {
   279  				gceImage.imageDesc = gceImage.image
   280  			}
   281  			gceImages.images[shortName] = gceImage
   282  		}
   283  	}
   284  
   285  	// Allow users to specify additional images via cli flags for local testing
   286  	// convenience; merge in with config file
   287  	if len(g.cfg.Images) > 0 {
   288  		if *imageProject == "" {
   289  			klog.Fatal("Must specify --image-project if you specify --images")
   290  		}
   291  		for _, image := range g.cfg.Images {
   292  			gceImage := internalGCEImage{
   293  				image:    image,
   294  				project:  *imageProject,
   295  				metadata: g.getImageMetadata(*instanceMetadata),
   296  			}
   297  			gceImages.images[image] = gceImage
   298  		}
   299  	}
   300  
   301  	if len(gceImages.images) != 0 && *zone == "" {
   302  		return nil, errors.New("must specify --zone flag")
   303  	}
   304  	// Make sure GCP project is set. Without a project, images can't be retrieved..
   305  	for shortName, imageConfig := range gceImages.images {
   306  		if imageConfig.project == "" {
   307  			return nil, fmt.Errorf("invalid config for %v; must specify a project", shortName)
   308  		}
   309  	}
   310  	if len(gceImages.images) != 0 {
   311  		if *project == "" {
   312  			return nil, errors.New("must specify --project flag to launch images into")
   313  		}
   314  	}
   315  
   316  	return gceImages, nil
   317  }
   318  
   319  type imageObj struct {
   320  	creationTime time.Time
   321  	name         string
   322  }
   323  
   324  type byCreationTime []imageObj
   325  
   326  func (a byCreationTime) Len() int           { return len(a) }
   327  func (a byCreationTime) Less(i, j int) bool { return a[i].creationTime.After(a[j].creationTime) }
   328  func (a byCreationTime) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   329  
   330  func (g *GCERunner) getImageMetadata(input string) *gceMetadata {
   331  	if input == "" {
   332  		return nil
   333  	}
   334  	klog.V(3).Infof("parsing instance metadata: %q", input)
   335  	raw := g.parseInstanceMetadata(input)
   336  	klog.V(4).Infof("parsed instance metadata: %v", raw)
   337  	metadataItems := []gceMetadataItems{}
   338  	for k, v := range raw {
   339  		metadataItems = append(metadataItems, gceMetadataItems{
   340  			Key:   k,
   341  			Value: v,
   342  		})
   343  	}
   344  	ret := gceMetadata{Items: metadataItems}
   345  	return &ret
   346  }
   347  
   348  func (g *GCERunner) DeleteGCEInstance(host string) {
   349  	klog.Infof("Deleting instance %q", host)
   350  	_, err := runGCPCommandWithZone("compute", "instances", "delete", host)
   351  	if err != nil {
   352  		klog.Errorf("Error deleting instance %q: %v", host, err)
   353  	}
   354  }
   355  
   356  func (g *GCERunner) parseInstanceMetadata(str string) map[string]string {
   357  	metadata := make(map[string]string)
   358  	ss := strings.Split(str, ",")
   359  	for _, s := range ss {
   360  		kv := strings.Split(s, "=")
   361  		if len(kv) == 2 {
   362  			metadata[kv[0]] = kv[1]
   363  			continue
   364  		}
   365  		kp := strings.Split(s, "<")
   366  		if len(kp) != 2 {
   367  			klog.Fatalf("Invalid instance metadata: %q", s)
   368  			continue
   369  		}
   370  		metaPath := kp[1]
   371  		if g.cfg.ImageConfigDir != "" {
   372  			metaPath = filepath.Join(g.cfg.ImageConfigDir, metaPath)
   373  		}
   374  		v, err := os.ReadFile(metaPath)
   375  		if err != nil {
   376  			klog.Fatalf("Failed to read metadata file %q: %v", metaPath, err)
   377  			continue
   378  		}
   379  		metadata[kp[0]] = ignitionInjectGCEPublicKey(string(v))
   380  	}
   381  	for k, v := range nodeEnvs {
   382  		metadata[k] = v
   383  	}
   384  	return metadata
   385  }
   386  
   387  // ignitionInjectGCEPublicKey tries to inject the GCE SSH public key into the
   388  // provided ignition file path.
   389  //
   390  // This will only being done if the job has the
   391  // IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE environment variable set, while it
   392  // tried to replace the GCE_SSH_PUBLIC_KEY_FILE_CONTENT placeholder.
   393  func ignitionInjectGCEPublicKey(content string) string {
   394  	if os.Getenv("IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE") == "" {
   395  		return content
   396  	}
   397  
   398  	klog.Infof("Injecting SSH public key into ignition")
   399  
   400  	const publicKeyEnv = "GCE_SSH_PUBLIC_KEY_FILE"
   401  	sshPublicKeyFile := os.Getenv(publicKeyEnv)
   402  	if sshPublicKeyFile == "" {
   403  		klog.Errorf("Environment variable %s is not set", publicKeyEnv)
   404  		os.Exit(1)
   405  	}
   406  
   407  	sshPublicKey, err := os.ReadFile(sshPublicKeyFile)
   408  	if err != nil {
   409  		klog.ErrorS(err, "unable to read SSH public key file")
   410  		os.Exit(1)
   411  	}
   412  
   413  	const sshPublicKeyFileContentMarker = "GCE_SSH_PUBLIC_KEY_FILE_CONTENT"
   414  	key := base64.StdEncoding.EncodeToString(sshPublicKey)
   415  	base64Marker := base64.StdEncoding.EncodeToString([]byte(sshPublicKeyFileContentMarker))
   416  	replacer := strings.NewReplacer(
   417  		sshPublicKeyFileContentMarker, key,
   418  		base64Marker, key,
   419  	)
   420  	return replacer.Replace(content)
   421  }
   422  
   423  // Provision a gce instance using image and run the tests in archive against the instance.
   424  // Delete the instance afterward.
   425  func (g *GCERunner) testGCEImage(suite remote.TestSuite, archivePath string, imageConfig *internalGCEImage, junitFileName string) *remote.TestResult {
   426  	ginkgoFlagsStr := g.cfg.GinkgoFlags
   427  
   428  	host, err := g.createGCEInstance(imageConfig)
   429  	if g.cfg.DeleteInstances {
   430  		defer g.DeleteGCEInstance(host)
   431  	}
   432  	if err != nil {
   433  		return &remote.TestResult{
   434  			Err: fmt.Errorf("unable to create gce instance with running docker daemon for image %s.  %v", imageConfig.image, err),
   435  		}
   436  	}
   437  
   438  	// Only delete the files if we are keeping the instance and want it cleaned up.
   439  	// If we are going to delete the instance, don't bother with cleaning up the files
   440  	deleteFiles := !g.cfg.DeleteInstances && g.cfg.Cleanup
   441  
   442  	if err = g.registerGceHostIP(host); err != nil {
   443  		return &remote.TestResult{
   444  			Err:    err,
   445  			Host:   host,
   446  			ExitOK: false,
   447  		}
   448  	}
   449  
   450  	output, exitOk, err := remote.RunRemote(remote.RunRemoteConfig{
   451  		Suite:          suite,
   452  		Archive:        archivePath,
   453  		Host:           host,
   454  		Cleanup:        deleteFiles,
   455  		ImageDesc:      imageConfig.imageDesc,
   456  		JunitFileName:  junitFileName,
   457  		TestArgs:       g.cfg.TestArgs,
   458  		GinkgoArgs:     ginkgoFlagsStr,
   459  		SystemSpecName: g.cfg.SystemSpecName,
   460  		ExtraEnvs:      g.cfg.ExtraEnvs,
   461  		RuntimeConfig:  g.cfg.RuntimeConfig,
   462  	})
   463  	result := remote.TestResult{
   464  		Output: output,
   465  		Err:    err,
   466  		Host:   host,
   467  		ExitOK: exitOk,
   468  	}
   469  
   470  	// This is a temporary solution to collect serial node serial log. Only port 1 contains useful information.
   471  	// TODO(random-liu): Extract out and unify log collection logic with cluste e2e.
   472  	contents, err := g.getSerialOutput(host)
   473  	if err != nil {
   474  		klog.Errorf("Failed to get serial Output from node %q : %v", host, err)
   475  	}
   476  	logFilename := "serial-1.log"
   477  	err = remote.WriteLog(host, logFilename, contents)
   478  	if err != nil {
   479  		klog.Errorf("Failed to write serial Output from node %q to %q: %v", host, logFilename, err)
   480  	}
   481  	return &result
   482  }
   483  
   484  // Provision a gce instance using image
   485  func (g *GCERunner) createGCEInstance(imageConfig *internalGCEImage) (string, error) {
   486  	data, err := runGCPCommand("compute", "project-info", "describe", "--format=json", "--project="+*project)
   487  	if err != nil {
   488  		return "", fmt.Errorf("failed to get project info for %q: %w", *project, err)
   489  	}
   490  
   491  	var p projectInfo
   492  	err = json.Unmarshal(data, &p)
   493  	if err != nil {
   494  		return "", fmt.Errorf("failed parse project info %q: %w", *project, err)
   495  	}
   496  	// Use default service account
   497  	serviceAccount := p.DefaultServiceAccount
   498  	klog.V(1).Infof("Creating instance %+v  with service account %q", *imageConfig, serviceAccount)
   499  	name := g.imageToInstanceName(imageConfig)
   500  
   501  	diskArgs := []string{
   502  		"image-project=" + imageConfig.project,
   503  		"image=" + imageConfig.image,
   504  		"type=pd-standard",
   505  		"auto-delete=yes",
   506  		"boot=yes",
   507  		"size=20GB",
   508  	}
   509  
   510  	createArgs := []string{"compute", "instances", "create"}
   511  	createArgs = append(createArgs, name)
   512  	createArgs = append(createArgs, "--machine-type="+g.machineType(imageConfig.machine))
   513  	createArgs = append(createArgs, "--create-disk="+strings.Join(diskArgs, ","))
   514  	createArgs = append(createArgs, "--service-account="+serviceAccount)
   515  	if *preemptibleInstances {
   516  		createArgs = append(createArgs, "--preemptible")
   517  	}
   518  	if len(imageConfig.resources.Accelerators) > 0 {
   519  		createArgs = append(createArgs, "--maintenance-policy=TERMINATE")
   520  		createArgs = append(createArgs, "--restart-on-failure")
   521  		for _, accelerator := range imageConfig.resources.Accelerators {
   522  			createArgs = append(createArgs,
   523  				fmt.Sprintf("--accelerator=count=%d,type=%s", accelerator.Count, accelerator.Type))
   524  		}
   525  	}
   526  	if imageConfig.metadata != nil {
   527  		var itemArgs []string
   528  		var itemFileArgs []string
   529  		for _, item := range imageConfig.metadata.Items {
   530  			if strings.HasPrefix(item.Key, "user-") || strings.HasPrefix(item.Key, "startup-") ||
   531  				strings.HasPrefix(item.Key, "containerd-") || strings.HasPrefix(item.Key, "cni-") ||
   532  				strings.ContainsAny(item.Value, ",:") {
   533  				dataFile, err := os.CreateTemp("", "metadata")
   534  				if err != nil {
   535  					return "", fmt.Errorf("unable to create temp file %v", err)
   536  				}
   537  				defer os.Remove(dataFile.Name()) // clean up
   538  				if err = os.WriteFile(dataFile.Name(), []byte(item.Value), 0666); err != nil {
   539  					return "", fmt.Errorf("could not write contents of metadata item into file %v", err)
   540  				}
   541  				itemFileArgs = append(itemFileArgs, item.Key+"="+dataFile.Name())
   542  			} else {
   543  				itemArgs = append(itemArgs, item.Key+"="+item.Value)
   544  			}
   545  		}
   546  		if len(itemArgs) > 0 {
   547  			createArgs = append(createArgs, "--metadata="+strings.Join(itemArgs, ","))
   548  		}
   549  		if len(itemFileArgs) > 0 {
   550  			createArgs = append(createArgs, "--metadata-from-file="+strings.Join(itemFileArgs, ","))
   551  		}
   552  	}
   553  
   554  	if _, err := getGCEInstance(name); err != nil {
   555  		fmt.Printf("Running gcloud with parameters : %#v\n", createArgs)
   556  		_, err := runGCPCommandWithZone(createArgs...)
   557  		if err != nil {
   558  			fmt.Println(err)
   559  			return "", fmt.Errorf("failed to create instance in project %q: %w", *project, err)
   560  		}
   561  	}
   562  
   563  	instanceRunning := false
   564  	var instance *gceInstance
   565  	for i := 0; i < 30 && !instanceRunning; i++ {
   566  		if i > 0 {
   567  			time.Sleep(time.Second * 20)
   568  		}
   569  
   570  		instance, err := getGCEInstance(name)
   571  		if err != nil {
   572  			continue
   573  		}
   574  		if strings.ToUpper(instance.Status) != "RUNNING" {
   575  			_ = fmt.Errorf("instance %s not in state RUNNING, was %s", name, instance.Status)
   576  			continue
   577  		}
   578  		externalIP := g.getExternalIP(instance)
   579  		if len(externalIP) > 0 {
   580  			remote.AddHostnameIP(name, externalIP)
   581  		}
   582  
   583  		var output string
   584  		output, err = remote.SSH(name, "sh", "-c",
   585  			"'systemctl list-units  --type=service  --state=running | grep -e containerd -e crio'")
   586  		if err != nil {
   587  			_ = fmt.Errorf("instance %s not running containerd/crio daemon - Command failed: %s", name, output)
   588  			continue
   589  		}
   590  		if !strings.Contains(output, "containerd.service") &&
   591  			!strings.Contains(output, "crio.service") {
   592  			_ = fmt.Errorf("instance %s not running containerd/crio daemon: %s", name, output)
   593  			continue
   594  		}
   595  		instanceRunning = true
   596  	}
   597  	// If instance didn't reach running state in time, return with error now.
   598  	if err != nil {
   599  		return name, err
   600  	}
   601  	// Instance reached running state in time, make sure that cloud-init is complete
   602  	if g.isCloudInitUsed(imageConfig.metadata) {
   603  		cloudInitFinished := false
   604  		for i := 0; i < 60 && !cloudInitFinished; i++ {
   605  			if i > 0 {
   606  				time.Sleep(time.Second * 20)
   607  			}
   608  			var finished string
   609  			finished, err = remote.SSH(name, "ls", "/var/lib/cloud/instance/boot-finished")
   610  			if err != nil {
   611  				err = fmt.Errorf("instance %s has not finished cloud-init script: %s", name, finished)
   612  				continue
   613  			}
   614  			cloudInitFinished = true
   615  		}
   616  	}
   617  
   618  	// apply additional kernel arguments to the instance
   619  	if len(imageConfig.kernelArguments) > 0 {
   620  		klog.Info("Update kernel arguments")
   621  		if err := g.updateKernelArguments(instance, imageConfig.image, imageConfig.kernelArguments); err != nil {
   622  			return name, err
   623  		}
   624  	}
   625  
   626  	return name, err
   627  }
   628  
   629  func (g *GCERunner) isCloudInitUsed(metadata *gceMetadata) bool {
   630  	if metadata == nil {
   631  		return false
   632  	}
   633  	for _, item := range metadata.Items {
   634  		if item.Key == "user-data" && item.Value != "" && strings.HasPrefix(item.Value, "#cloud-config") {
   635  			return true
   636  		}
   637  	}
   638  	return false
   639  }
   640  
   641  func (g *GCERunner) imageToInstanceName(imageConfig *internalGCEImage) string {
   642  	if imageConfig.machine == "" {
   643  		return g.cfg.InstanceNamePrefix + "-" + imageConfig.image
   644  	}
   645  	// For benchmark test, node name has the format 'machine-image-uuid' to run
   646  	// different machine types with the same image in parallel
   647  	name := imageConfig.machine + "-" + imageConfig.image + "-" + uuid.New().String()[:8]
   648  	// Sometimes the image is too long, we need instance names to have a max length of 63
   649  	if len(name) > 63 {
   650  		return name[:63]
   651  	}
   652  	return name
   653  }
   654  
   655  func (g *GCERunner) registerGceHostIP(host string) error {
   656  	instance, err := getGCEInstance(host)
   657  	if err != nil {
   658  		return err
   659  	}
   660  	if strings.ToUpper(instance.Status) != "RUNNING" {
   661  		return fmt.Errorf("instance %s not in state RUNNING, was %s", host, instance.Status)
   662  	}
   663  	externalIP := g.getExternalIP(instance)
   664  	if len(externalIP) > 0 {
   665  		remote.AddHostnameIP(host, externalIP)
   666  	}
   667  	return nil
   668  }
   669  
   670  func (g *GCERunner) getExternalIP(instance *gceInstance) string {
   671  	for i := range instance.NetworkInterfaces {
   672  		ni := instance.NetworkInterfaces[i]
   673  		for j := range ni.AccessConfigs {
   674  			ac := ni.AccessConfigs[j]
   675  			if len(ac.NatIP) > 0 {
   676  				return ac.NatIP
   677  			}
   678  		}
   679  	}
   680  	return ""
   681  }
   682  
   683  func (g *GCERunner) updateKernelArguments(instance *gceInstance, image string, kernelArgs []string) error {
   684  	kernelArgsString := strings.Join(kernelArgs, " ")
   685  
   686  	var cmd []string
   687  	if strings.Contains(image, "cos") {
   688  		cmd = []string{
   689  			"dir=$(mktemp -d)",
   690  			"mount /dev/sda12 ${dir}",
   691  			fmt.Sprintf("sed -i -e \"s|cros_efi|cros_efi %s|g\" ${dir}/efi/boot/grub.cfg", kernelArgsString),
   692  			"umount ${dir}",
   693  			"rmdir ${dir}",
   694  		}
   695  	}
   696  
   697  	if strings.Contains(image, "ubuntu") {
   698  		cmd = []string{
   699  			fmt.Sprintf("echo \"GRUB_CMDLINE_LINUX_DEFAULT=%s ${GRUB_CMDLINE_LINUX_DEFAULT}\" > /etc/default/grub.d/99-additional-arguments.cfg", kernelArgsString),
   700  			"/usr/sbin/update-grub",
   701  		}
   702  	}
   703  
   704  	if len(cmd) == 0 {
   705  		klog.Warningf("The image %s does not support adding an additional kernel arguments", image)
   706  		return nil
   707  	}
   708  
   709  	out, err := remote.SSH(instance.Name, "sh", "-c", fmt.Sprintf("'%s'", strings.Join(cmd, "&&")))
   710  	if err != nil {
   711  		klog.Errorf("failed to run command %s: out: %s, Err: %v", cmd, out, err)
   712  		return err
   713  	}
   714  
   715  	if err := g.rebootInstance(instance); err != nil {
   716  		return err
   717  	}
   718  
   719  	return nil
   720  }
   721  
   722  func (g *GCERunner) machineType(machine string) string {
   723  	var ret string
   724  	if machine == "" && *instanceType != "" {
   725  		ret = *instanceType
   726  	} else if machine != "" {
   727  		ret = machine
   728  	} else {
   729  		ret = defaultGCEMachine
   730  	}
   731  	return ret
   732  }
   733  
   734  func (g *GCERunner) rebootInstance(instance *gceInstance) error {
   735  	// wait until the instance will not response to SSH
   736  	klog.Info("Reboot the node and wait for instance not to be available via SSH")
   737  	if waitErr := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) {
   738  		if _, err := remote.SSH(instance.Name, "reboot"); err != nil {
   739  			return true, nil
   740  		}
   741  
   742  		return false, nil
   743  	}); waitErr != nil {
   744  		return fmt.Errorf("the instance %s still response to SSH: %v", instance.Name, waitErr)
   745  	}
   746  
   747  	// wait until the instance will response again to SSH
   748  	klog.Info("Wait for instance to be available via SSH")
   749  	if waitErr := wait.PollImmediate(30*time.Second, 5*time.Minute, func() (bool, error) {
   750  		if _, err := remote.SSH(instance.Name, "sh", "-c", "date"); err != nil {
   751  			return false, nil
   752  		}
   753  		return true, nil
   754  	}); waitErr != nil {
   755  		return fmt.Errorf("the instance %s does not response to SSH: %v", instance.Name, waitErr)
   756  	}
   757  
   758  	return nil
   759  }
   760  

View as plain text