1# Copyright 2019 The Kubernetes Authors.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15<#
16.SYNOPSIS
17 Library for configuring Windows nodes and joining them to the cluster.
18
19.NOTES
20 This module depends on common.psm1.
21
22 Some portions copied / adapted from
23 https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1.
24
25.EXAMPLE
26 Suggested usage for dev/test:
27 [Net.ServicePointManager]::SecurityProtocol = `
28 [Net.SecurityProtocolType]::Tls12
29 Invoke-WebRequest `
30 https://github.com/kubernetes/kubernetes/raw/master/cluster/gce/windows/k8s-node-setup.psm1 `
31 -OutFile C:\k8s-node-setup.psm1
32 Invoke-WebRequest `
33 https://github.com/kubernetes/kubernetes/raw/master/cluster/gce/windows/configure.ps1 `
34 -OutFile C:\configure.ps1
35 Import-Module -Force C:\k8s-node-setup.psm1 # -Force to override existing
36 # Execute functions manually or run configure.ps1.
37#>
38
39# IMPORTANT PLEASE NOTE:
40# Any time the file structure in the `windows` directory changes, `windows/BUILD`
41# and `k8s.io/release/lib/releaselib.sh` must be manually updated with the changes.
42# We HIGHLY recommend not changing the file structure, because consumers of
43# Kubernetes releases depend on the release structure remaining stable.
44
45# TODO: update scripts for these style guidelines:
46# - Remove {} around variable references unless actually needed for clarity.
47# - Always use single-quoted strings unless actually interpolating variables
48# or using escape characters.
49# - Use "approved verbs":
50# https://docs.microsoft.com/en-us/powershell/developer/cmdlet/approved-verbs-for-windows-powershell-commands
51# - Document functions using proper syntax:
52# https://technet.microsoft.com/en-us/library/hh847834(v=wps.620).aspx
53
# Address of the GCE metadata server; used below to build route prefixes and
# metadata URLs.
$GCE_METADATA_SERVER = '169.254.169.254'
# Wildcard pattern (matched with -like) for the name of the "management"
# interface. The kubelet and Windows pods use this interface to talk to the
# rest of the Kubernetes cluster *without NAT*. It does not exist until an
# initial HNS network has been created on the Windows node - see
# Add_InitialHnsNetwork().
$MGMT_ADAPTER_NAME = 'vEthernet (Ethernet*'
# NOTE(review): presumably the pinned crictl release and the expected SHA-256
# of its download; the consumer is outside this part of the file.
$CRICTL_VERSION = 'v1.29.0'
$CRICTL_SHA256 = '9b679305cb05f73e9e4868056e7d48805c47e24d2d38849e64395ff54cf5c701'
62
63Import-Module -Force C:\common.psm1
64
# Logs $Message through Log-Output, prefixed with "TODO: ".
function Log_Todo {
  param (
    [parameter(Mandatory=$true)] [string]$Message
  )
  $text = 'TODO: ' + $Message
  Log-Output $text
}
72
# Logs a "Not implemented yet" message containing $Message through
# Log-Output with -Fatal, which is expected to end the script (Log-Output is
# defined outside this file).
function Log_NotImplemented {
  param (
    [parameter(Mandatory=$true)] [string]$Message
  )
  $text = 'Not implemented yet: ' + $Message
  Log-Output -Fatal $text
}
81
# Checks that an IPv4 route to the GCE metadata server (/32) exists. When the
# lookup fails with a CimJobException, logs the current IPv4 routing table
# with -Fatal. Emits nothing on success.
function Verify_GceMetadataServerRouteIsPresent {
  $route_query = @{
    ErrorAction       = 'Stop'
    AddressFamily     = 'IPv4'
    DestinationPrefix = "${GCE_METADATA_SERVER}/32"
  }
  Try {
    # -ErrorAction Stop turns a failed lookup into a catchable exception.
    $null = Get-NetRoute @route_query
  } Catch [Microsoft.PowerShell.Cmdletization.Cim.CimJobException] {
    $routing_table = Get-NetRoute -AddressFamily IPv4 | Out-String
    Log-Output -Fatal `
        ("GCE metadata server route is not present as expected.`n" +
         "${routing_table}")
  }
}
96
# Polls every 2 seconds for the IPv4 route to the GCE metadata server.
# Returns as soon as the lookup fails with a CimJobException (route is gone)
# or once 60 seconds of polling have elapsed.
function WaitFor_GceMetadataServerRouteToBeRemoved {
  $timeout = 60
  $poll_interval = 2
  Log-Output ("Waiting up to ${timeout} seconds for GCE metadata server " +
              "route to be removed")
  for ($waited = 0; $waited -lt $timeout; $waited += $poll_interval) {
    Try {
      Get-NetRoute `
          -ErrorAction "Stop" `
          -AddressFamily IPv4 `
          -DestinationPrefix "${GCE_METADATA_SERVER}/32" | Out-Null
    } Catch [Microsoft.PowerShell.Cmdletization.Cim.CimJobException] {
      # The route lookup failed: the route has been removed.
      break
    }
    Start-Sleep $poll_interval
  }
}
118
# Tries to add a /32 route to the GCE metadata server on every network
# adapter, ignoring failures on individual adapters.
function Add_GceMetadataServerRoute {
  # Before HNS is set up the VM has a "vEthernet (nat)" interface and an
  # "Ethernet" interface, with the metadata server route on "Ethernet". Once
  # the HNS network is added a "vEthernet (Ethernet)" interface appears and
  # seems to take over the routes of "Ethernet" (adding routes on "Ethernet"
  # at that point fails with "New-NetRoute : Element not found"). Since it is
  # hard to tell which adapter is the right one, the route is attempted on all
  # of them and errors are ignored.
  foreach ($adapter in Get-NetAdapter) {
    New-NetRoute `
        -ErrorAction Ignore `
        -DestinationPrefix "${GCE_METADATA_SERVER}/32" `
        -InterfaceIndex $adapter.InterfaceIndex | Out-Null
  }
}
137
# Returns a PSObject with Major, Minor, Build and Revision note properties
# read from the "Windows NT\CurrentVersion" registry key.
function Get_WindowsVersion {
  # Unlike `[System.Environment]::OSVersion.Version`, reading the registry
  # gets the OS revision/patch number correctly
  # (https://superuser.com/a/1160428/652018).
  $registry_key = 'Registry::HKEY_LOCAL_MACHINE\Software\Microsoft\Windows NT\CurrentVersion'
  # Pairs of (property name on the result, registry value name).
  $fields = @(
    @('Major', 'CurrentMajorVersionNumber'),
    @('Minor', 'CurrentMinorVersionNumber'),
    @('Build', 'CurrentBuild'),
    @('Revision', 'UBR')
  )

  $win_ver = New-Object -TypeName PSObject
  foreach ($field in $fields) {
    $property_name = $field[0]
    $value_name = $field[1]
    $value = (Get-ItemProperty -Path $registry_key $value_name).$value_name
    $win_ver | Add-Member -MemberType NoteProperty -Name $property_name -Value $value
  }
  return $win_ver
}
150
# Logs the Windows version, the installed hotfixes and the GCE image metadata
# for debugging. Best effort: any exception raised while gathering or logging
# is deliberately swallowed.
function Dump-DebugInfoToConsole {
  Try {
    # Gather everything first so nothing is logged if a lookup throws.
    $os_version = Get_WindowsVersion | Out-String
    $patches = Get-Hotfix | Out-String
    $gce_image = Get-InstanceMetadata 'image' | Out-String

    Log-Output ("Windows version:`n" + $os_version)
    Log-Output ("Installed hotfixes:`n" + $patches)
    Log-Output ("GCE Windows image:`n" + $gce_image)
  } Catch { }
}
163
# Configures Windows Defender preferences when the 'Windows-Defender' feature
# is installed: sample submission is set to NeverSend and MAPS reporting is
# disabled. The resulting preferences (Get-MpPreference) are emitted as output.
function Configure-WindowsDefender {
  $defender = Get-WindowsFeature -Name 'Windows-Defender'
  if (-not $defender.Installed) {
    return
  }

  Log-Output "Configuring Windows Defender preferences"
  Set-MpPreference -SubmitSamplesConsent NeverSend
  Log-Output "Disabling Windows Defender sample submission"
  Set-MpPreference -MAPSReporting Disabled
  Log-Output "Disabling Windows Defender Microsoft Active Protection Service Reporting"

  Log-Output "Defender Preferences"
  Get-MpPreference
}
177
# Converts the kube-env YAML string into a Hashtable.
#
# A record starts on a line whose first character is not whitespace; lines
# that start with whitespace are appended to the current record as-is. Each
# record is split on its first ':' into key and value, and the value is
# trimmed of surrounding spaces, single quotes and double quotes. Blank lines
# are ignored. A record that has no ':' is skipped with a warning instead of
# failing on a null value.
#
# Returns: a PowerShell Hashtable object containing the key-value pairs from
# kube-env.
function ConvertFrom_Yaml_KubeEnv {
  param (
    [parameter(Mandatory=$true)] [string]$kube_env_str
  )
  $kube_env_table = @{}

  # Splits one complete record and stores it in $kube_env_table. The table is
  # only indexed (never reassigned), so the update is visible to the caller.
  $store_record = {
    param ([string]$record)
    $key, $val = $record -split ':', 2
    if ($null -eq $val) {
      # The record content is not echoed: kube-env carries secrets.
      Write-Warning "Ignoring malformed kube-env entry (no ':' separator)"
      return
    }
    $kube_env_table[$key] = $val.Trim("'", ' ', '"')
  }

  $current_record = $null
  foreach ($line in (${kube_env_str} -split '\r?\n')) {
    if ($line -match '^\S') {
      # Start of a new record: flush the previous one, if any.
      if ($null -ne $current_record) {
        & $store_record $current_record
      }
      $current_record = $line
    }
    elseif ($line -match '^\s') {
      # Continuation line: belongs to the record currently being built.
      $current_record += $line
    }
  }

  # Flush the last record, if any.
  if ($null -ne $current_record) {
    & $store_record $current_record
  }

  return ${kube_env_table}
}
214
# Reads the 'kube-env' instance metadata attribute, parses it with
# ConvertFrom_Yaml_KubeEnv and logs every entry whose key contains neither
# "CERT" nor "KEY".
#
# Returns: a PowerShell Hashtable object containing the key-value pairs from
# kube-env.
function Fetch-KubeEnv {
  # For manual testing / debugging, the same steps can be run by hand:
  #   ${kube_env} = Get-InstanceMetadataAttribute 'kube-env'
  #     (or: ${kube_env} = [IO.File]::ReadAllText(".\kubeEnv.txt"))
  #   ${kube_env_table} = ConvertFrom_Yaml_KubeEnv ${kube_env}
  #   ${kube_env_table}
  #   ${kube_env_table}.GetType()

  # The raw kube-env is a PowerShell String.
  $raw_kube_env = Get-InstanceMetadataAttribute 'kube-env'
  $kube_env_table = ConvertFrom_Yaml_KubeEnv $raw_kube_env

  Log-Output "Logging kube-env key-value pairs except CERT and KEY values"
  foreach ($entry in $kube_env_table.GetEnumerator()) {
    $is_secret = $entry.Name.Contains("CERT") -or $entry.Name.Contains("KEY")
    if (-not $is_secret) {
      Log-Output "$($entry.Name): $($entry.Value)"
    }
  }
  return $kube_env_table
}
241
# Stores $Value under the environment variable $Key at the Machine target
# (visible to new shells after a reboot).
function Set_MachineEnvironmentVar {
  param (
    [parameter(Mandatory=$true)] [string]$Key,
    [parameter(Mandatory=$true)] [AllowEmptyString()] [string]$Value
  )
  $target = [System.EnvironmentVariableTarget]::Machine
  [System.Environment]::SetEnvironmentVariable($Key, $Value, $target)
}
251
# Sets the environment variable $Key to $Value in the current shell.
#
# The value is written through the Env: provider rather than by building a
# '$env:KEY = "VALUE"' statement for Invoke-Expression. This way a value that
# contains double quotes, '$' or backticks is stored verbatim instead of being
# re-parsed (and possibly executed) as PowerShell code.
function Set_CurrentShellEnvironmentVar {
  param (
    [parameter(Mandatory=$true)] [string]$Key,
    [parameter(Mandatory=$true)] [AllowEmptyString()] [string]$Value
  )
  # -LiteralPath keeps wildcard characters in $Key from being expanded.
  Set-Item -LiteralPath "Env:${Key}" -Value $Value
}
261
# Publishes values from ${kube_env} (read from the caller's scope), plus a few
# derived values, as environment variables. Each variable is logged and then
# set twice: at the Machine target and in the current shell.
function Set-EnvironmentVars {
  if (-not $kube_env.ContainsKey('WINDOWS_CONTAINER_RUNTIME_ENDPOINT')) {
    Log-Output "ERROR: WINDOWS_CONTAINER_RUNTIME_ENDPOINT not set in kube-env, falling back in CONTAINER_RUNTIME_ENDPOINT"
    $container_runtime_endpoint = $kube_env['CONTAINER_RUNTIME_ENDPOINT']
  } else {
    $container_runtime_endpoint = $kube_env['WINDOWS_CONTAINER_RUNTIME_ENDPOINT']
  }

  # Mirroring kube-env into environment variables is not strictly required,
  # but it eases debugging and simplifies the syntax elsewhere in this module:
  # ${env:K8S_DIR} can be expanded inside a string, whereas
  # ${kube_env}['K8S_DIR'] cannot (afaik).
  $env_vars = @{
    'K8S_DIR' = $kube_env['K8S_DIR']
    # Typically 'C:\etc\kubernetes\node\bin' (not just 'C:\etc\kubernetes\node')
    'NODE_DIR' = $kube_env['NODE_DIR']
    'CNI_DIR' = $kube_env['CNI_DIR']
    'CNI_CONFIG_DIR' = $kube_env['CNI_CONFIG_DIR']
    'WINDOWS_CNI_STORAGE_PATH' = $kube_env['WINDOWS_CNI_STORAGE_PATH']
    'WINDOWS_CNI_VERSION' = $kube_env['WINDOWS_CNI_VERSION']
    'CSI_PROXY_STORAGE_PATH' = $kube_env['CSI_PROXY_STORAGE_PATH']
    'CSI_PROXY_VERSION' = $kube_env['CSI_PROXY_VERSION']
    'CSI_PROXY_FLAGS' = $kube_env['CSI_PROXY_FLAGS']
    'ENABLE_CSI_PROXY' = $kube_env['ENABLE_CSI_PROXY']
    'PKI_DIR' = $kube_env['PKI_DIR']
    'CA_FILE_PATH' = $kube_env['CA_FILE_PATH']
    'KUBELET_CONFIG' = $kube_env['KUBELET_CONFIG_FILE']
    'BOOTSTRAP_KUBECONFIG' = $kube_env['BOOTSTRAP_KUBECONFIG_FILE']
    'KUBECONFIG' = $kube_env['KUBECONFIG_FILE']
    'KUBEPROXY_KUBECONFIG' = $kube_env['KUBEPROXY_KUBECONFIG_FILE']
    'LOGS_DIR' = $kube_env['LOGS_DIR']
    'MANIFESTS_DIR' = $kube_env['MANIFESTS_DIR']
    'INFRA_CONTAINER' = $kube_env['WINDOWS_INFRA_CONTAINER']
    'WINDOWS_ENABLE_PIGZ' = $kube_env['WINDOWS_ENABLE_PIGZ']
    'WINDOWS_ENABLE_HYPERV' = $kube_env['WINDOWS_ENABLE_HYPERV']
    'ENABLE_NODE_PROBLEM_DETECTOR' = $kube_env['ENABLE_NODE_PROBLEM_DETECTOR']
    'NODEPROBLEMDETECTOR_KUBECONFIG_FILE' = $kube_env['WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE']
    'ENABLE_AUTH_PROVIDER_GCP' = $kube_env['ENABLE_AUTH_PROVIDER_GCP']
    'AUTH_PROVIDER_GCP_STORAGE_PATH' = $kube_env['AUTH_PROVIDER_GCP_STORAGE_PATH']
    'AUTH_PROVIDER_GCP_VERSION' = $kube_env['AUTH_PROVIDER_GCP_VERSION']
    'AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64' = $kube_env['AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64']
    'AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR' = $kube_env['AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR']
    'AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE' = $kube_env['AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE']

    # The current shell's Path with NODE_DIR appended.
    'Path' = $env:Path + ';' + $kube_env['NODE_DIR']
    'KUBE_NETWORK' = 'l2bridge'
    'KUBELET_CERT_PATH' = $kube_env['PKI_DIR'] + '\kubelet.crt'
    'KUBELET_KEY_PATH' = $kube_env['PKI_DIR'] + '\kubelet.key'

    'CONTAINER_RUNTIME_ENDPOINT' = $container_runtime_endpoint

    'LICENSE_DIR' = 'C:\Program Files\Google\Compute Engine\THIRD_PARTY_NOTICES'
  }

  # Each variable is set permanently on the machine (only takes effect after a
  # reboot) and in the current shell.
  foreach ($env_var in $env_vars.GetEnumerator()) {
    Log-Output "Setting environment variable: $($env_var.Key) = $($env_var.Value)"
    Set_MachineEnvironmentVar $env_var.Key $env_var.Value
    Set_CurrentShellEnvironmentVar $env_var.Key $env_var.Value
  }
}
326
# Applies machine-wide prerequisites for the rest of this module and for the
# Kubernetes binaries: disables and stops the Windows Update service, selects
# TLS 1.2 for .NET web requests and configures Windows Defender.
function Set-PrerequisiteOptions {
  # Windows updates would otherwise reboot the node at arbitrary times.
  $update_service = 'wuauserv'
  Log-Output "Disabling Windows Update service"
  sc.exe config $update_service start=disabled
  sc.exe stop $update_service
  Write-VerboseServiceInfoToConsole -Service 'wuauserv' -Delay 1

  # TLS 1.2 is needed for Invoke-WebRequest downloads from github.com.
  $tls12 = [Net.SecurityProtocolType]::Tls12
  [Net.ServicePointManager]::SecurityProtocol = $tls12

  Configure-WindowsDefender
}
342
# Creates the directories that other functions in this module read from and
# write to: the K8S, NODE, LOGS, CNI, CNI_CONFIG, MANIFESTS, PKI and LICENSE
# directories named by the corresponding environment variables, plus C:\tmp
# and C:\var\log.
# Note: C:\tmp is required for running certain kubernetes tests.
#       C:\var\log is used by kubelet to store container logs and is also
#       hard-coded in the fluentd/stackdriver config for log collection.
#
# The paths are kept in one flat list and created one at a time, so an unset
# environment variable only affects its own entry rather than the whole batch.
# Each created directory object is emitted as output by mkdir.
function Create-Directories {
  Log-Output "Creating ${env:K8S_DIR} and its subdirectories."
  $directories = @(
    "${env:K8S_DIR}", "${env:NODE_DIR}", "${env:LOGS_DIR}",
    "${env:CNI_DIR}", "${env:CNI_CONFIG_DIR}", "${env:MANIFESTS_DIR}",
    "${env:PKI_DIR}", "${env:LICENSE_DIR}",
    'C:\tmp', 'C:\var\log'
  )
  foreach ($dir in $directories) {
    if ([string]::IsNullOrEmpty($dir)) {
      Log-Output "Warning: skipping directory with empty path (environment variable not set)"
      continue
    }
    mkdir -Force $dir
  }
}
356
# Downloads hns.psm1 into $env:K8S_DIR unless ShouldWrite-File reports that
# the file should not be (re)written.
function Download-HelperScripts {
  $hns_module = "${env:K8S_DIR}\hns.psm1"
  if (-not (ShouldWrite-File $hns_module)) {
    return
  }
  $download = @{
    OutFile = $hns_module
    URLs    = 'https://storage.googleapis.com/gke-release/winnode/config/sdn/master/hns.psm1'
  }
  MustDownload-File @download
}
366
# Downloads the node binary tarball named by kube-env's NODE_BINARY_TAR_URL
# (a comma-separated list of URLs), extracts it in a temporary directory and
# moves the binaries into $env:NODE_DIR and the LICENSES file into
# $env:LICENSE_DIR. The download is checked against NODE_BINARY_TAR_HASH when
# that key is present.
#
# Required ${kube_env} keys:
#   NODE_BINARY_TAR_URL
function DownloadAndInstall-KubernetesBinaries {
  # kubelet.exe already being present is taken to mean that the kubernetes
  # binaries were downloaded to this node earlier.
  $kubelet_path = "${env:NODE_DIR}\kubelet.exe"
  if (-not (ShouldWrite-File $kubelet_path)) {
    return
  }

  $tmp_dir = 'C:\k8s_tmp'
  New-Item -Force -ItemType 'directory' $tmp_dir | Out-Null

  $urls = $kube_env['NODE_BINARY_TAR_URL'].Split(",")
  # The archive is saved under the file name of the first URL.
  $archive = "$tmp_dir\$(Split-Path -leaf $urls[0])"
  $hash = if ($kube_env.ContainsKey('NODE_BINARY_TAR_HASH')) {
    $kube_env['NODE_BINARY_TAR_HASH']
  } else {
    $null
  }
  MustDownload-File -Hash $hash -OutFile $archive -URLs $urls

  tar xzvf $archive -C $tmp_dir
  Move-Item -Force "$tmp_dir\kubernetes\node\bin\*" "${env:NODE_DIR}\"
  Move-Item -Force `
      "$tmp_dir\kubernetes\LICENSES" "${env:LICENSE_DIR}\LICENSES_kubernetes"

  # Clean up the temporary directory
  Remove-Item -Force -Recurse $tmp_dir
}
398
# Downloads csi-proxy.exe from
# $env:CSI_PROXY_STORAGE_PATH/$env:CSI_PROXY_VERSION and places it in
# $env:NODE_DIR. Does nothing unless $env:ENABLE_CSI_PROXY is "true" and
# ShouldWrite-File allows writing the target file.
# Required ${kube_env} keys:
#   CSI_PROXY_STORAGE_PATH and CSI_PROXY_VERSION
function DownloadAndInstall-CSIProxyBinaries {
  if ("${env:ENABLE_CSI_PROXY}" -ne "true") {
    return
  }
  $filename = 'csi-proxy.exe'
  $install_path = "${env:NODE_DIR}\$filename"
  if (-not (ShouldWrite-File $install_path)) {
    return
  }

  $tmp_dir = 'C:\k8s_tmp'
  New-Item -Force -ItemType 'directory' $tmp_dir | Out-Null
  $download_path = "$tmp_dir\$filename"
  $urls = "${env:CSI_PROXY_STORAGE_PATH}/${env:CSI_PROXY_VERSION}/$filename"
  MustDownload-File -OutFile $download_path -URLs $urls
  Move-Item -Force $download_path $install_path
  # Clean up the temporary directory
  Remove-Item -Force -Recurse $tmp_dir
}
419
# Registers csi-proxy.exe as the 'csiproxy' Windows service (with a restart
# failure action) and starts it. Does nothing unless $env:ENABLE_CSI_PROXY is
# "true".
function Start-CSIProxy {
  if ("${env:ENABLE_CSI_PROXY}" -ne "true") {
    return
  }

  Log-Output "Creating CSI Proxy Service"
  $flags = "-windows-service -log_file=${env:LOGS_DIR}\csi-proxy.log -logtostderr=false ${env:CSI_PROXY_FLAGS}"
  $service_command = "${env:NODE_DIR}\csi-proxy.exe $flags"
  sc.exe create csiproxy binPath= $service_command
  sc.exe failure csiproxy reset= 0 actions= restart/10000

  Log-Output "Starting CSI Proxy Service"
  sc.exe start csiproxy
  Write-VerboseServiceInfoToConsole -Service 'csiproxy' -Delay 1
}
431
# Converts an IP address to its UInt32 value, treating the first address byte
# as the most significant (e.g. 10.0.0.1 -> 167772161).
#
# TODO(pjh): this is copied from
# https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L98.
# See if there's a way to fetch or construct the "management subnet" so that
# this is not needed.
function ConvertTo_DecimalIP {
  param(
    [parameter(Mandatory = $true, Position = 0)]
    [Net.IPAddress] $IPAddress
  )

  $octets = $IPAddress.GetAddressBytes()
  $total = 0
  for ($index = 0; $index -lt $octets.Length; $index++) {
    # Byte 0 is weighted by 256^3, byte 1 by 256^2, and so on.
    $total += $octets[$index] * [Math]::Pow(256, 3 - $index)
  }
  return [UInt32]$total
}
449
# Converts a UInt32 address value to its dotted-decimal string, most
# significant byte first (e.g. 167772161 -> "10.0.0.1").
#
# TODO(pjh): this is copied from
# https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L98.
# See if there's a way to fetch or construct the "management subnet" so that
# this is not needed.
function ConvertTo_DottedDecimalIP {
  param(
    [parameter(Mandatory = $true, Position = 0)]
    [Uint32] $IPAddress
  )

  $octets = foreach ($position in 3, 2, 1, 0) {
    # Drop the lower-order bytes, then keep only the byte at this position.
    $place_value = [Math]::Pow(256, $position)
    [Math]::Floor($IPAddress / $place_value) % 256
  }
  return [String]::Join(".", $octets)
}
468
# Returns the number of 1 bits in $SubnetMask, i.e. the prefix length of a
# subnet mask (e.g. 255.255.255.0 -> 24).
#
# TODO(pjh): this is copied from
# https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L98.
# See if there's a way to fetch or construct the "management subnet" so that
# this is not needed.
function ConvertTo_MaskLength {
  param(
    [parameter(Mandatory = $True, Position = 0)]
    [Net.IPAddress] $SubnetMask
  )

  $set_bits = 0
  foreach ($octet in $SubnetMask.GetAddressBytes()) {
    # Binary text of the byte with every '0' removed leaves one char per 1 bit.
    $ones = [Convert]::ToString($octet, 2) -replace '0'
    $set_bits += $ones.Length
  }
  return $set_bits
}
485
# Returns the network adapter(s) whose name matches ${MGMT_ADAPTER_NAME}: the
# "management" interface through which the Windows pods and the kubelet talk
# to the rest of the Kubernetes cluster. Throws when no adapter matches.
#
# This function will fail if Add_InitialHnsNetwork() has not been called first.
function Get_MgmtNetAdapter {
  $mgmt_adapter = Get-NetAdapter |
      Where-Object { $_.Name -like $MGMT_ADAPTER_NAME }
  if ($mgmt_adapter) {
    return $mgmt_adapter
  }

  Throw ("Failed to find a suitable network adapter, check your network " +
         "settings.")
}
499
# Decodes the base64 string $Data and writes the resulting bytes to $File.
# Does nothing when ShouldWrite-File reports that $File should not be written.
function Write_PkiData {
  param (
    [parameter(Mandatory=$true)] [string] $Data,
    [parameter(Mandatory=$true)] [string] $File
  )

  if (ShouldWrite-File $File) {
    # Writes out a PEM certificate file, analogous to "base64 --decode" on
    # Linux. See https://stackoverflow.com/a/51914136/1230197.
    $decoded = [Convert]::FromBase64String($Data)
    [IO.File]::WriteAllBytes($File, $decoded)

    # On Linux these files are owned by root and rw by user only, e.g.:
    #   -rw------- 1 root root 1.2K Oct 12 00:56 ca-certificates.crt
    #   -rw------- 1 root root 1.3K Oct 12 00:56 kubelet.crt
    #   -rw------- 1 root root 1.7K Oct 12 00:56 kubelet.key
    # Possible starting point for Windows:
    #   https://docs.microsoft.com/en-us/dotnet/api/system.io.fileattributes
    Log_Todo ("need to set permissions correctly on ${File}; not sure what the " +
              "Windows equivalent of 'umask 077' is")
  }
}
525
# Writes the node PKI files from ${kube_env} (read from the caller's scope)
# and then lists $env:PKI_DIR as output:
#   CA_CERT      -> $env:CA_FILE_PATH
#   KUBELET_CERT -> $env:KUBELET_CERT_PATH
#   KUBELET_KEY  -> $env:KUBELET_KEY_PATH
#
# All three ${kube_env} keys are treated as required: a missing key is
# reported through Log-Output -Fatal.
function Create-NodePki {
  Log-Output 'Creating node pki files'

  # Pairs of (kube-env key, destination file), processed in this order.
  $pki_files = @(
    @('CA_CERT', ${env:CA_FILE_PATH}),
    @('KUBELET_CERT', ${env:KUBELET_CERT_PATH}),
    @('KUBELET_KEY', ${env:KUBELET_KEY_PATH})
  )
  foreach ($pki_file in $pki_files) {
    $env_key = $pki_file[0]
    $destination = $pki_file[1]
    if ($kube_env.ContainsKey($env_key)) {
      Write_PkiData "$($kube_env[$env_key])" $destination
    }
    else {
      Log-Output -Fatal "${env_key} not present in kube-env"
    }
  }

  Get-ChildItem ${env:PKI_DIR}
}
562
# Creates the bootstrap kubelet kubeconfig at $env:BOOTSTRAP_KUBECONFIG and
# logs its content. Does nothing when ShouldWrite-File reports that the file
# should not be written.
# https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet-tls-bootstrapping/
#
# The template below is YAML, so the nesting of its lines is significant:
# keep the two-space indentation steps intact when editing it.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   KUBERNETES_MASTER_NAME: the apiserver IP address.
function Write_BootstrapKubeconfig {
  if (-not (ShouldWrite-File ${env:BOOTSTRAP_KUBECONFIG})) {
    return
  }

  # TODO(mtaufen): is user "kubelet" correct? Other examples use e.g.
  #   "system:node:$(hostname)".

  $apiserverAddress = ${kube_env}['KUBERNETES_MASTER_NAME']
  New-Item -Force -ItemType file ${env:BOOTSTRAP_KUBECONFIG} | Out-Null
  # The upper-case placeholders in the template are substituted by the
  # replace() calls that follow it.
  Set-Content ${env:BOOTSTRAP_KUBECONFIG} `
'apiVersion: v1
kind: Config
users:
- name: kubelet
  user:
    client-certificate: KUBELET_CERT_PATH
    client-key: KUBELET_KEY_PATH
clusters:
- name: local
  cluster:
    server: https://APISERVER_ADDRESS
    certificate-authority: CA_FILE_PATH
contexts:
- context:
    cluster: local
    user: kubelet
  name: service-account-context
current-context: service-account-context'.`
    replace('KUBELET_CERT_PATH', ${env:KUBELET_CERT_PATH}).`
    replace('KUBELET_KEY_PATH', ${env:KUBELET_KEY_PATH}).`
    replace('APISERVER_ADDRESS', ${apiserverAddress}).`
    replace('CA_FILE_PATH', ${env:CA_FILE_PATH})
  Log-Output ("kubelet bootstrap kubeconfig:`n" +
              "$(Get-Content -Raw ${env:BOOTSTRAP_KUBECONFIG})")
}
606
# Fetches the 'kubeconfig' instance metadata attribute, writes it to
# $env:KUBECONFIG and logs the written content. Does nothing when
# ShouldWrite-File reports that the file should not be written. A missing
# metadata value is reported through Log-Output -Fatal.
#
# Create-NodePki() must be called first.
function Write_KubeconfigFromMetadata {
  if (-not (ShouldWrite-File ${env:KUBECONFIG})) {
    return
  }

  $kubeconfig = Get-InstanceMetadataAttribute 'kubeconfig'
  # $null goes on the left: with a collection on the left-hand side -eq
  # filters the elements instead of testing the value itself for null.
  if ($null -eq $kubeconfig) {
    Log-Output `
        "kubeconfig metadata key not found, can't write ${env:KUBECONFIG}" `
        -Fatal
  }
  Set-Content ${env:KUBECONFIG} $kubeconfig
  Log-Output ("kubelet kubeconfig from metadata (non-bootstrap):`n" +
              "$(Get-Content -Raw ${env:KUBECONFIG})")
}
626
# Creates the kubelet kubeconfig. This currently always delegates to
# Write_BootstrapKubeconfig, i.e. it writes the bootstrap kubeconfig at
# $env:BOOTSTRAP_KUBECONFIG; no other kubeconfig variant is written here.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   KUBERNETES_MASTER_NAME: the apiserver IP address.
function Create-KubeletKubeconfig {
  Write_BootstrapKubeconfig
}
637
# Creates a token-based kubeconfig user file at $Path for an application that
# communicates with Kubernetes, then logs the written content. Does nothing
# when ShouldWrite-File reports that $Path should not be written.
#
# Parameters:
#   Name:  user name written into the kubeconfig.
#   Path:  destination file.
#   Token: bearer token written into the kubeconfig.
#
# The template below is YAML, so the nesting of its lines is significant:
# keep the two-space indentation steps intact when editing it.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   CA_CERT
#   KUBERNETES_MASTER_NAME
function Create-Kubeconfig {
  param (
    [parameter(Mandatory=$true)] [string]$Name,
    [parameter(Mandatory=$true)] [string]$Path,
    [parameter(Mandatory=$true)] [string]$Token
  )
  if (-not (ShouldWrite-File $Path)) {
    return
  }

  New-Item -Force -ItemType file $Path | Out-Null

  # In configure-helper.sh kubelet kubeconfig uses certificate-authority while
  # kubeproxy kubeconfig uses certificate-authority-data, ugh. Does it matter?
  # Use just one or the other for consistency?
  #
  # The upper-case placeholders in the template are substituted by the
  # replace() calls that follow it.
  Set-Content $Path `
'apiVersion: v1
kind: Config
users:
- name: APP_NAME
  user:
    token: APP_TOKEN
clusters:
- name: local
  cluster:
    server: https://APISERVER_ADDRESS
    certificate-authority-data: CA_CERT
contexts:
- context:
    cluster: local
    user: APP_NAME
  name: service-account-context
current-context: service-account-context'.`
    replace('APP_NAME', $Name).`
    replace('APP_TOKEN', $Token).`
    replace('CA_CERT', ${kube_env}['CA_CERT']).`
    replace('APISERVER_ADDRESS', ${kube_env}['KUBERNETES_MASTER_NAME'])

  Log-Output ("${Name} kubeconfig:`n" +
              "$(Get-Content -Raw ${Path})")
}
686
# Creates the kube-proxy user kubeconfig at $env:KUBEPROXY_KUBECONFIG by
# calling Create-Kubeconfig with the name 'kube-proxy' and the
# KUBE_PROXY_TOKEN from ${kube_env}.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   CA_CERT
#   KUBE_PROXY_TOKEN
function Create-KubeproxyKubeconfig {
  $kubeconfig_args = @{
    Name  = 'kube-proxy'
    Path  = ${env:KUBEPROXY_KUBECONFIG}
    Token = $kube_env['KUBE_PROXY_TOKEN']
  }
  Create-Kubeconfig @kubeconfig_args
}
699
# Queries the GCE metadata server for the first IP alias range of the first
# network interface of this instance and returns the response with
# surrounding whitespace trimmed.
function Get_IpAliasRange {
  $metadata_path = 'computeMetadata/v1/instance/network-interfaces/0/ip-aliases/0'
  $url = "http://${GCE_METADATA_SERVER}/${metadata_path}"

  $web_client = New-Object Net.WebClient
  # Request header sent with every metadata query.
  $web_client.Headers.Add('Metadata-Flavor', 'Google')
  return $web_client.DownloadString($url).Trim()
}
708
# Fetches the pod CIDR (the instance's IP alias range), retrying once per
# second while the fetch reports failure, then stores it in POD_CIDR both at
# the Machine target and in the current shell.
function Set-PodCidr {
  do {
    $pod_cidr = Get_IpAliasRange
    # $? must be read immediately after the call it describes.
    $fetched = $?
    if (-not $fetched) {
      Log-Output ${pod_cidr}
      Log-Output "Retrying Get_IpAliasRange..."
      Start-Sleep -sec 1
    }
  } until ($fetched)

  Log-Output "fetched pod CIDR (same as IP alias range): ${pod_cidr}"
  Set_MachineEnvironmentVar "POD_CIDR" ${pod_cidr}
  Set_CurrentShellEnvironmentVar "POD_CIDR" ${pod_cidr}
}
726
# Adds the initial 'External' HNS network on the Windows node. This forces the
# creation of a virtual switch and of the "management" interface used to reach
# the rest of the Kubernetes cluster without NAT.
#
# When the network already exists it is left alone, unless $REDO_STEPS is set,
# in which case it is removed and created again. The object returned by
# New-HNSNetwork is emitted as output.
#
# Note that adding the initial HNS network may cause connectivity to the GCE
# metadata server to be lost due to a Windows bug.
# Configure-HostNetworkingService() restores connectivity, look there for
# details.
#
# Download-HelperScripts() must have been called first.
function Add_InitialHnsNetwork {
  $INITIAL_HNS_NETWORK = 'External'

  # This comes from
  # https://github.com/Microsoft/SDN/blob/master/Kubernetes/flannel/l2bridge/start.ps1#L74
  # (or
  # https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L206).
  #
  # daschott noted on Slack: "L2bridge networks require an external vSwitch.
  # The first network ("External") with hardcoded values in the script is just
  # a placeholder to create an external vSwitch. This is purely for convenience
  # to be able to remove/modify the actual HNS network ("cbr0") or rejoin the
  # nodes without a network blip. Creating a vSwitch takes time, causes network
  # blips, and it makes it more likely to hit the issue where flanneld is
  # stuck, so we want to do this as rarely as possible."
  $existing_network = Get-HnsNetwork |
      Where-Object { $_.Name -eq $INITIAL_HNS_NETWORK }
  if ($existing_network -and -not $REDO_STEPS) {
    Log-Output ("Skip: initial '$INITIAL_HNS_NETWORK' HNS network " +
                "already exists, not recreating it")
    return
  }
  if ($existing_network) {
    Log-Output ("Warning: initial '$INITIAL_HNS_NETWORK' HNS network " +
                "already exists, removing it and recreating it")
    $existing_network | Remove-HnsNetwork
  }

  Log-Output ("Creating initial HNS network to force creation of " +
              "${MGMT_ADAPTER_NAME} interface")
  # Note: RDP connection will hiccup when running this command.
  $network_settings = @{
    Type          = "L2Bridge"
    AddressPrefix = "192.168.255.0/30"
    Gateway       = "192.168.255.1"
    Name          = $INITIAL_HNS_NETWORK
    Verbose       = $true
  }
  New-HNSNetwork @network_settings
}
776
# Returns the network address of $cidr ("a.b.c.d/len") as a UInt32.
#
# The prefix length is applied as a mask, so host bits in the address part are
# cleared (e.g. "10.4.5.7/24" yields the value of 10.4.5.0). For a CIDR that
# is already aligned to its prefix the result is simply the value of the
# address part. When $cidr has no "/len" suffix the address value is returned
# unchanged. Throws when the prefix length is outside 0..32.
function Get_NetworkDecimal_From_CIDR([string] $cidr) {
  $parts = $cidr.Split('/')
  $address = ConvertTo_DecimalIP $parts[0]
  if ($parts.Count -lt 2) {
    return $address
  }

  [int]$subnetlen = $parts[1]
  if ($subnetlen -lt 0 -or $subnetlen -gt 32) {
    Throw "Invalid prefix length in CIDR '${cidr}'"
  }
  # Number of addresses in the block; the network address is the largest
  # multiple of it that does not exceed the given address.
  $block_size = [Math]::Pow(2, 32 - $subnetlen)
  return [UInt32]($address - ($address % $block_size))
}
783
# Returns the pod gateway IP as a dotted-decimal string: the first address
# after the network value computed for $cidr (network + 1).
# For Windows nodes the pod gateway IP address is the first address in the pod
# CIDR for the host.
function Get_Gateway_From_CIDR([string] $cidr) {
  $first_address = (Get_NetworkDecimal_From_CIDR $cidr) + 1
  return ConvertTo_DottedDecimalIP $first_address
}
792
# Returns the endpoint gateway IP as a dotted-decimal string: the second
# address after the network value computed for $cidr (network + 2).
# For Windows nodes the pod gateway IP address is the first address in the pod
# CIDR for the host, but from inside containers it's the second address.
function Get_Endpoint_Gateway_From_CIDR([string] $cidr) {
  $second_address = (Get_NetworkDecimal_From_CIDR $cidr) + 2
  return ConvertTo_DottedDecimalIP $second_address
}
801
# Returns the start of the pod IP range as a dotted-decimal string: the third
# address after the network value computed for $cidr (network + 3).
# The first two addresses of the range are reserved for gateways, so the range
# starts at the third one to keep IPAM from allocating those IPs to pods.
function Get_PodIP_Range_Start([string] $cidr) {
  $third_address = (Get_NetworkDecimal_From_CIDR $cidr) + 3
  return ConvertTo_DottedDecimalIP $third_address
}
810
# Sets up HNS (Host Networking Service) on the Windows node so that Kubernetes
# networking works:
#   - Creates the "management" interface associated with an initial HNS
#     network.
#   - Creates the HNS network $env:KUBE_NETWORK for pod networking.
#   - Creates an HNS endpoint for pod networking.
#   - Adds necessary routes on the management interface.
#   - Verifies that the GCE metadata server connection remains intact.
#
# Prerequisites:
#   $env:POD_CIDR is set (by Set-PodCidr).
#   Download-HelperScripts() has been called.
function Configure-HostNetworkingService {
  Import-Module -Force ${env:K8S_DIR}\hns.psm1

  Add_InitialHnsNetwork

  $gateway_ip = Get_Gateway_From_CIDR ${env:POD_CIDR}
  $endpoint_gateway_ip = Get_Endpoint_Gateway_From_CIDR ${env:POD_CIDR}
  Log-Output ('Setting up Windows node HNS networking: ' +
              "podCidr = ${env:POD_CIDR}, podGateway = ${gateway_ip}, " +
              "podEndpointGateway = ${endpoint_gateway_ip}")

  # Reuse an existing pod network unless we were asked to redo all steps.
  $pod_network = Get-HnsNetwork | Where-Object Name -eq ${env:KUBE_NETWORK}
  if ($pod_network -and $REDO_STEPS) {
    Log-Output ("Warning: ${env:KUBE_NETWORK} HNS network already exists, " +
                'removing it and recreating it')
    $pod_network | Remove-HnsNetwork
    $pod_network = $null
  }
  elseif ($pod_network) {
    Log-Output "Skip: ${env:KUBE_NETWORK} HNS network already exists"
  }

  # Remember whether the network is created here; the metadata server route
  # only needs to be repaired in that case (see below).
  $network_is_new = -not $pod_network
  if ($network_is_new) {
    # Note: RDP connection will hiccup when running this command.
    $network_params = @{
      Type          = 'L2Bridge'
      AddressPrefix = ${env:POD_CIDR}
      Gateway       = $gateway_ip
      Name          = ${env:KUBE_NETWORK}
      Verbose       = $true
    }
    $pod_network = New-HNSNetwork @network_params
  }

  # This name of endpoint is referred in pkg/proxy/winkernel/proxier.go as
  # part of kube-proxy as well. A health check port for every service that is
  # specified as "externalTrafficPolicy: local" will be added on the endpoint.
  # PLEASE KEEP THEM CONSISTENT!!!
  $endpoint_name = 'cbr0'
  $vnic_name = "vEthernet (${endpoint_name})"

  # Note: we don't expect to ever find an existing endpoint currently - while
  # the HNS network does seem to persist across reboots, the HNS endpoints do
  # not.
  $pod_endpoint = Get-HnsEndpoint | Where-Object Name -eq $endpoint_name
  if ($pod_endpoint -and $REDO_STEPS) {
    Log-Output ("Warning: HNS endpoint $endpoint_name already exists, " +
                'removing it and recreating it')
    $pod_endpoint | Remove-HnsEndpoint
    $pod_endpoint = $null
  }
  elseif ($pod_endpoint) {
    Log-Output "Skip: HNS endpoint $endpoint_name already exists"
  }

  if (-not $pod_endpoint) {
    $endpoint_params = @{
      NetworkId = $pod_network.Id
      Name      = $endpoint_name
      IPAddress = $endpoint_gateway_ip
      Gateway   = '0.0.0.0'
      Verbose   = $true
    }
    $pod_endpoint = New-HnsEndpoint @endpoint_params
    # TODO(pjh): find out: why is this always CompartmentId 1?
    Attach-HnsHostEndpoint -EndpointID $pod_endpoint.Id -CompartmentID 1 -Verbose
    netsh interface ipv4 set interface "${vnic_name}" forwarding=enabled
  }

  # Best effort: failures while clearing the HNS policy lists are ignored.
  Try {
    Get-HNSPolicyList | Remove-HnsPolicyList
  } Catch { }

  # Add a route from the management NIC to the pod CIDR.
  #
  # When a packet from a Kubernetes service backend arrives on the destination
  # Windows node, the reverse SNAT will be applied and the source address of
  # the packet gets replaced from the pod IP to the service VIP. The packet
  # will then leave the VM and return back through hairpinning.
  #
  # When IP alias is enabled, IP forwarding is disabled for anti-spoofing;
  # the packet with the service VIP will get blocked and be lost. With this
  # route, the packet will be routed to the pod subnetwork, and not leave the
  # VM.
  $mgmt_adapter = Get_MgmtNetAdapter
  $route_params = @{
    ErrorAction       = 'Ignore'
    InterfaceAlias    = $mgmt_adapter.ifAlias
    DestinationPrefix = ${env:POD_CIDR}
    NextHop           = '0.0.0.0'
    Verbose           = $true
  }
  New-NetRoute @route_params

  if ($network_is_new) {
    # There is an HNS bug where the route to the GCE metadata server will be
    # removed when the HNS network is created:
    # https://github.com/Microsoft/hcsshim/issues/299#issuecomment-425491610.
    # The behavior here is very unpredictable: the route may only be removed
    # after some delay, or it may appear to be removed then you'll add it back
    # but then it will be removed once again. So, we first wait a long
    # unfortunate amount of time to ensure that things have quiesced, then we
    # wait until we're sure the route is really gone before re-adding it again.
    Log-Output 'Waiting 45 seconds for host network state to quiesce'
    Start-Sleep 45
    WaitFor_GceMetadataServerRouteToBeRemoved
    Log-Output 'Re-adding the GCE metadata server route'
    Add_GceMetadataServerRoute
  }
  Verify_GceMetadataServerRouteIsPresent

  Log-Output 'Host network setup complete'
}
934
# Installs the GetGcePdName PowerShell module and registers it in the
# all-users PowerShell profile ($PsHome\profile.ps1).
#
# - Downloads GetGcePdName.dll into $env:K8S_DIR when ShouldWrite-File says
#   the file should be (re)written.
# - Creates $PsHome\profile.ps1 if it does not exist yet.
# - Appends the lines that unblock and import the module to the profile.
#   The lines are only appended when the profile does not already reference
#   the module path, so calling this function repeatedly does not keep
#   growing the profile with duplicate import statements.
function Configure-GcePdTools {
  if (ShouldWrite-File ${env:K8S_DIR}\GetGcePdName.dll) {
    MustDownload-File -OutFile ${env:K8S_DIR}\GetGcePdName.dll `
      -URLs "https://storage.googleapis.com/gke-release/winnode/config/gce-tools/master/GetGcePdName/GetGcePdName.dll"
  }

  $profile_path = "$PsHome\profile.ps1"
  if (-not (Test-Path $profile_path)) {
    New-Item -path $profile_path -type file
  }

  $module_path = "${env:K8S_DIR}\GetGcePdName.dll"
  # -SimpleMatch: the path contains backslashes and must not be treated as a
  # regular expression.
  $already_registered = Select-String -Path $profile_path -SimpleMatch `
      -Pattern $module_path -Quiet
  if ($already_registered) {
    Log-Output "Skip: GetGcePdName module already registered in ${profile_path}"
    return
  }

  Add-Content $profile_path @(
    ('$modulePath = "' + $module_path + '"'),
    'Unblock-File $modulePath',
    'Import-Module -Name $modulePath'
  )
}
949
# Prepares CNI networking for containerd by delegating to
# Configure_Containerd_CniNetworking.
function Prepare-CniNetworking {
  Configure_Containerd_CniNetworking
}
954
# Captures the host DNS configuration and writes it to
# $env:CNI_DIR\hostdns.conf so that kubelet/CNI can use it to configure the
# DNS suffix search list for pods.
#
# The file uses the same format as a Linux resolv.conf: one "nameserver" line
# per DNS server of the management adapter, followed by a "search" line. The
# DNS server values are currently ignored because pods always use the cluster
# DNS service; they are written anyway for consistency.
#
# This function must be called after Configure-HostNetworkingService.
function Configure-HostDnsConf {
  $mgmt_adapter = Get_MgmtNetAdapter
  $dns_client_addresses = Get-DnsClientServerAddress `
      -InterfaceAlias $mgmt_adapter.Name
  $dns_servers = $dns_client_addresses.ServerAddresses
  $suffix_search_list = (Get-DnsClient).ConnectionSpecificSuffixSearchList

  $resolv_text = ''
  foreach ($server in $dns_servers) {
    $resolv_text += "nameserver $server`r`n"
  }
  $resolv_text += "search $suffix_search_list"

  # Do not put hostdns.conf into the CNI config directory so as to
  # avoid the container runtime treating it as CNI config.
  $target_file = "${env:CNI_DIR}\hostdns.conf"
  New-Item -Force -ItemType file $target_file | Out-Null
  Set-Content $target_file $resolv_text
  Log-Output "HOST dns conf:`n$(Get-Content -Raw $target_file)"
}
978
# Writes the kubelet configuration to $env:KUBELET_CONFIG.
#
# The Kubelet config is built by build-kubelet-config() in
# cluster/gce/util.sh and stored in the metadata server under the
# 'kubelet-config' key; it is fetched from the instance metadata here. Nothing
# is written when ShouldWrite-File reports that the file should be kept.
function Configure-Kubelet {
  if (ShouldWrite-File ${env:KUBELET_CONFIG}) {
    $config_text = Get-InstanceMetadataAttribute 'kubelet-config'
    Set-Content ${env:KUBELET_CONFIG} $config_text
    Log-Output "Kubelet config:`n$(Get-Content -Raw ${env:KUBELET_CONFIG})"
  }
}
993
# Sets up the kubelet and kube-proxy arguments and starts them as native
# Windows services.
#
# The argument lists are composed of defaults computed here followed by the
# arguments supplied through kube-env. Both services are wrapped in
# kube-log-runner.exe, which redirects their output to files in
# $env:LOGS_DIR. The function is fatal (via Log-Output -Fatal) when a kubelet
# or kube-proxy process is already running.
#
# Required ${kube_env} keys:
#   KUBELET_ARGS
#   KUBEPROXY_ARGS
#   CLUSTER_IP_RANGE
function Start-WorkerServices {
  # Compute kubelet args
  $kubelet_args_str = ${kube_env}['KUBELET_ARGS']
  $kubelet_args = $kubelet_args_str.Split(" ")
  Log-Output "kubelet_args from metadata: ${kubelet_args}"

  # To join GCE instances to AD, we need to shorten their names, as NetBIOS name
  # must be <= 15 characters, and GKE generated names are longer than that.
  # To perform the join in an automated way, it's preferable to apply the rename
  # and domain join in the GCESysprep step. However, after sysprep is complete
  # and the machine restarts, kubelet bootstrapping should not use the shortened
  # computer name, and instead use the instance's name by using --hostname-override,
  # otherwise kubelet and kube-proxy will not be able to run properly.
  #
  # Out-String appends a trailing newline to its output; trim it so that the
  # newline does not end up inside the --hostname-override value and in the
  # middle of the service command lines built below.
  $instance_name = "$(Get-InstanceMetadata 'name' | Out-String)".Trim()
  $default_kubelet_args = @(`
      "--pod-infra-container-image=${env:INFRA_CONTAINER}",
      "--hostname-override=${instance_name}"
  )

  $kubelet_args = ${default_kubelet_args} + ${kubelet_args}
  Log-Output 'Using bootstrap kubeconfig for authentication'
  $kubelet_args = (${kubelet_args} +
      "--bootstrap-kubeconfig=${env:BOOTSTRAP_KUBECONFIG}")
  Log-Output "Final kubelet_args: ${kubelet_args}"

  # Compute kube-proxy args
  $kubeproxy_args_str = ${kube_env}['KUBEPROXY_ARGS']
  $kubeproxy_args = $kubeproxy_args_str.Split(" ")
  Log-Output "kubeproxy_args from metadata: ${kubeproxy_args}"

  # kubeproxy is started on Linux nodes using
  # kube-manifests/kubernetes/gci-trusty/kube-proxy.manifest, which is
  # generated by start-kube-proxy in configure-helper.sh and contains e.g.:
  #   kube-proxy --master=https://35.239.84.171
  #   --kubeconfig=/var/lib/kube-proxy/kubeconfig --cluster-cidr=10.64.0.0/14
  #   --oom-score-adj=-998 --v=2
  #   --iptables-sync-period=1m --iptables-min-sync-period=10s
  #   --ipvs-sync-period=1m --ipvs-min-sync-period=10s
  # And also with various volumeMounts and "securityContext: privileged: true".
  $default_kubeproxy_args = @(`
      "--kubeconfig=${env:KUBEPROXY_KUBECONFIG}",
      "--cluster-cidr=$(${kube_env}['CLUSTER_IP_RANGE'])",
      "--hostname-override=${instance_name}"
  )

  $kubeproxy_args = ${default_kubeproxy_args} + ${kubeproxy_args}
  Log-Output "Final kubeproxy_args: ${kubeproxy_args}"

  # TODO(pjh): kubelet is emitting these messages:
  # I1023 23:44:11.761915    2468 kubelet.go:274] Adding pod path:
  # C:\etc\kubernetes
  # I1023 23:44:11.775601    2468 file.go:68] Watching path
  # "C:\\etc\\kubernetes"
  # ...
  # E1023 23:44:31.794327    2468 file.go:182] Can't process manifest file
  # "C:\\etc\\kubernetes\\hns.psm1": C:\etc\kubernetes\hns.psm1: couldn't parse
  # as pod(yaml: line 10: did not find expected <document start>), please check
  # config file.
  #
  # Figure out how to change the directory that the kubelet monitors for new
  # pod manifests.

  # We configure the service to restart on failure, after 10s wait. We reset
  # the restart count to 0 each time, so we re-use our restart/10000 action on
  # each failure. Note it currently restarts even when explicitly stopped, you
  # have to delete the service entry to *really* kill it (e.g. `sc.exe delete
  # kubelet`). See issue #72900.
  if (Get-Process | Where-Object Name -eq "kubelet") {
    Log-Output -Fatal `
        "A kubelet process is already running, don't know what to do"
  }
  Log-Output "Creating kubelet service"
  # Interpolating the argument array into the string joins its elements with
  # single spaces.
  & sc.exe create kubelet binPath= "${env:NODE_DIR}\kube-log-runner.exe -log-file=${env:LOGS_DIR}\kubelet.log ${env:NODE_DIR}\kubelet.exe ${kubelet_args}" start= demand
  & sc.exe failure kubelet reset= 0 actions= restart/10000
  Log-Output "Starting kubelet service"
  & sc.exe start kubelet

  Log-Output "Waiting 10 seconds for kubelet to stabilize"
  Start-Sleep 10
  Write-VerboseServiceInfoToConsole -Service 'kubelet'

  if (Get-Process | Where-Object Name -eq "kube-proxy") {
    Log-Output -Fatal `
        "A kube-proxy process is already running, don't know what to do"
  }
  Log-Output "Creating kube-proxy service"
  & sc.exe create kube-proxy binPath= "${env:NODE_DIR}\kube-log-runner.exe -log-file=${env:LOGS_DIR}\kube-proxy.log ${env:NODE_DIR}\kube-proxy.exe ${kubeproxy_args}" start= demand
  & sc.exe failure kube-proxy reset= 0 actions= restart/10000
  Log-Output "Starting kube-proxy service"
  & sc.exe start kube-proxy
  Write-VerboseServiceInfoToConsole -Service 'kube-proxy' -Delay 1

  # F1020 23:08:52.000083    9136 server.go:361] unable to load in-cluster
  # configuration, KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT must be
  # defined
  # TODO(pjh): still getting errors like these in kube-proxy log:
  # E1023 04:03:58.143449    4840 reflector.go:205] k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion/factory.go:129: Failed to list *core.Endpoints: Get https://35.239.84.171/api/v1/endpoints?limit=500&resourceVersion=0: dial tcp 35.239.84.171:443: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
  # E1023 04:03:58.150266    4840 reflector.go:205] k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion/factory.go:129: Failed to list *core.Service: Get https://35.239.84.171/api/v1/services?limit=500&resourceVersion=0: dial tcp 35.239.84.171:443: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
  WaitFor_KubeletAndKubeProxyReady
  Verify_GceMetadataServerRouteIsPresent
  Log-Output "Kubernetes components started successfully"
}
1103
# Unregisters the kube-proxy and kubelet Windows services by deleting their
# service entries with sc.exe (kube-proxy first, then kubelet).
function Unregister-WorkerServices {
  foreach ($service_name in @('kube-proxy', 'kubelet')) {
    & sc.exe delete $service_name
  }
}
1109
# Waits up to 10 seconds for both the kubelet and kube-proxy services to
# reach the 'Running' state, polling once per second.
#
# Throws when either service is still not running after the timeout. The
# decision is based on the service status observed by the final poll rather
# than on the elapsed time alone, so services that come up during the last
# second of the wait are not reported as a timeout.
function WaitFor_KubeletAndKubeProxyReady {
  $timeout = 10
  $waited = 0
  while ($true) {
    $ready = ((Get-Service kube-proxy).Status -eq 'Running') -and
        ((Get-Service kubelet).Status -eq 'Running')
    if ($ready) {
      return
    }
    if ($waited -ge $timeout) {
      break
    }
    Start-Sleep 1
    $waited++
  }

  # Timeout occurred
  Log-Output "$(Get-Service kube* | Out-String)"
  Throw ("Timeout while waiting ${timeout} seconds for kubelet and kube-proxy services to start")
}
1125
# Verifies that this node joined the cluster and can reach the API server.
#
# Runs 'kubectl get nodes' repeatedly (up to 13 attempts, 5 seconds apart)
# until the node list contains this host's name and the queried node
# condition column contains "Ready". Throws when, after the last attempt,
#   - the node list is empty (no connection to the API server),
#   - the node list does not contain this host, or
#   - the condition column does not contain "Ready".
# On success the node list is logged and the GCE metadata server route is
# verified.
#
# The wait between attempts is only performed when another attempt follows,
# so a node that is already verified does not pay an extra delay.
function Verify-WorkerServices {
  $timeout = 12
  $retries = 0
  $retryDelayInSeconds = 5
  $kubectl = "${env:NODE_DIR}\kubectl.exe"
  # The host name does not change while we poll; look it up once.
  $node_name = (hostname)

  Log-Output ("Testing node connection to API server...")
  while ($true) {
    $retries++
    $nodes_list = & $kubectl get nodes -o=custom-columns=:.metadata.name -A | Out-String
    # NOTE(review): this column prints the *type* of the condition at index 4,
    # not its status - confirm that this is sufficient to establish readiness.
    $host_status = & $kubectl get nodes $node_name -o=custom-columns=:.status.conditions[4].type | Out-String

    $verified = $nodes_list -and $nodes_list.Contains($node_name) -and
        $host_status.Contains("Ready")
    if ($verified -or ($retries -gt $timeout)) {
      break
    }
    Start-Sleep $retryDelayInSeconds
  }

  if (-not $nodes_list) {
    Throw ("Node: '$node_name' failed to connect to API server")
  }
  elseif (-not $nodes_list.Contains($node_name)) {
    Throw ("Node: '$node_name' failed to join the cluster; NODES: '`n $($nodes_list)'")
  }
  elseif (-not $host_status.Contains("Ready")) {
    Throw ("Node: '$node_name' is not in Ready state")
  }

  Log-Output ("Node: $node_name successfully joined cluster `n NODES: `n $($nodes_list)")
  Verify_GceMetadataServerRouteIsPresent
}
1156
# Downloads the Windows crictl release archive and extracts its contents
# (e.g. crictl.exe) into $env:NODE_DIR. The archive is verified against
# $CRICTL_SHA256. Nothing is done when ShouldWrite-File reports that the
# existing crictl.exe should be kept.
function DownloadAndInstall-Crictl {
  if (ShouldWrite-File ${env:NODE_DIR}\crictl.exe) {
    $CRI_TOOLS_GCS_BUCKET = 'k8s-artifacts-cri-tools'
    $archive_url = 'https://storage.googleapis.com/{0}/release/{1}/crictl-{1}-windows-amd64.tar.gz' -f `
        $CRI_TOOLS_GCS_BUCKET, $CRICTL_VERSION
    $archive_file = "${env:NODE_DIR}\crictl.tar.gz"

    MustDownload-File -URLs $archive_url -OutFile $archive_file `
        -Hash $CRICTL_SHA256 -Algorithm SHA256
    tar xzvf $archive_file -C ${env:NODE_DIR}
  }
}
1174
# Points crictl at the container runtime endpoint given by
# $env:CONTAINER_RUNTIME_ENDPOINT. Does nothing when that variable is empty
# or unset.
function Configure-Crictl {
  $runtime_endpoint = ${env:CONTAINER_RUNTIME_ENDPOINT}
  if (-not $runtime_endpoint) {
    return
  }
  & "${env:NODE_DIR}\crictl.exe" config runtime-endpoint $runtime_endpoint
}
1182
# Pulls the infra/pause container image ($env:INFRA_CONTAINER) onto the node
# so that it will be immediately available when the kubelet tries to run
# pods, then logs the output of 'crictl inspecti' for the image.
#
# The pull is skipped when 'crictl images' already lists the image. The check
# is made line by line with the image name and tag escaped, so that the name
# of one image and the tag of a different image (or regex metacharacters such
# as '.' in the registry name) cannot produce a false "already present".
#
# Throws when 'crictl pull' fails.
#
# TODO(pjh): downloading the container image may take a few minutes;
# figure out how to run this in the background while performing the rest of
# the node startup steps!
# Pull-InfraContainer must be called AFTER Verify-WorkerServices.
function Pull-InfraContainer {
  $name, $label = ${env:INFRA_CONTAINER} -split ':',2
  $image_pattern = [regex]::Escape("$name") + '.*' + [regex]::Escape("$label")
  $matching_lines = @(& crictl images | Where-Object { $_ -match $image_pattern })
  if ($matching_lines.Count -eq 0) {
    & crictl pull ${env:INFRA_CONTAINER}
    if (!$?) {
      throw "Error running 'crictl pull ${env:INFRA_CONTAINER}'"
    }
  }
  $inspect = "$(& crictl inspecti ${env:INFRA_CONTAINER} | Out-String)"
  Log-Output "Infra/pause container:`n$inspect"
}
1200
# Sets up containerd on the node. Runs, in order: Pigz installation,
# containerd installation, containerd configuration, containerd start.
function Setup-ContainerRuntime {
  $setup_steps = @(
    'Install-Pigz',
    'Install_Containerd',
    'Configure_Containerd',
    'Start_Containerd'
  )
  foreach ($step in $setup_steps) {
    & $step
  }
}
1208
# Returns the 'Installed' property of the Windows 'Containers' feature as
# reported by Get-WindowsFeature.
function Test-ContainersFeatureInstalled {
  $containers_feature = Get-WindowsFeature Containers
  return $containers_feature.Installed
}
1212
# Installs the Windows 'Containers' feature.
# IMPORTANT: once this function returns, the computer must be restarted to
# complete the installation!
function Install-ContainersFeature {
  Log-Output 'Installing Windows ''Containers'' feature'
  Install-WindowsFeature -Name Containers
}
1219
# Returns $true when $env:WINDOWS_ENABLE_HYPERV is set to "true", i.e. when
# Hyper-V should be enabled on the node; $false otherwise (including when the
# variable is unset).
function Test-ShouldEnableHyperVFeature {
  $hyperv_setting = "${env:WINDOWS_ENABLE_HYPERV}"
  return ($hyperv_setting -eq 'true')
}
1224
# Returns $true when the Microsoft-Hyper-V optional feature of the running
# (online) Windows image is in the 'Enabled' state.
function Test-HyperVFeatureEnabled {
  $hyperv_feature = Get-WindowsOptionalFeature -Online -FeatureName Microsoft-Hyper-V
  return ($hyperv_feature.State -eq 'Enabled')
}
1229
# Enables the Hyper-V feature and its PowerShell management tools, without
# triggering a restart.
# IMPORTANT: once this function returns, the computer must be restarted to
# complete the installation!
function Enable-HyperVFeature {
  Log-Output 'Enabling Windows ''HyperV'' feature'
  $hyperv_features = @(
    'Microsoft-Hyper-V',
    'Microsoft-Hyper-V-Management-PowerShell'
  )
  foreach ($feature_name in $hyperv_features) {
    Enable-WindowsOptionalFeature -Online -FeatureName $feature_name -All -NoRestart
  }
}
1237
# Brings the TCP/IP registry parameters in line with the GCP recommendation.
# Not setting these values correctly can cause network issues for connections
# that live longer than 10 minutes.
# See: https://cloud.google.com/compute/docs/troubleshooting/general-tips#idle-connections
#
# After writing the values, the resulting registry key properties are emitted
# so that they show up in the caller's output.
function Set-WindowsTCPParameters {
  $tcpip_parameters_key = 'HKLM:\SYSTEM\CurrentControlSet\Services\Tcpip\Parameters'
  $dword_settings = [ordered]@{
    'KeepAliveInterval'         = 1000
    'KeepAliveTime'             = 60000
    'TcpMaxDataRetransmissions' = 10
  }

  foreach ($setting in $dword_settings.GetEnumerator()) {
    Set-ItemProperty -Force -Confirm:$false `
        -Path $tcpip_parameters_key `
        -Name $setting.Key -Type Dword -Value $setting.Value
  }

  Log-Output 'TCP/IP Parameters'
  Get-ItemProperty -Path $tcpip_parameters_key
}
1256
# Generates the containerd CNI config file l2bridge.conf under
# $env:CNI_CONFIG_DIR. Nothing is written when ShouldWrite-File reports that
# the existing file should be kept.
#
# Prerequisites:
#   $env:POD_CIDR is set (by Set-PodCidr).
#   The "management" interface exists (Configure-HostNetworkingService).
#   The HNS network for pod networking has been configured
#     (Configure-HostNetworkingService).
#   Containerd is installed (Install_Containerd).
#
# Required ${kube_env} keys:
#   DNS_SERVER_IP
#   DNS_DOMAIN
#   SERVICE_CLUSTER_IP_RANGE
function Configure_Containerd_CniNetworking {
  $l2bridge_conf = "${env:CNI_CONFIG_DIR}\l2bridge.conf"
  if (-not (ShouldWrite-File ${l2bridge_conf})) {
    return
  }

  $mgmt_adapter = Get_MgmtNetAdapter
  $mgmt_ip = ($mgmt_adapter | Get-NetIPAddress -AddressFamily IPv4).IPAddress

  $pod_gateway = Get_Endpoint_Gateway_From_CIDR ${env:POD_CIDR}

  # The template below contains upper-case placeholders which are substituted
  # afterwards. Explanation of the CNI config values:
  #   POD_CIDR: the pod CIDR assigned to this node.
  #   POD_GATEWAY: the gateway IP.
  #   MGMT_IP: the IP address assigned to the node's primary network interface
  #     (i.e. the internal IP of the GCE VM).
  #   SERVICE_CIDR: the CIDR used for kubernetes services.
  #   DNS_SERVER_IP: the cluster's DNS server IP address.
  #   DNS_DOMAIN: the cluster's DNS domain, e.g. "cluster.local".
  #
  # OutBoundNAT ExceptionList: No SNAT for CIDRs in the list, the same as
  # default GKE non-masquerade destination ranges listed at
  # https://cloud.google.com/kubernetes-engine/docs/how-to/ip-masquerade-agent#default-non-masq-dests
  $conf_text = @'
{
  "cniVersion": "0.2.0",
  "name": "l2bridge",
  "type": "sdnbridge",
  "master": "Ethernet",
  "capabilities": {
    "portMappings": true,
    "dns": true
  },
  "ipam": {
    "subnet": "POD_CIDR",
    "routes": [
      {
        "GW": "POD_GATEWAY"
      }
    ]
  },
  "dns": {
    "Nameservers": [
      "DNS_SERVER_IP"
    ],
    "Search": [
      "DNS_DOMAIN"
    ]
  },
  "AdditionalArgs": [
    {
      "Name": "EndpointPolicy",
      "Value": {
        "Type": "OutBoundNAT",
        "Settings": {
          "Exceptions": [
            "169.254.0.0/16",
            "10.0.0.0/8",
            "172.16.0.0/12",
            "192.168.0.0/16",
            "100.64.0.0/10",
            "192.0.0.0/24",
            "192.0.2.0/24",
            "192.88.99.0/24",
            "198.18.0.0/15",
            "198.51.100.0/24",
            "203.0.113.0/24",
            "240.0.0.0/4"
          ]
        }
      }
    },
    {
      "Name": "EndpointPolicy",
      "Value": {
        "Type": "SDNRoute",
        "Settings": {
          "DestinationPrefix": "SERVICE_CIDR",
          "NeedEncap": true
        }
      }
    },
    {
      "Name": "EndpointPolicy",
      "Value": {
        "Type": "SDNRoute",
        "Settings": {
          "DestinationPrefix": "MGMT_IP/32",
          "NeedEncap": true
        }
      }
    }
  ]
}
'@

  # The substitutions are applied one after another in exactly this order.
  $substitutions = @(
    @('POD_CIDR', ${env:POD_CIDR}),
    @('POD_GATEWAY', $pod_gateway),
    @('DNS_SERVER_IP', ${kube_env}['DNS_SERVER_IP']),
    @('DNS_DOMAIN', ${kube_env}['DNS_DOMAIN']),
    @('MGMT_IP', $mgmt_ip),
    @('SERVICE_CIDR', ${kube_env}['SERVICE_CLUSTER_IP_RANGE'])
  )
  foreach ($substitution in $substitutions) {
    $conf_text = $conf_text.Replace($substitution[0], $substitution[1])
  }

  New-Item -Force -ItemType file ${l2bridge_conf} | Out-Null
  Set-Content ${l2bridge_conf} $conf_text

  Log-Output "containerd CNI config:`n$(Get-Content -Raw ${l2bridge_conf})"
}
1372
# Downloads the containerd release archive (containerd plus CNI binaries),
# verifies it against the published SHA256 checksum, and installs the CNI
# executables into $env:CNI_DIR and the remaining executables into
# $env:NODE_DIR. Finally containerd.exe is added to the Windows Defender
# process exclusions.
function Install_Containerd {
  # Assume that presence of containerd.exe indicates that all containerd
  # binaries were already previously downloaded to this node.
  if (-not (ShouldWrite-File ${env:NODE_DIR}\containerd.exe)) {
    return
  }

  $staging_dir = 'C:\containerd_tmp'
  New-Item $staging_dir -ItemType 'directory' -Force | Out-Null

  # TODO(ibrahimab) Change this to a gcs bucket with CI maintained and accessible by community.
  $version = '1.6.2'
  $archive_name = "cri-containerd-cni-${version}-windows-amd64.tar.gz"
  $archive_url = "https://github.com/containerd/containerd/releases/download/v${version}/${archive_name}"
  $checksum_file = "$staging_dir\sha256sum"
  $archive_file = "$staging_dir\containerd.tar.gz"

  # The first space-separated field of the checksum file is the hash.
  MustDownload-File -URLs "${archive_url}.sha256sum" -OutFile $checksum_file
  $checksum_text = Get-Content $checksum_file
  $expected_sha = $checksum_text.Split(" ")[0].ToUpper()

  MustDownload-File -URLs $archive_url -OutFile $archive_file `
      -Hash $expected_sha -Algorithm SHA256

  tar xzvf $archive_file -C $staging_dir
  Move-Item -Force "$staging_dir\cni\bin\*.exe" "${env:CNI_DIR}\"
  Move-Item -Force "$staging_dir\*.exe" "${env:NODE_DIR}\"
  Remove-Item -Force -Recurse $staging_dir

  # Exclusion for Defender.
  Add-MpPreference -ExclusionProcess "${env:NODE_DIR}\containerd.exe"
}
1406
# Returns the path of the containerd config file.
#
# When a 'containerd' Windows service is registered and its command line
# contains a '--config' option, the path given there is returned. Both
# '--config <path>' and '--config=<path>' are recognized, and a path wrapped
# in double quotes (required when it contains spaces, as the default location
# under 'C:\Program Files' does) is returned without the quotes. In all other
# cases the default path is returned.
function Get_Containerd_ConfigPath {
  $default_path = 'C:\Program Files\containerd\config.toml'

  $service = Get-WMIObject -Class Win32_Service -Filter "Name='containerd'"
  if ($null -eq $service) {
    return $default_path
  }

  $config_option = '\s--config(?:\s+|=)(?:"(?<quoted>[^"]+)"|(?<bare>\S+))'
  if ($service.PathName -match $config_option) {
    if ($matches['quoted']) {
      return $matches['quoted']
    }
    return $matches['bare']
  }
  return $default_path
}
1418
# Generates the containerd config.toml file at the path reported by
# Get_Containerd_ConfigPath, creating its directory if necessary. The sandbox
# image is taken from $env:INFRA_CONTAINER and the CNI directories from
# $env:CNI_DIR and $env:CNI_CONFIG_DIR.
function Configure_Containerd {
  $toml_path = Get_Containerd_ConfigPath
  $toml_dir = [System.IO.Path]::GetDirectoryName($toml_path)
  New-Item $toml_dir -ItemType 'directory' -Force | Out-Null

  # Template with upper-case placeholders that are substituted below.
  $toml_text = @'
[plugins.scheduler]
  schedule_delay = '0s'
  startup_delay = '0s'
[plugins.cri]
  sandbox_image = 'INFRA_CONTAINER_IMAGE'
[plugins.cri.containerd]
  snapshotter = 'windows'
  default_runtime_name = 'runhcs-wcow-process'
  disable_snapshot_annotations = true
  discard_unpacked_layers = true
[plugins.cri.cni]
  bin_dir = 'CNI_BIN_DIR'
  conf_dir = 'CNI_CONF_DIR'
'@

  # The substitutions are applied one after another in exactly this order.
  $substitutions = @(
    @('INFRA_CONTAINER_IMAGE', ${env:INFRA_CONTAINER}),
    @('CNI_BIN_DIR', "${env:CNI_DIR}"),
    @('CNI_CONF_DIR', "${env:CNI_CONFIG_DIR}")
  )
  foreach ($substitution in $substitutions) {
    $toml_text = $toml_text.Replace($substitution[0], $substitution[1])
  }

  Set-Content ${toml_path} $toml_text
}
1442
# Registers the containerd Windows service when it does not exist yet (with
# its log file under $env:LOGS_DIR) and then (re)starts the service.
function Start_Containerd {
  $containerd_service = Get-WMIObject -Class Win32_Service -Filter "Name='containerd'"
  # Do the registration only if the containerd service does not exist.
  if ($containerd_service -eq $null) {
    Log-Output 'Creating containerd service'
    & containerd.exe --register-service --log-file "${env:LOGS_DIR}/containerd.log"
  }

  Log-Output 'Starting containerd service'
  Restart-Service containerd
}
1454
# Pigz resources: installation directory, version, download location of the
# release zip, and the hash Install-Pigz verifies the download against (as
# SHA512).
$PIGZ_ROOT = 'C:\pigz'
$PIGZ_VERSION = '2.3.1'
$PIGZ_TAR_URL = ('https://storage.googleapis.com/gke-release/winnode/pigz/prod/gke_windows/pigz/release/5/20201104-134221/' +
    "pigz-${PIGZ_VERSION}.zip")
$PIGZ_TAR_HASH = '5a6f8f5530acc85ea51797f58c1409e5af6b69e55da243ffc608784cf14fec0cd16f74cc61c564d69e1a267750aecfc1e4c53b5219ff5f893b42a7576306f34c'
1460
# Installs Pigz (https://github.com/madler/pigz) into $PIGZ_ROOT for improved
# image extraction performance. Only acts when $env:WINDOWS_ENABLE_PIGZ is
# "true".
#
# Pigz counts as installed when unpigz.exe exists in $PIGZ_ROOT. Checking for
# the executable (rather than just for the directory) means that an earlier
# attempt which created the directory but failed to download or extract the
# archive is retried instead of being reported as "already installed".
#
# Installation downloads the release zip (verified via SHA512), extracts it,
# adds $PIGZ_ROOT to the machine PATH and excludes unpigz.exe from Windows
# Defender scanning.
function Install-Pigz {
  if ("${env:WINDOWS_ENABLE_PIGZ}" -ne "true") {
    return
  }

  if (Test-Path "$PIGZ_ROOT\unpigz.exe") {
    Log-Output "Pigz already installed."
    return
  }

  Log-Output "Installing Pigz $PIGZ_VERSION"
  # -Force: the directory may be left over from an incomplete earlier attempt.
  New-Item -Path $PIGZ_ROOT -ItemType Directory -Force | Out-Null

  $zip_file = "$PIGZ_ROOT\pigz-$PIGZ_VERSION.zip"
  MustDownload-File `
      -URLs $PIGZ_TAR_URL `
      -OutFile $zip_file `
      -Hash $PIGZ_TAR_HASH `
      -Algorithm SHA512
  Expand-Archive -Path $zip_file -DestinationPath $PIGZ_ROOT -Force
  Remove-Item -Path $zip_file

  # Containerd search for unpigz.exe on the first container image
  # pull request after the service is started. If unpigz.exe is in the
  # Windows path it'll use it instead of the default unzipper.
  # See: https://github.com/containerd/containerd/issues/1896
  Add-MachineEnvironmentPath -Path $PIGZ_ROOT
  # Add process exclusion for Windows Defender to boost performance.
  Add-MpPreference -ExclusionProcess "$PIGZ_ROOT\unpigz.exe"
  Log-Output "Installed Pigz $PIGZ_VERSION"
}
1489
# Node Problem Detector (NPD) resources: Windows service name plus the
# default version, release location and release archive hash (verified as
# SHA512 by DownloadAndInstall-NodeProblemDetector) that are used when
# kube-env does not override them.
$NPD_SERVICE = 'node-problem-detector'
$DEFAULT_NPD_VERSION = '0.8.10-gke0.1'
$DEFAULT_NPD_RELEASE_PATH = 'https://storage.googleapis.com/gke-release/winnode'
$DEFAULT_NPD_HASH = ('97ddfe3544da9e02a1cfb55d24f329eb29d606fca7fbbf800415d5de9dbc29a0' +
    '0563f8e0d1919595c8e316fd989d45b09b13c07be528841fc5fd37e21d016a2d')
1495
# Downloads and installs Node Problem Detector (NPD) when
# $env:ENABLE_NODE_PROBLEM_DETECTOR is "standalone".
# NPD analyzes the host for problems that can disrupt workloads.
# https://github.com/kubernetes/node-problem-detector
#
# Version, archive hash and release location default to the $DEFAULT_NPD_*
# values and can be overridden through these ${kube_env} keys:
#   NODE_PROBLEM_DETECTOR_VERSION (together with
#     NODE_PROBLEM_DETECTOR_TAR_HASH)
#   NODE_PROBLEM_DETECTOR_RELEASE_PATH
#
# The archive is extracted into $env:K8S_DIR\node-problem-detector and the
# contents of its bin directory are moved to $env:NODE_DIR.
function DownloadAndInstall-NodeProblemDetector {
  if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -ne "standalone") {
    return
  }
  if (-not (ShouldWrite-File "${env:NODE_DIR}\node-problem-detector.exe")) {
    Log-Output "Node Problem Detector already installed."
    return
  }

  # Resolve version/hash and release location, preferring kube-env overrides.
  $version = $DEFAULT_NPD_VERSION
  $archive_hash = $DEFAULT_NPD_HASH
  $version_override = ${kube_env}['NODE_PROBLEM_DETECTOR_VERSION']
  if (-not [string]::IsNullOrEmpty($version_override)) {
    $version = $version_override
    $archive_hash = ${kube_env}['NODE_PROBLEM_DETECTOR_TAR_HASH']
  }
  $release_path = $DEFAULT_NPD_RELEASE_PATH
  $release_path_override = ${kube_env}['NODE_PROBLEM_DETECTOR_RELEASE_PATH']
  if (-not [string]::IsNullOrEmpty($release_path_override)) {
    $release_path = $release_path_override
  }

  $archive_name = "node-problem-detector-v${version}-windows_amd64.tar.gz"

  Log-Output "Downloading ${archive_name}."

  $install_dir = "${env:K8S_DIR}\node-problem-detector"
  $archive_file = "${install_dir}\${archive_name}"
  New-Item -Path $install_dir -ItemType Directory -Force -Confirm:$false

  MustDownload-File `
      -URLs "${release_path}/node-problem-detector/${archive_name}" `
      -Hash $archive_hash `
      -Algorithm SHA512 `
      -OutFile $archive_file

  tar xzvf $archive_file -C $install_dir
  Move-Item "${install_dir}\bin\*" "${env:NODE_DIR}\" -Force -Confirm:$false
  Remove-Item "${install_dir}\bin" -Force -Confirm:$false
  Remove-Item $archive_file -Force -Confirm:$false
}
1536
# Writes the kubeconfig file for the node-problem-detector user to
# $env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE. Does nothing when that variable
# is empty or unset.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   CA_CERT
#   NODE_PROBLEM_DETECTOR_TOKEN
function Create-NodeProblemDetectorKubeConfig {
  $kubeconfig_file = ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE}
  if ([string]::IsNullOrEmpty($kubeconfig_file)) {
    return
  }
  $npd_token = ${kube_env}['NODE_PROBLEM_DETECTOR_TOKEN']
  Create-Kubeconfig -Name 'node-problem-detector' `
      -Path $kubeconfig_file `
      -Token $npd_token
}
1552
# Registers and starts Node Problem Detector (NPD) as a Windows service that
# runs with the bundled monitor configs and reports against the Kubernetes
# API server.
#
# Only acts when $env:ENABLE_NODE_PROBLEM_DETECTOR is "standalone" and the
# NPD binary exists in $env:NODE_DIR; an already existing service is left
# untouched. The NPD flags come from the NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS
# kube-env key when it is set, otherwise a default set of flags referencing
# the bundled monitor configs is used. In both cases the --apiserver-override
# flag (built from the KUBERNETES_MASTER_NAME kube-env key and
# $env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE) is appended.
function Configure-NodeProblemDetector {
  $npd_bin = "${env:NODE_DIR}\node-problem-detector.exe"
  if (("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -ne "standalone") -or
      (-not (Test-Path $npd_bin))) {
    return
  }

  $existing_service = Get-Service -Name $NPD_SERVICE -ErrorAction SilentlyContinue
  if ($null -ne $existing_service) {
    Log-Output "${NPD_SERVICE} already configured."
    return
  }

  $npd_dir = "${env:K8S_DIR}\node-problem-detector"
  $npd_logs_dir = "${env:LOGS_DIR}\node-problem-detector"

  New-Item -Path $npd_logs_dir -Type Directory -Force -Confirm:$false

  $custom_flags = ${kube_env}['NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS']
  if ([string]::IsNullOrEmpty($custom_flags)) {
    $config_dir = "${npd_dir}\config"

    # System log monitors (NPD configuration for the CRI monitor).
    $log_monitors = @(
      "${config_dir}\windows-containerd-monitor-filelog.json"
    )
    # System stats monitors.
    $stats_monitors = @(
      "${config_dir}\windows-system-stats-monitor.json"
    )
    # Custom plugin monitors; the containerd health checker belongs to the
    # CRI monitor configuration.
    $plugin_monitors = @(
      "${config_dir}\windows-health-checker-kubelet.json",
      "${config_dir}\windows-health-checker-kubeproxy.json",
      "${config_dir}\windows-defender-monitor.json",
      "${config_dir}\windows-health-checker-containerd.json"
    )

    $npd_flags = "--v=2 --port=20256 --log_dir=${npd_logs_dir}"
    $npd_flags += ' --config.system-log-monitor=' + ($log_monitors -join ',')
    $npd_flags += ' --config.system-stats-monitor=' + ($stats_monitors -join ',')
    $npd_flags += ' --config.custom-plugin-monitor=' + ($plugin_monitors -join ',')
  }
  else {
    $npd_flags = $custom_flags
  }

  $master_name = ${kube_env}['KUBERNETES_MASTER_NAME']
  $npd_flags = "${npd_flags} --apiserver-override=`"https://${master_name}?inClusterConfig=false&auth=${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE}`""

  Log-Output "Creating service: ${NPD_SERVICE}"
  Log-Output "${npd_bin} ${npd_flags}"
  sc.exe create $NPD_SERVICE binpath= "${npd_bin} ${npd_flags}" displayName= "Node Problem Detector"
  sc.exe failure $NPD_SERVICE reset= 30 actions= restart/5000
  sc.exe start $NPD_SERVICE

  Write-VerboseServiceInfoToConsole -Service $NPD_SERVICE
}
1612
# TODO(pjh): move the logging agent code below into a separate
# module; it was put here temporarily to avoid disrupting the file layout in
# the K8s release machinery.

# Fluent Bit logging agent settings: the release version (used to build the
# download URL), the install directory, the Windows service name, and the
# wildcard pattern matched against process command lines when the service's
# processes have to be force-killed.
$LOGGINGAGENT_VERSION = '1.8.10'
$LOGGINGAGENT_ROOT = 'C:\fluent-bit'
$LOGGINGAGENT_SERVICE = 'fluent-bit'
$LOGGINGAGENT_CMDLINE = '*fluent-bit.exe*'

# Fluent Bit exporter (flb-exporter) settings, mirroring the ones above, plus
# the SHA256 hash used to verify the downloaded exporter binary.
$LOGGINGEXPORTER_VERSION = 'v0.17.0'
$LOGGINGEXPORTER_ROOT = 'C:\flb-exporter'
$LOGGINGEXPORTER_SERVICE = 'flb-exporter'
$LOGGINGEXPORTER_CMDLINE = '*flb-exporter.exe*'
$LOGGINGEXPORTER_HASH = 'c808c9645d84b06b89932bd707d51a9d1d0b451b5a702a5f9b2b4462c8be6502'
1626
# Restarts the logging agent, or starts it if it is not currently running.
# When the Stackdriver agent is installed it is the one restarted; otherwise
# the Fluent Bit exporter service is restarted first, followed by the Fluent
# Bit agent service.
function Restart-LoggingAgent {
  if (-not (IsStackdriverAgentInstalled)) {
    Restart-LogService -service $LOGGINGEXPORTER_SERVICE -cmdline $LOGGINGEXPORTER_CMDLINE
    Restart-LogService -service $LOGGINGAGENT_SERVICE -cmdline $LOGGINGAGENT_CMDLINE
    return
  }

  Restart-StackdriverAgent
}
1637
# Restarts the given Windows service, or starts it if it is not currently
# running. A standard `Restart-Service` may fail because the process is
# sometimes unstoppable, so this function works around it by killing the
# processes.
#
# Parameters:
#   service: name of the Windows service to restart.
#   cmdline: wildcard pattern (as used by -Like) matched against the command
#     line of running processes to select the ones to kill when the service
#     does not stop on its own.
#
# Throws if the service is still not stopped after the 60 second wait that
# follows the forced kill.
function Restart-LogService([string]$service, [string]$cmdline) {
  Stop-Service -NoWait -ErrorAction Ignore $service

  # Wait (if necessary) for service to stop.
  $timeout = 10
  $stopped = (Get-Service $service).Status -eq 'Stopped'
  for ($i = 0; $i -lt $timeout -and !($stopped); $i++) {
    Start-Sleep 1
    $stopped = (Get-Service $service).Status -eq 'Stopped'
  }

  if (-not $stopped) {
    # Force kill the processes. Stop-Process is only invoked when at least one
    # process matched: binding a null value to -Id is an error.
    $process_ids = @(Get-WmiObject win32_process |
        Where-Object CommandLine -Like $cmdline |
        ForEach-Object { $_.ProcessId })
    if ($process_ids.Count -gt 0) {
      Stop-Process -Force -PassThru -Id $process_ids
    }
    else {
      Log-Output "No running process matches ${cmdline} for ${service} service"
    }

    # Wait until process has stopped.
    $waited = 0
    $log_period = 10
    $timeout = 60
    while ((Get-Service $service).Status -ne 'Stopped' -and $waited -lt $timeout) {
      Start-Sleep 1
      $waited++

      if ($waited % $log_period -eq 0) {
        Log-Output "Waiting for ${service} service to stop"
      }
    }

    # Report a timeout only when the service really is still not stopped.
    # Checking the counter alone would also fire when the service stopped
    # during the final second of the wait.
    if ((Get-Service $service).Status -ne 'Stopped') {
      Throw ("Timeout while waiting for ${service} service to stop")
    }
  }

  Start-Service $service
}
1679
# Returns $true when the Fluent Bit logging agent is registered as a Windows
# service (i.e. the service lookup yields a non-empty status), $false
# otherwise.
function IsLoggingAgentInstalled {
  $agent_service = Get-Service $LOGGINGAGENT_SERVICE -ErrorAction Ignore
  if ([string]::IsNullOrEmpty($agent_service.Status)) {
    return $false
  }
  return $true
}
1685
# Installs the Fluent Bit logging agent (see
# https://docs.fluentbit.io/manual/installation/windows) together with the
# Fluent Bit Stackdriver exporter, then registers their Windows services.
# Installation is skipped when either the Stackdriver agent or the Fluent Bit
# agent is already installed.
function Install-LoggingAgent {
  if (IsStackdriverAgentInstalled) {
    # Workaround for the bug where the logging agent cannot start up if its
    # storage.json file is corrupted: remove the file if it exists.
    $storage_file = "$STACKDRIVER_ROOT\LoggingAgent\Main\pos\winevtlog.pos\worker0\storage.json"
    Remove-Item -Force -ErrorAction Ignore $storage_file
    Log-Output 'Skip: Stackdriver logging agent is already installed'
  }
  elseif (IsLoggingAgentInstalled) {
    # Note: the agent should be reinstalled here if $REDO_STEPS is true, but
    # there is no known way to run the installer without it prompting when the
    # logging agent is already installed. Dumping the strings in the installer
    # binary and searching for flags to do this turned up nothing.
    Log-Output 'Skip: Fluentbit logging agent is already installed'
  }
  else {
    DownloadAndInstall-LoggingAgents
    Create-LoggingAgentServices
  }
}
1715
# Downloads the Fluent Bit agent archive and the Fluent Bit exporter binary,
# each only when ShouldWrite-File allows it. The agent archive is extracted in
# a temporary directory and its top-level folder is moved to
# $LOGGINGAGENT_ROOT; the exporter download is verified against
# $LOGGINGEXPORTER_HASH (SHA256).
#
# Side effect: when the agent is installed, the current location is changed
# and left at C:\.
function DownloadAndInstall-LoggingAgents {
  # Install Logging agent if not present
  $agent_package = "td-agent-bit-${LOGGINGAGENT_VERSION}-win64"
  if (ShouldWrite-File "${LOGGINGAGENT_ROOT}\${agent_package}") {
    $staging_dir = 'C:\flb-installers'
    $agent_url = "https://storage.googleapis.com/gke-release/winnode/fluentbit/${agent_package}.zip"

    Log-Output 'Downloading Logging agent'
    New-Item $staging_dir -ItemType 'directory' -Force | Out-Null
    MustDownload-File -OutFile "${staging_dir}\td.zip" -URLs $agent_url

    # Expand-Archive is given no destination, so it extracts into a folder
    # named after the archive (.\td) under the current location.
    Set-Location $staging_dir
    Log-Output 'Extracting Logging agent'
    Expand-Archive td.zip
    Move-Item ".\td\${agent_package}\" $LOGGINGAGENT_ROOT
    Set-Location C:\
    Remove-Item -Force -Recurse $staging_dir
  }

  # Download Logging exporter if needed
  $exporter_exe = "${LOGGINGEXPORTER_ROOT}\flb-exporter.exe"
  if (ShouldWrite-File $exporter_exe) {
    $exporter_url = "https://storage.googleapis.com/gke-release/winnode/fluentbit-exporter/${LOGGINGEXPORTER_VERSION}/flb-exporter-${LOGGINGEXPORTER_VERSION}.exe"
    Log-Output 'Downloading logging exporter'
    New-Item $LOGGINGEXPORTER_ROOT -ItemType 'directory' -Force | Out-Null
    MustDownload-File -OutFile $exporter_exe -URLs $exporter_url -Hash $LOGGINGEXPORTER_HASH -Algorithm SHA256
  }
}
1746
# Registers the Fluent Bit agent and the Fluent Bit exporter as Windows
# services, configures each to be restarted 5000 ms after a failure (failure
# count reset after 30 s), and prints verbose information about each service.
# The services are created here but not started.
#
# Side effect: changes the current location to $LOGGINGAGENT_ROOT.
function Create-LoggingAgentServices {
  Set-Location $LOGGINGAGENT_ROOT

  # NOTE(review): the -c config path carries no drive letter; presumably it
  # resolves against the drive the service runs from - confirm.
  $agent_binpath = "${LOGGINGAGENT_ROOT}\bin\fluent-bit.exe -c \fluent-bit\conf\fluent-bit.conf"
  Log-Output "Creating service: ${LOGGINGAGENT_SERVICE}"
  sc.exe create $LOGGINGAGENT_SERVICE binpath= $agent_binpath
  sc.exe failure $LOGGINGAGENT_SERVICE reset= 30 actions= restart/5000
  Write-VerboseServiceInfoToConsole -Service $LOGGINGAGENT_SERVICE

  $exporter_binpath = "${LOGGINGEXPORTER_ROOT}\flb-exporter.exe --kubernetes-separator=_ --stackdriver-resource-model=k8s --enable-pod-label-discovery --logtostderr --winsvc --pod-label-dot-replacement=_"
  Log-Output "Creating service: ${LOGGINGEXPORTER_SERVICE}"
  sc.exe create $LOGGINGEXPORTER_SERVICE binpath= $exporter_binpath
  sc.exe failure $LOGGINGEXPORTER_SERVICE reset= 30 actions= restart/5000
  Write-VerboseServiceInfoToConsole -Service $LOGGINGEXPORTER_SERVICE
}
1760
# Writes the logging agent's configuration. Restart-LoggingAgent should then
# be called to pick up the new configuration.
#
# When the Stackdriver agent is installed, its configuration is written
# instead. Otherwise the Fluent Bit main and parser configuration files are
# written (ASCII encoded) under $LOGGINGAGENT_ROOT\conf and the directory for
# the log position files is created.
function Configure-LoggingAgent {
  if (IsStackdriverAgentInstalled) {
    Configure-StackdriverAgent
  }
  else {
    $conf_dir = "$LOGGINGAGENT_ROOT\conf"

    $main_conf = "$conf_dir\fluent-bit.conf"
    Out-File -InputObject $FLUENTBIT_CONFIG -FilePath $main_conf -Encoding ASCII
    Log-Output "Wrote logging config to $main_conf"

    $parsers_conf = "$conf_dir\parsers.conf"
    Out-File -InputObject $PARSERS_CONFIG -FilePath $parsers_conf -Encoding ASCII

    # Create directory for all the log position files.
    New-Item -Type Directory -Path "/var/run/google-fluentbit/pos-files/" -Force | Out-Null

    Log-Output "Wrote logging config to $parsers_conf"
  }
}
1781
# Fluent Bit main configuration file. Configure-LoggingAgent writes this text
# to $LOGGINGAGENT_ROOT\conf\fluent-bit.conf.
1783$FLUENTBIT_CONFIG = @'
1784[SERVICE]
1785 Flush 5
1786 Grace 120
1787 Log_Level info
1788 Log_File /var/log/fluentbit.log
1789 Daemon off
1790 Parsers_File parsers.conf
1791 HTTP_Server off
1792 HTTP_Listen 0.0.0.0
1793 HTTP_PORT 2020
1794 plugins_file plugins.conf
1795
1796 # Storage
1797 # =======
1798 # Fluent Bit can use memory and filesystem buffering based mechanisms
1799 #
1800 # - https://docs.fluentbit.io/manual/administration/buffering-and-storage
1801 #
1802 # storage metrics
1803 # ---------------
1804 # publish storage pipeline metrics in '/api/v1/storage'. The metrics are
1805 # exported only if the 'http_server' option is enabled.
1806 #
1807 # storage.metrics on
1808
1809 # storage.path
1810 # ------------
1811 # absolute file system path to store filesystem data buffers (chunks).
1812 #
1813 # storage.path /tmp/storage
1814
1815 # storage.sync
1816 # ------------
1817 # configure the synchronization mode used to store the data into the
1818 # filesystem. It can take the values normal or full.
1819 #
1820 # storage.sync normal
1821
1822 # storage.checksum
1823 # ----------------
1824 # enable the data integrity check when writing and reading data from the
1825 # filesystem. The storage layer uses the CRC32 algorithm.
1826 #
1827 # storage.checksum off
1828
1829 # storage.backlog.mem_limit
1830 # -------------------------
1831 # if storage.path is set, Fluent Bit will look for data chunks that were
1832 # not delivered and are still in the storage layer, these are called
1833 # backlog data. This option configure a hint of maximum value of memory
1834 # to use when processing these records.
1835 #
1836 # storage.backlog.mem_limit 5M
1837
1838[INPUT]
1839 Name winlog
1840 Interval_Sec 2
1841 # Channels Setup,Windows PowerShell
1842 Channels application,system,security
1843 Tag winevt.raw
1844 DB /var/run/google-fluentbit/pos-files/winlog.db
1845
1846# Json Log Example:
1847# {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
1848[INPUT]
1849 Name tail
1850 Alias kube_containers
1851 Tag kube_<namespace_name>_<pod_name>_<container_name>
1852 Tag_Regex (?<pod_name>[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-
1853 Mem_Buf_Limit 5MB
1854 Skip_Long_Lines On
1855 Refresh_Interval 5
1856 Path C:\var\log\containers\*.log
1857 DB /var/run/google-fluentbit/pos-files/flb_kube.db
1858
1859[FILTER]
1860 Name parser
1861 Match kube_*
1862 Key_Name log
1863 Reserve_Data True
1864 Parser docker
1865 Parser containerd
1866
1867# Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
1868# Example:
1869# I0716 02:08:55.559351 3356 log_spam.go:42] Command line arguments:
1870[INPUT]
1871 Name tail
1872 Alias node-problem-detector
1873 Tag node-problem-detector
1874 Mem_Buf_Limit 5MB
1875 Skip_Long_Lines On
1876 Refresh_Interval 5
1877 Path C:\etc\kubernetes\logs\node-problem-detector\*.log.INFO*
1878 DB /var/run/google-fluentbit/pos-files/node-problem-detector.db
1879 Multiline On
1880 Parser_Firstline glog
1881
1882# Example:
1883# I0928 03:15:50.440223 4880 main.go:51] Starting CSI-Proxy Server ...
1884[INPUT]
1885 Name tail
1886 Alias csi-proxy
1887 Tag csi-proxy
1888 Mem_Buf_Limit 5MB
1889 Skip_Long_Lines On
1890 Refresh_Interval 5
1891 Path /etc/kubernetes/logs/csi-proxy.log
1892 DB /var/run/google-fluentbit/pos-files/csi-proxy.db
1893 Multiline On
1894 Parser_Firstline glog
1895
1896# I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
1897[INPUT]
1898 Name tail
1899 Alias kube-proxy
1900 Tag kube-proxy
1901 Mem_Buf_Limit 5MB
1902 Skip_Long_Lines On
1903 Refresh_Interval 5
1904 Path /etc/kubernetes/logs/kube-proxy.log
1905 DB /var/run/google-fluentbit/pos-files/kube-proxy.db
1906 Multiline On
1907 Parser_Firstline glog
1908
1909# Example:
1910# time="2019-12-10T21:27:59.836946700Z" level=info msg="loading plugin \"io.containerd.grpc.v1.cri\"..." type=io.containerd.grpc.v1
1911[INPUT]
1912 Name tail
1913 Alias container-runtime
1914 Tag container-runtime
1915 Mem_Buf_Limit 5MB
1916 Skip_Long_Lines On
1917 Refresh_Interval 5
1918 Path /etc/kubernetes/logs/containerd.log
1919 DB /var/run/google-fluentbit/pos-files/container-runtime.db
1920 # TODO: Add custom parser for containerd logs once format is settled.
1921
1922# Example:
1923# I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
1924[INPUT]
1925 Name tail
1926 Alias kubelet
1927 Tag kubelet
1928 Mem_Buf_Limit 5MB
1929 Skip_Long_Lines On
1930 Refresh_Interval 5
1931 Path /etc/kubernetes/logs/kubelet.log
1932 DB /var/run/google-fluentbit/pos-files/kubelet.db
1933 Multiline On
1934 Parser_Firstline glog
1935
1936[FILTER]
1937 Name modify
1938 Match *
1939 Hard_rename log message
1940
1941[FILTER]
1942 Name modify
1943 Match winevt.raw
1944 Hard_rename Message message
1945
1946[FILTER]
1947 Name parser
1948 Match kube_*
1949 Key_Name message
1950 Reserve_Data True
1951 Parser glog
1952 Parser json
1953
1954[OUTPUT]
1955 Name http
1956 Match *
1957 Host 127.0.0.1
1958 Port 2021
1959 URI /logs
1960 header_tag FLUENT-TAG
1961 Format msgpack
1962 Retry_Limit 2
1963'@
1964
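# Fluent Bit parsers configuration file. Configure-LoggingAgent writes this
# text to $LOGGINGAGENT_ROOT\conf\parsers.conf, the file named by Parsers_File
# in the main configuration.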
1966$PARSERS_CONFIG = @'
1967[PARSER]
1968 Name docker
1969 Format json
1970 Time_Key time
1971 Time_Format %Y-%m-%dT%H:%M:%S.%L%z
1972
1973[PARSER]
1974 Name containerd
1975 Format regex
1976 Regex ^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$
1977 Time_Key time
1978 Time_Format %Y-%m-%dT%H:%M:%S.%L%z
1979
1980[PARSER]
1981 Name json
1982 Format json
1983
1984[PARSER]
1985 Name syslog
1986 Format regex
1987 Regex ^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
1988 Time_Key time
1989 Time_Format %b %d %H:%M:%S
1990
1991[PARSER]
1992 Name glog
1993 Format regex
1994 Regex ^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source_file>[^ \]]+)\:(?<source_line>\d+)\]\s(?<message>.*)$
1995 Time_Key time
1996 Time_Format %m%d %H:%M:%S.%L
1997
1998[PARSER]
1999 Name network-log
2000 Format json
2001 Time_Key timestamp
2002 Time_Format %Y-%m-%dT%H:%M:%S.%L%z
2003
2004[PARSER]
2005 Name syslog-rfc5424
2006 Format regex
2007 Regex ^\<(?<pri>[0-9]{1,5})\>1 (?<time>[^ ]+) (?<host>[^ ]+) (?<ident>[^ ]+) (?<pid>[-0-9]+) (?<msgid>[^ ]+) (?<extradata>(\[(.*?)\]|-)) (?<message>.+)$
2008 Time_Key time
2009 Time_Format %Y-%m-%dT%H:%M:%S.%L%z
2010 Time_Keep On
2011
2012[PARSER]
2013 Name syslog-rfc3164-local
2014 Format regex
2015 Regex ^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
2016 Time_Key time
2017 Time_Format %b %d %H:%M:%S
2018 Time_Keep On
2019
2020[PARSER]
2021 Name syslog-rfc3164
2022 Format regex
2023 Regex /^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$/
2024 Time_Key time
2025 Time_Format %b %d %H:%M:%S
2026 Time_Keep On
2027
2028[PARSER]
2029 Name kube-custom
2030 Format regex
2031 Regex (?<tag>[^.]+)?\.?(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
2032'@
2033
2034
# ----------- Stackdriver logging setup --------------------------
# This section will be deprecated soon.
#

# Root directory of the Stackdriver installation; the logging agent's
# configuration and position files are addressed relative to it.
$STACKDRIVER_ROOT = 'C:\Program Files (x86)\Stackdriver'
2040
# Restarts the Stackdriver logging agent, or starts it if it is not currently
# running. A standard `Restart-Service StackdriverLogging` may fail because
# StackdriverLogging sometimes is unstoppable, so the work is delegated to
# Restart-LogService, which kills the processes whose command line matches
# the given pattern when the service does not stop on its own.
#
# Throws if the service cannot be stopped (see Restart-LogService).
function Restart-StackdriverAgent {
  Restart-LogService -service 'StackdriverLogging' -cmdline '*Stackdriver/logging*'
}
2082
# Returns $true when the Stackdriver logging agent is registered as a Windows
# service named StackdriverLogging (i.e. the service lookup yields a
# non-empty status), $false otherwise.
function IsStackdriverAgentInstalled {
  $agent_service = Get-Service StackdriverLogging -ErrorAction Ignore
  $is_missing = [string]::IsNullOrEmpty($agent_service.Status)
  return -not $is_missing
}
2088
# Writes the logging configuration file for Stackdriver and configures the
# StackdriverLogging service's failure actions. Restart-LoggingAgent should
# then be called to pick up the new configuration.
function Configure-StackdriverAgent {
  $config_dir = "$STACKDRIVER_ROOT\LoggingAgent\config.d"
  $config_path = "$config_dir\k8s_containers.conf"

  # The config.d directory should have already been created automatically, but
  # try creating it again just in case.
  New-Item $config_dir -ItemType 'directory' -Force | Out-Null

  # Create a configuration file for kubernetes containers, substituting this
  # node's hostname for the NODE_NAME placeholder.
  $node_name = hostname
  $rendered_config = $FLUENTD_CONFIG.replace('NODE_NAME', $node_name)
  Out-File -InputObject $rendered_config -FilePath $config_path -Encoding ASCII
  Log-Output "Wrote fluentd logging config to $config_path"

  # Configure StackdriverLogging to automatically restart on failure: the
  # first restart is attempted after 1000 ms and the next after 10000 ms. The
  # logging agent may die due to various disruptions but can be resumed.
  sc.exe failure StackdriverLogging reset= 0 actions= restart/1000/restart/10000
  Write-VerboseServiceInfoToConsole -Service 'StackdriverLogging'
}
2110
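# Fluentd configuration for the Stackdriver logging agent. The NODE_NAME
# placeholder must be replaced with the node's name (hostname) before the
# configuration is written; Configure-StackdriverAgent does this.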
2112$FLUENTD_CONFIG = @'
2113# This configuration file for Fluentd is used to watch changes to kubernetes
2114# container logs in the directory /var/lib/docker/containers/ and submit the
2115# log records to Google Cloud Logging using the cloud-logging plugin.
2116#
2117# Example
2118# =======
2119# A line in the Docker log file might look like this JSON:
2120#
2121# {"log":"2014/09/25 21:15:03 Got request with path wombat\\n",
2122# "stream":"stderr",
2123# "time":"2014-09-25T21:15:03.499185026Z"}
2124#
2125# The original tag is derived from the log file's location.
2126# For example a Docker container's logs might be in the directory:
2127# /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b
2128# and in the file:
2129# 997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
2130# where 997599971ee6... is the Docker ID of the running container.
2131# The Kubernetes kubelet makes a symbolic link to this file on the host
2132# machine in the /var/log/containers directory which includes the pod name,
2133# the namespace name and the Kubernetes container name:
2134# synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
2135# ->
2136# /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
2137# The /var/log directory on the host is mapped to the /var/log directory in the container
2138# running this instance of Fluentd and we end up collecting the file:
2139# /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
2140# This results in the tag:
2141# var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
2142# where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the
2143# namespace name, 'synth-lgr' is the container name and '997599971ee6..' is
2144# the container ID.
2145# The record reformer is used to extract pod_name, namespace_name and
2146# container_name from the tag and set them in a local_resource_id in the
2147# format of:
2148# 'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
2149# The reformer also changes the tags to 'stderr' or 'stdout' based on the
2150# value of 'stream'.
2151# local_resource_id is later used by google_cloud plugin to determine the
2152# monitored resource to ingest logs against.
2153# Json Log Example:
2154# {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
2155# CRI Log Example:
2156# 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
2157<source>
2158 @type tail
2159 path /var/log/containers/*.log
2160 pos_file /var/log/gcp-containers.log.pos
2161 # Tags at this point are in the format of:
2162 # reform.var.log.containers.<POD_NAME>_<NAMESPACE_NAME>_<CONTAINER_NAME>-<CONTAINER_ID>.log
2163 tag reform.*
2164 read_from_head true
2165 <parse>
2166 @type multi_format
2167 <pattern>
2168 format json
2169 time_key time
2170 time_format %Y-%m-%dT%H:%M:%S.%NZ
2171 keep_time_key
2172 </pattern>
2173 <pattern>
2174 format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
2175 time_format %Y-%m-%dT%H:%M:%S.%N%:z
2176 </pattern>
2177 </parse>
2178</source>
2179# Example:
2180# I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
2181<source>
2182 @type tail
2183 format multiline
2184 multiline_flush_interval 5s
2185 format_firstline /^\w\d{4}/
2186 format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
2187 time_format %m%d %H:%M:%S.%N
2188 path /etc/kubernetes/logs/kubelet.log
2189 pos_file /etc/kubernetes/logs/gcp-kubelet.log.pos
2190 tag kubelet
2191</source>
2192# Example:
2193# I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
2194<source>
2195 @type tail
2196 format multiline
2197 multiline_flush_interval 5s
2198 format_firstline /^\w\d{4}/
2199 format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
2200 time_format %m%d %H:%M:%S.%N
2201 path /etc/kubernetes/logs/kube-proxy.log
2202 pos_file /etc/kubernetes/logs/gcp-kube-proxy.log.pos
2203 tag kube-proxy
2204</source>
2205# Example:
2206# I0928 03:15:50.440223 4880 main.go:51] Starting CSI-Proxy Server ...
2207<source>
2208 @type tail
2209 format multiline
2210 multiline_flush_interval 5s
2211 format_firstline /^\w\d{4}/
2212 format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
2213 time_format %m%d %H:%M:%S.%N
2214 path /etc/kubernetes/logs/csi-proxy.log
2215 pos_file /etc/kubernetes/logs/gcp-csi-proxy.log.pos
2216 tag csi-proxy
2217</source>
2218# Example:
2219# time="2019-12-10T21:27:59.836946700Z" level=info msg="loading plugin \"io.containerd.grpc.v1.cri\"..." type=io.containerd.grpc.v1
2220<source>
2221 @type tail
2222 format multiline
2223 multiline_flush_interval 5s
2224 format_firstline /^time=/
2225 format1 /^time="(?<time>[^ ]*)" level=(?<severity>\w*) (?<message>.*)/
2226 time_format %Y-%m-%dT%H:%M:%S.%N%z
2227 path /etc/kubernetes/logs/containerd.log
2228 pos_file /etc/kubernetes/logs/gcp-containerd.log.pos
2229 tag container-runtime
2230</source>
2231<match reform.**>
2232 @type record_reformer
2233 enable_ruby true
2234 <record>
2235 # Extract local_resource_id from tag for 'k8s_container' monitored
2236 # resource. The format is:
2237 # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
2238 "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_suffix[4].rpartition('.')[0].split('_')[1]}.#{tag_suffix[4].rpartition('.')[0].split('_')[0]}.#{tag_suffix[4].rpartition('.')[0].split('_')[2].rpartition('-')[0]}"}
2239 # Rename the field 'log' to a more generic field 'message'. This way the
2240 # fluent-plugin-google-cloud knows to flatten the field as textPayload
2241 # instead of jsonPayload after extracting 'time', 'severity' and
2242 # 'stream' from the record.
2243 message ${record['log']}
2244 # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
2245 severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
2246 </record>
2247 tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
2248 remove_keys stream,log
2249</match>
2250# TODO: detect exceptions and forward them as one log entry using the
2251# detect_exceptions plugin
2252# This section is exclusive for k8s_container logs. These logs come with
2253# 'raw.stderr' or 'raw.stdout' tags.
2254<match {raw.stderr,raw.stdout}>
2255 @type google_cloud
2256 # Try to detect JSON formatted log entries.
2257 detect_json true
2258 # Allow log entries from multiple containers to be sent in the same request.
2259 split_logs_by_tag false
2260 # Set the buffer type to file to improve the reliability and reduce the memory consumption
2261 buffer_type file
2262 buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
2263 # Set queue_full action to block because we want to pause gracefully
2264 # in case of the off-the-limits load instead of throwing an exception
2265 buffer_queue_full_action block
2266 # Set the chunk limit conservatively to avoid exceeding the recommended
2267 # chunk size of 5MB per write request.
2268 buffer_chunk_limit 512k
2269 # Cap the combined memory usage of this buffer and the one below to
2270 # 512KiB/chunk * (6 + 2) chunks = 4 MiB
2271 buffer_queue_limit 6
2272 # Never wait more than 5 seconds before flushing logs in the non-error case.
2273 flush_interval 5s
2274 # Never wait longer than 30 seconds between retries.
2275 max_retry_wait 30
2276 # Disable the limit on the number of retries (retry forever).
2277 disable_retry_limit
2278 # Use multiple threads for processing.
2279 num_threads 2
2280 use_grpc true
2281 # Skip timestamp adjustment as this is in a controlled environment with
2282 # known timestamp format. This helps with CPU usage.
2283 adjust_invalid_timestamps false
2284</match>
2285# Attach local_resource_id for 'k8s_node' monitored resource.
2286<filter **>
2287 @type record_transformer
2288 enable_ruby true
2289 <record>
2290 "logging.googleapis.com/local_resource_id" ${"k8s_node.NODE_NAME"}
2291 </record>
2292</filter>
2293'@
2294
# Downloads the out-of-tree kubelet image credential provider binary
# (auth-provider-gcp.exe) and moves it into AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR.
# Does nothing unless the ENABLE_AUTH_PROVIDER_GCP environment variable is
# "true"; the download is skipped when ShouldWrite-File rejects the target.
function DownloadAndInstall-AuthProviderGcpBinary {
  if ("${env:ENABLE_AUTH_PROVIDER_GCP}" -ne "true") {
    return
  }

  $binary_name = 'auth-provider-gcp.exe'
  if (-not (ShouldWrite-File "${env:AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR}\$binary_name")) {
    Log-Output "Skipping auth provider gcp binaries installation, auth-provider-gcp.exe file already exists."
    return
  }

  Log-Output "Installing auth provider gcp binaries"
  $staging_dir = 'C:\k8s_tmp'
  New-Item -Force -ItemType 'directory' $staging_dir | Out-Null
  $download_url = "${env:AUTH_PROVIDER_GCP_STORAGE_PATH}/${env:AUTH_PROVIDER_GCP_VERSION}/windows_amd64/$binary_name"

  # NOTE(review): the expected hash is read from a PowerShell variable while
  # every other setting here comes from the environment (env:) - confirm that
  # $AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64 is actually defined for this module.
  MustDownload-File `
      -Hash $AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64 `
      -Algorithm SHA512 `
      -OutFile "$staging_dir\$binary_name" `
      -URLs $download_url

  Move-Item -Force "$staging_dir\$binary_name" ${env:AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR}
  Remove-Item -Force -Recurse $staging_dir
}
2312
# Creates the config file for the out-of-tree kubelet image credential
# provider at the path given by AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE. Does
# nothing unless the ENABLE_AUTH_PROVIDER_GCP environment variable is "true";
# the file is left alone when ShouldWrite-File rejects the target.
function Create-AuthProviderGcpConfig {
  if ("${env:ENABLE_AUTH_PROVIDER_GCP}" -ne "true") {
    return
  }

  $config_path = ${env:AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE}
  if (-not (ShouldWrite-File $config_path)) {
    Log-Output "Skipping auth provider gcp config file creation, it already exists"
    return
  }

  Log-Output "Creating auth provider gcp config file"
  # The closing '@ of the here-string must stay at the start of its line.
  $provider_config = @'
kind: CredentialProviderConfig
apiVersion: kubelet.config.k8s.io/v1
providers:
  - name: auth-provider-gcp.exe
    apiVersion: credentialprovider.kubelet.k8s.io/v1
    matchImages:
      - "container.cloud.google.com"
      - "gcr.io"
      - "*.gcr.io"
      - "*.pkg.dev"
    args:
      - get-credentials
      - --v=3
    defaultCacheDuration: 1m
'@
  Set-Content -Path $config_path -Value $provider_config
}
2339
2340
# Export all public functions: only functions whose names contain a hyphen
# match the `*-*` pattern, so un-hyphenated helpers such as
# IsLoggingAgentInstalled and IsStackdriverAgentInstalled are not exported.
Export-ModuleMember -Function *-*
View as plain text