# This ConfigMap is used to ingest logs against new resources like
# "k8s_container" and "k8s_node" when $LOGGING_STACKDRIVER_RESOURCE_TYPES is set
# to "new".
# When $LOGGING_STACKDRIVER_RESOURCE_TYPES is set to "old", the ConfigMap in
# fluentd-gcp-configmap-old.yaml will be used for ingesting logs against old
# resources like "gke_container" and "gce_instance".
kind: ConfigMap
apiVersion: v1
data:
  containers.input.conf: |-
    # This configuration file for Fluentd is used
    # to watch changes to Docker log files that live in the
    # directory /var/lib/docker/containers/ and are symbolically
    # linked to from the /var/log/containers directory using names that capture the
    # pod name and container name. These logs are then submitted to
    # Google Cloud Logging, which requires the cloud-logging plug-in to be installed.
    #
    # Example
    # =======
    # A line in the Docker log file might look like this JSON:
    #
    # {"log":"2014/09/25 21:15:03 Got request with path wombat\\n",
    #  "stream":"stderr",
    #  "time":"2014-09-25T21:15:03.499185026Z"}
    #
    # The original tag is derived from the log file's location.
    # For example, a Docker container's logs might be in the directory:
    #   /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b
    # and in the file:
    #   997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
    # where 997599971ee6... is the Docker ID of the running container.
    # The Kubernetes kubelet creates a symbolic link to this file on the host
    # machine in the /var/log/containers directory; the link name includes the
    # pod name, the namespace name and the Kubernetes container name:
    #   synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
    #   ->
    #   /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
    # The /var/log directory on the host is mapped to the /var/log directory in the container
    # running this instance of Fluentd, and we end up collecting the file:
    #   /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
    # This results in the tag:
    #   var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
    # where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the
    # namespace name, 'synth-lgr' is the container name and '997599971ee6..' is
    # the container ID.
    # The record reformer is used to extract pod_name, namespace_name and
    # container_name from the tag and set them in a local_resource_id in the
    # format of:
    # 'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
    # The reformer also changes the tags to 'stderr' or 'stdout' based on the
    # value of 'stream'.
    # local_resource_id is later used by the google_cloud plugin to determine the
    # monitored resource to ingest logs against.

    # JSON log example:
    # {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
    # CRI log example:
    # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
    <source>
      @type tail
      path /var/log/containers/*.log
      pos_file /var/log/gcp-containers.log.pos
      # Tags at this point are in the format of:
      # reform.var.log.containers.<POD_NAME>_<NAMESPACE_NAME>_<CONTAINER_NAME>-<CONTAINER_ID>.log
      tag reform.*
      read_from_head true
      <parse>
        @type multi_format
        <pattern>
          format json
          time_key time
          time_format %Y-%m-%dT%H:%M:%S.%NZ
        </pattern>
        <pattern>
          format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
          time_format %Y-%m-%dT%H:%M:%S.%N%:z
        </pattern>
      </parse>
    </source>
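    # Note on the parser above: the JSON <pattern> handles lines written by the
    # Docker json-file logging driver (the JSON log example), while the regex
    # <pattern> handles CRI-formatted lines (the CRI log example). multi_format
    # tries the patterns in order and uses the first one that matches.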

    <filter reform.**>
      @type parser
      format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
      reserve_data true
      suppress_parse_error_log true
      emit_invalid_record_to_error false
      key_name log
    </filter>
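    # For example, a glog-formatted line such as
    # "I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: ..."
    # parses into severity=I, time=0204 07:32:30.020537, pid=3368,
    # source=server.go:1048, with 'log' set to the remainder of the line.
    # suppress_parse_error_log keeps non-glog lines from generating a parse
    # warning for every record.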

    <match reform.**>
      @type record_reformer
      enable_ruby true
      <record>
        # Extract local_resource_id from tag for 'k8s_container' monitored
        # resource. The format is:
        # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
        "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_suffix[4].rpartition('.')[0].split('_')[1]}.#{tag_suffix[4].rpartition('.')[0].split('_')[0]}.#{tag_suffix[4].rpartition('.')[0].split('_')[2].rpartition('-')[0]}"}
        # Rename the field 'log' to a more generic field 'message'. This way the
        # fluent-plugin-google-cloud knows to flatten the field as textPayload
        # instead of jsonPayload after extracting 'time', 'severity' and
        # 'stream' from the record.
        message ${record['log']}
        # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
        severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
      </record>
      tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
      remove_keys stream,log
    </match>
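    # Worked example of the extraction above: for the tag
    # reform.var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
    # tag_suffix[4] is everything after 'reform.var.log.containers.'; dropping
    # the '.log' suffix and splitting on '_' yields the pod name, the namespace
    # name and '<container>-<container_id>', so local_resource_id becomes
    # 'k8s_container.default.synthetic-logger-0.25lps-pod.synth-lgr'.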

    # Detect exceptions in the log output and forward them as one log entry.
    <match {raw.stderr,raw.stdout}>
      @type detect_exceptions

      remove_tag_prefix raw
      message message
      stream "logging.googleapis.com/local_resource_id"
      multiline_flush_interval 5
      max_bytes 500000
      max_lines 1000
    </match>
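    # With detect_exceptions, consecutive lines that belong to a single stack
    # trace (for example a Java or Python traceback written to stderr) are
    # buffered and re-emitted as one combined entry under the 'stderr' or
    # 'stdout' tag instead of one entry per line; grouping is done per
    # container via the local_resource_id field named in 'stream'.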
  system.input.conf: |-
    # Example:
    # Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
    <source>
      @type tail
      format syslog
      path /var/log/startupscript.log
      pos_file /var/log/gcp-startupscript.log.pos
      tag startupscript
    </source>

    # Examples:
    # time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
    # time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
    # TODO(random-liu): Remove this after cri container runtime rolls out.
    <source>
      @type tail
      format /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=(?<status_code>\d+))?/
      path /var/log/docker.log
      pos_file /var/log/gcp-docker.log.pos
      tag docker
    </source>
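    # For example, the second sample line above yields severity=error,
    # message=HTTP Error, error=No such image: -f and status_code=404.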

    # Example:
    # 2016/02/04 06:52:38 filePurge: successfully removed file /var/etcd/data/member/wal/00000000000006d0-00000000010a23d1.wal
    <source>
      @type tail
      # Not parsing this, because it doesn't have anything particularly useful to
      # parse out of it (like severities).
      format none
      path /var/log/etcd.log
      pos_file /var/log/gcp-etcd.log.pos
      tag etcd
    </source>

    # Multi-line parsing is required for all the kube logs because very large log
    # statements, such as those that include entire object bodies, get split into
    # multiple lines by glog.
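    # In the sources below, format_firstline recognizes the glog header (a
    # severity letter followed by the date, e.g. 'I0204'); any line that does
    # not match it is treated as a continuation of the previous entry.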

    # Example:
    # I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kubelet.log
      pos_file /var/log/gcp-kubelet.log.pos
      tag kubelet
    </source>

    # Example:
    # I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-proxy.log
      pos_file /var/log/gcp-kube-proxy.log.pos
      tag kube-proxy
    </source>

    # Example:
    # I0204 07:00:19.604280 5 handlers.go:131] GET /api/v1/nodes: (1.624207ms) 200 [[kube-controller-manager/v1.1.3 (linux/amd64) kubernetes/6a81b50] 127.0.0.1:38266]
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-apiserver.log
      pos_file /var/log/gcp-kube-apiserver.log.pos
      tag kube-apiserver
    </source>

    # Example:
    # I0204 06:55:31.872680 5 servicecontroller.go:277] LB already exists and doesn't need update for service kube-system/kube-ui
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-controller-manager.log
      pos_file /var/log/gcp-kube-controller-manager.log.pos
      tag kube-controller-manager
    </source>

    # Example:
    # W0204 06:49:18.239674 7 reflector.go:245] pkg/scheduler/factory/factory.go:193: watch of *api.Service ended with: 401: The event in requested index is outdated and cleared (the requested history has been cleared [2578313/2577886]) [2579312]
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-scheduler.log
      pos_file /var/log/gcp-kube-scheduler.log.pos
      tag kube-scheduler
    </source>

    # Example:
    # I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/glbc.log
      pos_file /var/log/gcp-glbc.log.pos
      tag glbc
    </source>

    # Example:
    # I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/cluster-autoscaler.log
      pos_file /var/log/gcp-cluster-autoscaler.log.pos
      tag cluster-autoscaler
    </source>

    # Logs from systemd-journal for interesting services.
    # TODO(random-liu): Keep this for compatibility, remove this after
    # cri container runtime rolls out.
    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "docker.service" }]
      pos_file /var/log/gcp-journald-docker.pos
      read_from_head true
      tag docker
    </source>

    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "{{ fluentd_container_runtime_service }}.service" }]
      pos_file /var/log/gcp-journald-container-runtime.pos
      read_from_head true
      tag container-runtime
    </source>

    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
      pos_file /var/log/gcp-journald-kubelet.pos
      read_from_head true
      tag kubelet
    </source>

    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
      pos_file /var/log/gcp-journald-node-problem-detector.pos
      read_from_head true
      tag node-problem-detector
    </source>

    # BEGIN_NODE_JOURNAL
    # Whether to include node-journal or not is determined when starting the
    # cluster. It is not changed when the cluster is already running.
    <source>
      @type systemd
      pos_file /var/log/gcp-journald.pos
      read_from_head true
      tag node-journal
    </source>

    <filter node-journal>
      @type grep
      <exclude>
        key _SYSTEMD_UNIT
        pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet|node-problem-detector)\.service$
      </exclude>
    </filter>
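    # The exclude rule above keeps journal entries from units that already have
    # dedicated sources (docker, the container runtime, kubelet and
    # node-problem-detector) out of the node-journal tag, so they are not
    # ingested twice.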
    # END_NODE_JOURNAL
  monitoring.conf: |-
    # This source is used to acquire the approximate process start timestamp,
    # whose purpose is explained before the corresponding output plugin.
    <source>
      @type exec
      command /bin/sh -c 'date +%s'
      tag process_start
      time_format %Y-%m-%d %H:%M:%S
      keys process_start_timestamp
    </source>
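    # The command above emits a record like {"process_start_timestamp":"1518000000"}
    # (the value shown is illustrative). exec output fields are strings, which
    # is why the filter below casts the timestamp to an integer.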

    # This filter is used to convert the process start timestamp to an integer
    # value for correct ingestion in the prometheus output plugin.
    <filter process_start>
      @type record_transformer
      enable_ruby true
      auto_typecast true
      <record>
        process_start_timestamp ${record["process_start_timestamp"].to_i}
      </record>
    </filter>
  output.conf: |-
    # This match is placed before the all-matching output to provide the metric
    # exporter with a process start timestamp for correct exporting of
    # cumulative metrics to Stackdriver.
    <match process_start>
      @type prometheus

      <metric>
        type gauge
        name process_start_time_seconds
        desc Timestamp of the process start in seconds
        key process_start_timestamp
      </metric>
    </match>

    # This filter counts the number of log entries read by fluentd before they
    # are processed by the output plugin. This makes it possible to monitor the
    # number of log entries that were read but never sent, e.g. because the
    # liveness probe removed the buffer.
    <filter **>
      @type prometheus
      <metric>
        type counter
        name logging_entry_count
        desc Total number of log entries generated by either application containers or system components
      </metric>
    </filter>

    # This section is exclusive for k8s_container logs. Those come with
    # 'stderr'/'stdout' tags.
    # TODO(instrumentation): Reconsider this workaround later.
    # Trim entries that exceed slightly less than 100KB, to avoid dropping them;
    # this is necessary because Stackdriver only supports entries that are up to
    # 100KB in size.
    <filter {stderr,stdout}>
      @type record_transformer
      enable_ruby true
      <record>
        message ${record['message'].length > 100000 ? "[Trimmed]#{record['message'][0..100000]}..." : record['message']}
      </record>
    </filter>
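    # As an illustration, a 150,000-character message becomes
    # "[Trimmed]<first ~100,000 characters>..." so the entry stays within the
    # Stackdriver limit instead of being dropped.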

    # Do not collect fluentd's own logs to avoid infinite loops.
    <match fluent.**>
      @type null
    </match>

    # Add a unique insertId to each log entry that doesn't already have it.
    # This helps guarantee the order and prevent log duplication.
    <filter **>
      @type add_insert_ids
    </filter>

    # This section is exclusive for k8s_container logs. These logs come with
    # 'stderr'/'stdout' tags.
    # We use a separate output stanza for 'k8s_node' logs with a smaller buffer
    # because node logs are less important than users' container logs.
    <match {stderr,stdout}>
      @type google_cloud

      # Try to detect JSON formatted log entries.
      detect_json true
      # Collect metrics in Prometheus registry about plugin activity.
      enable_monitoring true
      monitoring_type prometheus
      # Allow log entries from multiple containers to be sent in the same request.
      split_logs_by_tag false
      # Set the buffer type to file to improve reliability and reduce memory consumption.
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
      # Set queue_full action to block, because we want to pause gracefully
      # under excessive load instead of throwing an exception.
      buffer_queue_full_action block
      # Set the chunk limit conservatively to avoid exceeding the recommended
      # chunk size of 5MB per write request.
      buffer_chunk_limit 512k
      # Cap the combined memory usage of this buffer and the one below to
      # 512KiB/chunk * (6 + 2) chunks = 4 MiB
      buffer_queue_limit 6
      # Never wait more than 5 seconds before flushing logs in the non-error case.
      flush_interval 5s
      # Never wait longer than 30 seconds between retries.
      max_retry_wait 30
      # Disable the limit on the number of retries (retry forever).
      disable_retry_limit
      # Use multiple threads for processing.
      num_threads 2
      use_grpc true
      # Skip timestamp adjustment as this is in a controlled environment with
      # known timestamp format. This helps with CPU usage.
      adjust_invalid_timestamps false
    </match>

    # Attach local_resource_id for 'k8s_node' monitored resource.
    <filter **>
      @type record_transformer
      enable_ruby true
      <record>
        "logging.googleapis.com/local_resource_id" ${"k8s_node.#{ENV['NODE_NAME']}"}
      </record>
    </filter>
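    # For example, on a node whose NODE_NAME environment variable is
    # gke-example-node-1 (an illustrative name), every record reaching this
    # filter gets local_resource_id 'k8s_node.gke-example-node-1'.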

    # This section is exclusive for 'k8s_node' logs. These logs come with tags
    # that are neither 'stderr' nor 'stdout'.
    # We use a separate output stanza for 'k8s_container' logs with a larger
    # buffer because users' container logs are more important than node logs.
    <match **>
      @type google_cloud

      detect_json true
      enable_monitoring true
      monitoring_type prometheus
      # Allow entries from multiple system logs to be sent in the same request.
      split_logs_by_tag false
      detect_subservice false
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
      buffer_queue_full_action block
      buffer_chunk_limit 512k
      buffer_queue_limit 2
      flush_interval 5s
      max_retry_wait 30
      disable_retry_limit
      num_threads 2
      use_grpc true
      # Skip timestamp adjustment as this is in a controlled environment with
      # known timestamp format. This helps with CPU usage.
      adjust_invalid_timestamps false
    </match>
metadata:
  name: fluentd-gcp-config-v1.2.5
  namespace: kube-system
  labels:
    addonmanager.kubernetes.io/mode: Reconcile