From 5bd7be614a76809177538b86516079734520ba09 Mon Sep 17 00:00:00 2001 From: wf-jenkins <34043090+wf-jenkins@users.noreply.github.com> Date: Wed, 14 Jun 2023 11:15:45 -0700 Subject: [PATCH] Release operator version: 2.8.0 --- README.md | 11 +++ collector/release/NEXT_RELEASE_VERSION | 2 +- collector/release/VERSION | 2 +- deploy/crd/wavefront.com_wavefronts.yaml | 31 ++++++- ...avefront-collector-existing-configmap.yaml | 43 ++++++++- .../wavefront-custom-private-registry.yaml | 19 ++++ deploy/scenarios/wavefront-full-config.yaml | 44 ++++++++- deploy/wavefront-operator.yaml | 86 ++++++++++++++++- docs/alerts/alerts.md | 58 ++++++++++++ docs/alerts/create-alert.sh | 84 +++++++++++++++++ .../templates/pods-stuck-in-pending.json.tmpl | 26 ++++++ docs/collector/configuration.md | 13 --- docs/collector/metrics.md | 92 ++++++++++--------- docs/operator/custom-configuration.md | 31 ++++++- docs/operator/helm-feature-comparison.md | 2 +- docs/troubleshooting.md | 2 - .../config/manager/component_versions.yaml | 2 +- operator/config/manager/kustomization.yaml | 2 +- .../deploy/wavefront-operator.yaml | 88 +++++++++++++++++- .../docs/operator/custom-configuration.md | 4 +- operator/release/NEXT_RELEASE_VERSION | 2 +- operator/release/OPERATOR_VERSION | 2 +- 22 files changed, 561 insertions(+), 85 deletions(-) create mode 100644 deploy/scenarios/wavefront-custom-private-registry.yaml create mode 100644 docs/alerts/alerts.md create mode 100755 docs/alerts/create-alert.sh create mode 100644 docs/alerts/templates/pods-stuck-in-pending.json.tmpl diff --git a/README.md b/README.md index 578cde328..eb4730613 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ See below for configuration options. We have templates for common scenarios. See the comments in each file for usage instructions. + * [Using a custom private registry](deploy/scenarios/wavefront-custom-private-registry.yaml) * [Using an existing Collector ConfigMap](deploy/scenarios/wavefront-collector-existing-configmap.yaml) * [With plugin configuration in a secret](deploy/scenarios/wavefront-collector-with-plugin-secret.yaml) * [Filtering metrics upon collection](deploy/scenarios/wavefront-collector-filtering.yaml) @@ -152,6 +153,16 @@ We have templates for common scenarios. See the comments in each file for usage You can see all configuration options in the [wavefront-full-config.yaml](deploy/scenarios/wavefront-full-config.yaml). +# Creating Alerts + +We have alerts on common Kubernetes issues. For details on creating alerts, see [alerts.md](docs/alerts/alerts.md). + +### Pod Failure + +| Alert name | Description | +|---|---| +| [Pods Stuck in Pending](docs/alerts/templates/pods-stuck-in-pending.json.tmpl) | Workload has pods stuck in pending. | + ## Bring Your Own Logs Shipper The operator deploys a data export component (wavefront-proxy) which can receive log data and forward it to the Operations for Applications service. 
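For the new "Using a custom private registry" scenario linked in the README above, the prerequisite is a registry credential secret in the namespace used for the installation, which the new `imagePullSecret` field then references. A minimal sketch of creating that secret; the secret name, registry address, and credentials below are placeholders, not values from this repo (`registry-secret` simply mirrors the `imagePullSecret: registry-secret` example in the full-config scenario later in this patch):

```bash
# Sketch only: create the registry credential secret that the Wavefront custom
# resource's imagePullSecret field will reference. All values are placeholders.
kubectl create secret docker-registry registry-secret \
  --namespace observability-system \
  --docker-server=YOUR_IMAGE_REGISTRY \
  --docker-username=YOUR_REGISTRY_USERNAME \
  --docker-password=YOUR_REGISTRY_PASSWORD
```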
diff --git a/collector/release/NEXT_RELEASE_VERSION b/collector/release/NEXT_RELEASE_VERSION index 398935591..3500250a4 100644 --- a/collector/release/NEXT_RELEASE_VERSION +++ b/collector/release/NEXT_RELEASE_VERSION @@ -1 +1 @@ -1.20.0 +1.21.0 diff --git a/collector/release/VERSION b/collector/release/VERSION index 815d5ca06..398935591 100644 --- a/collector/release/VERSION +++ b/collector/release/VERSION @@ -1 +1 @@ -1.19.0 +1.20.0 diff --git a/deploy/crd/wavefront.com_wavefronts.yaml b/deploy/crd/wavefront.com_wavefronts.yaml index 210314204..a8fdcd6b9 100644 --- a/deploy/crd/wavefront.com_wavefronts.yaml +++ b/deploy/crd/wavefront.com_wavefronts.yaml @@ -215,7 +215,7 @@ spec: enable: default: true description: Enable is whether to include kubernetes.controlplane.* - metrics and whether to include kubernetes_control_plane_source + metrics type: boolean required: - enable @@ -590,6 +590,35 @@ spec: type: object type: object type: object + imagePullSecret: + description: ImagePullSecret is the name of the secret to authenticate + with a private custom registry. + maxLength: 253 + pattern: ^[a-z0-9]([a-z0-9\.\-]*[a-z0-9])?$ + type: string + experimental: + description: Experimental features + properties: + autoInstrumentation: + properties: + deployKey: + type: string + enable: + type: boolean + type: object + kubernetesEvents: + properties: + enable: + default: false + description: Enable is whether to enable events. Defaults + to false. + type: boolean + externalEndpointURL: + type: string + required: + - externalEndpointURL + type: object + type: object wavefrontTokenSecret: default: wavefront-secret description: WavefrontTokenSecret is the name of the secret that contains diff --git a/deploy/scenarios/wavefront-collector-existing-configmap.yaml b/deploy/scenarios/wavefront-collector-existing-configmap.yaml index e4a2184e3..531718002 100644 --- a/deploy/scenarios/wavefront-collector-existing-configmap.yaml +++ b/deploy/scenarios/wavefront-collector-existing-configmap.yaml @@ -79,9 +79,46 @@ data: kubernetes_state_source: prefix: kubernetes. - kubernetes_control_plane_source: - collection: - interval: 120s + prometheus_sources: + - url: 'https://kubernetes.default.svc:443/metrics' + name: 'etcd-workqueue' + httpConfig: + bearer_token_file: '/var/run/secrets/kubernetes.io/serviceaccount/token' + tls_config: + ca_file: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + insecure_skip_verify: true + prefix: 'kubernetes.controlplane.' + convertHistograms: true + filters: + metricAllowList: + - 'kubernetes.controlplane.etcd.request.duration.seconds' + - 'kubernetes.controlplane.etcd.db.total.size.in.bytes.gauge' + - 'kubernetes.controlplane.workqueue.adds.total.counter' + - 'kubernetes.controlplane.workqueue.queue.duration.seconds' + + - url: 'https://kubernetes.default.svc:443/metrics' + name: 'apiserver' + httpConfig: + bearer_token_file: '/var/run/secrets/kubernetes.io/serviceaccount/token' + tls_config: + ca_file: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + insecure_skip_verify: true + prefix: 'kubernetes.controlplane.' 
+ convertHistograms: true + filters: + metricAllowList: + - 'kubernetes.controlplane.apiserver.request.duration.seconds' + - 'kubernetes.controlplane.apiserver.request.total.counter' + - 'kubernetes.controlplane.apiserver.storage.objects.gauge' + metricTagAllowList: + resource: + - 'customresourcedefinitions' + - 'namespaces' + - 'lease' + - 'nodes' + - 'pods' + - 'tokenreviews' + - 'subjectaccessreviews' discovery: enable_runtime_plugins: true diff --git a/deploy/scenarios/wavefront-custom-private-registry.yaml b/deploy/scenarios/wavefront-custom-private-registry.yaml new file mode 100644 index 000000000..67d6f86fa --- /dev/null +++ b/deploy/scenarios/wavefront-custom-private-registry.yaml @@ -0,0 +1,19 @@ +# Need to change YOUR_CLUSTER_NAME, YOUR_WAVEFRONT_URL and YOUR_IMAGE_REGISTRY_SECRET accordingly +apiVersion: wavefront.com/v1alpha1 +kind: Wavefront +metadata: + name: wavefront + namespace: observability-system +spec: + clusterName: YOUR_CLUSTER_NAME + wavefrontUrl: YOUR_WAVEFRONT_URL + # First follow steps for setting imagePullSecret for the operator at below link + # https://github.com/wavefrontHQ/observability-for-kubernetes/blob/main/docs/operator/custom-configuration.md. + # Set the image registry secret below for the operator components to authenticate with a private custom registry. + imagePullSecret: YOUR_IMAGE_REGISTRY_SECRET + dataCollection: + metrics: + enable: true + dataExport: + wavefrontProxy: + enable: true \ No newline at end of file diff --git a/deploy/scenarios/wavefront-full-config.yaml b/deploy/scenarios/wavefront-full-config.yaml index ead1b66d9..779fc9516 100644 --- a/deploy/scenarios/wavefront-full-config.yaml +++ b/deploy/scenarios/wavefront-full-config.yaml @@ -9,6 +9,7 @@ spec: clusterName: YOUR_CLUSTER_NAME wavefrontUrl: YOUR_WAVEFRONT_URL wavefrontTokenSecret: wavefront-secret + imagePullSecret: registry-secret dataCollection: # These are top level tolerations to be applied to all data collection (metrics and logging) DaemonSet pods. These # are meant to add custom tolerations to DaemonSet pods inorder to enable metrics and log collection from tainted @@ -207,9 +208,46 @@ data: kubernetes_state_source: prefix: kubernetes. - kubernetes_control_plane_source: - collection: - interval: 120s + prometheus_sources: + - url: 'https://kubernetes.default.svc:443/metrics' + name: 'etcd-workqueue' + httpConfig: + bearer_token_file: '/var/run/secrets/kubernetes.io/serviceaccount/token' + tls_config: + ca_file: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + insecure_skip_verify: true + prefix: 'kubernetes.controlplane.' + convertHistograms: true + filters: + metricAllowList: + - 'kubernetes.controlplane.etcd.request.duration.seconds' + - 'kubernetes.controlplane.etcd.db.total.size.in.bytes.gauge' + - 'kubernetes.controlplane.workqueue.adds.total.counter' + - 'kubernetes.controlplane.workqueue.queue.duration.seconds' + + - url: 'https://kubernetes.default.svc:443/metrics' + name: 'apiserver' + httpConfig: + bearer_token_file: '/var/run/secrets/kubernetes.io/serviceaccount/token' + tls_config: + ca_file: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' + insecure_skip_verify: true + prefix: 'kubernetes.controlplane.' 
+ convertHistograms: true + filters: + metricAllowList: + - 'kubernetes.controlplane.apiserver.request.duration.seconds' + - 'kubernetes.controlplane.apiserver.request.total.counter' + - 'kubernetes.controlplane.apiserver.storage.objects.gauge' + metricTagAllowList: + resource: + - 'customresourcedefinitions' + - 'namespaces' + - 'lease' + - 'nodes' + - 'pods' + - 'tokenreviews' + - 'subjectaccessreviews' discovery: enable_runtime_plugins: true diff --git a/deploy/wavefront-operator.yaml b/deploy/wavefront-operator.yaml index 3219de5f1..f0ca2a40c 100644 --- a/deploy/wavefront-operator.yaml +++ b/deploy/wavefront-operator.yaml @@ -222,7 +222,7 @@ spec: enable: default: true description: Enable is whether to include kubernetes.controlplane.* - metrics and whether to include kubernetes_control_plane_source + metrics type: boolean required: - enable @@ -597,6 +597,35 @@ spec: type: object type: object type: object + experimental: + description: Experimental features + properties: + autoInstrumentation: + properties: + deployKey: + type: string + enable: + type: boolean + type: object + kubernetesEvents: + properties: + enable: + default: false + description: Enable is whether to enable events. Defaults + to false. + type: boolean + externalEndpointURL: + type: string + required: + - externalEndpointURL + type: object + type: object + imagePullSecret: + description: ImagePullSecret is the name of the secret to authenticate + with a private custom registry. + maxLength: 253 + pattern: ^[a-z0-9]([a-z0-9\.\-]*[a-z0-9])?$ + type: string wavefrontTokenSecret: default: wavefront-secret description: WavefrontTokenSecret is the name of the secret that contains @@ -668,6 +697,16 @@ rules: - get - patch - update +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create + - delete + - get + - patch + - update - apiGroups: - "" resources: @@ -681,8 +720,12 @@ rules: resources: - secrets verbs: + - create + - delete - get - list + - patch + - update - watch - apiGroups: - "" @@ -692,8 +735,10 @@ rules: - create - delete - get + - list - patch - update + - watch - apiGroups: - "" resources: @@ -726,6 +771,39 @@ rules: - patch - update - watch +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - create + - delete + - get + - patch + - update +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - get + - list + - patch + - update + - watch +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - wavefront.com resources: @@ -843,6 +921,8 @@ rules: - pods - services - replicationcontrollers + - persistentvolumeclaims + - persistentvolumes verbs: - get - list @@ -999,7 +1079,7 @@ subjects: --- apiVersion: v1 data: - collector: 1.19.0 + collector: 1.20.0 logging: 2.1.2 proxy: 12.4.1 kind: ConfigMap @@ -1071,7 +1151,7 @@ spec: configMapKeyRef: key: logging name: wavefront-component-versions - image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.7.0 + image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.8.0 imagePullPolicy: Always livenessProbe: httpGet: diff --git a/docs/alerts/alerts.md b/docs/alerts/alerts.md new file mode 100644 index 000000000..e5758f75b --- /dev/null +++ b/docs/alerts/alerts.md @@ -0,0 +1,58 @@ +# Alerts +This page contains the steps to create an alert template. + +We have alert templates on common Kubernetes issues. 
+
+* [Detect pods stuck in pending](templates/pods-stuck-in-pending.json.tmpl)
+
+## Flags
+
+```
+Usage of ./create-alert.sh:
+  -t    (Required) Wavefront API token
+  -c    (Required) Wavefront instance name
+  -f    (Required) path to alert file template
+  -n    (Required) kubernetes cluster name
+  -h    print usage info and exit
+```
+
+## Create an alert
+
+### Step 1: Download the alert template file.
+
+1. Replace `YOUR_ALERT_FILE_OUTPUT_PATH` with the local path to save the downloaded alert file to (ex: `/tmp/pods-stuck-in-pending.json`).
+2. Replace `YOUR_ALERT_TEMPLATE_FILE` with the name of the alert template file (ex: `pods-stuck-in-pending.json.tmpl`).
+
+```bash
+export ALERT_FILE_OUTPUT_PATH=YOUR_ALERT_FILE_OUTPUT_PATH
+export ALERT_TEMPLATE_FILE=YOUR_ALERT_TEMPLATE_FILE
+curl -sSL -o "$ALERT_FILE_OUTPUT_PATH" "https://raw.githubusercontent.com/wavefrontHQ/observability-for-kubernetes/main/docs/alerts/templates/$ALERT_TEMPLATE_FILE"
+```
+
+### Step 2: Create the alert.
+
+1. Ensure that you have the information for the required fields:
+   - **Wavefront API token**. See the [Managing API Tokens](https://docs.wavefront.com/wavefront_api.html#managing-api-tokens) page.
+   - **Wavefront instance**. For example, the value of `YOUR_WAVEFRONT_INSTANCE` from your Wavefront URL (`https://YOUR_WAVEFRONT_INSTANCE.wavefront.com`).
+   - **Cluster name**. For example, a partial regex value (ex: `"prod*"`), or the value of `clusterName` from your Wavefront Custom Resource configuration (ex: [wavefront.yaml](../../deploy/scenarios/wavefront-getting-started.yaml)).
+   - **Alert template file**. For example, the download output path of the alert template file from **Step 1**.
+
+```bash
+curl -sSL https://raw.githubusercontent.com/wavefrontHQ/observability-for-kubernetes/main/docs/alerts/create-alert.sh | bash -s -- \
+  -t YOUR_API_TOKEN \
+  -c YOUR_WAVEFRONT_INSTANCE \
+  -n YOUR_CLUSTER_NAME \
+  -f PATH_TO_ALERT_FILE
+```
+
+**Note:** You will need to change YOUR_API_TOKEN, YOUR_WAVEFRONT_INSTANCE, YOUR_CLUSTER_NAME, and PATH_TO_ALERT_FILE in the above example.
+
+### Step 3: Customize the alert.
+
+1. Log in to your service instance (`https://YOUR_WAVEFRONT_INSTANCE.wavefront.com`) as a user with the Alerts permission. Click **Alerting** > **All Alerts** from the toolbar to display the Alerts Browser.
+2. Click the alert name, or click the ellipsis icon next to the alert and select **Edit**. You can search for the alert by typing the alert name in the search field.
+3. Change the alert properties when you edit the alert.
+4. Specify alert recipients to receive notifications when the alert changes state.
+5. Click **Save** in the top right to save your changes.
+
+See [Create and Manage Alerts](https://docs.wavefront.com/alerts_manage.html) for an overview on how to create and manage alerts.
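As a quick check after Step 2, the script prints a link of the form `https://YOUR_WAVEFRONT_INSTANCE.wavefront.com/alerts/ALERT_ID`; the same alert can be fetched back through the REST API. A minimal sketch, assuming the `jq` CLI is installed and reusing the placeholder names from the note above:

```bash
# Sketch: fetch the alert created by create-alert.sh and print its name and
# warn condition. ALERT_ID is the numeric id from the link the script prints;
# YOUR_WAVEFRONT_INSTANCE and YOUR_API_TOKEN are placeholders.
curl -sS "https://YOUR_WAVEFRONT_INSTANCE.wavefront.com/api/v2/alert/ALERT_ID" \
  -H "Accept: application/json" \
  -H "Authorization: Bearer YOUR_API_TOKEN" |
  jq '{name: .response.name, warn: .response.conditions.warn}'
```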
diff --git a/docs/alerts/create-alert.sh b/docs/alerts/create-alert.sh new file mode 100755 index 000000000..c0ccb5699 --- /dev/null +++ b/docs/alerts/create-alert.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +set -eo pipefail + +function post_alert_to_wavefront() { + local wavefront_token=$1 + local wavefront_cluster=$2 + local alert_file=$3 + local k8s_cluster_name=$4 + + response=$(mktemp) + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + -X POST "https://${wavefront_cluster}.wavefront.com/api/v2/alert?useMultiQuery=true" \ + -H "Accept: application/json" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${wavefront_token}" \ + -d @<(sed "s/K8S_CLUSTER_NAME/${k8s_cluster_name}/g" "${alert_file}")) + + if [[ ${res_code} -ne 200 ]]; then + echo "Unable to create alert: " + cat "${response}" + exit 1 + fi + + if [ -x "$(command -v jq)" ]; then + alert_name=$(jq -r '.name' "${alert_file}") + echo "Alert name: ${alert_name}" + fi + + alert_id=$(sed -n 's/.*id":"\([0-9]*\).*/\1/p' "${response}") + + echo "Alert has been created at: https://${wavefront_cluster}.wavefront.com/alerts/${alert_id}" +} + +function check_required_argument() { + local required_arg=$1 + local failure_msg=$2 + if [[ -z ${required_arg} ]]; then + print_usage_and_exit "$failure_msg" + fi +} + +function print_usage_and_exit() { + echo "Failure: $1" + print_usage + exit 1 +} + +function print_usage() { + echo "Usage: create-alert.sh -t -c -f -n -h" + echo -e "\t-t wavefront token (required)" + echo -e "\t-c wavefront instance name (required)" + echo -e "\t-f path to alert file (required)" + echo -e "\t-n kubernetes cluster name (required)" + echo -e "\t-h print usage" +} + +function main() { + # Required arguments + local WF_CLUSTER= + local ALERT_FILE= + local K8S_CLUSTER_NAME= + local WF_CLUSTER= + + while getopts 'c:t:f:n:h' opt; do + case "${opt}" in + t) WAVEFRONT_TOKEN="${OPTARG}" ;; + c) WF_CLUSTER="${OPTARG}" ;; + f) ALERT_FILE="${OPTARG}" ;; + n) K8S_CLUSTER_NAME="${OPTARG}" ;; + h) print_usage; exit 0 ;; + \?) 
print_usage_and_exit "Invalid option" ;; + esac + done + + # Checking for required arguments + check_required_argument "${WAVEFRONT_TOKEN}" "-t is required" + check_required_argument "${WF_CLUSTER}" "-c is required" + check_required_argument "${ALERT_FILE}" "-f is required" + check_required_argument "${K8S_CLUSTER_NAME}" "-n is required" + + post_alert_to_wavefront "${WAVEFRONT_TOKEN}" "${WF_CLUSTER}" "${ALERT_FILE}" "${K8S_CLUSTER_NAME}" +} + +main "$@" diff --git a/docs/alerts/templates/pods-stuck-in-pending.json.tmpl b/docs/alerts/templates/pods-stuck-in-pending.json.tmpl new file mode 100644 index 000000000..6144f7883 --- /dev/null +++ b/docs/alerts/templates/pods-stuck-in-pending.json.tmpl @@ -0,0 +1,26 @@ +{ + "name": "Pods Stuck in Pending", + "alertType": "THRESHOLD", + "alertSources": [ + { + "name": "Alert Condition", + "query": "(mcount(10m, count(ts(\"kubernetes.pod.status.phase\", phase=\"Pending\" AND cluster=\"K8S_CLUSTER_NAME\"), pod_name, namespace_name, cluster, nodename, message)) >= 10)", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "if(${Alert Condition}, ${Alert Condition})", + "queryType": "WQL", + "alertSourceType": ["VARIABLE","AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "(mcount(10m, count(ts(\"kubernetes.pod.status.phase\", phase=\"Pending\" AND cluster=\"K8S_CLUSTER_NAME\"), pod_name, namespace_name, cluster, nodename, message)) >= 10) >= 1" + }, + "displayExpression": "${Display Condition}", + "minutes": 5, + "resolveAfterMinutes": 2 +} \ No newline at end of file diff --git a/docs/collector/configuration.md b/docs/collector/configuration.md index 15fcc0386..272d6cffb 100644 --- a/docs/collector/configuration.md +++ b/docs/collector/configuration.md @@ -66,10 +66,6 @@ sources: kubernetes_cadvisor_source: # see kubernetes_cadvisor_source for details - # Optional source for collecting control plane metrics - kubernetes_control_plane_source: - # see kubernetes_control_plane_source for details - # Optional source for emitting internal collector stats. internal_stats_source: # see internal_stats_source for details @@ -168,15 +164,6 @@ prefix: prefix: ``` -### kubernetes_control_plane_source -For more information on control plane metrics, see [reference](metrics.md#control-plane-metrics). - -```yaml -# We recommend using `120s` -collection: - interval: "120s" -``` - ### prometheus_source ```yaml diff --git a/docs/collector/metrics.md b/docs/collector/metrics.md index 3eebfb537..d8b315eba 100644 --- a/docs/collector/metrics.md +++ b/docs/collector/metrics.md @@ -11,6 +11,7 @@ * [cAdvisor Metrics](#cadvisor-metrics) * [Control Plane Metrics](#control-plane-metrics) + ## Kubernetes Source These metrics are collected from the `/stats/summary` endpoint on each [kubelet](https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/) running on a node. @@ -86,39 +87,47 @@ Metrics collected per type: These are cluster level metrics about the state of Kubernetes objects collected by the Collector leader instance. -| Resource | Metric Name | Description | -|----------|---------|-------------| -| Deployment | deployment.desired_replicas | Number of desired pods. | -| Deployment | deployment.available_replicas | Total number of available pods (ready for at least minReadySeconds). | -| Deployment | deployment.ready_replicas | Total number of ready pods. | -| Replicaset | replicaset.desired_replicas | Number of desired replicas. 
| -| Replicaset | replicaset.available_replicas | Number of available replicas (ready for at least minReadySeconds). | -| Replicaset | replicaset.ready_replicas | Number of ready replicas. | -| ReplicationController | replicationcontroller.desired_replicas | Number of desired replicas. | -| ReplicationController | replicationcontroller.available_replicas | Number of available replicas (ready for at least minReadySeconds). | -| ReplicationController | replicationcontroller.ready_replicas | Number of ready replicas. | -| Daemonset | daemonset.desired_scheduled | Total number of nodes that should be running the daemon pod. | -| Daemonset | daemonset.current_scheduled | Number of nodes that are running at least 1 daemon pod and are supposed to run the daemon pod. | -| Daemonset | daemonset.misscheduled | Number of nodes that are running the daemon pod, but are not supposed to run the daemon pod. | -| Daemonset | daemonset.ready | Number of nodes that should be running the daemon pod and have one or more of the daemon pod running and ready. | -| Statefulset | statefulset.desired_replicas | Number of desired replicas. | -| Statefulset | statefulset.current_replicas | Number of Pods created by the StatefulSet controller from the StatefulSet version indicated by currentRevision. -| Statefulset | statefulset.ready_replicas | Number of Pods created by the StatefulSet controller that have a Ready Condition. | -| Statefulset | statefulset.updated_replicas | Number of Pods created by the StatefulSet controller from the StatefulSet version indicated by updateRevision. | -| Job | job.active | Number of actively running pods. | -| Job | job.failed | Number of pods which reached phase Failed. | -| Job | job.succeeded | Number of pods which reached phase Succeeded. | -| Job | job.completions | Desired number of successfully finished pods the job should be run with. -1.0 indicates the value was not set. | -| Job | job.parallelism | Maximum desired number of pods the job should run at any given time. -1.0 indicates the value was not set. | -| CronJob | cronjob.active | Number of currently running jobs. | -| HorizontalPodAutoscaler | hpa.desired_replicas | Desired number of replicas of pods managed by this autoscaler as last calculated by the autoscaler. | -| HorizontalPodAutoscaler | hpa.min_replicas | Lower limit for the number of replicas to which the autoscaler can scale down. | -| HorizontalPodAutoscaler | hpa.max_replicas | Upper limit for the number of replicas to which the autoscaler can scale up. | -| HorizontalPodAutoscaler | hpa.current_replicas | Current number of replicas of pods managed by this autoscaler, as last seen by the autoscaler. | -| Node | node.status.condition | Status of all running nodes. | -| Node | node.spec.taint | Node taints (one metric per node taint). | -| Node | node.info | Detailed node information (kernel version, kubelet version etc). | - +| Resource | Metric Name | Description | +|-------------------------|------------------------------------------|-----------------------------------------------------------------------------------------------------------------| +| Deployment | deployment.desired_replicas | Number of desired pods. | +| Deployment | deployment.available_replicas | Total number of available pods (ready for at least minReadySeconds). | +| Deployment | deployment.ready_replicas | Total number of ready pods. | +| Replicaset | replicaset.desired_replicas | Number of desired replicas. 
| +| Replicaset | replicaset.available_replicas | Number of available replicas (ready for at least minReadySeconds). | +| Replicaset | replicaset.ready_replicas | Number of ready replicas. | +| ReplicationController | replicationcontroller.desired_replicas | Number of desired replicas. | +| ReplicationController | replicationcontroller.available_replicas | Number of available replicas (ready for at least minReadySeconds). | +| ReplicationController | replicationcontroller.ready_replicas | Number of ready replicas. | +| Daemonset | daemonset.desired_scheduled | Total number of nodes that should be running the daemon pod. | +| Daemonset | daemonset.current_scheduled | Number of nodes that are running at least 1 daemon pod and are supposed to run the daemon pod. | +| Daemonset | daemonset.misscheduled | Number of nodes that are running the daemon pod, but are not supposed to run the daemon pod. | +| Daemonset | daemonset.ready | Number of nodes that should be running the daemon pod and have one or more of the daemon pod running and ready. | +| Statefulset | statefulset.desired_replicas | Number of desired replicas. | +| Statefulset | statefulset.current_replicas | Number of Pods created by the StatefulSet controller from the StatefulSet version indicated by currentRevision. | +| Statefulset | statefulset.ready_replicas | Number of Pods created by the StatefulSet controller that have a Ready Condition. | +| Statefulset | statefulset.updated_replicas | Number of Pods created by the StatefulSet controller from the StatefulSet version indicated by updateRevision. | +| Job | job.active | Number of actively running pods. | +| Job | job.failed | Number of pods which reached phase Failed. | +| Job | job.succeeded | Number of pods which reached phase Succeeded. | +| Job | job.completions | Desired number of successfully finished pods the job should be run with. -1.0 indicates the value was not set. | +| Job | job.parallelism | Maximum desired number of pods the job should run at any given time. -1.0 indicates the value was not set. | +| CronJob | cronjob.active | Number of currently running jobs. | +| HorizontalPodAutoscaler | hpa.desired_replicas | Desired number of replicas of pods managed by this autoscaler as last calculated by the autoscaler. | +| HorizontalPodAutoscaler | hpa.min_replicas | Lower limit for the number of replicas to which the autoscaler can scale down. | +| HorizontalPodAutoscaler | hpa.max_replicas | Upper limit for the number of replicas to which the autoscaler can scale up. | +| HorizontalPodAutoscaler | hpa.current_replicas | Current number of replicas of pods managed by this autoscaler, as last seen by the autoscaler. | +| Node | node.status.condition | Status of all running nodes. | +| Node | node.spec.taint | Node taints (one metric per node taint). | +| Node | node.info | Detailed node information (kernel version, kubelet version etc). | +| PersistentVolumeClaim | pvc.access_mode | The access mode(s) specified by the PersistentVolumeClaim. | +| PersistentVolumeClaim | pvc.info | Information about PersistentVolumeClaim. No storage_class_name tag implies pvc uses default storage class. | +| PersistentVolumeClaim | pvc.request.storage_bytes | The storage requested by the PersistentVolumeClaim in bytes. | +| PersistentVolumeClaim | pvc.status.phase | The phase of the PersistentVolumeClaim. | +| PersistentVolumeClaim | pvc.status.condition | Information about status of different conditions of PersistentVolumeClaim. 
| +| PersistentVolume | pv.capacity_bytes | PersistentVolume capacity in bytes. | +| PersistentVolume | pv.status.phase | The phase of the PersistentVolume. | +| PersistentVolume | pv.info | Information about PersistentVolume. | +| PersistentVolume | pv.access_mode | The access mode(s) specified by the PersistentVolume. | ## Prometheus Source Varies by scrape target. @@ -221,14 +230,11 @@ Metrics collected per type: | kubernetes.node.cpu.node_utilization (node_role="control-plane") | CPU utilization as a share of the contol-plane node allocatable in millicores. | Not available in AKS, EKS, GKE | | kubernetes.node.memory.working_set (node_role="control-plane") | Total working set usage of the control-plane node. Working set is the memory being used and not easily dropped by the kernel. | Not available in AKS, EKS, GKE | | kubernetes.node.filesystem.usage (node_role="control-plane") | Total number of bytes consumed on a filesyste of the control-plane node | Not available in AKS, EKS, GKE | -| kubernetes.controlplane.apiserver.request.duration.seconds.bucket | Histogram buckets for API server request latency | - | -| kubernetes.controlplane.apiserver.request.duration.seconds | API server request latency as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | - | +| kubernetes.controlplane.apiserver.request.duration.seconds.m | API server request latency as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | - | | kubernetes.controlplane.apiserver.request.total.counter | API server total request count | - | | kubernetes.controlplane.workqueue.adds.total.counter | Current depth of API server workqueue | - | -| kubernetes.controlplane.workqueue.queue.duration.seconds.bucket | Histogram buckets for workqueue latency | - | -| kubernetes.controlplane.workqueue.queue.duration.seconds | workqueue latency as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | - | -| kubernetes.controlplane.coredns.dns.request.duration.seconds.bucket | Histogram buckets for CoreDNS request latency | Not available in GKE, OpenShift | -| kubernetes.controlplane.coredns.dns.request.duration.seconds | CoreDNS request latency as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | Not available in GKE, OpenShift | +| kubernetes.controlplane.workqueue.queue.duration.seconds.m | workqueue latency as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | - | +| kubernetes.controlplane.coredns.dns.request.duration.seconds.m | CoreDNS request latency as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | Not available in GKE, OpenShift | | kubernetes.controlplane.coredns.dns.responses.total.counter | CoreDNS total response count | Not available in GKE, OpenShift | ### etcd Metrics @@ -245,8 +251,8 @@ Metrics collected for etcd: | kubernetes.controlplane.etcd.server.proposals.applied.total.gauge | The total number of concensus proposals applied. | Not available in AKS, EKS, GKE | | kubernetes.controlplane.etcd.server.proposals.committed.total.gauge | The total number of consensus proposals committed. | Not available in AKS, EKS, GKE | | kubernetes.controlplane.etcd.server.proposals.pending.gauge | The current number of pending proposals to commit. 
| Not available in AKS, EKS, GKE | -| kubernetes.controlplane.etcd.disk.wal.fsync.duration.seconds.bucket | The latency distributions of fsync called by wal. | Not available in AKS, EKS, GKE | -| kubernetes.controlplane.etcd.disk.backend.commit.duration.seconds.bucket | The latency distributions of commit called by backend. | Not available in AKS, EKS, GKE | -| kubernetes.controlplane.etcd.network.peer.round.trip.time.seconds.bucket | Round-Trip-Time between peers. | Not available in AKS, EKS, GKE | +| kubernetes.controlplane.etcd.disk.wal.fsync.duration.seconds.m | The latency distributions of fsync called by wal as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | Not available in AKS, EKS, GKE | +| kubernetes.controlplane.etcd.disk.backend.commit.duration.seconds.m | The latency distributions of commit called by backend as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | Not available in AKS, EKS, GKE | +| kubernetes.controlplane.etcd.network.peer.round.trip.time.seconds.m | Round-Trip-Time between peers as an [Operations for Applications Histogram](https://docs.wavefront.com/proxies_histograms.html) | Not available in AKS, EKS, GKE | | kubernetes.controlplane.etcd.network.peer.sent.failures.total.counter | The total number of failures sent by peers. | Not available in AKS, EKS, GKE | -| kubernetes.controlplane.etcd.network.peer.received.failures.total.counter | The total number of failures received by peers. | Not available in AKS, EKS, GKE | +| kubernetes.controlplane.etcd.network.peer.received.failures.total.counter | The total number of failures received by peers. | Not available in AKS, EKS, GKE | \ No newline at end of file diff --git a/docs/operator/custom-configuration.md b/docs/operator/custom-configuration.md index e62a2659b..499f3d733 100644 --- a/docs/operator/custom-configuration.md +++ b/docs/operator/custom-configuration.md @@ -9,8 +9,8 @@ Install the Observability for Kubernetes Operator into `observability-system` na | Component | From | To | |---|---|---| -| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.7.0` | `YOUR_IMAGE_REGISTRY/kubernetes-operator:2.7.0` | -| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.19.0` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.19.0` | +| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.8.0` | `YOUR_IMAGE_REGISTRY/kubernetes-operator:2.8.0` | +| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.20.0` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.20.0` | | Wavefront Proxy | `projects.registry.vmware.com/tanzu_observability/proxy:12.4.1` | `YOUR_IMAGE_REGISTRY/proxy:12.4.1` | | Operations for Applications logging | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator-fluentbit:2.1.2` | `YOUR_IMAGE_REGISTRY/kubernetes-operator-fluentbit:2.1.2` | @@ -29,11 +29,34 @@ Install the Observability for Kubernetes Operator into `observability-system` na - name: projects.registry.vmware.com/tanzu_observability/kubernetes-operator newName: YOUR_IMAGE_REGISTRY/kubernetes-operator ``` -5. Deploy the Observability for Kubernetes Operator +5. 
If your image registry needs authentication, create an image registry secret in the same namespace as the operator (The default namespace is `observability-system`) by following steps [here](https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/), then modify the `kustomization.yaml` to include your image registry secret. + ```yaml + # Need to change YOUR_IMAGE_REGISTRY and YOUR_IMAGE_REGISTRY_SECRET + apiVersion: kustomize.config.k8s.io/v1beta1 + kind: Kustomization + + resources: + - wavefront-operator.yaml + + images: + - name: projects.registry.vmware.com/tanzu_observability/kubernetes-operator + newName: YOUR_IMAGE_REGISTRY/kubernetes-operator + + patches: + - target: + kind: Deployment + name: wavefront-controller-manager + patch: |- + - op: add + path: /spec/template/spec/imagePullSecrets + value: + - name: YOUR_IMAGE_REGISTRY_SECRET + ``` +6. Deploy the Observability for Kubernetes Operator ``` kubectl apply -k observability ``` -6. Now follow the steps starting from step 2 in [Deploy the Kubernetes Metrics Collector and Wavefront Proxy with the Operator](../../README.md#Deploy-the-Kubernetes-Metrics-Collector-and-Wavefront-Proxy-with-the-Observability-for-Kubernetes-Operator) +7. Now follow the steps starting from step 2 in [Deploy the Kubernetes Metrics Collector and Wavefront Proxy with the Operator](../../README.md#Deploy-the-Kubernetes-Metrics-Collector-and-Wavefront-Proxy-with-the-Observability-for-Kubernetes-Operator). Also, add your image registry secret to the Wavefront Custom Resource as shown in this [example](../../deploy/scenarios/wavefront-custom-private-registry.yaml). # Deploy the Observability for Kubernetes Operator into a Custom Namespace diff --git a/docs/operator/helm-feature-comparison.md b/docs/operator/helm-feature-comparison.md index 616fa986d..b0234d9b9 100644 --- a/docs/operator/helm-feature-comparison.md +++ b/docs/operator/helm-feature-comparison.md @@ -35,7 +35,7 @@ Observability for Kubernetes Operator feature comparison with [Helm install](htt | `collector.discovery. annotationExcludes` | `Can configure with custom collector config` | Exclude resources from annotation based auto-discovery. | | `collector.discovery.config` | `Can configure with custom collector config` | Exclude resources from annotation based auto-discovery. | | `collector.resources` | `dataCollection.metrics.nodeCollector.resources` `dataCollection.metrics.clusterCollector.resources` | Configuration for rules based auto-discovery. | -| `imagePullSecrets` | `Not currently supported` | Enable Wavefront proxy and Kubernetes Metrics Collector to pull from private image repositories. **Note:** Secret must exist in namespace that will be used for the installation. | +| `imagePullSecrets` | `imagePullSecret` | Enable Wavefront proxy and Kubernetes Metrics Collector to pull from private image repositories. **Note:** Secret must exist in namespace that will be used for the installation. Currently, the operator supports a single imagePullSecret.| | `proxy.enabled` | `dataExport.wavefrontProxy.enable` | Whether to enable the Wavefront proxy. Defaults to true. Disable to use `dataExport.externalWavefrontProxy.Url`. | | `proxy.image.repository` | `Not currently supported` | Kubernetes Metrics Collector image registry and name. | | `proxy.image.tag` | `Not currently supported` | Wavefront proxy image tag. 
| diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 60193946a..14d8c7320 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -8,8 +8,6 @@ Get help when you have problems with your Kubernetes setup. This page is divided For an in depth overview of the integration and how it is deployed, please refer to our [GitHub page](https://github.com/wavefrontHQ/observability-for-kubernetes). -**Note:** If you are currently using the Helm managed and installed version of the Wavefront proxy and Collector, please refer to our legacy troubleshooting guide for instructions on how to troubleshoot your integration. - ## No Data Flowing into Operations for Applications If you have identified that there is a problem with data flowing into Operations for Applications, please follow the steps below. diff --git a/operator/config/manager/component_versions.yaml b/operator/config/manager/component_versions.yaml index 2153af993..fcbf36228 100644 --- a/operator/config/manager/component_versions.yaml +++ b/operator/config/manager/component_versions.yaml @@ -7,6 +7,6 @@ metadata: name: component-versions namespace: system data: - collector: "1.19.0" + collector: "1.20.0" logging: "2.1.2" proxy: "12.4.1" diff --git a/operator/config/manager/kustomization.yaml b/operator/config/manager/kustomization.yaml index 9c32a499c..d92f7f3d0 100644 --- a/operator/config/manager/kustomization.yaml +++ b/operator/config/manager/kustomization.yaml @@ -15,7 +15,7 @@ kind: Kustomization images: - name: controller newName: projects.registry.vmware.com/tanzu_observability/kubernetes-operator - newTag: 2.7.0 + newTag: 2.8.0 patches: - path: patches.yaml diff --git a/operator/dev-internal/deploy/wavefront-operator.yaml b/operator/dev-internal/deploy/wavefront-operator.yaml index ddda62170..f0ca2a40c 100644 --- a/operator/dev-internal/deploy/wavefront-operator.yaml +++ b/operator/dev-internal/deploy/wavefront-operator.yaml @@ -222,7 +222,7 @@ spec: enable: default: true description: Enable is whether to include kubernetes.controlplane.* - metrics and whether to include kubernetes_control_plane_source + metrics type: boolean required: - enable @@ -597,6 +597,35 @@ spec: type: object type: object type: object + experimental: + description: Experimental features + properties: + autoInstrumentation: + properties: + deployKey: + type: string + enable: + type: boolean + type: object + kubernetesEvents: + properties: + enable: + default: false + description: Enable is whether to enable events. Defaults + to false. + type: boolean + externalEndpointURL: + type: string + required: + - externalEndpointURL + type: object + type: object + imagePullSecret: + description: ImagePullSecret is the name of the secret to authenticate + with a private custom registry. 
+ maxLength: 253 + pattern: ^[a-z0-9]([a-z0-9\.\-]*[a-z0-9])?$ + type: string wavefrontTokenSecret: default: wavefront-secret description: WavefrontTokenSecret is the name of the secret that contains @@ -668,6 +697,16 @@ rules: - get - patch - update +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create + - delete + - get + - patch + - update - apiGroups: - "" resources: @@ -681,8 +720,12 @@ rules: resources: - secrets verbs: + - create + - delete - get - list + - patch + - update - watch - apiGroups: - "" @@ -692,8 +735,10 @@ rules: - create - delete - get + - list - patch - update + - watch - apiGroups: - "" resources: @@ -726,6 +771,39 @@ rules: - patch - update - watch +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - create + - delete + - get + - patch + - update +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - get + - list + - patch + - update + - watch +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - wavefront.com resources: @@ -843,6 +921,8 @@ rules: - pods - services - replicationcontrollers + - persistentvolumeclaims + - persistentvolumes verbs: - get - list @@ -999,9 +1079,9 @@ subjects: --- apiVersion: v1 data: - collector: 1.19.0 + collector: 1.20.0 logging: 2.1.2 - proxy: "12.4" + proxy: 12.4.1 kind: ConfigMap metadata: labels: @@ -1071,7 +1151,7 @@ spec: configMapKeyRef: key: logging name: wavefront-component-versions - image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.7.0 + image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.8.0 imagePullPolicy: Always livenessProbe: httpGet: diff --git a/operator/dev-internal/docs/operator/custom-configuration.md b/operator/dev-internal/docs/operator/custom-configuration.md index 241fffd12..499f3d733 100644 --- a/operator/dev-internal/docs/operator/custom-configuration.md +++ b/operator/dev-internal/docs/operator/custom-configuration.md @@ -9,8 +9,8 @@ Install the Observability for Kubernetes Operator into `observability-system` na | Component | From | To | |---|---|---| -| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.7.0` | `YOUR_IMAGE_REGISTRY/kubernetes-operator:2.7.0` | -| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.19.0` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.19.0` | +| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.8.0` | `YOUR_IMAGE_REGISTRY/kubernetes-operator:2.8.0` | +| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.20.0` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.20.0` | | Wavefront Proxy | `projects.registry.vmware.com/tanzu_observability/proxy:12.4.1` | `YOUR_IMAGE_REGISTRY/proxy:12.4.1` | | Operations for Applications logging | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator-fluentbit:2.1.2` | `YOUR_IMAGE_REGISTRY/kubernetes-operator-fluentbit:2.1.2` | diff --git a/operator/release/NEXT_RELEASE_VERSION b/operator/release/NEXT_RELEASE_VERSION index 834f26295..c8e38b614 100644 --- a/operator/release/NEXT_RELEASE_VERSION +++ b/operator/release/NEXT_RELEASE_VERSION @@ -1 +1 @@ -2.8.0 +2.9.0 diff --git a/operator/release/OPERATOR_VERSION b/operator/release/OPERATOR_VERSION index 24ba9a38d..834f26295 100644 --- 
a/operator/release/OPERATOR_VERSION +++ b/operator/release/OPERATOR_VERSION @@ -1 +1 @@ -2.7.0 +2.8.0
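After this release is applied, one quick way to confirm the operator bump took effect is to read the image tag off the operator Deployment. A minimal sketch using the namespace and Deployment name from the manifests above, assuming the manager is the first container in the pod spec:

```bash
# Sketch: print the operator image currently deployed; expect the 2.8.0 tag
# from this release, i.e.
# projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.8.0
kubectl -n observability-system get deployment wavefront-controller-manager \
  -o jsonpath='{.spec.template.spec.containers[0].image}{"\n"}'
```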