Skip to content

Commit

Permalink
expose gcsfuse metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
songjiaxun committed Aug 13, 2024
1 parent 0dd6167 commit d4bae3a
Show file tree
Hide file tree
Showing 22 changed files with 904 additions and 30 deletions.
9 changes: 9 additions & 0 deletions cmd/csi_driver/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/storage"
driver "github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/csi_driver"
csimounter "github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/csi_mounter"
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/metrics"
"k8s.io/klog/v2"
"k8s.io/mount-utils"
)
Expand All @@ -44,6 +45,7 @@ var (
identityProvider = flag.String("identity-provider", "", "The Identity Provider to authenticate with GCS API.")
enableProfiling = flag.Bool("enable-profiling", false, "enable the golang pprof at port 6060")
informerResyncDurationSec = flag.Int("informer-resync-duration-sec", 1800, "informer resync duration in seconds")
metricsEndpoint = flag.String("metrics-endpoint", "", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled.")

// These are set at compile time.
version = "unknown"
Expand Down Expand Up @@ -91,6 +93,7 @@ func main() {
}

var mounter mount.Interface
var mm metrics.Manager
if *runNode {
if *nodeID == "" {
klog.Fatalf("NodeID cannot be empty for node service")
Expand All @@ -102,6 +105,11 @@ func main() {
if err != nil {
klog.Fatalf("Failed to prepare CSI mounter: %v", err)
}

if *metricsEndpoint != "" {
mm = metrics.NewMetricsManager(*metricsEndpoint)
mm.InitializeHTTPHandler()
}
}

config := &driver.GCSDriverConfig{
Expand All @@ -114,6 +122,7 @@ func main() {
TokenManager: tm,
Mounter: mounter,
K8sClients: clientset,
MetricsManager: mm,
}

gcfsDriver, err := driver.NewGCSDriver(config)
Expand Down
4 changes: 4 additions & 0 deletions deploy/base/node/node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ spec:
- --nodeid=$(KUBE_NODE_NAME)
- --node=true
- --identity-provider=$(IDENTITY_PROVIDER)
- --metrics-endpoint=:9920
ports:
- containerPort: 9920
name: metrics
resources:
limits:
cpu: 200m
Expand Down
2 changes: 2 additions & 0 deletions deploy/overlays/dev/node_pprof.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ spec:
- --endpoint=unix:/csi/csi.sock
- --nodeid=$(KUBE_NODE_NAME)
- --node=true
- --identity-provider=$(IDENTITY_PROVIDER)
- --metrics-endpoint=:9920
- --enable-profiling=true
ports:
- containerPort: 6060
130 changes: 130 additions & 0 deletions docs/metrics/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
<!--
Copyright 2018 The Kubernetes Authors.
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Metrics

Google Cloud Storage FUSE offers client-side Prometheus metrics for monitoring file system operations and performance. These metrics, gathered by the sidecar container, are forwarded to the CSI driver node servers. You can access these metrics via a Prometheus server. This guide explains how to enable client-side metrics, set up a Prometheus server, and query the metrics for insights.

## Enable metrics collection in your workload

You don't need to set anything to enable the metrics collection. The metrics collection is enabled by default.

To **disable** the metrics collection, set the volume attribute `disableMetrics: "true"`.

For in-line ephemeral volumes:

```yaml
...
spec:
volumes:
- name: gcs-fuse-csi-ephemeral
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: <bucket-name>
disableMetrics: "true"
```
For `PersistentVolume` volumes:

```yaml
apiVersion: v1
kind: PersistentVolume
metadata:
name: gcs-fuse-csi-pv
spec:
...
csi:
driver: gcsfuse.csi.storage.gke.io
volumeHandle: <bucket-name>
volumeAttributes:
disableMetrics: "true"
```

## Install Helm

The example uses Helm charts to manage the Prometheus server. Follow the [Helm documentation](https://helm.sh/docs/intro/install/#from-script) to install Helm.

## Install a Prometheus server to collect metrics

Add the Prometheus Helm repo and update it.

```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
```

Create a new Kubernetes namespace `prometheus`, and install a Prometheus server using Helm. Note that the following example only installs a Prometheus server without other auxiliary components.

```bash
kubectl create namespace prometheus
helm install prometheus prometheus-community/prometheus \
--namespace prometheus \
--values ./docs/metrics/prometheus-values.yaml
```

## Connect to the Prometheus server

Create a new terminal session and run the following command to forward traffic from your local machine to the Prometheus server.

```bash
export POD_NAME=$(kubectl get pods --namespace prometheus -l "app.kubernetes.io/name=prometheus,app.kubernetes.io/instance=prometheus" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace prometheus port-forward $POD_NAME 9090
```

## Open Prometheus UI to query metrics

Open a web browser and go to the following URL. The example shows a graph of the metric `fs_ops_count`.

> <http://localhost:9090/graph?g0.expr=fs_ops_count&g0.tab=0&g0.display_mode=lines&g0.show_exemplars=0&g0.range_input=10m>

Below is a list of supported Google Cloud Storage FUSE metrics:

- fs_ops_count
- fs_ops_error_count
- fs_ops_latency
- gcs_download_bytes_count
- gcs_read_count
- gcs_read_bytes_count
- gcs_reader_count
- gcs_request_count
- gcs_request_latencies
- file_cache_read_count
- file_cache_read_bytes_count
- file_cache_read_latencies

See the [Google Cloud Storage FUSE Metrics documentation](https://github.com/GoogleCloudPlatform/gcsfuse/blob/master/docs/metrics.md) for detailed explanations of each metric.

In the CSI driver, each metric record includes the following extra labels so that you can filter and aggregate metrics.

- pod_name
- namespace_name
- volume_name
- bucket_name

The Prometheus UI provides an easy interface to query and visualize metrics. See [Querying Prometheus documentation](https://prometheus.io/docs/prometheus/latest/querying/basics/) for details.

## Clean up Prometheus server

Run the following command to clean up the Prometheus server.

Warning: the following command will clean up the PV and PVC storing Prometheus data. If you need to retain the metrics data, in the step [Install a Prometheus server to collect metrics](#install-a-prometheus-server-to-collect-metrics), create a `StorageClass` with `reclaimPolicy: Retain`, and set the helm parameter `server.persistentVolume.storageClass` using the new `StorageClass` name.

```bash
helm uninstall prometheus --namespace prometheus
```
49 changes: 49 additions & 0 deletions docs/metrics/prometheus-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2018 The Kubernetes Authors.
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

server:
persistentVolume:
size: 20Gi
prometheus-pushgateway:
enabled: false
alertmanager:
enabled: false
kube-state-metrics:
enabled: false
prometheus-node-exporter:
enabled: false
serverFiles:
prometheus.yml:
scrape_configs:
- job_name: 'gcsfuse-csi-node-pods'

scrape_interval: 10s
scrape_timeout: 2s

kubernetes_sd_configs:
- role: pod

relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_k8s_app]
action: keep
regex: gcs-fuse-csi-driver
- source_labels: [__meta_kubernetes_pod_ip]
action: replace
regex: ((([0-9]+?)(\.|$)){4})
replacement: $1:9920
target_label: __address__
- source_labels: [__meta_kubernetes_pod_node_name]
action: replace
target_label: node
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ require (
github.com/kubernetes-csi/csi-test/v5 v5.2.0
github.com/onsi/ginkgo/v2 v2.19.1
github.com/onsi/gomega v1.34.1
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/client_model v0.6.0
github.com/prometheus/common v0.46.0
golang.org/x/net v0.27.0
golang.org/x/oauth2 v0.22.0
golang.org/x/time v0.6.0
Expand Down Expand Up @@ -93,9 +96,6 @@ require (
github.com/opencontainers/selinux v1.11.0 // indirect
github.com/pelletier/go-toml/v2 v2.2.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.18.0 // indirect
github.com/prometheus/client_model v0.6.0 // indirect
github.com/prometheus/common v0.46.0 // indirect
github.com/prometheus/procfs v0.13.0 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions pkg/csi_driver/gcs_fuse_driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/auth"
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/clientset"
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/storage"
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/metrics"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"k8s.io/klog/v2"
Expand All @@ -43,6 +44,7 @@ type GCSDriverConfig struct {
TokenManager auth.TokenManager
Mounter mount.Interface
K8sClients clientset.Interface
MetricsManager metrics.Manager
}

type GCSDriver struct {
Expand Down
2 changes: 2 additions & 0 deletions pkg/csi_driver/gcs_fuse_driver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/auth"
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/clientset"
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/storage"
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/metrics"
mount "k8s.io/mount-utils"
)

Expand All @@ -40,6 +41,7 @@ func initTestDriver(t *testing.T, fm *mount.FakeMounter) *GCSDriver {
TokenManager: auth.NewFakeTokenManager(),
Mounter: fm,
K8sClients: &clientset.FakeClientset{},
MetricsManager: &metrics.FakeMetricsManager{},
}
driver, err := NewGCSDriver(config)
if err != nil {
Expand Down
14 changes: 13 additions & 1 deletion pkg/csi_driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ func (s *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublish
}

// Validate arguments
targetPath, bucketName, fuseMountOptions, skipBucketAccessCheck, err := parseRequestArguments(req)
targetPath, bucketName, fuseMountOptions, skipBucketAccessCheck, disableMetricsCollection, err := parseRequestArguments(req)
if err != nil {
return nil, status.Error(codes.InvalidArgument, err.Error())
}
Expand Down Expand Up @@ -151,6 +151,13 @@ func (s *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublish
return nil, status.Error(codes.FailedPrecondition, "failed to find the sidecar container in Pod spec")
}

// Register metrics collector.
// It is idempotent to register the same collector in node republish calls.
if s.driver.config.MetricsManager != nil && !disableMetricsCollection {
klog.V(6).Infof("NodePublishVolume enabling metrics collector for target path %q", targetPath)
s.driver.config.MetricsManager.RegisterMetricsCollector(targetPath, pod.Namespace, pod.Name, bucketName)
}

// Check if the sidecar container is still required,
// if not, put an exit file to the emptyDir path to
// notify the sidecar container to exit.
Expand Down Expand Up @@ -220,6 +227,11 @@ func (s *nodeServer) NodeUnpublishVolume(_ context.Context, req *csi.NodeUnpubli
}
defer s.volumeLocks.Release(targetPath)

// Unregister metrics collector.
if s.driver.config.MetricsManager != nil {
s.driver.config.MetricsManager.UnregisterMetricsCollector(targetPath)
}

delete(s.volumeStateStore, targetPath)

// Check if the target path is already mounted
Expand Down
Loading

0 comments on commit d4bae3a

Please sign in to comment.