From c67b7327960253896fda275c7770a36dd2563051 Mon Sep 17 00:00:00 2001 From: Anton Bronnikov Date: Mon, 16 Sep 2024 08:41:56 +0300 Subject: [PATCH] feat: implement reth and op-node checks ..also: refactor a bit --- .github/workflows/release.yml | 15 +- .gitignore | 17 ++- .goreleaser.yml | 35 ++++- Dockerfile | 26 ++++ Dockerfile.goreleaser | 9 ++ LICENSE | 2 +- Makefile | 26 ++-- cmd/help.go | 18 +++ cmd/main.go | 107 ++++++------- cmd/serve.go | 219 +++++++++++++++++++-------- config/config.go | 15 ++ config/healthcheck.go | 7 + config/healthcheck_geth.go | 5 + config/healthcheck_lighthouse.go | 5 + config/healthcheck_op_node.go | 6 + config/healthcheck_reth.go | 5 + config/http_status.go | 7 + config/log.go | 6 + config/server.go | 5 + go.mod | 18 ++- go.sum | 53 +++++-- healthcheck/geth.go | 124 +++++++++++++++ healthcheck/healthcheck.go | 10 ++ healthcheck/lighthouse.go | 162 ++++++++++++++++++++ healthcheck/op_node.go | 175 +++++++++++++++++++++ healthcheck/reth.go | 116 ++++++++++++++ healthchecker/check_geth.go | 83 ---------- healthchecker/check_lighthouse.go | 121 --------------- healthchecker/healthchecker.go | 200 ------------------------ httplogger/middleware.go | 5 +- logutils/http_server_error_logger.go | 28 ++++ logutils/setup.go | 53 +++++++ metrics/metrics.go | 54 +++++++ readme.md | 5 +- server/healthcheck.go | 96 ++++++++++++ server/server.go | 146 ++++++++++++++++++ utils/nic.go | 51 +++++++ 37 files changed, 1460 insertions(+), 575 deletions(-) create mode 100644 Dockerfile create mode 100644 Dockerfile.goreleaser create mode 100644 cmd/help.go create mode 100644 config/config.go create mode 100644 config/healthcheck.go create mode 100644 config/healthcheck_geth.go create mode 100644 config/healthcheck_lighthouse.go create mode 100644 config/healthcheck_op_node.go create mode 100644 config/healthcheck_reth.go create mode 100644 config/http_status.go create mode 100644 config/log.go create mode 100644 config/server.go create mode 100644 healthcheck/geth.go create mode 100644 healthcheck/healthcheck.go create mode 100644 healthcheck/lighthouse.go create mode 100644 healthcheck/op_node.go create mode 100644 healthcheck/reth.go delete mode 100644 healthchecker/check_geth.go delete mode 100644 healthchecker/check_lighthouse.go delete mode 100644 healthchecker/healthchecker.go create mode 100644 logutils/http_server_error_logger.go create mode 100644 logutils/setup.go create mode 100644 metrics/metrics.go create mode 100644 server/healthcheck.go create mode 100644 server/server.go create mode 100644 utils/nic.go diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e77ffe2..bdfcf5b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,7 +16,20 @@ jobs: - name: setup go dependencies uses: actions/setup-go@v5 with: - go-version: "1.21" + go-version: "1.22" + + - name: setup quemu + uses: docker/setup-qemu-action@v3 + + - name: setup docker buildx + uses: docker/setup-buildx-action@v3 + + - name: login to ghcr + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - name: build and publish backend release uses: goreleaser/goreleaser-action@v5 diff --git a/.gitignore b/.gitignore index abf48bb..ace404b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,15 @@ -# Built binaries +# builds -bin/* -!bin/.gitkeep +/bin/* +!/bin/.gitkeep +dist -# Goreleaser artifacts +# ide -dist/* +.idea +.vscode +*.code-workspace -# VS Code +# temporary files -.vscode +.temp diff --git a/.goreleaser.yml b/.goreleaser.yml index db62b6c..ed91f60 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -4,16 +4,12 @@ env: builds: - main: ./cmd ldflags: - - -X main.version={{.Version}} + - -s + - -w + - -X main.version={{ .Version }} targets: - - darwin_amd64 - - darwin_arm64 - - linux_386 - linux_amd64 - - linux_arm - linux_arm64 - - windows_386 - - windows_amd64 archives: - id: zip @@ -27,3 +23,28 @@ checksum: release: prerelease: auto + +dockers: + - dockerfile: Dockerfile.goreleaser + goarch: amd64 + goos: linux + use: buildx + build_flag_templates: + - --platform=linux/amd64 + image_templates: + - "ghcr.io/flashbots/node-healthchecker:{{ .Tag }}-amd64" + + - dockerfile: Dockerfile.goreleaser + goarch: arm64 + goos: linux + use: buildx + build_flag_templates: + - --platform=linux/arm64 + image_templates: + - "ghcr.io/flashbots/node-healthchecker:{{ .Tag }}-arm64" + +docker_manifests: + - name_template: "ghcr.io/flashbots/node-healthchecker:{{ .Tag }}" + image_templates: + - "ghcr.io/flashbots/node-healthchecker:{{ .Tag }}-amd64" + - "ghcr.io/flashbots/node-healthchecker:{{ .Tag }}-arm64" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..296f6ae --- /dev/null +++ b/Dockerfile @@ -0,0 +1,26 @@ +# stage: build --------------------------------------------------------- + +FROM golang:1.22-alpine as build + +RUN apk add --no-cache gcc musl-dev linux-headers + +WORKDIR /go/src/github.com/flashbots/node-healthchecker + +COPY go.* ./ +RUN go mod download + +COPY . . + +RUN go build -o bin/node-healthchecker -ldflags "-s -w" github.com/flashbots/node-healthchecker/cmd + +# stage: run ----------------------------------------------------------- + +FROM alpine + +RUN apk add --no-cache ca-certificates + +WORKDIR /app + +COPY --from=build /go/src/github.com/flashbots/node-healthchecker/bin/node-healthchecker ./node-healthchecker + +ENTRYPOINT ["/app/node-healthchecker"] diff --git a/Dockerfile.goreleaser b/Dockerfile.goreleaser new file mode 100644 index 0000000..cd02ec3 --- /dev/null +++ b/Dockerfile.goreleaser @@ -0,0 +1,9 @@ +# stage: run + +FROM gcr.io/distroless/static-debian12 as runner + +WORKDIR /app + +COPY node-healthchecker ./ + +ENTRYPOINT [ "./node-healthchecker" ] diff --git a/LICENSE b/LICENSE index c6ca94d..74a8c75 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Flashbots +Copyright (c) 2023-2024 Flashbots Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index f3a1f3c..258963c 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,24 @@ -VERSION := $(shell git describe --tags --always --dirty="-dev") - -default: build +VERSION := $(shell git describe --tags --always --dirty="-dev" --match "v*.*.*" || echo "development" ) +VERSION := $(VERSION:v%=%) .PHONY: build build: - CGO_ENABLED=0 go build -ldflags "-X main.version=${VERSION}" -o ./bin/node-healthchecker github.com/flashbots/node-healthchecker/cmd + @CGO_ENABLED=0 go build \ + -ldflags "-X main.version=${VERSION}" \ + -o ./bin/node-healthchecker \ + github.com/flashbots/node-healthchecker/cmd .PHONY: snapshot snapshot: - goreleaser release --snapshot --rm-dist + @goreleaser release --snapshot --clean + +.PHONY: help +help: + @printf "\n=====\n\n" + @go run github.com/flashbots/node-healthchecker/cmd help + @printf "\n=====\n\n" + @go run github.com/flashbots/node-healthchecker/cmd serve --help -.PHONY: release -release: - @rm -rf ./dist - GITHUB_TOKEN=$$( gh auth token ) goreleaser release +.PHONY: serve +serve: + @go run github.com/flashbots/node-healthchecker/cmd serve diff --git a/cmd/help.go b/cmd/help.go new file mode 100644 index 0000000..999b961 --- /dev/null +++ b/cmd/help.go @@ -0,0 +1,18 @@ +package main + +import ( + "github.com/urfave/cli/v2" + + "github.com/flashbots/node-healthchecker/config" +) + +func CommandHelp(_ *config.Config) *cli.Command { + return &cli.Command{ + Usage: "show the list of commands or help for one command", + Name: "help", + + Action: func(clictx *cli.Context) error { + return cli.ShowAppHelp(clictx) + }, + } +} diff --git a/cmd/main.go b/cmd/main.go index 9a1dc67..6d020dd 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -3,91 +3,78 @@ package main import ( "fmt" "os" - "strings" "github.com/urfave/cli/v2" "go.uber.org/zap" - "go.uber.org/zap/zapcore" + + "github.com/flashbots/node-healthchecker/config" + "github.com/flashbots/node-healthchecker/logutils" ) var ( version = "development" ) +const ( + envPrefix = "NH_" +) + func main() { - var logFormat, logLevel string + cfg := &config.Config{} - app := &cli.App{ - Name: "node-healthchecker", - Usage: "Aggregates complex health-checks of a blockchain node into a simple http endpoint", - Version: version, + flags := []cli.Flag{ + &cli.StringFlag{ + Destination: &cfg.Log.Level, + EnvVars: []string{envPrefix + "LOG_LEVEL"}, + Name: "log-level", + Usage: "logging level", + Value: "info", + }, - Action: func(c *cli.Context) error { - return cli.ShowAppHelp(c) + &cli.StringFlag{ + Destination: &cfg.Log.Mode, + EnvVars: []string{envPrefix + "LOG_MODE"}, + Name: "log-mode", + Usage: "logging mode", + Value: "prod", }, + } - Flags: []cli.Flag{ - &cli.StringFlag{ - Destination: &logLevel, - EnvVars: []string{"LOG_LEVEL"}, - Name: "log-level", - Usage: "logging level", - Value: "info", - }, + commands := []*cli.Command{ + CommandServe(cfg), + CommandHelp(cfg), + } - &cli.StringFlag{ - Destination: &logFormat, - EnvVars: []string{"LOG_MODE"}, - Name: "log-mode", - Usage: "logging mode", - Value: "prod", - }, - }, + app := &cli.App{ + Name: "node-healthchecker", + Usage: "Report the sync-status of a blockchain node as HTTP status", + Version: version, + + Flags: flags, + Commands: commands, + DefaultCommand: commands[0].Name, - Before: func(ctx *cli.Context) error { - err := setupLogger(logLevel, logFormat) + Before: func(_ *cli.Context) error { + // setup logger + l, err := logutils.NewLogger(&cfg.Log) if err != nil { - fmt.Fprintf(os.Stderr, "failed to configure the logging: %s\n", err) + return err } - return err + zap.ReplaceGlobals(l) + + return nil }, - Commands: []*cli.Command{ - CommandServe(), + Action: func(clictx *cli.Context) error { + return cli.ShowAppHelp(clictx) }, } defer func() { - zap.L().Sync() + zap.L().Sync() //nolint:errcheck }() if err := app.Run(os.Args); err != nil { - zap.L().Error("Failed with error", zap.Error(err)) - } -} - -func setupLogger(level, mode string) error { - var config zap.Config - switch strings.ToLower(mode) { - case "dev": - config = zap.NewDevelopmentConfig() - case "prod": - config = zap.NewProductionConfig() - default: - return fmt.Errorf("invalid log-mode '%s'", mode) + fmt.Fprintf(os.Stderr, "\nFailed with error:\n\n%s\n\n", err.Error()) + os.Exit(1) } - config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder - - logLevel, err := zap.ParseAtomicLevel(level) - if err != nil { - return fmt.Errorf("invalid log-level '%s': %w", level, err) - } - config.Level = logLevel - - l, err := config.Build() - if err != nil { - return fmt.Errorf("failed to build the logger: %w", err) - } - zap.ReplaceGlobals(l) - - return nil } diff --git a/cmd/serve.go b/cmd/serve.go index 5b64d3d..3b8b834 100644 --- a/cmd/serve.go +++ b/cmd/serve.go @@ -2,86 +2,171 @@ package main import ( "net/http" + "slices" + "strings" "time" - "github.com/flashbots/node-healthchecker/healthchecker" "github.com/urfave/cli/v2" + + "github.com/flashbots/node-healthchecker/config" + "github.com/flashbots/node-healthchecker/server" + "github.com/flashbots/node-healthchecker/utils" +) + +const ( + categoryHealthcheck = "healthcheck" + categoryHealthcheckGeth = "healthcheck geth" + categoryHealthcheckLighthouse = "healthcheck lighthouse" + categoryHealthcheckOpNode = "healthcheck op-node" + categoryHealthcheckReth = "healthcheck reth" + categoryHttpStatus = "http status" + categoryServer = "server" ) -func CommandServe() *cli.Command { - var cfg healthchecker.Config +func CommandServe(cfg *config.Config) *cli.Command { + ip := "0.0.0.0" + if ipv4, err := utils.PrivateIPv4(); err == nil { + ip = ipv4.String() + } + // healthcheck + + healthcheckFlags := []cli.Flag{ + &cli.DurationFlag{ + Category: strings.ToUpper(categoryHealthcheck), + Destination: &cfg.Healthcheck.Timeout, + EnvVars: []string{envPrefix + strings.ToUpper(categoryHealthcheck) + "_TIMEOUT"}, + Name: categoryHealthcheck + "-timeout", + Usage: "maximum `duration` of a single healthcheck", + Value: time.Second, + }, + } + + // healthcheck geth + + healthcheckGethFlags := []cli.Flag{ + &cli.StringFlag{ + Category: strings.ToUpper(categoryHealthcheckGeth), + Destination: &cfg.HealthcheckGeth.BaseURL, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ToUpper(categoryHealthcheckGeth), " ", "_") + "_BASE_URL"}, + Name: strings.ReplaceAll(categoryHealthcheckGeth, " ", "-") + "-base-url", + Usage: "base `url` of geth's HTTP-RPC endpoint", + }, + } + + // healthcheck lighthouse + + healthcheckLighthouseFlags := []cli.Flag{ + &cli.StringFlag{ + Category: strings.ToUpper(categoryHealthcheckLighthouse), + Destination: &cfg.HealthcheckLighthouse.BaseURL, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ToUpper(categoryHealthcheckLighthouse), " ", "_") + "_BASE_URL"}, + Name: strings.ReplaceAll(categoryHealthcheckLighthouse, " ", "-") + "-base-url", + Usage: "base `url` of lighthouse's HTTP-API endpoint", + }, + } + + // healthcheck op-node + + healthcheckOpNodeFlags := []cli.Flag{ + &cli.StringFlag{ + Category: strings.ToUpper(categoryHealthcheckOpNode), + Destination: &cfg.HealthcheckOpNode.BaseURL, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ReplaceAll(strings.ToUpper(categoryHealthcheckOpNode), " ", "_"), "-", "_") + "_BASE_URL"}, + Name: strings.ReplaceAll(categoryHealthcheckOpNode, " ", "-") + "-base-url", + Usage: "base `url` of op-node's RPC endpoint", + }, + + &cli.Uint64Flag{ + Category: strings.ToUpper(categoryHealthcheckOpNode), + Destination: &cfg.HealthcheckOpNode.ConfirmationDistance, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ReplaceAll(strings.ToUpper(categoryHealthcheckOpNode), " ", "_"), "-", "_") + "_CONF_DISTANCE"}, + Name: strings.ReplaceAll(categoryHealthcheckOpNode, " ", "-") + "-conf-distance", + Usage: "number of l1 blocks that verifier keeps distance from the l1 head before deriving l2 data from", + Value: 0, + }, + } + + // healthcheck reth + + healthcheckRethFlags := []cli.Flag{ + &cli.StringFlag{ + Category: strings.ToUpper(categoryHealthcheckReth), + Destination: &cfg.HealthcheckReth.BaseURL, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ToUpper(categoryHealthcheckReth), " ", "_") + "_BASE_URL"}, + Name: strings.ReplaceAll(categoryHealthcheckReth, " ", "-") + "-base-url", + Usage: "base `url` of reth's HTTP-RPC endpoint", + }, + } + + // http status + + httpStatusFlags := []cli.Flag{ + &cli.IntFlag{ + Category: strings.ToUpper(categoryHttpStatus), + Destination: &cfg.HttpStatus.Ok, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ToUpper(categoryHttpStatus), " ", "_") + "_OK"}, + Name: strings.ReplaceAll(categoryHttpStatus, " ", "-") + "-ok", + Usage: "http `status` to report on good healthchecks", + Value: http.StatusOK, + }, + + &cli.IntFlag{ + Category: strings.ToUpper(categoryHttpStatus), + Destination: &cfg.HttpStatus.Warning, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ToUpper(categoryHttpStatus), " ", "_") + "_WARNING"}, + Name: strings.ReplaceAll(categoryHttpStatus, " ", "-") + "-warning", + Usage: "http `status` to report on healthchecks with warnings", + Value: http.StatusAccepted, + }, + + &cli.IntFlag{ + Category: strings.ToUpper(categoryHttpStatus), + Destination: &cfg.HttpStatus.Error, + EnvVars: []string{envPrefix + strings.ReplaceAll(strings.ToUpper(categoryHttpStatus), " ", "_") + "_ERROR"}, + Name: strings.ReplaceAll(categoryHttpStatus, " ", "-") + "-error", + Usage: "http `status` to report on healthchecks with errors", + Value: http.StatusInternalServerError, + }, + } + + // server + + serverFlags := []cli.Flag{ + &cli.StringFlag{ + Category: strings.ToUpper(categoryServer), + Destination: &cfg.Server.ListenAddress, + EnvVars: []string{envPrefix + strings.ToUpper(categoryServer) + "_LISTEN_ADDRESS"}, + Name: categoryServer + "-listen-address", + Usage: "`host:port` for the server to listen on", + Value: ip + ":8080", + }, + } return &cli.Command{ Name: "serve", - Usage: "run the healthcheck server", - - Flags: []cli.Flag{ - - // Serving - - &cli.StringFlag{ - Category: "Serving:", - Destination: &cfg.ServeAddress, - Name: "serve-address", - Usage: "respond to health-checks at the address of `host:port`", - Value: "0.0.0.0:8080", - }, - - &cli.DurationFlag{ - Category: "Serving:", - Destination: &cfg.Timeout, - Name: "timeout", - Usage: "healthcheck(s) timeout `duration`", - Value: time.Second, - }, - - &cli.IntFlag{ - Category: "Serving:", - Destination: &cfg.StatusOk, - Name: "status-ok", - Usage: "http `status` to report for good healthchecks", - Value: http.StatusOK, - }, - - &cli.IntFlag{ - Category: "Serving:", - Destination: &cfg.StatusWarning, - Name: "status-warning", - Usage: "http `status` to report for warning healthchecks", - Value: http.StatusAccepted, - }, - - &cli.IntFlag{ - Category: "Serving:", - Destination: &cfg.StatusError, - Name: "status-error", - Usage: "http `status` to report for error healthchecks", - Value: http.StatusInternalServerError, - }, - - // Monitoring - - &cli.StringFlag{ - Category: "Monitoring:", - Destination: &cfg.MonitorGethURL, - Name: "monitor-geth-url", - Usage: "monitor geth sync-status via RPC at specified `URL`", - }, - - &cli.StringFlag{ - Category: "Monitoring:", - Destination: &cfg.MonitorLighthouseURL, - Name: "monitor-lighthouse-url", - Usage: "monitor lighthouse sync-status via RPC at specified `URL`", - }, + Usage: "run node-healthchecker server", + + Flags: slices.Concat( + healthcheckFlags, + healthcheckGethFlags, + healthcheckLighthouseFlags, + healthcheckOpNodeFlags, + healthcheckRethFlags, + httpStatusFlags, + serverFlags, + ), + + Before: func(ctx *cli.Context) error { + // TODO: validate inputs + return nil }, - Action: func(ctx *cli.Context) error { - h, err := healthchecker.New(&cfg) + Action: func(_ *cli.Context) error { + s, err := server.New(cfg) if err != nil { return err } - return h.Serve() + return s.Run() }, } } diff --git a/config/config.go b/config/config.go new file mode 100644 index 0000000..1495364 --- /dev/null +++ b/config/config.go @@ -0,0 +1,15 @@ +package config + +type Config struct { + Log Log `yaml:"log"` + Server Server `yaml:"server"` + + HttpStatus HttpStatus `yaml:"http_status"` + + Healthcheck Healthcheck `yaml:"healthcheck"` + + HealthcheckGeth HealthcheckGeth `yaml:"healthcheck_geth"` + HealthcheckLighthouse HealthcheckLighthouse `yaml:"healthcheck_lighthouse"` + HealthcheckOpNode HealthcheckOpNode `yaml:"healthcheck_op_node"` + HealthcheckReth HealthcheckReth `yaml:"healthcheck_reth"` +} diff --git a/config/healthcheck.go b/config/healthcheck.go new file mode 100644 index 0000000..966d908 --- /dev/null +++ b/config/healthcheck.go @@ -0,0 +1,7 @@ +package config + +import "time" + +type Healthcheck struct { + Timeout time.Duration `yaml:"timeout"` +} diff --git a/config/healthcheck_geth.go b/config/healthcheck_geth.go new file mode 100644 index 0000000..62fbad6 --- /dev/null +++ b/config/healthcheck_geth.go @@ -0,0 +1,5 @@ +package config + +type HealthcheckGeth struct { + BaseURL string `yaml:"base_url"` +} diff --git a/config/healthcheck_lighthouse.go b/config/healthcheck_lighthouse.go new file mode 100644 index 0000000..0203a79 --- /dev/null +++ b/config/healthcheck_lighthouse.go @@ -0,0 +1,5 @@ +package config + +type HealthcheckLighthouse struct { + BaseURL string `yaml:"base_url"` +} diff --git a/config/healthcheck_op_node.go b/config/healthcheck_op_node.go new file mode 100644 index 0000000..26b9b25 --- /dev/null +++ b/config/healthcheck_op_node.go @@ -0,0 +1,6 @@ +package config + +type HealthcheckOpNode struct { + BaseURL string `yaml:"base_url"` + ConfirmationDistance uint64 `yaml:"confirmation_distance"` +} diff --git a/config/healthcheck_reth.go b/config/healthcheck_reth.go new file mode 100644 index 0000000..47d5c34 --- /dev/null +++ b/config/healthcheck_reth.go @@ -0,0 +1,5 @@ +package config + +type HealthcheckReth struct { + BaseURL string `yaml:"base_url"` +} diff --git a/config/http_status.go b/config/http_status.go new file mode 100644 index 0000000..5ce8975 --- /dev/null +++ b/config/http_status.go @@ -0,0 +1,7 @@ +package config + +type HttpStatus struct { + Ok int `yaml:"ok"` + Warning int `yaml:"warning"` + Error int `yaml:"error"` +} diff --git a/config/log.go b/config/log.go new file mode 100644 index 0000000..c57b2db --- /dev/null +++ b/config/log.go @@ -0,0 +1,6 @@ +package config + +type Log struct { + Level string `yaml:"level"` + Mode string `yaml:"mode"` +} diff --git a/config/server.go b/config/server.go new file mode 100644 index 0000000..64c242b --- /dev/null +++ b/config/server.go @@ -0,0 +1,5 @@ +package config + +type Server struct { + ListenAddress string `yaml:"listen_address"` +} diff --git a/go.mod b/go.mod index 645b45c..63aebd0 100644 --- a/go.mod +++ b/go.mod @@ -1,16 +1,32 @@ module github.com/flashbots/node-healthchecker -go 1.21 +go 1.22.0 require ( github.com/google/uuid v1.6.0 + github.com/prometheus/client_golang v1.19.1 github.com/urfave/cli/v2 v2.27.2 + go.opentelemetry.io/otel/exporters/prometheus v0.49.0 + go.opentelemetry.io/otel/metric v1.27.0 + go.opentelemetry.io/otel/sdk v1.27.0 + go.opentelemetry.io/otel/sdk/metric v1.27.0 go.uber.org/zap v1.27.0 ) require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/prometheus/client_model v0.6.1 // indirect + github.com/prometheus/common v0.54.0 // indirect + github.com/prometheus/procfs v0.15.1 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect + go.opentelemetry.io/otel v1.27.0 // indirect + go.opentelemetry.io/otel/trace v1.27.0 // indirect go.uber.org/multierr v1.11.0 // indirect + golang.org/x/sys v0.21.0 // indirect + google.golang.org/protobuf v1.34.2 // indirect ) diff --git a/go.sum b/go.sum index f1a2fcc..24b3f37 100644 --- a/go.sum +++ b/go.sum @@ -1,34 +1,59 @@ -github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= -github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= -github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= +github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/common v0.54.0 h1:ZlZy0BgJhTwVZUn7dLOkwCZHUkrAqd3WYtcFCWnM1D8= +github.com/prometheus/common v0.54.0/go.mod h1:/TQgMJP5CuVYveyT7n/0Ix8yLNNXy9yRSkhnLTHPDIQ= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/urfave/cli/v2 v2.25.7 h1:VAzn5oq403l5pHjc4OhD54+XGO9cdKVL/7lDjF+iKUs= -github.com/urfave/cli/v2 v2.25.7/go.mod h1:8qnjx1vcq5s2/wpsqoZFndg2CE5tNFyrTvS6SinrnYQ= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/urfave/cli/v2 v2.27.2 h1:6e0H+AkS+zDckwPCUrZkKX38mRaau4nL2uipkJpbkcI= github.com/urfave/cli/v2 v2.27.2/go.mod h1:g0+79LmHHATl7DAcHO99smiR/T7uGLw84w8Y42x+4eM= -github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 h1:bAn7/zixMGCfxrRTfdpNzjtPYqr8smhKouy9mxVdGPU= -github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673/go.mod h1:N3UwUGtsrSj3ccvlPHLoLsHnpR27oXr4ZE984MbSER8= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= -go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= -go.uber.org/goleak v1.2.0/go.mod h1:XJYK+MuIchqpmGmUSAzotztawfKvYLUIgg7guXrwVUo= +go.opentelemetry.io/otel v1.27.0 h1:9BZoF3yMK/O1AafMiQTVu0YDj5Ea4hPhxCs7sGva+cg= +go.opentelemetry.io/otel v1.27.0/go.mod h1:DMpAK8fzYRzs+bi3rS5REupisuqTheUlSZJ1WnZaPAQ= +go.opentelemetry.io/otel/exporters/prometheus v0.49.0 h1:Er5I1g/YhfYv9Affk9nJLfH/+qCCVVg1f2R9AbJfqDQ= +go.opentelemetry.io/otel/exporters/prometheus v0.49.0/go.mod h1:KfQ1wpjf3zsHjzP149P4LyAwWRupc6c7t1ZJ9eXpKQM= +go.opentelemetry.io/otel/metric v1.27.0 h1:hvj3vdEKyeCi4YaYfNjv2NUje8FqKqUY8IlF0FxV/ik= +go.opentelemetry.io/otel/metric v1.27.0/go.mod h1:mVFgmRlhljgBiuk/MP/oKylr4hs85GZAylncepAX/ak= +go.opentelemetry.io/otel/sdk v1.27.0 h1:mlk+/Y1gLPLn84U4tI8d3GNJmGT/eXe3ZuOXN9kTWmI= +go.opentelemetry.io/otel/sdk v1.27.0/go.mod h1:Ha9vbLwJE6W86YstIywK2xFfPjbWlCuwPtMkKdz/Y4A= +go.opentelemetry.io/otel/sdk/metric v1.27.0 h1:5uGNOlpXi+Hbo/DRoI31BSb1v+OGcpv2NemcCrOL8gI= +go.opentelemetry.io/otel/sdk/metric v1.27.0/go.mod h1:we7jJVrYN2kh3mVBlswtPU22K0SA+769l93J6bsyvqw= +go.opentelemetry.io/otel/trace v1.27.0 h1:IqYb813p7cmbHk0a5y6pD5JPakbVfftRXABGt5/Rscw= +go.opentelemetry.io/otel/trace v1.27.0/go.mod h1:6RiD1hkAprV4/q+yd2ln1HG9GoPx39SuvvstaLBl+l4= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= -go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= -go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/healthcheck/geth.go b/healthcheck/geth.go new file mode 100644 index 0000000..9775809 --- /dev/null +++ b/healthcheck/geth.go @@ -0,0 +1,124 @@ +package healthcheck + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + + "github.com/flashbots/node-healthchecker/config" +) + +// gethIsNotSyncing is the status reported by geth when it's not syncing. +type gethIsNotSyncing struct { + Result bool `json:"result"` +} + +// gethIsSyncing is the status reported by geth when it's in syncing state. +type gethIsSyncing struct { + Result struct { + // StartingBlock is the block number where sync began. + StartingBlock string `json:"startingBlock"` + + // CurrentBlock is a current block number where sync is at. + CurrentBlock string `json:"currentBlock"` + + // HighestBlock is the highest alleged block number in the chain. + HighestBlock string `json:"highestBlock"` + + // SyncedAccounts is a number of accounts downloaded (snap sync). + SyncedAccounts string `json:"syncedAccounts"` + + // Number of account trie bytes persisted to disk (snap sync). + SyncedAccountBytes string `json:""` + + // SyncedBytecodes is a number of bytecodes downloaded (snap sync). + SyncedBytecodes string `json:"syncedBytecodes"` + + // SyncedBytecodeBytes is a number of bytecode bytes downloaded (snap sync). + SyncedBytecodeBytes string `json:"syncedBytecodeBytes"` + + // SyncedStorage is a number of storage slots downloaded (snap sync). + SyncedStorage string `json:"syncedStorage"` + + // SyncedStorageBytes is a number of storage trie bytes persisted to disk (snap sync). + SyncedStorageBytes string `json:"syncedStorageBytes"` + + HealedTrienodes string `json:"healingTrienodes"` + HealedTrienodeBytes string `json:"healedTrienodeBytes"` + HealedBytecodes string `json:"healedBytecodes"` + HealedBytecodeBytes string `json:"healedBytecodeBytes"` + + HealingTrienodes string `json:"healedTrienodes"` + HealingBytecode string `json:"healingBytecode"` + + TxIndexFinishedBlocks string `json:"txIndexFinishedBlocks"` + TxIndexRemainingBlocks string `json:"txIndexRemainingBlocks"` + } `json:"result"` +} + +func Geth(ctx context.Context, cfg *config.HealthcheckGeth) *Result { + // https://ethereum.org/en/developers/docs/apis/json-rpc/#eth_syncing + // https://github.com/ethereum/go-ethereum/blob/v1.14.8/interfaces.go#L98-L127 + + const query = `{"jsonrpc":"2.0","method":"eth_syncing","params":[],"id":0}` + + req, err := http.NewRequestWithContext( + ctx, + http.MethodGet, + cfg.BaseURL, + bytes.NewReader([]byte(query)), + ) + if err != nil { + return &Result{Err: err} + } + req.Header.Set("accept", "application/json") + req.Header.Set("content-type", "application/json") + + res, err := http.DefaultClient.Do(req) + if err != nil { + return &Result{Err: err} + } + defer res.Body.Close() + + body, err := io.ReadAll(res.Body) + if err != nil { + return &Result{Err: err} + } + + if res.StatusCode != http.StatusOK { + return &Result{ + Err: fmt.Errorf("unexpected HTTP status '%d': %s", + res.StatusCode, + string(body), + ), + } + } + + var status gethIsNotSyncing + if err := json.Unmarshal(body, &status); err != nil { + var status gethIsSyncing + if err2 := json.Unmarshal(body, &status); err2 != nil { + return &Result{ + Err: fmt.Errorf("failed to parse JSON body '%s': %w", + string(body), + errors.Join(err, err2), + ), + } + } + return &Result{ + Err: fmt.Errorf("geth is still syncing: Current:=%s, Highest=%s", + status.Result.CurrentBlock, + status.Result.HighestBlock, + ), + } + } + if status.Result { // i.e. it's syncing + return &Result{Err: errors.New("geth is (still) syncing")} + } + + return &Result{Ok: true} +} diff --git a/healthcheck/healthcheck.go b/healthcheck/healthcheck.go new file mode 100644 index 0000000..589471b --- /dev/null +++ b/healthcheck/healthcheck.go @@ -0,0 +1,10 @@ +package healthcheck + +import "context" + +type Monitor = func(context.Context) *Result + +type Result struct { + Ok bool + Err error +} diff --git a/healthcheck/lighthouse.go b/healthcheck/lighthouse.go new file mode 100644 index 0000000..f4665e4 --- /dev/null +++ b/healthcheck/lighthouse.go @@ -0,0 +1,162 @@ +package healthcheck + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + + "github.com/flashbots/node-healthchecker/config" +) + +// lighthouseStateAsString represents sync-state of lighthouse as a string. +// +// Possible values: +// +// - "Synced" means that lighthouse is up to date with all known peers and is +// connected to at least one fully synced peer. In this state, parent +// lookups are enabled. +// +// - "Stalled" means that no useful peers are connected to lighthouse. +// Long-range sync's cannot proceed and there are no useful peers to +// download parents for. More peers need to be connected before lighthouse +// can proceed. +// +// - "SyncTransition" means that lighthouse has completed syncing a finalized +// chain and is in the process of re-evaluating which sync state to progress +// to. +type lighthouseStateAsString struct { + Data string `json:"data"` +} + +// lighthouseStateAsStruct represents sync-state of lighthouse as a struct. +type lighthouseStateAsStruct struct { + Data struct { + // BackFillSyncing means that lighthouse is undertaking a backfill sync. + // + // This occurs when a user has specified a trusted state. The node first + // syncs "forward" by downloading blocks up to the current head as + // specified by its peers. Once completed, the node enters this sync + // state and attempts to download all required historical blocks. + BackFillSyncing *struct { + Completed uint64 `json:"completed"` + Remaining uint64 `json:"remaining"` + } `json:"BackFillSyncing"` + + // SyncingFinalized means that lighthouse is performing a long-range + // (batch) sync over a finalized chain. + // + // In this state, parent lookups are disabled. + SyncingFinalized *struct { + StartSlot string `json:"start_slot"` + TargetSlot string `json:"target_slot"` + } `json:"SyncingFinalized"` + + // SyncingHead means that lighthouse is performing a long-range (batch) + // sync over one or many head chains. + // + // In this state parent lookups are disabled. + SyncingHead *struct { + StartSlot string `json:"start_slot"` + TargetSlot string `json:"target_slot"` + } `json:"SyncingHead"` + } `json:"data"` +} + +func Lighthouse(ctx context.Context, cfg *config.HealthcheckLighthouse) *Result { + // https://lighthouse-book.sigmaprime.io/api-lighthouse.html#lighthousesyncing + // https://github.com/sigp/lighthouse/blob/v4.5.0/beacon_node/lighthouse_network/src/types/sync_state.rs#L6-L27 + + _url, err := url.JoinPath(cfg.BaseURL, "lighthouse/syncing") + if err != nil { + return &Result{Err: err} + } + + req, err := http.NewRequestWithContext( + ctx, + http.MethodGet, + _url, + nil, + ) + if err != nil { + return &Result{Err: err} + } + req.Header.Set("accept", "application/json") + + res, err := http.DefaultClient.Do(req) + if err != nil { + return &Result{Err: err} + } + defer res.Body.Close() + + body, err := io.ReadAll(res.Body) + if err != nil { + return &Result{Err: err} + } + + if res.StatusCode != http.StatusOK { + return &Result{Err: fmt.Errorf("unexpected HTTP status '%d': %s", + res.StatusCode, + string(body), + )} + } + + var state lighthouseStateAsString + if err := json.Unmarshal(body, &state); err != nil { + var state lighthouseStateAsStruct + if err2 := json.Unmarshal(body, &state); err2 != nil { + return &Result{Err: fmt.Errorf("failed to parse JSON body '%s': %w", + string(body), + errors.Join(err, err2), + )} + } + switch { + case state.Data.BackFillSyncing != nil: + // + // BackBillSyncing is "ok" because that's the state lighthouse + // switches to after checkpoint sync is complete. + // + // See: https://lighthouse-book.sigmaprime.io/checkpoint-sync.html#backfilling-blocks + // + return &Result{ + Ok: true, + Err: fmt.Errorf("lighthouse is in 'BackFillSyncing' state (completed: %d, remaining: %d)", + state.Data.BackFillSyncing.Completed, + state.Data.BackFillSyncing.Remaining, + ), + } + case state.Data.SyncingFinalized != nil: + return &Result{ + Err: fmt.Errorf("lighthouse is in 'SyncingFinalized' state (start_slot: '%s', target_slot: '%s')", + state.Data.SyncingFinalized.StartSlot, + state.Data.SyncingFinalized.TargetSlot, + ), + } + case state.Data.SyncingHead != nil: + return &Result{ + Err: fmt.Errorf("lighthouse is in 'SyncingHead' state (start_slot: '%s', target_slot: '%s')", + state.Data.SyncingHead.StartSlot, + state.Data.SyncingHead.TargetSlot, + ), + } + default: + return &Result{ + Err: fmt.Errorf("lighthouse is in unrecognised state: %s", + string(body), + ), + } + } + } + if state.Data != "Synced" { + return &Result{ + Err: fmt.Errorf("lighthouse is not in synced state: %s", + state.Data, + ), + } + } + + return &Result{Ok: true} +} diff --git a/healthcheck/op_node.go b/healthcheck/op_node.go new file mode 100644 index 0000000..b87ae5b --- /dev/null +++ b/healthcheck/op_node.go @@ -0,0 +1,175 @@ +package healthcheck + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + + "github.com/flashbots/node-healthchecker/config" +) + +// opNodeSyncStatus is a snapshot of the op-node's driver. +// +// Values may be zeroed if not yet initialized. +type opNodeSyncStatus struct { + Result struct { + + // CurrentL1 is the L1 block that the derivation process is last idled at. + // + // This may not be fully derived into L2 data yet. + // + // The safe L2 blocks were produced/included fully from the L1 chain up to + // and including this L1 block. + // + // If the node is synced, this matches the HeadL1, minus the verifier + // confirmation distance. + CurrentL1 opNodeL1BlockRef `json:"current_l1"` + + // HeadL1 is the perceived head of the L1 chain, no confirmation distance. + // + // The head is not guaranteed to build on the other L1 sync status fields, + // as the node may be in progress of resetting to adapt to a L1 reorg. + HeadL1 opNodeL1BlockRef `json:"head_l1"` + + SafeL1 opNodeL1BlockRef `json:"safe_l1"` + + FinalizedL1 opNodeL1BlockRef `json:"finalized_l1"` + + // UnsafeL2 is the absolute tip of the L2 chain, pointing to block data that + // has not been submitted to L1 yet. + // + // The sequencer is building this, and verifiers may also be ahead of the + // SafeL2 block if they sync blocks via p2p or other offchain sources. + // + // This is considered to only be local-unsafe post-interop, see CrossUnsafe + // for cross-L2 guarantees. + UnsafeL2 opNodeL2BlockRef `json:"unsafe_l2"` + + // SafeL2 points to the L2 block that was derived from the L1 chain. + // + // This point may still reorg if the L1 chain reorgs. + // + // This is considered to be cross-safe post-interop, see LocalSafe to ignore + // cross-L2 guarantees. + SafeL2 opNodeL2BlockRef `json:"safe_l2"` + + // FinalizedL2 points to the L2 block that was derived fully from finalized + // L1 information, thus irreversible. + FinalizedL2 opNodeL2BlockRef `json:"finalized_l2"` + + // PendingSafeL2 points to the L2 block processed from the batch, but not + // consolidated to the safe block yet. + PendingSafeL2 opNodeL2BlockRef `json:"pending_safe_l2"` + + // CrossUnsafeL2 is an unsafe L2 block, that has been verified to match + // cross-L2 dependencies. + // + // Pre-interop every unsafe L2 block is also cross-unsafe. + CrossUnsafeL2 opNodeL2BlockRef `json:"cross_unsafe_l2"` + + // LocalSafeL2 is an L2 block derived from L1, not yet verified to have + // valid cross-L2 dependencies. + LocalSafeL2 opNodeL2BlockRef `json:"local_safe_l2"` + } `json:"result"` +} + +type opNodeL1BlockRef struct { + Hash string `json:"hash"` + Number uint64 `json:"number"` + ParentHash string `json:"parentHash"` + Time uint64 `json:"timestamp"` +} + +type opNodeL2BlockRef struct { + Hash string `json:"hash"` + Number uint64 `json:"number"` + ParentHash string `json:"parentHash"` + SequenceNumber uint64 `json:"sequenceNumber"` // distance to first block of epoch + Time uint64 `json:"timestamp"` + + L1Origin struct { + Hash string `json:"hash"` + Number uint64 `json:"number"` + } `json:"l1origin"` +} + +func OpNode(ctx context.Context, cfg *config.HealthcheckOpNode) *Result { + // https://docs.optimism.io/builders/node-operators/json-rpc#optimism_syncstatus + // https://github.com/ethereum-optimism/optimism/blob/v1.9.1/op-service/eth/sync_status.go#L5-L34 + + const query = `{"jsonrpc":"2.0","method":"optimism_syncStatus","params":[],"id":0}` + + req, err := http.NewRequestWithContext( + ctx, + http.MethodPost, + cfg.BaseURL, + bytes.NewReader([]byte(query)), + ) + if err != nil { + return &Result{Err: err} + } + req.Header.Set("accept", "application/json") + req.Header.Set("content-type", "application/json") + + res, err := http.DefaultClient.Do(req) + if err != nil { + return &Result{Err: err} + } + defer res.Body.Close() + + body, err := io.ReadAll(res.Body) + if err != nil { + return &Result{Err: err} + } + + if res.StatusCode != http.StatusOK { + return &Result{ + Err: fmt.Errorf("unexpected HTTP status '%d': %s", + res.StatusCode, + string(body), + ), + } + } + + var status opNodeSyncStatus + err = json.Unmarshal(body, &status) + if err != nil { + return &Result{ + Err: fmt.Errorf("failed to parse JSON body '%s': %w", + string(body), + err, + ), + } + } + + if status.Result.CurrentL1.Number > status.Result.HeadL1.Number { + dist := status.Result.CurrentL1.Number - status.Result.HeadL1.Number + if dist == 1 { + return &Result{Ok: true} + } + return &Result{ + Ok: true, + Err: fmt.Errorf("op-node's current l1 block (number: %d, hash: %s) is greater than head (number: %d, hash %s): %d - %d = %d", + status.Result.CurrentL1.Number, status.Result.CurrentL1.Hash, + status.Result.HeadL1.Number, status.Result.HeadL1.Hash, + status.Result.CurrentL1.Number, status.Result.HeadL1.Number, dist, + ), + } + } + + dist := status.Result.HeadL1.Number - status.Result.CurrentL1.Number + if dist > cfg.ConfirmationDistance { + return &Result{ + Err: fmt.Errorf("op-node's current l1 block (number: %d, hash: %s) is behind the l1 head (number: %d, hash: %s) for more than confirmation distance: %d > %d", + status.Result.CurrentL1.Number, status.Result.CurrentL1.Hash, + status.Result.HeadL1.Number, status.Result.HeadL1.Hash, + dist, cfg.ConfirmationDistance, + ), + } + } + + return &Result{Ok: true} +} diff --git a/healthcheck/reth.go b/healthcheck/reth.go new file mode 100644 index 0000000..fe5fe2f --- /dev/null +++ b/healthcheck/reth.go @@ -0,0 +1,116 @@ +package healthcheck + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "strings" + + "github.com/flashbots/node-healthchecker/config" +) + +// rethIsNotSyncing is the status reported by reth when it's not syncing. +type rethIsNotSyncing struct { + Result bool `json:"result"` +} + +// rethIsSyncing is the status reported by reth when it is in syncing state. +type rethIsSyncing struct { + Result struct { + // StartingBlock is a starting block. + StartingBlock string `json:"startingBlock"` + + // CurrentBlock is a current block. + CurrentBlock string `json:"currentBlock"` + + // HighestBlock is the highest block seen so far. + HighestBlock string `json:"highestBlock"` + + // WarpChunksAmount is a warp-sync snapshot chunks total. + WarpChunksAmount *string `json:"warpChunksAmount,omitempty"` + + // WarpChunksProcessed is a warp-sync snapshot chunks processed. + WarpChunksProcessed *string `json:"warpChunksProcessed,omitempty"` + + /// Stages contains the details of the sync-stages. + Stages []struct { + // Name of the sync-stage. + Name string `json:"name"` + + // Block indicates the progress of the sync-stage. + Block string `json:"block"` + } `json:"stages"` + } `json:"result"` +} + +func Reth(ctx context.Context, cfg *config.HealthcheckReth) *Result { + // https://ethereum.org/en/developers/docs/apis/json-rpc/#eth_syncing + // https://github.com/alloy-rs/alloy/blob/v0.3.5/crates/rpc-types-eth/src/syncing.rs#L8-L36 + + const query = `{"jsonrpc":"2.0","method":"eth_syncing","params":[],"id":0}` + + req, err := http.NewRequestWithContext( + ctx, + http.MethodPost, + cfg.BaseURL, + bytes.NewReader([]byte(query)), + ) + if err != nil { + return &Result{Err: err} + } + req.Header.Set("accept", "application/json; charset=utf-8") + req.Header.Set("content-type", "application/json; charset=utf-8") + + res, err := http.DefaultClient.Do(req) + if err != nil { + return &Result{Err: err} + } + defer res.Body.Close() + + body, err := io.ReadAll(res.Body) + if err != nil { + return &Result{Err: err} + } + + if res.StatusCode != http.StatusOK { + return &Result{ + Err: fmt.Errorf("unexpected HTTP status '%d': %s", + res.StatusCode, + string(body), + ), + } + } + + var status rethIsNotSyncing + if err := json.Unmarshal(body, &status); err != nil { + var status rethIsSyncing + if err2 := json.Unmarshal(body, &status); err2 != nil { + return &Result{ + Err: fmt.Errorf("failed to parse JSON body '%s': %w", + string(body), + errors.Join(err, err2), + ), + } + } + stages := make([]string, 0, len(status.Result.Stages)) + for idx, stage := range status.Result.Stages { + stages = append(stages, fmt.Sprintf("%s(%d)=%s", stage.Name, idx, stage.Block)) + } + return &Result{ + Err: fmt.Errorf("reth is still syncing: Current=%s, Highest=%s, %s", + status.Result.CurrentBlock, + status.Result.HighestBlock, + strings.Join(stages, ", "), + ), + } + } + if status.Result { // i.e. it's syncing + return &Result{Err: errors.New("reth is (still) syncing")} + } + + return &Result{Ok: true} +} diff --git a/healthchecker/check_geth.go b/healthchecker/check_geth.go deleted file mode 100644 index f4e87f8..0000000 --- a/healthchecker/check_geth.go +++ /dev/null @@ -1,83 +0,0 @@ -package healthchecker - -import ( - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "io" - "net/http" -) - -func (h *Healthchecker) checkGeth(ctx context.Context, url string) *healthcheckResult { - // https://ethereum.org/en/developers/docs/apis/json-rpc/#eth_syncing - - const query = `{"jsonrpc":"2.0","method":"eth_syncing","params":[],"id":0}` - - type isNotSyncing struct { - Result bool // `json:"result"` - } - - type isSyncing struct { - Result struct { - CurrentBlock string // `json:"currentBlock"` - HighestBlock string // `json:"highestBlock"` - } // `json:"result"` - } - - //-------------------------------------------------------------------------- - - req, err := http.NewRequestWithContext( - ctx, - http.MethodGet, - url, - bytes.NewReader([]byte(query)), - ) - if err != nil { - return &healthcheckResult{err: err} - } - req.Header.Set("accept", "application/json") - req.Header.Set("content-type", "application/json") - - res, err := http.DefaultClient.Do(req) - if err != nil { - return &healthcheckResult{err: err} - } - defer res.Body.Close() - - body, err := io.ReadAll(res.Body) - if err != nil { - return &healthcheckResult{err: err} - } - - if res.StatusCode != http.StatusOK { - return &healthcheckResult{err: fmt.Errorf( - "unexpected HTTP status '%d': %s", - res.StatusCode, - string(body), - )} - } - - var status isNotSyncing - if err := json.Unmarshal(body, &status); err != nil { - var status isSyncing - if err2 := json.Unmarshal(body, &status); err2 != nil { - return &healthcheckResult{err: fmt.Errorf( - "failed to parse JSON body '%s': %w", - string(body), - errors.Join(err, err2), - )} - } - return &healthcheckResult{err: fmt.Errorf( - "geth is still syncing (current: %s, highest: %s)", - status.Result.CurrentBlock, - status.Result.HighestBlock, - )} - } - if status.Result { // i.e. it's syncing - return &healthcheckResult{err: errors.New("geth is (still) syncing")} - } - - return &healthcheckResult{ok: true} -} diff --git a/healthchecker/check_lighthouse.go b/healthchecker/check_lighthouse.go deleted file mode 100644 index 5957291..0000000 --- a/healthchecker/check_lighthouse.go +++ /dev/null @@ -1,121 +0,0 @@ -package healthchecker - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "io" - "net/http" -) - -func (h *Healthchecker) checkLighthouse(ctx context.Context, url string) *healthcheckResult { - // https://lighthouse-book.sigmaprime.io/api-lighthouse.html#lighthousesyncing - // https://github.com/sigp/lighthouse/blob/v4.5.0/beacon_node/lighthouse_network/src/types/sync_state.rs#L6-L27 - - type stateString struct { - Data string // `json:"data"` - } - - type stateStruct struct { - Data struct { - BackFillSyncing *struct { - Completed uint64 // `json:"completed"` - Remaining uint64 // `json:"remaining"` - } // `json:"BackFillSyncing"` - - SyncingFinalized *struct { - StartSlot string // `json:"start_slot"` - TargetSlot string // `json:"target_slot"` - } // `json:"SyncingFinalized"` - - SyncingHead *struct { - StartSlot string // `json:"start_slot"` - TargetSlot string // `json:"target_slot"` - } // `json:"SyncingHead"` - } // `json:"data"` - } - - //-------------------------------------------------------------------------- - - req, err := http.NewRequestWithContext( - ctx, - http.MethodGet, - url, - nil, - ) - if err != nil { - return &healthcheckResult{err: err} - } - req.Header.Set("accept", "application/json") - - res, err := http.DefaultClient.Do(req) - if err != nil { - return &healthcheckResult{err: err} - } - defer res.Body.Close() - - body, err := io.ReadAll(res.Body) - if err != nil { - return &healthcheckResult{err: err} - } - - if res.StatusCode != http.StatusOK { - return &healthcheckResult{err: fmt.Errorf( - "unexpected HTTP status '%d': %s", - res.StatusCode, - string(body), - )} - } - - var state stateString - if err := json.Unmarshal(body, &state); err != nil { - var state stateStruct - if err2 := json.Unmarshal(body, &state); err2 != nil { - return &healthcheckResult{err: fmt.Errorf( - "failed to parse JSON body '%s': %w", - string(body), - errors.Join(err, err2), - )} - } - switch { - case state.Data.BackFillSyncing != nil: - // - // BackBillSyncing is "ok" because that's the state lighthouse - // switches to after checkpoint sync is complete. - // - // See: https://lighthouse-book.sigmaprime.io/checkpoint-sync.html#backfilling-blocks - // - return &healthcheckResult{ok: true, err: fmt.Errorf( - "lighthouse is in 'BackFillSyncing' state (completed: %d, remaining: %d)", - state.Data.BackFillSyncing.Completed, - state.Data.BackFillSyncing.Remaining, - )} - case state.Data.SyncingFinalized != nil: - return &healthcheckResult{err: fmt.Errorf( - "lighthouse is in 'SyncingFinalized' state (start_slot: '%s', target_slot: '%s')", - state.Data.SyncingFinalized.StartSlot, - state.Data.SyncingFinalized.TargetSlot, - )} - case state.Data.SyncingHead != nil: - return &healthcheckResult{err: fmt.Errorf( - "lighthouse is in 'SyncingHead' state (start_slot: '%s', target_slot: '%s')", - state.Data.SyncingHead.StartSlot, - state.Data.SyncingHead.TargetSlot, - )} - default: - return &healthcheckResult{err: fmt.Errorf( - "lighthouse is in unrecognised state: %s", - string(body), - )} - } - } - if state.Data != "Synced" { - return &healthcheckResult{err: fmt.Errorf( - "lighthouse is not in synced state: %s", - state.Data, - )} - } - - return &healthcheckResult{ok: true} -} diff --git a/healthchecker/healthchecker.go b/healthchecker/healthchecker.go deleted file mode 100644 index 5477184..0000000 --- a/healthchecker/healthchecker.go +++ /dev/null @@ -1,200 +0,0 @@ -package healthchecker - -import ( - "context" - "errors" - "fmt" - "net/http" - "net/url" - "os" - "os/signal" - "syscall" - "time" - - "github.com/flashbots/node-healthchecker/httplogger" - "github.com/flashbots/node-healthchecker/logutils" - "go.uber.org/zap" -) - -type Healthchecker struct { - addr string - log *zap.Logger - timeout time.Duration - - monitors []healthcheckMonitor - - statusOk int - statusWarning int - statusError int -} - -type healthcheckMonitor = func(context.Context) *healthcheckResult - -type healthcheckResult struct { - ok bool - err error -} - -type Config struct { - MonitorGethURL string - MonitorLighthouseURL string - - ServeAddress string - Timeout time.Duration - - StatusOk int - StatusWarning int - StatusError int -} - -func New(cfg *Config) (*Healthchecker, error) { - h := &Healthchecker{ - addr: cfg.ServeAddress, - log: zap.L(), - timeout: cfg.Timeout, - - statusOk: cfg.StatusOk, - statusWarning: cfg.StatusWarning, - statusError: cfg.StatusError, - } - - // Configure geth checks - - if cfg.MonitorGethURL != "" { - rpcURL, err := url.JoinPath(cfg.MonitorGethURL, "/") - if err != nil { - return nil, err - } - h.monitors = append(h.monitors, func(ctx context.Context) *healthcheckResult { - return h.checkGeth(ctx, rpcURL) - }) - } - - // Configure lighthouse checks - - if cfg.MonitorLighthouseURL != "" { - syncingURL, err := url.JoinPath(cfg.MonitorLighthouseURL, "lighthouse/syncing") - if err != nil { - return nil, err - } - h.monitors = append(h.monitors, func(ctx context.Context) *healthcheckResult { - return h.checkLighthouse(ctx, syncingURL) - }) - } - - return h, nil -} - -func (h *Healthchecker) Serve() error { - mux := http.NewServeMux() - mux.HandleFunc("/", h.handleHTTPRequest) - handler := httplogger.Middleware(h.log, mux) - - srv := &http.Server{ - Addr: h.addr, - Handler: handler, - MaxHeaderBytes: 1024, - ReadHeaderTimeout: 30 * time.Second, - ReadTimeout: 30 * time.Second, - WriteTimeout: 30 * time.Second, - } - - go func() { - terminator := make(chan os.Signal, 1) - signal.Notify(terminator, os.Interrupt, syscall.SIGTERM) - s := <-terminator - - h.log.Info("Stop signal received; shutting down...", zap.String("signal", s.String())) - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - if err := srv.Shutdown(ctx); err != nil { - h.log.Error("HTTP server shutdown failed", zap.Error(err)) - } - }() - - h.log.Info("Starting up...", zap.String("address", h.addr)) - if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { - h.log.Error("HTTP server failed", zap.Error(err)) - } - h.log.Info("Server is down") - - return nil -} - -func (h *Healthchecker) handleHTTPRequest(w http.ResponseWriter, r *http.Request) { - l := logutils.LoggerFromRequest(r) - - count := len(h.monitors) - results := make(chan *healthcheckResult, count) - - for _, m := range h.monitors { - monitor := m // https://go.dev/blog/loopvar-preview - ctx, cancel := context.WithTimeout(r.Context(), h.timeout) - defer cancel() - go func() { - results <- monitor(ctx) - }() - } - - errs := []error{} - warns := []error{} - for count > 0 { - count-- - if res := <-results; res != nil { - if !res.ok { - errs = append(errs, res.err) - } else if res.err != nil { - warns = append(warns, res.err) - } - } - } - close(results) - - switch { - case len(errs) == 0 && len(warns) == 0: - w.WriteHeader(h.statusOk) - return - - case len(errs) > 0: - w.WriteHeader(h.statusError) - w.Header().Set("Content-Type", "application/text") - - for idx, err := range errs { - line := fmt.Sprintf("%d: error: %s\n", idx, err) - _, err := w.Write([]byte(line)) - if err != nil { - l.Error("Failed to write the response body", zap.Error(err)) - } - } - offset := len(errs) - for idx, warn := range warns { - line := fmt.Sprintf("%d: warning: %s\n", offset+idx, warn) - _, err := w.Write([]byte(line)) - if err != nil { - l.Error("Failed to write the response body", zap.Error(err)) - } - } - - l.Warn( - "Failed the healthcheck due to upstream error(s)", - zap.Error(errors.Join(errs...)), - ) - - case len(errs) == 0 && len(warns) > 0: - w.WriteHeader(h.statusWarning) - w.Header().Set("Content-Type", "application/text") - - for idx, warn := range warns { - line := fmt.Sprintf("%d: %s\n", idx, warn) - _, err := w.Write([]byte(line)) - if err != nil { - l.Error("Failed to write the response body", zap.Error(err)) - } - } - - l.Warn( - "Failed the healthcheck due to upstream error(s)", - zap.Error(errors.Join(errs...)), - ) - } -} diff --git a/httplogger/middleware.go b/httplogger/middleware.go index 8d83959..d1e5ada 100644 --- a/httplogger/middleware.go +++ b/httplogger/middleware.go @@ -6,9 +6,10 @@ import ( "net/http" "time" - "github.com/flashbots/node-healthchecker/logutils" "github.com/google/uuid" "go.uber.org/zap" + + "github.com/flashbots/node-healthchecker/logutils" ) func Middleware(logger *zap.Logger, next http.Handler) http.Handler { @@ -46,7 +47,7 @@ func Middleware(logger *zap.Logger, next http.Handler) http.Handler { // Passing request stats both in-message (for the human reader) // as well as inside the structured log (for the machine parser) - logger.Info(fmt.Sprintf("%s %s %d", r.Method, r.URL.EscapedPath(), wrapped.Status()), + logger.Debug(fmt.Sprintf("%s %s %d", r.Method, r.URL.EscapedPath(), wrapped.Status()), zap.Int("durationMs", int(time.Since(start).Milliseconds())), zap.Int("status", wrapped.Status()), zap.String("httpRequestID", httpRequestID), diff --git a/logutils/http_server_error_logger.go b/logutils/http_server_error_logger.go new file mode 100644 index 0000000..88cdfb5 --- /dev/null +++ b/logutils/http_server_error_logger.go @@ -0,0 +1,28 @@ +package logutils + +import ( + "errors" + "log" + "strings" + + "go.uber.org/zap" +) + +type httpServerErrorLogger struct { + logger *zap.Logger +} + +func (s *httpServerErrorLogger) Write(p []byte) (n int, err error) { + msg := strings.TrimSpace(string(p)) + s.logger.Warn("HTTP server encountered an error", + zap.Error(errors.New(msg)), + ) + return len(p), nil +} + +func NewHttpServerErrorLogger(logger *zap.Logger) *log.Logger { + wrapped := &httpServerErrorLogger{ + logger: logger, + } + return log.New(wrapped, "", 0) +} diff --git a/logutils/setup.go b/logutils/setup.go new file mode 100644 index 0000000..ed3d78a --- /dev/null +++ b/logutils/setup.go @@ -0,0 +1,53 @@ +package logutils + +import ( + "errors" + "fmt" + "strings" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/flashbots/node-healthchecker/config" +) + +var ( + ErrLoggerFailedToBuild = errors.New("failed to build the logger") + ErrLoggerInvalidLevel = errors.New("invalid log-level") + ErrLoggerInvalidMode = errors.New("invalid log-mode") +) + +func NewLogger(cfg *config.Log) ( + *zap.Logger, error, +) { + var config zap.Config + switch strings.ToLower(cfg.Mode) { + case "dev": + config = zap.NewDevelopmentConfig() + config.EncoderConfig.EncodeCaller = nil + case "prod": + config = zap.NewProductionConfig() + default: + return nil, fmt.Errorf("%w: %s", + ErrLoggerInvalidMode, cfg.Mode, + ) + } + config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder + + logLevel, err := zap.ParseAtomicLevel(cfg.Level) + if err != nil { + return nil, fmt.Errorf("%w: %s: %w", + ErrLoggerInvalidLevel, cfg.Level, err, + ) + } + config.Level = logLevel + + l, err := config.Build() + if err != nil { + return nil, fmt.Errorf("%w: %w", + ErrLoggerFailedToBuild, err, + ) + } + + return l, nil +} diff --git a/metrics/metrics.go b/metrics/metrics.go new file mode 100644 index 0000000..c394c0d --- /dev/null +++ b/metrics/metrics.go @@ -0,0 +1,54 @@ +package metrics + +import ( + "context" + + "go.opentelemetry.io/otel/exporters/prometheus" + otelapi "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" +) + +const ( + metricsNamespace = "node-healthchecker" +) + +var ( + meter otelapi.Meter +) + +func Setup(ctx context.Context) error { + for _, setup := range []func(context.Context) error{ + setupMeter, // must come first + } { + if err := setup(ctx); err != nil { + return err + } + } + + return nil +} + +func setupMeter(ctx context.Context) error { + res, err := resource.New(ctx) + if err != nil { + return err + } + + exporter, err := prometheus.New( + prometheus.WithNamespace(metricsNamespace), + prometheus.WithoutScopeInfo(), + ) + if err != nil { + return err + } + + provider := metric.NewMeterProvider( + metric.WithReader(exporter), + metric.WithResource(res), + ) + + meter = provider.Meter(metricsNamespace) + + return nil +} diff --git a/readme.md b/readme.md index d34c370..7a3caa7 100644 --- a/readme.md +++ b/readme.md @@ -6,8 +6,9 @@ Composite health (sync status) checker for blockchain nodes. ```shell ./node-healthchecker serve \ - --monitor-geth-url http://127.0.0.1:8545 \ - --monitor-lighthouse-url http://127.0.0.1:3500 + --healthcheck-geth-base-url http://127.0.0.1:8545 \ + --healthcheck-lighthouse-base-url http://127.0.0.1:3500 \ + --server-listen-address 127.0.0.1:8080 ``` ```shell diff --git a/server/healthcheck.go b/server/healthcheck.go new file mode 100644 index 0000000..2793c16 --- /dev/null +++ b/server/healthcheck.go @@ -0,0 +1,96 @@ +package server + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/flashbots/node-healthchecker/healthcheck" + "github.com/flashbots/node-healthchecker/logutils" + "go.uber.org/zap" +) + +func (s *Server) healthcheck(w http.ResponseWriter, r *http.Request) { + l := logutils.LoggerFromRequest(r) + + count := len(s.monitors) + results := make(chan *healthcheck.Result, count) + + for _, m := range s.monitors { + monitor := m // https://go.dev/blog/loopvar-preview + ctx, cancel := context.WithTimeout(r.Context(), s.cfg.Healthcheck.Timeout) + defer cancel() + go func() { + results <- monitor(ctx) + }() + } + + errs := []error{} + warns := []error{} + for count > 0 { + count-- + if res := <-results; res != nil { + if !res.Ok { + errs = append(errs, res.Err) + } else if res.Err != nil { + warns = append(warns, res.Err) + } + } + } + close(results) + + switch { + case len(errs) == 0 && len(warns) == 0: + w.WriteHeader(s.cfg.HttpStatus.Ok) + return + + case len(errs) > 0: + w.WriteHeader(s.cfg.HttpStatus.Error) + w.Header().Set("Content-Type", "application/text") + + for idx, err := range errs { + line := fmt.Sprintf("%d: error: %s\n", idx, err) + _, err := w.Write([]byte(line)) + if err != nil { + l.Error("Failed to write the response body", + zap.Error(err), + ) + } + } + offset := len(errs) + for idx, warn := range warns { + line := fmt.Sprintf("%d: warning: %s\n", offset+idx, warn) + _, err := w.Write([]byte(line)) + if err != nil { + l.Error("Failed to write the response body", + zap.Error(err), + ) + } + } + + l.Warn("Healthcheck encountered upstream error(s)", + zap.Error(errors.Join(errs...)), + zap.Int("http_status", s.cfg.HttpStatus.Error), + ) + + case len(errs) == 0 && len(warns) > 0: + w.WriteHeader(s.cfg.HttpStatus.Warning) + w.Header().Set("Content-Type", "application/text") + + for idx, warn := range warns { + line := fmt.Sprintf("%d: %s\n", idx, warn) + _, err := w.Write([]byte(line)) + if err != nil { + l.Error("Failed to write the response body", + zap.Error(err), + ) + } + } + + l.Warn("Healthcheck encountered upstream error(s)", + zap.Error(errors.Join(errs...)), + zap.Int("http_status", s.cfg.HttpStatus.Warning), + ) + } +} diff --git a/server/server.go b/server/server.go new file mode 100644 index 0000000..9962658 --- /dev/null +++ b/server/server.go @@ -0,0 +1,146 @@ +package server + +import ( + "context" + "errors" + "net/http" + "os" + "os/signal" + "syscall" + "time" + + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.uber.org/zap" + + "github.com/flashbots/node-healthchecker/config" + "github.com/flashbots/node-healthchecker/healthcheck" + "github.com/flashbots/node-healthchecker/httplogger" + "github.com/flashbots/node-healthchecker/logutils" +) + +type Server struct { + cfg *config.Config + + failure chan error + + logger *zap.Logger + server *http.Server + + monitors []healthcheck.Monitor +} + +func New(cfg *config.Config) (*Server, error) { + monitors := make([]healthcheck.Monitor, 0) + + if cfg.HealthcheckGeth.BaseURL != "" { + monitors = append(monitors, func(ctx context.Context) *healthcheck.Result { + return healthcheck.Geth(ctx, &cfg.HealthcheckGeth) + }) + } + + if cfg.HealthcheckLighthouse.BaseURL != "" { + monitors = append(monitors, func(ctx context.Context) *healthcheck.Result { + return healthcheck.Lighthouse(ctx, &cfg.HealthcheckLighthouse) + }) + } + + if cfg.HealthcheckOpNode.BaseURL != "" { + monitors = append(monitors, func(ctx context.Context) *healthcheck.Result { + return healthcheck.OpNode(ctx, &cfg.HealthcheckOpNode) + }) + } + + if cfg.HealthcheckReth.BaseURL != "" { + monitors = append(monitors, func(ctx context.Context) *healthcheck.Result { + return healthcheck.Reth(ctx, &cfg.HealthcheckReth) + }) + } + + s := &Server{ + cfg: cfg, + failure: make(chan error, 1), + logger: zap.L(), + monitors: monitors, + } + + mux := http.NewServeMux() + mux.HandleFunc("/", s.healthcheck) + mux.Handle("/metrics", promhttp.Handler()) + handler := httplogger.Middleware(s.logger, mux) + + s.server = &http.Server{ + Addr: cfg.Server.ListenAddress, + ErrorLog: logutils.NewHttpServerErrorLogger(s.logger), + Handler: handler, + MaxHeaderBytes: 1024, + ReadHeaderTimeout: 30 * time.Second, + ReadTimeout: 30 * time.Second, + WriteTimeout: 30 * time.Second, + } + + return s, nil +} + +func (s *Server) Run() error { + l := s.logger + ctx := logutils.ContextWithLogger(context.Background(), l) + + go func() { // run the server + l.Info("Blockchain node healthchecker server is going up...", + zap.String("server_listen_address", s.cfg.Server.ListenAddress), + ) + if err := s.server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + s.failure <- err + } + l.Info("Blockchain node healthchecker server is down") + }() + + errs := []error{} + { // wait until termination or internal failure + terminator := make(chan os.Signal, 1) + signal.Notify(terminator, os.Interrupt, syscall.SIGTERM) + + select { + case stop := <-terminator: + l.Info("Stop signal received; shutting down...", + zap.String("signal", stop.String()), + ) + case err := <-s.failure: + l.Error("Internal failure; shutting down...", + zap.Error(err), + ) + errs = append(errs, err) + exhaustErrors: + for { // exhaust the errors + select { + case err := <-s.failure: + l.Error("Extra internal failure", + zap.Error(err), + ) + errs = append(errs, err) + default: + break exhaustErrors + } + } + } + } + + { // stop the server + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + if err := s.server.Shutdown(ctx); err != nil { + l.Error("Blockchain node healthchecker server shutdown failed", + zap.Error(err), + ) + } + } + + switch len(errs) { + default: + return errors.Join(errs...) + case 1: + return errs[0] + case 0: + return nil + } +} diff --git a/utils/nic.go b/utils/nic.go new file mode 100644 index 0000000..6274776 --- /dev/null +++ b/utils/nic.go @@ -0,0 +1,51 @@ +package utils + +import ( + "errors" + "fmt" + "net" +) + +var ( + errPrivateIPv4FailedToDerive = errors.New("failed to derive private ipv4") +) + +func PrivateIPv4() (net.IP, error) { + interfaces, err := net.Interfaces() + if err != nil { + return nil, fmt.Errorf("%w: %w", + errPrivateIPv4FailedToDerive, err, + ) + } + + for _, ifs := range interfaces { + addrs, err := ifs.Addrs() + if err != nil { + return nil, fmt.Errorf("%w: %w", + errPrivateIPv4FailedToDerive, err, + ) + } + + for _, addr := range addrs { + ipNet, ok := addr.(*net.IPNet) + if !ok { + continue + } + + ipv4 := ipNet.IP.To4() + if ipv4 == nil { + continue + } + + if !ipv4.IsPrivate() { + continue + } + + return ipv4, nil + } + } + + return nil, fmt.Errorf("%w: found no ipv4 interfaces", + errPrivateIPv4FailedToDerive, + ) +}