diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml index f90f791f..5b9789ea 100644 --- a/.github/workflows/checks.yaml +++ b/.github/workflows/checks.yaml @@ -83,6 +83,9 @@ jobs: - name: Install docker compose run: ./scripts/ci-setup-docker-compose.sh + - name: Build playground utils + run: ./scripts/ci-build-playground-utils.sh + - name: Run unit tests run: make integration-test diff --git a/.github/workflows/docker-utils-release.yaml b/.github/workflows/docker-utils-release.yaml index 270f186d..f6f61816 100644 --- a/.github/workflows/docker-utils-release.yaml +++ b/.github/workflows/docker-utils-release.yaml @@ -10,6 +10,7 @@ on: paths: - 'cl-proxy/**' - 'mev-boost-relay/**' + - 'healthmon/**' - 'go.mod' - 'go.sum' - 'Dockerfile' diff --git a/Dockerfile b/Dockerfile index 36d24568..8686f67b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,4 +13,5 @@ COPY . . # Build all applications with CGo enabled RUN go build -o /usr/local/bin/cl-proxy ./cl-proxy/cmd/main.go && \ - go build -o /usr/local/bin/mev-boost-relay ./mev-boost-relay/cmd/main.go + go build -o /usr/local/bin/mev-boost-relay ./mev-boost-relay/cmd/main.go && \ + go build -o /usr/local/bin/healthmon ./healthmon/cmd/main.go diff --git a/go.mod b/go.mod index e924ae97..8f9c5145 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,9 @@ require ( github.com/dustinkirkland/golang-petname v0.0.0-20240428194347-eebcea082ee0 github.com/ethereum/go-ethereum v1.15.10 github.com/flashbots/go-boost-utils v1.9.1-0.20250819134059-e5294cb450c9 + github.com/flashbots/go-template v1.0.0 github.com/flashbots/mev-boost-relay v0.32.0-rc2 + github.com/go-chi/httplog/v2 v2.1.1 github.com/hashicorp/go-uuid v1.0.3 github.com/holiman/uint256 v1.3.2 github.com/otiai10/copy v1.14.1 @@ -65,6 +67,7 @@ require ( github.com/ferranbt/fastssz v0.1.4 // indirect github.com/flashbots/go-utils v0.11.0 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect + github.com/go-chi/chi/v5 v5.2.1 // indirect 
github.com/go-gorp/gorp/v3 v3.1.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect @@ -156,7 +159,7 @@ require ( golang.org/x/crypto v0.37.0 // indirect golang.org/x/net v0.38.0 // indirect golang.org/x/oauth2 v0.26.0 // indirect - golang.org/x/sys v0.32.0 // indirect + golang.org/x/sys v0.33.0 // indirect golang.org/x/term v0.31.0 // indirect golang.org/x/text v0.24.0 // indirect golang.org/x/time v0.9.0 // indirect diff --git a/go.sum b/go.sum index 9b8477a7..01ee8efb 100644 --- a/go.sum +++ b/go.sum @@ -133,6 +133,8 @@ github.com/ferranbt/fastssz v0.1.4 h1:OCDB+dYDEQDvAgtAGnTSidK1Pe2tW3nFV40XyMkTeD github.com/ferranbt/fastssz v0.1.4/go.mod h1:Ea3+oeoRGGLGm5shYAeDgu6PGUlcvQhE2fILyD9+tGg= github.com/flashbots/go-boost-utils v1.9.1-0.20250819134059-e5294cb450c9 h1:uXedHkX9U9q6tej9VMlNhwGkcGpSK9x+kxBI5xo8Biw= github.com/flashbots/go-boost-utils v1.9.1-0.20250819134059-e5294cb450c9/go.mod h1:52faeojMg+vjCtjuekWpRzvYU8nlD1ZEPRvRJ2q3bv0= +github.com/flashbots/go-template v1.0.0 h1:ODeaR/kLOQcWaA8tKgTrdZD1BvxXJY53eVLbFfnmxiQ= +github.com/flashbots/go-template v1.0.0/go.mod h1:fBFD7uLaDZ0bEyNKNMx+ByJBg/B2oJKWeU2G7kZtrcY= github.com/flashbots/go-utils v0.11.0 h1:MuI9OOl40MukSL2ucKKQG1sxxl5Cqjla41TRubGNu0w= github.com/flashbots/go-utils v0.11.0/go.mod h1:i4xxEB6sHDFfNWEIfh+rP6nx3LxynEn8AOZa05EYgwA= github.com/flashbots/mev-boost-relay v0.32.0-rc2 h1:0//XfKzBmSnmvXI4iLiTbw0WxElo8Nc6mp7DWPPSsxU= @@ -146,6 +148,10 @@ github.com/gballet/go-libpcsclite v0.0.0-20191108122812-4678299bea08 h1:f6D9Hr8x github.com/gballet/go-libpcsclite v0.0.0-20191108122812-4678299bea08/go.mod h1:x7DCsMOv1taUwEWCzT4cmDeAkigA5/QCwUodaVOe8Ww= github.com/getsentry/sentry-go v0.27.0 h1:Pv98CIbtB3LkMWmXi4Joa5OOcwbmnX88sF5qbK3r3Ps= github.com/getsentry/sentry-go v0.27.0/go.mod h1:lc76E2QywIyW8WuBnwl8Lc4bkmQH4+w1gwTf25trprY= +github.com/go-chi/chi/v5 v5.2.1 h1:KOIHODQj58PmL80G2Eak4WdvUzjSJSm0vG72crDCqb8= +github.com/go-chi/chi/v5 v5.2.1/go.mod 
h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops= +github.com/go-chi/httplog/v2 v2.1.1 h1:ojojiu4PIaoeJ/qAO4GWUxJqvYUTobeo7zmuHQJAxRk= +github.com/go-chi/httplog/v2 v2.1.1/go.mod h1:/XXdxicJsp4BA5fapgIC3VuTD+z0Z/VzukoB3VDc1YE= github.com/go-gorp/gorp/v3 v3.1.0 h1:ItKF/Vbuj31dmV4jxA1qblpSwkl9g1typ24xoe70IGs= github.com/go-gorp/gorp/v3 v3.1.0/go.mod h1:dLEjIyyRNiXvNZ8PSmzpt1GsWAUK8kjVhEpjH8TixEw= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -538,8 +544,8 @@ golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= diff --git a/healthmon/README.md b/healthmon/README.md new file mode 100644 index 00000000..967d273c --- /dev/null +++ b/healthmon/README.md @@ -0,0 +1,2 @@ + +# Healthmon diff --git a/healthmon/cmd/main.go b/healthmon/cmd/main.go new file mode 100644 index 00000000..202fb6e5 --- /dev/null +++ b/healthmon/cmd/main.go @@ -0,0 +1,19 @@ +package main + +import ( + "flag" + + "github.com/flashbots/builder-playground/healthmon" +) + +func main() { + var config healthmon.Config + + flag.StringVar(&config.Chain, "chain", "", "Type of ethereum 
chain to monitor (beacon or execution)") + flag.StringVar(&config.URL, "url", "", "Full node URL (e.g., http://localhost:8545)") + flag.StringVar(&config.Addr, "service.addr", "localhost:21171", "Address for the health check service to listen on (e.g., ':21171')") + flag.IntVar(&config.BlockTimeSeconds, "blocktime", 0, "expected block time in seconds (optional)") + flag.Parse() + + healthmon.Start(&config) +} diff --git a/healthmon/healthmon.go b/healthmon/healthmon.go new file mode 100644 index 00000000..b1bbf03c --- /dev/null +++ b/healthmon/healthmon.go @@ -0,0 +1,249 @@ +package healthmon + +import ( + "context" + "encoding/json" + "io" + "net/http" + "os" + "sync/atomic" + "time" + + "github.com/ethereum/go-ethereum" + "github.com/ethereum/go-ethereum/core/types" + "github.com/ethereum/go-ethereum/ethclient" + mevboostrelay "github.com/flashbots/builder-playground/mev-boost-relay" + "github.com/flashbots/go-template/common" + "github.com/flashbots/mev-boost-relay/beaconclient" + mevRCommon "github.com/flashbots/mev-boost-relay/common" + "github.com/go-chi/httplog/v2" +) + +var isHealthy atomic.Bool + +type Config struct { + Chain string + URL string + Addr string + BlockTimeSeconds int +} + +func Start(config *Config) { + log := common.SetupLogger(&common.LoggingOpts{ + Version: common.Version, + }) + + updates := make(chan blockUpdate, 10) + log.Info("Started", "chain", config.Chain, "url", config.URL) + + switch config.Chain { + case "beacon": + go monitorBeacon(log, context.Background(), config.URL, updates) + case "execution": + go monitorExecution(log, context.Background(), config.URL, updates) + default: + log.Error("Unknown chain", "chain", config.Chain) + os.Exit(1) + } + + go monitor(log, config.BlockTimeSeconds, context.Background(), updates) + + log.Info("Starting service server", "addr", config.Addr) + + http.HandleFunc("/ready", statusHandler) + http.ListenAndServe(config.Addr, nil) +} + +func statusHandler(w http.ResponseWriter, req 
*http.Request) { + if isHealthy.Load() { + io.WriteString(w, "OK") + } else { + w.WriteHeader(503) + io.WriteString(w, "NOT READY") + } +} + +func setHealthy(healthy bool) { + isHealthy.Store(healthy) +} + +type monitorState struct { + log *httplog.Logger + firstBlockUpdate *blockUpdate + blockTimeSeconds int + blockTimer *time.Timer +} + +func newMonitorState(log *httplog.Logger, blockTimeSeconds int) *monitorState { + // this timer will start after the blocks are received and we can figure out the block time + blockTimer := time.NewTimer(0) + blockTimer.Stop() + + return &monitorState{ + log: log, + firstBlockUpdate: nil, + blockTimeSeconds: blockTimeSeconds, + blockTimer: blockTimer, + } +} + +var wiggleRoomSeconds = 1 + +func (m *monitorState) handleUpdate(update blockUpdate) { + m.log.Info("Processing block update", "number", update.Number, "timestamp", update.Timestamp) + + if m.firstBlockUpdate == nil { + m.firstBlockUpdate = &update + } + + if m.blockTimeSeconds == 0 { + // if block time is not known, either: + // - use the block time provided in the update (beacon) + // - use the difference between the first and current block (execution) + if update.BlockTime != 0 { + m.log.Info("Using block time from update", "block time seconds", update.BlockTime) + m.blockTimeSeconds = update.BlockTime + } else if m.firstBlockUpdate != nil && update.Number > m.firstBlockUpdate.Number { + blocktime := update.Timestamp.Sub(m.firstBlockUpdate.Timestamp) + m.log.Info("Calculated block time from timestamps", "block time seconds", blocktime) + m.blockTimeSeconds = int(blocktime.Seconds()) + } + } + + if m.blockTimeSeconds != 0 { + m.log.Info("Resetting block timer", "blockTimeSeconds", m.blockTimeSeconds) + m.blockTimer.Reset(time.Duration(m.blockTimeSeconds+wiggleRoomSeconds) * time.Second) + } +} + +func monitor(log *httplog.Logger, blockTimeSeconds int, ctx context.Context, updates <-chan blockUpdate) { + state := newMonitorState(log, blockTimeSeconds) + + for { + select { 
+ case <-ctx.Done(): + return + case update := <-updates: + // receiving a block always means healthy since the node is producing blocks + // and the unhealthy state is set during the block timer timeout + setHealthy(true) + + state.handleUpdate(update) + + case <-state.blockTimer.C: + log.Warn("Block timer expired, setting unhealthy") + setHealthy(false) + } + } +} + +type blockUpdate struct { + Number uint64 + Timestamp time.Time + BlockTime int +} + +func monitorBeacon(log *httplog.Logger, ctx context.Context, url string, updates chan<- blockUpdate) { + bLog := mevRCommon.LogSetup(false, "info") + beaconClient := beaconclient.NewProdBeaconInstance(bLog, url, url) + + var lastSlot *uint64 + var blockTime int + + for { + select { + case <-ctx.Done(): + return + case <-time.After(500 * time.Millisecond): + sync, err := beaconClient.SyncStatus() + if err != nil { + log.Error("Failed to get beacon sync status", "err", err) + continue + } + + if sync.IsSyncing { + log.Debug("Beacon node is syncing", "headSlot", sync.HeadSlot) + continue + } + + if blockTime == 0 { + spec, err := mevboostrelay.GetSpec(url) + if err != nil { + log.Error("Failed to get beacon spec", "err", err) + } else { + blockTime = int(spec.SecondsPerSlot) + log.Info("Fetched beacon spec", "blockTime", blockTime) + } + } + + if lastSlot == nil || *lastSlot < sync.HeadSlot { + lastSlot = &sync.HeadSlot + log.Info("New beacon block received", "slot", sync.HeadSlot) + updates <- blockUpdate{Number: sync.HeadSlot, BlockTime: blockTime} + } + } + } +} + +func monitorExecution(log *httplog.Logger, ctx context.Context, url string, updates chan<- blockUpdate) { + client, err := ethclient.Dial(url) + if err != nil { + log.Error("Failed to connect to execution client", "err", err) + os.Exit(1) + } + + getLatestBlock := func() (*types.Header, error) { + // We use a manual RPC call instead of the Geth SDK's HeaderByNumber because + // we query both OP and normal L1 clients which have different transaction types 
+ // that cannot be decoded with a single Geth SDK. The Geth SDK only returns blocks + // with transactions fully decoded (not just hashes), so we call the RPC directly + // to avoid transaction decoding issues. + var raw json.RawMessage + if err := client.Client().CallContext(ctx, &raw, "eth_getBlockByNumber", "latest", false); err != nil { + return nil, err + } + + // Decode only the header fields; the transaction list is not decoded. + var head *types.Header + if err := json.Unmarshal(raw, &head); err != nil { + return nil, err + } + // When the block is not found, the API returns JSON null. + if head == nil { + return nil, ethereum.NotFound + } + return head, nil + } + + var lastBlock *uint64 + for { + select { + case <-ctx.Done(): + return + case <-time.After(500 * time.Millisecond): + sync, err := client.SyncProgress(ctx) + if err != nil { + log.Error("Failed to get execution sync progress", "err", err) + continue + } + + if sync != nil && !sync.Done() { + log.Debug("Execution node is syncing", "currentBlock", sync.CurrentBlock, "highestBlock", sync.HighestBlock) + continue + } + header, err := getLatestBlock() + if err != nil { + log.Error("Failed to get execution block number", "err", err) + continue + } + num := header.Number.Uint64() + if lastBlock == nil || num > *lastBlock { + lastBlock = &num + timestamp := time.Unix(int64(header.Time), 0) + + log.Info("New execution block received", "number", num) + updates <- blockUpdate{Number: num, Timestamp: timestamp} + } + } + } +} diff --git a/healthmon/healthmon_test.go b/healthmon/healthmon_test.go new file mode 100644 index 00000000..0396af94 --- /dev/null +++ b/healthmon/healthmon_test.go @@ -0,0 +1,73 @@ +package healthmon + +import ( + "testing" + "time" + + "github.com/flashbots/go-template/common" + "github.com/go-chi/httplog/v2" + "github.com/stretchr/testify/assert" +) + +func TestHealthmonMonitor_BlockTimeInUpdate(t *testing.T) { + // health monitor takes the block time from the BlockTime field of the update + m := 
newMonitorState(testLogger(), 0) + + m.handleUpdate(blockUpdate{ + Number: 1, + BlockTime: 2, + }) + + assert.Equal(t, m.blockTimeSeconds, 2) + waitToTrigger(t, m) +} + +func TestHealthmonMonitor_BlockTimeDiff(t *testing.T) { + // health monitor detects block time from difference between first and current block + m := newMonitorState(testLogger(), 0) + + now := time.Now() + now1 := now.Add(2 * time.Second) + + m.handleUpdate(blockUpdate{ + Number: 1, + Timestamp: now, + }) + + // monitor does not yet have enough info to calculate block time + assert.Equal(t, m.blockTimeSeconds, 0) + + m.handleUpdate(blockUpdate{ + Number: 2, + Timestamp: now1, + }) + + assert.Equal(t, m.blockTimeSeconds, 2) + waitToTrigger(t, m) +} + +func TestHealthmonMonitor_ResetTimer(t *testing.T) { + m := newMonitorState(testLogger(), 2) + + m.handleUpdate(blockUpdate{}) + waitToTrigger(t, m) + + m.handleUpdate(blockUpdate{}) + waitToTrigger(t, m) +} + +func testLogger() *httplog.Logger { + logger := common.SetupLogger(&common.LoggingOpts{ + Version: common.Version, + }) + return logger +} + +func waitToTrigger(t *testing.T, m *monitorState) { + // this function waits for the block timer to fire and fails the test after blockTimeSeconds+1 seconds + select { + case <-m.blockTimer.C: + case <-time.After(time.Duration(m.blockTimeSeconds+1) * time.Second): + t.Fatal("timeout waiting for block timer to trigger") + } +} diff --git a/mev-boost-relay/mev-boost-relay.go b/mev-boost-relay/mev-boost-relay.go index deb772d9..76d83e5d 100644 --- a/mev-boost-relay/mev-boost-relay.go +++ b/mev-boost-relay/mev-boost-relay.go @@ -89,7 +89,7 @@ func New(config *Config) (*MevBoostRelay, error) { log.Info("Beacon client synced") // get the spec and genesis info to compute the eth network details - spec, err := getSpec(config.BeaconClientAddr) + spec, err := GetSpec(config.BeaconClientAddr) if err != nil { return nil, fmt.Errorf("failed to get spec: %w", err) } @@ -443,7 +443,7 @@ type Spec struct { FuluForkVersion string 
`json:"FULU_FORK_VERSION"` //nolint:tagliatelle } -func getSpec(beaconURL string) (*Spec, error) { +func GetSpec(beaconURL string) (*Spec, error) { uri := fmt.Sprintf("%s/eth/v1/config/spec", beaconURL) resp, err := http.Get(uri) diff --git a/playground/components.go b/playground/components.go index e6b6b987..5f1e1d6e 100644 --- a/playground/components.go +++ b/playground/components.go @@ -382,7 +382,7 @@ func (o *OpGeth) Apply(manifest *Manifest) { WithArtifact("/data/jwtsecret", "jwtsecret"). WithArtifact("/data/p2p_key.txt", o.Enode.Artifact) - UseHealthmon(manifest, svc) + UseHealthmon(manifest, svc, healthmonExecution) } type RethEL struct { @@ -470,7 +470,7 @@ func (r *RethEL) Apply(manifest *Manifest) { svc.WithArgs("--ipcpath", "/data_reth/reth.ipc") } - UseHealthmon(manifest, svc) + UseHealthmon(manifest, svc, healthmonExecution) if r.UseNativeReth { // we need to use this otherwise the db cannot be binded @@ -515,14 +515,9 @@ func (l *LighthouseBeaconNode) Apply(manifest *Manifest) { ). WithArtifact("/data/testnet-dir", "testnet"). WithArtifact("/data/jwtsecret", "jwtsecret"). - WithVolume("data", "/data_beacon"). - WithReady(ReadyCheck{ - QueryURL: "http://localhost:3500/eth/v1/node/syncing", - Interval: 1 * time.Second, - Timeout: 30 * time.Second, - Retries: 3, - StartPeriod: 1 * time.Second, - }) + WithVolume("data", "/data_beacon") + + UseHealthmon(manifest, svc, healthmonBeacon) if l.MevBoostNode != "" { svc.WithArgs( @@ -643,7 +638,7 @@ func (o *OpReth) Apply(manifest *Manifest) { WithArtifact("/data/l2-genesis.json", "l2-genesis.json"). WithVolume("data", "/data_op_reth") - UseHealthmon(manifest, svc) + UseHealthmon(manifest, svc, healthmonExecution) } type MevBoost struct { @@ -890,16 +885,24 @@ func (b *BuilderHub) Apply(manifest *Manifest) { }) } -func UseHealthmon(m *Manifest, s *Service) { - m.NewService(s.Name+"_healthmon"). - WithImage("ghcr.io/flashbots/ethereum-healthmon"). - WithTag("v0.0.1"). 
- // TODO: Use this also for beacon node - WithArgs("--chain", "execution", "--url", Connect(s.Name, "http")). +const ( + healthmonBeacon = "beacon" + healthmonExecution = "execution" +) + +func UseHealthmon(m *Manifest, s *Service, chain string) { + healthmonName := s.Name + "_healthmon" + + s.WithLabel(healthCheckSidecarLabel, healthmonName) + m.NewService(healthmonName). + WithImage("docker.io/flashbots/playground-utils"). + WithTag("latest"). + WithEntrypoint("healthmon"). + WithArgs("--chain", chain, "--url", Connect(s.Name, "http")). WithReady(ReadyCheck{ Test: []string{"CMD", "wget", "--spider", "--quiet", "http://127.0.0.1:21171/ready"}, Interval: 1 * time.Second, - Timeout: 10 * time.Second, + Timeout: 10 * time.Minute, Retries: 20, StartPeriod: 1 * time.Second, })