46 commits
eebe839
Batch scheduler pages for 15k capacity tests
May 3, 2026
4a1cd42
Add agent playbooks for Jetmon test work
May 3, 2026
41b8c97
Instrument scheduler pool pressure for capacity tests
May 3, 2026
cde45e0
Decouple event handling from scheduler freshness
May 3, 2026
945e0bf
Index monitor rows by blog ID for scheduler writes
May 3, 2026
fe30fdc
Skip stale failures for inactive monitor sites
May 3, 2026
16b5e6e
Raise scheduler batch cap for larger capacity ladders
May 3, 2026
5ca024f
Add adaptive check-pool scaling for capacity tests
May 3, 2026
d7094fa
Bound adaptive scheduler concurrency after 80k regression
May 4, 2026
69a0c63
Smooth scheduler result persistence under capacity
May 4, 2026
1a4d6e4
Reduce permanent WPCOM failure pressure during capacity runs
May 4, 2026
f4122ad
Isolate capacity tests from legacy WPCOM notifications
May 4, 2026
3f0e239
Compact scheduler freshness updates
May 4, 2026
3a288ca
Timestamp completed checker observations
May 4, 2026
c47f52f
Document capacity activation due-state reset
May 4, 2026
e7e1857
Record outage-tainted capacity retest
May 4, 2026
50190ca
Keep adaptive scheduler windows fed
May 4, 2026
a1f1573
Narrow adaptive scheduler windows after the 50k regression
May 4, 2026
cb21e2a
Keep scheduler rounds moving through deadline tails
May 4, 2026
b36ecae
Return adaptive scheduler windows to the proven cap
May 4, 2026
e04bc11
Remove artificial scheduler burst ceilings
May 4, 2026
60c099f
Keep scheduler backpressure visible at high scale
May 4, 2026
15ac9dc
Give scheduler writes priority during failure storms
May 4, 2026
c65d0a8
Reduce per-site failure storm chatter
May 4, 2026
6c65d27
Suppress monitor-side transport storms before event fanout
May 4, 2026
f74f088
Cache verifier storm suppression during transport waves
May 4, 2026
e170224
Retry scheduler freshness writes after transient deadlocks
May 4, 2026
7eb4e55
Skip history rows for suppressed transport storms
May 4, 2026
0e78a10
Reduce scheduler dispatch overhead for capacity runs
May 4, 2026
32bca7c
Smooth adaptive checker queue submission
May 4, 2026
2530c05
Lower adaptive worker headroom for CPU margin
May 4, 2026
be0ba37
Avoid duplicate checker timeout timers
May 5, 2026
241b692
Use storm cache for mixed transport chunks
May 5, 2026
96c5dda
Increase scheduler DB write batch size for capacity runs
May 5, 2026
3b507b7
Reduce failure metric overhead after DB chunk regression
May 5, 2026
05a78af
Merge latest v2 into scalability efficiency branch
May 5, 2026
23d8ddd
Merge latest v2 into scaling efficiency branch
May 8, 2026
f001fc1
Bound event worker queue capacity by total budget
May 9, 2026
e1d6713
Remove legacy scheduler throughput ceilings
May 9, 2026
e155c87
Demote legacy tuning knobs to compatibility inputs
May 9, 2026
3954d6e
Report effective scheduler page defaults
May 9, 2026
b13e237
Scale event queue capacity from host resources
May 9, 2026
91641ea
Preserve variable interval scheduler cadence
May 9, 2026
227e4fe
Uncouple variable interval scaling from fixed rounds
May 10, 2026
fe55838
Size variable interval workers for steady state
May 10, 2026
1dcef33
Cache positive DNS answers for steady probes
May 10, 2026
6 changes: 4 additions & 2 deletions cmd/jetmon2/main.go
@@ -89,6 +89,7 @@ func runServe() {
}
cfg := config.Get()
log.Printf("config: legacy_status_projection=%s", enabledLabel(cfg.LegacyStatusProjectionEnable))
log.Printf("config: wpcom_notify=%s", enabledLabel(cfg.WPCOMNotifyEnable))
log.Printf("config: bucket_ownership=%s", bucketOwnershipLabel(cfg))
log.Printf("config: scheduler=%s", schedulerConfigLabel(cfg))
log.Printf("config: email_transport=%s", emailTransportLabel(cfg))
@@ -363,6 +364,7 @@ func cmdValidateConfig() {

cfg := config.Get()
fmt.Printf("INFO legacy_status_projection=%s\n", enabledLabel(cfg.LegacyStatusProjectionEnable))
fmt.Printf("INFO wpcom_notify=%s\n", enabledLabel(cfg.WPCOMNotifyEnable))
fmt.Printf("INFO bucket_ownership=%s\n", bucketOwnershipLabel(cfg))
fmt.Printf("INFO scheduler=%s\n", schedulerConfigLabel(cfg))
for _, line := range rolloutAdviceLines(cfg) {
@@ -732,13 +734,13 @@ func schedulerConfigLabel(cfg *config.Config) string {
if cfg.UseVariableCheckIntervals {
return fmt.Sprintf(
"variable_intervals fetch_page_size=%d idle_poll=%s",
cfg.DatasetSize,
orchestrator.ConfiguredFetchPageSize(cfg),
orchestrator.VariableIntervalPollInterval(),
)
}
return fmt.Sprintf(
"fixed_rounds fetch_page_size=%d min_round_interval=%s",
cfg.DatasetSize,
orchestrator.ConfiguredFetchPageSize(cfg),
time.Duration(cfg.MinTimeBetweenRoundsSec)*time.Second,
)
}
12 changes: 12 additions & 0 deletions cmd/jetmon2/main_test.go
@@ -64,6 +64,18 @@ func TestEnvOrDefault(t *testing.T) {
}
}

func TestSchedulerConfigLabelUsesEffectivePageSize(t *testing.T) {
cfg := &config.Config{
UseVariableCheckIntervals: true,
DatasetSize: 0,
}

got := schedulerConfigLabel(cfg)
if !strings.Contains(got, "fetch_page_size=100") {
t.Fatalf("schedulerConfigLabel() = %q, want effective default fetch page size", got)
}
}

func TestReadPIDFile(t *testing.T) {
dir := t.TempDir()
pidPath := filepath.Join(dir, "test.pid")
14 changes: 2 additions & 12 deletions config/config-sample.json
@@ -1,28 +1,19 @@
{
"DEBUG" : false,
"NUM_WORKERS" : 60,
"NUM_TO_PROCESS" : 40,
"DATASET_SIZE" : 100,
"WORKER_MAX_MEM_MB" : 0,

"LEGACY_STATUS_PROJECTION_ENABLE" : true,

"BUCKET_TOTAL" : 1000,
"BUCKET_TARGET" : 500,
"BUCKET_HEARTBEAT_GRACE_SEC" : 600,

"BATCH_SIZE" : 32,
"AUTH_TOKEN" : "<AUTH_TOKEN>",

"VERIFLIER_BATCH_SIZE" : 200,
"SQL_UPDATE_BATCH" : 1,
"DB_CONFIG_UPDATES_MIN" : 10,
"PEER_OFFLINE_LIMIT" : 3,

"NUM_OF_CHECKS" : 3,
"TIME_BETWEEN_CHECKS_SEC" : 30,
"NUM_OF_CHECKS" : 3,

"ALERT_COOLDOWN_MINUTES" : 30,
"WPCOM_NOTIFY_ENABLE" : true,

"STATS_UPDATE_INTERVAL_MS" : 10000,
"STATSD_SEND_MEM_USAGE" : false,
@@ -33,7 +24,6 @@
"BODY_READ_MAX_MS" : 250,
"KEYWORD_READ_MAX_BYTES" : 1048576,
"KEYWORD_READ_MAX_MS" : 0,
"USE_VARIABLE_CHECK_INTERVALS" : true,

"LOG_FORMAT" : "text",
"DASHBOARD_PORT" : 8080,
69 changes: 47 additions & 22 deletions config/config.readme
@@ -2,18 +2,28 @@ DEBUG
Set to true to enable more verbose log messages. Default: false.

NUM_WORKERS
Initial number of goroutines in the check pool. The pool auto-scales between 1 and NUM_WORKERS based on queue depth. Default: 60.
Compatibility baseline for the check pool. Omit it or set it to 0 for the
default auto baseline. Jetmon auto-scales up when due backlog cannot fit inside
the configured freshness window; NUM_WORKERS is not the burst ceiling in
variable-interval mode. The adaptive ceiling is derived from due-site count,
timeout, round freshness budget, and the host file-descriptor budget. Watch
scheduler pool metrics and host FD limits rather than manually raising
NUM_WORKERS for each fleet size.
Default: 60.
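
The adaptive ceiling described above can be sketched as a small function. This
is an illustrative model only, with assumed names (`adaptiveCeiling`) and an
assumed formula — the real derivation lives in the scheduler and may weight
these inputs differently:

```go
package main

import "fmt"

// adaptiveCeiling sketches how a worker ceiling could be derived from the
// inputs the readme names: due-site count, check timeout, the round freshness
// budget, the host FD budget, and the NUM_WORKERS compatibility baseline.
// Hypothetical formula, not the actual Jetmon implementation.
func adaptiveCeiling(dueSites int, timeoutSec, freshnessSec float64, fdBudget, baseline int) int {
	// Workers needed so the due backlog drains inside the freshness window,
	// assuming each check can take up to the full timeout.
	need := int(float64(dueSites)*timeoutSec/freshnessSec) + 1
	if need < baseline {
		need = baseline // never drop below the compatibility baseline
	}
	if need > fdBudget {
		need = fdBudget // each in-flight check holds at least one socket
	}
	return need
}

func main() {
	// 15,000 due sites, 10 s timeout, 300 s freshness window, 4,096 FDs:
	fmt.Println(adaptiveCeiling(15000, 10, 300, 4096, 60)) // 501
}
```

The point of the shape: raising NUM_WORKERS by hand cannot beat the
due-count term, and the FD budget wins at the top end.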

NUM_TO_PROCESS
Legacy compatibility setting retained so copied v1-style configs continue to
parse. The Go scheduler uses DATASET_SIZE as the fetch page and NUM_WORKERS as
the concurrency guardrail; NUM_TO_PROCESS does not cap scheduler throughput.
parse. It is ignored by Jetmon v2 and does not cap scheduler throughput.
Default: 40.

DATASET_SIZE
Maximum number of sites to fetch from the database per scheduler page. The
orchestrator keeps fetching additional pages until due work is drained, so this
is a database/query guardrail rather than a cap on total checks per round.
Optional minimum number of sites to fetch from the database per scheduler page.
Omit it or set it to 0 for the default floor. The orchestrator keeps fetching
additional pages until due work is drained, and raises the effective page size
automatically for large due backlogs so one freshness window does not require
hundreds of tiny SQL pages. This is a compatibility floor for copied v1-style
configs, not a cap on total checks per round or a setting operators should tune
for each fleet size.
Default: 100.
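
The floor-plus-adaptive behavior can be illustrated with a sketch. The growth
heuristic here (drain the backlog in roughly 50 pages) and the function name
are assumptions for illustration; only the 100-row floor is documented:

```go
package main

import "fmt"

const defaultFetchPageFloor = 100 // documented default when DATASET_SIZE is 0 or omitted

// effectiveFetchPageSize models DATASET_SIZE as a floor that grows with the
// due backlog, so a large wave does not require hundreds of tiny SQL pages.
func effectiveFetchPageSize(configured, dueBacklog int) int {
	page := configured
	if page <= 0 {
		page = defaultFetchPageFloor
	}
	// Illustrative heuristic: aim to drain the backlog in about 50 pages.
	if adaptive := dueBacklog / 50; adaptive > page {
		page = adaptive
	}
	return page
}

func main() {
	fmt.Println(effectiveFetchPageSize(0, 0))       // 100 (default floor)
	fmt.Println(effectiveFetchPageSize(100, 80000)) // 1600 (scaled for backlog)
}
```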

WORKER_MAX_MEM_MB
@@ -34,7 +44,10 @@ BUCKET_TOTAL
Total number of buckets in the system across all hosts. Must match the range of bucket_no values in the jetpack_monitor_sites table. Default: 1000.

BUCKET_TARGET
Number of buckets this host should claim on startup. Used for initial distribution across hosts. Default: 500.
Legacy compatibility setting retained so copied v1-style configs continue to
parse. Dynamic ownership now ignores this value and evenly assigns the full
BUCKET_TOTAL range across active hosts. Use PINNED_BUCKET_MIN/MAX for the
temporary v1-to-v2 one-host-at-a-time migration shape. Default: 0.

BUCKET_HEARTBEAT_GRACE_SEC
Seconds after a host's last heartbeat before its buckets are considered available for reclaiming by another host. Default: 600.
@@ -47,12 +60,23 @@ Deprecated v1 names accepted as aliases for PINNED_BUCKET_MIN / PINNED_BUCKET_MAX.

BATCH_SIZE
Legacy compatibility setting retained so copied v1-style configs continue to
parse. It is not used by the Go scheduler; scheduler DB paging is controlled by
DATASET_SIZE. Default: 32.
parse. It is ignored by Jetmon v2; scheduler DB paging uses the DATASET_SIZE
floor plus adaptive page sizing. Default: 32.

SQL_UPDATE_BATCH / DB_CONFIG_UPDATES_MIN
Legacy compatibility settings retained so copied v1-style configs continue to
parse. They are ignored by Jetmon v2; freshness/history/event writes use
internal batching sized by the Go implementation and database behavior.

AUTH_TOKEN
Shared secret used to authenticate outbound WPCOM API calls. Required.

WPCOM_NOTIFY_ENABLE
Controls the legacy v1-compatible WPCOM status-change notification path. Keep
enabled in production so Jetmon v2 remains a drop-in replacement for the v1
notification contract. Set to false in synthetic capacity, local, or isolated
test services so fake benchmark blog IDs do not contact WPCOM. Default: true.

VERIFLIER_BATCH_SIZE
Legacy compatibility setting retained for verifier batching compatibility. The
current JSON-over-HTTP verifier path sends one verification request per site
@@ -85,9 +109,11 @@ Minimum minutes between a site going down and a confirmed-down notification being sent.

MIN_TIME_BETWEEN_ROUNDS_SEC
Minimum seconds between fixed-cadence full-fleet passes when
USE_VARIABLE_CHECK_INTERVALS is false. When USE_VARIABLE_CHECK_INTERVALS is
true, the scheduler uses a short idle poll and the SQL due predicate decides
which sites are ready to check. Default: 300.
USE_VARIABLE_CHECK_INTERVALS is explicitly false. In the default
variable-interval mode, this is parsed for v1 config compatibility but does
not control scheduler throughput, batching, or freshness. Variable-interval
freshness is controlled by each site's check_interval and maintained
next_check_at timestamp. Default: 300.

NET_COMMS_TIMEOUT
Default HTTP request timeout in seconds. Can be overridden per-site via timeout_seconds column. Default: 10.
@@ -105,16 +131,15 @@ KEYWORD_READ_MAX_MS
Keyword body-read budget in milliseconds. Set to 0 to inherit the request timeout envelope (NET_COMMS_TIMEOUT or per-site timeout_seconds). If set > 0 and exhausted before the keyword is found, the check fails as timeout (ErrorTimeout), not keyword mismatch. Default: 0.

USE_VARIABLE_CHECK_INTERVALS
Set to true to respect the check_interval column per site instead of running
all sites every fixed-cadence pass. Recommended for production freshness
testing because newly due sites are discovered without waiting for
MIN_TIME_BETWEEN_ROUNDS_SEC. Jetmon maintains next_check_at after each
completed check so due-site selection can use an indexed timestamp range
instead of recomputing each row's interval on every poll. Successful checks use
the site's normal check_interval; failed checks get a bounded one-minute
follow-up when the normal interval is longer. Default in the sample config:
true. Minimal configs that omit the key retain the compatibility default of
false.
Defaults to true. Jetmon respects the check_interval column per site instead of
running all sites every fixed-cadence pass, and newly due sites are discovered
without waiting for MIN_TIME_BETWEEN_ROUNDS_SEC. Jetmon maintains next_check_at
after each completed check so due-site selection can use an indexed timestamp
range instead of recomputing each row's interval on every poll. Successful
checks use the site's normal check_interval; failed checks get a bounded
one-minute follow-up when the normal interval is longer. Explicitly setting this
to false is kept for local debugging and v1-behavior comparison, not production
capacity tuning.

LOG_FORMAT
Log output format. Set to "json" for structured logging (e.g. for log aggregators), or "text" for human-readable output. Default: "text".
50 changes: 33 additions & 17 deletions docs/architecture.md
@@ -97,8 +97,10 @@ This is the end-to-end path from database query to WPCOM notification.
│ orchestrator.runRound() │
│ dbHeartbeat() ── UPDATE jetmon_hosts SET last_heartbeat │
│ ClaimBuckets() ── rebalance bucket ranges (each round) │
│ dbGetSitesForBucket() ── SELECT due sites in DATASET_SIZE pages │
│ ORDER BY next_check_at / last_checked_at │
│ dbGetSitesForBucketPage() ─ SELECT due sites in adaptive pages │
│ (DATASET_SIZE is the floor) │
│ with a keyset cursor over │
│ next_check_at / last_checked_at │
└──────────────────────────────────────────────────────────────────────┘
│ []db.Site
@@ -141,8 +143,7 @@ This is the end-to-end path from database query to WPCOM notification.
│ retries │ │ Stage 1 — Local retry │
│ .clear() │ │ retries.record(res) → failCount++ │
│ │ │ if failCount < NumOfChecks (default 3): │
│ if site was │ │ auditLog("retry_dispatched") │
│ previously │ │ ← return; retry next round │
│ if site was │ │ ← return; retry next round │
│ down: │ │ │
│ dbUpdate │ │ Stage 2 — Veriflier escalation │
│ Status() │ │ if failCount >= NumOfChecks: │
@@ -165,6 +166,14 @@ Failure Escalation Detail

```
Local check fails (N times)
│ Broad timeout/connect-error storm and
│ Veriflier samples can reach affected URLs?
├──────────────────────────► suppress per-site failure events
│ for that chunk; count as
│ monitor-side uncertainty
│ (short-lived cached verdict
│ covers repeated chunks)
│ failCount < NumOfChecks?
├──────────────────────────► queue in retryQueue, retry next round
@@ -203,16 +212,18 @@ orchestrator.Run()
└── loop (until ctx.Done()):
├─ config.Get() // fresh config snapshot each round
├─ pool.SetMaxSize(cfg.NumWorkers) // apply hot-reloaded worker limit
├─ pool.SetMaxSize(cfg.NumWorkers) // apply hot-reloaded baseline
├─ refreshVeriflierClients(cfg) // rebuild list only on change
├─ runRound()
│ │
│ ├─ dbHeartbeat()
│ ├─ ClaimBuckets() // rebalance every round
│ ├─ dbGetSitesForBucket() // fetch due work in DATASET_SIZE pages
│ ├─ sample due count
│ ├─ pool.WarmTo(adaptive ceiling)
│ ├─ dbGetSitesForBucketPage() // fetch due work in adaptive pages
│ │
│ ├─ for each scheduler page:
│ ├─ for each scheduler batch built from multiple DB pages:
│ │ pool.Submit(checker.Request) // waits/collects on backpressure
│ │
│ ├─ collect results (deadline-bounded)
@@ -235,12 +246,14 @@ Checker Pool — Auto-Scaling
----------------------------

The pool maintains a live set of worker goroutines bounded by `[minSize, maxSize]`.
The scheduler can raise `maxSize` and pre-warm workers when due-count sampling
shows the baseline would miss the freshness window.

```
NewPool(initial=30, min=1, max=60)
NewPoolWithQueueCapacity(initial=30, min=1, max=60, queueCapacity=120)
├─ work channel (cap = max×2 = 120)
├─ results channel (cap = max×2 = 120)
├─ work channel (bounded by explicit queueCapacity)
├─ results channel (bounded by explicit queueCapacity)
├─ retire channel (cap = max = 60)
└─ autoScale() goroutine (every 5 s)

@@ -252,6 +265,9 @@ The pool maintains a live set of worker goroutines bounded by `[minSize, maxSize]`.
│ Scale UP: queue > current && current < maxSize │
│ spawn min(queue-current, maxSize-current) workers│
│ │
│ WarmTo: scheduler can immediately spawn to maxSize │
│ after due-count sampling for large due waves │
│ │
│ Scale DOWN: current > maxSize │
│ retire (current - maxSize) workers immediately │
│ │
@@ -350,13 +366,13 @@ MySQL transaction. Expired hosts (heartbeat missed by `BucketHeartbeatGraceSec`)
are removed and their ranges redistributed.

```
jetmon_hosts (3 active hosts, BucketTotal=1000, BucketTarget=500):
jetmon_hosts (3 active hosts, BucketTotal=1000):

Hosts sorted by host_id: [host-a, host-b, host-c]
assignBucketRanges() water-fill:
host-a → buckets 0– 499 (capped at BucketTarget=500)
host-b → buckets 500– 749 (250 remaining, 2 hosts left)
host-c → buckets 750– 999
assignBucketRanges() even split:
host-a → buckets 0– 333
host-b → buckets 334– 666
host-c → buckets 667– 999

host-b goes offline (heartbeat expires):
host-a → buckets 0– 499
@@ -440,8 +456,8 @@ Database Tables

jetmon_audit_log Operational trail for compliance/debugging
event_type check | wpcom_sent | wpcom_retry |
retry_dispatched | veriflier_sent |
veriflier_result | maintenance_active |
veriflier_sent | veriflier_result |
maintenance_active |
alert_suppressed | api_access | config_reload
blog_id, source, http_code, error_code, rtt_ms
