pkg/cmd/drtprod/configs/archived/2025_12_drt_scale_300.yaml (207 additions, 0 deletions)
@@ -0,0 +1,207 @@
# YAML for creating and configuring the drt-scale cluster. This also configures Datadog.
# Build the drtprod and roachtest binaries (using --cross=linux) before running this script.
#
# Planned Execution Date: 2025-12
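#
# A minimal sketch of the intended usage (command names assumed; consult the drtprod
# README for the exact invocation):
#   ./dev build drtprod roachtest --cross=linux   # produces artifacts/drtprod and artifacts/roachtest
#   drtprod execute pkg/cmd/drtprod/configs/archived/2025_12_drt_scale_300.yaml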
environment:
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: [email protected]
ROACHPROD_DNS: drt.crdb.io
ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
ROACHPROD_GCE_DNS_ZONE: drt
ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
CLUSTER: drt-scale-300
WORKLOAD_CLUSTER: workload-scale-300
CLUSTER_NODES: 300
RACKS: 300
NODES_PER_ZONE: 100
TOTAL_PARTITIONS: 300
# Both server-side partitions and client-side partitions were used during this
# run. Client-side partitions were used on the restored database.
PARTITION_TYPE: partitions
WORKLOAD_NODES: 15
# v25.4.1-14-g05f15f86753 was also used during this run but contained a
# dependency issue resulting in node panics.
VERSION: v25.4.1-16-g02c2602513a
WAREHOUSES: 4000000
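  # Sizing implied by the values above and the create flags below: 300 nodes are
  # spread as 100 per zone across three us-central1 zones; with 2 pd-ssd volumes
  # (and store-count: 2) per node that is 600 stores, and n2-standard-16 machines
  # give 16 vCPUs x 300 = 4,800 vCPUs cluster-wide.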

dependent_file_locations:
- pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller
- pkg/cmd/drtprod/scripts/setup_datadog_cluster
- pkg/cmd/drtprod/scripts/setup_datadog_workload
- pkg/cmd/drtprod/scripts/tpcc_init.sh
- pkg/cmd/drtprod/scripts/generate_tpcc_run.sh
- pkg/cmd/drtprod/scripts/populate_workload_keys.sh
- artifacts/roachtest
- artifacts/drtprod

targets:
# crdb cluster specs
- target_name: $CLUSTER cluster initialisation
notify_progress: true
steps:
- command: create
args:
- $CLUSTER
flags:
clouds: gce
gce-managed: true
gce-enable-multiple-stores: true
gce-zones: "us-central1-a:$NODES_PER_ZONE,us-central1-b:$NODES_PER_ZONE,us-central1-c:$NODES_PER_ZONE"
nodes: $CLUSTER_NODES
gce-machine-type: n2-standard-16
local-ssd: false
gce-pd-volume-size: 2048
gce-pd-volume-type: pd-ssd
gce-pd-volume-count: 2
os-volume-size: 100
username: drt
lifetime: 8760h
gce-image: "ubuntu-2204-jammy-v20250112"
- command: sync
skip_notification: true
flags:
clouds: gce
- script: "pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller"
skip_notification: true
- command: stage
skip_notification: true
args:
- $CLUSTER
- lib # for libgeos
- command: stage
skip_notification: true
args:
- $CLUSTER
- customized
- $VERSION
- script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
skip_notification: true
- command: start
args:
- $CLUSTER
- "--binary"
- "./cockroach"
- "--env" # from defaults
- "COCKROACH_TESTING_FORCE_RELEASE_BRANCH=true"
- "--env" # from defaults
- "COCKROACH_INTERNAL_DISABLE_METAMORPHIC_TESTING=true"
- "--env" # for MMA test case scenario
- "COCKROACH_ALLOW_MMA=true"
flags:
# add a flag to set provisioned throughput on each store according to its cloud provider limits
enable-fluent-sink: true
store-count: 2
args: --wal-failover=among-stores
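          # With two stores per node, --wal-failover=among-stores lets a node keep
          # committing writes by failing its WAL over to the sibling store if one
          # store's writes stall (hence store-count: 2 above).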
restart: false
sql-port: 26257
racks: $RACKS
- command: sql
args:
- $CLUSTER:1
- --
- -e
- "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='256 MB'"
# workload cluster specs
- target_name: $WORKLOAD_CLUSTER
steps:
- command: create
args:
- $WORKLOAD_CLUSTER
flags:
clouds: gce
gce-zones: "us-central1-a"
nodes: $WORKLOAD_NODES
gce-machine-type: n2-standard-8
os-volume-size: 100
username: workload
lifetime: 8760h
gce-image: "ubuntu-2204-jammy-v20250112"
on_rollback:
- command: destroy
args:
- $WORKLOAD_CLUSTER
- command: sync
flags:
clouds: gce
- command: stage
args:
- $WORKLOAD_CLUSTER
- release
- $VERSION
- command: put
args:
- $WORKLOAD_CLUSTER
- artifacts/roachtest
- roachtest-operations
- command: put
args:
- $WORKLOAD_CLUSTER
- artifacts/drtprod
- script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
- target_name: post_tasks
notify_progress: true
dependent_targets:
- $CLUSTER cluster initialisation
- $WORKLOAD_CLUSTER
steps:
- script: rm
skip_notification: true
args:
- -rf
- certs-$CLUSTER
- command: fetch-certs
skip_notification: true
args:
- $CLUSTER:1
- certs-$CLUSTER
- command: put
skip_notification: true
args:
- $WORKLOAD_CLUSTER
- certs-$CLUSTER
- certs
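      # The three steps above copy the cluster's TLS certificates to the workload
      # nodes (as ./certs) so the workload binaries can open secure SQL connections
      # to $CLUSTER.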
- script: "pkg/cmd/drtprod/scripts/tpcc_init.sh"
args:
- cct_tpcc
- false
flags:
partitions: $TOTAL_PARTITIONS
replicate-static-columns: true
partition-strategy: leases
warehouses: $WAREHOUSES
db: cct_tpcc
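      # With TOTAL_PARTITIONS equal to RACKS (300) on a 300-node cluster, each node
      # gets its own rack locality and each TPC-C partition lines up with a single
      # rack; the "leases" partition strategy constrains leaseholders (rather than
      # all replicas) to each partition's locality.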
- script: pkg/cmd/drtprod/scripts/populate_workload_keys.sh
- target_name: tpcc_run
notify_progress: true
dependent_targets:
- $CLUSTER cluster initialisation
- $WORKLOAD_CLUSTER
steps:
- script: "pkg/cmd/drtprod/scripts/generate_tpcc_run.sh"
args:
- cct_tpcc
- false
flags:
          # Workload parameters for generating a consistent customer-like load
          # (open workload). The `active-workers` and `conns` values are set high
          # enough to saturate the cluster on their own, but throughput is capped by
          # `max-rate`, effectively making max-rate a target rate. The total `tpmC`
          # can be estimated as `max-rate` * `WORKLOAD_NODES` * 27.
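          # For this run that works out to roughly 2100 * 15 * 27 = 850,500 tpmC; the
          # factor of 27 is assumed to be 60 seconds/minute times the ~45% newOrder
          # share of the TPC-C transaction mix. The 266,666 active warehouses are
          # roughly $WAREHOUSES / $WORKLOAD_NODES (4,000,000 / 15), so the 15 workload
          # nodes together cover all warehouses.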
db: cct_tpcc
warehouses: $WAREHOUSES
active-warehouses: 266666
workers: 266666
conns: 1000
active-workers: 1000
max-rate: 2100
ramp: 20m
wait: 0
# Parameters used for TPC-C benchmarking configured as a closed
# workload.
#db: cct_tpcc
#warehouses: $WAREHOUSES
#active-warehouses: 266666
#workers: 266666
#conns: 960
#active-workers: 960
#ramp: 20m
#wait: 0