From 41b6e92041c1209bec736f9db74b7cbd48c1f0d2 Mon Sep 17 00:00:00 2001
From: Herko Lategan
Date: Mon, 22 Dec 2025 10:34:29 +0000
Subject: [PATCH] drtprod: archive 300 node scale Dec 2025 run.

Create an entry for the December 2025 300 Node Scale Test. This PR
captures the parameters used for running the workload and for setting up
the cluster. The workload ran with multiple configurations during the run;
the differences in parameters are captured as comments in the
configuration.

Epic: None
Release note: None
---
 .../archived/2025_12_drt_scale_300.yaml       | 207 ++++++++++++++++++
 1 file changed, 207 insertions(+)
 create mode 100644 pkg/cmd/drtprod/configs/archived/2025_12_drt_scale_300.yaml

diff --git a/pkg/cmd/drtprod/configs/archived/2025_12_drt_scale_300.yaml b/pkg/cmd/drtprod/configs/archived/2025_12_drt_scale_300.yaml
new file mode 100644
index 000000000000..ced74ebab455
--- /dev/null
+++ b/pkg/cmd/drtprod/configs/archived/2025_12_drt_scale_300.yaml
@@ -0,0 +1,207 @@
# YAML for creating and configuring the drt-scale cluster. This also configures Datadog.
# Build the drtprod and roachtest binaries (using --cross=linux) before running this script.
#
# Planned Execution Date: 2025-12
environment:
  ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT: 622274581499-compute@developer.gserviceaccount.com
  ROACHPROD_DNS: drt.crdb.io
  ROACHPROD_GCE_DNS_DOMAIN: drt.crdb.io
  ROACHPROD_GCE_DNS_ZONE: drt
  ROACHPROD_GCE_DEFAULT_PROJECT: cockroach-drt
  CLUSTER: drt-scale-300
  WORKLOAD_CLUSTER: workload-scale-300
  CLUSTER_NODES: 300
  RACKS: 300
  NODES_PER_ZONE: 100
  TOTAL_PARTITIONS: 300
  # Both server-side partitions and client-side partitions were used during
  # this run. Client-side partitions were used on the restored database.
  PARTITION_TYPE: partitions
  WORKLOAD_NODES: 15
  # v25.4.1-14-g05f15f86753 was also used during this run, but it contained a
  # dependency issue that resulted in node panics.
  VERSION: v25.4.1-16-g02c2602513a
  WAREHOUSES: 4000000
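  # For reference, arithmetic implied by the values above and the gce-zones
  # flag below: three us-central1 zones * NODES_PER_ZONE (100) = CLUSTER_NODES
  # (300); RACKS (300) places each node in its own rack; and
  # WAREHOUSES / TOTAL_PARTITIONS = 4000000 / 300 = ~13,333 warehouses per
  # partition.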

dependent_file_locations:
  - pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller
  - pkg/cmd/drtprod/scripts/setup_datadog_cluster
  - pkg/cmd/drtprod/scripts/setup_datadog_workload
  - pkg/cmd/drtprod/scripts/tpcc_init.sh
  - pkg/cmd/drtprod/scripts/generate_tpcc_run.sh
  - pkg/cmd/drtprod/scripts/populate_workload_keys.sh
  - artifacts/roachtest
  - artifacts/drtprod

targets:
  # crdb cluster specs
  - target_name: $CLUSTER cluster initialisation
    notify_progress: true
    steps:
      - command: create
        args:
          - $CLUSTER
        flags:
          clouds: gce
          gce-managed: true
          gce-enable-multiple-stores: true
          gce-zones: "us-central1-a:$NODES_PER_ZONE,us-central1-b:$NODES_PER_ZONE,us-central1-c:$NODES_PER_ZONE"
          nodes: $CLUSTER_NODES
          gce-machine-type: n2-standard-16
          local-ssd: false
          gce-pd-volume-size: 2048
          gce-pd-volume-type: pd-ssd
          gce-pd-volume-count: 2
          os-volume-size: 100
          username: drt
          lifetime: 8760h
          gce-image: "ubuntu-2204-jammy-v20250112"
      - command: sync
        skip_notification: true
        flags:
          clouds: gce
      - script: "pkg/cmd/drtprod/scripts/setup_dmsetup_disk_staller"
        skip_notification: true
      - command: stage
        skip_notification: true
        args:
          - $CLUSTER
          - lib # for libgeos
      - command: stage
        skip_notification: true
        args:
          - $CLUSTER
          - customized
          - $VERSION
      - script: "pkg/cmd/drtprod/scripts/setup_datadog_cluster"
        skip_notification: true
      - command: start
        args:
          - $CLUSTER
          - "--binary"
          - "./cockroach"
          - "--env" # from defaults
          - "COCKROACH_TESTING_FORCE_RELEASE_BRANCH=true"
          - "--env" # from defaults
          - "COCKROACH_INTERNAL_DISABLE_METAMORPHIC_TESTING=true"
          - "--env" # for the MMA test case scenario
          - "COCKROACH_ALLOW_MMA=true"
        flags:
          # Add a flag to set provisioned throughput on each store according
          # to the cloud provider's limits.
          enable-fluent-sink: true
          store-count: 2
          args: --wal-failover=among-stores
          restart: false
          sql-port: 26257
          racks: $RACKS
      - command: sql
        args:
          - $CLUSTER:1
          - --
          - -e
          - "SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='256 MB'"
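  # For reference, capacity implied by the create and start flags above:
  # 300 nodes * 2 pd-ssd volumes * 2048 GiB = ~1.2 PiB of provisioned storage,
  # and n2-standard-16 machines (16 vCPUs, 64 GiB memory each) give the
  # cluster 4800 vCPUs in total.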
  # workload cluster specs
  - target_name: $WORKLOAD_CLUSTER
    steps:
      - command: create
        args:
          - $WORKLOAD_CLUSTER
        flags:
          clouds: gce
          gce-zones: "us-central1-a"
          nodes: $WORKLOAD_NODES
          gce-machine-type: n2-standard-8
          os-volume-size: 100
          username: workload
          lifetime: 8760h
          gce-image: "ubuntu-2204-jammy-v20250112"
        on_rollback:
          - command: destroy
            args:
              - $WORKLOAD_CLUSTER
      - command: sync
        flags:
          clouds: gce
      - command: stage
        args:
          - $WORKLOAD_CLUSTER
          - release
          - $VERSION
      - command: put
        args:
          - $WORKLOAD_CLUSTER
          - artifacts/roachtest
          - roachtest-operations
      - command: put
        args:
          - $WORKLOAD_CLUSTER
          - artifacts/drtprod
      - script: "pkg/cmd/drtprod/scripts/setup_datadog_workload"
  - target_name: post_tasks
    notify_progress: true
    dependent_targets:
      - $CLUSTER cluster initialisation
      - $WORKLOAD_CLUSTER
    steps:
      - script: rm
        skip_notification: true
        args:
          - -rf
          - certs-$CLUSTER
      - command: fetch-certs
        skip_notification: true
        args:
          - $CLUSTER:1
          - certs-$CLUSTER
      - command: put
        skip_notification: true
        args:
          - $WORKLOAD_CLUSTER
          - certs-$CLUSTER
          - certs
      - script: "pkg/cmd/drtprod/scripts/tpcc_init.sh"
        args:
          - cct_tpcc
          - false
        flags:
          partitions: $TOTAL_PARTITIONS
          replicate-static-columns: true
          partition-strategy: leases
          warehouses: $WAREHOUSES
          db: cct_tpcc
      - script: "pkg/cmd/drtprod/scripts/populate_workload_keys.sh"
  - target_name: tpcc_run
    notify_progress: true
    dependent_targets:
      - $CLUSTER cluster initialisation
      - $WORKLOAD_CLUSTER
    steps:
      - script: "pkg/cmd/drtprod/scripts/generate_tpcc_run.sh"
        args:
          - cct_tpcc
          - false
        flags:
          # Workload parameters for generating a consistent, customer-like
          # load (an open workload). `active-workers` and `conns` are set high
          # enough to saturate the cluster under normal conditions, but the
          # load is capped by `max-rate`, effectively turning `max-rate` into
          # a target rate. The total tpmC can be estimated as
          # `max-rate` * `WORKLOAD_NODES` * 27 (see the note at the end of
          # this file).
          db: cct_tpcc
          warehouses: $WAREHOUSES
          active-warehouses: 266666
          workers: 266666
          conns: 1000
          active-workers: 1000
          max-rate: 2100
          ramp: 20m
          wait: 0
          # Parameters used for TPC-C benchmarking, configured as a closed
          # workload.
          #db: cct_tpcc
          #warehouses: $WAREHOUSES
          #active-warehouses: 266666
          #workers: 266666
          #conns: 960
          #active-workers: 960
          #ramp: 20m
          #wait: 0
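
# Note on the tpmC estimate above: assuming `max-rate` caps per-node
# transaction throughput and that new-order transactions make up ~45% of the
# TPC-C mix, the factor of 27 is 60 sec/min * 0.45; the open configuration
# above then works out to 2100 * 15 * 27 = ~850,500 tpmC.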