From 151536bbee9642edc4a4985a61217ca52e3ce8b7 Mon Sep 17 00:00:00 2001
From: Ludovic Leroux
Date: Thu, 4 Dec 2025 14:15:42 -0500
Subject: [PATCH] roachtest: randomize volume type and XFS

Prior to this patch, tests had three possibilities for volume types:
1. Specify the `PreferLocalSSD()` or `DisableLocalSSD()` options.
2. Force a specific volume type via `VolumeType()` (volume types being
   provider-specific, this was only possible for tests that target a
   single provider).
3. Rely on the default behavior for `PreferLocalSSD()` and use roachprod's
   default volume type for the provider when local SSD was not selected.

The way `PreferLocalSSD()` was implemented meant that most of the tests
were actually running on local SSDs (when available on the machine type),
leading to a gap in the testing strategy with regard to volume types.

This patch introduces the new ClusterSpec option `RandomizeVolumeType()`.
When the volume type is not forced and `PreferLocalSSD()` is not
specified, `RandomizeVolumeType()` takes precedence over the `--local-ssd`
argument and randomly selects a volume type from those available in the
targeted provider (`DisableLocalSSD()` only excludes local SSD from the
randomization):
- AWS: gp3, io2, local SSD
- GCE: pd-ssd, local SSD
- Azure: premium-ssd, premium-ssd-v2, ultra-disk, local SSD
- IBM: 10iops-tier

Note: volume type randomization gives the same weight to all options.

This patch also introduces a random chance of provisioning XFS as the
filesystem via the option `RandomlyUseXfs()`. The option is built in the
same way as `RandomlyUseZfs()`, granting a 20% chance of XFS when set. If
both `RandomlyUseZfs()` and `RandomlyUseXfs()` are used, they each get a
20% chance, leaving a 60% chance for the default ext4.

Epic: none
Fixes: 146661

Release note: None
---
 pkg/cmd/roachtest/spec/cluster_spec.go        | 187 ++++++++++++++----
 pkg/cmd/roachtest/spec/cluster_spec_test.go   |  21 ++
 pkg/cmd/roachtest/spec/option.go              |  25 ++-
 pkg/cmd/roachtest/test_runner.go              |  14 +-
 ...mission_control_disk_bandwidth_overload.go |  13 +-
 .../tests/admission_control_elastic_io.go     |  10 +-
 .../admission_control_intent_resolution.go    |   8 +-
 .../admission_control_snapshot_overload_io.go |   2 +
 pkg/cmd/roachtest/tests/kv.go                 |  80 ++++++--
 pkg/cmd/roachtest/tests/kvbench.go            |   8 +-
 pkg/roachprod/vm/aws/aws.go                   |  60 +++++-
 pkg/roachprod/vm/aws/support_test.go          | 100 ++++++++++
 pkg/roachprod/vm/gce/gcloud.go                |   5 +-
 pkg/roachprod/vm/vm.go                        |   2 +-
 14 files changed, 467 insertions(+), 68 deletions(-)

diff --git a/pkg/cmd/roachtest/spec/cluster_spec.go b/pkg/cmd/roachtest/spec/cluster_spec.go
index 2c071c25cf07..6da46cb73a9b 100644
--- a/pkg/cmd/roachtest/spec/cluster_spec.go
+++ b/pkg/cmd/roachtest/spec/cluster_spec.go
@@ -32,6 +32,9 @@ const (
 
 	// Extra labels added by roachtest
 	RoachtestBranch = "roachtest-branch"
+
+	MetamorphicVolumeType = "MetamorphicVolumeType"
+	MetamorphicFilesystem = "MetamorphicFilesystem"
 )
 
 type MemPerCPU int
@@ -123,7 +126,9 @@ type ClusterSpec struct {
 	// to be used. The default is ext4.
 	FileSystem fileSystemType
 
-	RandomlyUseZfs bool
+	RandomlyUseZfs      bool
+	RandomlyUseXfs      bool
+	RandomizeVolumeType bool
 
 	GatherCores bool
 
@@ -150,11 +155,16 @@ type ClusterSpec struct {
 		// VolumeIOPS is the provisioned IOPS for ultra-disks.
 		VolumeIOPS int
 	} `cloud:"azure"`
+
+	ExposedMetamorphicInfo map[string]string `compareIgnore:"true"`
 }
 
 // MakeClusterSpec makes a ClusterSpec.
func MakeClusterSpec(nodeCount int, opts ...Option) ClusterSpec { - spec := ClusterSpec{NodeCount: nodeCount} + spec := ClusterSpec{ + NodeCount: nodeCount, + ExposedMetamorphicInfo: make(map[string]string), + } defaultOpts := []Option{CPU(4), WorkloadNodeCPU(4), nodeLifetime(12 * time.Hour), ReuseAny()} for _, o := range append(defaultOpts, opts...) { o(&spec) @@ -165,24 +175,38 @@ func MakeClusterSpec(nodeCount int, opts ...Option) ClusterSpec { // ClustersCompatible returns true if the clusters are compatible, i.e. the test // asking for s2 can reuse s1. func ClustersCompatible(s1, s2 ClusterSpec, cloud Cloud) bool { - // only consider the specification of the cloud that we are running in + // Clear cloud-specific and comparison-irrelevant fields. clearClusterSpecFields(&s1, cloud) clearClusterSpecFields(&s2, cloud) - return s1 == s2 + + // We use `reflect.DeepEqual` instead of simple direct `==` comparison + // because ClusterSpec contains map fields which are not comparable + // with direct comparison. + return reflect.DeepEqual(s1, s2) } // clearClusterSpecFields clears the cloud specific specification from the cluster spec // if the cloud specification does not match the target cloud. This is done to ensure that // the specification for other clouds are not considered while comparing the cluster specifications. +// It also clears fields marked with `compareIgnore:"true"` tag, which should be excluded from +// cluster compatibility comparisons (e.g., ExposedMetamorphicInfo for tracking metamorphic choices). func clearClusterSpecFields(cs *ClusterSpec, targetCloud Cloud) { cs.Lifetime = 0 structType := reflect.TypeOf(*cs) for i := 0; i < structType.NumField(); i++ { field := structType.Field(i) + fieldValue := reflect.ValueOf(cs).Elem().FieldByName(field.Name) + + // Clear fields marked with compareIgnore tag - these should not affect + // cluster compatibility (e.g., metamorphic info is just metadata) + if _, ok := field.Tag.Lookup("compareIgnore"); ok { + fieldValue.Set(reflect.Zero(fieldValue.Type())) + continue + } + + // Clear cloud-specific fields if they don't match the target cloud if tag, ok := field.Tag.Lookup("cloud"); ok { - // Zero out struct if it is not the target cloud. if !strings.EqualFold(tag, targetCloud.String()) { - fieldValue := reflect.ValueOf(cs).Elem().FieldByName(field.Name) fieldValue.Set(reflect.Zero(fieldValue.Type())) } } @@ -206,6 +230,10 @@ func (s ClusterSpec) String() string { return str } +func (s ClusterSpec) GetMetamorphicInfo() map[string]string { + return s.ExposedMetamorphicInfo +} + // checks if an AWS machine supports SSD volumes func awsMachineSupportsSSD(machineType string) bool { typeAndSize := strings.Split(machineType, ".") @@ -366,14 +394,6 @@ func (s *ClusterSpec) RoachprodOpts( useIOBarrier := params.UseIOBarrierOnLocalSSD requestedArch := params.PreferredArch - preferLocalSSD := params.Defaults.PreferLocalSSD - switch s.LocalSSD { - case LocalSSDDisable: - preferLocalSSD = false - case LocalSSDPreferOn: - preferLocalSSD = true - } - createVMOpts := vm.DefaultCreateOpts() // N.B. We set "usage=roachtest" as the default, custom label for billing tracking. 
 	createVMOpts.CustomLabels = map[string]string{vm.TagUsage: "roachtest"}
@@ -435,7 +455,12 @@ func (s *ClusterSpec) RoachprodOpts(
 		var err error
 		switch cloud {
 		case AWS:
-			machineType, selectedArch, err = SelectAWSMachineType(s.CPUs, s.Mem, preferLocalSSD && s.VolumeSize == 0, requestedArch)
+			// We always pass true for shouldSupportLocalSSD here because the machine type selection
+			// logic should not depend on the user's preference for local SSDs.
+			// The actual decision to use local SSDs is handled in the disk configuration logic
+			// at the provider level, and EBS volumes have priority over local SSDs.
+			// This means that if both EBS and local SSDs are available, EBS will be used.
+			machineType, selectedArch, err = SelectAWSMachineType(s.CPUs, s.Mem, true, requestedArch)
 		case GCE:
 			machineType, selectedArch = SelectGCEMachineType(s.CPUs, s.Mem, requestedArch)
 		case Azure:
@@ -452,34 +477,26 @@ func (s *ClusterSpec) RoachprodOpts(
 				createVMOpts.Arch = string(selectedArch)
 			}
 		}
-
-		// Local SSD can only be requested
-		// - if configured to prefer doing so,
-		// - if no particular volume size is requested, and,
-		// - on AWS, if the machine type supports it.
-		// - on GCE, if the machine type is not ARM64.
-		if preferLocalSSD && s.VolumeSize == 0 && (cloud != AWS || awsMachineSupportsSSD(machineType)) &&
-			(cloud != GCE || selectedArch != vm.ArchARM64) {
-			// Ensure SSD count is at least 1 if UseLocalSSD is true.
-			if ssdCount == 0 {
-				ssdCount = 1
-			}
-			createVMOpts.SSDOpts.UseLocalSSD = true
-			createVMOpts.SSDOpts.NoExt4Barrier = !useIOBarrier
-		} else {
-			createVMOpts.SSDOpts.UseLocalSSD = false
-		}
 	}
 
 	switch s.FileSystem {
 	case Ext4:
-		// ext4 is the default, do nothing unless we randomly want to use zfs
-		if s.RandomlyUseZfs {
+		// ext4 is the default, but we can randomly select zfs/xfs if requested.
+		// Each alternative filesystem gets a 20% chance of being selected,
+		// leaving the remainder for ext4.
+		if s.RandomlyUseZfs || s.RandomlyUseXfs {
 			rng, _ := randutil.NewPseudoRand()
-			if rng.Float64() <= 0.2 {
+			randFloat := rng.Float64()
+
+			if s.RandomlyUseZfs && randFloat <= 0.2 {
 				createVMOpts.SSDOpts.FileSystem = vm.Zfs
+			} else if s.RandomlyUseXfs && randFloat > 0.2 && randFloat <= 0.4 {
+				createVMOpts.SSDOpts.FileSystem = vm.Xfs
 			}
+
+			s.ExposedMetamorphicInfo[MetamorphicFilesystem] = string(createVMOpts.SSDOpts.FileSystem)
 		}
+
 	case Zfs:
 		createVMOpts.SSDOpts.FileSystem = vm.Zfs
 	case Xfs:
@@ -492,11 +509,101 @@ func (s *ClusterSpec) RoachprodOpts(
 		return vm.CreateOpts{}, nil, nil, "", errors.Errorf("unknown file system type: %v", s.FileSystem)
 	}
 
+	// Determine which storage type to use based on the following priority order:
+	// 1. Explicit volume type: If the user explicitly set s.VolumeType, use it.
+	// 2. Forced local SSD: If LocalSSDPreferOn is set, always use local SSD if available.
+	// 3. Randomized storage: If RandomizeVolumeType is enabled, randomly select
+	//    from available storage types (cloud-specific volumes + optionally local SSD).
+	//    Local SSD is excluded from randomization if LocalSSDDisable is set.
+	// 4. Default behavior: If params.Defaults.PreferLocalSSD is true AND the user
+	//    did not explicitly disable local SSD, prefer local SSD.
+	//
+	// The selected storage type is then validated against availability constraints
+	// (e.g., volume size, machine type, architecture) before being applied.
+	selectedVolumeType := ""
+	switch {
+	case s.VolumeType != "":
+		// User explicitly set a volume type, use it directly.
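+		// (Typically forced via the VolumeType() cluster-spec option; volume types are
+		// provider-specific, so this path is only used by tests that target a single cloud.)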
+ selectedVolumeType = s.VolumeType + + case s.LocalSSD == LocalSSDPreferOn: + // User forced local SSD preference. + selectedVolumeType = "local-ssd" + + case s.RandomizeVolumeType: + // If the user selected RandomizeVolumeType, randomly pick a volume type + // from the available volume types. + availableVolumeTypes := []string{} + + // If the user did not explicitly disable local SSD and local SSD is available + // for the selected cloud provider, machine type and architecture, + // add it to the list of available volume types. + if s.LocalSSD != LocalSSDDisable && s.isLocalSSDAvailable(cloud, machineType, selectedArch) { + availableVolumeTypes = append(availableVolumeTypes, "local-ssd") + } + + switch cloud { + case AWS: + availableVolumeTypes = append(availableVolumeTypes, "gp3", "io2") + case GCE: + availableVolumeTypes = append(availableVolumeTypes, "pd-ssd") + case Azure: + availableVolumeTypes = append(availableVolumeTypes, "premium-ssd", "premium-ssd-v2", "ultra-disk") + } + + if len(availableVolumeTypes) > 0 { + rng, _ := randutil.NewPseudoRand() + selectedVolumeType = availableVolumeTypes[rng.Intn(len(availableVolumeTypes))] + + s.ExposedMetamorphicInfo[MetamorphicVolumeType] = selectedVolumeType + } + + case s.LocalSSD != LocalSSDDisable && params.Defaults.PreferLocalSSD: + // No forced preference, no randomization, but default is to use local SSD + // if available. + selectedVolumeType = "local-ssd" + } + + // Local SSD will be used if selected (either by preference or randomly), and + // - if no particular volume size is requested, and, + // - on AWS, if the machine type supports it. + // - on GCE, if the machine type is not ARM64. + if selectedVolumeType == "local-ssd" { + if s.isLocalSSDAvailable(cloud, machineType, selectedArch) { + if ssdCount == 0 { + ssdCount = 1 + } + createVMOpts.SSDOpts.UseLocalSSD = true + + // Disable ext4 barriers for local SSDs unless explicitly requested. + // This is because local SSDs have very low latency and ext4 barriers + // can significantly degrade performance. + // This setting is only relevant if the selected filesystem is ext4. + if !useIOBarrier && createVMOpts.SSDOpts.FileSystem == vm.Ext4 { + createVMOpts.SSDOpts.NoExt4Barrier = true + } + } else { + // Local SSD was selected but is not available; fall back to default volume type. 
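+			// s.VolumeType is left unset here, so roachprod falls back to its
+			// per-provider default volume type.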
+			fmt.Printf(
+				"WARN: local SSD selected but not available (machine type %s, volume size %d); "+
+					"falling back to default volume type\n",
+				machineType,
+				s.VolumeSize,
+			)
+			createVMOpts.SSDOpts.UseLocalSSD = false
+		}
+	} else {
+		createVMOpts.SSDOpts.UseLocalSSD = false
+		if selectedVolumeType != "" {
+			s.VolumeType = selectedVolumeType
+		}
+	}
+
 	var workloadMachineType string
 	var err error
 	switch cloud {
 	case AWS:
-		workloadMachineType, _, err = SelectAWSMachineType(s.WorkloadNodeCPUs, s.Mem, preferLocalSSD && s.VolumeSize == 0, selectedArch)
+		workloadMachineType, _, err = SelectAWSMachineType(s.WorkloadNodeCPUs, s.Mem, false, selectedArch)
 	case GCE:
 		workloadMachineType, _ = SelectGCEMachineType(s.WorkloadNodeCPUs, s.Mem, selectedArch)
 	case Azure:
@@ -542,6 +649,14 @@ func (s *ClusterSpec) RoachprodOpts(
 	return createVMOpts, providerOpts, workloadProviderOpts, selectedArch, nil
 }
 
+func (s *ClusterSpec) isLocalSSDAvailable(
+	cloud Cloud, machineType string, selectedArch vm.CPUArch,
+) bool {
+	return s.VolumeSize == 0 &&
+		(cloud != AWS || awsMachineSupportsSSD(machineType)) &&
+		(cloud != GCE || selectedArch != vm.ArchARM64)
+}
+
 // SetRoachprodOptsZones updates the providerOpts with the VM zones as specified in the params/spec.
 // We separate this logic from RoachprodOpts as we may need to call this multiple times in order to
 // randomize the default GCE zone.
diff --git a/pkg/cmd/roachtest/spec/cluster_spec_test.go b/pkg/cmd/roachtest/spec/cluster_spec_test.go
index 81f55a529684..da9049015a12 100644
--- a/pkg/cmd/roachtest/spec/cluster_spec_test.go
+++ b/pkg/cmd/roachtest/spec/cluster_spec_test.go
@@ -44,3 +44,24 @@ func TestClustersCompatible(t *testing.T) {
 		require.True(t, ClustersCompatible(s1, s2, AWS))
 	})
 }
+
+func TestClustersRetainClearedInfo(t *testing.T) {
+	// Adding a test in case we switch the ClustersCompatible signature to take
+	// pointers to ClusterSpec in the future.
+	t.Run("original structs are not modified", func(t *testing.T) {
+		s1 := ClusterSpec{
+			NodeCount:              5,
+			ExposedMetamorphicInfo: map[string]string{"VolumeType": "io2"},
+		}
+		s2 := ClusterSpec{
+			NodeCount:              5,
+			ExposedMetamorphicInfo: map[string]string{"VolumeType": "gp3"},
+		}
+
+		ClustersCompatible(s1, s2, GCE)
+
+		// Original data should still be there
+		require.Equal(t, "io2", s1.ExposedMetamorphicInfo["VolumeType"])
+		require.Equal(t, "gp3", s2.ExposedMetamorphicInfo["VolumeType"])
+	})
+}
diff --git a/pkg/cmd/roachtest/spec/option.go b/pkg/cmd/roachtest/spec/option.go
index 22f1584477b3..be32aa62853e 100644
--- a/pkg/cmd/roachtest/spec/option.go
+++ b/pkg/cmd/roachtest/spec/option.go
@@ -204,6 +204,21 @@ func DisableLocalSSD() Option {
 	}
 }
 
+// RandomizeVolumeType is an Option which randomly picks the volume type
+// to be used. Unless local SSD is forced, the volume type is picked randomly
+// among the available types for a provider:
+// - GCE: pd-ssd, local-ssd
+// - AWS: gp3, io2, local-ssd
+// - Azure: premium-ssd, premium-ssd-v2, ultra-disk, local-ssd
+// - IBM: 10iops-tier
+// Note: this option has no effect if VolumeType is explicitly set or
+// PreferLocalSSD is used; DisableLocalSSD only excludes local SSD from the randomization.
+func RandomizeVolumeType() Option {
+	return func(spec *ClusterSpec) {
+		spec.RandomizeVolumeType = true
+	}
+}
+
 // TerminateOnMigration ensures VM is terminated in case GCE triggers a live migration.
func TerminateOnMigration() Option { return func(spec *ClusterSpec) { @@ -235,13 +250,21 @@ func SetFileSystem(fs fileSystemType) Option { // RandomlyUseZfs is an Option which randomly picks // the file system to be used, and sets it to zfs, // about 20% of the time. -// Zfs is only picked if the cloud is gce. func RandomlyUseZfs() Option { return func(spec *ClusterSpec) { spec.RandomlyUseZfs = true } } +// RandomlyUseXfs is an Option which randomly picks +// the file system to be used, and sets it to xfs, +// about 20% of the time. +func RandomlyUseXfs() Option { + return func(spec *ClusterSpec) { + spec.RandomlyUseXfs = true + } +} + // GCEMachineType sets the machine (instance) type when the cluster is on GCE. func GCEMachineType(machineType string) Option { return func(spec *ClusterSpec) { diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 0cf3cc82f320..57ce72fddee2 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -321,15 +321,15 @@ func (r *testRunner) Run( // We should also check against the spec of the cluster, but we don't // currently have a way of doing that; we're relying on the fact that attaching to the cluster // will fail if the cluster is incompatible. - spec := tests[0].Cluster - spec.Lifetime = 0 + spec1 := tests[0].Cluster + spec1.Lifetime = 0 for i := 1; i < len(tests); i++ { spec2 := tests[i].Cluster spec2.Lifetime = 0 - if spec != spec2 { + if !spec.ClustersCompatible(spec1, spec2, roachtestflags.Cloud) { return errors.Errorf("cluster specified but found tests "+ "with incompatible specs: %s (%s) - %s (%s)", - tests[0].Name, spec, tests[i].Name, spec2, + tests[0].Name, spec1, tests[i].Name, spec2, ) } } @@ -2197,6 +2197,12 @@ func getTestParameters(t *testImpl, c *clusterImpl, createOpts *vm.CreateOpts) m if spec.Cluster.Arch != "" { clusterParams["arch"] = string(spec.Cluster.Arch) } + + // Include cluster spec metamorphic test parameters. + for k, v := range spec.Cluster.GetMetamorphicInfo() { + clusterParams[k] = v + } + // These params can be probabilistically set, so we pass them here to // show what their actual values are in the posted issue. if createOpts != nil { diff --git a/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go b/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go index 741fc0f541a5..1d73c31026b2 100644 --- a/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go @@ -38,9 +38,16 @@ func registerDiskBandwidthOverload(r registry.Registry) { Benchmark: true, CompatibleClouds: registry.AllExceptAzure, // TODO(aaditya): change to weekly once the test stabilizes. 
- Suites: registry.Suites(registry.Nightly), - Cluster: r.MakeClusterSpec(2, spec.CPU(8), spec.WorkloadNode(), spec.ReuseNone()), - Leases: registry.MetamorphicLeases, + Suites: registry.Suites(registry.Nightly), + Cluster: r.MakeClusterSpec( + 2, + spec.CPU(8), + spec.WorkloadNode(), + spec.ReuseNone(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), + Leases: registry.MetamorphicLeases, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { if c.Spec().NodeCount != 2 { t.Fatalf("expected 2 nodes, found %d", c.Spec().NodeCount) diff --git a/pkg/cmd/roachtest/tests/admission_control_elastic_io.go b/pkg/cmd/roachtest/tests/admission_control_elastic_io.go index 20b4cfacc40f..481ca6c2399a 100644 --- a/pkg/cmd/roachtest/tests/admission_control_elastic_io.go +++ b/pkg/cmd/roachtest/tests/admission_control_elastic_io.go @@ -40,8 +40,14 @@ func registerElasticIO(r registry.Registry) { Suites: registry.Suites(registry.Nightly), // Tags: registry.Tags(`weekly`), // Second node is solely for Prometheus. - Cluster: r.MakeClusterSpec(2, spec.CPU(8), spec.WorkloadNode()), - Leases: registry.MetamorphicLeases, + Cluster: r.MakeClusterSpec( + 2, + spec.CPU(8), + spec.WorkloadNode(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), + Leases: registry.MetamorphicLeases, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { if c.IsLocal() { t.Skip("IO overload test is not meant to run locally") diff --git a/pkg/cmd/roachtest/tests/admission_control_intent_resolution.go b/pkg/cmd/roachtest/tests/admission_control_intent_resolution.go index c082e75e24a1..5877d0e1848d 100644 --- a/pkg/cmd/roachtest/tests/admission_control_intent_resolution.go +++ b/pkg/cmd/roachtest/tests/admission_control_intent_resolution.go @@ -40,7 +40,13 @@ func registerIntentResolutionOverload(r registry.Registry) { // TODO(sumeer): Reduce to weekly after working well. // Tags: registry.Tags(`weekly`), // Second node is solely for Prometheus. 
- Cluster: r.MakeClusterSpec(2, spec.CPU(8), spec.WorkloadNode()), + Cluster: r.MakeClusterSpec( + 2, + spec.CPU(8), + spec.WorkloadNode(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), Leases: registry.MetamorphicLeases, CompatibleClouds: registry.AllExceptAWS, Suites: registry.Suites(registry.Nightly), diff --git a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go index 0badfef04acc..4514c663d3d6 100644 --- a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go +++ b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go @@ -47,6 +47,8 @@ func registerSnapshotOverloadIO(r registry.Registry) { spec.VolumeSize(cfg.volumeSize), spec.ReuseNone(), spec.DisableLocalSSD(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), ), Leases: registry.MetamorphicLeases, Timeout: 12 * time.Hour, diff --git a/pkg/cmd/roachtest/tests/kv.go b/pkg/cmd/roachtest/tests/kv.go index d239219d95ba..b1e999f9cba2 100644 --- a/pkg/cmd/roachtest/tests/kv.go +++ b/pkg/cmd/roachtest/tests/kv.go @@ -343,7 +343,16 @@ func registerKV(r registry.Registry) { if opts.nodes > 3 { workloadNodeCPUs = opts.cpus } - cSpec := r.MakeClusterSpec(opts.nodes+1, spec.CPU(opts.cpus), spec.WorkloadNode(), spec.WorkloadNodeCPU(workloadNodeCPUs), spec.SSD(opts.ssds), spec.RAID0(opts.raid0)) + cSpec := r.MakeClusterSpec( + opts.nodes+1, + spec.CPU(opts.cpus), + spec.WorkloadNode(), + spec.WorkloadNodeCPU(workloadNodeCPUs), + spec.SSD(opts.ssds), + spec.RAID0(opts.raid0), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ) var clouds registry.CloudSet tags := make(map[string]struct{}) @@ -389,10 +398,15 @@ func registerKV(r registry.Registry) { func registerKVContention(r registry.Registry) { const nodes = 4 r.Add(registry.TestSpec{ - Name: fmt.Sprintf("kv/contention/nodes=%d", nodes), - Owner: registry.OwnerKV, - Benchmark: true, - Cluster: r.MakeClusterSpec(nodes+1, spec.WorkloadNode()), + Name: fmt.Sprintf("kv/contention/nodes=%d", nodes), + Owner: registry.OwnerKV, + Benchmark: true, + Cluster: r.MakeClusterSpec( + nodes+1, + spec.WorkloadNode(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), CompatibleClouds: registry.AllExceptAWS, Suites: registry.Suites(registry.Nightly), Leases: registry.MetamorphicLeases, @@ -534,9 +548,14 @@ func registerKVQuiescenceDead(r registry.Registry) { func registerKVGracefulDraining(r registry.Registry) { r.Add(registry.TestSpec{ - Name: "kv/gracefuldraining", - Owner: registry.OwnerKV, - Cluster: r.MakeClusterSpec(7, spec.WorkloadNode()), + Name: "kv/gracefuldraining", + Owner: registry.OwnerKV, + Cluster: r.MakeClusterSpec( + 7, + spec.WorkloadNode(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), CompatibleClouds: registry.OnlyGCE, Suites: registry.Suites(registry.Nightly), Leases: registry.MetamorphicLeases, @@ -783,7 +802,12 @@ func registerKVSplits(r registry.Registry) { Name: name, Owner: registry.OwnerKV, Timeout: item.timeout, - Cluster: r.MakeClusterSpec(4, spec.WorkloadNode()), + Cluster: r.MakeClusterSpec( + 4, + spec.WorkloadNode(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), // These tests are carefully tuned to succeed up to certain number of // splits; they are flaky in slower environments. 
CompatibleClouds: registry.Clouds(spec.GCE, spec.Local), @@ -851,9 +875,16 @@ func registerKVScalability(r registry.Registry) { for _, p := range []int{0, 95} { p := p r.Add(registry.TestSpec{ - Name: fmt.Sprintf("kv%d/scale/nodes=6", p), - Owner: registry.OwnerKV, - Cluster: r.MakeClusterSpec(7, spec.CPU(8), spec.WorkloadNode(), spec.WorkloadNodeCPU(8)), + Name: fmt.Sprintf("kv%d/scale/nodes=6", p), + Owner: registry.OwnerKV, + Cluster: r.MakeClusterSpec( + 7, + spec.CPU(8), + spec.WorkloadNode(), + spec.WorkloadNodeCPU(8), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), CompatibleClouds: registry.AllExceptAWS, Suites: registry.Suites(registry.Nightly), Leases: registry.MetamorphicLeases, @@ -986,9 +1017,16 @@ func registerKVRangeLookups(r registry.Registry) { panic("unexpected") } r.Add(registry.TestSpec{ - Name: fmt.Sprintf("kv50/rangelookups/%s/nodes=%d", workloadName, nodes), - Owner: registry.OwnerKV, - Cluster: r.MakeClusterSpec(nodes+1, spec.CPU(cpus), spec.WorkloadNode(), spec.WorkloadNodeCPU(cpus)), + Name: fmt.Sprintf("kv50/rangelookups/%s/nodes=%d", workloadName, nodes), + Owner: registry.OwnerKV, + Cluster: r.MakeClusterSpec( + nodes+1, + spec.CPU(cpus), + spec.WorkloadNode(), + spec.WorkloadNodeCPU(cpus), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), CompatibleClouds: registry.AllExceptAWS, Suites: registry.Suites(registry.Nightly), Leases: registry.MetamorphicLeases, @@ -1013,8 +1051,16 @@ func registerKVRestartImpact(r registry.Registry) { Suites: registry.Suites(registry.Weekly), Owner: registry.OwnerAdmissionControl, Timeout: 4 * time.Hour, - Cluster: r.MakeClusterSpec(13, spec.CPU(8), spec.WorkloadNode(), spec.WorkloadNodeCPU(8), spec.DisableLocalSSD()), - Leases: registry.MetamorphicLeases, + Cluster: r.MakeClusterSpec( + 13, + spec.CPU(8), + spec.WorkloadNode(), + spec.WorkloadNodeCPU(8), + spec.DisableLocalSSD(), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + ), + Leases: registry.MetamorphicLeases, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { nodes := len(c.CRDBNodes()) startOpts := option.NewStartOpts(option.NoBackupSchedule) diff --git a/pkg/cmd/roachtest/tests/kvbench.go b/pkg/cmd/roachtest/tests/kvbench.go index c7f48cf504f5..892cdaee191d 100644 --- a/pkg/cmd/roachtest/tests/kvbench.go +++ b/pkg/cmd/roachtest/tests/kvbench.go @@ -63,7 +63,13 @@ func registerKVBenchSpec(r registry.Registry, b kvBenchSpec) { if b.NumShards > 0 { nameParts = append(nameParts, fmt.Sprintf("shards=%d", b.NumShards)) } - opts := []spec.Option{spec.CPU(b.CPUs), spec.WorkloadNode(), spec.WorkloadNodeCPU(b.CPUs)} + opts := []spec.Option{ + spec.CPU(b.CPUs), + spec.WorkloadNode(), + spec.WorkloadNodeCPU(b.CPUs), + spec.RandomizeVolumeType(), + spec.RandomlyUseXfs(), + } switch b.KeyDistribution { case sequential: nameParts = append(nameParts, "sequential") diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index e0964376bb0e..696da76dce75 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -1343,7 +1343,8 @@ func (p *Provider) runInstance( extraMountOpts := "" // Dynamic args. if opts.SSDOpts.UseLocalSSD { - if opts.SSDOpts.NoExt4Barrier { + // Disable ext4 barriers if specified and using ext4. 
+ if opts.SSDOpts.NoExt4Barrier && opts.SSDOpts.FileSystem == vm.Ext4 { extraMountOpts = "nobarrier" } } @@ -1580,6 +1581,56 @@ func getSpotInstanceRequestId( return spotInstanceRequestId, nil } +// calculateProvisionedIOPS calculates the appropriate IOPS for io1/io2 volumes +// based on volume size, respecting AWS constraints. +// +// AWS enforces maximum IOPS-to-size ratios: +// - io1: 50 IOPS/GB (max 64,000 IOPS) +// - io2: 500 IOPS/GB for standard, 1000 IOPS/GB for Block Express (max 256,000 IOPS) +// +// We use 10 IOPS/GB as a baseline to match Azure's ultra-disk default ratio, +// with a minimum of 3,000 IOPS (matching gp3 baseline), but we must respect +// AWS's IOPS-to-size ratio constraints. +func calculateProvisionedIOPS(volumeType string, volumeSize int) int { + if volumeType != "io1" && volumeType != "io2" { + return 0 + } + + // Calculate baseline: 10 IOPS/GB + iops := volumeSize * 10 + + // Determine AWS constraints for this volume type + var maxIOPSPerGB int + var absoluteMaxIOPS int + switch volumeType { + case "io1": + maxIOPSPerGB = 50 + absoluteMaxIOPS = 64000 + case "io2": + // As of April 2025, all io2 volumes are Block Express with 1000 IOPS/GB. + // We use the more conservative 500 IOPS/GB for compatibility. + maxIOPSPerGB = 500 + absoluteMaxIOPS = 64000 // Use 64k for compatibility; Block Express supports 256k + } + + // Apply constraint-aware minimum + if iops < 3000 { + // Set a minimum of 3,000 IOPS (matching gp3 baseline) + iops = 3000 + + // But if that exceeds the maximum allowed IOPS for this volume size, + // set to the maximum allowed instead. + maxAllowedIOPS := volumeSize * maxIOPSPerGB + if iops > maxAllowedIOPS { + iops = maxAllowedIOPS + } + } else if iops > absoluteMaxIOPS { + iops = absoluteMaxIOPS + } + + return iops +} + func genDeviceMapping(ebsVolumes ebsVolumeList, args []string) ([]string, error) { mapping, err := json.Marshal(ebsVolumes) if err != nil { @@ -1627,6 +1678,13 @@ func assignEBSVolumes(opts *vm.CreateOpts, providerOpts *ProviderOpts) ebsVolume v := ebsVolumes.newVolume() v.Disk = providerOpts.DefaultEBSVolume.Disk v.Disk.DeleteOnTermination = true + + // io2/io1 volumes require IOPS to be specified. If not already set, + // calculate based on volume size using AWS-compliant logic. + if v.Disk.IOPs == 0 { + v.Disk.IOPs = calculateProvisionedIOPS(v.Disk.VolumeType, v.Disk.VolumeSize) + } + ebsVolumes = append(ebsVolumes, v) } } diff --git a/pkg/roachprod/vm/aws/support_test.go b/pkg/roachprod/vm/aws/support_test.go index 8b64452e7c01..1ed51e2486dd 100644 --- a/pkg/roachprod/vm/aws/support_test.go +++ b/pkg/roachprod/vm/aws/support_test.go @@ -25,3 +25,103 @@ func TestWriteStartupScriptTemplate(t *testing.T) { echotest.Require(t, string(f), datapathutils.TestDataPath(t, "startup_script")) } + +// TestIOPSCalculation tests that IOPS are calculated correctly for io1/io2 volumes, +// respecting AWS's maximum IOPS-to-size ratio constraints. 
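+// For example (mirroring the cases below): a 500 GB io1/io2 volume gets the 10 IOPS/GB
+// baseline of 5,000 IOPS; a 50 GB io1 volume is capped at 50 GB * 50 IOPS/GB = 2,500 IOPS
+// rather than the 3,000 IOPS floor; and very large volumes are clamped to 64,000 IOPS.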
+func TestIOPSCalculation(t *testing.T) { + testCases := []struct { + name string + volumeType string + volumeSize int + expectedIOPS int + description string + }{ + // io1 volume tests (50 IOPS/GB max) + { + name: "io1_50gb_should_cap_at_2500", + volumeType: "io1", + volumeSize: 50, + expectedIOPS: 2500, // 50 GB * 50 IOPS/GB = 2,500 (not 3,000) + description: "50GB io1 volume should cap at 2,500 IOPS, not apply 3,000 minimum", + }, + { + name: "io1_30gb_should_cap_at_1500", + volumeType: "io1", + volumeSize: 30, + expectedIOPS: 1500, // 30 GB * 50 IOPS/GB = 1,500 (not 3,000) + description: "30GB io1 volume should cap at 1,500 IOPS", + }, + { + name: "io1_59gb_should_cap_at_2950", + volumeType: "io1", + volumeSize: 59, + expectedIOPS: 2950, // 59 GB * 50 IOPS/GB = 2,950 (not 3,000) + description: "59GB io1 volume should cap at 2,950 IOPS", + }, + { + name: "io1_60gb_should_use_3000_minimum", + volumeType: "io1", + volumeSize: 60, + expectedIOPS: 3000, // 60 GB * 50 IOPS/GB = 3,000 (minimum applies) + description: "60GB io1 volume should use 3,000 IOPS minimum", + }, + { + name: "io1_100gb_uses_calculated_1000_iops", + volumeType: "io1", + volumeSize: 100, + expectedIOPS: 3000, // 100 GB * 10 IOPS/GB = 1,000, but minimum is 3,000 + description: "100GB io1 volume uses 3,000 IOPS minimum (calculated 1,000 < 3,000)", + }, + { + name: "io1_500gb_uses_5000_iops", + volumeType: "io1", + volumeSize: 500, + expectedIOPS: 5000, // 500 GB * 10 IOPS/GB = 5,000 + description: "500GB io1 volume uses calculated 5,000 IOPS", + }, + { + name: "io1_max_size_caps_at_64000", + volumeType: "io1", + volumeSize: 10000, + expectedIOPS: 64000, // 10,000 GB * 10 IOPS/GB = 100,000, but max is 64,000 + description: "Large io1 volume should cap at 64,000 IOPS", + }, + + // io2 volume tests (500 IOPS/GB max for standard, using conservative limit) + { + name: "io2_5gb_should_cap_at_2500", + volumeType: "io2", + volumeSize: 5, + expectedIOPS: 2500, // 5 GB * 500 IOPS/GB = 2,500 (not 3,000) + description: "5GB io2 volume should cap at 2,500 IOPS", + }, + { + name: "io2_6gb_should_use_3000_minimum", + volumeType: "io2", + volumeSize: 6, + expectedIOPS: 3000, // 6 GB * 500 IOPS/GB = 3,000 (minimum applies) + description: "6GB io2 volume should use 3,000 IOPS minimum", + }, + { + name: "io2_500gb_uses_5000_iops", + volumeType: "io2", + volumeSize: 500, + expectedIOPS: 5000, // 500 GB * 10 IOPS/GB = 5,000 + description: "500GB io2 volume uses calculated 5,000 IOPS", + }, + { + name: "io2_max_size_caps_at_64000", + volumeType: "io2", + volumeSize: 10000, + expectedIOPS: 64000, // 10,000 GB * 10 IOPS/GB = 100,000, but max is 64,000 + description: "Large io2 volume should cap at 64,000 IOPS", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + iops := calculateProvisionedIOPS(tc.volumeType, tc.volumeSize) + require.Equal(t, tc.expectedIOPS, iops, tc.description) + }) + } +} diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 61e6a23160b1..3f9b4d3bdb0f 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -1545,9 +1545,12 @@ func (p *Provider) computeInstanceArgs( // Add `discard` for Local SSDs on NVMe, as is advised in: // https://cloud.google.com/compute/docs/disks/add-local-ssd extraMountOpts = "discard" - if opts.SSDOpts.NoExt4Barrier { + + // Disable ext4 barriers if specified and using ext4. 
+		if opts.SSDOpts.NoExt4Barrier && opts.SSDOpts.FileSystem == vm.Ext4 {
 			extraMountOpts = fmt.Sprintf("%s,nobarrier", extraMountOpts)
 		}
+
 	} else {
 		// create the "PDVolumeCount" number of persistent disks with the same configuration
 		for i := 0; i < providerOpts.PDVolumeCount; i++ {
diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go
index c7512c0b03dd..1270026fba23 100644
--- a/pkg/roachprod/vm/vm.go
+++ b/pkg/roachprod/vm/vm.go
@@ -315,7 +315,7 @@ type CreateOpts struct {
 	SSDOpts struct {
 		UseLocalSSD bool
 		// NoExt4Barrier, if set, makes the "-o nobarrier" flag be used when
-		// mounting the SSD. Ignored if UseLocalSSD is not set.
+		// mounting the SSD. Ignored if UseLocalSSD is not set or the filesystem is not ext4.
 		NoExt4Barrier bool
 		// The file system to be used. This is set to "ext4" by default.
 		FileSystem Filesystem