187 changes: 151 additions & 36 deletions pkg/cmd/roachtest/spec/cluster_spec.go
@@ -32,6 +32,9 @@ const (

// Extra labels added by roachtest
RoachtestBranch = "roachtest-branch"

MetamorphicVolumeType = "MetamorphicVolumeType"
MetamorphicFilesystem = "MetamorphicFilesystem"
)

type MemPerCPU int
@@ -123,7 +126,9 @@ type ClusterSpec struct {
// to be used. The default is ext4.
FileSystem fileSystemType

RandomlyUseZfs bool
RandomlyUseZfs bool
RandomlyUseXfs bool
RandomizeVolumeType bool

GatherCores bool

@@ -150,11 +155,16 @@ type ClusterSpec struct {
// VolumeIOPS is the provisioned IOPS for ultra-disks.
VolumeIOPS int
} `cloud:"azure"`

ExposedMetamorphicInfo map[string]string `compareIgnore:"true"`
}

// MakeClusterSpec makes a ClusterSpec.
func MakeClusterSpec(nodeCount int, opts ...Option) ClusterSpec {
spec := ClusterSpec{NodeCount: nodeCount}
spec := ClusterSpec{
NodeCount: nodeCount,
ExposedMetamorphicInfo: make(map[string]string),
}
defaultOpts := []Option{CPU(4), WorkloadNodeCPU(4), nodeLifetime(12 * time.Hour), ReuseAny()}
for _, o := range append(defaultOpts, opts...) {
o(&spec)
@@ -165,24 +175,38 @@ func MakeClusterSpec(nodeCount int, opts ...Option) ClusterSpec {
// ClustersCompatible returns true if the clusters are compatible, i.e. the test
// asking for s2 can reuse s1.
func ClustersCompatible(s1, s2 ClusterSpec, cloud Cloud) bool {
// only consider the specification of the cloud that we are running in
// Clear cloud-specific and comparison-irrelevant fields.
clearClusterSpecFields(&s1, cloud)
clearClusterSpecFields(&s2, cloud)
return s1 == s2

// We use `reflect.DeepEqual` instead of a direct `==` comparison because
// ClusterSpec contains map fields, which are not comparable with `==`.
return reflect.DeepEqual(s1, s2)
}

// clearClusterSpecFields clears cloud-specific fields from the cluster spec when they do
// not match the target cloud, so that specifications for other clouds are not considered
// when comparing cluster specifications.
// It also clears fields marked with the `compareIgnore:"true"` tag, which should be excluded
// from cluster compatibility comparisons (e.g., ExposedMetamorphicInfo, which only records
// metamorphic choices).
func clearClusterSpecFields(cs *ClusterSpec, targetCloud Cloud) {
cs.Lifetime = 0
structType := reflect.TypeOf(*cs)
for i := 0; i < structType.NumField(); i++ {
field := structType.Field(i)
fieldValue := reflect.ValueOf(cs).Elem().FieldByName(field.Name)

// Clear fields marked with the compareIgnore tag; these should not affect
// cluster compatibility (e.g., metamorphic info is just metadata)
if _, ok := field.Tag.Lookup("compareIgnore"); ok {
fieldValue.Set(reflect.Zero(fieldValue.Type()))
continue
}

// Clear cloud-specific fields if they don't match the target cloud
if tag, ok := field.Tag.Lookup("cloud"); ok {
// Zero out struct if it is not the target cloud.
if !strings.EqualFold(tag, targetCloud.String()) {
fieldValue := reflect.ValueOf(cs).Elem().FieldByName(field.Name)
fieldValue.Set(reflect.Zero(fieldValue.Type()))
}
}
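
As an aside, the struct-tag-driven clearing used by ClustersCompatible/clearClusterSpecFields can be illustrated with a minimal, self-contained sketch. Everything below (the ExampleSpec type, its fields, clearForCompare) is hypothetical and not part of this PR; it only mirrors the cloud/compareIgnore tag handling shown above.

package main

import (
	"fmt"
	"reflect"
	"strings"
)

// ExampleSpec is a hypothetical stand-in for ClusterSpec.
type ExampleSpec struct {
	NodeCount int
	GCEOpts   struct{ MachineType string } `cloud:"gce"`
	AWSOpts   struct{ MachineType string } `cloud:"aws"`
	Metadata  map[string]string            `compareIgnore:"true"`
}

// clearForCompare zeroes fields tagged compareIgnore as well as cloud-tagged
// fields that do not match targetCloud, mirroring clearClusterSpecFields above.
func clearForCompare(s *ExampleSpec, targetCloud string) {
	v := reflect.ValueOf(s).Elem()
	t := v.Type()
	for i := 0; i < t.NumField(); i++ {
		field := t.Field(i)
		fieldValue := v.Field(i)
		if _, ok := field.Tag.Lookup("compareIgnore"); ok {
			fieldValue.Set(reflect.Zero(fieldValue.Type()))
			continue
		}
		if tag, ok := field.Tag.Lookup("cloud"); ok && !strings.EqualFold(tag, targetCloud) {
			fieldValue.Set(reflect.Zero(fieldValue.Type()))
		}
	}
}

func main() {
	s1 := ExampleSpec{NodeCount: 3, Metadata: map[string]string{"fs": "zfs"}}
	s2 := ExampleSpec{NodeCount: 3, Metadata: map[string]string{"fs": "xfs"}}
	s2.AWSOpts.MachineType = "m6i.xlarge" // ignored when comparing for gce
	clearForCompare(&s1, "gce")
	clearForCompare(&s2, "gce")
	// Both metadata maps and the non-matching AWS field were cleared, so the
	// copies compare equal even though the originals differ.
	fmt.Println(reflect.DeepEqual(s1, s2)) // true
}
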
@@ -206,6 +230,10 @@ func (s ClusterSpec) String() string {
return str
}

// GetMetamorphicInfo returns the metamorphic choices (e.g., randomized volume type
// or filesystem) recorded for this cluster spec.
func (s ClusterSpec) GetMetamorphicInfo() map[string]string {
return s.ExposedMetamorphicInfo
}

// checks if an AWS machine supports SSD volumes
func awsMachineSupportsSSD(machineType string) bool {
typeAndSize := strings.Split(machineType, ".")
@@ -366,14 +394,6 @@ func (s *ClusterSpec) RoachprodOpts(
useIOBarrier := params.UseIOBarrierOnLocalSSD
requestedArch := params.PreferredArch

preferLocalSSD := params.Defaults.PreferLocalSSD
switch s.LocalSSD {
case LocalSSDDisable:
preferLocalSSD = false
case LocalSSDPreferOn:
preferLocalSSD = true
}

createVMOpts := vm.DefaultCreateOpts()
// N.B. We set "usage=roachtest" as the default, custom label for billing tracking.
createVMOpts.CustomLabels = map[string]string{vm.TagUsage: "roachtest"}
@@ -435,7 +455,12 @@ func (s *ClusterSpec) RoachprodOpts(
var err error
switch cloud {
case AWS:
machineType, selectedArch, err = SelectAWSMachineType(s.CPUs, s.Mem, preferLocalSSD && s.VolumeSize == 0, requestedArch)
// We always pass true for shouldSupportLocalSSD here because the machine type selection
// logic should not depend on the user's preference for local SSDs.
// The actual decision to use provisioned local SSDs is handled in the disk configuration logic
// at the provider level, and EBS volumes have priority over local SSDs.
// This means that if both EBS and local SSDs are available, EBS will be used.
machineType, selectedArch, err = SelectAWSMachineType(s.CPUs, s.Mem, true, requestedArch)
case GCE:
machineType, selectedArch = SelectGCEMachineType(s.CPUs, s.Mem, requestedArch)
case Azure:
@@ -452,34 +477,26 @@ func (s *ClusterSpec) RoachprodOpts(
createVMOpts.Arch = string(selectedArch)
}
}

// Local SSD can only be requested
// - if configured to prefer doing so,
// - if no particular volume size is requested, and,
// - on AWS, if the machine type supports it.
// - on GCE, if the machine type is not ARM64.
if preferLocalSSD && s.VolumeSize == 0 && (cloud != AWS || awsMachineSupportsSSD(machineType)) &&
(cloud != GCE || selectedArch != vm.ArchARM64) {
// Ensure SSD count is at least 1 if UseLocalSSD is true.
if ssdCount == 0 {
ssdCount = 1
}
createVMOpts.SSDOpts.UseLocalSSD = true
createVMOpts.SSDOpts.NoExt4Barrier = !useIOBarrier
} else {
createVMOpts.SSDOpts.UseLocalSSD = false
}
}

switch s.FileSystem {
case Ext4:
// ext4 is the default, do nothing unless we randomly want to use zfs
if s.RandomlyUseZfs {
// ext4 is the default, but we can randomly select zfs/xfs if requested.
// Each alternative filesystem gets a 20% chance of being selected,
// leaving the remainder for ext4.
if s.RandomlyUseZfs || s.RandomlyUseXfs {
rng, _ := randutil.NewPseudoRand()
if rng.Float64() <= 0.2 {
randFloat := rng.Float64()

if s.RandomlyUseZfs && randFloat <= 0.2 {
createVMOpts.SSDOpts.FileSystem = vm.Zfs
} else if s.RandomlyUseXfs && randFloat > 0.2 && randFloat <= 0.4 {
createVMOpts.SSDOpts.FileSystem = vm.Xfs
}

s.ExposedMetamorphicInfo[MetamorphicFilesystem] = string(createVMOpts.SSDOpts.FileSystem)
}

case Zfs:
createVMOpts.SSDOpts.FileSystem = vm.Zfs
case Xfs:
@@ -492,11 +509,101 @@ func (s *ClusterSpec) RoachprodOpts(
return vm.CreateOpts{}, nil, nil, "", errors.Errorf("unknown file system type: %v", s.FileSystem)
}

// Determine which storage type to use based on the following priority order:
// 1. Explicit volume type: If the user explicitly set s.VolumeType, use it.
// 2. Forced local SSD: If LocalSSDPreferOn is set, always use local SSD if available.
// 3. Randomized storage: If RandomizeVolumeType is enabled, randomly select
// from available storage types (cloud-specific volumes + optionally local SSD).
// Local SSD is excluded from randomization if LocalSSDDisable is set.
// 4. Default behavior: If params.Defaults.PreferLocalSSD is true AND the user
// did not explicitly disable local SSD, prefer local SSD.
//
// The selected storage type is then validated against availability constraints
// (e.g., volume size, machine type, architecture) before being applied.
selectedVolumeType := ""
switch {
case s.VolumeType != "":
// User explicitly set a volume type, use it directly.
selectedVolumeType = s.VolumeType

case s.LocalSSD == LocalSSDPreferOn:
// User forced local SSD preference.
selectedVolumeType = "local-ssd"

case s.RandomizeVolumeType:
// If the user selected RandomizeVolumeType, randomly pick a volume type
// from the available volume types.
availableVolumeTypes := []string{}

// If the user did not explicitly disable local SSD and local SSD is available
// for the selected cloud provider, machine type and architecture,
// add it to the list of available volume types.
if s.LocalSSD != LocalSSDDisable && s.isLocalSSDAvailable(cloud, machineType, selectedArch) {
availableVolumeTypes = append(availableVolumeTypes, "local-ssd")
}

switch cloud {
case AWS:
availableVolumeTypes = append(availableVolumeTypes, "gp3", "io2")
case GCE:
availableVolumeTypes = append(availableVolumeTypes, "pd-ssd")
case Azure:
availableVolumeTypes = append(availableVolumeTypes, "premium-ssd", "premium-ssd-v2", "ultra-disk")
}

if len(availableVolumeTypes) > 0 {
rng, _ := randutil.NewPseudoRand()
selectedVolumeType = availableVolumeTypes[rng.Intn(len(availableVolumeTypes))]

s.ExposedMetamorphicInfo[MetamorphicVolumeType] = selectedVolumeType
}

case s.LocalSSD != LocalSSDDisable && params.Defaults.PreferLocalSSD:
// No forced preference, no randomization, but default is to use local SSD
// if available.
selectedVolumeType = "local-ssd"
}

// Local SSD will be used if selected (either by preference or randomly), and only if:
// - no particular volume size is requested,
// - on AWS, the machine type supports it, and
// - on GCE, the machine type is not ARM64.
if selectedVolumeType == "local-ssd" {
if s.isLocalSSDAvailable(cloud, machineType, selectedArch) {
if ssdCount == 0 {
ssdCount = 1
}
createVMOpts.SSDOpts.UseLocalSSD = true

// Disable ext4 barriers for local SSDs unless an IO barrier was explicitly requested.
// This is because local SSDs have very low latency and ext4 barriers
// can significantly degrade performance.
// This setting is only relevant if the selected filesystem is ext4.
if !useIOBarrier && createVMOpts.SSDOpts.FileSystem == vm.Ext4 {
createVMOpts.SSDOpts.NoExt4Barrier = true
}
} else {
// Local SSD was selected but is not available; fall back to default volume type.
fmt.Printf(
"WARN: local SSD selected but not available for machine type %s or because volume size %d != 0;"+
"falling back to default volume type\n",
machineType,
s.VolumeSize,
)
createVMOpts.SSDOpts.UseLocalSSD = false
}
} else {
createVMOpts.SSDOpts.UseLocalSSD = false
if selectedVolumeType != "" {
s.VolumeType = selectedVolumeType
}
}

var workloadMachineType string
var err error
switch cloud {
case AWS:
workloadMachineType, _, err = SelectAWSMachineType(s.WorkloadNodeCPUs, s.Mem, preferLocalSSD && s.VolumeSize == 0, selectedArch)
workloadMachineType, _, err = SelectAWSMachineType(s.WorkloadNodeCPUs, s.Mem, false, selectedArch)
case GCE:
workloadMachineType, _ = SelectGCEMachineType(s.WorkloadNodeCPUs, s.Mem, selectedArch)
case Azure:
@@ -542,6 +649,14 @@ func (s *ClusterSpec) RoachprodOpts(
return createVMOpts, providerOpts, workloadProviderOpts, selectedArch, nil
}

// isLocalSSDAvailable reports whether local SSD can be used for the given cloud,
// machine type, and architecture, and only when no explicit volume size is requested.
func (s *ClusterSpec) isLocalSSDAvailable(
cloud Cloud, machineType string, selectedArch vm.CPUArch,
) bool {
return s.VolumeSize == 0 &&
(cloud != AWS || awsMachineSupportsSSD(machineType)) &&
(cloud != GCE || selectedArch != vm.ArchARM64)
}

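For reference, the storage-type precedence implemented inline in RoachprodOpts above can be summarized by the following self-contained sketch. The helper and its parameters are hypothetical (not part of this PR); a single localSSDAvailable boolean stands in for the volume-size/machine-type/architecture checks done by isLocalSSDAvailable, and the fallback-with-warning path is omitted.

package sketch

import "math/rand"

// localSSDChoice is a hypothetical mirror of the spec's local SSD settings.
type localSSDChoice int

const (
	localSSDDefault localSSDChoice = iota
	localSSDDisable
	localSSDPreferOn
)

// pickVolumeType sketches the precedence: explicit volume type, then forced
// local SSD, then randomized choice, then the default local-SSD preference.
func pickVolumeType(
	explicit string,
	localSSD localSSDChoice,
	randomize, preferLocalSSDDefault, localSSDAvailable bool,
	cloudTypes []string,
	rng *rand.Rand,
) string {
	switch {
	case explicit != "":
		return explicit
	case localSSD == localSSDPreferOn:
		return "local-ssd" // availability is validated (and may fall back) afterwards
	case randomize:
		choices := append([]string(nil), cloudTypes...)
		if localSSD != localSSDDisable && localSSDAvailable {
			choices = append(choices, "local-ssd")
		}
		if len(choices) > 0 {
			return choices[rng.Intn(len(choices))]
		}
		return ""
	case localSSD != localSSDDisable && preferLocalSSDDefault:
		return "local-ssd"
	}
	return ""
}
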
// SetRoachprodOptsZones updates the providerOpts with the VM zones as specified in the params/spec.
// We separate this logic from RoachprodOpts as we may need to call this multiple times in order to
// randomize the default GCE zone.
21 changes: 21 additions & 0 deletions pkg/cmd/roachtest/spec/cluster_spec_test.go
@@ -44,3 +44,24 @@ func TestClustersCompatible(t *testing.T) {
require.True(t, ClustersCompatible(s1, s2, AWS))
})
}

func TestClustersRetainClearedInfo(t *testing.T) {
// This test guards against ClustersCompatible mutating its inputs, in case its
// signature is ever changed to take pointers to ClusterSpec in the future.
t.Run("original structs are not modified", func(t *testing.T) {
s1 := ClusterSpec{
NodeCount: 5,
ExposedMetamorphicInfo: map[string]string{"VolumeType": "io2"},
}
s2 := ClusterSpec{
NodeCount: 5,
ExposedMetamorphicInfo: map[string]string{"VolumeType": "gp3"},
}

ClustersCompatible(s1, s2, GCE)

// Original data should still be there
require.Equal(t, "io2", s1.ExposedMetamorphicInfo["VolumeType"])
require.Equal(t, "gp3", s2.ExposedMetamorphicInfo["VolumeType"])
})
}
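
A related check one could add alongside the test above (a hedged sketch, not part of this PR; it relies on the same imports as the existing tests): because ExposedMetamorphicInfo is tagged compareIgnore, specs that differ only in that map should still be considered compatible.

func TestMetamorphicInfoDoesNotAffectCompatibility(t *testing.T) {
	s1 := ClusterSpec{
		NodeCount:              3,
		ExposedMetamorphicInfo: map[string]string{"VolumeType": "io2"},
	}
	s2 := ClusterSpec{
		NodeCount:              3,
		ExposedMetamorphicInfo: map[string]string{"VolumeType": "gp3"},
	}

	// The compareIgnore tag causes the map to be cleared before comparison.
	require.True(t, ClustersCompatible(s1, s2, GCE))
}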
25 changes: 24 additions & 1 deletion pkg/cmd/roachtest/spec/option.go
@@ -204,6 +204,21 @@ func DisableLocalSSD() Option {
}
}

// RandomizeVolumeType is an Option which randomly picks the volume type
// to be used. Unless a local SSD is forced, the volume type is picked randomly
// from the types available for the provider:
// - GCE: pd-ssd, local-ssd
// - AWS: gp3, io2, local-ssd
// - Azure: premium-ssd, premium-ssd-v2, ultra-disk, local-ssd
// - IBM: 10iops-tier
// Note: this option has no effect if VolumeType is explicitly set or
// PreferLocalSSD is used; DisableLocalSSD only excludes local SSD from the
// randomized choices.
func RandomizeVolumeType() Option {
return func(spec *ClusterSpec) {
spec.RandomizeVolumeType = true
}
}
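
A hedged usage sketch: roughly how a test registration might opt into the randomized volume type. The registry.TestSpec shape, r.MakeClusterSpec, and the Run signature are assumed from typical roachtest registrations; other required registration fields are elided.

r.Add(registry.TestSpec{
	Name:    "example/metamorphic-storage",
	Owner:   registry.OwnerTestEng,
	Cluster: r.MakeClusterSpec(4, spec.RandomizeVolumeType()),
	Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
		// The randomly selected volume type is recorded in the spec's
		// ExposedMetamorphicInfo and surfaced in the test parameters of a
		// posted issue on failure.
	},
})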

// TerminateOnMigration ensures VM is terminated in case GCE triggers a live migration.
func TerminateOnMigration() Option {
return func(spec *ClusterSpec) {
@@ -235,13 +250,21 @@ func SetFileSystem(fs fileSystemType) Option {
// RandomlyUseZfs is an Option which randomly picks
// the file system to be used, and sets it to zfs,
// about 20% of the time.
// Zfs is only picked if the cloud is gce.
func RandomlyUseZfs() Option {
return func(spec *ClusterSpec) {
spec.RandomlyUseZfs = true
}
}

// RandomlyUseXfs is an Option which randomly picks
// the file system to be used, and sets it to xfs,
// about 20% of the time.
func RandomlyUseXfs() Option {
return func(spec *ClusterSpec) {
spec.RandomlyUseXfs = true
}
}

// GCEMachineType sets the machine (instance) type when the cluster is on GCE.
func GCEMachineType(machineType string) Option {
return func(spec *ClusterSpec) {
14 changes: 10 additions & 4 deletions pkg/cmd/roachtest/test_runner.go
@@ -321,15 +321,15 @@ func (r *testRunner) Run(
// We should also check against the spec of the cluster, but we don't
// currently have a way of doing that; we're relying on the fact that attaching to the cluster
// will fail if the cluster is incompatible.
spec := tests[0].Cluster
spec.Lifetime = 0
spec1 := tests[0].Cluster
spec1.Lifetime = 0
for i := 1; i < len(tests); i++ {
spec2 := tests[i].Cluster
spec2.Lifetime = 0
if spec != spec2 {
if !spec.ClustersCompatible(spec1, spec2, roachtestflags.Cloud) {
return errors.Errorf("cluster specified but found tests "+
"with incompatible specs: %s (%s) - %s (%s)",
tests[0].Name, spec, tests[i].Name, spec2,
tests[0].Name, spec1, tests[i].Name, spec2,
)
}
}
@@ -2197,6 +2197,12 @@ func getTestParameters(t *testImpl, c *clusterImpl, createOpts *vm.CreateOpts) m
if spec.Cluster.Arch != "" {
clusterParams["arch"] = string(spec.Cluster.Arch)
}

// Include cluster spec metamorphic test parameters.
for k, v := range spec.Cluster.GetMetamorphicInfo() {
clusterParams[k] = v
}

// These params can be probabilistically set, so we pass them here to
// show what their actual values are in the posted issue.
if createOpts != nil {