Problem on Auto tune and custom resampling mlr3 #926

NicoD1995 · 2023-05-04T18:16:37Z

NicoD1995
May 4, 2023

Hi everybody,
I have some problems on my datas, here they are :

    LesionResponse = structure(c(2L, 1L, 2L, 2L, 1L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L),.Label = c("0", 
    "1"), class = "factor"), F1 = c(1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 
    0.625, 0.625, 0.625, 0.625, 0.625, 0.625, 1.25, 0.625, 0.625, 
    1.25, 1.25, 1.25), F2 = c(1, 5, 3, 2, 1, 1, 6, 9, 0, 5, 0, 4, 4, 4, 5, 2, 1, 1), F3 = c(0, 4, 3, 1, 1, 0, 3, 8, 4, 5, 0, 4, 4, 3, 5, 2, 0, 0), F4 = c(0, 9, 0, 7, 4, 0, 3, 8, 4, 5, 9, 1, 1, 3, 5, 3, 9, 0)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L), class = "data.frame")```

It's just a piece of my datas, which has a big dimension 850 features and 1050 rows.
I'm new in the mlr3verse and I'm trying to learn and apply the methods written in the mlr3 book or whatever I find on GitHub.

I've found a way to split my datas, by grouping and by stratifications...not an easy thing, but mlr3 doesn't provide a method to do the two things simultaneously whereas it's a very common thing in medical datas.

Here is my code for those interested :
```set.seed(1234)

split_data <- function(data, patient_col = "PatientID", target_col = "LesionResponse", train_ratio = 0.6, valid_ratio = 0.2) {
repeat {
# Grouper les patients par PatientID et mélanger
patients <- data[[patient_col]]
patient_groups <- split(data, patients)
patient_groups_shuffled <- sample(patient_groups)

# Nombre de patients
n_train <- round(length(patient_groups_shuffled) * train_ratio)
n_valid <- round(length(patient_groups_shuffled) * valid_ratio)
train_groups <- patient_groups_shuffled[1:n_train]
valid_groups <- patient_groups_shuffled[(n_train + 1):(n_train + n_valid)]
test_groups <- patient_groups_shuffled[(n_train + n_valid + 1):length(patient_groups_shuffled)]

# Combinaison dans un data frame
train_set <- do.call(rbind, train_groups)
valid_set <- do.call(rbind, valid_groups)
test_set <- do.call(rbind, test_groups)

# Contrôle de la proportion de 0
proportion_train <- mean(train_set[[target_col]] == "0")
proportion_valid <- mean(valid_set[[target_col]] == "0")
proportion_test <- mean(test_set[[target_col]] == "0")

if (proportion_train >= 0.70 && proportion_train <= 0.75 && 
    proportion_valid >= 0.70 && proportion_valid <= 0.75 && 
    proportion_test >= 0.70 && proportion_test <= 0.75) {
  break
}
}
return(list(train = train_set, validation = valid_set, test = test_set))
}

split_data_sets <- split_data(data)
train_set <- split_data_sets$train
valid_set <- split_data_sets$validation
test_set <- split_data_sets$test
combined_data <- rbind(train_set, valid_set)

#Lignes en commun
train_rows_outer <- as.numeric(which(data$PatientID %in% combined_data$PatientID))
train_rows <-as.numeric(which(data$PatientID %in% train_set$PatientID))
valid_rows <- as.numeric(which(data$PatientID %in% valid_set$PatientID))
test_rows <- as.numeric(which(data$PatientID %in% test_set$PatientID))```

# Check the proportion of "0" in each set
train_proportion <- mean(train_set$LesionResponse == "0")
valid_proportion <- mean(valid_set$LesionResponse == "0")
test_proportion <- mean(test_set$LesionResponse == "0")

cat("Proportion of '0' in Train set: ", train_proportion, "\n")
cat("Proportion of '0' in Validation set: ", valid_proportion, "\n")
cat("Proportion of '0' in Test set: ", test_proportion, "\n")

# Check for common rows
common_train_valid <- intersect(train_set$PatientID, valid_set$PatientID)
common_train_test <- intersect(train_set$PatientID, test_set$PatientID)
common_valid_test <- intersect(valid_set$PatientID, test_set$PatientID)

# Print the number of rows in each set
cat("\nNumber of rows in Train set: ", nrow(train_set), "\n")
cat("Number of rows in Validation set: ", nrow(valid_set), "\n")
cat("Number of rows in Test set: ", nrow(test_set), "\n")

# Print the common rows
cat("\nCommon PatientIDs between Train and Validation sets:\n")
if (length(common_train_valid) == 0) {
  cat("Il n'y a aucun patient en commun.\n")
} else {
  cat("Ces IDs sont communs entre les deux groupes:\n")
  print(common_train_valid)
}

cat("\nCommon PatientIDs between Train and Test sets:\n")
if (length(common_train_test) == 0) {
  cat("Il n'y a aucun patient en commun.\n")
} else {
  cat("Ces IDs sont communs entre les deux groupes:\n")
  print(common_train_test)
}

cat("\nCommon PatientIDs between Validation and Test sets:\n")
if (length(common_valid_test) == 0) {
  cat("Il n'y a aucun patient en commun.\n")
} else {
  cat("Ces IDs sont communs entre les deux groupes:\n")
  print(common_valid_test)
}
data$PatientID <- NULL
train_set$PatientID <- NULL
valid_set$PatientID <- NULL
test_set$PatientID <- NULL

With this, I have no problem to create the three datasets. I normalized them, then, I created inner and outer resamples to have a test_set to estimate final performances.

data<-rbind(train_set, valid_set, test_set)
task <- as_task_classif(data, target = "LesionResponse")

# Création du OUTER resampling via customisation
resampling_outer = rsmp("custom")
resampling_outer$instantiate(task, train = list(train_rows_outer), test = list(test_rows))

#Création du INNER resampling via customisation
resampling_inner = rsmp("custom")
resampling_inner$instantiate(task, train = list(train_rows), test = list(valid_rows))

outer_resampling = resampling_outer

Then, I'm trying to create, tune and run some learners as this one (with the help of "autotune") :

#Auto tuning xgboost
learner_xgboost = lrn("classif.xgboost", nrounds = to_tune(1, 500))

at_xgboost = auto_tuner(
    tuner = tnr("random_search"),
    learner = learner_xgboost,
    resampling = resampling_inner,
    measure = msr("classif.ce"),
    terminator = trm("evals", n_evals = 5),
    store_models = TRUE
)

design = benchmark_grid(task = task, learner = at_xgboost, resampling = resampling_outer)
bmr = benchmark(design, store_models = TRUE)

#Extraction des learners optimisés dans la boucle externe :
data_bmr = as.data.table(bmr)
outer_learners = map(data_bmr$learner, "learner")

#Extraction des learners après optimisation boucle interne :
archives = extract_inner_tuning_archives(bmr)
inner_learners = map(archives$resample_result, "learners")

#Prédiction sur données nouvelles
at_xgboost$train(task, row_ids = train_rows_outer)
at_xgboost$predict(task, row_ids = test_rows)->prediction_xgboost
prediction_xgboost$confusion
autoplot(prediction_xgboost)

The results are very bad...so I tried to study the influence of the imbalanced datas with this code :

# Définition
learner = at_xgboost

# 1. Augmentation de la classe minoritaire
graph_up <- po("classbalancing", id = "oversample", adjust = "minor",  reference = "minor", shuffle = FALSE, ratio = 2) %>>% at_xgboost

# 2. Réduction de la classe majoritaire d'un facteur '1/ratio'
graph_down <- po("classbalancing", id = "undersample", adjust = "major", reference = "major", shuffle = FALSE, ratio = 1/3) %>>% at_xgboost

# 3. Témoin négatif (pas de stratégie particulière)
graph_nop <- at_xgboost

graph_nop <-  GraphLearner$new(graph_nop)
graph_nop$predict_type <- "prob"

graph_down <- GraphLearner$new(graph_down)
graph_down$predict_type <- "prob"

graph_up <- GraphLearner$new(graph_up)
graph_up$predict_type <- "prob"

#Benchmark
bmr <- benchmark(design = benchmark_grid(task, learner = list(graph_nop, graph_up, graph_down), resampling_outer), store_models = TRUE)

#Résultats 
bmr$aggregate(msr("classif.ce"))
bmr$aggregate(msr("classif.auc"))

When I run the code, it doesn't work...I have a warning of this type just when the code reach the bmr part :

Error: Test set 1 of inner resampling 'custom' contains row ids not present in task 'task_test': {651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824}

When I check the train_rows, valid_rows and test_rows, they are all present in the "data" set, and when I check if these rows are in the resampling_inner or outer, it's all ok.

I don't really know where my mistake(s) is/are... If someone could help me ?

EDIT : The piece of set I gave has too low informations to run the code correctly in the sets creation part.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Problem on Auto tune and custom resampling mlr3 #926

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 0 comments

Select a reply

Problem on Auto tune and custom resampling mlr3 #926

NicoD1995 May 4, 2023

Replies: 0 comments

NicoD1995
May 4, 2023