diff --git a/.buildlibrary b/.buildlibrary index 6641f4b..ff690f5 100644 --- a/.buildlibrary +++ b/.buildlibrary @@ -1,4 +1,4 @@ -ValidationKey: '67118392' +ValidationKey: '67319980' AcceptedWarnings: - 'Warning: package ''.*'' was built under R version' - 'Warning: namespace ''.*'' is not available and has been replaced' diff --git a/CITATION.cff b/CITATION.cff index a824da2..e012e24 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,8 +2,8 @@ cff-version: 1.2.0 message: If you use this software, please cite it using the metadata from this file. type: software title: 'madrat: May All Data be Reproducible and Transparent (MADRaT) *' -version: 3.28.4 -date-released: '2025-12-16' +version: 3.29.0 +date-released: '2026-01-09' abstract: Provides a framework which should improve reproducibility and transparency in data processing. It provides functionality such as automatic meta data creation and management, rudimentary quality management, data caching, work-flow management diff --git a/DESCRIPTION b/DESCRIPTION index da77b49..b863876 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Type: Package Package: madrat Title: May All Data be Reproducible and Transparent (MADRaT) * -Version: 3.28.4 -Date: 2025-12-16 +Version: 3.29.0 +Date: 2026-01-09 Authors@R: c( person("Jan Philipp", "Dietrich", , "dietrich@pik-potsdam.de", role = c("aut", "cre"), comment = c(affiliation = "Potsdam Institute for Climate Impact Research", ORCID = "0000-0002-4309-6431")), @@ -45,6 +45,7 @@ Depends: Imports: callr, digest, + filelock (>= 1.0.3), igraph (>= 2.1.1), Matrix, methods, diff --git a/R/retrieveData.R b/R/retrieveData.R index 6131abf..a376933 100644 --- a/R/retrieveData.R +++ b/R/retrieveData.R @@ -94,8 +94,10 @@ retrieveData <- function(model, rev = 0, dev = "", cachetype = "def", puc = iden if (length(matchingPUCs) == 1) { vcat(-2, " - data will be created from existing puc (", matchingPUCs, ").", fill = 300) - do.call(pucAggregate, c(list(puc = file.path(getConfig("pucfolder"), matchingPUCs)), renv = renv, - strict = NULL, cfg$input[cfg$pucArguments])) + .withLockedPuc(matchingPUCs, function() { + do.call(pucAggregate, c(list(puc = file.path(getConfig("pucfolder"), matchingPUCs)), + renv = renv, strict = NULL, cfg$input[cfg$pucArguments])) + }) return(invisible(paste0(outputfolder, ".tgz"))) } } @@ -176,31 +178,33 @@ retrieveData <- function(model, rev = 0, dev = "", cachetype = "def", puc = iden } pucName <- paste0(cfg$pucName, ".puc") pucPath <- file.path(getConfig("pucfolder"), pucName) - if (!file.exists(pucPath)) { - cacheFiles <- readLines(pucFiles) - if (all(file.exists(cacheFiles))) { - vcat(1, " - create puc (", pucPath, ")", fill = 300, show_prefix = FALSE) - with_tempdir({ - file.copy(cacheFiles, ".") - otherFiles <- c("config.rds", "diagnostics.log") - file.copy(file.path(outputfolder, otherFiles), ".") - - .fillRenvCache(requiredPackages = attr(cfg$functionName, "package")) - - missingFiles <- basename(cacheFiles)[!file.exists(basename(cacheFiles))] - if (length(missingFiles) == 0) { - # create the actual puc file: a tar gz archive containing config, diagnostics, renv.lock, and all - # required madrat cache files - .tarAndVerify(pucPath) - } else { - vcat(1, "puc file not created, some cache files are missing:\n", - paste(missingFiles, collapse = "\n")) - } - }, tmpdir = madTempDir()) - } else { - vcat(1, "puc file not created: could not find all relevant files.") + .withLockedPuc(pucName, function() { + if (!file.exists(pucPath)) { + cacheFiles <- readLines(pucFiles) + if (all(file.exists(cacheFiles))) { + vcat(1, " - create puc (", pucPath, ")", fill = 300, show_prefix = FALSE) + with_tempdir({ + file.copy(cacheFiles, ".") + otherFiles <- c("config.rds", "diagnostics.log") + file.copy(file.path(outputfolder, otherFiles), ".") + + .fillRenvCache(requiredPackages = attr(cfg$functionName, "package")) + + missingFiles <- basename(cacheFiles)[!file.exists(basename(cacheFiles))] + if (length(missingFiles) == 0) { + # create the actual puc file: a tar gz archive containing config, diagnostics, renv.lock, and all + # required madrat cache files + .tarAndVerify(pucPath) + } else { + vcat(1, "puc file not created, some cache files are missing:\n", + paste(missingFiles, collapse = "\n")) + } + }, tmpdir = madTempDir()) + } else { + vcat(1, "puc file not created: could not find all relevant files.") + } } - } + }) } else { vcat(1, "puc file not created: could not find list of files to be added.") } @@ -394,3 +398,32 @@ retrieveData <- function(model, rev = 0, dev = "", cachetype = "def", puc = iden return(returnCode) } + +.withLockedPuc <- function(pucName, fn) { + # This could be improved by using separate locks for read and + # write access to not block simultaneous reads. This would + # however significantly increase the complexity of this mechanism. + + # Ensure .locks folder exists + lockFileFolder <- file.path(getConfig("pucfolder"), ".locks") + if (!dir.exists(lockFileFolder)) { + dir.create(lockFileFolder) + } + + # Get the lock + lockFilePath <- file.path(getConfig("pucfolder"), ".locks", paste0(pucName, ".lock")) + lock <- filelock::lock(lockFilePath, timeout = 6 * 60 * 60 * 1000) # Wait for 6h + if (is.null(lock)) { + # This should really not happen, as the Inf timeout should + # ensure we always get a lock. + stop("Failed to acquire lock for puc file ", lockFilePath) + } + + # Execute the fn with the lock + tryCatch({ + return(fn()) + }, + finally = { + filelock::unlock(lock) + }) +} diff --git a/README.md b/README.md index 898abc1..9756894 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # May All Data be Reproducible and Transparent (MADRaT) * -R package **madrat**, version **3.28.4** +R package **madrat**, version **3.29.0** [![CRAN status](https://www.r-pkg.org/badges/version/madrat)](https://cran.r-project.org/package=madrat) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1115490.svg)](https://doi.org/10.5281/zenodo.1115490) [![R build status](https://github.com/pik-piam/madrat/workflows/check/badge.svg)](https://github.com/pik-piam/madrat/actions) [![codecov](https://codecov.io/gh/pik-piam/madrat/branch/master/graph/badge.svg)](https://app.codecov.io/gh/pik-piam/madrat) [![r-universe](https://pik-piam.r-universe.dev/badges/madrat)](https://pik-piam.r-universe.dev/builds) @@ -55,7 +55,7 @@ In case of questions / problems please contact Jan Philipp Dietrich , Version: 3.28.4, . +Dietrich J, Sauer P, Baumstark L, Wirth S, Giannousakis A, Rodrigues R, Bodirsky B, Leip D, Crawford M, Kreidenweis U, Klein D, Rein P (2026). "madrat: May All Data be Reproducible and Transparent (MADRaT) *." doi:10.5281/zenodo.1115490 , Version: 3.29.0, . A BibTeX entry for LaTeX users is @@ -64,9 +64,9 @@ A BibTeX entry for LaTeX users is title = {madrat: May All Data be Reproducible and Transparent (MADRaT) *}, author = {Jan Philipp Dietrich and Pascal Sauer and Lavinia Baumstark and Stephen Wirth and Anastasis Giannousakis and Renato Rodrigues and Benjamin Leon Bodirsky and Debbora Leip and Michael Crawford and Ulrich Kreidenweis and David Klein and Patrick Rein}, doi = {10.5281/zenodo.1115490}, - date = {2025-12-16}, - year = {2025}, + date = {2026-01-09}, + year = {2026}, url = {https://github.com/pik-piam/madrat}, - note = {Version: 3.28.4}, + note = {Version: 3.29.0}, } ``` diff --git a/tests/testthat/test-puc.R b/tests/testthat/test-puc.R index cda95d0..9b291a3 100644 --- a/tests/testthat/test-puc.R +++ b/tests/testthat/test-puc.R @@ -17,6 +17,121 @@ test_that("puc creation works", { expect_message(pucAggregate("rev42_extra_example_tag.puc", extra = "blub", regionmapping = "regionmappingH12.csv", renv = FALSE), "already available") expect_true(file.exists(file.path(getConfig("outputfolder"), "rev42_h12_7a5441e5_example_customizable_tag.tgz"))) + expect_message(retrieveData("example", rev = 42, extra = "test2", renv = FALSE), "Run pucAggregate") expect_true(file.exists(file.path(getConfig("outputfolder"), "rev42_h12_5f3d77a0_example_customizable_tag.tgz"))) }) + +test_that("puc creation is thread-safe", { + skip_on_cran() + + # Tip for debugging this test: + # If something breaks in one of the sub-processes, use p1$get_result() + # to check what happened. + + # Store paths + lockDirName <- ".locks" + lockDir <- file.path(getConfig("pucfolder"), lockDirName) + + # Set up utility functions + .waitForMessage <- function(process, message) { + while (!(length(messages <- process$read_output_lines()) > 0 && any(messages == message))) { + Sys.sleep(0.1) + } + } + + # Set up PUC-folder as working directory + unlink(list.files(getConfig("pucfolder"), full.names = TRUE)) # To ensure no remaining pucs are in there + unlink(list.files(lockDir, full.names = TRUE)) # To ensure no remaining locks + dir.create(lockDir, showWarnings = FALSE) + + # Set up locks + firstCheckpoint <- filelock::lock(file.path(lockDir, "checkpoint1.lock")) + secondCheckpoint <- filelock::lock(file.path(lockDir, "checkpoint2.lock")) + + p1 <- callr::r_bg(function(madratConfig) { + # Set up environment + pkgload::load_all("../..") + do.call(madrat::setConfig, madratConfig) + lockDir <- file.path(getConfig("pucfolder"), ".locks") + + # Set up .withLockedPuc wrapper to inject control logic + # into the passed function + originalWithLockedPuc <- madrat:::.withLockedPuc + assignInNamespace(".withLockedPuc", function(pucName, fn) { + originalWithLockedPuc(pucName, function() { + if (any(grepl("pucAggregate", deparse(fn), fixed = TRUE))) { + # Only interested in creation, so we do a quick return for reading the puc. + return(fn()) + } + + tryCatch({ + cat("ready\n") + firstCheckpoint <- filelock::lock(file.path(lockDir, "checkpoint1.lock")) + fn() + cat("fn done\n") + secondCheckpoint <- filelock::lock(file.path(lockDir, "checkpoint2.lock")) + }, + finally = { + filelock::unlock(firstCheckpoint) + filelock::unlock(secondCheckpoint) + }) + }) + }, ns = "madrat") + + # Start + madrat::retrieveData("example", rev = 45) + }, args = list(madratConfig = getConfig())) + + # Wait for p1 to signal that it is ready, i.e. it has entered the critical section + .waitForMessage(p1, "ready") + + # This is a copy of testFunction1 except for the call at the end + p2 <- callr::r_bg(function(madratConfig) { + # Set up environment + pkgload::load_all("../..") + do.call(madrat::setConfig, madratConfig) + + # Set up .withLockedPuc wrapper to inject control logic + # into the passed function + originalWithLockedPuc <- madrat:::.withLockedPuc + assignInNamespace(".withLockedPuc", function(pucName, fn) { + originalWithLockedPuc(pucName, function() { + if (any(grepl("pucAggregate", deparse(fn), fixed = TRUE))) { + # Only interested in creation, so we do a quick return for reading the puc. + return(fn()) + } + fn() + }) + }, ns = "madrat") + + # Start + cat("ready\n") + madrat::retrieveData("example", rev = 45) + }, args = list(madratConfig = getConfig())) + + # Wait for p2 to signal that it is ready, i.e. it was at the point where it could execute + # the critical section (there is no guarantee that it tried getting in yet, if this test + # is flaky, this is one of the critical spots). + .waitForMessage(p2, "ready") + + filelock::unlock(firstCheckpoint) + + # Wait for p1 to signal that it is done, i.e. it has executed fn + .waitForMessage(p1, "fn done") + + Sys.sleep(1) # Give p2 some time to fall asleep (start waiting for the puc lock) + + expect_true(p2$get_status() == "sleeping") + expect_true(file.exists(file.path(getConfig("pucfolder"), "rev45_extra_example_tag.puc"))) + unlink("rev45_extra_example_tag.puc") + + filelock::unlock(secondCheckpoint) + p1$wait() + p2$wait() + + expect_true(file.exists(file.path(getConfig("pucfolder"), "rev45_extra_example_tag.puc"))) + + expect_false(p1$is_alive()) + expect_false(p2$is_alive()) +})