Merged

Dev #49

1 change: 1 addition & 0 deletions .Rbuildignore
@@ -27,3 +27,4 @@ REVIEW.md
^Meta$
^cran-comments\.md$
^CRAN-SUBMISSION$
^checklist\.md$
3 changes: 3 additions & 0 deletions .gitignore
@@ -14,3 +14,6 @@ README_cache
REVIEW.md
/doc/
/Meta/
/cache/
/dev/
*.py$
23 changes: 13 additions & 10 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: pangoling
Title: Access to Large Language Model Predictions
Version: 1.0.1
Version: 1.0.3
Authors@R: c(
person("Bruno", "Nicenboim", , "b.nicenboim@tilburguniversity.edu", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-5176-3943")),
@@ -12,16 +12,19 @@ Authors@R: c(
)
Description: Provides access to word predictability estimates using large
language models (LLMs) based on 'transformer' architectures via
integration with the 'Hugging Face' ecosystem. The package
integration with the 'Hugging Face' ecosystem
<https://huggingface.co/>. The package
interfaces with pre-trained neural networks and supports both
causal/auto-regressive LLMs (e.g., 'GPT-2'; Radford et al., 2019)
and masked/bidirectional LLMs (e.g., 'BERT'; Devlin et al., 2019,
<doi:10.48550/arXiv.1810.04805>) to compute the
probability of words, phrases, or tokens given their linguistic
context. By enabling a straightforward estimation of word
predictability, the package facilitates research in
psycholinguistics, computational linguistics, and natural
language processing (NLP).
causal/auto-regressive LLMs (e.g., 'GPT-2') and
masked/bidirectional LLMs (e.g., 'BERT') to compute the probability
of words, phrases, or tokens given their linguistic context. For
details on GPT-2 and causal models, see Radford et al. (2019)
<https://storage.prod.researchhub.com/uploads/papers/2020/06/01/language-models.pdf>,
for details on BERT and masked models, see
Devlin et al. (2019) <doi:10.48550/arXiv.1810.04805>. By enabling a
straightforward estimation of word predictability, the package
facilitates research in psycholinguistics, computational
linguistics, and natural language processing (NLP).
License: MIT + file LICENSE
URL: https://docs.ropensci.org/pangoling/, https://github.com/ropensci/pangoling
BugReports: https://github.com/ropensci/pangoling/issues
11 changes: 11 additions & 0 deletions NEWS.md
@@ -1,3 +1,14 @@
# pangoling

# pangoling 1.0.3

- Internal changes to comply with CRAN requirements.
- `HF_HOME` is now used to store the models rather than `TRANSFORMERS_CACHE`.

# pangoling 1.0.2

- Internal changes: `OMP_THREAD_LIMIT` was set to 1.

# pangoling 1.0.1

### New Features
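The 1.0.2 entry above caps OpenMP threading because CRAN's check machines flag packages that use too many cores. A minimal Python sketch of the same idea (illustrative only; in the package this is done from R via `Sys.setenv()` in `.onLoad`):

```python
import os

# The limit must be exported before any OpenMP-backed library
# (e.g. torch) initializes its thread pool, which is why it belongs
# in early process setup, the analogue of R's .onLoad hook.
os.environ["OMP_THREAD_LIMIT"] = "1"

print(os.environ["OMP_THREAD_LIMIT"])  # -> 1
```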
12 changes: 6 additions & 6 deletions R/tr_utils.R
@@ -450,7 +450,7 @@ word_lp <- function(words,
#'
#' This function sets the cache directory for HuggingFace transformers. If a
#' path is given, the function checks if the directory exists and then sets the
#' `TRANSFORMERS_CACHE` environment variable to this path.
#' `HF_HOME` environment variable to this path.
#' If no path is provided, the function checks for the existing cache directory
#' in a number of environment variables.
#' If none of these environment variables are set, it provides the user with
@@ -461,7 +461,7 @@ word_lp <- function(words,
#' number of environment variables. Default is NULL.
#'
#' @return Nothing is returned, this function is called for its side effect of
#' setting the `TRANSFORMERS_CACHE` environment variable, or providing
#' setting the `HF_HOME` environment variable, or providing
#' information to the user.
#' @export
#'
@@ -476,10 +476,10 @@ set_cache_folder <- function(path = NULL){
if(!is.null(path)){
if(!dir.exists(path)) stop2("Folder '", path, "' doesn't exist.")
reticulate::py_run_string(
paste0("import os\nos.environ['TRANSFORMERS_CACHE']='",
paste0("import os\nos.environ['HF_HOME']='",
path,"'"))
reticulate::py_run_string(
paste0("import os\nos.environ['HF_HOME']='",path,"'"))
# reticulate::py_run_string(
# paste0("import os\nos.environ['HF_HOME']='",path,"'"))
}
path <- c(Sys.getenv("TRANSFORMERS_CACHE"),
Sys.getenv("HUGGINGFACE_HUB_CACHE"),
@@ -495,7 +495,7 @@ set_cache_folder <- function(path = NULL){
"By default pretrained models are downloaded and locally",
" cached at: ~/.cache/huggingface/hub. ",
"This is the default directory given by the shell ",
"environment variable TRANSFORMERS_CACHE. On Windows, ",
"environment variable HF_HOME. On Windows, ",
"the default directory is given by ",
"C:\\Users\\username\\.cache\\huggingface\\hub.\n",
"For changing the shell environment variables that ",
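The `reticulate::py_run_string()` call in `set_cache_folder()` above executes, inside the embedded Python session, the equivalent of the following sketch (the path is a hypothetical example):

```python
import os

# HF_HOME is the cache root honored by current Hugging Face
# libraries; it supersedes the deprecated TRANSFORMERS_CACHE variable.
os.environ["HF_HOME"] = "/tmp/hf-cache"

print(os.environ["HF_HOME"])  # -> /tmp/hf-cache
```

Because reticulate embeds Python in the R process, a variable set this way should be visible to `transformers` when models are later downloaded.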
4 changes: 2 additions & 2 deletions R/utils.R
@@ -50,7 +50,7 @@
#' @examples
#'
#' # Install with default settings:
#' if (FALSE) {
#' \dontrun{
#' install_py_pangoling()
#' }
#'
@@ -110,7 +110,7 @@ install_py_pangoling <- function(method = c("auto", "virtualenv", "conda"),
)

do.call(reticulate::py_install, py_install_args)
cat("\nInstallation complete.\n\n")
message_verbose("\nInstallation complete.\n\n")

if (restart_session &&
requireNamespace("rstudioapi", quietly = TRUE) &&
8 changes: 2 additions & 6 deletions R/zzz.R
@@ -1,22 +1,18 @@
# global references (will be initialized in .onLoad)
transformers <- NULL
torch <- NULL
# data table :=
.datatable.aware <- TRUE

#' @noRd
.onLoad <- function(libname, pkgname) { # nocov start
# CRAN OMP THREAD LIMIT
Sys.setenv("OMP_THREAD_LIMIT" = 1)
if (is_mac()) {
# Workaround for R's built-in OpenMP conflicts
Sys.setenv(KMP_DUPLICATE_LIB_OK = 'TRUE')
}
reticulate::use_virtualenv("r-pangoling", required = FALSE)

# use superassignment to update global reference
transformers <<- reticulate::import("transformers",
delay_load = TRUE,
convert = FALSE)
inspect <<- reticulate::import("inspect", delay_load = TRUE, convert = TRUE)
torch <<- reticulate::import("torch", delay_load = TRUE, convert = FALSE)
# TODO message or something if it's not installed
# ask about the env
11 changes: 10 additions & 1 deletion README.Rmd
@@ -20,6 +20,9 @@ knitr::opts_chunk$set(
[![Project Status: active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
[![DOI](https://zenodo.org/badge/497831295.svg)](https://zenodo.org/badge/latestdoi/497831295)
[![Status at rOpenSci Software Peer Review](https://badges.ropensci.org/575_status.svg)](https://github.com/ropensci/software-review/issues/575)
[![CRAN status](https://www.r-pkg.org/badges/version/pangoling)](https://CRAN.R-project.org/package=pangoling)
[![metacran downloads](https://cranlogs.r-pkg.org/badges/grand-total/pangoling)](https://cran.r-project.org/package=pangoling)

<!-- badges: end -->


@@ -40,7 +43,13 @@ The training data of the most popular models (such as GPT-2) haven't been releas

## Installation

There is still no CRAN version of `pangoling`. To install the latest version from github use:
To install the latest CRAN version of `pangoling` use:

```{r, eval = FALSE}
install.packages("pangoling")
```

To install the latest version from GitHub use:

```{r, eval = FALSE}
install.packages("pangoling", repos = "https://ropensci.r-universe.dev")
21 changes: 16 additions & 5 deletions README.md
@@ -13,6 +13,11 @@ active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repost
[![DOI](https://zenodo.org/badge/497831295.svg)](https://zenodo.org/badge/latestdoi/497831295)
[![Status at rOpenSci Software Peer
Review](https://badges.ropensci.org/575_status.svg)](https://github.com/ropensci/software-review/issues/575)
[![CRAN
status](https://www.r-pkg.org/badges/version/pangoling)](https://CRAN.R-project.org/package=pangoling)
[![metacran
downloads](https://cranlogs.r-pkg.org/badges/grand-total/pangoling)](https://cran.r-project.org/package=pangoling)

<!-- badges: end -->

`pangoling`[^1] is an R package for estimating the predictability of
@@ -40,8 +45,13 @@ website](https://huggingface.co/gpt2).

## Installation

There is still no CRAN version of `pangoling`. To install the latest
version from github use:
To install the latest CRAN version of `pangoling` use:

``` r
install.packages("pangoling")
```

To install the latest version from GitHub use:

``` r
install.packages("pangoling", repos = "https://ropensci.r-universe.dev")
@@ -107,6 +117,7 @@ df_sent <- df_sent |>
#> `The apple doesn't fall far from the tree.`
#> Text id: 2
#> `Don't judge a book by its cover.`
#> ***
df_sent
#> # A tidytable: 15 × 3
#> sent_n word lp
@@ -135,9 +146,9 @@ citation("pangoling")
Users are encouraged to not only cite pangoling, but also the python
package `transformers` (and the specific LLM they are using):

Nicenboim B (2025). _pangoling: Access to large language model
predictions in R_. doi:10.5281/zenodo.7637526
<https://doi.org/10.5281/zenodo.7637526>, R package version 1.0.1,
Nicenboim B (2025-04-07 17:00:02 UTC). _pangoling: Access to large
language model predictions in R_. doi:10.5281/zenodo.7637526
<https://doi.org/10.5281/zenodo.7637526>, R package version 1.0.3,
<https://github.com/ropensci/pangoling>.

Wolf T, Debut L, Sanh V, Chaumond J, Delangue C, Moi A, Cistac P,
11 changes: 11 additions & 0 deletions checklist.md
@@ -0,0 +1,11 @@
# usethis::use_release_issue()
# For usethis::use_release_issue()
release_bullets <- function() {
c(
"Update static imports: `devtools::spell_check()`",
"`codemetar::write_codemeta()`",
"Precompile vignettes"
)
}

https://r-pkgs.org/release.html
16 changes: 11 additions & 5 deletions codemeta.json
@@ -2,19 +2,25 @@
"@context": "https://doi.org/10.5063/schema/codemeta-2.0",
"@type": "SoftwareSourceCode",
"identifier": "pangoling",
"description": "Provides access to word predictability estimates using large language models (LLMs) based on 'transformer' architectures via integration with the 'Hugging Face' ecosystem. The package interfaces with pre-trained neural networks and supports both causal/auto-regressive LLMs (e.g., 'GPT-2'; Radford et al., 2019) and masked/bidirectional LLMs (e.g., 'BERT'; Devlin et al., 2019, <doi:10.48550/arXiv.1810.04805>) to compute the probability of words, phrases, or tokens given their linguistic context. By enabling a straightforward estimation of word predictability, the package facilitates research in psycholinguistics, computational linguistics, and natural language processing (NLP).",
"description": "Provides access to word predictability estimates using large language models (LLMs) based on 'transformer' architectures via integration with the 'Hugging Face' ecosystem <https://huggingface.co/>. The package interfaces with pre-trained neural networks and supports both causal/auto-regressive LLMs (e.g., 'GPT-2') and masked/bidirectional LLMs (e.g., 'BERT') to compute the probability of words, phrases, or tokens given their linguistic context. For details on GPT-2 and causal models, see Radford et al. (2019) <https://storage.prod.researchhub.com/uploads/papers/2020/06/01/language-models.pdf>, for details on BERT and masked models, see Devlin et al. (2019) <doi:10.48550/arXiv.1810.04805>. By enabling a straightforward estimation of word predictability, the package facilitates research in psycholinguistics, computational linguistics, and natural language processing (NLP).",
"name": "pangoling: Access to Large Language Model Predictions",
"relatedLink": "https://docs.ropensci.org/pangoling/",
"relatedLink": ["https://docs.ropensci.org/pangoling/", "https://CRAN.R-project.org/package=pangoling"],
"codeRepository": "https://github.com/ropensci/pangoling",
"issueTracker": "https://github.com/ropensci/pangoling/issues",
"license": "https://spdx.org/licenses/MIT",
"version": "1.0.1",
"version": "1.0.3",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "R",
"url": "https://r-project.org"
},
"runtimePlatform": "R version 4.4.1 (2024-06-14)",
"provider": {
"@id": "https://cran.r-project.org",
"@type": "Organization",
"name": "Comprehensive R Archive Network (CRAN)",
"url": "https://cran.r-project.org"
},
"author": [
{
"@type": "Person",
@@ -241,7 +247,7 @@
},
"SystemRequirements": null
},
"fileSize": "632.284KB",
"fileSize": "632.634KB",
"citation": [
{
"@type": "SoftwareSourceCode",
@@ -256,7 +262,7 @@
"name": "{pangoling}: {Access} to large language model predictions in {R}",
"identifier": "10.5281/zenodo.7637526",
"url": "https://github.com/ropensci/pangoling",
"description": "R package version 1.0.1",
"description": "R package version 1.0.3",
"@id": "https://doi.org/10.5281/zenodo.7637526",
"sameAs": "https://doi.org/10.5281/zenodo.7637526"
},
50 changes: 50 additions & 0 deletions cran-comments.md
@@ -1,5 +1,55 @@
## Resubmission

This is a resubmission. I addressed the following comments:

> Please write references in the description of the DESCRIPTION file in the form
> authors (year) <doi:...>
> authors (year, ISBN:...)
> or if those are not available: authors (year) <https:...>
> with no space after 'doi:', 'https:' and angle brackets for auto-linking. (If you want to add a title as well please put it in quotes: "Title") -> please add some form of linking to Radford et al., 2019 and write the years in parentheses.
> For more details: <https://contributor.r-project.org/cran-cookbook/description_issues.html#references>

**Answer**:
This has been fixed.

> Please provide a link to the used webservices to the description field of your DESCRIPTION file in the form
> <http:...> or <https:...>
> with angle brackets for auto-linking and no space after 'http:' and 'https:'.
> For more details: <https://contributor.r-project.org/cran-cookbook/description_issues.html#references>

**Answer**:
I have added the link <https://huggingface.co/> to the description.

> You have examples wrapped in if(FALSE). Please never do that. Ideally find toy examples that can be regularly executed and checked. Lengthy examples (> 5 sec), can be wrapped in \donttest{}. \dontrun{} can be used if the example really cannot be executed (e.g. because of missing additional software, missing API keys, ...) by the user.

**Answer**:
There was one example, wrapped in `if(FALSE)`, that triggered the installation of Python packages. This has been changed to `\dontrun{}`. But note that very few examples can actually be run on CRAN, since almost all functions depend on additional software (Python packages).


> You write information messages to the console that cannot be easily suppressed.
> It is more R like to generate objects that can be used to extract the information a user is interested in, and then print() that object. Instead of cat() rather use message()/warning() or if(verbose)cat(..) (or maybe stop()) if you really have to write text to the console. (except for print, summary, interactive functions) -> R/utils.R
> For more details: <https://contributor.r-project.org/cran-cookbook/code_issues.html#using-printcat>

**Answer**:
I changed the `cat()` call to `message()` through the wrapper function `message_verbose()`.

> Please do not modify the global environment (e.g. by using <<-) in your functions. This is not allowed by the CRAN policies. -> R/zzz.R

**Answer**:
The `.onLoad` function had:
`inspect <<- reticulate::import("inspect", delay_load = TRUE, convert = TRUE)`
but no corresponding `inspect <- NULL` binding in the parent scope, which caused `<<-` to keep searching parent environments until it reached `globalenv()`.
I removed it. (I also removed `transformers <<-` because it wasn't necessary.)

Notice that the `.onLoad` function in zzz.R still has
`torch <<- reticulate::import("torch", delay_load = TRUE, convert = FALSE)`
but there is a corresponding `torch <- NULL` binding in the parent scope.
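The scoping rule described above (`<<-` rebinds the nearest enclosing binding and falls through to the global environment only when none exists) has a loose Python analogue in `nonlocal`; this is an analogy only, since Python has no direct counterpart of `<<-`:

```python
# Like the `torch <- NULL` placeholder in the package namespace, the
# enclosing `count` binding gives the inner assignment a non-global
# target to rebind.
def make_counter():
    count = 0
    def bump():
        nonlocal count  # without the enclosing binding: SyntaxError
        count += 1
        return count
    return bump

bump = make_counter()
print(bump(), bump())  # -> 1 2
```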

## R CMD check results

0 errors | 0 warnings | 1 note

* This is a new release.



2 changes: 1 addition & 1 deletion man/install_py_pangoling.Rd


2 changes: 1 addition & 1 deletion man/pangoling-package.Rd


4 changes: 2 additions & 2 deletions man/set_cache_folder.Rd


1 change: 0 additions & 1 deletion tests/testthat/test-tr_utils.R
@@ -57,7 +57,6 @@ test_that("set_cache_folder sets and retrieves the cache folder correctly", {
expect_silent(set_cache_folder(temp_dir))

# Check if environment variables were set correctly
transformers_cache <- Sys.getenv("TRANSFORMERS_CACHE")
hf_home <- Sys.getenv("HF_HOME")

expect_equal(transformers_cache, temp_dir)
1 change: 0 additions & 1 deletion tests/testthat/test-zzz.R
@@ -1,5 +1,4 @@
test_that("`.onLoad` initializes correctly", {
expect_true(!is.null(pangoling:::transformers))
expect_true(!is.null(pangoling:::torch))

# Test options are set correctly
1 change: 0 additions & 1 deletion vignettes/.gitignore
@@ -1,2 +1 @@
*.html
*.R