Merge pull request #40 from DataONEorg/develop

Release 1.0.0
DataONEorg · Sep 10, 2024 · 7389c94 · 7389c94
2 parents 9a9ae0e + 5d5e063
commit 7389c94
Show file tree

Hide file tree

Showing 36 changed files with 898 additions and 293 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -6,3 +6,6 @@
 ^CONTRIBUTING\.md$
 ^api-scopus-search.sh$
 ^\.github$
+^logo$
+^data$
+^results$
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -19,7 +19,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
       - uses: actions/checkout@v2
-      - uses: r-lib/actions/setup-r@v1
+      - uses: r-lib/actions/setup-r@v2
       - name: Install dependencies
         run: |
           install.packages(c("remotes", "rcmdcheck"))

diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,6 @@
 /results
 
 key.txt
-
+data-raw
 # Ignore Vim's swap files
 .*.swp
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,25 +1,29 @@
 Package: scythe
 Title: Harvest and register data package citations 
-Version: 0.9.1
+Version: 1.0.0
 Authors@R: c(
-    person("Jeanette", "Clark", role = c("aut", "cre"), email = "jeanetteclark@nceas.ucsb.edu", comment=c(ORCID = "0000-0003-4703-1974")),
+    person("Jeanette", "Clark", role = c("aut", "cre"), email = "jclark@nceas.ucsb.edu", comment=c(ORCID = "0000-0003-4703-1974")),
     person("Matthew B.", "Jones", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0003-0077-4738")),
-    person("Maya", "Samet", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0002-5248-9712"))
+    person("Maya", "Samet", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0002-5248-9712")),
+    person("Althea", "Marks", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0002-9370-9128"))
     )
-Description: Harvests data package citations from several API sources, including PLOS, Scopus, and Springer.
+Description: Harvests data package citations from several API sources, including PLOS, Scopus, and Springer. This package uses modified functions from `rplos`, which is no longer maintained.
 License: Apache License (>= 2.0)
 Encoding: UTF-8
 LazyData: true
 Imports:
-      bib2df,
       curl,
       dplyr,
       jsonlite,
       keyring,
       rcrossref,
-      rplos
-Suggests: 
+      solrium,
+      stats
+Remotes: ropensci/bib2df@a8e96e13f5
+Suggests:
+    bib2df,
     covr,
     purrr,
-    testthat (>= 2.1.0)
-RoxygenNote: 7.1.1
+    testthat (>= 3.0.0)
+RoxygenNote: 7.3.1
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,10 +4,11 @@ export(citation_search)
 export(citation_search_plos)
 export(citation_search_scopus)
 export(citation_search_springer)
+export(citation_search_xdd)
 export(scythe_get_key)
 export(scythe_set_key)
 export(write_citation_pairs)
 import(dplyr)
 importFrom(curl,curl)
 importFrom(jsonlite,fromJSON)
-importFrom(rplos,searchplos)
+importFrom(stats,complete.cases)
diff --git a/R/citation_search.R b/R/citation_search.R
@@ -1,7 +1,8 @@
 #' Search for citations in text across all APIs
 #'
 #' @param identifiers a vector of identifiers to be searched for
-#' @param sources a vector indicating which sources to query (one or more of plos, scopus, springer)
+#' @param sources a vector indicating which sources to query (one or
+#' more of plos, scopus, springer)
 #' @return tibble of matching dataset and publication identifiers
 #'
 #' @export
@@ -11,42 +12,44 @@
 #' result <- citation_search(identifiers, sources = c("plos"))
 #' }
 citation_search <- function(identifiers,
-                            sources = c("plos", "scopus", "springer")) {
-
-  if(!("character" %in% class(identifiers))){
-    stop("Identifiers must be a character vector.")
-  }
-
-  # run the 'citation_search_*' function for each source
-  for (source in sources) {
-    search_function <- paste0(source, " <- citation_search_", source, "(identifiers)")
-    eval(parse(text = search_function))
+                            sources = c("plos", "scopus", "springer", "xdd")) {
+  stopifnot(is.character(identifiers))
+
+  unid <- sources[!(sources %in% c("plos", "scopus", "springer", "xdd"))]
+  if (length(unid) > 0) {
+    stop(paste("Source", unid, "is not recognized.", collapse = ". "))
   }
 
-  # combine all of the resulting data frames and return the result df
-  bind_function <- paste0("rbind(", paste0(sources, collapse = ","), ")")
-  result <- eval(parse(text = bind_function))
+  search_funs <- sapply(sources, function(source) {
+    get(paste0("citation_search_", source), mode = "function")
+  })
 
-  return(result)
+  # Run each search, producing a list of dataframes
+  result_df_list <- lapply(search_funs, function(search_fun) {
+    search_fun(identifiers)
+  })
+
+  # Combine the resulting data frames and return the result df
+  result <- dplyr::bind_rows(result_df_list)
 
+  return(result)
 }
 
 # Check identifiers to remove characters that interfere with query strings
 
-check_identifiers <- function(identifiers){
-  if (any(!grepl("10\\.|urn:uuid", identifiers))){
-    warning(call. = FALSE,
-            "One or more identifiers does not appear to be a DOI or uuid",
-            immediate. = TRUE)
+check_identifiers <- function(identifiers) {
+  if (any(!grepl("10\\.|urn:uuid", identifiers))) {
+    warning(
+      call. = FALSE,
+      "One or more identifiers does not appear to be a DOI or uuid",
+      immediate. = TRUE
+    )
   }
 
-  if (any(grepl("doi:|urn:uuid", identifiers))){
+  if (any(grepl("doi:|urn:uuid", identifiers))) {
     identifiers <- gsub("(doi:)|(urn:uuid:)", "", identifiers)
-    message("Identifier prefix (doi: or urn:uuid) has been stripped out of the search term.")
   }
 
 
   return(identifiers)
 }
-
-
diff --git a/R/citation_search_plos.R b/R/citation_search_plos.R
@@ -2,57 +2,186 @@
 #'
 #' This function searches for citations in PLOS. Requests are throttled
 #' at one identifier every 6 seconds so as to not overload the PLOS
-#' API.
+#' API. This function uses modified source code from the `rplos` package,
+#' which is no longer maintained.
 #'
 #' @param identifiers a vector of identifiers to be searched for
 #'
 #' @return tibble of matching dataset and publication identifiers
 #' @export
-#' @importFrom rplos searchplos
 #' @examples
 #' \dontrun{
 #' identifiers <- c("10.18739/A22274", "10.18739/A2D08X", "10.5063/F1T151VR")
 #' result <- citation_search_plos(identifiers)
 #' }
 citation_search_plos <- function(identifiers) {
-    if (length(identifiers) > 1){
-        message(paste0("Your result will take ~", length(identifiers)*6 ," seconds to return,
-                   since this function is rate limited to one call every 6 seconds."))
-    }
+  wait_seconds <- 6
+  report_est_wait(length(identifiers), wait_seconds)
 
-    identifiers <- check_identifiers(identifiers)
+  identifiers <- check_identifiers(identifiers)
 
-    # search for identifier
-    results <- lapply(identifiers, function(x){
-        Sys.sleep(6)
-        v <- rplos::searchplos(q = x,
-                               fl = c("id","title"),
-                               limit = 1000)
-        return(v)
+  # encode colons to not break PLOS API
+  identifiers <- gsub(":", "%3A", identifiers)
 
-    }
+  # search for identifier
+  results <- lapply(identifiers, function(x) {
+    Sys.sleep(wait_seconds)
+    v <- searchplos(
+      q = x,
+      fl = c("id", "title"),
+      limit = 1000
     )
+    return(v)
+  })
+
+  plos_results <- list()
+  # assign dataset identifier to each result
+  for (i in 1:length(results)) {
+    if (results[[i]]$meta$numFound == 0 | is.null(results[[i]])) {
+      plos_results[[i]] <- data.frame(
+        id = NA,
+        dataset_id = identifiers[i],
+        title = NA,
+        source = "plos"
+      )
+    } else if (results[[i]]$meta$numFound > 0) {
+      plos_results[[i]] <- results[[i]]$data
+      plos_results[[i]]$dataset_id <- identifiers[i]
+      plos_results[[i]]$source <- "plos"
+    }
+  }
+
+  # bind resulting tibbles
+  plos_results <- do.call(rbind, plos_results)
+  names(plos_results)[which(names(plos_results) == "id")] <-
+    "article_id"
+  names(plos_results)[which(names(plos_results) == "title")] <-
+    "article_title"
+  plos_results <-
+    plos_results[stats::complete.cases(plos_results), ] # remove incomplete cases (NAs)
 
-    plos_results <- list()
-    # assign dataset identifier to each result
-    for (i in 1:length(results)){
-        if (results[[i]]$meta$numFound == 0 | is.null(results[[i]])){
-            plos_results[[i]] <- data.frame(id = NA,
-                                            dataset_id = identifiers[i],
-                                            title = NA)
-        }
-        else if (results[[i]]$meta$numFound > 0){
-            plos_results[[i]] <- results[[i]]$data
-            plos_results[[i]]$dataset_id <- identifiers[i]
-        }
+  return(plos_results)
+}
 
+#' A Modified Version of rplos::searchplos
+#'
+#' This function is adapted from the searchplos in the `rplos` package, which is no longer maintained.
+#'
+#' @param q Search terms, eg: field:query
+#' @param fl Fields to return
+#' @param fq Fields to filter query on
+#' @param sort Sort results according to field
+#' @param start Record to start at for pagination
+#' @param limit Number of results to return for pagination
+#' @param sleep Seconds to wait between requests
+#' @param errors One of simple or complete
+#' @param proxy List of args for proxy connection
+#' @param callopts Optional curl options
+#' @param progress Optional logic for progress bar
+#' @param ... Addtl Solr arguments
+searchplos <- function(q = NULL, fl = "id", fq = NULL, sort = NULL, start = 0,
+                       limit = 10, sleep = 6, errors = "simple", proxy = NULL, callopts = list(),
+                       progress = NULL, ...) {
+  # Make sure limit is a numeric or integer
+  limit <- tryCatch(as.numeric(as.character(limit)), warning = function(e) e)
+  if ("warning" %in% class(limit)) {
+    stop("limit should be a numeric or integer class value", call. = FALSE)
+  }
+  if (!inherits(limit, "numeric") | is.na(limit)) {
+    stop("limit should be a numeric or integer class value", call. = FALSE)
+  }
+
+  if (is.null(limit)) limit <- 999
+  if (limit == 0) fl <- NULL
+  fl <- paste(fl, collapse = ",")
+
+  args <- list()
+  if (!is.null(fq[[1]])) {
+    if (length(fq) == 1) {
+      args$fq <- fq
+    } else {
+      args <- fq
+      names(args) <- rep("fq", length(args))
     }
+  }
+  args <- c(args, ploscompact(list(
+    q = q, fl = fl, start = as.integer(start),
+    rows = as.integer(limit), sort = sort, wt = "json"
+  )))
+
+  conn_plos <- solrium::SolrClient$new(host = "api.plos.org", path = "search", port = NULL)
 
-    # bind resulting tibbles
-    plos_results <- do.call(rbind, plos_results)
-    names(plos_results)[which(names(plos_results) == "id")] <- "article_id"
-    names(plos_results)[which(names(plos_results) == "title")] <- "article_title"
-    plos_results <- plos_results[complete.cases(plos_results), ]
+  getnum_tmp <- suppressMessages(
+    conn_plos$search(params = list(q = q, fl = fl, rows = 0, wt = "json"))
+  )
+  getnumrecords <- attr(getnum_tmp, "numFound")
 
-    return(plos_results)
+  if (getnumrecords > limit) {
+    getnumrecords <- limit
+  } else {
+    getnumrecords <- getnumrecords
+  }
+
+  if (min(getnumrecords, limit) < 1000) {
+    if (!is.null(limit)) args$rows <- limit
+    if (length(args) == 0) args <- NULL
+    jsonout <- suppressMessages(
+      conn_plos$search(
+        params = args, callopts = callopts,
+        minOptimizedRows = FALSE, progress = progress, ...
+      )
+    )
+    meta <- dplyr::tibble(
+      numFound = attr(jsonout, "numFound"),
+      start = attr(jsonout, "start")
+    )
+    return(list(meta = meta, data = jsonout))
+  } else {
+    byby <- 500
+    getvecs <- seq(from = 0, to = getnumrecords - 1, by = byby)
+    lastnum <- as.numeric(strextract(getnumrecords, "[0-9]{3}$"))
+    if (lastnum == 0) {
+      lastnum <- byby
+    }
+    if (lastnum > byby) {
+      lastnum <- getnumrecords - getvecs[length(getvecs)]
+    } else {
+      lastnum <- lastnum
+    }
+    getrows <- c(rep(byby, length(getvecs) - 1), lastnum)
+    out <- list()
+    for (i in seq_along(getvecs)) {
+      args$start <- as.integer(getvecs[i])
+      args$rows <- as.integer(getrows[i])
+      if (length(args) == 0) args <- NULL
+      jsonout <- suppressMessages(conn_plos$search(
+        params = ploscompact(list(
+          q = args$q, fl = args$fl,
+          fq = args[names(args) == "fq"],
+          sort = args$sort,
+          rows = as.integer(args$rows), start = as.integer(args$start),
+          wt = "json"
+        )), minOptimizedRows = FALSE, callopts = callopts,
+        progress = progress, ...
+      ))
+      out[[i]] <- jsonout
+    }
+    resdf <- dplyr::bind_rows(out)
+    meta <- dplyr::tibble(
+      numFound = attr(jsonout, "numFound"),
+      start = attr(jsonout, "start")
+    )
+    return(list(meta = meta, data = resdf))
+  }
+}
+#' This function is from the `rplos` package, which is no longer maintained.
+#' @param l a list
+ploscompact <- function(l) Filter(Negate(is.null), l)
+
+#' This function is from the `rplos` package, which is no longer maintained.
+#'
+#' @param str A string
+#' @param pattern A regex pattern
+strextract <- function(str, pattern) {
+  regmatches(str, regexpr(pattern, str))
 }