Skip to content

Commit

Permalink
Merge pull request #40 from DataONEorg/develop
Browse files Browse the repository at this point in the history
Release 1.0.0
  • Loading branch information
jeanetteclark authored Sep 10, 2024
2 parents 9a9ae0e + 5d5e063 commit 7389c94
Show file tree
Hide file tree
Showing 36 changed files with 898 additions and 293 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@
^CONTRIBUTING\.md$
^api-scopus-search.sh$
^\.github$
^logo$
^data$
^results$
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2
- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
- name: Install dependencies
run: |
install.packages(c("remotes", "rcmdcheck"))
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
/results

key.txt

data-raw
# Ignore Vim's swap files
.*.swp
22 changes: 13 additions & 9 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,25 +1,29 @@
Package: scythe
Title: Harvest and register data package citations
Version: 0.9.1
Version: 1.0.0
Authors@R: c(
person("Jeanette", "Clark", role = c("aut", "cre"), email = "jeanetteclark@nceas.ucsb.edu", comment=c(ORCID = "0000-0003-4703-1974")),
person("Jeanette", "Clark", role = c("aut", "cre"), email = "jclark@nceas.ucsb.edu", comment=c(ORCID = "0000-0003-4703-1974")),
person("Matthew B.", "Jones", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0003-0077-4738")),
person("Maya", "Samet", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0002-5248-9712"))
person("Maya", "Samet", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0002-5248-9712")),
person("Althea", "Marks", role = "aut", email = "[email protected]", comment=c(ORCID = "0000-0002-9370-9128"))
)
Description: Harvests data package citations from several API sources, including PLOS, Scopus, and Springer.
Description: Harvests data package citations from several API sources, including PLOS, Scopus, and Springer. This package uses modified functions from `rplos`, which is no longer maintained.
License: Apache License (>= 2.0)
Encoding: UTF-8
LazyData: true
Imports:
bib2df,
curl,
dplyr,
jsonlite,
keyring,
rcrossref,
rplos
Suggests:
solrium,
stats
Remotes: ropensci/bib2df@a8e96e13f5
Suggests:
bib2df,
covr,
purrr,
testthat (>= 2.1.0)
RoxygenNote: 7.1.1
testthat (>= 3.0.0)
RoxygenNote: 7.3.1
Config/testthat/edition: 3
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ export(citation_search)
export(citation_search_plos)
export(citation_search_scopus)
export(citation_search_springer)
export(citation_search_xdd)
export(scythe_get_key)
export(scythe_set_key)
export(write_citation_pairs)
import(dplyr)
importFrom(curl,curl)
importFrom(jsonlite,fromJSON)
importFrom(rplos,searchplos)
importFrom(stats,complete.cases)
51 changes: 27 additions & 24 deletions R/citation_search.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Search for citations in text across all APIs
#'
#' @param identifiers a vector of identifiers to be searched for
#' @param sources a vector indicating which sources to query (one or more of plos, scopus, springer)
#' @param sources a vector indicating which sources to query (one or
#' more of plos, scopus, springer)
#' @return tibble of matching dataset and publication identifiers
#'
#' @export
Expand All @@ -11,42 +12,44 @@
#' result <- citation_search(identifiers, sources = c("plos"))
#' }
citation_search <- function(identifiers,
sources = c("plos", "scopus", "springer")) {

if(!("character" %in% class(identifiers))){
stop("Identifiers must be a character vector.")
}

# run the 'citation_search_*' function for each source
for (source in sources) {
search_function <- paste0(source, " <- citation_search_", source, "(identifiers)")
eval(parse(text = search_function))
sources = c("plos", "scopus", "springer", "xdd")) {
stopifnot(is.character(identifiers))

unid <- sources[!(sources %in% c("plos", "scopus", "springer", "xdd"))]
if (length(unid) > 0) {
stop(paste("Source", unid, "is not recognized.", collapse = ". "))
}

# combine all of the resulting data frames and return the result df
bind_function <- paste0("rbind(", paste0(sources, collapse = ","), ")")
result <- eval(parse(text = bind_function))
search_funs <- sapply(sources, function(source) {
get(paste0("citation_search_", source), mode = "function")
})

return(result)
# Run each search, producing a list of dataframes
result_df_list <- lapply(search_funs, function(search_fun) {
search_fun(identifiers)
})

# Combine the resulting data frames and return the result df
result <- dplyr::bind_rows(result_df_list)

return(result)
}

# Check identifiers to remove characters that interfere with query strings

check_identifiers <- function(identifiers){
if (any(!grepl("10\\.|urn:uuid", identifiers))){
warning(call. = FALSE,
"One or more identifiers does not appear to be a DOI or uuid",
immediate. = TRUE)
check_identifiers <- function(identifiers) {
if (any(!grepl("10\\.|urn:uuid", identifiers))) {
warning(
call. = FALSE,
"One or more identifiers does not appear to be a DOI or uuid",
immediate. = TRUE
)
}

if (any(grepl("doi:|urn:uuid", identifiers))){
if (any(grepl("doi:|urn:uuid", identifiers))) {
identifiers <- gsub("(doi:)|(urn:uuid:)", "", identifiers)
message("Identifier prefix (doi: or urn:uuid) has been stripped out of the search term.")
}


return(identifiers)
}


195 changes: 162 additions & 33 deletions R/citation_search_plos.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,186 @@
#'
#' This function searches for citations in PLOS. Requests are throttled
#' at one identifier every 6 seconds so as to not overload the PLOS
#' API.
#' API. This function uses modified source code from the `rplos` package,
#' which is no longer maintained.
#'
#' @param identifiers a vector of identifiers to be searched for
#'
#' @return tibble of matching dataset and publication identifiers
#' @export
#' @importFrom rplos searchplos
#' @examples
#' \dontrun{
#' identifiers <- c("10.18739/A22274", "10.18739/A2D08X", "10.5063/F1T151VR")
#' result <- citation_search_plos(identifiers)
#' }
citation_search_plos <- function(identifiers) {
if (length(identifiers) > 1){
message(paste0("Your result will take ~", length(identifiers)*6 ," seconds to return,
since this function is rate limited to one call every 6 seconds."))
}
wait_seconds <- 6
report_est_wait(length(identifiers), wait_seconds)

identifiers <- check_identifiers(identifiers)
identifiers <- check_identifiers(identifiers)

# search for identifier
results <- lapply(identifiers, function(x){
Sys.sleep(6)
v <- rplos::searchplos(q = x,
fl = c("id","title"),
limit = 1000)
return(v)
# encode colons to not break PLOS API
identifiers <- gsub(":", "%3A", identifiers)

}
# search for identifier
results <- lapply(identifiers, function(x) {
Sys.sleep(wait_seconds)
v <- searchplos(
q = x,
fl = c("id", "title"),
limit = 1000
)
return(v)
})

plos_results <- list()
# assign dataset identifier to each result
for (i in 1:length(results)) {
if (results[[i]]$meta$numFound == 0 | is.null(results[[i]])) {
plos_results[[i]] <- data.frame(
id = NA,
dataset_id = identifiers[i],
title = NA,
source = "plos"
)
} else if (results[[i]]$meta$numFound > 0) {
plos_results[[i]] <- results[[i]]$data
plos_results[[i]]$dataset_id <- identifiers[i]
plos_results[[i]]$source <- "plos"
}
}

# bind resulting tibbles
plos_results <- do.call(rbind, plos_results)
names(plos_results)[which(names(plos_results) == "id")] <-
"article_id"
names(plos_results)[which(names(plos_results) == "title")] <-
"article_title"
plos_results <-
plos_results[stats::complete.cases(plos_results), ] # remove incomplete cases (NAs)

plos_results <- list()
# assign dataset identifier to each result
for (i in 1:length(results)){
if (results[[i]]$meta$numFound == 0 | is.null(results[[i]])){
plos_results[[i]] <- data.frame(id = NA,
dataset_id = identifiers[i],
title = NA)
}
else if (results[[i]]$meta$numFound > 0){
plos_results[[i]] <- results[[i]]$data
plos_results[[i]]$dataset_id <- identifiers[i]
}
return(plos_results)
}

#' A Modified Version of rplos::searchplos
#'
#' This function is adapted from the searchplos in the `rplos` package, which is no longer maintained.
#'
#' @param q Search terms, eg: field:query
#' @param fl Fields to return
#' @param fq Fields to filter query on
#' @param sort Sort results according to field
#' @param start Record to start at for pagination
#' @param limit Number of results to return for pagination
#' @param sleep Seconds to wait between requests
#' @param errors One of simple or complete
#' @param proxy List of args for proxy connection
#' @param callopts Optional curl options
#' @param progress Optional logic for progress bar
#' @param ... Addtl Solr arguments
searchplos <- function(q = NULL, fl = "id", fq = NULL, sort = NULL, start = 0,
limit = 10, sleep = 6, errors = "simple", proxy = NULL, callopts = list(),
progress = NULL, ...) {
# Make sure limit is a numeric or integer
limit <- tryCatch(as.numeric(as.character(limit)), warning = function(e) e)
if ("warning" %in% class(limit)) {
stop("limit should be a numeric or integer class value", call. = FALSE)
}
if (!inherits(limit, "numeric") | is.na(limit)) {
stop("limit should be a numeric or integer class value", call. = FALSE)
}

if (is.null(limit)) limit <- 999
if (limit == 0) fl <- NULL
fl <- paste(fl, collapse = ",")

args <- list()
if (!is.null(fq[[1]])) {
if (length(fq) == 1) {
args$fq <- fq
} else {
args <- fq
names(args) <- rep("fq", length(args))
}
}
args <- c(args, ploscompact(list(
q = q, fl = fl, start = as.integer(start),
rows = as.integer(limit), sort = sort, wt = "json"
)))

conn_plos <- solrium::SolrClient$new(host = "api.plos.org", path = "search", port = NULL)

# bind resulting tibbles
plos_results <- do.call(rbind, plos_results)
names(plos_results)[which(names(plos_results) == "id")] <- "article_id"
names(plos_results)[which(names(plos_results) == "title")] <- "article_title"
plos_results <- plos_results[complete.cases(plos_results), ]
getnum_tmp <- suppressMessages(
conn_plos$search(params = list(q = q, fl = fl, rows = 0, wt = "json"))
)
getnumrecords <- attr(getnum_tmp, "numFound")

return(plos_results)
if (getnumrecords > limit) {
getnumrecords <- limit
} else {
getnumrecords <- getnumrecords
}

if (min(getnumrecords, limit) < 1000) {
if (!is.null(limit)) args$rows <- limit
if (length(args) == 0) args <- NULL
jsonout <- suppressMessages(
conn_plos$search(
params = args, callopts = callopts,
minOptimizedRows = FALSE, progress = progress, ...
)
)
meta <- dplyr::tibble(
numFound = attr(jsonout, "numFound"),
start = attr(jsonout, "start")
)
return(list(meta = meta, data = jsonout))
} else {
byby <- 500
getvecs <- seq(from = 0, to = getnumrecords - 1, by = byby)
lastnum <- as.numeric(strextract(getnumrecords, "[0-9]{3}$"))
if (lastnum == 0) {
lastnum <- byby
}
if (lastnum > byby) {
lastnum <- getnumrecords - getvecs[length(getvecs)]
} else {
lastnum <- lastnum
}
getrows <- c(rep(byby, length(getvecs) - 1), lastnum)
out <- list()
for (i in seq_along(getvecs)) {
args$start <- as.integer(getvecs[i])
args$rows <- as.integer(getrows[i])
if (length(args) == 0) args <- NULL
jsonout <- suppressMessages(conn_plos$search(
params = ploscompact(list(
q = args$q, fl = args$fl,
fq = args[names(args) == "fq"],
sort = args$sort,
rows = as.integer(args$rows), start = as.integer(args$start),
wt = "json"
)), minOptimizedRows = FALSE, callopts = callopts,
progress = progress, ...
))
out[[i]] <- jsonout
}
resdf <- dplyr::bind_rows(out)
meta <- dplyr::tibble(
numFound = attr(jsonout, "numFound"),
start = attr(jsonout, "start")
)
return(list(meta = meta, data = resdf))
}
}
#' This function is from the `rplos` package, which is no longer maintained.
#' @param l a list
ploscompact <- function(l) Filter(Negate(is.null), l)

#' This function is from the `rplos` package, which is no longer maintained.
#'
#' @param str A string
#' @param pattern A regex pattern
strextract <- function(str, pattern) {
regmatches(str, regexpr(pattern, str))
}
Loading

0 comments on commit 7389c94

Please sign in to comment.