Skip to content

Commit 3865d7d

Browse files
committed
Bug fixes and addition of geneAnnotationSearch Function
1 parent 5f7a448 commit 3865d7d

27 files changed

+803
-249
lines changed

DESCRIPTION

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@ Description:
1717
Automate the detection of gaps and elevations in mapped sequencing
1818
read coverage using a 2D pattern-matching algorithm. 'ProActive' detects, characterizes
1919
and visualizes read coverage patterns in both genomes and metagenomes. Optionally,
20-
users may provide gene predictions associated with their genome or metagenome
20+
users may provide gene annotations associated with their genome or metagenome
2121
in the form of a .gff file. In this case, 'ProActive' will generate an additional
22-
output table containing the gene predictions found within the detected regions of gapped
23-
and elevated read coverage.
22+
output table containing the gene annotations found within the detected regions of gapped
23+
and elevated read coverage. Additionally, users can search for gene
24+
annotations of interest in the output read coverage plots.
2425
License: GPL-2
2526
Encoding: UTF-8
2627
LazyData: true

NAMESPACE

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
# Generated by roxygen2: do not edit by hand
22

3-
export(ProActive)
3+
export(ProActiveDetect)
4+
export(geneAnnotationSearch)
45
export(plotProActiveResults)
56
import(ggplot2)
7+
importFrom(dplyr,"%>%")
68
importFrom(dplyr,bind_rows)
79
importFrom(stats,median)
810
importFrom(stats,na.omit)
911
importFrom(stringr,regex)
1012
importFrom(stringr,str_detect)
1113
importFrom(stringr,str_extract)
1214
importFrom(stringr,str_extract_all)
15+
importFrom(stringr,str_which)
1316
importFrom(utils,capture.output)
1417
importFrom(utils,write.table)

NEWS.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1-
# ProActive 0.0.1
1+
## Changes in 0.1.2
2+
+ Changed name of main function from `ProActive()` to `ProActiveDetect()`
3+
+ Added functionality for searching read coverage plots for gene annotations of
4+
interest with the geneAnnotationSearch() function
5+
+ Fixed bug with pattern-match start and stop positions on genome/contig chunks
26

3-
* Initial CRAN submission.
7+
## Changes in 0.0.2
8+
+ Changes for CRAN re-submission after review
9+
10+
## Changes in 0.0.1
11+
12+
+ Initial CRAN submission.

R/GPsInElevGaps.R

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@
1313
#' @param chunkContigs TRUE or FALSE, If TRUE and `mode`="metagenome", contigs longer
1414
#' than the `chunkSize` will be 'chunked' into smaller subsets and pattern-matching
1515
#' will be performed on each subset. Default is FALSE.
16+
#' @param chunkSize If `mode`="genome" OR if `mode`="metagenome" and `chunkContigs`=TRUE,
17+
#' chunk the genome or contigs, respectively, into smaller subsets for pattern-matching.
1618
#' @importFrom stringr str_detect str_extract_all regex
1719
#' @importFrom dplyr bind_rows
1820
#' @keywords internal
19-
GPsInElevGaps <- function(elevGapSummList, windowSize, gffTSV, mode, chunkContigs) {
21+
GPsInElevGaps <- function(elevGapSummList, windowSize, gffTSV, mode, chunkContigs, chunkSize) {
2022
colnames(gffTSV) <- c("seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes")
2123
if (TRUE %in% (str_detect(gffTSV[,9], regex('product', ignore_case = T)))){
2224
product <- str_extract_all(gffTSV [,9], regex("(?<=product=)[\\s\\S]*",ignore_case = T))
@@ -26,13 +28,17 @@ GPsInElevGaps <- function(elevGapSummList, windowSize, gffTSV, mode, chunkContig
2628
trueRefName <- elevGapSummList[[i]][[8]]
2729
if(mode == "metagenome"){
2830
refName <- elevGapSummList[[i]][[8]]
29-
if(chunkContigs==TRUE){
31+
if(chunkContigs == TRUE){
3032
refName <- gsub("_chunk_.*", "", refName)
3133
}
32-
gffTSV <- gffTSV[which(gffTSV[, 1] == refName), ]
34+
gffTSV <- gffTSV[which(gffTSV[, 1] == refName), ]
35+
startPos <- elevGapSummList[[i]][[4]] * windowSize
36+
endPos <- elevGapSummList[[i]][[5]] * windowSize
37+
} else {
38+
chunkNumber <- as.numeric(str_extract(trueRefName, "(?<=\\_)\\d+$")) - 1
39+
startPos <- (elevGapSummList[[i]][[4]] * windowSize) + (chunkNumber * chunkSize)
40+
endPos <- (elevGapSummList[[i]][[5]] * windowSize) + (chunkNumber * chunkSize)
3341
}
34-
startPos <- elevGapSummList[[i]][[4]] * windowSize
35-
endPos <- elevGapSummList[[i]][[5]] * windowSize
3642
matchRegion <- seq(startPos, endPos, 1)
3743
GPs <- gffTSV[which(gffTSV[, 5] %in% matchRegion), ]
3844
GPs$seqid <- rep(trueRefName, nrow(GPs))

R/ProActive-package.R

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
#' @title \bold{ProActive}
22
#'
3-
#' @description Automatic detection of gaps and elevations in mapped sequencing reads using read coverage
4-
#' pattern-matching. Pattern-matching results are summarized into a dataframe and users can visualize the
5-
#' pattern-matches and associated read coverage patterns. If a .gff file is supplied, ProActive will output
6-
#' the ORFs found within the detected regions of elevated and/or gapped read coverage. ProActive works with both
7-
#' metagenomes and genomes.
3+
#' @description `ProActive` automatically detects regions of gapped and elevated read coverage
4+
#' using a 2D pattern-matching algorithm. `ProActive` detects, characterizes and
5+
#' visualizes read coverage patterns in both genomes and metagenomes. Optionally,
6+
#' users may provide gene annotations associated with their genome or metagenome
7+
#' in the form of a .gff file. In this case, `ProActive` will generate an additional
8+
#' output table containing the gene annotations found within the detected regions of
9+
#' gapped and elevated read coverage. Additionally, users can search for gene
10+
#' annotations of interest in the output read coverage plots.
811
#'
9-
#' @details The two main functions in ProActive are:
12+
#' @details The three main functions in `ProActive` are:
1013
#' \enumerate{
11-
#' \item \code{\link{ProActive}} performs the pattern-matching and characterization of read coverage patterns.
14+
#' \item \code{\link{ProActiveDetect}} performs the pattern-matching and characterization of read coverage patterns.
1215
#' \item \code{\link{plotProActiveResults}} plots the results from
13-
#' \code{ProActive()}
16+
#' \code{ProActiveDetect()}
17+
#' \item \code{\link{geneAnnotationSearch}} searches classified contigs/chunks for gene annotations that match
18+
#' user-provided keywords.
1419
#' }
1520
#'
1621
#' @author Jessie Maier \email{[email protected]}

R/ProActive.R renamed to R/ProActiveDetect.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,12 @@
4040
#' @return A list containing 6 objects described in the function description.
4141
#' @export
4242
#' @examples
43-
#' metagenome_results <- ProActive(
43+
#' metagenome_results <- ProActiveDetect(
4444
#' pileup = sampleMetagenomePileup,
4545
#' mode = "metagenome",
4646
#' gffTSV = sampleMetagenomegffTSV
4747
#' )
48-
ProActive <- function(pileup, mode, gffTSV, windowSize = 1000, chunkContigs = FALSE,
48+
ProActiveDetect <- function(pileup, mode, gffTSV, windowSize = 1000, chunkContigs = FALSE,
4949
minSize = 10000, maxSize = Inf, minContigLength = 30000,
5050
chunkSize = 100000, IncludeNoPatterns = FALSE, verbose = TRUE,
5151
saveFilesTo) {
@@ -83,11 +83,11 @@ ProActive <- function(pileup, mode, gffTSV, windowSize = 1000, chunkContigs = FA
8383
}
8484
filteredOutContigsDf <- patternMatchSummary[[2]]
8585
if(verbose){message("Summarizing pattern-matching results")}
86-
summaryTable <- classifSumm(pileup, patternMatchSummary[[1]], windowSize, mode)
86+
summaryTable <- classifSumm(pileup, patternMatchSummary[[1]], windowSize, mode, chunkSize)
8787
if (missing(gffTSV) == FALSE) {
8888
if(verbose){message("Finding gene predictions in elevated or gapped regions of read coverage...")}
8989
elevGapSummList <- removeNoPatterns(patternMatchSummary[[1]])
90-
GPSummTable <- GPsInElevGaps(elevGapSummList, windowSize, gffTSV, mode, chunkContigs)
90+
GPSummTable <- GPsInElevGaps(elevGapSummList, windowSize, gffTSV, mode, chunkContigs, chunkSize)
9191
}
9292
if(verbose){message("Finalizing output")}
9393
endTime <- Sys.time()

R/bestMatchListFunctions.R

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,29 @@
88
#' 100 bp windows/bins.
99
#' @param windowSize The number of basepairs to average read coverage values over.
1010
#' Options are 100, 200, 500, 1000 ONLY. Default is 1000.
11+
#' @param chunkSize If `mode`="genome" OR if `mode`="metagenome" and `chunkContigs`=TRUE,
12+
#' chunk the genome or contigs, respectively, into smaller subsets for pattern-matching.
1113
#' @param mode Either "genome" or "metagenome"
1214
#' @keywords internal
13-
classifSumm <- function(pileup, bestMatchList, windowSize, mode) {
15+
classifSumm <- function(pileup, bestMatchList, windowSize, mode, chunkSize) {
1416
if (length(bestMatchList) == 0) {
1517
stop("No pattern-matches detected")
1618
}
1719
refName <- vapply(seq_along(bestMatchList), function(i) {bestMatchList[[i]][[8]]}, character(1))
1820
elevRatio <- vapply(seq_along(bestMatchList), function(i) {bestMatchList[[i]][[6]]}, numeric(1))
19-
startPos <- vapply(seq_along(bestMatchList), function(i) {bestMatchList[[i]][[4]]} * windowSize, numeric(1))
20-
endPos <- vapply(seq_along(bestMatchList), function(i) {bestMatchList[[i]][[5]]} * windowSize, numeric(1))
21+
startEndPos <- lapply(seq_along(bestMatchList), function(i) {
22+
if(mode == "genome"){
23+
refName <- bestMatchList[[i]][[8]]
24+
chunkNumber <- as.numeric(str_extract(refName, "(?<=\\_)\\d+$")) - 1
25+
startPos <- (bestMatchList[[i]][[4]] * windowSize) + (chunkNumber * chunkSize)
26+
endPos <- (bestMatchList[[i]][[5]] * windowSize) + (chunkNumber * chunkSize)
27+
} else {
28+
startPos <- bestMatchList[[i]][[4]] * windowSize
29+
endPos <- bestMatchList[[i]][[5]] * windowSize
30+
}
31+
list(startPos, endPos)})
32+
startPos <- vapply(seq_along(startEndPos), function(i){startEndPos[[i]][[1]]}, numeric(1))
33+
endPos <- vapply(seq_along(startEndPos), function(i){startEndPos[[i]][[2]]}, numeric(1))
2134
classification <- vapply(seq_along(bestMatchList), function(i) {bestMatchList[[i]][[7]]}, character(1))
2235
matchSize <- vapply(seq_along(bestMatchList), function(i) {
2336
pileupSubset <- pileup[which(pileup[, 1] == bestMatchList[[i]][[8]]), ]

R/chunkingFunctions.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,14 @@ contigChunks <- function(pileup, chunkSize) {
3939
refNameChunk[c(remainIdx:nrow(pileupSubset))] <- rep(paste0(refName, "_chunk_", Z + 1), nrow(contigChunk))
4040
}
4141
chunkedPileupSubset <- cbind.data.frame(refNameChunk, pileupSubset$coverage, pileupSubset$position)
42-
colnames(chunkedPileupSubset) <- c("refName", "coverage", "position")
4342
NAIdx <- which(is.na(chunkedPileup[, 1]))[[1]]
4443
chunkedPileup[c(NAIdx:(NAIdx + nrow(pileupSubset) - 1)), ] <- chunkedPileupSubset
4544
} else {
4645
NAIdx <- which(is.na(chunkedPileup[, 1]))[[1]]
4746
chunkedPileup[c(NAIdx:(NAIdx + nrow(pileupSubset) - 1)), ] <- pileupSubset
4847
}
4948
}
49+
colnames(chunkedPileup) <- c("refName", "coverage", "position")
5050
return(chunkedPileup)
5151
}
5252

R/geneAnnotationPlot.R

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#' Gene annotation plot
2+
#'
3+
#' Plot read coverage and location of gene annotations that match the keywords and
4+
#' search criteria for contig/chunk currently being assessed
5+
#'
6+
#' @param geneAnnotSubset Subset of gene annotations to be plotted
7+
#' @param keywords The key-word(s) used for the search.
8+
#' @param pileupSubset A subset of the pileup associated with the contig/chunk being assessed
9+
#' @param colIdx The column index 'gene' or 'product' column
10+
#' @param startbpRange The basepair at which the search is started if a 'specific' search is used
11+
#' @param endbpRange The basepair at which the search is ended if a 'specific' search is used
12+
#' @param elevRatio The maximum/minimum values of the pattern-match
13+
#' @param pattern The pattern-match information associated with the contig/chunk being assessed
14+
#' @param windowSize The number of basepairs to average read coverage values over.
15+
#' @keywords internal
16+
#' @importFrom stringr str_which
17+
geneAnnotationPlot <- function(geneAnnotSubset, keywords, pileupSubset,
18+
colIdx, startbpRange, endbpRange, elevRatio,
19+
pattern, windowSize, chunkSize, mode){
20+
position <- coverage <- start <- NULL
21+
classification <- pattern[[7]]
22+
refName <- pileupSubset[1, 1]
23+
if(mode == "genome"){
24+
chunkNumber <- as.numeric(str_extract(refName, "(?<=\\_)\\d+$")) - 1
25+
startPos <- (pattern[[4]] * windowSize) + (chunkNumber * chunkSize)
26+
endPos <- (pattern[[5]] * windowSize) + (chunkNumber * chunkSize)
27+
} else {
28+
startPos <- pattern[[4]] * windowSize
29+
endPos <- pattern[[5]] * windowSize
30+
}
31+
matchIdxs <- str_which(geneAnnotSubset[, colIdx], regex(paste(keywords, collapse = "|"), ignore_case = TRUE))
32+
geneAnnotMatches <- geneAnnotSubset[matchIdxs,]
33+
geneStartPos <- geneAnnotMatches$start
34+
geneAnnotLabels <- paste0("#", c(1:nrow(geneAnnotMatches)), ": ", geneAnnotMatches[, colIdx],
35+
sep = " ", collapse = " \n ")
36+
plot <- ggplot(data = pileupSubset, aes(x = position, y = coverage)) +
37+
geom_area(fill = "#009E73") +
38+
geom_vline(xintercept = geneStartPos, linewidth = 1) +
39+
geom_vline(xintercept = c(startPos, endPos), color = "#D55E00", linewidth = 1) +
40+
geom_vline(xintercept = c(startbpRange, endbpRange), color = "#D55E00", linewidth = 1, linetype = "dotted") +
41+
geom_label(data = geneAnnotMatches, aes(x = start, y = (max(pileupSubset$coverage) / 2),
42+
label = paste0("#", c(1 : nrow(geneAnnotMatches)))),
43+
size = 2.75) +
44+
scale_x_continuous(expand = c(0, 0)) +
45+
theme(panel.grid.major = element_blank(),
46+
panel.grid.minor = element_blank(),
47+
panel.background = element_blank(),
48+
axis.line = element_line(colour = "black"),
49+
text = element_text(size = 15),
50+
plot.margin = margin(
51+
t = 0,
52+
r = 10,
53+
b = 0,
54+
l = 2
55+
)) +
56+
labs(title = paste(refName, classification),
57+
subtitle = paste("elevation ratio:", round(elevRatio, digits = 4)),
58+
x = "Basepair position",
59+
caption = geneAnnotLabels,
60+
y = "Read coverage")
61+
return(plot)
62+
}

0 commit comments

Comments
 (0)