The goal of pipelineTools is to streamline the NGS analysis pipelines and result reporting within RStudio. PipelineTools provides packages to run standard open source NGS tools
Can be installed using devtools directly from GitHub using the following commands.
# Install devtools
install.packages("devtools")
library("devtools")
#Install package from GitHub
install_github("GrahamHamilton/pipelineTools")
Load the required libraries
library("pipelineTools")
library("biomaRt")
library("tximport")
Set the paths to the software installed on the system
fastq.screen.path <- "/software/fastq_screen_v0.13.0/fastq_screen"
fastp.path <- "/software/bin/fastp"
kallisto.path <- "/software/kallisto_v0.45.1/kallisto"
hisat.path <- "/software/hisat_v2-2.1.0/hisat2"
multiqc.path <- "/usr/local/bin/multiqc"
samtools.path <- "/software/samtools_v1.9/samtools"
stringtie.path <- "/software/stringtie-1.3.6/stringtie"
Version numbers for the software used
fastqscreen.version <- run_fastq_screen(fastq_screen = fastq.screen.path, version = TRUE)
fastp.version <- run_fastp(fastp = fastp.path, version = TRUE)
kallisto.version <- run_kallisto(kallisto = kallisto.path, version = TRUE)
hisat.version <- run_hisat2(hisat2 = hisat.path, version = TRUE)
multiqc.version <- run_multiqc(multiqc = multiqc.path, version = TRUE)
samtools.version <- run_samtools(samtools = samtools.path, version = TRUE)
stringtie.version <- run_stringtie(stringtie = stringtie.path, version = TRUE)
r.version <- getRversion()
pipelineTools.version <- packageDescription("pipelineTools")$Version
deseq2.version <- packageDescription("DESeq2")$Version
Create the results directories
# Trimmed reads directory
trimmed.reads.dir <- "trimmed_reads"
#Create the directory for the trimmed reads
dir.create(trimmed.reads.dir, showWarnings = FALSE)
# FastQScreen results directory
fastq.screen.dir <- "Screen"
# Create the directory for the fastq screen results
dir.create(fastq.screen.dir, showWarnings = FALSE)
# FastP results directory
fastp.results.dir <- "FastpQC"
# Create the directory for the FastP results
dir.create(fastp.results.dir, showWarnings = FALSE)
# Kallisto results directory
kalisto.results.dir <- "kallisto"
#Create the directory for the Kallisto results
dir.create(kalisto.results.dir, showWarnings = FALSE)
# Hisat2 alignment results directory
hisat2.alignments.dir <- "hisat2_alignments"
#Create the directory for the HiSat2 results
dir.create(hisat2.alignments.dir, showWarnings = FALSE)
# Stringtie results directory
stringtie.dir <- "stringtie"
#Create the directory for the HiSat2 results
dir.create(stringtie.dir, showWarnings = FALSE)
Read in the fastq file paths to lists and then create sample names lists and trimmed read file paths
reads.path <- "raw_reads"
reads.patt.1 <- "_S\\d{1,2}\\_L001_R1_001.fastq.gz$"
reads.patt.2 <- "_S\\d{1,2}\\_L001_R2_001.fastq.gz$"
sample.dataframe <- prepare_samples(reads.path, c(reads.patt.1,reads.patt.2),trimmed.reads.dir)
mate1 <- as.character(sample.dataframe$reads.path.1)
mate1.trim <- as.character(sample.dataframe$trimmed.reads.path.1)
# For paired end sequence
mate2 <- as.character(sample.dataframe$reads.path.2)
mate2.trim <- as.character(sample.dataframe$trimmed.reads.path.2)
sample.names <- as.character(sample.dataframe$sample.names)
Sequence adapters
adapter1 <- "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter2 <- "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
The full paths to the reference genome/transcriptome indexes and annotation files
# Path to the reference transcriptome
transcriptome <- "/path/to/kallisto/index"
# Path to the reference genome
genome <- "/path/to/hisat2/index"
# Path to the gtf file
gtf <- "/path/to/gtf/file"
Results are stored in the Screen folder
fastq.screen.conf <- "/export/jessie3/gmh5n/PipelineTest/fastq_screen.conf"
fastq.screen.cmds <- run_fastq_screen(fq.files = mate1,
out.dir = fastq.screen.dir,
conf = fastq.screen.conf,
fastq_screen = fastq.screen.path)
write.table(fastq.screen.cmds,"FastqScreen_commands.sh", quote = FALSE, row.names = FALSE, col.names = FALSE)
Adapter and quality trimmed reads are stored in the r trimmed.reads.dir
directory, QC files are stored in the FastpQC folder
fastp.cmds <- run_fastp(mate1 = mate1,
mate2 = mate2,
mate1.out = mate1.out,
mate2.out = mate2.out,
adapter1 = adapter1,
adapter2 = adapter2,
sample.name = sample.names,
out.dir = fastp.results.dir,
fastp = fastp.path)
write.table(fastp.cmds,"FastP_commands.sh", quote = FALSE, row.names = FALSE, col.names = FALSE)
Pseudo align the reads to the reference transcriptome with Kallisto
strandedness <- "second"
# Paired end
kallisto.cmds <- run_kallisto(mate1 = mate1,
mate2 = mate2,
index = transcriptome,
sample.name = sample.names,
strandedness = strandedness,
out.dir = kalisto.results.dir,
kallisto = kallisto.path)
write.table(kallisto.cmds,"Kallisto_commands.sh", quote = FALSE, row.names = FALSE, col.names = FALSE)
mart<-"ensembl"
db<-"mmusculus_gene_ensembl" # Change to organism in study
filt<-"ensembl_gene_id"
# Create the biomaRt object
ensembl = useEnsembl(biomart=mart, dataset=db)
# Get all the transcript ids and corresponding gene ids from BiomaRt
att<-c("ensembl_transcript_id","ensembl_gene_id")
txTable<-getBM(attributes=att,mart=ensembl)
# Set the column names for the transcript to gene table
colnames(txTable)<-c("tx_id","gene_id")
# Read in the experimental design file, tab seperated
sampleTable <- read.csv("SampleDescription.txt", sep="\t", row.names=1)
# Read in the file names form the kallisto results directory
dir <- getwd()
files <- file.path(dir,kalisto.results.dir,row.names(sampleTable),"abundance.h5", fsep = .Platform$file.sep)
names(files)<-row.names(sampleTable)
txi <- tximport(files, type = "kallisto", tx2gene = txTable,ignoreTxVersion = TRUE, ignoreAfterBar = TRUE)