Skip to content

GrahamHamilton/pipelineTools

Repository files navigation

pipelineTools

The goal of pipelineTools is to streamline the NGS analysis pipelines and result reporting within RStudio. PipelineTools provides packages to run standard open source NGS tools

Installation

Can be installed using devtools directly from GitHub using the following commands.

# Install devtools
install.packages("devtools")
library("devtools")
#Install package from GitHub
install_github("GrahamHamilton/pipelineTools")

Example

Load the required libraries

library("pipelineTools")
library("biomaRt")
library("tximport")

Set the paths to the software installed on the system

fastq.screen.path <- "/software/fastq_screen_v0.13.0/fastq_screen"
fastp.path <- "/software/bin/fastp"
kallisto.path <- "/software/kallisto_v0.45.1/kallisto"
hisat.path <- "/software/hisat_v2-2.1.0/hisat2"
multiqc.path <- "/usr/local/bin/multiqc"
samtools.path <- "/software/samtools_v1.9/samtools"
stringtie.path <- "/software/stringtie-1.3.6/stringtie"

Version numbers for the software used

fastqscreen.version <- run_fastq_screen(fastq_screen = fastq.screen.path, version = TRUE)
fastp.version <- run_fastp(fastp = fastp.path, version = TRUE)
kallisto.version <- run_kallisto(kallisto = kallisto.path, version = TRUE)
hisat.version <- run_hisat2(hisat2 = hisat.path, version = TRUE)
multiqc.version <- run_multiqc(multiqc = multiqc.path, version = TRUE)
samtools.version <- run_samtools(samtools = samtools.path, version = TRUE)
stringtie.version <- run_stringtie(stringtie = stringtie.path, version = TRUE)
r.version <- getRversion()
pipelineTools.version <- packageDescription("pipelineTools")$Version
deseq2.version <- packageDescription("DESeq2")$Version

Create the results directories

# Trimmed reads directory
trimmed.reads.dir <- "trimmed_reads"
#Create the directory for the trimmed reads
dir.create(trimmed.reads.dir, showWarnings = FALSE)

# FastQScreen results directory
fastq.screen.dir <- "Screen"
# Create the directory for the fastq screen results
dir.create(fastq.screen.dir, showWarnings = FALSE)

# FastP results directory
fastp.results.dir <- "FastpQC"
# Create the directory for the FastP results
dir.create(fastp.results.dir, showWarnings = FALSE)

# Kallisto results directory
kalisto.results.dir <- "kallisto"
#Create the directory for the Kallisto results
dir.create(kalisto.results.dir, showWarnings = FALSE)

# Hisat2 alignment results directory
hisat2.alignments.dir <- "hisat2_alignments"
#Create the directory for the HiSat2 results
dir.create(hisat2.alignments.dir, showWarnings = FALSE)

# Stringtie results directory
stringtie.dir <- "stringtie"
#Create the directory for the HiSat2 results
dir.create(stringtie.dir, showWarnings = FALSE)

Read in the fastq file paths to lists and then create sample names lists and trimmed read file paths

reads.path <- "raw_reads"

reads.patt.1 <- "_S\\d{1,2}\\_L001_R1_001.fastq.gz$"
reads.patt.2 <- "_S\\d{1,2}\\_L001_R2_001.fastq.gz$"

sample.dataframe <- prepare_samples(reads.path, c(reads.patt.1,reads.patt.2),trimmed.reads.dir)

mate1 <- as.character(sample.dataframe$reads.path.1)
mate1.trim <- as.character(sample.dataframe$trimmed.reads.path.1)
# For paired end sequence
mate2 <- as.character(sample.dataframe$reads.path.2)
mate2.trim <- as.character(sample.dataframe$trimmed.reads.path.2)


sample.names <- as.character(sample.dataframe$sample.names)

Sequence adapters

adapter1 <- "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
adapter2 <- "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

The full paths to the reference genome/transcriptome indexes and annotation files

# Path to the reference transcriptome
transcriptome <- "/path/to/kallisto/index"

# Path to the reference genome
genome <- "/path/to/hisat2/index"

# Path to the gtf file
gtf <- "/path/to/gtf/file"

Fastq Screen

Results are stored in the Screen folder

fastq.screen.conf <- "/export/jessie3/gmh5n/PipelineTest/fastq_screen.conf"
fastq.screen.cmds <- run_fastq_screen(fq.files = mate1,
                                      out.dir = fastq.screen.dir,
                                      conf = fastq.screen.conf,
                                      fastq_screen = fastq.screen.path)
write.table(fastq.screen.cmds,"FastqScreen_commands.sh", quote = FALSE, row.names = FALSE, col.names = FALSE)

FastP

Adapter and quality trimmed reads are stored in the r trimmed.reads.dir directory, QC files are stored in the FastpQC folder

fastp.cmds <- run_fastp(mate1 = mate1,
                        mate2 = mate2,
                        mate1.out = mate1.out,
                        mate2.out = mate2.out,
                        adapter1 = adapter1,
                        adapter2 = adapter2,
                        sample.name =  sample.names,
                        out.dir = fastp.results.dir,
                        fastp = fastp.path)

write.table(fastp.cmds,"FastP_commands.sh", quote = FALSE, row.names = FALSE, col.names = FALSE)

Kallisto

Pseudo align the reads to the reference transcriptome with Kallisto

strandedness <- "second"
# Paired end
kallisto.cmds <- run_kallisto(mate1 = mate1,
                              mate2 = mate2,
                              index = transcriptome,
                              sample.name = sample.names,
                              strandedness = strandedness,
                              out.dir = kalisto.results.dir,
                              kallisto = kallisto.path)

write.table(kallisto.cmds,"Kallisto_commands.sh", quote = FALSE, row.names = FALSE, col.names = FALSE)

TXImport for Kallisto data

mart<-"ensembl"
db<-"mmusculus_gene_ensembl" # Change to organism in study
filt<-"ensembl_gene_id"

# Create the biomaRt object
ensembl = useEnsembl(biomart=mart, dataset=db)

# Get all the transcript ids and corresponding gene ids from BiomaRt
att<-c("ensembl_transcript_id","ensembl_gene_id")
txTable<-getBM(attributes=att,mart=ensembl)

# Set the column names for the transcript to gene table
colnames(txTable)<-c("tx_id","gene_id")

# Read in the experimental design file, tab seperated
sampleTable <- read.csv("SampleDescription.txt", sep="\t", row.names=1)

# Read in the file names form the kallisto results directory
dir <- getwd()
files <- file.path(dir,kalisto.results.dir,row.names(sampleTable),"abundance.h5", fsep = .Platform$file.sep)
names(files)<-row.names(sampleTable)
 
txi <- tximport(files, type = "kallisto", tx2gene = txTable,ignoreTxVersion = TRUE, ignoreAfterBar = TRUE)

About

Repository for pipelineTools

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages