#!/usr/bin/perl
#I developed this pipeline using perl to take raw fastq files and run quality control during different stages of file generation; merging and unmerging of fastq files; sorting to remove RNA contamination; trimming to remove sequencing adapters; alignment of RNA-seq reads to a reference genome; sorting of .sam files and generation of .bam files; indexing of .bam files to view in IGV; merging aligned reads into full-length transcripts; merging .gtf transcript files; and generation of table counts for differential expression using R.
#To run this script, you need the following programs: FastQC, SortMeRNA, Trimmomatic, HISAT2, SAMtools, and StringTie. The Ballgown package can be downloaded in R to utilize the table count files for differential expression.
use warnings;
use strict;
my @samples = qw(VcV1 VcV2 VcV3 VcN1 VcN2 VcN3);
foreach(@samples){
do {
system ("fastqc -o fastqc_results --extract ${_}*gz");
system ("gunzip ${_}*.gz");
system ("bash", "merge-paired-reads.sh", "${_}_R1_001.fastq", "${_}_R2_001.fastq", "${_}_merged.fastq");
system ("sortmerna", "--ref", "/home/alexanderselvey/sortmerna/rRNA_databases/silva-bac-16s-id90.fasta,/home/alexanderselvey/sortmerna/index/silva-bac-16s-db:\/home/alexanderselvey/sortmerna/rRNA_databases/silva-bac-23s-id98.fasta,/home/alexanderselvey/sortmerna/index/silva-bac-23s-db:\/home/alexanderselvey/sortmerna/rRNA_databases/silva-arc-16s-id95.fasta,/home/alexanderselvey/sortmerna/index/silva-arc-16s-db:\/home/alexanderselvey/sortmerna/rRNA_databases/silva-arc-23s-id98.fasta,/home/alexanderselvey/sortmerna/index/silva-arc-23s-db:\/home/alexanderselvey/sortmerna/rRNA_databases/silva-euk-18s-id95.fasta,/home/alexanderselvey/sortmerna/index/silva-euk-18s-db:\/home/alexanderselvey/sortmerna/rRNA_databases/silva-euk-28s-id98.fasta,/home/alexanderselvey/sortmerna/index/silva-euk-28s:\/home/alexanderselvey/sortmerna/rRNA_databases/rfam-5s-database-id98.fasta,/home/alexanderselvey/sortmerna/index/rfam-5s-db:\/home/alexanderselvey/sortmerna/rRNA_databases/rfam-5.8s-database-id98.fasta,/home/alexanderselvey/sortmerna/index/rfam-5.8s-db", "--reads", "/media/alexanderselvey/selvey_alexander_external/rnaseq_pipeline_project/VcB_data/${_}_merged.fastq", "--paired_in", "-a", "11", "--log", "--fastx", "--aligned", "${_}_merged_rRNA", "--other", "${_}_merged_sortmerna");
system ("bash", "unmerge-paired-reads.sh", "${_}_merged_sortmerna.fastq", "${_}_sortmerna_R1.fastq", "${_}_sortmerna_R2.fastq");
system ("fastqc -o fastqc_results --extract ${_}_sortmerna_R*.fastq");
system ("java", "-jar", "/home/alexanderselvey/Downloads/Trimmomatic/Trimmomatic-0.38/trimmomatic-0.38.jar", "PE", "-threads", "11", "-trimlog", "${_}_trimmomatic_trimlog.log", "${_}_sortmerna_R1.fastq", "${_}_sortmerna_R2.fastq", "${_}_sortmerna_R1_trim_pe.fastq", "${_}_sortmerna_R1_trim_se.fastq", "${_}_sortmerna_R2_trim_pe.fastq", "${_}_sortmerna_R2_trim_se.fastq", "ILLUMINACLIP:/home/alexanderselvey/Downloads/Trimmomatic/Trimmomatic-0.38/adapters/TruSeq3-PE.fa:2:30:10", "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15", "MINLEN:36");
system ("fastqc -o fastqc_results --extract ${_}_sortmerna_R*_trim_pe.fastq");
system("hisat2", "-p", "11", "--dta", "-x", "Vcardui_hisat2_index/Vcardui_index", "-1", "${_}_sortmerna_R1_trim_pe.fastq", "-2", "${_}_sortmerna_R2_trim_pe.fastq", "-S", "${_}_alignedh2_Vcardui.sam");
system("samtools", "sort", "-@", "11", "-o", "${_}_alignedh2_Vcardui.bam", "${_}_alignedh2_Vcardui.sam");
system("samtools", "index", "${_}_alignedh2_Vcardui.bam", "${_}_alignedh2_Vcardui.bam.bai");
system("stringtie", "-p", "11", "-o", "${_}.gtf", "-l", "${_}", "${_}_alignedh2_Vcardui.bam");
}
}
system("stringtie", "--merge", "-p", "11", "-o", "VcB_combined.gtf", "mergelist.txt");
foreach(@samples){
do{
system("stringtie", "-e", "-B", "-p", "11", "-G", "VcB_combined.gtf", "-o", "ballgown/${_}/${_}.gtf", "${_}_alignedh2_Vcardui.bam");
}
}
library(DESeq2)
library(EnhancedVolcano)
setwd()
countData = as.matrix(read.csv("/media/alexander/selvey_alexander_external/rnaseq_pipeline_project/Vc_data_all/gene_count_matrix.csv",row.names="gene_id"))
colData = read.csv("/media/alexander/selvey_alexander_external/rnaseq_pipeline_project/Vc_data_all/Vcardui_all_phenodata.csv",sep=',',row.names=1)
colData$sample = factor(colData$sample)
all(rownames(colData) %in% colnames(countData))
countData = countData[,rownames(colData)]
all(rownames(colData)==colnames(countData))
dds = DESeqDataSetFromMatrix(countData=countData,colData=colData,design=~treatment)
dds = DESeq(dds, fitType='local')
res = results(dds,alpha=0.05)
resvn = results(dds, contrast=c("treatment","V","N"),alpha=0.05)
resbs = results(dds, contrast=c("treatment","B","S"),alpha=0.05)
resvb = results(dds, contrast=c("treatment","V","B"),alpha=0.05)
ressn = results(dds, contrast=c("treatment","S","N"),alpha=0.05)
#V-vs-N
(resOrderedvn = resvn[order(resvn$padj),])
(resOrderedvn1 = resvn[order(resvn$log2FoldChange),])
write.csv(resOrderedvn,"VcVN_genes_ordered_by_padj.csv",row.names=FALSE)
write.csv(resOrderedvn1,"VcVN_genes_ordered_by_fc.csv",row.names=FALSE)
#B-vs-S
(resOrderedbs = resbs[order(resbs$padj),])
(resOrderedbs1 = resbs[order(resbs$log2FoldChange),])
write.csv(resOrderedbs,"VcBS_genes_ordered_by_padj.csv",row.names=FALSE)
write.csv(resOrderedbs1,"VcBS_genes_ordered_by_fc.csv",row.names=FALSE)
#V-vs-B
(resOrderedvb = resvb[order(resvb$padj),])
(resOrderedvb1 = resvb[order(resvb$log2FoldChange),])
write.csv(resOrderedvb,"VcVB_genes_ordered_by_padj.csv",row.names=FALSE)
write.csv(resOrderedvb1,"VcVB_genes_ordered_by_fc.csv",row.names=FALSE)
#S-vs-N
(resOrderedsn = ressn[order(ressn$padj),])
(resOrderedsn1 = ressn[order(ressn$log2FoldChange),])
write.csv(resOrderedsn,"VcSN_genes_ordered_by_padj.csv",row.names=FALSE)
write.csv(resOrderedsn1,"VcSN_genes_ordered_by_fc.csv",row.names=FALSE)
EnhancedVolcano(resvn,x='log2FoldChange',y='pvalue',lab=rownames(resvn),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~italic(P)),title='V vs. N Volcano Plot by P-Value',xlim=c(-35,35))
EnhancedVolcano(resvn,x='log2FoldChange',y='padj',lab=rownames(resvn),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~adjusted~italic(P)),title='V vs. N Volcano Plot by Adjusted P-Value (FDR)',xlim=c(-35,35))
EnhancedVolcano(resbs,x='log2FoldChange',y='pvalue',lab=rownames(resbs),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~italic(P)),title='B vs. S Volcano Plot by P-Value',xlim=c(-35,35))
EnhancedVolcano(resbs,x='log2FoldChange',y='padj',lab=rownames(resbs),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~adjusted~italic(P)),title='B vs. S Volcano Plot by Adjusted P-Value (FDR)',xlim=c(-35,35))
EnhancedVolcano(resvb,x='log2FoldChange',y='pvalue',lab=rownames(resvb),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~italic(P)),title='V vs. B Volcano Plot by P-Value',xlim=c(-35,35))
EnhancedVolcano(resvb,x='log2FoldChange',y='padj',lab=rownames(resvb),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~adjusted~italic(P)),title='V vs. B Volcano Plot by Adjusted P-Value (FDR)',xlim=c(-35,35))
EnhancedVolcano(ressn,x='log2FoldChange',y='pvalue',lab=rownames(ressn),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~italic(P)),title='S vs. N Volcano Plot by P-Value',xlim=c(-35,35))
EnhancedVolcano(ressn,x='log2FoldChange',y='padj',lab=rownames(ressn),xlab=bquote(~Log[2]~ "Fold Change"),ylab=bquote(~-Log[10]~adjusted~italic(P)),title='S vs. N Volcano Plot by Adjusted P-Value (FDR)',xlim=c(-35,35))
plotMA(resvn,alpha=0.05,main='V vs. N MA Plot (alpha=0.05)')
plotMA(resbs,alpha=0.05,main='B vs. S MA Plot (alpha=0.05)')
plotMA(resvb,alpha=0.05,main='V vs. B MA Plot (alpha=0.05)')
plotMA(ressn,alpha=0.05,main='S vs. N MA Plot (alpha=0.05)')
res1 = subset(res,subset=res@listData[['padj']] < 0.05)
vsd = vst(dds,blind=FALSE,fitType='local')
plotPCA(vsd,intgroup='treatment')
plotPCA(vsd,intgroup='sample')
res.df = as.data.frame(c(resvn,resvb,resbs,ressn))
ggplot(res.df[!is.na(res.df$pvalue), ], aes(x = pvalue)) +
geom_histogram(alpha=.5, position='identity', bins = 50) +
labs(title='Histogram of unadjusted p-values') +
xlab('Unadjusted p-values') +
xlim(c(0, 1.0005))
ggplot(res.df[!is.na(res.df$padj), ], aes(x = padj)) +
geom_histogram(alpha=.5, position='identity', bins = 50) +
labs(title=paste('Histogram of', elementMetadata(c(resvn,resvb,resbs,ressn))$description[grep('adjusted', elementMetadata(c(resvn,resvb,resbs,ressn))$description)])) +
xlab('Adjusted p-values') +
xlim(c(0, 1.0005))+ggplot2::ylim(c(0,175))