Ballgown is a software package designed to facilitate flexible differential expression analysis of RNA-Seq data. It also provides functions to organize, visualize, and analyze the expression measurements for your transcriptome assembly.
We will use the output from the StringTie step that created the ballgown input files.Here we are doing a pairwise comparison for differential expression in sensitive genotype(IS20351) under Drought Stress(DS) and Well Watered(WW) condition. We have 3 replicates under each condition
Follow the steps; copy paste the command on the R studio console and hit enter to run the code
setwd("/de-app-work/")
library(ballgown)
library(ggplot2)
install.packages("gplots")
library(gplots)
library(genefilter)
library(GenomicRanges)
library(plyr)
pheno_data = read.table(file ="design_matrix", header = TRUE, sep = "\t")
sample_full_path <- paste("ballgown_input_files",pheno_data[,1], sep = '/')
bg = ballgown(samples=as.vector(sample_full_path),pData=pheno_data)
bg
## ballgown instance with 80950 transcripts and 6 samples
bg_filt = subset(bg,"rowVars(texpr(bg)) >1",genomesubset=TRUE)
bg_table = texpr(bg_filt, 'all')
bg_gene_names = unique(bg_table[, 9:10])
gene_expression = as.data.frame(gexpr(bg_filt))
head(gene_expression)
## FPKM.IS20351_DS_1_1 FPKM.IS20351_DS_2_1 FPKM.IS20351_DS_3_1
## MSTRG.1 6.6100650 11.252592 11.347587
## MSTRG.100 5.3192730 10.290616 14.342060
## MSTRG.1000 8.6150110 8.754116 11.728596
## MSTRG.1001 5.8462371 7.061811 7.776445
## MSTRG.10010 0.4591360 4.146388 6.504951
## MSTRG.10011 0.1425843 2.168228 2.894545
## FPKM.IS20351_WW_1_1 FPKM.IS20351_WW_2_1 FPKM.IS20351_WW_3_1
## MSTRG.1 5.477058 7.710475 11.964094
## MSTRG.100 8.029486 13.910149 21.050735
## MSTRG.1000 3.819834 6.246689 5.960262
## MSTRG.1001 4.112864 3.408974 5.633885
## MSTRG.10010 16.000067 19.211029 20.660242
## MSTRG.10011 16.959782 10.992865 12.070408
colnames(gene_expression) <- c("IS20351_DS_1_1","IS20351_DS_2_1","IS20351_DS_3_1","IS20351_WW_1_1","IS20351_WW_2_1","IS20351_WW_3_1")
row.names(gene_expression)[1:10]
dim(gene_expression)
## [1] 23371 6
data_colors=c("tomato1","tomato2","tomato3","wheat1","wheat2","wheat3")
i = row.names(gene_expression) == "MSTRG.27571"
gene_expression[i,]
## IS20351_DS_1_1 IS20351_DS_2_1 IS20351_DS_3_1 IS20351_WW_1_1
## MSTRG.27571 1.960471 3.215415 3.697611 0.872269
## IS20351_WW_2_1 IS20351_WW_3_1
## MSTRG.27571 0.476039 0.632291
genes_of_interest = c("MSTRG.28956", "MSTRG.28959", "MSTRG.2896", "MSTRG.28962")
i = which(row.names(gene_expression) %in% genes_of_interest)
gene_expression[i,]
## IS20351_DS_1_1 IS20351_DS_2_1 IS20351_DS_3_1 IS20351_WW_1_1
## MSTRG.28956 0.981967 1.469531 0.925434 2.838750
## MSTRG.28959 0.107997 0.365174 0.196593 0.215648
## MSTRG.2896 8.408193 7.598804 9.875687 17.607841
## MSTRG.28962 4.832179 1.124734 0.357997 2.039790
## IS20351_WW_2_1 IS20351_WW_3_1
## MSTRG.28956 1.959053 4.331974
## MSTRG.28959 3.626117 0.199521
## MSTRG.2896 28.338808 26.362604
## MSTRG.28962 0.000000 0.000000
transcript_gene_table = indexes(bg)$t2g
head(transcript_gene_table)
## t_id g_id
## 1 1 Sb01g000200
## 2 2 MSTRG.1
## 3 3 MSTRG.2
## 4 4 MSTRG.1
## 5 5 MSTRG.3
## 6 6 MSTRG.4
length(row.names(transcript_gene_table)) #Transcript count
## [1] 80950
length(unique(transcript_gene_table[,"g_id"])) #Unique Gene count
## [1] 61732
counts=table(transcript_gene_table[,"g_id"])
c_one = length(which(counts == 1))
c_more_than_one = length(which(counts > 1))
c_max = max(counts)
hist(counts, breaks=50, col="bisque4", xlab="Transcripts per gene", main="Distribution of transcript count per gene")
legend_text = c(paste("Genes with one transcript =", c_one), paste("Genes with more than one transcript =", c_more_than_one), paste("Max transcripts for single gene = ", c_max))
legend("topright", legend_text, lty=NULL)
full_table <- texpr(bg , 'all')
hist(full_table$length, breaks=50, xlab="Transcript length (bp)", main="Distribution of transcript lengths", col="steelblue")
min(gene_expression[,"IS20351_DS_1_1"])
## [1] 0
max(gene_expression[,"IS20351_DS_2_1"])
## [1] 39919.22
min(gene_expression[,"IS20351_DS_3_1"])
## [1] 0
max(gene_expression[,"IS20351_WW_1_1"])
## [1] 12070.91
min(gene_expression[,"IS20351_WW_2_1"])
## [1] 0
max(gene_expression[,"IS20351_WW_3_1"])
## [1] 30315.73
min_nonzero=1
data_columns=c(1:6)
short_names=c("sen_DS_1","sen_DS_2","sen_DS_3","sen_WW_1","sen_WW_2","sen_WW_3")
boxplot(log2(gene_expression[,data_columns]+min_nonzero), col=data_colors, names=short_names, las=2, ylab="log2(FPKM)", main="Distribution of FPKMs for all 6 libraries")
Note that the bold horizontal line on each boxplot is the median
x = gene_expression[,"IS20351_DS_1_1"]
y = gene_expression[,"IS20351_DS_2_1"]
plot(x=log2(x+min_nonzero), y=log2(y+min_nonzero), pch=16, col="blue", cex=0.25, xlab="FPKM (IS20351_DS, Replicate 1)", ylab="FPKM (IS20351_DS, Replicate 2)", main="Comparison of expression values for a pair of replicates")
abline(a=0,b=1)
rs=cor(x,y)^2
legend("topleft", paste("R squared = ", round(rs, digits=3), sep=""), lwd=1, col="black")
colors = colorRampPalette(c("white", "blue", "#007FFF", "cyan","#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
smoothScatter(x=log2(x+min_nonzero), y=log2(y+min_nonzero), xlab="FPKM (IS20351_DS, Replicate 1)", ylab="FPKM (IS20351_DS, Replicate 2)", main="Comparison of expression values for a pair of replicates", colramp=colors, nbin=200)
gene_expression[,"sum"]=apply(gene_expression[,data_columns], 1, sum)
i = which(gene_expression[,"sum"] > 5)
r=cor(gene_expression[i,data_columns], use="pairwise.complete.obs", method="pearson")
r
## IS20351_DS_1_1 IS20351_DS_2_1 IS20351_DS_3_1 IS20351_WW_1_1
## IS20351_DS_1_1 1.0000000 0.9942428 0.9840948 0.8211440
## IS20351_DS_2_1 0.9942428 1.0000000 0.9915486 0.8122250
## IS20351_DS_3_1 0.9840948 0.9915486 1.0000000 0.7987260
## IS20351_WW_1_1 0.8211440 0.8122250 0.7987260 1.0000000
## IS20351_WW_2_1 0.9775692 0.9777724 0.9772803 0.8317814
## IS20351_WW_3_1 0.9614445 0.9617983 0.9727414 0.8210495
## IS20351_WW_2_1 IS20351_WW_3_1
## IS20351_DS_1_1 0.9775692 0.9614445
## IS20351_DS_2_1 0.9777724 0.9617983
## IS20351_DS_3_1 0.9772803 0.9727414
## IS20351_WW_1_1 0.8317814 0.8210495
## IS20351_WW_2_1 1.0000000 0.9825595
## IS20351_WW_3_1 0.9825595 1.0000000
d=1-r
mds=cmdscale(d, k=2, eig=TRUE)
par(mfrow=c(1,1))
plot(mds$points, type="n", xlab="", ylab="", main="MDS distance plot (all non-zero genes) for all libraries", xlim=c(-0.15,0.15), ylim=c(-0.15,0.15))
points(mds$points[,1], mds$points[,2], col="grey", cex=2, pch=16)
text(mds$points[,1], mds$points[,2], short_names, col=data_colors)
results_genes = stattest(bg_filt, feature="gene", covariate="group", getFC=TRUE, meas="FPKM")
results_genes = merge(results_genes,bg_gene_names,by.x=c("id"),by.y=c("gene_id"))
sig=which(results_genes$pval<0.05)
results_genes[,"de"] = log2(results_genes[,"fc"])
hist(results_genes[sig,"de"], breaks=50, col="seagreen", xlab="log2(Fold change) Sen_DS vs Sen_WW", main="Distribution of differential expression values")
abline(v=-2, col="black", lwd=2, lty=2)
abline(v=2, col="black", lwd=2, lty=2)
legend("topleft", "Fold-change > 4", lwd=2, lty=2)
gene_expression[,"Sen_DS"]=apply(gene_expression[,c(1:3)], 1, mean)
gene_expression[,"Sen_WW"]=apply(gene_expression[,c(3:6)], 1, mean)
x=log2(gene_expression[,"Sen_DS"]+min_nonzero)
y=log2(gene_expression[,"Sen_WW"]+min_nonzero)
plot(x=x, y=y, pch=16, cex=0.25, xlab="Sen_DS FPKM (log2)", ylab="Sen_WW FPKM (log2)", main="Sen_DS vs Sen_WW FPKMs")
abline(a=0, b=1)
xsig=x[sig]
ysig=y[sig]
points(x=xsig, y=ysig, col="magenta", pch=16, cex=0.5)
legend("topleft", "Significant", col="magenta", pch=16)
sigpi = which(results_genes[,"pval"]<0.05)
sigp = results_genes[sigpi,]
sigde = which(abs(sigp[,"de"]) >= 2)
sig_tn_de = sigp[sigde,]
o = order(sig_tn_de[,"qval"], -abs(sig_tn_de[,"de"]), decreasing=FALSE)
output = sig_tn_de[o,c("gene_name","id","fc","pval","qval","de")]
write.table(output, file="SigDE.txt", sep="\t", row.names=FALSE, quote=FALSE)
#View selected columns of the first 25 lines of output
output[1:10,c(1,4,5)]
## gene_name pval qval
## 10693 Sb04g022780 4.468390e-06 0.01670524
## 17561 . 4.519096e-05 0.03902020
## 12960 . 3.018199e-05 0.03902020
## 17707 . 4.355827e-05 0.03902020
## 13446 Sb05g022540 4.814110e-05 0.03902020
## 15005 Sb06g022500 6.746485e-05 0.04379781
## 21545 . 7.239141e-05 0.04474925
## 10772 Sb04g023540 9.863582e-05 0.04610436
## 21394 Sb04g003900 9.559125e-05 0.04610436
## 21342 . 2.114246e-04 0.04661513