Using Ballgown R package for differential expression analysis using DE and VICE

Ballgown analysis steps

set the working directory

setwd("/de-app-work/")

install ballgown R package along with its dependencies

library(ballgown)
library(ggplot2)
install.packages("gplots")
library(gplots)
library(genefilter)
library(GenomicRanges)
library(plyr)

Read the design_matrix file

pheno_data = read.table(file ="design_matrix", header = TRUE, sep = "\t")

full path to the sample directories

sample_full_path <- paste("ballgown_input_files",pheno_data[,1], sep = '/')

Load ballgown data structure and save it to a variable “bg”

bg = ballgown(samples=as.vector(sample_full_path),pData=pheno_data)

Display a description of this object

bg

## ballgown instance with 80950 transcripts and 6 samples

Filter low-abundance genes. Here we remove all transcripts with a variance across the samples of less than one

bg_filt = subset(bg,"rowVars(texpr(bg)) >1",genomesubset=TRUE)

Load gene names for lookup later in the tutorial

bg_table = texpr(bg_filt, 'all')
bg_gene_names = unique(bg_table[, 9:10])

Pull the gene_expression data frame from the ballgown object

gene_expression = as.data.frame(gexpr(bg_filt))

View the first five rows of data (all columns) in one of the dataframes created

head(gene_expression)

##             FPKM.IS20351_DS_1_1 FPKM.IS20351_DS_2_1 FPKM.IS20351_DS_3_1
## MSTRG.1               6.6100650           11.252592           11.347587
## MSTRG.100             5.3192730           10.290616           14.342060
## MSTRG.1000            8.6150110            8.754116           11.728596
## MSTRG.1001            5.8462371            7.061811            7.776445
## MSTRG.10010           0.4591360            4.146388            6.504951
## MSTRG.10011           0.1425843            2.168228            2.894545
##             FPKM.IS20351_WW_1_1 FPKM.IS20351_WW_2_1 FPKM.IS20351_WW_3_1
## MSTRG.1                5.477058            7.710475           11.964094
## MSTRG.100              8.029486           13.910149           21.050735
## MSTRG.1000             3.819834            6.246689            5.960262
## MSTRG.1001             4.112864            3.408974            5.633885
## MSTRG.10010           16.000067           19.211029           20.660242
## MSTRG.10011           16.959782           10.992865           12.070408

View the column names; change the column names to your sample names

colnames(gene_expression) <- c("IS20351_DS_1_1","IS20351_DS_2_1","IS20351_DS_3_1","IS20351_WW_1_1","IS20351_WW_2_1","IS20351_WW_3_1")

View the row names

row.names(gene_expression)[1:10]

Determine the dimensions of the dataframe. ‘dim()’ will return the number of rows and columns

dim(gene_expression)

## [1] 23371     6

Assign colors to each. You can specify color by RGB, Hex code, or name To get a list of color names:

data_colors=c("tomato1","tomato2","tomato3","wheat1","wheat2","wheat3")

View expression values for the transcripts of a particular gene e.g “MSTRG.27571”, then display only those rows of the data.frame

i = row.names(gene_expression) == "MSTRG.27571"
gene_expression[i,]

##             IS20351_DS_1_1 IS20351_DS_2_1 IS20351_DS_3_1 IS20351_WW_1_1
## MSTRG.27571       1.960471       3.215415       3.697611       0.872269
##             IS20351_WW_2_1 IS20351_WW_3_1
## MSTRG.27571       0.476039       0.632291

What if we want to view values for a list of genes of interest all at once? e,g: “MSTRG.28956” “MSTRG.28959” “MSTRG.2896” “MSTRG.28962”

genes_of_interest = c("MSTRG.28956", "MSTRG.28959", "MSTRG.2896", "MSTRG.28962")
i = which(row.names(gene_expression) %in% genes_of_interest)
gene_expression[i,]

##             IS20351_DS_1_1 IS20351_DS_2_1 IS20351_DS_3_1 IS20351_WW_1_1
## MSTRG.28956       0.981967       1.469531       0.925434       2.838750
## MSTRG.28959       0.107997       0.365174       0.196593       0.215648
## MSTRG.2896        8.408193       7.598804       9.875687      17.607841
## MSTRG.28962       4.832179       1.124734       0.357997       2.039790
##             IS20351_WW_2_1 IS20351_WW_3_1
## MSTRG.28956       1.959053       4.331974
## MSTRG.28959       3.626117       0.199521
## MSTRG.2896       28.338808      26.362604
## MSTRG.28962       0.000000       0.000000

Load the transcript to gene index from the ballgown object

transcript_gene_table = indexes(bg)$t2g
head(transcript_gene_table)

##   t_id        g_id
## 1    1 Sb01g000200
## 2    2     MSTRG.1
## 3    3     MSTRG.2
## 4    4     MSTRG.1
## 5    5     MSTRG.3
## 6    6     MSTRG.4

Each row of data represents a transcript. Many of these transcripts represent the same gene. Determine the numbers of transcripts and unique genes

length(row.names(transcript_gene_table)) #Transcript count

## [1] 80950

length(unique(transcript_gene_table[,"g_id"])) #Unique Gene count

## [1] 61732

Plot #1 - the number of transcripts per gene.
Many genes will have only 1 transcript, some genes will have several transcripts Use the ‘table()’ command to count the number of times each gene symbol occurs (i.e. the # of transcripts that have each gene symbol) Then use the ‘hist’ command to create a histogram of these counts How many genes have 1 transcript? More than one transcript? What is the maximum number of transcripts for a single gene?

counts=table(transcript_gene_table[,"g_id"])
c_one = length(which(counts == 1))
c_more_than_one = length(which(counts > 1))
c_max = max(counts)
hist(counts, breaks=50, col="bisque4", xlab="Transcripts per gene", main="Distribution of transcript count per gene")
legend_text = c(paste("Genes with one transcript =", c_one), paste("Genes with more than one transcript =", c_more_than_one), paste("Max transcripts for single gene = ", c_max))
legend("topright", legend_text, lty=NULL)

Plot #2 - the distribution of transcript sizes as a histogram In this analysis we supplied StringTie with transcript models so the lengths will be those of known transcripts However, if we had used a de novo transcript discovery mode, this step would give us some idea of how well transcripts were being assembled If we had a low coverage library, or other problems, we might get short ‘transcripts’ that are actually only pieces of real transcripts

full_table <- texpr(bg , 'all')
hist(full_table$length, breaks=50, xlab="Transcript length (bp)", main="Distribution of transcript lengths", col="steelblue")

Summarize FPKM values for all samples What are the minimum and maximum FPKM values for a particular library?

min(gene_expression[,"IS20351_DS_1_1"])

## [1] 0

max(gene_expression[,"IS20351_DS_2_1"])

## [1] 39919.22

min(gene_expression[,"IS20351_DS_3_1"])

## [1] 0

max(gene_expression[,"IS20351_WW_1_1"])

## [1] 12070.91

min(gene_expression[,"IS20351_WW_2_1"])

## [1] 0

max(gene_expression[,"IS20351_WW_3_1"])

## [1] 30315.73

Set the minimum non-zero FPKM values for use later. Do this by grabbing a copy of all data values, coverting 0’s to NA, and calculating the minimum or all non NA values

min_nonzero=1

Set the columns for finding FPKM and create shorter names for figures

data_columns=c(1:6)
short_names=c("sen_DS_1","sen_DS_2","sen_DS_3","sen_WW_1","sen_WW_2","sen_WW_3")

Plot #3 - View the range of values and general distribution of FPKM values for all libraries Create boxplots for this purpose Display on a log2 scale and add the minimum non-zero value to avoid log2(0)

boxplot(log2(gene_expression[,data_columns]+min_nonzero), col=data_colors, names=short_names, las=2, ylab="log2(FPKM)", main="Distribution of FPKMs for all 6 libraries")

Note that the bold horizontal line on each boxplot is the median

Plot #4 - plot a pair of replicates to assess reproducibility of technical replicates Tranform the data by converting to log2 scale after adding an arbitrary small value to avoid log2(0)

x = gene_expression[,"IS20351_DS_1_1"]
y = gene_expression[,"IS20351_DS_2_1"]
plot(x=log2(x+min_nonzero), y=log2(y+min_nonzero), pch=16, col="blue", cex=0.25, xlab="FPKM (IS20351_DS, Replicate 1)", ylab="FPKM (IS20351_DS, Replicate 2)", main="Comparison of expression values for a pair of replicates")
abline(a=0,b=1)
rs=cor(x,y)^2
legend("topleft", paste("R squared = ", round(rs, digits=3), sep=""), lwd=1, col="black")

Plot #5 - Scatter plots with a large number of data points can be misleading … regenerate this figure as a density scatter plot

colors = colorRampPalette(c("white", "blue", "#007FFF", "cyan","#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))
smoothScatter(x=log2(x+min_nonzero), y=log2(y+min_nonzero), xlab="FPKM (IS20351_DS, Replicate 1)", ylab="FPKM (IS20351_DS, Replicate 2)", main="Comparison of expression values for a pair of replicates", colramp=colors, nbin=200)

Compare the correlation ‘distance’ between all replicates Do we see the expected pattern for all libraries (i.e. replicates most similar, then DS vs. WW)? Calculate the FPKM sum for all 6 libraries

gene_expression[,"sum"]=apply(gene_expression[,data_columns], 1, sum)

Identify the genes with a grand sum FPKM of at least 5 - we will filter out the genes with very low expression across the board

i = which(gene_expression[,"sum"] > 5)

Calculate the correlation between all pairs of data

r=cor(gene_expression[i,data_columns], use="pairwise.complete.obs", method="pearson")
r

##                IS20351_DS_1_1 IS20351_DS_2_1 IS20351_DS_3_1 IS20351_WW_1_1
## IS20351_DS_1_1      1.0000000      0.9942428      0.9840948      0.8211440
## IS20351_DS_2_1      0.9942428      1.0000000      0.9915486      0.8122250
## IS20351_DS_3_1      0.9840948      0.9915486      1.0000000      0.7987260
## IS20351_WW_1_1      0.8211440      0.8122250      0.7987260      1.0000000
## IS20351_WW_2_1      0.9775692      0.9777724      0.9772803      0.8317814
## IS20351_WW_3_1      0.9614445      0.9617983      0.9727414      0.8210495
##                IS20351_WW_2_1 IS20351_WW_3_1
## IS20351_DS_1_1      0.9775692      0.9614445
## IS20351_DS_2_1      0.9777724      0.9617983
## IS20351_DS_3_1      0.9772803      0.9727414
## IS20351_WW_1_1      0.8317814      0.8210495
## IS20351_WW_2_1      1.0000000      0.9825595
## IS20351_WW_3_1      0.9825595      1.0000000

Plot #8 - Convert correlation to ‘distance’, and use ‘multi-dimensional scaling’ to display the relative differences between libraries This step calculates 2-dimensional coordinates to plot points for each library Libraries with similar expression patterns (highly correlated to each other) should group together What pattern do we expect to see, given the types of libraries we have (technical replicates, biologal replicates, DS/WW)?

d=1-r
mds=cmdscale(d, k=2, eig=TRUE)
par(mfrow=c(1,1))
plot(mds$points, type="n", xlab="", ylab="", main="MDS distance plot (all non-zero genes) for all libraries", xlim=c(-0.15,0.15), ylim=c(-0.15,0.15))
points(mds$points[,1], mds$points[,2], col="grey", cex=2, pch=16)
text(mds$points[,1], mds$points[,2], short_names, col=data_colors)

Calculate the differential expression results including significance

results_genes = stattest(bg_filt, feature="gene", covariate="group", getFC=TRUE, meas="FPKM")
results_genes = merge(results_genes,bg_gene_names,by.x=c("id"),by.y=c("gene_id"))

Plot #9 - View the distribution of differential expression values as a histogram Display only those that are significant according to Ballgown

sig=which(results_genes$pval<0.05)
results_genes[,"de"] = log2(results_genes[,"fc"])
hist(results_genes[sig,"de"], breaks=50, col="seagreen", xlab="log2(Fold change) Sen_DS vs Sen_WW", main="Distribution of differential expression values")
abline(v=-2, col="black", lwd=2, lty=2)
abline(v=2, col="black", lwd=2, lty=2)
legend("topleft", "Fold-change > 4", lwd=2, lty=2)

Plot #10 - Display the grand expression values from UHR and HBR and mark those that are significantly differentially expressed

gene_expression[,"Sen_DS"]=apply(gene_expression[,c(1:3)], 1, mean)
gene_expression[,"Sen_WW"]=apply(gene_expression[,c(3:6)], 1, mean)
x=log2(gene_expression[,"Sen_DS"]+min_nonzero)
y=log2(gene_expression[,"Sen_WW"]+min_nonzero)
plot(x=x, y=y, pch=16, cex=0.25, xlab="Sen_DS FPKM (log2)", ylab="Sen_WW FPKM (log2)", main="Sen_DS vs Sen_WW FPKMs")
abline(a=0, b=1)
xsig=x[sig]
ysig=y[sig]
points(x=xsig, y=ysig, col="magenta", pch=16, cex=0.5)
legend("topleft", "Significant", col="magenta", pch=16)

Write a simple table of differentially expressed transcripts to an output file Each should be significant with a log2 fold-change >= 2

sigpi = which(results_genes[,"pval"]<0.05)
sigp = results_genes[sigpi,]
sigde = which(abs(sigp[,"de"]) >= 2)
sig_tn_de = sigp[sigde,]

Order the output by or p-value and then break ties using fold-change

o = order(sig_tn_de[,"qval"], -abs(sig_tn_de[,"de"]), decreasing=FALSE)
output = sig_tn_de[o,c("gene_name","id","fc","pval","qval","de")]
write.table(output, file="SigDE.txt", sep="\t", row.names=FALSE, quote=FALSE)
#View selected columns of the first 25 lines of output
output[1:10,c(1,4,5)]

##         gene_name         pval       qval
## 10693 Sb04g022780 4.468390e-06 0.01670524
## 17561           . 4.519096e-05 0.03902020
## 12960           . 3.018199e-05 0.03902020
## 17707           . 4.355827e-05 0.03902020
## 13446 Sb05g022540 4.814110e-05 0.03902020
## 15005 Sb06g022500 6.746485e-05 0.04379781
## 21545           . 7.239141e-05 0.04474925
## 10772 Sb04g023540 9.863582e-05 0.04610436
## 21394 Sb04g003900 9.559125e-05 0.04610436
## 21342           . 2.114246e-04 0.04661513

Using Ballgown R package for differential expression analysis using DE and VICE

Upendra Devisetty

02/11/2019

Introduction

Ballgown analysis steps