This is a script that will do differential gene expression (DGE) analysis for RNA-seq experiments using the bioconductor package edgeR.
This project is an RNA-seq experiment being done to look at the differential gene expression levels of an RNAi knockdown of the histone modifier ‘gene1’, compared to an RNAi control, ‘control’. This was done at three time points, day 9, 12 and 15 after RNAi exposure. As stated above, DGE analysis was done using the bioconductor package edgeR.
The first step is to load in the necessary packages that are going to be used in this experiment.
library(RColorBrewer)
library(genefilter)
library(edgeR)
library(ggplot2)
library(knitr)
Next, I load in the counts table and combine the samples that are run on multiple lanes. This is done so that a sample has one column of counts for all genes.
setwd("~/Desktop/RNA.seq.example/")
### Read in the counts table
counts <- read.table("sam_counts.txt", sep="\t", header=T, as.is=T)
### Remove the lane from the column name so it just shows each sample name (gene1_12_1_6 -> gene1_12_1)
colnames(counts) <- sub("_[0-9]$","",colnames(counts))
### Sum the counts from the aliquots (all lanes combined into one row)
scounts <- t(apply(counts, 1, function(x) {
tapply(x, factor(colnames(counts)), sum)
}))
### Test a couple of columns to check and make sure our counts table looks good
| gene1_12_1 | gene1_12_1.1 | gene1_12_1.2 | gene1_12_2 | gene1_12_2.1 | |
|---|---|---|---|---|---|
| SMED30030523 | 19 | 21 | 20 | 11 | 19 |
| SMED30018010 | 4 | 3 | 2 | 2 | 1 |
| SMED30032462 | 2 | 4 | 0 | 1 | 0 |
| SMED30024982 | 0 | 0 | 1 | 0 | 0 |
| SMED30017564 | 92 | 116 | 102 | 132 | 142 |
| SMED30034967 | 932 | 934 | 976 | 1019 | 983 |
Following, I create a summary table that I can send back to my researcher that outlines what each sample is and the experimental factors involved.
### Creating a Samples Data frame
### Sample Column
Sample <- colnames(scounts)
### RNAi column
RNAi <- c(rep("control", 12), rep("gene1", 12))
### Replicate Column
Replicate <- rep(1:4,6)
### Day Column
Day <- c(rep(12,4), rep(15, 4), rep(9, 4), rep(12, 4), rep(15, 4), rep(9, 4))
### Create a name that is more structured
Alias <- paste(RNAi, Day, Replicate, sep="_")
### Creat Summary Table
targets <- data.frame(Sample, RNAi, Day, Replicate, Alias)
### Check to make sure it works
kable(head(targets), align = 'c')
| Sample | RNAi | Day | Replicate | Alias |
|---|---|---|---|---|
| control_12_1 | control | 12 | 1 | control_12_1 |
| control_12_2 | control | 12 | 2 | control_12_2 |
| control_12_3 | control | 12 | 3 | control_12_3 |
| control_12_4 | control | 12 | 4 | control_12_4 |
| control_15_1 | control | 15 | 1 | control_15_1 |
| control_15_2 | control | 15 | 2 | control_15_2 |
### Write this file out for researcher
write.table(targets, "Samples.txt", quote=FALSE, row.names=FALSE, col.names=T, sep="\t")
### Change the sample names in our counts table to our more structured name (RNAi_Tisssue_Day_Replicate)
colnames(scounts) <- Alias
The next step is to filter out unmapped and ribosomal reads, as well as genes with low read counts.
### Filter out unmapped reads designated "*"
mcounts <- scounts[-match("*", rownames(scounts)),]
### Get rid of main ribo transcript
mcounts <- mcounts[-match("SMED30032887", rownames(mcounts)),]
### Get rid of other ribo28s and 16s transcripts
mcounts <- mcounts[-which(rownames(mcounts) == "SMED30032663"),]
mcounts <- mcounts[-which(rownames(mcounts) == "SMED30027845"),]
### Filter out genes with low read counts
countData <- mcounts[rowSums(cpm(mcounts)) > 4,]
Next, to ensure quality it is important to plot some statistics about the library. Specifically, I plotted the correlation between samples and a bar plot of the amount of mapped and total reads.
### Preload a predifined correlation plot function
source("~/Desktop/RNA.seq.example/myImagePlot.R")
myImagePlot(cor(cpm(countData)),
title= "Mapped Counts All Samples")
### Create a vector of total counts in millions for each sample to be plotted
totalCounts <- colSums(scounts[-match(c("SMED30032887",
"SMED30032663",
"SMED30027845"),
rownames(scounts)),])/10^6
###Create a vector of mapped counts to be plotted in Millions of Reads
mappedCounts <- colSums(mcounts)/10^6
### Create a vector of colors for each sample type using RColorBrewer
allSample.cols <- rep(brewer.pal(6, "Set1"), each=4)
### Change the margins so the sample names will fit
mymar <- par()$mar
mymar[2] <- 7
mymar[1] <- 4
par(mar = mymar)
###Use the barplot function
barplot(totalCounts,
col = allSample.cols,
xlab="Million Reads",
main="Total Counts",
horiz=TRUE,
las=1)
barplot(mappedCounts,
col=allSample.cols,
xlab="Million Reads",
main="Mapped Counts",
horiz=TRUE,
las=1)
Next, I run the edgeR pipeline for DGE following the glm model.
###EdgeR
y <- DGEList(countData)
### plot an MDS plot to see how all the samples scale in multidemensional space
plotMDS(y, col=allSample.cols, main="MDS Plot")
###Create Groups represententing individual sample properties so they can be contrasted
Group <- factor(paste(targets$RNAi, targets$Day, sep="."))
###Calculate Dispersions
y <- calcNormFactors(y)
design <- model.matrix(~0+Group)
colnames(design) <- levels(Group)
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTrendedDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)
### Create contrasts
my.contrasts <- makeContrasts(gene1.12=gene1.12-control.12,
gene1.15=gene1.15-control.15,
gene1.9=gene1.9-control.9,
levels=design)
# Run the likelihood ratio test on all contrasts
gene1.12 <- glmLRT(fit, contrast=my.contrasts[,"gene1.12"])
gene1.15 <- glmLRT(fit, contrast=my.contrasts[,"gene1.15"])
gene1.9 <- glmLRT(fit, contrast=my.contrasts[,"gene1.9"])
This is a custom function that pulls out the indexes of DE genes, as well as prints out some stats about how many genes were found.
sumStats <- function(x,fc=1,pc=0.05){
up <- x$table$logFC > 0
dn <- x$table$logFC < 0
# indexes of genes with padj < 0.05
padj <- p.adjust(x$table$PValue, method="BH") < pc
# indexes of genes with pvals < 0.05
pval <- x$table$PValue < pc
# indexes of genes with pval of 0.05 and FC of greator than 1 or -1
ng.up <- x$table$logFC > fc & pval
ng.dn <- x$table$logFC < -1*fc & pval
# print out a summary table
cat(c("padj", "pval","up","dn"),"\n")
cat(c(sum(padj),
sum(pval),
sum(ng.up),
sum(ng.dn)),
"\n")
# return a list of indexes for the genes that are DE by pval and padj
return(list(up=which(pval & up), dn=which(pval & dn), upfc=which(ng.up), dnfc=which(ng.dn), padj=which(padj)))
}
Next, I parse the edgeR output to make objects of stats about each gene.
### Create an object with the indexes of genes that are DE for each experiment
DEi <- list(gene1.12=sumStats(gene1.12, pc=0.01),
gene1.15=sumStats(gene1.15, pc=0.01),
gene1.9=sumStats(gene1.9, pc=0.01))
## padj pval up dn
## 1006 2832 122 367
## padj pval up dn
## 1055 2648 169 376
## padj pval up dn
## 1308 3079 75 491
### Create an object of the logFC for each expiriment
m <- cbind(gene1.12$table$logFC,
gene1.15$table$logFC,
gene1.9$table$logFC)
colnames(m) <- colnames(my.contrasts)
rownames(m) <- rownames(gene1.15$table)
### Create an object of the pvals for each gene in each expiriment
pvals <- cbind(gene1.12$table$PValue,
gene1.15$table$PValue,
gene1.9$table$PValue)
colnames(pvals) <- colnames(my.contrasts)
rownames(pvals) <- rownames(gene1.15$table)
### Adjust the pval object to fdr for a padj object
p.adj <- apply(pvals, 2, p.adjust, method="fdr")
### Create an object of the logCPM for each gene in each expiriment
logCPM <- cbind(gene1.12$table$logCPM,
gene1.15$table$logCPM,
gene1.9$table$logCPM)
colnames(logCPM) <- colnames(my.contrasts)
rownames(logCPM) <- rownames(gene1.15$table)
##Create an object that I can reload without having to rerun edgeR for further analysis
m.emd41 <- m
pval.emd41 <- pvals
logCPM.emd41 <- logCPM
p.adj.emd41 <- p.adj
DEi.emd41 <- DEi
save(m.emd41, pval.emd41, logCPM.emd41, p.adj.emd41, DEi.emd41, file="~/Desktop/edgeR_emd41.RData")
After running the DGE analysis using edgeR, I wanted to examine the output and explore the data. The first thing I did was make an MA plot. Then, to make sure the RNAi knockdown was quality; I plotted the RPKM of gene1 for all samples.
## Function that takes the objects reshapes the data and plots in ggplot2
MAplots <- function(v) {
exp.df <- as.data.frame(cbind(logCPM[,v], m[,v]))
colnames(exp.df) <- c("logCPM", "log2FC")
exp.df$DE <- "NONE"
exp.df$DE[DEi[[v]][["upfc"]]] <- "UP"
exp.df$DE[DEi[[v]][["dnfc"]]] <- "DOWN"
p <- ggplot(exp.df, aes(x=logCPM, y=log2FC, colour=DE)) +
geom_point(alpha=0.8) +
ylim(-5,5) +
theme_bw() +
theme(legend.position="top") +
scale_color_manual(values=c("red", "black", "green3")) +
labs(colour="Differential Expression") +
ggtitle(paste0(v,"/control MA Plot")) +
geom_hline(yintercept=c(1,-1), linetype="dashed", colour="grey")
print(p)
}
MAplots("gene1.9")
MAplots("gene1.12")
MAplots("gene1.15")
## Convert data to rpkm and plot gene1 rpkm levels
## rerun edgeR norm factors
mc <- countData
mc <- DGEList(mc)
mc <- calcNormFactors(mc)
### Get list of gene lengths for normalization
gene.length <- read.table(file="anno1.txt", sep="\t", header=F)[,c(1,4)]
colnames(gene.length) <- c("id", "length")
### Make rpkm for all genes in each sample matrix
rpkm.all <- rpkm(mc, gene.length=gene.length$length[match(rownames(mc), gene.length$id)])
#### Get an average and sd for the rpkm for each condition
### avg
v <- factor(sub("_[0-9]$", "", colnames(rpkm.all)))
rpkm.avg <- t(apply(rpkm.all, 1, function(x) tapply(as.numeric(x), v, mean)))
## sd
rpkm.sd <- t(apply(rpkm.all, 1, function(x) tapply(x, v, sd)))
colnames(rpkm.sd) <- paste(colnames(rpkm.sd), "sd", sep=".")
write.table(rpkm.all, file="rpkm_data_edgeR.txt", sep="\t", col.names=NA)
write.table(cbind(rpkm.avg, rpkm.sd), file="rpkm_avg_sd.txt", sep="\t", col.names=NA)
###Plot gene1 levels
gene1 <- match("SMED30029784", rownames(pvals))
mymar <- par()$mar
mymar[2] <- 7
mymar[1] <- 4
par(mar = mymar)
barplot(rpkm.all[gene1,],
col=allSample.cols,
xlab="rpkm",
main="gene1",
horiz=TRUE,
las=1)
The last thing I did was adding annotation columns that tell the function, common name and other properties of each gene. Planarian is a non-model organism so we have two special custom annotation data frames that have give information about each gene (why I didn’t use biomart for this). The final table is saved as a .txt file and sent to the researcher.
###################################
#### Annotations ####
###################################
setwd("~/Desktop/RNA.seq.example/")
anno1 <- read.table(file="anno1.txt", sep="\t", header=F)
colnames(anno1) <- c("id","oid","source","length", "u033226","e")
## Change so that non existing values come up as NA
anno1$u033226[anno1$u033226 == "\\N"] <- NA
anno1$e[anno1$e == "\\N"] <- NA
anno2 <- read.delim(file="anno2.txt", sep="\t", header=T)
anno2 <- anno2[,-1*match(c("perc_gc","pfam_ids"), colnames(anno2))]
## Columns from anno2 we want with anno1
foi <- c("num_orfs", "signalp", "num_tmhmms", "sp_id", "sp_evalue", "sp_desc", "nr_id", "nr_evalue", "nr_desc")
## Bomine into one annotation table
iv <- match(rownames(m), anno1$id)
anno <- cbind(anno1[iv,], anno2[iv,foi])
## add an object that will be designated flag, if up-regulated = 1, down = -1 and not DE = 0
flags <- matrix(0, ncol=length(DEi), nrow=nrow(m))
for (i in 1:length(DEi)) {
flags[DEi[[i]]$dn,i] <- -1
flags[DEi[[i]]$up,i] <- 1
}
## Add researcher readable column names so that they can be combined for final output
colnames(flags) <- paste0("f", 1:ncol(flags))
rownames(flags) <- rownames(m)
colnames(pvals) <- paste(colnames(pvals), "p", sep=".")
colnames(p.adj) <- paste(colnames(pvals), "padj", sep=".")
colnames(m) <- paste(colnames(m), "fc", sep=".")
## Create and write final table
final.table <- data.frame(m, A=rowMeans(logCPM), flags, pvals, p.adj, anno)
write.table(final.table, "fit_data.txt", sep="\t", col.names=NA)