RNA-seq Analysis Example

Overview

This is a script that will do differential gene expression (DGE) analysis for RNA-seq experiments using the bioconductor package edgeR.

Project Details

This project is an RNA-seq experiment being done to look at the differential gene expression levels of an RNAi knockdown of the histone modifier ‘gene1’, compared to an RNAi control, ‘control’. This was done at three time points, day 9, 12 and 15 after RNAi exposure. As stated above, DGE analysis was done using the bioconductor package edgeR.

Loading In Packages

The first step is to load in the necessary packages that are going to be used in this experiment.

library(RColorBrewer)
library(genefilter)
library(edgeR)
library(ggplot2)
library(knitr)

Load in Files And Combine Samples Used on Multiple Lanes

Next, I load in the counts table and combine the samples that are run on multiple lanes. This is done so that a sample has one column of counts for all genes.

setwd("~/Desktop/RNA.seq.example/")
### Read in the counts table
counts <- read.table("sam_counts.txt", sep="\t", header=T, as.is=T)

### Remove the lane from the column name so it just shows each sample name (gene1_12_1_6 -> gene1_12_1)
colnames(counts) <- sub("_[0-9]$","",colnames(counts))

### Sum the counts from the aliquots (all lanes combined into one row)
scounts <- t(apply(counts, 1, function(x) {
  tapply(x, factor(colnames(counts)), sum)
  }))

### Test a couple of columns to check and make sure our counts table looks good

	gene1_12_1	gene1_12_1.1	gene1_12_1.2	gene1_12_2	gene1_12_2.1
SMED30030523	19	21	20	11	19
SMED30018010	4	3	2	2	1
SMED30032462	2	4	0	1	0
SMED30024982	0	0	1	0	0
SMED30017564	92	116	102	132	142
SMED30034967	932	934	976	1019	983

Project Summary Table

Following, I create a summary table that I can send back to my researcher that outlines what each sample is and the experimental factors involved.

### Creating a Samples Data frame
### Sample Column
Sample <- colnames(scounts)

### RNAi column
RNAi <- c(rep("control", 12), rep("gene1", 12))

### Replicate Column
Replicate <- rep(1:4,6)

### Day Column
Day <- c(rep(12,4), rep(15, 4), rep(9, 4), rep(12, 4), rep(15, 4), rep(9, 4))

### Create a name that is more structured
Alias <- paste(RNAi, Day, Replicate, sep="_")

### Creat Summary Table
targets <- data.frame(Sample, RNAi, Day, Replicate, Alias)

### Check to make sure it works
kable(head(targets), align = 'c')

Sample	RNAi	Day	Replicate	Alias
control_12_1	control	12	1	control_12_1
control_12_2	control	12	2	control_12_2
control_12_3	control	12	3	control_12_3
control_12_4	control	12	4	control_12_4
control_15_1	control	15	1	control_15_1
control_15_2	control	15	2	control_15_2

### Write this file out for researcher
write.table(targets, "Samples.txt", quote=FALSE, row.names=FALSE, col.names=T, sep="\t")

### Change the sample names in our counts table to our more structured name (RNAi_Tisssue_Day_Replicate)
colnames(scounts) <- Alias

Filtering Steps

The next step is to filter out unmapped and ribosomal reads, as well as genes with low read counts.

### Filter out unmapped reads designated "*"
mcounts <- scounts[-match("*", rownames(scounts)),]

### Get rid of main ribo transcript
mcounts <- mcounts[-match("SMED30032887", rownames(mcounts)),]

### Get rid of other ribo28s and 16s transcripts
mcounts <- mcounts[-which(rownames(mcounts) == "SMED30032663"),]
mcounts <- mcounts[-which(rownames(mcounts) == "SMED30027845"),]

### Filter out genes with low read counts
countData <- mcounts[rowSums(cpm(mcounts)) > 4,]

Plot Stats About RNA-seq Library

Next, to ensure quality it is important to plot some statistics about the library. Specifically, I plotted the correlation between samples and a bar plot of the amount of mapped and total reads.

### Preload a predifined correlation plot function
source("~/Desktop/RNA.seq.example/myImagePlot.R")
myImagePlot(cor(cpm(countData)), 
            title= "Mapped Counts All Samples")

### Create a vector of total counts in millions for each sample to be plotted
totalCounts <- colSums(scounts[-match(c("SMED30032887",
                                        "SMED30032663",
                                        "SMED30027845"),
                                      rownames(scounts)),])/10^6

###Create a vector of mapped counts to be plotted in Millions of Reads
mappedCounts <- colSums(mcounts)/10^6

### Create a vector of colors for each sample type using RColorBrewer
allSample.cols <- rep(brewer.pal(6, "Set1"), each=4)

### Change the margins so the sample names will fit
mymar <- par()$mar
mymar[2] <- 7
mymar[1] <- 4
par(mar = mymar)

###Use the barplot function
barplot(totalCounts, 
        col = allSample.cols, 
        xlab="Million Reads", 
        main="Total Counts", 
        horiz=TRUE, 
        las=1)

barplot(mappedCounts, 
        col=allSample.cols, 
        xlab="Million Reads", 
        main="Mapped Counts", 
        horiz=TRUE, 
        las=1)

edgeR Analysis

Next, I run the edgeR pipeline for DGE following the glm model.

###EdgeR
y <- DGEList(countData)
### plot an MDS plot to see how all the samples scale in multidemensional space
plotMDS(y, col=allSample.cols, main="MDS Plot")

###Create Groups represententing individual sample properties so they can be contrasted
Group <- factor(paste(targets$RNAi, targets$Day, sep="."))

###Calculate Dispersions
y <- calcNormFactors(y)
design <- model.matrix(~0+Group)
colnames(design) <- levels(Group)
y <- estimateGLMCommonDisp(y, design)
y <- estimateGLMTrendedDisp(y, design)
y <- estimateGLMTagwiseDisp(y, design)
fit <- glmFit(y, design)

### Create contrasts
my.contrasts <- makeContrasts(gene1.12=gene1.12-control.12,
                              gene1.15=gene1.15-control.15,
                              gene1.9=gene1.9-control.9,
                              levels=design)

# Run the likelihood ratio test on all contrasts
gene1.12 <- glmLRT(fit, contrast=my.contrasts[,"gene1.12"])
gene1.15 <- glmLRT(fit, contrast=my.contrasts[,"gene1.15"])
gene1.9  <- glmLRT(fit, contrast=my.contrasts[,"gene1.9"])

Function that will pull out DE genes

This is a custom function that pulls out the indexes of DE genes, as well as prints out some stats about how many genes were found.

sumStats <- function(x,fc=1,pc=0.05){
  up <- x$table$logFC > 0
  dn <- x$table$logFC < 0
  
  # indexes of genes with padj < 0.05
  padj <- p.adjust(x$table$PValue, method="BH") < pc
  # indexes of genes with pvals < 0.05
  pval <- x$table$PValue < pc
  # indexes of genes with pval of 0.05 and FC of greator than 1 or -1
  ng.up <- x$table$logFC > fc & pval
  ng.dn <- x$table$logFC < -1*fc & pval
  
  # print out a summary table   
  cat(c("padj", "pval","up","dn"),"\n")
  cat(c(sum(padj), 
        sum(pval), 
        sum(ng.up), 
        sum(ng.dn)), 
        "\n")
  
  # return a list of indexes for the genes that are DE by pval and padj
  return(list(up=which(pval & up), dn=which(pval & dn), upfc=which(ng.up), dnfc=which(ng.dn), padj=which(padj)))
}

Parsing the edgeR output

Next, I parse the edgeR output to make objects of stats about each gene.

### Create an object with the indexes of genes that are DE for each experiment
DEi <- list(gene1.12=sumStats(gene1.12, pc=0.01),
             gene1.15=sumStats(gene1.15, pc=0.01),             
             gene1.9=sumStats(gene1.9, pc=0.01))

## padj pval up dn 
## 1006 2832 122 367 
## padj pval up dn 
## 1055 2648 169 376 
## padj pval up dn 
## 1308 3079 75 491

### Create an object of the logFC for each expiriment
m <- cbind(gene1.12$table$logFC,
           gene1.15$table$logFC,
           gene1.9$table$logFC)
colnames(m) <- colnames(my.contrasts)
rownames(m) <- rownames(gene1.15$table)

### Create an object of the pvals for each gene in each expiriment
pvals <- cbind(gene1.12$table$PValue,
               gene1.15$table$PValue,
               gene1.9$table$PValue)
colnames(pvals) <- colnames(my.contrasts)
rownames(pvals) <- rownames(gene1.15$table)

### Adjust the pval object to fdr for a padj object
p.adj <- apply(pvals, 2, p.adjust, method="fdr")

### Create an object of the logCPM for each gene in each expiriment
logCPM <- cbind(gene1.12$table$logCPM,
                gene1.15$table$logCPM,
                gene1.9$table$logCPM)
colnames(logCPM) <- colnames(my.contrasts)
rownames(logCPM) <- rownames(gene1.15$table)

##Create an object that I can reload without having to rerun edgeR for further analysis
m.emd41 <- m
pval.emd41 <- pvals
logCPM.emd41 <- logCPM
p.adj.emd41 <- p.adj
DEi.emd41 <- DEi
save(m.emd41, pval.emd41, logCPM.emd41, p.adj.emd41, DEi.emd41, file="~/Desktop/edgeR_emd41.RData")

Examine Output of edgeR

After running the DGE analysis using edgeR, I wanted to examine the output and explore the data. The first thing I did was make an MA plot. Then, to make sure the RNAi knockdown was quality; I plotted the RPKM of gene1 for all samples.

## Function that takes the objects reshapes the data and plots in ggplot2
MAplots <- function(v) {
    exp.df <- as.data.frame(cbind(logCPM[,v], m[,v]))
    colnames(exp.df) <- c("logCPM", "log2FC")
    exp.df$DE <- "NONE"
    exp.df$DE[DEi[[v]][["upfc"]]] <- "UP"
    exp.df$DE[DEi[[v]][["dnfc"]]] <- "DOWN"
    p <- ggplot(exp.df, aes(x=logCPM, y=log2FC, colour=DE)) + 
         geom_point(alpha=0.8) + 
         ylim(-5,5) +
         theme_bw() + 
         theme(legend.position="top") +
         scale_color_manual(values=c("red", "black", "green3")) +
         labs(colour="Differential Expression") + 
         ggtitle(paste0(v,"/control MA Plot")) +
         geom_hline(yintercept=c(1,-1), linetype="dashed", colour="grey")      
    print(p)
}


MAplots("gene1.9")

MAplots("gene1.12")

MAplots("gene1.15")

## Convert data to rpkm and plot gene1 rpkm levels
## rerun edgeR norm factors
mc <- countData
mc <- DGEList(mc)
mc <- calcNormFactors(mc)

### Get list of gene lengths for normalization
gene.length <- read.table(file="anno1.txt", sep="\t", header=F)[,c(1,4)]
colnames(gene.length) <- c("id", "length")

### Make rpkm for all genes in each sample matrix
rpkm.all <- rpkm(mc, gene.length=gene.length$length[match(rownames(mc), gene.length$id)])

#### Get an average and sd for the rpkm for each condition
### avg
v <- factor(sub("_[0-9]$", "", colnames(rpkm.all)))
rpkm.avg <- t(apply(rpkm.all, 1, function(x) tapply(as.numeric(x), v, mean)))

## sd
rpkm.sd <- t(apply(rpkm.all, 1, function(x) tapply(x, v, sd)))
colnames(rpkm.sd) <- paste(colnames(rpkm.sd), "sd", sep=".")

write.table(rpkm.all, file="rpkm_data_edgeR.txt", sep="\t", col.names=NA)
write.table(cbind(rpkm.avg, rpkm.sd), file="rpkm_avg_sd.txt", sep="\t", col.names=NA)

###Plot gene1 levels
gene1 <- match("SMED30029784", rownames(pvals))
mymar <- par()$mar
mymar[2] <- 7
mymar[1] <- 4
par(mar = mymar)
barplot(rpkm.all[gene1,],
        col=allSample.cols,
        xlab="rpkm",
        main="gene1",
        horiz=TRUE,
        las=1)

Add Annotation Columns And Make Final Output

The last thing I did was adding annotation columns that tell the function, common name and other properties of each gene. Planarian is a non-model organism so we have two special custom annotation data frames that have give information about each gene (why I didn’t use biomart for this). The final table is saved as a .txt file and sent to the researcher.

###################################
####         Annotations       ####
###################################

setwd("~/Desktop/RNA.seq.example/")
anno1 <- read.table(file="anno1.txt", sep="\t", header=F)
colnames(anno1) <- c("id","oid","source","length", "u033226","e")

## Change so that non existing values come up as NA
anno1$u033226[anno1$u033226 == "\\N"] <- NA
anno1$e[anno1$e == "\\N"] <- NA

anno2 <- read.delim(file="anno2.txt", sep="\t", header=T)
anno2 <- anno2[,-1*match(c("perc_gc","pfam_ids"), colnames(anno2))]

## Columns from anno2 we want with anno1
foi <- c("num_orfs", "signalp", "num_tmhmms", "sp_id", "sp_evalue", "sp_desc", "nr_id", "nr_evalue", "nr_desc")

## Bomine into one annotation table
iv <- match(rownames(m), anno1$id)
anno <- cbind(anno1[iv,], anno2[iv,foi])

## add an object that will be designated flag, if up-regulated = 1, down = -1 and not DE = 0
flags <- matrix(0, ncol=length(DEi), nrow=nrow(m))
for (i in 1:length(DEi)) {
    flags[DEi[[i]]$dn,i] <- -1
    flags[DEi[[i]]$up,i] <- 1
}

## Add researcher readable column names so that they can be combined for final output
colnames(flags) <- paste0("f", 1:ncol(flags))
rownames(flags) <- rownames(m)
colnames(pvals)  <- paste(colnames(pvals), "p", sep=".")
colnames(p.adj) <- paste(colnames(pvals), "padj", sep=".")
colnames(m) <- paste(colnames(m), "fc", sep=".")

## Create and write final table
final.table <- data.frame(m, A=rowMeans(logCPM), flags, pvals, p.adj, anno)
write.table(final.table, "fit_data.txt", sep="\t", col.names=NA)