Sorghum mRNA-seq Analysis

library(ggplot2)
# load in mrna_raw_data file as a table
setwd("~/hydrogen/sorghum")
raw_data <- read.table("raw_mrna_data", sep = ",", header = FALSE)
names(raw_data) <- c("filepath", "rep", "cell", "pair")

Number of reads in each of the raw data files

# plot bar chart of this 'mrna_raw_read_counts.txt'
library(plyr)
setwd("~/hydrogen/sorghum/mrna")
raw_read_count <- read.table("mrna_raw_read_counts.txt", sep = "\t", header = FALSE)
names(raw_read_count) <- c("count", "cell", "rep", "pair")

raw_read_count_paired <- ddply(raw_read_count, c("cell", "rep"), transform, 
    sum_count = sum(count))
raw_read_count_paired <- unique(raw_read_count_paired[, c(2, 3, 5)])

ggplot(raw_read_count_paired, aes(x = paste(cell, rep), y = sum_count, fill = cell)) + 
    geom_bar(stat = "identity") + ggtitle("Raw read counts") + geom_text(aes(label = sum_count, 
    y = sum_count + 5e+05)) + labs(x = "Cell and rep") + ylim(0, 6e+07)

plot of chunk raw_read_count

Quality scores of raw data file

# plot quality scores
setwd("~/hydrogen/sorghum/mrna")
library(reshape2)
library(plyr)

sum_read_quals <- data.frame()
for (i in 1:nrow(raw_data)) {
    cell = raw_data[i, 3]
    rep = raw_data[i, 2]
    pair = raw_data[i, 4]
    raw_read_qualities <- read.table(paste("raw_read_qualities_", cell, "-", 
        rep, "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
    names(raw_read_qualities) <- c("base", "mean", "lower", "upper", "10%", 
        "90%")
    raw_read_qualities_melt <- melt(raw_read_qualities, id = "base")  # convert to long format
    raw_read_qualities_melt$sample <- paste(cell, "-", rep, "-", pair, sep = "")
    sum_read_quals <- rbind(sum_read_quals, raw_read_qualities_melt)
}

mean_read_quals <- ddply(sum_read_quals, .(base, variable), numcolwise(mean))  # sum columns, grouping by annotation

ggplot(data = mean_read_quals, aes(x = base, y = value, colour = variable)) + 
    geom_line() + ggtitle("Raw Read Qualities ") + ylim(0, 42)

plot of chunk raw_quality_scores

Number of reads in each of the trimmed data files

# plot chart 'mrna_trimmed_read_counts.txt'
library(plyr)
setwd("~/hydrogen/sorghum/mrna")  # mrna_trimmed_read_counts.txt
trimmed_read_count <- read.table("mrna_trimmed_read_counts.txt", sep = "\t", 
    header = FALSE)
names(trimmed_read_count) <- c("count", "cell", "rep", "pair")

trimmed_read_count_paired <- ddply(trimmed_read_count, c("cell", "rep"), transform, 
    sum_count = sum(count))
trimmed_read_count_paired <- unique(trimmed_read_count_paired[, c(2, 3, 5)])

ggplot(trimmed_read_count_paired, aes(x = paste(cell, rep), y = sum_count, fill = cell)) + 
    geom_bar(stat = "identity") + ggtitle("Trimmed read counts") + geom_text(aes(label = sum_count, 
    y = sum_count + 5e+05)) + labs(x = "Cell and rep") + ylim(0, 6e+07)

plot of chunk trimmed_read_count

Lengths of reads after trimming

# load the tables of trimmed read lengths smoosh together and plot as a bar
# chart (possibly with log y scale)

setwd("~/hydrogen/sorghum/mrna")

sum_read_lengths <- data.frame()
for (i in 1:nrow(raw_data)) {
    cell = raw_data[i, 3]
    rep = raw_data[i, 2]
    pair = raw_data[i, 4]
    trimmed_read_length <- read.table(paste("t_read_length_", cell, "-", rep, 
        "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
    names(trimmed_read_length) <- c("length", "count")
    trimmed_read_length$length <- gsub(trimmed_read_length$length, pattern = "([0-9]+)-([0-9]+)", 
        replacement = "\\1")
    sum_read_lengths <- rbind(sum_read_lengths, trimmed_read_length)
}
total_read_lengths <- ddply(sum_read_lengths, .(length), numcolwise(sum))

ggplot(total_read_lengths, aes(x = as.numeric(length), y = count)) + geom_bar(stat = "identity") + 
    ggtitle(paste("Trimmed Read Lengths for ", cell, " Rep:", rep, " Pair:", 
        pair)) + labs(x = "Read Length")

plot of chunk trimmed_read_lengths

Quality scores of trimmed data files

# plot quality scores
setwd("~/hydrogen/sorghum/mrna")
library(reshape2)
library(plyr)


sum_trimmed_read_quals <- data.frame()
for (i in 1:nrow(raw_data)) {
    cell = raw_data[i, 3]
    rep = raw_data[i, 2]
    pair = raw_data[i, 4]
    trimmed_read_qualities <- read.table(paste("t_read_qualities_", cell, "-", 
        rep, "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
    names(trimmed_read_qualities) <- c("base", "mean", "lower", "upper", "10%", 
        "90%")
    trimmed_read_qualities_melt <- melt(trimmed_read_qualities, id = "base")  # convert to long format
    trimmed_read_qualities_melt$sample <- paste(cell, "-", rep, "-", pair, sep = "")
    sum_trimmed_read_quals <- rbind(sum_trimmed_read_quals, trimmed_read_qualities_melt)
}
mean_trimed_read_quals <- ddply(sum_trimmed_read_quals, .(base, variable), numcolwise(mean))
ggplot(data = mean_trimed_read_quals, aes(x = base, y = value, colour = variable)) + 
    geom_line() + ggtitle("Trimmed Read Qualities") + ylim(0, 42)

plot of chunk trimmed_quality_scores

Transcripts per million expression data

# load express outputs

library(reshape2)
library(ggplot2)
setwd("~/hydrogen/sorghum/mrna")
cells = c("BS", "M")
reps = c(1, 2, 3)
data <- data.frame()

for (i in cells) {
    for (j in reps) {
        file <- paste("express_", i, "-", j, "/results.xprs", sep = "")
        print(file)
        tmp <- read.table(file, header = T)
        tmp <- tmp[, c(2, 15)]  # only keep contig name and tpm column
        tmp$sample <- paste(i, "-", j, sep = "")
        data <- rbind(data, tmp)
    }
}

## [1] "express_BS-1/results.xprs"
## [1] "express_BS-2/results.xprs"
## [1] "express_BS-3/results.xprs"
## [1] "express_M-1/results.xprs"
## [1] "express_M-2/results.xprs"
## [1] "express_M-3/results.xprs"

tpm_data <- dcast(data, target_id ~ sample, value.var = "tpm")

write.table(tpm_data, file = "sorghum_mrna.by_transcript.tpm.csv", sep = "\t", 
    row.names = FALSE, quote = F)

row.names(tpm_data) <- tpm_data$target_id
tpm_data <- tpm_data[, -c(1)]

correlation <- cor(tpm_data)
melted <- melt(correlation)
ggplot(data = melted, aes_string(x = names(melted)[1], y = names(melted)[2], 
    fill = "value")) + geom_tile() + ggtitle("TPM Correlation Matrix") + geom_text(aes(label = round(value, 
    2))) + xlab("") + ylab("")

plot of chunk tpm_corr

Effective Count expression data

# load express outputs

library(reshape2)
library(ggplot2)
setwd("~/hydrogen/sorghum/mrna")
cells = c("BS", "M")
reps = c(1, 2, 3)
data <- data.frame()

for (i in cells) {
    for (j in reps) {
        file <- paste("express_", i, "-", j, "/results.xprs", sep = "")
        print(file)
        tmp <- read.table(file, header = T)
        tmp <- tmp[, c(2, 8)]
        tmp$sample <- paste(i, "-", j, sep = "")
        data <- rbind(data, tmp)
    }
}

## [1] "express_BS-1/results.xprs"
## [1] "express_BS-2/results.xprs"
## [1] "express_BS-3/results.xprs"
## [1] "express_M-1/results.xprs"
## [1] "express_M-2/results.xprs"
## [1] "express_M-3/results.xprs"

count_data <- dcast(data, target_id ~ sample, value.var = "eff_counts")

write.table(count_data, file = "sorghum_mrna.by_transcript.eff_counts.csv", 
    sep = "\t", row.names = FALSE, quote = F)

row.names(count_data) <- count_data$target_id
counts <- count_data[, -c(1)]
correlation <- cor(counts)
melted <- melt(correlation)
ggplot(data = melted, aes_string(x = names(melted)[1], y = names(melted)[2], 
    fill = "value")) + geom_tile() + ggtitle("Effective Count Correlation Matrix") + 
    geom_text(aes(label = round(value, 2))) + xlab("") + ylab("")

plot of chunk eff_count_corr

Run EBSeq for differential expression analysis on effective count data

setwd("~/hydrogen/sorghum/mrna")
library(EBSeq)

## Loading required package: blockmodeling
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess

# express outputs stored in 'counts' matrix

counts <- counts[-c(rowSums(counts) == 0), ]  # remove rows that are all zeros

mat <- as.matrix(counts)

sizes <- MedianNorm(counts)
# sizes=QuantileNorm(counts, 0.75) # upper-quantile normalisation

conditions <- as.factor(rep(c("BS", "M"), each = 3))

ebOut <- EBTest(Data = mat, Conditions = conditions, sizeFactors = sizes, maxround = 6)

## Remove transcripts with all zero 
## 
## iteration 1 done 
## 
## time 13.69 
## 
## iteration 2 done 
## 
## time 9.6 
## 
## iteration 3 done 
## 
## time 9.84 
## 
## iteration 4 done 
## 
## time 10.16 
## 
## iteration 5 done 
## 
## time 8.19 
## 
## iteration 6 done 
## 
## time 8.58


posteriorProbs <- GetPPMat(ebOut)

probs <- data.frame(posteriorProbs)
names(probs) <- c("PPEE", "PPDE")

write.table(probs, "ppde_sorghum_mrna.csv", sep = "\t", quote = F, row.names = T)

# checking out a list of key sorghum genes plot bar charts of tpm with
# standard error bars

library(plyr)
stderr <- function(vector) {
    sd(vector)/sqrt(length(vector))
}
setwd("~/hydrogen/sorghum/mrna")
# load key
key <- read.table("key_sorghum.txt", sep = "\t", header = FALSE)
names(key) <- c("gene", "desc")

for (i in 1:nrow(key)) {
    gene <- tpm_data[which(row.names(tpm_data) == key[i, 1]), ]
    gene$name <- row.names(gene)
    gene_melt <- melt(gene, id = "name")
    gene_melt$cell <- gsub(gene_melt$variable, pattern = "-[0-9]", replacement = "")

    tpm <- ddply(gene_melt, .(cell), numcolwise(mean))  # sum columns, grouping by annotation
    tpm[which(tpm$cell == "BS"), "stderr"] <- stderr(gene_melt[which(gene_melt$cell == 
        "BS"), ]$value)  # set cell [bs,stderr]
    tpm[which(tpm$cell == "M"), "stderr"] <- stderr(gene_melt[which(gene_melt$cell == 
        "M"), ]$value)

    g1 <- ggplot(data = tpm, aes(x = cell, y = value, fill = cell)) + geom_bar(stat = "identity") + 
        ggtitle(paste("Expression of ", key[i, 2], " (", key[i, 1], ")", sep = "")) + 
        labs(x = "Cell") + geom_errorbar(aes(ymax = value + stderr, ymin = value - 
        stderr))
    print(g1)
}

plot of chunk mdh

Log Fold Changes

tpm_data$logfc = (tpm_data[, "M-1"] + tpm_data[, "M-2"] + tpm_data[, "M-3"])/(tpm_data[, 
    "BS-1"] + tpm_data[, "BS-2"] + tpm_data[, "BS-3"])
tpm_data$logfc = log(tpm_data$logfc)

tpm_data_PPDE <- merge(x = tpm_data, y = probs, by = "row.names")

DE_tpm_data <- tpm_data_PPDE[which(tpm_data_PPDE$PPDE > 0.95), ]
DE_tpm_data <- DE_tpm_data[-which(is.infinite(DE_tpm_data$logfc)), ]

DE_tpm_data <- DE_tpm_data[order(-DE_tpm_data$logfc), ]

M_tpm_data <- DE_tpm_data[which(DE_tpm_data$logfc >= 2), ]  # M

DE_tpm_data <- DE_tpm_data[order(DE_tpm_data$logfc), ]  # resort
BS_tpm_data <- DE_tpm_data[which(DE_tpm_data$logfc <= -2), ]  # BS

# head(M_tpm_data) head(BS_tpm_data)

write.table(M_tpm_data, "tpm_data.m.csv", sep = "\t", quote = F, row.names = F)
write.table(BS_tpm_data, "tpm_data.bs.csv", sep = "\t", quote = F, row.names = F)