Sorghum mRNA-seq Analysis

library(ggplot2)
# load in mrna_raw_data file as a table
setwd("~/hydrogen/sorghum")
raw_data <- read.table("raw_mrna_data", sep = ",", header = FALSE)
names(raw_data) <- c("filepath", "rep", "cell", "pair")

Number of reads in each of the raw data files

# plot bar chart of this 'mrna_raw_read_counts.txt'
library(plyr)
setwd("~/hydrogen/sorghum/mrna")
raw_read_count <- read.table("mrna_raw_read_counts.txt", sep = "\t", header = FALSE)
names(raw_read_count) <- c("count", "cell", "rep", "pair")

raw_read_count_paired <- ddply(raw_read_count, c("cell", "rep"), transform, 
    sum_count = sum(count))
raw_read_count_paired <- unique(raw_read_count_paired[, c(2, 3, 5)])

ggplot(raw_read_count_paired, aes(x = paste(cell, rep), y = sum_count, fill = cell)) + 
    geom_bar(stat = "identity") + ggtitle("Raw read counts") + geom_text(aes(label = sum_count, 
    y = sum_count + 5e+05)) + labs(x = "Cell and rep") + ylim(0, 6e+07)

plot of chunk raw_read_count

Quality scores of raw data file

# plot quality scores
setwd("~/hydrogen/sorghum/mrna")
library(reshape2)
library(plyr)

sum_read_quals <- data.frame()
for (i in 1:nrow(raw_data)) {
    cell = raw_data[i, 3]
    rep = raw_data[i, 2]
    pair = raw_data[i, 4]
    raw_read_qualities <- read.table(paste("raw_read_qualities_", cell, "-", 
        rep, "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
    names(raw_read_qualities) <- c("base", "mean", "lower", "upper", "10%", 
        "90%")
    raw_read_qualities_melt <- melt(raw_read_qualities, id = "base")  # convert to long format
    raw_read_qualities_melt$sample <- paste(cell, "-", rep, "-", pair, sep = "")
    sum_read_quals <- rbind(sum_read_quals, raw_read_qualities_melt)
}

mean_read_quals <- ddply(sum_read_quals, .(base, variable), numcolwise(mean))  # sum columns, grouping by annotation

ggplot(data = mean_read_quals, aes(x = base, y = value, colour = variable)) + 
    geom_line() + ggtitle("Raw Read Qualities ") + ylim(0, 42)

plot of chunk raw_quality_scores

Number of reads in each of the trimmed data files

# plot chart 'mrna_trimmed_read_counts.txt'
library(plyr)
setwd("~/hydrogen/sorghum/mrna")  # mrna_trimmed_read_counts.txt
trimmed_read_count <- read.table("mrna_trimmed_read_counts.txt", sep = "\t", 
    header = FALSE)
names(trimmed_read_count) <- c("count", "cell", "rep", "pair")

trimmed_read_count_paired <- ddply(trimmed_read_count, c("cell", "rep"), transform, 
    sum_count = sum(count))
trimmed_read_count_paired <- unique(trimmed_read_count_paired[, c(2, 3, 5)])

ggplot(trimmed_read_count_paired, aes(x = paste(cell, rep), y = sum_count, fill = cell)) + 
    geom_bar(stat = "identity") + ggtitle("Trimmed read counts") + geom_text(aes(label = sum_count, 
    y = sum_count + 5e+05)) + labs(x = "Cell and rep") + ylim(0, 6e+07)

plot of chunk trimmed_read_count

Lengths of reads after trimming

# load the tables of trimmed read lengths smoosh together and plot as a bar
# chart (possibly with log y scale)

setwd("~/hydrogen/sorghum/mrna")

sum_read_lengths <- data.frame()
for (i in 1:nrow(raw_data)) {
    cell = raw_data[i, 3]
    rep = raw_data[i, 2]
    pair = raw_data[i, 4]
    trimmed_read_length <- read.table(paste("t_read_length_", cell, "-", rep, 
        "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
    names(trimmed_read_length) <- c("length", "count")
    trimmed_read_length$length <- gsub(trimmed_read_length$length, pattern = "([0-9]+)-([0-9]+)", 
        replacement = "\\1")
    sum_read_lengths <- rbind(sum_read_lengths, trimmed_read_length)
}
total_read_lengths <- ddply(sum_read_lengths, .(length), numcolwise(sum))

ggplot(total_read_lengths, aes(x = as.numeric(length), y = count)) + geom_bar(stat = "identity") + 
    ggtitle(paste("Trimmed Read Lengths for ", cell, " Rep:", rep, " Pair:", 
        pair)) + labs(x = "Read Length")

plot of chunk trimmed_read_lengths

Quality scores of trimmed data files

# plot quality scores
setwd("~/hydrogen/sorghum/mrna")
library(reshape2)
library(plyr)


sum_trimmed_read_quals <- data.frame()
for (i in 1:nrow(raw_data)) {
    cell = raw_data[i, 3]
    rep = raw_data[i, 2]
    pair = raw_data[i, 4]
    trimmed_read_qualities <- read.table(paste("t_read_qualities_", cell, "-", 
        rep, "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
    names(trimmed_read_qualities) <- c("base", "mean", "lower", "upper", "10%", 
        "90%")
    trimmed_read_qualities_melt <- melt(trimmed_read_qualities, id = "base")  # convert to long format
    trimmed_read_qualities_melt$sample <- paste(cell, "-", rep, "-", pair, sep = "")
    sum_trimmed_read_quals <- rbind(sum_trimmed_read_quals, trimmed_read_qualities_melt)
}
mean_trimed_read_quals <- ddply(sum_trimmed_read_quals, .(base, variable), numcolwise(mean))
ggplot(data = mean_trimed_read_quals, aes(x = base, y = value, colour = variable)) + 
    geom_line() + ggtitle("Trimmed Read Qualities") + ylim(0, 42)

plot of chunk trimmed_quality_scores

Transcripts per million expression data

# load express outputs

library(reshape2)
library(ggplot2)
setwd("~/hydrogen/sorghum/mrna")
cells = c("BS", "M")
reps = c(1, 2, 3)
data <- data.frame()

for (i in cells) {
    for (j in reps) {
        file <- paste("express_", i, "-", j, "/results.xprs", sep = "")
        print(file)
        tmp <- read.table(file, header = T)
        tmp <- tmp[, c(2, 15)]  # only keep contig name and tpm column
        tmp$sample <- paste(i, "-", j, sep = "")
        data <- rbind(data, tmp)
    }
}
## [1] "express_BS-1/results.xprs"
## [1] "express_BS-2/results.xprs"
## [1] "express_BS-3/results.xprs"
## [1] "express_M-1/results.xprs"
## [1] "express_M-2/results.xprs"
## [1] "express_M-3/results.xprs"
tpm_data <- dcast(data, target_id ~ sample, value.var = "tpm")

write.table(tpm_data, file = "sorghum_mrna.by_transcript.tpm.csv", sep = "\t", 
    row.names = FALSE, quote = F)

row.names(tpm_data) <- tpm_data$target_id
tpm_data <- tpm_data[, -c(1)]

correlation <- cor(tpm_data)
melted <- melt(correlation)
ggplot(data = melted, aes_string(x = names(melted)[1], y = names(melted)[2], 
    fill = "value")) + geom_tile() + ggtitle("TPM Correlation Matrix") + geom_text(aes(label = round(value, 
    2))) + xlab("") + ylab("")

plot of chunk tpm_corr

Effective Count expression data

# load express outputs

library(reshape2)
library(ggplot2)
setwd("~/hydrogen/sorghum/mrna")
cells = c("BS", "M")
reps = c(1, 2, 3)
data <- data.frame()

for (i in cells) {
    for (j in reps) {
        file <- paste("express_", i, "-", j, "/results.xprs", sep = "")
        print(file)
        tmp <- read.table(file, header = T)
        tmp <- tmp[, c(2, 8)]
        tmp$sample <- paste(i, "-", j, sep = "")
        data <- rbind(data, tmp)
    }
}
## [1] "express_BS-1/results.xprs"
## [1] "express_BS-2/results.xprs"
## [1] "express_BS-3/results.xprs"
## [1] "express_M-1/results.xprs"
## [1] "express_M-2/results.xprs"
## [1] "express_M-3/results.xprs"
count_data <- dcast(data, target_id ~ sample, value.var = "eff_counts")

write.table(count_data, file = "sorghum_mrna.by_transcript.eff_counts.csv", 
    sep = "\t", row.names = FALSE, quote = F)

row.names(count_data) <- count_data$target_id
counts <- count_data[, -c(1)]
correlation <- cor(counts)
melted <- melt(correlation)
ggplot(data = melted, aes_string(x = names(melted)[1], y = names(melted)[2], 
    fill = "value")) + geom_tile() + ggtitle("Effective Count Correlation Matrix") + 
    geom_text(aes(label = round(value, 2))) + xlab("") + ylab("")

plot of chunk eff_count_corr

Run EBSeq for differential expression analysis on effective count data

setwd("~/hydrogen/sorghum/mrna")
library(EBSeq)
## Loading required package: blockmodeling
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# express outputs stored in 'counts' matrix

counts <- counts[-c(rowSums(counts) == 0), ]  # remove rows that are all zeros

mat <- as.matrix(counts)

sizes <- MedianNorm(counts)
# sizes=QuantileNorm(counts, 0.75) # upper-quantile normalisation

conditions <- as.factor(rep(c("BS", "M"), each = 3))

ebOut <- EBTest(Data = mat, Conditions = conditions, sizeFactors = sizes, maxround = 6)
## Remove transcripts with all zero 
## 
## iteration 1 done 
## 
## time 13.69 
## 
## iteration 2 done 
## 
## time 9.6 
## 
## iteration 3 done 
## 
## time 9.84 
## 
## iteration 4 done 
## 
## time 10.16 
## 
## iteration 5 done 
## 
## time 8.19 
## 
## iteration 6 done 
## 
## time 8.58

posteriorProbs <- GetPPMat(ebOut)

probs <- data.frame(posteriorProbs)
names(probs) <- c("PPEE", "PPDE")

write.table(probs, "ppde_sorghum_mrna.csv", sep = "\t", quote = F, row.names = T)
# checking out a list of key sorghum genes plot bar charts of tpm with
# standard error bars

library(plyr)
stderr <- function(vector) {
    sd(vector)/sqrt(length(vector))
}
setwd("~/hydrogen/sorghum/mrna")
# load key
key <- read.table("key_sorghum.txt", sep = "\t", header = FALSE)
names(key) <- c("gene", "desc")

for (i in 1:nrow(key)) {
    gene <- tpm_data[which(row.names(tpm_data) == key[i, 1]), ]
    gene$name <- row.names(gene)
    gene_melt <- melt(gene, id = "name")
    gene_melt$cell <- gsub(gene_melt$variable, pattern = "-[0-9]", replacement = "")

    tpm <- ddply(gene_melt, .(cell), numcolwise(mean))  # sum columns, grouping by annotation
    tpm[which(tpm$cell == "BS"), "stderr"] <- stderr(gene_melt[which(gene_melt$cell == 
        "BS"), ]$value)  # set cell [bs,stderr]
    tpm[which(tpm$cell == "M"), "stderr"] <- stderr(gene_melt[which(gene_melt$cell == 
        "M"), ]$value)

    g1 <- ggplot(data = tpm, aes(x = cell, y = value, fill = cell)) + geom_bar(stat = "identity") + 
        ggtitle(paste("Expression of ", key[i, 2], " (", key[i, 1], ")", sep = "")) + 
        labs(x = "Cell") + geom_errorbar(aes(ymax = value + stderr, ymin = value - 
        stderr))
    print(g1)
}

plot of chunk mdh plot of chunk mdh plot of chunk mdh plot of chunk mdh plot of chunk mdh plot of chunk mdh plot of chunk mdh plot of chunk mdh

Log Fold Changes

tpm_data$logfc = (tpm_data[, "M-1"] + tpm_data[, "M-2"] + tpm_data[, "M-3"])/(tpm_data[, 
    "BS-1"] + tpm_data[, "BS-2"] + tpm_data[, "BS-3"])
tpm_data$logfc = log(tpm_data$logfc)

tpm_data_PPDE <- merge(x = tpm_data, y = probs, by = "row.names")

DE_tpm_data <- tpm_data_PPDE[which(tpm_data_PPDE$PPDE > 0.95), ]
DE_tpm_data <- DE_tpm_data[-which(is.infinite(DE_tpm_data$logfc)), ]

DE_tpm_data <- DE_tpm_data[order(-DE_tpm_data$logfc), ]

M_tpm_data <- DE_tpm_data[which(DE_tpm_data$logfc >= 2), ]  # M

DE_tpm_data <- DE_tpm_data[order(DE_tpm_data$logfc), ]  # resort
BS_tpm_data <- DE_tpm_data[which(DE_tpm_data$logfc <= -2), ]  # BS

# head(M_tpm_data) head(BS_tpm_data)

write.table(M_tpm_data, "tpm_data.m.csv", sep = "\t", quote = F, row.names = F)
write.table(BS_tpm_data, "tpm_data.bs.csv", sep = "\t", quote = F, row.names = F)