library(ggplot2)
# load in mrna_raw_data file as a table
setwd("~/hydrogen/sorghum")
raw_data <- read.table("raw_mrna_data", sep = ",", header = FALSE)
names(raw_data) <- c("filepath", "rep", "cell", "pair")
Number of reads in each of the raw data files
# plot bar chart of this 'mrna_raw_read_counts.txt'
library(plyr)
setwd("~/hydrogen/sorghum/mrna")
raw_read_count <- read.table("mrna_raw_read_counts.txt", sep = "\t", header = FALSE)
names(raw_read_count) <- c("count", "cell", "rep", "pair")
raw_read_count_paired <- ddply(raw_read_count, c("cell", "rep"), transform,
sum_count = sum(count))
raw_read_count_paired <- unique(raw_read_count_paired[, c(2, 3, 5)])
ggplot(raw_read_count_paired, aes(x = paste(cell, rep), y = sum_count, fill = cell)) +
geom_bar(stat = "identity") + ggtitle("Raw read counts") + geom_text(aes(label = sum_count,
y = sum_count + 5e+05)) + labs(x = "Cell and rep") + ylim(0, 6e+07)
Quality scores of raw data file
# plot quality scores
setwd("~/hydrogen/sorghum/mrna")
library(reshape2)
library(plyr)
sum_read_quals <- data.frame()
for (i in 1:nrow(raw_data)) {
cell = raw_data[i, 3]
rep = raw_data[i, 2]
pair = raw_data[i, 4]
raw_read_qualities <- read.table(paste("raw_read_qualities_", cell, "-",
rep, "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
names(raw_read_qualities) <- c("base", "mean", "lower", "upper", "10%",
"90%")
raw_read_qualities_melt <- melt(raw_read_qualities, id = "base") # convert to long format
raw_read_qualities_melt$sample <- paste(cell, "-", rep, "-", pair, sep = "")
sum_read_quals <- rbind(sum_read_quals, raw_read_qualities_melt)
}
mean_read_quals <- ddply(sum_read_quals, .(base, variable), numcolwise(mean)) # sum columns, grouping by annotation
ggplot(data = mean_read_quals, aes(x = base, y = value, colour = variable)) +
geom_line() + ggtitle("Raw Read Qualities ") + ylim(0, 42)
Number of reads in each of the trimmed data files
# plot chart 'mrna_trimmed_read_counts.txt'
library(plyr)
setwd("~/hydrogen/sorghum/mrna") # mrna_trimmed_read_counts.txt
trimmed_read_count <- read.table("mrna_trimmed_read_counts.txt", sep = "\t",
header = FALSE)
names(trimmed_read_count) <- c("count", "cell", "rep", "pair")
trimmed_read_count_paired <- ddply(trimmed_read_count, c("cell", "rep"), transform,
sum_count = sum(count))
trimmed_read_count_paired <- unique(trimmed_read_count_paired[, c(2, 3, 5)])
ggplot(trimmed_read_count_paired, aes(x = paste(cell, rep), y = sum_count, fill = cell)) +
geom_bar(stat = "identity") + ggtitle("Trimmed read counts") + geom_text(aes(label = sum_count,
y = sum_count + 5e+05)) + labs(x = "Cell and rep") + ylim(0, 6e+07)
Lengths of reads after trimming
# load the tables of trimmed read lengths smoosh together and plot as a bar
# chart (possibly with log y scale)
setwd("~/hydrogen/sorghum/mrna")
sum_read_lengths <- data.frame()
for (i in 1:nrow(raw_data)) {
cell = raw_data[i, 3]
rep = raw_data[i, 2]
pair = raw_data[i, 4]
trimmed_read_length <- read.table(paste("t_read_length_", cell, "-", rep,
"-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
names(trimmed_read_length) <- c("length", "count")
trimmed_read_length$length <- gsub(trimmed_read_length$length, pattern = "([0-9]+)-([0-9]+)",
replacement = "\\1")
sum_read_lengths <- rbind(sum_read_lengths, trimmed_read_length)
}
total_read_lengths <- ddply(sum_read_lengths, .(length), numcolwise(sum))
ggplot(total_read_lengths, aes(x = as.numeric(length), y = count)) + geom_bar(stat = "identity") +
ggtitle(paste("Trimmed Read Lengths for ", cell, " Rep:", rep, " Pair:",
pair)) + labs(x = "Read Length")
Quality scores of trimmed data files
# plot quality scores
setwd("~/hydrogen/sorghum/mrna")
library(reshape2)
library(plyr)
sum_trimmed_read_quals <- data.frame()
for (i in 1:nrow(raw_data)) {
cell = raw_data[i, 3]
rep = raw_data[i, 2]
pair = raw_data[i, 4]
trimmed_read_qualities <- read.table(paste("t_read_qualities_", cell, "-",
rep, "-", pair, ".txt", sep = ""), sep = "\t", header = FALSE)
names(trimmed_read_qualities) <- c("base", "mean", "lower", "upper", "10%",
"90%")
trimmed_read_qualities_melt <- melt(trimmed_read_qualities, id = "base") # convert to long format
trimmed_read_qualities_melt$sample <- paste(cell, "-", rep, "-", pair, sep = "")
sum_trimmed_read_quals <- rbind(sum_trimmed_read_quals, trimmed_read_qualities_melt)
}
mean_trimed_read_quals <- ddply(sum_trimmed_read_quals, .(base, variable), numcolwise(mean))
ggplot(data = mean_trimed_read_quals, aes(x = base, y = value, colour = variable)) +
geom_line() + ggtitle("Trimmed Read Qualities") + ylim(0, 42)
Transcripts per million expression data
# load express outputs
library(reshape2)
library(ggplot2)
setwd("~/hydrogen/sorghum/mrna")
cells = c("BS", "M")
reps = c(1, 2, 3)
data <- data.frame()
for (i in cells) {
for (j in reps) {
file <- paste("express_", i, "-", j, "/results.xprs", sep = "")
print(file)
tmp <- read.table(file, header = T)
tmp <- tmp[, c(2, 15)] # only keep contig name and tpm column
tmp$sample <- paste(i, "-", j, sep = "")
data <- rbind(data, tmp)
}
}
## [1] "express_BS-1/results.xprs"
## [1] "express_BS-2/results.xprs"
## [1] "express_BS-3/results.xprs"
## [1] "express_M-1/results.xprs"
## [1] "express_M-2/results.xprs"
## [1] "express_M-3/results.xprs"
tpm_data <- dcast(data, target_id ~ sample, value.var = "tpm")
write.table(tpm_data, file = "sorghum_mrna.by_transcript.tpm.csv", sep = "\t",
row.names = FALSE, quote = F)
row.names(tpm_data) <- tpm_data$target_id
tpm_data <- tpm_data[, -c(1)]
correlation <- cor(tpm_data)
melted <- melt(correlation)
ggplot(data = melted, aes_string(x = names(melted)[1], y = names(melted)[2],
fill = "value")) + geom_tile() + ggtitle("TPM Correlation Matrix") + geom_text(aes(label = round(value,
2))) + xlab("") + ylab("")
Effective Count expression data
# load express outputs
library(reshape2)
library(ggplot2)
setwd("~/hydrogen/sorghum/mrna")
cells = c("BS", "M")
reps = c(1, 2, 3)
data <- data.frame()
for (i in cells) {
for (j in reps) {
file <- paste("express_", i, "-", j, "/results.xprs", sep = "")
print(file)
tmp <- read.table(file, header = T)
tmp <- tmp[, c(2, 8)]
tmp$sample <- paste(i, "-", j, sep = "")
data <- rbind(data, tmp)
}
}
## [1] "express_BS-1/results.xprs"
## [1] "express_BS-2/results.xprs"
## [1] "express_BS-3/results.xprs"
## [1] "express_M-1/results.xprs"
## [1] "express_M-2/results.xprs"
## [1] "express_M-3/results.xprs"
count_data <- dcast(data, target_id ~ sample, value.var = "eff_counts")
write.table(count_data, file = "sorghum_mrna.by_transcript.eff_counts.csv",
sep = "\t", row.names = FALSE, quote = F)
row.names(count_data) <- count_data$target_id
counts <- count_data[, -c(1)]
correlation <- cor(counts)
melted <- melt(correlation)
ggplot(data = melted, aes_string(x = names(melted)[1], y = names(melted)[2],
fill = "value")) + geom_tile() + ggtitle("Effective Count Correlation Matrix") +
geom_text(aes(label = round(value, 2))) + xlab("") + ylab("")
Run EBSeq for differential expression analysis on effective count data
setwd("~/hydrogen/sorghum/mrna")
library(EBSeq)
## Loading required package: blockmodeling
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# express outputs stored in 'counts' matrix
counts <- counts[-c(rowSums(counts) == 0), ] # remove rows that are all zeros
mat <- as.matrix(counts)
sizes <- MedianNorm(counts)
# sizes=QuantileNorm(counts, 0.75) # upper-quantile normalisation
conditions <- as.factor(rep(c("BS", "M"), each = 3))
ebOut <- EBTest(Data = mat, Conditions = conditions, sizeFactors = sizes, maxround = 6)
## Remove transcripts with all zero
##
## iteration 1 done
##
## time 13.69
##
## iteration 2 done
##
## time 9.6
##
## iteration 3 done
##
## time 9.84
##
## iteration 4 done
##
## time 10.16
##
## iteration 5 done
##
## time 8.19
##
## iteration 6 done
##
## time 8.58
posteriorProbs <- GetPPMat(ebOut)
probs <- data.frame(posteriorProbs)
names(probs) <- c("PPEE", "PPDE")
write.table(probs, "ppde_sorghum_mrna.csv", sep = "\t", quote = F, row.names = T)
# checking out a list of key sorghum genes plot bar charts of tpm with
# standard error bars
library(plyr)
stderr <- function(vector) {
sd(vector)/sqrt(length(vector))
}
setwd("~/hydrogen/sorghum/mrna")
# load key
key <- read.table("key_sorghum.txt", sep = "\t", header = FALSE)
names(key) <- c("gene", "desc")
for (i in 1:nrow(key)) {
gene <- tpm_data[which(row.names(tpm_data) == key[i, 1]), ]
gene$name <- row.names(gene)
gene_melt <- melt(gene, id = "name")
gene_melt$cell <- gsub(gene_melt$variable, pattern = "-[0-9]", replacement = "")
tpm <- ddply(gene_melt, .(cell), numcolwise(mean)) # sum columns, grouping by annotation
tpm[which(tpm$cell == "BS"), "stderr"] <- stderr(gene_melt[which(gene_melt$cell ==
"BS"), ]$value) # set cell [bs,stderr]
tpm[which(tpm$cell == "M"), "stderr"] <- stderr(gene_melt[which(gene_melt$cell ==
"M"), ]$value)
g1 <- ggplot(data = tpm, aes(x = cell, y = value, fill = cell)) + geom_bar(stat = "identity") +
ggtitle(paste("Expression of ", key[i, 2], " (", key[i, 1], ")", sep = "")) +
labs(x = "Cell") + geom_errorbar(aes(ymax = value + stderr, ymin = value -
stderr))
print(g1)
}
Log Fold Changes
tpm_data$logfc = (tpm_data[, "M-1"] + tpm_data[, "M-2"] + tpm_data[, "M-3"])/(tpm_data[,
"BS-1"] + tpm_data[, "BS-2"] + tpm_data[, "BS-3"])
tpm_data$logfc = log(tpm_data$logfc)
tpm_data_PPDE <- merge(x = tpm_data, y = probs, by = "row.names")
DE_tpm_data <- tpm_data_PPDE[which(tpm_data_PPDE$PPDE > 0.95), ]
DE_tpm_data <- DE_tpm_data[-which(is.infinite(DE_tpm_data$logfc)), ]
DE_tpm_data <- DE_tpm_data[order(-DE_tpm_data$logfc), ]
M_tpm_data <- DE_tpm_data[which(DE_tpm_data$logfc >= 2), ] # M
DE_tpm_data <- DE_tpm_data[order(DE_tpm_data$logfc), ] # resort
BS_tpm_data <- DE_tpm_data[which(DE_tpm_data$logfc <= -2), ] # BS
# head(M_tpm_data) head(BS_tpm_data)
write.table(M_tpm_data, "tpm_data.m.csv", sep = "\t", quote = F, row.names = F)
write.table(BS_tpm_data, "tpm_data.bs.csv", sep = "\t", quote = F, row.names = F)