Cleome/Tareneya Hassleriana mRNA-seq Analysis

library(ggplot2)
# load in mrna_raw_data file as a table
setwd("/home/chris/documents/hassleriana")
raw_data <- read.table("data", sep = ",", header = FALSE)
names(raw_data) <- c("filepath", "rep", "cell", "pair")

RAW

Number of reads in each of the raw data files

# plot bar chart of this 'mrna_raw_read_counts.txt'
library(plyr)
setwd("/home/chris/documents/hassleriana")
raw_read_count <- read.table("raw_read_counts.txt", sep = "\t", header = FALSE)
names(raw_read_count) <- c("count", "section", "rep", "pair")

raw_read_count_paired <- ddply(raw_read_count, c("section", "rep"), transform, 
    sum_count = sum(count))
raw_read_count_paired <- unique(raw_read_count_paired[, c(2, 3, 5)])

ggplot(raw_read_count_paired, aes(x = paste(section, rep), y = sum_count, fill = section)) + 
    geom_bar(stat = "identity") + ggtitle("Raw read counts") + # geom_text(aes(label=sum_count, y=sum_count+200000)) +
labs(x = "Section and rep") + ylim(0, 9e+07)

plot of chunk raw_read_count

Quality scores of raw data file

# plot quality scores
setwd("/home/chris/documents/hassleriana")
library(reshape2)
library(plyr)

raw_read_qualities <- read.table("raw_read_qualities.txt", sep = "\t", header = FALSE)
names(raw_read_qualities) <- c("type_rep_pair", "base", "mean", "lower", "upper", 
    "10%", "90%")
mean_read_quals <- ddply(raw_read_qualities, .(base), numcolwise(mean))  # sum columns, grouping by annotation           

mean_quals_melt <- melt(mean_read_quals, id = "base")  # convert to long format
mean_quals_melt2 <- melt(raw_read_qualities, id = c("base", "type_rep_pair"))  # convert to long format

samples <- unique(raw_read_qualities$type_rep_pair)

ggplot(data = mean_quals_melt, aes(x = base, y = value, colour = variable)) + 
    geom_line() + ggtitle("Total Raw Read Qualities") + ylim(0, 42)

plot of chunk raw_quality_scores

Individual quality plots for each sample


for (sample in levels(samples)) {
    print(sample)
    sample_data <- mean_quals_melt2[which(mean_quals_melt2$type_rep_pair == 
        sample), ]
    g <- ggplot(data = sample_data, aes(x = base, y = value, colour = variable)) + 
        geom_line() + ggtitle(paste("Raw Read Qualities ", sample)) + ylim(0, 
        42)
    print(g)
}

## [1] "B-1-1"

plot of chunk raw_quality_scores2

## [1] "B-1-2"

plot of chunk raw_quality_scores2

## [1] "B-2-1"

plot of chunk raw_quality_scores2

## [1] "B-2-2"

plot of chunk raw_quality_scores2

## [1] "B-3-1"

plot of chunk raw_quality_scores2

## [1] "B-3-2"

plot of chunk raw_quality_scores2

## [1] "M-1-1"

plot of chunk raw_quality_scores2

## [1] "M-1-2"

plot of chunk raw_quality_scores2

## [1] "M-2-1"

plot of chunk raw_quality_scores2

## [1] "M-2-2"

plot of chunk raw_quality_scores2

## [1] "M-3-1"

plot of chunk raw_quality_scores2

## [1] "M-3-2"

plot of chunk raw_quality_scores2

## [1] "T-1-1"

plot of chunk raw_quality_scores2

## [1] "T-1-2"

plot of chunk raw_quality_scores2

## [1] "T-2-2"

plot of chunk raw_quality_scores2

## [1] "T-3-1"

plot of chunk raw_quality_scores2

## [1] "T-3-2"

plot of chunk raw_quality_scores2

TRIMMED

Number of reads in each of the trimmed data files

# plot bar chart of 'trimmed_read_counts.txt'
library(plyr)
setwd("/home/chris/documents/hassleriana")
trimmed_read_count <- read.table("trimmed_read_counts.txt", sep = "\t", header = FALSE)
names(trimmed_read_count) <- c("count", "section", "rep", "pair")

trimmed_read_count_paired <- ddply(trimmed_read_count, c("section", "rep"), 
    transform, sum_count = sum(count))
trimmed_read_count_paired <- unique(trimmed_read_count_paired[, c(2, 3, 5)])

ggplot(trimmed_read_count_paired, aes(x = paste(section, rep), y = sum_count, 
    fill = section)) + geom_bar(stat = "identity") + ggtitle("Trimmed read counts") + 
    # geom_text(aes(label=sum_count, y=sum_count+200000)) +
labs(x = "Section and rep") + ylim(0, 9e+07)

plot of chunk trimmed_read_count

Comparison of raw and trimmed read counts

raw_read_count_paired$name <- "raw"
trimmed_read_count_paired$name <- "trimmed"
both <- join(raw_read_count_paired, trimmed_read_count_paired, by = "name", 
    type = "full", match = "all")

ggplot(both, aes(x = paste(section, rep, name), y = sum_count, fill = paste(section, 
    name))) + geom_bar(stat = "identity") + ggtitle("Compare read counts") + 
    # geom_text(aes(label=sum_count, y=sum_count+200000)) +
labs(x = "Section and rep") + ylim(0, 9e+07)

plot of chunk read_count_comparison

Quality scores of trimmed data file

# plot quality scores
setwd("/home/chris/documents/hassleriana")
library(reshape2)
library(plyr)

raw_read_qualities <- read.table("trimmed_read_qualities.txt", sep = "\t", header = FALSE)
names(raw_read_qualities) <- c("type_rep_pair", "base", "mean", "lower", "upper", 
    "10%", "90%")
mean_read_quals <- ddply(raw_read_qualities, .(base), numcolwise(mean))  # sum columns, grouping by annotation           

mean_quals_melt <- melt(mean_read_quals, id = "base")  # convert to long format
mean_quals_melt2 <- melt(raw_read_qualities, id = c("base", "type_rep_pair"))  # convert to long format

samples <- unique(raw_read_qualities$type_rep_pair)

ggplot(data = mean_quals_melt, aes(x = base, y = value, colour = variable)) + 
    geom_line() + ggtitle("Total Trimmed Read Qualities") + ylim(0, 42)

plot of chunk trimmed_quality_scores

Individual quality plots for each sample


for (sample in levels(samples)) {
    print(sample)
    sample_data <- mean_quals_melt2[which(mean_quals_melt2$type_rep_pair == 
        sample), ]
    g <- ggplot(data = sample_data, aes(x = base, y = value, colour = variable)) + 
        geom_line() + ggtitle(paste("Trimmed Read Qualities ", sample)) + ylim(0, 
        42)
    print(g)
}

## [1] "B-1-1"

plot of chunk trimmed_quality_scores2

## [1] "B-1-2"

plot of chunk trimmed_quality_scores2

## [1] "B-2-1"

plot of chunk trimmed_quality_scores2

## [1] "B-2-2"

plot of chunk trimmed_quality_scores2

## [1] "B-3-1"

plot of chunk trimmed_quality_scores2

## [1] "B-3-2"

plot of chunk trimmed_quality_scores2

## [1] "M-1-1"

plot of chunk trimmed_quality_scores2

## [1] "M-1-2"

plot of chunk trimmed_quality_scores2

## [1] "M-2-1"

plot of chunk trimmed_quality_scores2

## [1] "M-2-2"

plot of chunk trimmed_quality_scores2

## [1] "M-3-1"

plot of chunk trimmed_quality_scores2

## [1] "M-3-2"

plot of chunk trimmed_quality_scores2

## [1] "T-1-1"

plot of chunk trimmed_quality_scores2

## [1] "T-1-2"

plot of chunk trimmed_quality_scores2

## [1] "T-2-2"

## [1] "T-3-1"

plot of chunk trimmed_quality_scores2

## [1] "T-3-2"

plot of chunk trimmed_quality_scores2