Exploration of FPKM data for James Hane

Here are the packages that we'll use:

require(reshape2)
## Loading required package: reshape2
require(ggplot2)
## Loading required package: ggplot2

Let's start by reading in the data. ID is really a label for each gene, so we turn it into a factor.

# Load the tab-delimited FPKM table (first line is the header).
fc.df <- read.delim("data/JKH_pca_qa_data.txt")
# ID labels a gene rather than measuring anything, so store it as a factor.
fc.df[["ID"]] <- factor(fc.df[["ID"]])
# Peek at the first few rows.
head(fc.df)
##   ID   FPKM1   FPKM2   FPKM3
## 1  1 303.862 266.715 235.261
## 2  2 370.738 589.651 168.393
## 3  3   0.000   0.000   0.000
## 4  4   0.000   0.000   0.000
## 5  5   5.277   5.939   5.608
## 6  6   0.925   2.173   3.142

Let's get it ready for plotting

# Reshape to long format (one row per gene/sample pair) for ggplot2.
# ID is named explicitly as the id variable instead of being left for melt()
# to guess: no "Using ID as id variables" message is emitted, and a
# non-numeric column added later cannot silently become an id variable.
# The call is namespace-qualified, so the package does not need to be
# attached a second time here.
fc.melt <- reshape2::melt(fc.df, id.vars = "ID")

Let's look at the distributions of the three FPKM variables

# One box per FPKM column, on the raw (linear) scale.
ggplot(fc.melt, mapping = aes(variable, value)) +
  geom_boxplot(mapping = aes(group = variable))

plot of chunk unnamed-chunk-4

Whoa there! Maybe it will be easier to read on a log scale.

# Colour used for the box outlines and their outlier points.
blue <- "#3366FF"
# Jittered raw points with an unfilled boxplot on top, y axis on a log10 scale.
# Zero FPKM values have no finite log10, so they are dropped from the plot
# (that is the "non-finite values" warning).
ggplot(fc.melt, mapping = aes(variable, value)) +
  geom_jitter(mapping = aes(group = variable), alpha = 0.1) +
  geom_boxplot(
    mapping = aes(group = variable),
    fill = NA,
    colour = blue,
    outlier.colour = blue
  ) +
  scale_y_log10(
    # Breaks at every power of ten from 1e-5 to 1e5 ...
    breaks = 10^(-5:5),
    # ... labelled in fixed notation with trailing zeros (and a bare trailing
    # decimal point) stripped, e.g. "0.001", "1", "1000".
    labels = sub("[.]?0*$", "", sprintf("%0.5f", 10^(-5:5)))
  )
## Warning: Removed 6726 rows containing non-finite values (stat_boxplot).

plot of chunk unnamed-chunk-5

I'm wondering about zero FPKM values

# Number of entries equal to zero in each column of fc.df.
vapply(fc.df, function(column) sum(column == 0), numeric(1))
##    ID FPKM1 FPKM2 FPKM3 
##     0  2025  2812  1889

Hmmm… and not all in the same places…

# For each gene, count how many FPKM columns are exactly zero, then tabulate
# how many genes have 0, 1, 2, ... zero samples.
# Only the FPKM columns are compared: ID is a label, and including it would
# count a gene labelled "0" as having one extra zero measurement.
table(rowSums(
  fc.df[, grepl("^FPKM", names(fc.df)), drop = FALSE] == 0
))
## 
##     0     1     2     3 
## 10914   965   662  1479

Let's look at the FPKMs of each gene (this'll be ugly!)

# One nearly transparent line per gene (grouped by ID) joining its values
# across the FPKM columns, y axis on a log10 scale.
ggplot(fc.melt, mapping = aes(variable, value)) +
  geom_line(mapping = aes(group = ID), alpha = 0.01) +
  scale_y_log10(
    # Breaks at every power of ten from 1e-5 to 1e5, labelled in fixed
    # notation with trailing zeros stripped.
    breaks = 10^(-5:5),
    labels = sub("[.]?0*$", "", sprintf("%0.5f", 10^(-5:5)))
  )

plot of chunk unnamed-chunk-8

Oh, hang on, they aren't fold changes at all… they're “fragments per kilobase of exon per million fragments mapped”… doh!

Ok, well I wonder what their means and variances look like?

# Per-gene summary statistics across the FPKM columns.
# The columns are selected by name (FPKM1, FPKM2, ... as shown by head())
# rather than with fc.df[, -1]: this chunk adds mean/var/sd to fc.df, so a
# positional "everything except ID" selection would fold those derived
# columns into the statistics if the chunk were run a second time.
fc.fpkm <- fc.df[, grepl("^FPKM", names(fc.df)), drop = FALSE]
fc.df$mean <- rowMeans(fc.fpkm)
# as.matrix() makes explicit the coercion apply() performs on a data frame.
fc.df$var <- apply(as.matrix(fc.fpkm), 1, var)
fc.df$sd <- sqrt(fc.df$var)
# Standard deviation against mean, one point per gene; coord_equal() gives
# both axes the same scale so the two quantities can be compared directly.
ggplot(data = fc.df, aes(x = mean, y = sd)) +
  geom_point(alpha = 0.1) +
  coord_equal()

plot of chunk unnamed-chunk-9

# Same sd-vs-mean scatter, restricted to genes whose mean is at most 10000.
ggplot(subset(fc.df, mean <= 10000), mapping = aes(mean, sd)) +
  geom_point(alpha = 0.1) +
  coord_equal()

plot of chunk unnamed-chunk-9

# Same sd-vs-mean scatter, restricted to genes whose mean is at most 1000.
ggplot(subset(fc.df, mean <= 1000), mapping = aes(mean, sd)) +
  geom_point(alpha = 0.1) +
  coord_equal()

plot of chunk unnamed-chunk-9

# Same sd-vs-mean scatter, restricted to genes whose mean is at most 100.
ggplot(subset(fc.df, mean <= 100), mapping = aes(mean, sd)) +
  geom_point(alpha = 0.1) +
  coord_equal()

plot of chunk unnamed-chunk-9

# Same sd-vs-mean scatter, restricted to genes whose mean is at most 10.
ggplot(subset(fc.df, mean <= 10), mapping = aes(mean, sd)) +
  geom_point(alpha = 0.1) +
  coord_equal()

plot of chunk unnamed-chunk-9