Here are the packages that we'll use:
# Attach the packages used below. library() stops with an error right away
# if a package is not installed, whereas require() only returns FALSE and
# lets the script fail later at the first melt()/ggplot() call.
library(reshape2)
library(ggplot2)
Let's start by reading in the data. ID is really a label for each gene, so we turn it into a factor.
# Load the tab-separated table; read.delim() already assumes a header row
# and a tab separator.
fc.df <- read.delim("data/JKH_pca_qa_data.txt")
# ID is a gene label rather than a quantity, so store it as a factor.
fc.df[["ID"]] <- factor(fc.df[["ID"]])
# Peek at the first few rows.
head(fc.df)
## ID FPKM1 FPKM2 FPKM3
## 1 1 303.862 266.715 235.261
## 2 2 370.738 589.651 168.393
## 3 3 0.000 0.000 0.000
## 4 4 0.000 0.000 0.000
## 5 5 5.277 5.939 5.608
## 6 6 0.925 2.173 3.142
Let's get it ready for plotting
# Reshape to long format: one row per (ID, measured column) pair, with the
# source column name in `variable` and its reading in `value`.
# The id column is named explicitly instead of relying on melt()'s guess
# (which also printed a "Using ID as id variables" message). reshape2 is
# already attached at the top of the script, so it is not loaded again here.
fc.melt <- melt(fc.df, id.vars = "ID")
Let's look at the distributions of the three FPKM variables
# One boxplot per level of `variable`, on the raw (linear) scale.
ggplot(fc.melt, aes(x = variable, y = value)) +
  geom_boxplot(aes(group = variable))
Whoa there! Maybe it will look better on a log scale.
# Same comparison on a log10 y axis: jittered points underneath, with
# unfilled blue boxplots drawn on top.
blue <- "#3366FF"
# Axis breaks at every power of ten from 1e-5 to 1e5, labelled as plain
# decimals with trailing zeros (and any dangling decimal point) stripped.
decades <- 10^(-5:5)
decade.labels <- sub("[.]?0*$", "", sprintf("%0.5f", decades))
ggplot(fc.melt, aes(x = variable, y = value)) +
  geom_jitter(aes(group = variable), alpha = 0.1) +
  geom_boxplot(aes(group = variable), fill = NA, colour = blue,
               outlier.colour = blue) +
  scale_y_log10(breaks = decades, labels = decade.labels)
## Warning: Removed 6726 rows containing non-finite values (stat_boxplot).
I'm wondering about zero FPKM values
# Tally, for every column of the table, how many entries equal zero.
zero.mask <- fc.df == 0
colSums(zero.mask)
## ID FPKM1 FPKM2 FPKM3
## 0 2025 2812 1889
Hmmm… and not all in the same places…
# How many rows have 0, 1, 2, ... zero entries.
zero.mask <- fc.df == 0
table(rowSums(zero.mask))
##
## 0 1 2 3
## 10914 965 662 1479
Let's look at the FPKMs of each gene (this'll be ugly!)
# One faint line per ID across the levels of `variable`, on a log10 y axis.
# Breaks sit at every power of ten from 1e-5 to 1e5 and are labelled as
# plain decimals with trailing zeros stripped.
decades <- 10^(-5:5)
decade.labels <- sub("[.]?0*$", "", sprintf("%0.5f", decades))
ggplot(fc.melt, aes(x = variable, y = value)) +
  geom_line(aes(group = ID), alpha = 0.01) +
  scale_y_log10(breaks = decades, labels = decade.labels)
Oh, hang on, they aren't fold changes at all… they're “fragments per kilobase of exon per million fragments mapped”… doh!
Ok, well I wonder what their means and variances look like?
# Per-row mean, variance and standard deviation across the FPKM columns.
# The columns are selected by their "FPKM" name prefix rather than by
# dropping column 1, so re-running this chunk does not fold the derived
# mean/var/sd columns back into the statistics.
fc.fpkm <- fc.df[, grep("^FPKM", names(fc.df)), drop = FALSE]
fc.df$mean <- rowMeans(fc.fpkm)
fc.df$var <- apply(fc.fpkm, 1, var)
fc.df$sd <- sqrt(fc.df$var)

# Scatter of sd against mean with equal axis scaling. `df` must carry the
# `mean` and `sd` columns computed above.
sd.mean.scatter <- function(df) {
  ggplot(data = df, aes(x = mean, y = sd)) +
    geom_point(alpha = 0.1) +
    coord_equal()
}

# All rows first, then zoom in on progressively smaller means.
sd.mean.scatter(fc.df)
sd.mean.scatter(subset(fc.df, mean <= 10000))
sd.mean.scatter(subset(fc.df, mean <= 1000))
sd.mean.scatter(subset(fc.df, mean <= 100))
sd.mean.scatter(subset(fc.df, mean <= 10))