Shaun Jackman — Jan 25, 2013, 6:05 PM
# STAT 540 Seminar 4 Take-home work
# Shaun Jackman
library('plyr')
setwd('~/work/stat540/seminar4')
load("../data/photoRec/GSE4051_MINI.robj")
prDat <- read.table("../data/photoRec/GSE4051_data.txt")
load("../data/photoRec/GSE4051_design.robj")
keepGenes <- c("1431708_a_at", "1424336_at", "1454696_at",
"1416119_at", "1432141_x_at", "1429226_at")
keepDat <- prDat[keepGenes,]
miniDat <- data.frame(prDes,
gExp = as.vector(t(as.matrix(keepDat))),
gene = factor(rep(rownames(keepDat),
each = ncol(keepDat)), levels = keepGenes))
Warning: row names were found from a short variable and have been
discarded
# In our last example, can you edit the by() command where we did the
# t-tests to only return the test statistic and p-value we care about
# (vs. the whole t-test result object)?
ttRes <- by(miniDat, miniDat$gene,
function(yoDat) t.test(gExp ~ gType, yoDat)[c('statistic', 'p.value')])
simplify2array(ttRes)
1431708_a_at 1424336_at 1454696_at 1416119_at 1432141_x_at
statistic 9.838 9.061 8.077 -0.184 -0.1324
p.value 7.381e-12 7.146e-11 2.278e-09 0.8551 0.8954
1429226_at
statistic 0.09827
p.value 0.9223
# Can you clean up the result, e.g. transpose it so the rows correspond
# to the 6 genes and the variables or columns to the test statistic and
# the p-value? Give that better column or variable names. How about
# adding a factor or character variable with the probeset ID and
# removing the now-redundant rownames?
ttArr <- t(simplify2array(ttRes))
colnames(ttArr) = c('tStat', 'tpVal')
ttDF <- data.frame(probe = rownames(ttArr), ttArr)
rownames(ttDF) <- NULL
ttDF
probe tStat tpVal
1 1431708_a_at 9.838 7.381e-12
2 1424336_at 9.061 7.146e-11
3 1454696_at 8.077 2.278e-09
4 1416119_at -0.184 0.8551
5 1432141_x_at -0.1324 0.8954
6 1429226_at 0.09827 0.9223
# Can you take a similar approach but with a different test? Such as the
# common variance t-test, the Wilcoxon, or the Kolmogorov-Smirnov tests?
wtRes <- by(miniDat, miniDat$gene,
function(yoDat) wilcox.test(gExp ~ gType, yoDat, exact = F)[c('statistic', 'p.value')])
wtArr <- t(simplify2array(wtRes))
colnames(wtArr) = c('wStat', 'wpVal')
wtDF <- data.frame(probe = rownames(wtArr), wtArr)
rownames(wtDF) <- NULL
wtDF
probe wStat wpVal
1 1431708_a_at 371 3.945e-07
2 1424336_at 379 1.179e-07
3 1454696_at 372 3.397e-07
4 1416119_at 188 0.9664
5 1432141_x_at 193 0.944
6 1429226_at 202 0.7465
# Can you use a data aggregation strategy to collect test statistics and
# p-values from multiple tests together? Summarize how often all tests
# agree/disagree (if you work with these same 6 genes, I doubt you'll
# get any disagreement).
t.and.wilcox <- merge(ttDF, wtDF)
count((t.and.wilcox$tpVal < 0.5) == (t.and.wilcox$wpVal < 0.5))
x freq
1 TRUE 6
# Can you use functions from the plyr package to accomplish the same
# task? Testing for each gene and perhaps conducting different tests for
# each gene. Please share any success!
ddply(miniDat, .(gene), as.data.frame(
function(x) t.test(gExp ~ gType, x)[c('statistic', 'p.value')]))
gene value.statistic value.p.value
1 1431708_a_at 9.83798 7.381e-12
2 1424336_at 9.06067 7.146e-11
3 1454696_at 8.07708 2.278e-09
4 1416119_at -0.18395 8.551e-01
5 1432141_x_at -0.13241 8.954e-01
6 1429226_at 0.09827 9.223e-01
ddply(miniDat, .(gene), as.data.frame(
function(x) wilcox.test(gExp ~ gType, x, exact=F)[c('statistic', 'p.value')]))
gene value.statistic value.p.value
1 1431708_a_at 371 3.945e-07
2 1424336_at 379 1.179e-07
3 1454696_at 372 3.397e-07
4 1416119_at 188 9.664e-01
5 1432141_x_at 193 9.440e-01
6 1429226_at 202 7.465e-01
ddply(miniDat, .(gene),
function(x) data.frame(
t = t.test(gExp ~ gType, x)[c('statistic', 'p.value')],
w = wilcox.test(gExp ~ gType, x, exact=F)[c('statistic', 'p.value')]
))
gene t.statistic t.p.value w.statistic w.p.value
1 1431708_a_at 9.83798 7.381e-12 371 3.945e-07
2 1424336_at 9.06067 7.146e-11 379 1.179e-07
3 1454696_at 8.07708 2.278e-09 372 3.397e-07
4 1416119_at -0.18395 8.551e-01 188 9.664e-01
5 1432141_x_at -0.13241 8.954e-01 193 9.440e-01
6 1429226_at 0.09827 9.223e-01 202 7.465e-01
# Work with a different subset of genes. If you are feeling game, try
# larger subsets. Using system.time() shown below in Appendix, look into
# how well the by() strategy works as you scale up. You'll probably see
# why this is not going to be our solution for differential expression
# analysis for all 30K probesets. (I haven't tried this myself, so I'm
# not entirely sure what you'll see here! Scale up slowly.)
wt <- prDes$gType == 'wt'
probes <- names(head(n=1000, x=sort(abs(rowMeans(prDat[,!wt]) - rowMeans(prDat[,wt])))))
keepDat <- prDat[probes,]
miniDat <- data.frame(prDes,
gExp = as.vector(t(as.matrix(keepDat))),
gene = factor(rep(rownames(keepDat),
each = ncol(keepDat)), levels = probes))
Warning: row names were found from a short variable and have been
discarded
system.time(by(miniDat, miniDat$gene,
function(x) t.test(gExp ~ gType, x)[c('statistic', 'p.value')]))
user system elapsed
1.598 0.041 1.643