STAT 540 Seminar 4 Take-home work

Shaun Jackman — Jan 25, 2013, 6:05 PM

# STAT 540 Seminar 4 Take-home work
# Shaun Jackman

library('plyr')
setwd('~/work/stat540/seminar4')
load("../data/photoRec/GSE4051_MINI.robj")
prDat <- read.table("../data/photoRec/GSE4051_data.txt")
load("../data/photoRec/GSE4051_design.robj")

keepGenes <- c("1431708_a_at", "1424336_at", "1454696_at",
    "1416119_at", "1432141_x_at", "1429226_at")
keepDat <- prDat[keepGenes,]
miniDat <- data.frame(prDes,
    gExp = as.vector(t(as.matrix(keepDat))),
    gene = factor(rep(rownames(keepDat),
    each = ncol(keepDat)), levels = keepGenes))
Warning: row names were found from a short variable and have been
discarded

# In our last example, can you edit the by() command where we did the
# t-tests to only return the test statistic and p-value we care about
# (vs. the whole t-test result object)?

ttRes <- by(miniDat, miniDat$gene,
    function(yoDat) t.test(gExp ~ gType, yoDat)[c('statistic', 'p.value')])
simplify2array(ttRes)
          1431708_a_at 1424336_at 1454696_at 1416119_at 1432141_x_at
statistic 9.838        9.061      8.077      -0.184     -0.1324     
p.value   7.381e-12    7.146e-11  2.278e-09  0.8551     0.8954      
          1429226_at
statistic 0.09827   
p.value   0.9223    

# Can you clean up the result, e.g. transpose it so the rows correspond
# to the 6 genes and the variables or columns to the test statistic and
# the p-value? Give that better column or variable names. How about
# adding a factor or character variable with the probeset ID and
# removing the now-redundant rownames?

ttArr <- t(simplify2array(ttRes))
colnames(ttArr) = c('tStat', 'tpVal')
ttDF <- data.frame(probe = rownames(ttArr), ttArr)
rownames(ttDF) <- NULL
ttDF
         probe   tStat     tpVal
1 1431708_a_at   9.838 7.381e-12
2   1424336_at   9.061 7.146e-11
3   1454696_at   8.077 2.278e-09
4   1416119_at  -0.184    0.8551
5 1432141_x_at -0.1324    0.8954
6   1429226_at 0.09827    0.9223

# Can you take a similar approach but with a different test? Such as the
# common variance t-test, the Wilcoxon, or the Kolmogorov-Smirnov tests?

wtRes <- by(miniDat, miniDat$gene,
    function(yoDat) wilcox.test(gExp ~ gType, yoDat, exact = F)[c('statistic', 'p.value')])
wtArr <- t(simplify2array(wtRes))
colnames(wtArr) = c('wStat', 'wpVal')
wtDF <- data.frame(probe = rownames(wtArr), wtArr)
rownames(wtDF) <- NULL
wtDF
         probe wStat     wpVal
1 1431708_a_at   371 3.945e-07
2   1424336_at   379 1.179e-07
3   1454696_at   372 3.397e-07
4   1416119_at   188    0.9664
5 1432141_x_at   193     0.944
6   1429226_at   202    0.7465

# Can you use a data aggregation strategy to collect test statistics and
# p-values from multiple tests together? Summarize how often all tests
# agree/disagree (if you work with these same 6 genes, I doubt you'll
# get any disagreement).

t.and.wilcox <- merge(ttDF, wtDF)
count((t.and.wilcox$tpVal < 0.5) == (t.and.wilcox$wpVal < 0.5))
     x freq
1 TRUE    6

# Can you use functions from the plyr package to accomplish the same
# task? Testing for each gene and perhaps conducting different tests for
# each gene. Please share any success!

ddply(miniDat, .(gene), as.data.frame(
    function(x) t.test(gExp ~ gType, x)[c('statistic', 'p.value')]))
          gene value.statistic value.p.value
1 1431708_a_at         9.83798     7.381e-12
2   1424336_at         9.06067     7.146e-11
3   1454696_at         8.07708     2.278e-09
4   1416119_at        -0.18395     8.551e-01
5 1432141_x_at        -0.13241     8.954e-01
6   1429226_at         0.09827     9.223e-01
ddply(miniDat, .(gene), as.data.frame(
    function(x) wilcox.test(gExp ~ gType, x, exact=F)[c('statistic', 'p.value')]))
          gene value.statistic value.p.value
1 1431708_a_at             371     3.945e-07
2   1424336_at             379     1.179e-07
3   1454696_at             372     3.397e-07
4   1416119_at             188     9.664e-01
5 1432141_x_at             193     9.440e-01
6   1429226_at             202     7.465e-01
ddply(miniDat, .(gene),
    function(x) data.frame(
        t = t.test(gExp ~ gType, x)[c('statistic', 'p.value')],
        w = wilcox.test(gExp ~ gType, x, exact=F)[c('statistic', 'p.value')]
    ))
          gene t.statistic t.p.value w.statistic w.p.value
1 1431708_a_at     9.83798 7.381e-12         371 3.945e-07
2   1424336_at     9.06067 7.146e-11         379 1.179e-07
3   1454696_at     8.07708 2.278e-09         372 3.397e-07
4   1416119_at    -0.18395 8.551e-01         188 9.664e-01
5 1432141_x_at    -0.13241 8.954e-01         193 9.440e-01
6   1429226_at     0.09827 9.223e-01         202 7.465e-01

# Work with a different subset of genes. If you are feeling game, try
# larger subsets. Using system.time() shown below in Appendix, look into
# how well the by() strategy works as you scale up. You'll probably see
# why this is not going to be our solution for differential expression
# analysis for all 30K probesets. (I haven't tried this myself, so I'm
# not entirely sure what you'll see here! Scale up slowly.)

wt <- prDes$gType == 'wt'
probes <- names(head(n=1000, x=sort(abs(rowMeans(prDat[,!wt]) - rowMeans(prDat[,wt])))))
keepDat <- prDat[probes,]
miniDat <- data.frame(prDes,
    gExp = as.vector(t(as.matrix(keepDat))),
    gene = factor(rep(rownames(keepDat),
    each = ncol(keepDat)), levels = probes))
Warning: row names were found from a short variable and have been
discarded
system.time(by(miniDat, miniDat$gene,
    function(x) t.test(gExp ~ gType, x)[c('statistic', 'p.value')]))
   user  system elapsed 
  1.598   0.041   1.643