Seminar 4: Two Sample Tests

shannonerdelyi — Jan 29, 2014, 7:43 PM

###############################################################################
# STAT 540: Seminar 4
# Take home problem: two sample tests with 100 genes 
# Date: 29-01-2014
# Author: Shannon Erdelyi
###############################################################################

# librarys
library(lattice)
library(plyr)

# data
dir <- "/Users/shannonerdelyi/Dropbox/UBC/W2014/STAT 540/Data"
des <- read.table(paste(dir, "/GSE4051_design.tsv", sep=""), header=T)
dat <- read.table(paste(dir, "/GSE4051_data.tsv", sep=""), header=T)

# constants
set.seed(8)
n <- 100
keepGenes <- sample(1:nrow(dat), n)

###############################################################################
# Data Preparation
###############################################################################

miniDat <- dat[keepGenes, ]
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))),
                      gene = factor(rep(rownames(miniDat), each = ncol(miniDat)),
                                    levels = rownames(miniDat)))
miniDat <- suppressWarnings(data.frame(des, miniDat))
str(miniDat)
'data.frame':   3900 obs. of  6 variables:
 $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
 $ sidNum  : int  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "4_weeks","E16",..: 2 2 2 2 2 2 2 4 4 4 ...
 $ gType   : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
 $ gExp    : num  10.59 10.5 10.67 10.58 9.17 ...
 $ gene    : Factor w/ 100 levels "1435552_at","1424407_s_at",..: 1 1 1 1 1 1 1 1 1 1 ...

###############################################################################
# Two sample tests
###############################################################################

# get the p-values 
pvals <- suppressWarnings(ddply(miniDat, ~ gene, function(miniDf) {
  tt <- t.test(gExp ~ gType, miniDf)$p.value
  wt <- wilcox.test(gExp ~ gType, miniDf)$p.value
  kt <- with(miniDf, 
             ks.test(gExp[gType == "NrlKO"], gExp[gType == "wt"]))$p.value
  c(t = tt, wilcoxon = wt, ks = kt)
}))

head(pvals)
          gene        t wilcoxon       ks
1   1435552_at 0.842959 0.407007 0.645207
2 1424407_s_at 0.011442 0.022017 0.092180
3   1451485_at 0.191994 0.216280 0.354961
4   1444288_at 0.935752 0.966774 0.824762
5   1429021_at 0.001555 0.001235 0.005278
6   1448113_at 0.714490 0.704424 0.864744

# compare pvalues across tests
plot(pvals[,-1])

plot of chunk unnamed-chunk-1

plot(log(pvals[,-1]))

plot of chunk unnamed-chunk-1


# apply 0.05 significance threshold
sig <- pvals[,-1] <= 0.05

# test agreement by gene
# 0 --> no tests were significant
# 3 --> all three tests were significant 
table(apply(sig, 1, sum))

 0  1  2  3 
77  4  9 10 

# how many genes are significant
apply(sig, 2, sum)
       t wilcoxon       ks 
      20       16       16