shannonerdelyi — Jan 29, 2014, 7:43 PM
###############################################################################
# STAT 540: Seminar 4
# Take home problem: two sample tests with 100 genes
# Date: 29-01-2014
# Author: Shannon Erdelyi
###############################################################################
# librarys
library(lattice)
library(plyr)
# data
dir <- "/Users/shannonerdelyi/Dropbox/UBC/W2014/STAT 540/Data"
des <- read.table(paste(dir, "/GSE4051_design.tsv", sep=""), header=T)
dat <- read.table(paste(dir, "/GSE4051_data.tsv", sep=""), header=T)
# constants
set.seed(8)
n <- 100
keepGenes <- sample(1:nrow(dat), n)
###############################################################################
# Data Preparation
###############################################################################
miniDat <- dat[keepGenes, ]
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))),
gene = factor(rep(rownames(miniDat), each = ncol(miniDat)),
levels = rownames(miniDat)))
miniDat <- suppressWarnings(data.frame(des, miniDat))
str(miniDat)
'data.frame': 3900 obs. of 6 variables:
$ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
$ sidNum : int 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "4_weeks","E16",..: 2 2 2 2 2 2 2 4 4 4 ...
$ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
$ gExp : num 10.59 10.5 10.67 10.58 9.17 ...
$ gene : Factor w/ 100 levels "1435552_at","1424407_s_at",..: 1 1 1 1 1 1 1 1 1 1 ...
###############################################################################
# Two sample tests
###############################################################################
# get the p-values
pvals <- suppressWarnings(ddply(miniDat, ~ gene, function(miniDf) {
tt <- t.test(gExp ~ gType, miniDf)$p.value
wt <- wilcox.test(gExp ~ gType, miniDf)$p.value
kt <- with(miniDf,
ks.test(gExp[gType == "NrlKO"], gExp[gType == "wt"]))$p.value
c(t = tt, wilcoxon = wt, ks = kt)
}))
head(pvals)
gene t wilcoxon ks
1 1435552_at 0.842959 0.407007 0.645207
2 1424407_s_at 0.011442 0.022017 0.092180
3 1451485_at 0.191994 0.216280 0.354961
4 1444288_at 0.935752 0.966774 0.824762
5 1429021_at 0.001555 0.001235 0.005278
6 1448113_at 0.714490 0.704424 0.864744
# compare pvalues across tests
plot(pvals[,-1])
plot(log(pvals[,-1]))
# apply 0.05 significance threshold
sig <- pvals[,-1] <= 0.05
# test agreement by gene
# 0 --> no tests were significant
# 3 --> all three tests were significant
table(apply(sig, 1, sum))
0 1 2 3
77 4 9 10
# how many genes are significant
apply(sig, 2, sum)
t wilcoxon ks
20 16 16