Load the require library and import the data:
library(plyr)
dat <- read.table("GSE4051_data.tsv", header = T)
design <- read.table("GSE4051_design.tsv", header = T)
str(design)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
## $ sidNum : int 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "4_weeks","E16",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
Take 100 genes at random and create a smaller dataset called miniDat:
set.seed(123)
keepGenes <- sample(1:nrow(dat), size = 100)
miniDat <- dat[keepGenes, ]
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))), gene = factor(rep(rownames(miniDat),
each = ncol(miniDat)), levels = rownames(miniDat)))
miniDat <- suppressWarnings(data.frame(design, miniDat))
str(miniDat)
## 'data.frame': 3900 obs. of 6 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
## $ sidNum : int 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "4_weeks","E16",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
## $ gExp : num 7.32 7.71 7.06 7.2 7.16 ...
## $ gene : Factor w/ 100 levels "1427773_a_at",..: 1 1 1 1 1 1 1 1 1 1 ...
Create a function that gives as output a dataframe containing the pvalue computed in three different tests (t.test, wilcoxon test and KS test)
p.val <- ddply(miniDat, ~gene, function(z) {
t <- t.test(gExp ~ gType, z)
wc <- suppressWarnings(wilcox.test(gExp ~ gType, z))
ks <- suppressWarnings(ks.test(z$gExp[z$gType == "NrlKO"], z$gExp[z$gType ==
"wt"]))
c(pvalueT = t$p.value, pvalueW = wc$p.value, pvalueK = ks$p.value)
})
head(p.val)
## gene pvalueT pvalueW pvalueK
## 1 1427773_a_at 0.581648 0.35060 0.3278
## 2 1451078_at 0.059321 0.07438 0.0960
## 3 1433507_a_at 0.450034 0.46493 0.3550
## 4 1455179_at 0.161915 0.21380 0.4338
## 5 1457691_at 0.930725 0.74659 0.7005
## 6 1417272_at 0.008463 0.01190 0.1171
Plot the p-values against each other. In the first plot we simply plot the p-values, while in the second plot we log the axis
plot(p.val[, 2:4], col = "purple")
plot(log(p.val[, 2:4]), col = "purple")
The p-values are as similar as expected. Most, in fact, of the time we draw the same conclusion from the three test. Finally we can check how many genes are hits in each of the three test:
first <- p.val[, -1]
new <- first <= 0.05
head(new)
## pvalueT pvalueW pvalueK
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE
## [6,] TRUE TRUE FALSE
apply(new, 2, sum)
## pvalueT pvalueW pvalueK
## 15 13 8
and how many genes are “hits” by all the three methods, by two methods, by one and by none:
count(apply(new, 1, sum))
## x freq
## 1 0 83
## 2 1 5
## 3 2 5
## 4 3 7