We will begin by loading necessary packages and the photorec dataset
library(plyr)
library(lattice)
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
Let's extract the gene expression data for 100 random probes and place the data into a separate dataframe
set.seed(16)
random.probes <- sample(1:nrow(prDat), 100)
mini.df <- prDat[random.probes, ]
mini.df <- data.frame(gExp = as.vector(t(as.matrix(mini.df))), gene = factor(rep(rownames(mini.df),
each = ncol(mini.df)), levels = rownames(mini.df)))
mini.df <- suppressWarnings(data.frame(prDes, mini.df))
str(mini.df)
## 'data.frame': 3900 obs. of 6 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
## $ gExp : num 6.82 6.33 6.77 6.63 7.42 ...
## $ gene : Factor w/ 100 levels "1446081_at","1425983_x_at",..: 1 1 1 1 1 1 1 1 1 1 ...
Create a function that computes a t-test, Wilcoxon and KS test and returns their respective p-values as a matrix
ppval <- suppressWarnings(ddply(mini.df, ~gene, function(x) {
tt <- t.test(gExp ~ gType, x)
wt <- wilcox.test(gExp ~ gType, x)
ks <- with(x, ks.test(gExp[gType == "NrlKO"], gExp[gType == "wt"]))
round(c(t.test = tt$p.value, Wilcoxon = wt$p.value, KS = ks$p.value), 4)
}))
head(ppval)
## gene t.test Wilcoxon KS
## 1 1446081_at 0.0894 0.1913 0.1367
## 2 1425983_x_at 0.0046 0.0119 0.0424
## 3 1434944_at 0.0001 0.0002 0.0018
## 4 1425277_at 0.1154 0.1189 0.1266
## 5 1454353_at 0.3386 0.5133 0.4808
## 6 1428648_at 0.6693 0.6870 0.9606
Plot scatterplots of p-values (untransformed and log-transformed) from the 3 different statistical tests
plot(ppval[, -1], col = "blue")
plot(log(ppval[, -1]), col = "red")
Set threshold of significance at alpha = 0.05 and count how many probes are “statistically significant hits” by all 3 methods, 2 methods, 1 method or none
alpha <- ppval[, -1] <= 0.05
apply(alpha, 2, sum)
## t.test Wilcoxon KS
## 12 15 11
(count.hits <- as.matrix(count(apply(alpha, 1, sum))))
## x freq
## [1,] 0 84
## [2,] 1 2
## [3,] 2 6
## [4,] 3 8