Seminar 4: Two-Sample Testing

We will begin by loading necessary packages and the photorec dataset

library(plyr)
library(lattice)
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame':    29949 obs. of  39 variables:
prDes <- readRDS("GSE4051_design.rds")
str(prDes)
## 'data.frame':    39 obs. of  4 variables:
##  $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
##  $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...

Let's extract the gene expression data for 100 random probes and place the data into a separate dataframe

set.seed(16)
random.probes <- sample(1:nrow(prDat), 100)
mini.df <- prDat[random.probes, ]
mini.df <- data.frame(gExp = as.vector(t(as.matrix(mini.df))), gene = factor(rep(rownames(mini.df), 
    each = ncol(mini.df)), levels = rownames(mini.df)))
mini.df <- suppressWarnings(data.frame(prDes, mini.df))
str(mini.df)
## 'data.frame':    3900 obs. of  6 variables:
##  $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
##  $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
##  $ gExp    : num  6.82 6.33 6.77 6.63 7.42 ...
##  $ gene    : Factor w/ 100 levels "1446081_at","1425983_x_at",..: 1 1 1 1 1 1 1 1 1 1 ...

Create a function that computes a t-test, Wilcoxon and KS test and returns their respective p-values as a matrix

ppval <- suppressWarnings(ddply(mini.df, ~gene, function(x) {
    tt <- t.test(gExp ~ gType, x)
    wt <- wilcox.test(gExp ~ gType, x)
    ks <- with(x, ks.test(gExp[gType == "NrlKO"], gExp[gType == "wt"]))
    round(c(t.test = tt$p.value, Wilcoxon = wt$p.value, KS = ks$p.value), 4)
}))
head(ppval)
##           gene t.test Wilcoxon     KS
## 1   1446081_at 0.0894   0.1913 0.1367
## 2 1425983_x_at 0.0046   0.0119 0.0424
## 3   1434944_at 0.0001   0.0002 0.0018
## 4   1425277_at 0.1154   0.1189 0.1266
## 5   1454353_at 0.3386   0.5133 0.4808
## 6   1428648_at 0.6693   0.6870 0.9606

Plot scatterplots of p-values (untransformed and log-transformed) from the 3 different statistical tests

plot(ppval[, -1], col = "blue")

plot of chunk unnamed-chunk-1

plot(log(ppval[, -1]), col = "red")

plot of chunk unnamed-chunk-1

Set threshold of significance at alpha = 0.05 and count how many probes are “statistically significant hits” by all 3 methods, 2 methods, 1 method or none

alpha <- ppval[, -1] <= 0.05
apply(alpha, 2, sum)
##   t.test Wilcoxon       KS 
##       12       15       11
(count.hits <- as.matrix(count(apply(alpha, 1, sum))))
##      x freq
## [1,] 0   84
## [2,] 1    2
## [3,] 2    6
## [4,] 3    8