Stat 540 Seminar 4

Purpose: To explore 2 sample tests for comparing distributions in the context of differential expression.

Start by loading data/libraries:

library(ggplot2)
library(plyr)
prDat <- read.table("../GSE4051_data.tsv")
prDes <- read.table("../GSE4051_design.tsv", header = TRUE)

Editing Inner Function to use t test, Wicoxon Test and KS test

Let's extract 6 genes at random, rather than hardcoded genes:

set.seed(540)
keepGenes <- row.names(prDat)[sample(1:nrow(prDat), 6)]
miniDat <- subset(prDat, rownames(prDat) %in% keepGenes)
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))), gene = factor(rep(rownames(miniDat), 
    each = ncol(miniDat)), levels = keepGenes))
miniDat <- suppressWarnings(data.frame(prDes, miniDat))

Next lets define some functions for running each test

# for t test
tTest <- function(z) {
    zz <- t.test(gExp ~ gType, z)
    round(c(tStat = zz$statistic, pVal = zz$p.value), 4)
}

# for Wilcoxon test
wilTest <- function(z) {
    # not sure if suppressing warnings is the way to go but the error messages
    # are annoying
    suppressWarnings(zz <- wilcox.test(gExp ~ gType, z))
    round(c(wilStat = zz$statistic, pVal = zz$p.value), 4)
}

# for KS test
ksTest <- function(z) {
    suppressWarnings(zz <- ks.test(z[z$gType == "wt", "gExp"], z[z$gType == 
        "NrlKO", "gExp"]))
    round(c(ksStat = zz$statistic, pVal = zz$p.value), 4)
}

# for all 3 tests
allTest <- function(z) {
    zzT <- t.test(gExp ~ gType, z)
    suppressWarnings(zzWil <- wilcox.test(gExp ~ gType, z))
    suppressWarnings(zzKS <- ks.test(z[z$gType == "wt", "gExp"], z[z$gType == 
        "NrlKO", "gExp"]))
    round(c(pVal_T = zzT$p.value, pVal_Wil = zzWil$p.value, pVal_KS = zzKS$p.value), 
        4)
}

Test out the functions:

ddply(miniDat, ~gene, tTest)

##         gene tStat.t   pVal
## 1 1417558_at -1.7883 0.0825
## 2 1453147_at -2.2800 0.0295
## 3 1450169_at -0.6666 0.5092
## 4 1430577_at  1.4297 0.1614
## 5 1459967_at -0.5926 0.5571
## 6 1432182_at  1.6699 0.1035

ddply(miniDat, ~gene, wilTest)

##         gene wilStat.W   pVal
## 1 1417558_at     129.5 0.0918
## 2 1453147_at     120.5 0.0525
## 3 1450169_at     158.0 0.3803
## 4 1430577_at     237.5 0.1866
## 5 1459967_at     177.0 0.7254
## 6 1432182_at     262.5 0.0430

ddply(miniDat, ~gene, ksTest)

##         gene ksStat.D   pVal
## 1 1417558_at   0.3895 0.1040
## 2 1453147_at   0.3447 0.1972
## 3 1450169_at   0.3263 0.2074
## 4 1430577_at   0.3342 0.2265
## 5 1459967_at   0.1632 0.9577
## 6 1432182_at   0.4316 0.0530

ddply(miniDat, ~gene, allTest)

##         gene pVal_T pVal_Wil pVal_KS
## 1 1417558_at 0.0825   0.0918  0.1040
## 2 1453147_at 0.0295   0.0525  0.1972
## 3 1450169_at 0.5092   0.3803  0.2074
## 4 1430577_at 0.1614   0.1866  0.2265
## 5 1459967_at 0.5571   0.7254  0.9577
## 6 1432182_at 0.1035   0.0430  0.0530

Matrix of p-values and genes for 100 sampled genes

Start by sampling 100 genes:

geneSample100 <- row.names(prDat)[sample(1:nrow(prDat), 100)]

sample100Dat <- subset(prDat, rownames(prDat) %in% geneSample100)

sample100Dat <- data.frame(gExp = as.vector(t(as.matrix(sample100Dat))), gene = factor(rep(rownames(sample100Dat), 
    each = ncol(sample100Dat)), levels = geneSample100))

sample100Dat <- suppressWarnings(data.frame(prDes, sample100Dat))

# write new function to format data in long format for t test
tTest <- function(z) {
    zz <- t.test(gExp ~ gType, z)
    data.frame(type = "t", pVal = as.numeric(round(zz$p.value, 4)))
}

# for Wilcoxon test
wilTest <- function(z) {
    # not sure if suppressing warnings is the way to go but the error messages
    # are annoying
    suppressWarnings(zz <- wilcox.test(gExp ~ gType, z))
    data.frame(type = "wil", pVal = as.numeric(round(zz$p.value, 4)))
}

# for KS test
ksTest <- function(z) {
    suppressWarnings(zz <- ks.test(z[z$gType == "wt", "gExp"], z[z$gType == 
        "NrlKO", "gExp"]))
    data.frame(type = "ks", pVal = as.numeric(round(zz$p.value, 4)))
}

pvalDat <- rbind(ddply(sample100Dat, ~gene, tTest), ddply(sample100Dat, ~gene, 
    wilTest), ddply(sample100Dat, ~gene, ksTest))


p <- ggplot(pvalDat, aes(gene, pVal, colour = type)) + geom_point()
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
p

plot of chunk unnamed-chunk-5

Not really sure what I am supposed to get out of this, seems chaotic. Lets try the log scale like suggested:

(p <- p + scale_y_log10())

plot of chunk unnamed-chunk-6

A little better, I guess I see that low p-values seems to occur together.

Let's try sorting the data by mean pvalue:

# for reordering
pvalDatMean <- ddply(pvalDat, ~gene, summarize, mean = mean(pVal))
pvalDatMean <- pvalueDatMean[order(pvalDatMean$mean), ]

## Error: object 'pvalueDatMean' not found


p <- ggplot(pvalDat, aes(x = reorder(gene, pVal), y = pVal, group = type, colour = type)) + 
    geom_point()
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_y_log10()
p

plot of chunk unnamed-chunk-7

The p-values mostly follow the same trend (near the high and low p-values) but seem to be dispersed in the inner regions of the plot.

Now lets investigate what happens when you hard thresholding on p = 0.05:

threshDat <- ddply(pvalDat, ~gene + type, summarize, sig = pVal > 0.95)
sigDat <- ddply(threshDat, ~gene, summarize, count = length(which(sig)))

Significant 0 times:

length(which(sigDat$count == 0))

## [1] 91

Significant 1 times:

length(which(sigDat$count == 1))

## [1] 7

Significant 2 times:

length(which(sigDat$count == 2))

## [1] 2

Significant 3 times:

length(which(sigDat$count == 3))

## [1] 0

Time limiting study

Plotting how long each test takes. Start by generating samples

geneSample5 <- row.names(prDat)[sample(1:nrow(prDat), 5)]
sample5Dat <- subset(prDat, rownames(prDat) %in% geneSample5)
sample5Dat <- data.frame(gExp = as.vector(t(as.matrix(sample5Dat))), gene = factor(rep(rownames(sample5Dat), 
    each = ncol(sample5Dat)), levels = geneSample5))
sample5Dat <- suppressWarnings(data.frame(prDes, sample5Dat))

geneSample10 <- row.names(prDat)[sample(1:nrow(prDat), 10)]
sample10Dat <- subset(prDat, rownames(prDat) %in% geneSample10)
sample10Dat <- data.frame(gExp = as.vector(t(as.matrix(sample10Dat))), gene = factor(rep(rownames(sample10Dat), 
    each = ncol(sample10Dat)), levels = geneSample10))
sample10Dat <- suppressWarnings(data.frame(prDes, sample10Dat))

geneSample50 <- row.names(prDat)[sample(1:nrow(prDat), 50)]
sample50Dat <- subset(prDat, rownames(prDat) %in% geneSample50)
sample50Dat <- data.frame(gExp = as.vector(t(as.matrix(sample50Dat))), gene = factor(rep(rownames(sample50Dat), 
    each = ncol(sample50Dat)), levels = geneSample50))
sample50Dat <- suppressWarnings(data.frame(prDes, sample50Dat))

geneSample500 <- row.names(prDat)[sample(1:nrow(prDat), 500)]
sample500Dat <- subset(prDat, rownames(prDat) %in% geneSample500)
sample500Dat <- data.frame(gExp = as.vector(t(as.matrix(sample500Dat))), gene = factor(rep(rownames(sample500Dat), 
    each = ncol(sample500Dat)), levels = geneSample500))
sample500Dat <- suppressWarnings(data.frame(prDes, sample500Dat))

Run Tests:


# for ease of returning time
getTime <- function(expr) {
    result <- system.time(expr)
    return(result[1])
}

system.time(ddply(sample5Dat, ~gene, tTest))[1]

## user.self 
##      0.05


timeTTest <- data.frame(time = c(getTime(ddply(sample5Dat, ~gene, tTest)), getTime(ddply(sample10Dat, 
    ~gene, tTest)), getTime(ddply(sample50Dat, ~gene, tTest)), getTime(ddply(sample100Dat, 
    ~gene, tTest)), getTime(ddply(sample500Dat, ~gene, tTest))), testNum = c(5, 
    10, 50, 100, 500), type = "tTest")

timeWilTest <- data.frame(time = c(getTime(ddply(sample5Dat, ~gene, wilTest)), 
    getTime(ddply(sample10Dat, ~gene, wilTest)), getTime(ddply(sample50Dat, 
        ~gene, wilTest)), getTime(ddply(sample100Dat, ~gene, wilTest)), getTime(ddply(sample500Dat, 
        ~gene, wilTest))), testNum = c(5, 10, 50, 100, 500), type = "wilTest")

timeKSTest <- data.frame(time = c(getTime(ddply(sample5Dat, ~gene, ksTest)), 
    getTime(ddply(sample10Dat, ~gene, ksTest)), getTime(ddply(sample50Dat, ~gene, 
        ksTest)), getTime(ddply(sample100Dat, ~gene, ksTest)), getTime(ddply(sample500Dat, 
        ~gene, ksTest))), testNum = c(5, 10, 50, 100, 500), type = "ksTest")

timeDat <- rbind(timeTTest, timeWilTest, timeKSTest)

Plot the results:

p <- ggplot(timeDat, aes(testNum, time, group = type, colour = type)) + geom_line()
p

plot of chunk unnamed-chunk-15

According to this plot it scales linearly.