prDat <- read.table("../photoRecDataset/dataClean/GSE4051_data.tsv")
prDes <- readRDS("../photoRecDataset/dataClean/GSE4051_design.rds")
Let's extract the data for one gene and put in a data.frame with the experimental information.
set.seed(987)
(theGene <- sample(1:nrow(prDat), 1))
## [1] 14294
pDat <- data.frame(prDes, genExp = unlist(prDat[theGene, ]))
str(pDat)
## 'data.frame': 39 obs. of 5 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ genExp : num 9.49 8.94 9.32 9.97 10 ...
Always explore the data before plunging into analysis! What are the sample means in the wild type and Nrl knockout groups (yes, we're ignoring developmental stage today)? aggregate() and other data aggregation functions are explained below.
aggregate(genExp ~ gType, pDat, FUN = mean)
## gType genExp
## 1 wt 9.758
## 2 NrlKO 9.553
Let's make a stripplot so we can sanity test our t test result.
library(lattice)
stripplot(gType ~ genExp, pDat)
t.test(genExp ~ gType, pDat)
##
## Welch Two Sample t-test
##
## data: genExp by gType
## t = 1.479, df = 36.78, p-value = 0.1475
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.07586 0.48605
## sample estimates:
## mean in group wt mean in group NrlKO
## 9.758 9.553
If we save the t test result, we can inspect what it is.
ttRes <- t.test(genExp ~ gType, pDat)
str(ttRes)
## List of 9
## $ statistic : Named num 1.48
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 36.8
## ..- attr(*, "names")= chr "df"
## $ p.value : num 0.148
## $ conf.int : atomic [1:2] -0.0759 0.4861
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num [1:2] 9.76 9.55
## ..- attr(*, "names")= chr [1:2] "mean in group wt" "mean in group NrlKO"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in means"
## $ alternative: chr "two.sided"
## $ method : chr "Welch Two Sample t-test"
## $ data.name : chr "genExp by gType"
## - attr(*, "class")= chr "htest"
extract test statistic and p- value
ttRes$statistic
## t
## 1.479
ttRes$p.value
## [1] 0.1475
ttResEqlVar <- t.test(genExp ~ gType, pDat, var.equal = T)
str(ttResEqlVar)
## List of 9
## $ statistic : Named num 1.48
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 37
## ..- attr(*, "names")= chr "df"
## $ p.value : num 0.147
## $ conf.int : atomic [1:2] -0.0756 0.4858
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num [1:2] 9.76 9.55
## ..- attr(*, "names")= chr [1:2] "mean in group wt" "mean in group NrlKO"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in means"
## $ alternative: chr "two.sided"
## $ method : chr " Two Sample t-test"
## $ data.name : chr "genExp by gType"
## - attr(*, "class")= chr "htest"
wtRes <- suppressWarnings(wilcox.test(genExp ~ gType, pDat))
kstRes <- suppressWarnings(ks.test(pDat$genExp[pDat$gType == "wt"], pDat$genExp[pDat$gType ==
"NrlKO"]))
Even though data.frames are at the heart of most analyses, let's start at the beginning with apply(), which operates on a matrix (or arrays more generally). apply() is a built-in base R function; it is not part of plyr. Recall that arrays can only hold info all of the same “flavor”, such as numeric.
We can get a numeric matrix easily from the gene expression variables in the small excerpt of photoRec that we've worked with before.
kDat <- readRDS("../photoRecDataset/dataClean/GSE4051_MINI.rds")
kMat <- as.matrix(kDat[c("crabHammer", "eggBomb", "poisonFang")])
str(kMat)
## num [1:39, 1:3] 9.68 9.13 9.74 9.82 9.96 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:39] "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## ..$ : chr [1:3] "crabHammer" "eggBomb" "poisonFang"
Let's compute the median expression for specific genes (= column), “by hand” and using apply().
median(kMat[, 1]) # column numbers are mysterious
## [1] 9.611
median(kMat[, "crabHammer"]) # use names for better code
## [1] 9.611
apply(kMat, 2, median)
## crabHammer eggBomb poisonFang
## 9.611 6.757 7.350
The first argument of apply(), X, is the matrix, the second, MARGIN is the dimension(s) to operate on (1 - rows, 2 - columns), the third, FUN is the function to apply, which can be built-in like median() above, custom defined by you elsewhere, or custom defined by you “on the fly”. Reading the help file, you will also notice the weird … argument where you can specify arbitrary arguments that will be passed through to the function specified via FUN =.
apply(kMat, 2, quantile, probs = 0.5)
## crabHammer eggBomb poisonFang
## 9.611 6.757 7.350
apply(kMat, 2, quantile, probs = c(0.25, 0.75))
## crabHammer eggBomb poisonFang
## 25% 8.938 6.278 7.188
## 75% 9.830 7.095 7.476
apply(kMat, 1, min)
## Sample_11 Sample_12 Sample_2 Sample_9 Sample_36 Sample_37 Sample_38
## 6.981 7.165 7.075 6.558 6.993 6.992 6.608
## Sample_39 Sample_16 Sample_17 Sample_6 Sample_20 Sample_21 Sample_22
## 7.003 6.470 7.005 6.735 7.370 6.890 6.720
## Sample_23 Sample_13 Sample_15 Sample_18 Sample_19 Sample_32 Sample_33
## 6.529 7.228 7.226 7.363 7.081 7.005 7.082
## Sample_34 Sample_35 Sample_14 Sample_3 Sample_5 Sample_8 Sample_24
## 6.757 6.155 6.138 6.166 6.269 6.264 6.587
## Sample_25 Sample_26 Sample_27 Sample_1 Sample_10 Sample_4 Sample_7
## 6.170 6.870 6.800 6.286 6.347 6.270 6.188
## Sample_28 Sample_29 Sample_30 Sample_31
## 6.530 7.100 6.269 6.211
colnames(kMat)[apply(kMat, 1, which.min)]
## [1] "poisonFang" "eggBomb" "poisonFang" "eggBomb" "poisonFang"
## [6] "eggBomb" "eggBomb" "eggBomb" "eggBomb" "poisonFang"
## [11] "poisonFang" "poisonFang" "eggBomb" "eggBomb" "eggBomb"
## [16] "eggBomb" "eggBomb" "poisonFang" "eggBomb" "poisonFang"
## [21] "eggBomb" "eggBomb" "eggBomb" "eggBomb" "eggBomb"
## [26] "eggBomb" "eggBomb" "eggBomb" "eggBomb" "eggBomb"
## [31] "eggBomb" "eggBomb" "eggBomb" "eggBomb" "eggBomb"
## [36] "eggBomb" "poisonFang" "eggBomb" "eggBomb"
rowSums(kMat)
## Sample_11 Sample_12 Sample_2 Sample_9 Sample_36 Sample_37 Sample_38
## 23.86 23.64 23.93 23.42 24.82 23.98 23.70
## Sample_39 Sample_16 Sample_17 Sample_6 Sample_20 Sample_21 Sample_22
## 24.52 22.55 24.21 24.09 25.05 24.09 23.71
## Sample_23 Sample_13 Sample_15 Sample_18 Sample_19 Sample_32 Sample_33
## 23.22 24.52 24.76 24.94 24.44 25.43 24.17
## Sample_34 Sample_35 Sample_14 Sample_3 Sample_5 Sample_8 Sample_24
## 23.86 21.80 22.96 22.78 22.60 23.40 22.96
## Sample_25 Sample_26 Sample_27 Sample_1 Sample_10 Sample_4 Sample_7
## 22.66 23.99 23.26 22.58 23.14 22.60 22.75
## Sample_28 Sample_29 Sample_30 Sample_31
## 22.17 24.80 22.49 22.31
rowMeans(kMat)
## Sample_11 Sample_12 Sample_2 Sample_9 Sample_36 Sample_37 Sample_38
## 7.954 7.881 7.975 7.808 8.273 7.994 7.901
## Sample_39 Sample_16 Sample_17 Sample_6 Sample_20 Sample_21 Sample_22
## 8.174 7.516 8.070 8.031 8.351 8.029 7.904
## Sample_23 Sample_13 Sample_15 Sample_18 Sample_19 Sample_32 Sample_33
## 7.740 8.175 8.253 8.314 8.146 8.476 8.057
## Sample_34 Sample_35 Sample_14 Sample_3 Sample_5 Sample_8 Sample_24
## 7.953 7.268 7.653 7.593 7.533 7.799 7.655
## Sample_25 Sample_26 Sample_27 Sample_1 Sample_10 Sample_4 Sample_7
## 7.552 7.997 7.752 7.528 7.714 7.535 7.582
## Sample_28 Sample_29 Sample_30 Sample_31
## 7.391 8.268 7.498 7.438
colSums(kMat)
## crabHammer eggBomb poisonFang
## 367.7 264.7 287.8
colMeans(kMat)
## crabHammer eggBomb poisonFang
## 9.428 6.788 7.379
all.equal(rowSums(kMat), apply(kMat, 1, sum))
## [1] TRUE
all.equal(colMeans(kMat), apply(kMat, 2, mean))
## [1] TRUE
Obviously you won't notice the performance advantage in kMat but we can detect it with the entire dataset prDat, where we get a substantial speed-up (albeit of little practical significance) using the purpose-built functions over apply(). But both are improvement over for loops. Due to historical problems with lousy memory management, there is a special horror around for() loops in R, although they can be useful. To be clear, of course computations like row-wise means require a for loop to happen somewhere. When you use the apply() family functions this loop is often happening down at a very low level (i.e. down in C, which is what R is written in) and implemented in code written by a better programmer than you or I. And therefore you use less RAM and time, sometimes dramatically less. But the real reason to embrace data aggregation facilities is the huge improvement in code readability and writability.
prDat <- read.table("../photoRecDataset/dataClean/GSE4051_data.tsv")
prDes <- readRDS("../photoRecDataset/dataClean/GSE4051_design.rds")
jRowSums <- rowSums(prDat)
jRowSums <- apply(prDat, 1, sum)
More typical – and conceptually trickier – than the row- and column-wise operations above are operations on groups of observations, where the groups are induced by the levels of some factor (or combinations of multiple factors). We re-focus on data.frames.
Specifically, let's compute average expression of eggBomb for different levels of devStage.
names(kDat)
## [1] "sidChar" "sidNum" "devStage" "gType" "crabHammer"
## [6] "eggBomb" "poisonFang"
levels(kDat$devStage)
## [1] "E16" "P2" "P6" "P10" "4_weeks"
aggregate(eggBomb ~ devStage, kDat, mean)
## devStage eggBomb
## 1 E16 6.879
## 2 P2 6.408
## 3 P6 6.459
## 4 P10 7.143
## 5 4_weeks 7.063
aggregate(eggBomb ~ gType * devStage, kDat, mean)
## gType devStage eggBomb
## 1 wt E16 6.900
## 2 NrlKO E16 6.851
## 3 wt P2 6.607
## 4 NrlKO P2 6.209
## 5 wt P6 6.646
## 6 NrlKO P6 6.273
## 7 wt P10 7.042
## 8 NrlKO P10 7.243
## 9 wt 4_weeks 7.117
## 10 NrlKO 4_weeks 7.008
We are not limited to computing a single value for each group. Although it's silly with such a small dataset, we can use range() to report the min and max.
aggregate(eggBomb ~ gType * devStage, kDat, range)
## gType devStage eggBomb.1 eggBomb.2
## 1 wt E16 6.529 7.462
## 2 NrlKO E16 6.470 7.065
## 3 wt P2 6.170 6.870
## 4 NrlKO P2 6.138 6.269
## 5 wt P6 6.211 7.574
## 6 NrlKO P6 6.188 6.347
## 7 wt P10 6.155 8.173
## 8 NrlKO P10 7.081 7.438
## 9 wt 4_weeks 6.608 7.866
## 10 NrlKO 4_weeks 6.558 7.204
I've picked them for you: 3 are interesting ('hits'), 3 are not. I also reshape the data to be tall and skinny, which is generally a good policy and allows us to keep learning more about data aggregation.
keepGenes <- c("1431708_a_at", "1424336_at", "1454696_at", "1416119_at", "1432141_x_at",
"1429226_at")
miniDat <- subset(prDat, rownames(prDat) %in% keepGenes)
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))), gene = factor(rep(rownames(miniDat),
each = ncol(miniDat)), levels = keepGenes))
miniDat <- suppressWarnings(data.frame(prDes, miniDat))
str(miniDat)
## 'data.frame': 234 obs. of 6 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ gExp : num 9.38 9 9.07 10.35 9.58 ...
## $ gene : Factor w/ 6 levels "1431708_a_at",..: 4 4 4 4 4 4 4 4 4 4 ...
library(lattice)
stripplot(gType ~ gExp | gene, miniDat, scales = list(x = list(relation = "free")),
group = gType, auto.key = T)
Looks right - top=borring, bottom=hits
Recall the syntax of the two-sample t-test for one gene:
t.test(gExp ~ gType, someDat)
Conceptually, we want to make a sub-data.frame for each gene and provide in the place of someDat in a t test call like above. Sometimes that is a useful first step, when building up a data aggregation task. Walk before you run.
someDat <- droplevels(subset(miniDat, gene == keepGenes[1]))
t.test(gExp ~ gType, someDat)
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = 9.838, df = 36.89, p-value = 7.381e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.570 2.384
## sample estimates:
## mean in group wt mean in group NrlKO
## 9.554 7.578
We have now outgrown the capability of aggregate(). If we restrict ourselves to the built-in functions, we'd need to look at functions like tapply(), split(), and by(). However I think it's the right time to start using plyr.
There is more info on plyr in the tutorial on data aggregation
library(plyr)
Since our input, miniDat, is a data.frame, we will use functions that start with d. What do we want to get back, if anything? If we are happy to watch the t test results fly by on the screen, we can use d_ply():
d_ply(miniDat, ~gene, function(x) t.test(gExp ~ gType, x), .print = T)
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = 9.838, df = 36.89, p-value = 7.381e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.570 2.384
## sample estimates:
## mean in group wt mean in group NrlKO
## 9.554 7.578
##
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = 9.061, df = 36.49, p-value = 7.146e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.6239 0.9835
## sample estimates:
## mean in group wt mean in group NrlKO
## 7.670 6.867
##
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = 8.077, df = 33.49, p-value = 2.278e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.402 2.346
## sample estimates:
## mean in group wt mean in group NrlKO
## 12.85 10.97
##
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = -0.184, df = 36.53, p-value = 0.8551
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5079 0.4234
## sample estimates:
## mean in group wt mean in group NrlKO
## 9.893 9.935
##
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = -0.1324, df = 36.31, p-value = 0.8954
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2413 0.2118
## sample estimates:
## mean in group wt mean in group NrlKO
## 7.092 7.107
##
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = 0.0983, df = 35.58, p-value = 0.9223
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1019 0.1123
## sample estimates:
## mean in group wt mean in group NrlKO
## 6.551 6.546
That's not so helpful: the results aren't labelled by probeset and whiz by. In real life, you will want these results for further processing, e.g. writing to file or presenting in a table. We know that t.test() returns a list, so we can use dlply() to retain everything in a new list with one component per probeset:
ttRes <- dlply(miniDat, ~gene, function(x) t.test(gExp ~ gType, x))
names(ttRes)
## [1] "1431708_a_at" "1424336_at" "1454696_at" "1416119_at"
## [5] "1432141_x_at" "1429226_at"
ttRes[["1431708_a_at"]]
##
## Welch Two Sample t-test
##
## data: gExp by gType
## t = 9.838, df = 36.89, p-value = 7.381e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.570 2.384
## sample estimates:
## mean in group wt mean in group NrlKO
## 9.554 7.578
ttRes <- ddply(miniDat, ~gene, function(z) {
zz <- t.test(gExp ~ gType, z)
round(c(tStat = zz$statistic, pVal = zz$p.value), 4)
})
ttRes
## gene tStat.t pVal
## 1 1431708_a_at 9.8380 0.0000
## 2 1424336_at 9.0607 0.0000
## 3 1454696_at 8.0771 0.0000
## 4 1416119_at -0.1840 0.8551
## 5 1432141_x_at -0.1324 0.8954
## 6 1429226_at 0.0983 0.9223
We've now conducted two group comparisons for all 6 genes at once without ever writing a top-level for loop. Use data aggregation functions!
In our last example, can you edit the inner function to use the Wilcoxon or KS test? Or maybe do the t test, the Wilcoxon, and the KS test and return all 3 p-values?
## do a Wilcoxon test on the miniDat set comparing gene expression between wt
## and NrlKO genotypes
wRes <- ddply(miniDat, ~gene, function(z) {
zw <- suppressWarnings(wilcox.test(gExp ~ gType, z))
round(c(stat = zw$statistic, w.pVal = zw$p.value), 4)
})
wRes
## gene stat.W w.pVal
## 1 1431708_a_at 371 0.0000
## 2 1424336_at 379 0.0000
## 3 1454696_at 372 0.0000
## 4 1416119_at 188 0.9668
## 5 1432141_x_at 193 0.9440
## 6 1429226_at 202 0.7465
## do a KS test
ksRes <- ddply(miniDat, ~gene, function(z) {
zks <- suppressWarnings(ks.test(miniDat$gExp[miniDat$gType == "wt"], miniDat$gExp[miniDat$gType ==
"NrlKO"]))
round(c(stat = zks$statistic, ks.pVal = zks$p.value), 4)
})
ksRes
## gene stat.D ks.pVal
## 1 1431708_a_at 0.2408 0.0023
## 2 1424336_at 0.2408 0.0023
## 3 1454696_at 0.2408 0.0023
## 4 1416119_at 0.2408 0.0023
## 5 1432141_x_at 0.2408 0.0023
## 6 1429226_at 0.2408 0.0023
## run all tree tests and produce data.frame with p-values
testsFun <- function(x) {
xt <- t.test(gExp ~ gType, x)
xw <- suppressWarnings(wilcox.test(gExp ~ gType, x))
xks <- suppressWarnings(ks.test(x$gExp[x$gType == "wt"], x$gExp[x$gType ==
"NrlKO"]))
round(c(t.pVal = xt$p.value, w.pVal = xw$p.value, ks.pVal = xks$p.value),
4)
}
testsRes <- ddply(miniDat, ~gene, testsFun)
testsRes
## gene t.pVal w.pVal ks.pVal
## 1 1431708_a_at 0.0000 0.0000 0.0000
## 2 1424336_at 0.0000 0.0000 0.0000
## 3 1454696_at 0.0000 0.0000 0.0000
## 4 1416119_at 0.8551 0.9668 0.7128
## 5 1432141_x_at 0.8954 0.9440 0.4107
## 6 1429226_at 0.9223 0.7465 0.9947
str(testsRes)
## 'data.frame': 6 obs. of 4 variables:
## $ gene : Factor w/ 6 levels "1431708_a_at",..: 1 2 3 4 5 6
## $ t.pVal : num 0 0 0 0.855 0.895 ...
## $ w.pVal : num 0 0 0 0.967 0.944 ...
## $ ks.pVal: num 0 0 0 0.713 0.411 ...
## scatter plot the p values
library(lattice)
xyplot(t.pVal ~ w.pVal, testsRes, xlab = "Wilcoxon test p-value", ylab = "t-test p-value")
xyplot(t.pVal ~ ks.pVal, testsRes, xlab = "KS test p-value", ylab = "t-test p-value")
xyplot(w.pVal ~ ks.pVal, testsRes, xlab = "KS test p-value", ylab = "Wilcoxon test p-value")