We will begin by loading the required packages and data
source("http://www.bioconductor.org/biocLite.R")
## Bioconductor version 2.13 (BiocInstaller 1.12.0), ?biocLite for help
biocLite("limma")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 2.13 (BiocInstaller 1.12.0), R version 3.0.2.
## Installing package(s) 'limma'
##
## The downloaded binary packages are in
## /var/folders/WA/WAb0hePTGni0M7T1nJCmZU+++TI/-Tmp-//RtmpumoQjg/downloaded_packages
biocLite("statmod")
## BioC_mirror: http://bioconductor.org
## Using Bioconductor version 2.13 (BiocInstaller 1.12.0), R version 3.0.2.
## Installing package(s) 'statmod'
##
## The downloaded binary packages are in
## /var/folders/WA/WAb0hePTGni0M7T1nJCmZU+++TI/-Tmp-//RtmpumoQjg/downloaded_packages
library(limma)
library(lattice)
library(plyr)
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
We will also write a few functions that we will use later on.
prepareData <- function(a) {
dat <- prDat[a, ]
dat <- data.frame(gExp = as.vector(t(as.matrix(dat))), gene = factor(rep(rownames(dat),
each = ncol(dat)), levels = rownames(dat)))
dat <- suppressWarnings(data.frame(prDes, dat))
dat <- subset(dat, subset = gType == "wt")
return(dat)
}
makeStripplot <- function(x, ...) {
stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = TRUE,
auto.key = FALSE, type = c("p", "a"), par.settings = simpleTheme(col = c("dodgerblue",
"deeppink1"), pch = 16), grid = TRUE, ...)
}
Fit.LM <- function(x) {
fitA <- lm(gExp ~ devStage, data = x)
return(fitA)
}
lmfp <- function(modelobject) {
if (class(modelobject) != "lm")
stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1], f[2], f[3], lower.tail = F)
attributes(p) <- NULL
return(c(F = f[1], P.value = p))
}
Let's do a simulation to illustrate the poor variance estimates for small sample (n) sizes. We will simulate data for 1000 genes; for each gene, we will have 3 observations from a standard normal distribution N(0,1).
set.seed(3)
m <- 1000
n <- 3
x <- matrix(rnorm(m * n), nrow = m)
# Inspecting the observed gene-wise variances
obsVars <- apply(x, 1, var)
summary(obsVars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001 0.277 0.678 1.020 1.430 6.900
mean(obsVars < 1/3)
## [1] 0.3
densityplot(~obsVars, n = 200)
We can observe that many of the observed variances are very small, even though they are equal to 1 “on average” (for a standard normal distribution). For example, we see that at least a quarter of the genes appear to exhibit a sample variance that is less than one-third the true variance. This can wreak havoc with statistical inference, such as t-statistics. Accordingly, limma is designed to combat this problem.
wtDes <- subset(prDes, gType == "wt")
str(wtDes)
## 'data.frame': 20 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 24 25 26 27 28 29 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 1 1 1 1 1 1 ...
wtDat <- subset(prDat, select = prDes$gType == "wt")
str(wtDat, max.level = 0)
## 'data.frame': 29949 obs. of 20 variables:
# Make a model matrix using the default 'ref + treatment effects' scheme for
# handling the devStage factor
wtDesMat <- model.matrix(~devStage, wtDes)
str(wtDesMat)
## num [1:20, 1:5] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:20] "12" "13" "14" "15" ...
## ..$ : chr [1:5] "(Intercept)" "devStageP2" "devStageP6" "devStageP10" ...
## - attr(*, "assign")= int [1:5] 0 1 1 1 1
## - attr(*, "contrasts")=List of 1
## ..$ devStage: chr "contr.treatment"
# Now we will fit the model, for all probes at once, and use eBayes() to
# moderate the estimated error variances
wtFit <- lmFit(wtDat, wtDesMat)
wtEbFit <- eBayes(wtFit)
topTable(wtEbFit)
## X.Intercept. devStageP2 devStageP6 devStageP10
## 1423641_s_at 12.18 -0.0175 0.0750 0.0675
## 1438940_x_at 12.86 0.0850 0.1325 0.3425
## 1438657_x_at 12.78 0.1400 0.1250 -0.1850
## 1456736_x_at 12.32 0.1625 0.3050 0.2075
## 1436884_x_at 12.93 0.1775 0.3225 0.0300
## 1419700_a_at 12.32 0.1650 0.6475 0.8175
## 1435800_a_at 12.28 0.0450 0.6825 0.9000
## 1454613_at 12.47 -0.1075 -0.0500 -0.1025
## 1451240_a_at 13.00 0.3100 0.2800 0.2800
## 1450084_s_at 12.63 0.0825 0.0525 0.1725
## devStage4_weeks AveExpr F P.Value adj.P.Val
## 1423641_s_at 0.1800 12.24 45350 3.574e-36 5.201e-32
## 1438940_x_at 0.3500 13.04 44957 3.865e-36 5.201e-32
## 1438657_x_at -0.4500 12.71 43486 5.210e-36 5.201e-32
## 1456736_x_at 0.0725 12.47 39509 1.233e-35 6.725e-32
## 1436884_x_at 0.0250 13.04 39269 1.302e-35 6.725e-32
## 1419700_a_at 0.6825 12.78 39121 1.347e-35 6.725e-32
## 1435800_a_at 1.0200 12.81 36668 2.410e-35 1.031e-31
## 1454613_at -0.3825 12.34 35835 2.962e-35 1.078e-31
## 1451240_a_at -0.3700 13.10 35481 3.239e-35 1.078e-31
## 1450084_s_at 0.2600 12.75 34411 4.265e-35 1.234e-31
# You'll see that, by default, topTable() reports the top 10 hits. But let's
# take more care and specify explicitly the coefficients we want to test for
# equality with zero. Recall that one can specify these by number but I
# recommend doing this by name.
topTable(wtEbFit, coef = 2:5) # warning: this code is cryptic and error-prone!
## devStageP2 devStageP6 devStageP10 devStage4_weeks AveExpr
## 1440645_at 0.3990 0.1952 0.9200 3.961 6.528
## 1416041_at 0.1580 0.4797 0.3327 5.115 9.383
## 1425222_x_at 0.8820 0.7995 1.5488 5.532 7.028
## 1451635_at 1.3025 1.1900 2.0160 6.188 8.319
## 1429028_at -2.4433 -3.4073 -4.3105 -4.602 8.045
## 1422929_s_at -2.9117 -3.6182 -3.5472 -3.661 7.278
## 1424852_at 0.4575 0.2298 0.5740 3.979 7.454
## 1425171_at 0.9980 3.0530 5.2787 6.079 9.620
## 1451617_at 0.7255 2.5128 4.9837 6.685 8.817
## 1451618_at 0.6028 2.8903 5.0507 6.288 9.431
## F P.Value adj.P.Val
## 1440645_at 425.4 1.588e-17 4.755e-13
## 1416041_at 195.5 1.522e-14 2.280e-10
## 1425222_x_at 173.4 4.348e-14 4.341e-10
## 1451635_at 157.3 1.013e-13 7.585e-10
## 1429028_at 148.8 1.646e-13 9.203e-10
## 1422929_s_at 146.9 1.844e-13 9.203e-10
## 1424852_at 143.2 2.290e-13 9.799e-10
## 1425171_at 138.8 3.002e-13 1.124e-09
## 1451617_at 136.5 3.485e-13 1.160e-09
## 1451618_at 134.2 4.032e-13 1.207e-09
colnames(coef(wtEbFit))
## [1] "(Intercept)" "devStageP2" "devStageP6" "devStageP10"
## [5] "devStage4_weeks"
(dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit)))))
## devStageP2 devStageP6 devStageP10 devStage4_weeks AveExpr
## 1440645_at 0.3990 0.1952 0.9200 3.961 6.528
## 1416041_at 0.1580 0.4797 0.3327 5.115 9.383
## 1425222_x_at 0.8820 0.7995 1.5488 5.532 7.028
## 1451635_at 1.3025 1.1900 2.0160 6.188 8.319
## 1429028_at -2.4433 -3.4073 -4.3105 -4.602 8.045
## 1422929_s_at -2.9117 -3.6182 -3.5472 -3.661 7.278
## 1424852_at 0.4575 0.2298 0.5740 3.979 7.454
## 1425171_at 0.9980 3.0530 5.2787 6.079 9.620
## 1451617_at 0.7255 2.5128 4.9837 6.685 8.817
## 1451618_at 0.6028 2.8903 5.0507 6.288 9.431
## F P.Value adj.P.Val
## 1440645_at 425.4 1.588e-17 4.755e-13
## 1416041_at 195.5 1.522e-14 2.280e-10
## 1425222_x_at 173.4 4.348e-14 4.341e-10
## 1451635_at 157.3 1.013e-13 7.585e-10
## 1429028_at 148.8 1.646e-13 9.203e-10
## 1422929_s_at 146.9 1.844e-13 9.203e-10
## 1424852_at 143.2 2.290e-13 9.799e-10
## 1425171_at 138.8 3.002e-13 1.124e-09
## 1451617_at 136.5 3.485e-13 1.160e-09
## 1451618_at 134.2 4.032e-13 1.207e-09
Using the hit list from the above topTable, we will extract and make stripplots for hits 3, 6 and 9 on the list.
interesting_hits <- rownames(dsHits[c(3, 6, 9), ])
makeStripplot(some.hits <- prepareData(interesting_hits))
We will fit a linear model to each of these 3 probes to check if the F stats and p-values are similar to the topTable values produced by limma.
fit3 <- dlply(some.hits, ~gene, Fit.LM)
(fp.summary <- ldply(fit3, lmfp))
## gene F.value P.value
## 1 1425222_x_at 159.3 4.306e-12
## 2 1422929_s_at 144.3 8.858e-12
## 3 1451617_at 119.1 3.578e-11
(dsHits[c(3, 6, 9), c(6, 7)])
## F P.Value
## 1425222_x_at 173.4 4.348e-14
## 1422929_s_at 146.9 1.844e-13
## 1451617_at 136.5 3.485e-13
We can observe that the F-statistic and p-values generated from lm and limma are close but not exactly equal. This is because limma has moderated the estimated error variance.
To practice extracting information from topTable: How many probes have Benjamini-Hochberg (“BH”) adjusted p-values < than 1e-05?
topx <- topTable(wtEbFit, number = Inf, p.value = 1e-05, coef = grep("devStage",
colnames(coef(wtEbFit))))
nrow(topx)
## [1] 350
Pulling out the 63rd hit on the above list:
(topx[63, c("F", "adj.P.Val", "devStageP6")])
## F adj.P.Val devStageP6
## 1451633_a_at 64.01 1.049e-07 2.069
Scatterplot the t-statistics for the test that the P2 effect is zero against that of P10.
P2.hits <- topTable(wtEbFit, number = Inf, sort.by = "none", coef = grep("devStageP2",
colnames(coef(wtEbFit))))
P10.hits <- topTable(wtEbFit, number = Inf, sort.by = "none", coef = grep("devStageP10",
colnames(coef(wtEbFit))))
smoothScatter(P10.hits$t ~ P2.hits$t, xlim = c(-20, 20), ylim = c(-20, 20),
xlab = "t-statistic for P2 effect", ylab = "t-statistic for P10 effect")
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
abline(a = 0, b = 1, col = "orange")
Density plot of the associated p-values:
densityplot(~P10.hits$adj.P.Val + P2.hits$adj.P.Val, auto.key = T, plot.points = F)
The p-value distribution for P10 has a positive skew (to the right), we can say that P10 is more clearly distinguished from E16.
Hits with BH adjusted p-values <1e-03 for P2 vs. P10:
addmargins(table(P2.hits$adj.P.Val < 0.001, P10.hits$adj.P.Val < 0.001, dnn = c("P2",
"P10")))
## P10
## P2 FALSE TRUE Sum
## FALSE 29201 695 29896
## TRUE 1 52 53
## Sum 29202 747 29949
Observe 53 hits for P2, 747 for P10 and an overlap of 52 hits
Scatterplot matrix of raw p-values, BH adjusted p-values and BY
P10.hits.BY <- topTable(wtEbFit, number = Inf, sort.by = "none", adjust.method = "BY",
coef = grep("devStageP10", colnames(coef(wtEbFit))))
pdat <- data.frame(raw = P10.hits$P.Value, BH = P10.hits$adj.P.Val, BY = P10.hits.BY$adj.P.Val)
head(pdat)
## raw BH BY
## 1 0.5156 0.6027 1
## 2 0.0420 0.1056 1
## 3 0.1602 0.2498 1
## 4 0.9666 0.9742 1
## 5 0.1911 0.2839 1
## 6 0.2164 0.3112 1
splom(pdat, cex = 0.1, col = "red")
Perform inference for some contrasts
colnames(wtDesMat)
## [1] "(Intercept)" "devStageP2" "devStageP6" "devStageP10"
## [5] "devStage4_weeks"
(cont.matrix <- makeContrasts(P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks -
devStageP10, levels = wtDesMat))
## Warning: Renaming (Intercept) to Intercept
## Contrasts
## Levels P10VsP6 fourweeksVsP10
## Intercept 0 0
## devStageP2 0 0
## devStageP6 -1 0
## devStageP10 1 -1
## devStage4_weeks 0 1
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
## Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
topDev <- topTable(wtEbFitCont)
makeStripplot(top_4_hits <- prepareData(rownames(topDev[c(1, 2, 3, 4), ])))
Let's use decideTests() to adjust the p-values for both contrasts globally, i.e. all together and then threshhold them at a cutoff of 1e-04.
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
## P10VsP6 fourweeksVsP10
## -1 4 8
## 0 29945 29895
## 1 0 46
Here are the 4 probes that decline in expression from P6 to P10.
(hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)])
## [1] "1416635_at" "1437781_at" "1454752_at" "1455260_at"
makeStripplot(prepareData(hits1))
Here are 4 of the 8 probes that decline in expression from P10 to 4_weeks.
(hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)])
## [1] "1416021_a_at" "1423851_a_at" "1434500_at" "1435415_x_at"
## [5] "1437502_x_at" "1448182_a_at" "1452679_at" "1455447_at"
makeStripplot(prepareData(hits2[1:4]))
No overlap between these probe sets
intersect(hits1, hits2)
## character(0)
Here are 4 of the 46 probes that increase in expression from P10 to 4_weeks.
(hits3 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)])
## [1] "1416041_at" "1417280_at" "1418406_at" "1418710_at"
## [5] "1418789_at" "1419069_at" "1420725_at" "1420726_x_at"
## [9] "1421061_at" "1421818_at" "1422612_at" "1424852_at"
## [13] "1424895_at" "1425222_x_at" "1426059_at" "1426223_at"
## [17] "1427388_at" "1428763_at" "1429394_at" "1429791_at"
## [21] "1430580_at" "1431174_at" "1433699_at" "1434297_at"
## [25] "1434573_at" "1435436_at" "1435679_at" "1435727_s_at"
## [29] "1436265_at" "1436287_at" "1440402_at" "1440645_at"
## [33] "1441518_at" "1442243_at" "1443252_at" "1446484_at"
## [37] "1449170_at" "1449393_at" "1451042_a_at" "1451635_at"
## [41] "1452243_at" "1453385_at" "1455493_at" "1457878_at"
## [45] "1458418_at" "1459904_at"
makeStripplot(prepareData(hits3[1:4]))
intersect(hits1, hits3)
## character(0)
intersect(hits2, hits3)
## character(0)
No overlap between these probe sets either!
Let's try this again with a less stringent p-value cutoff of 0.01
cutoff <- 0.01
nHits <- 8
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
## P10VsP6 fourweeksVsP10
## -1 40 49
## 0 29897 29636
## 1 12 264
hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)]
makeStripplot(prepareData(hits1[1:nHits]))
hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)]
makeStripplot(prepareData(hits2[1:nHits]))
hits3 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0)]
makeStripplot(prepareData(hits3[1:nHits]))
hits4 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)]
makeStripplot(prepareData(hits4[1:nHits]))
vennDiagram(wtResCont)
hits5 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] != 0 & wtResCont[, "fourweeksVsP10"] !=
0)]
makeStripplot(prepareData(hits5))
hits6 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0 & wtResCont[, "fourweeksVsP10"] <
0)]
makeStripplot(prepareData(hits6))
We will now try to find one or more probes that have some expression changes up to P6 and then hold steady all the way to 4_weeks.
(cont.matrix1 <- makeContrasts(P2vsE16 = devStageP2 - Intercept, P6vsP2 = devStageP6 -
devStageP2, P10vsP6 = devStageP10 - devStageP6, fourweeksvsP10 = devStage4_weeks -
devStageP10, levels = wtDesMat))
## Warning: Renaming (Intercept) to Intercept
## Contrasts
## Levels P2vsE16 P6vsP2 P10vsP6 fourweeksvsP10
## Intercept -1 0 0 0
## devStageP2 1 -1 0 0
## devStageP6 0 1 -1 0
## devStageP10 0 0 1 -1
## devStage4_weeks 0 0 0 1
wtFitCont1 <- contrasts.fit(wtFit, cont.matrix1)
## Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont1 <- eBayes(wtFitCont1)
topDev1 <- topTable(wtEbFitCont1)
cutoff <- 1e-04
wtResCont1 <- decideTests(wtEbFitCont1, p.value = cutoff, method = "global")
summary(wtResCont1)
## P2vsE16 P6vsP2 P10vsP6 fourweeksvsP10
## -1 29903 0 32 37
## 0 46 29937 29907 29712
## 1 0 12 10 200
hitz <- rownames(prDat)[which(wtResCont1[, "P2vsE16"] != 0 & wtResCont1[, "P6vsP2"] !=
0 & wtResCont1[, "P10vsP6"] == 0 & wtResCont1[, "fourweeksvsP10"] == 0)]
makeStripplot(prepareData(hitz))
As we can see, there are 10 probes that show some expression changes up to P6 and then hold steady all the way to 4_weeks.