Shaun Jackman — Feb 13, 2013, 4:06 PM
# STAT 540 Seminar 6
# Shaun Jackman
# Preliminaries
library(limma)
library(lattice)
prDat <- read.table("../data/photoRec/GSE4051_data.txt") # the whole enchilada
str(prDat, max.level = 0)
'data.frame': 29949 obs. of 39 variables:
load("../data/photoRec/GSE4051_design.robj") # load exp design as 'prDes'
str(prDes)
'data.frame': 39 obs. of 3 variables:
$ sample : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
# The difficulty in estimating gene-wise variance
m <- 1000
n <- 3
x <- matrix(rnorm(m * n), nrow = m)
obsVars <- apply(x, 1, var)
summary(obsVars)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.004 0.288 0.686 0.988 1.340 7.160
mean(obsVars < 1/3)
[1] 0.275
densityplot(~obsVars, n = 200)
# Fit a linear model: explain gene expression in the wild type mice as a function of developmental stage (one-way ANOVA)
wtDes <- subset(prDes, gType == "wt")
str(wtDes)
'data.frame': 20 obs. of 3 variables:
$ sample : num 20 21 22 23 24 25 26 27 28 29 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 2 2 2 2 3 3 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 1 1 1 1 1 1 ...
wtDat <- subset(prDat, select = prDes$gType == "wt")
str(wtDat, max.level = 0)
'data.frame': 29949 obs. of 20 variables:
wtDesMat <- model.matrix(~devStage, wtDes)
str(wtDesMat)
num [1:20, 1:5] 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:20] "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
..$ : chr [1:5] "(Intercept)" "devStageP2" "devStageP6" "devStageP10" ...
- attr(*, "assign")= int [1:5] 0 1 1 1 1
- attr(*, "contrasts")=List of 1
..$ devStage: chr "contr.treatment"
wtFit <- lmFit(wtDat, wtDesMat)
wtEbFit <- eBayes(wtFit)
topTable(wtEbFit)
ID X.Intercept. devStageP2 devStageP6 devStageP10
1423641_s_at 1423641_s_at 12.18 -0.0175 0.0750 0.0675
1438940_x_at 1438940_x_at 12.86 0.0850 0.1325 0.3425
1438657_x_at 1438657_x_at 12.78 0.1400 0.1250 -0.1850
1456736_x_at 1456736_x_at 12.32 0.1625 0.3050 0.2075
1436884_x_at 1436884_x_at 12.93 0.1775 0.3225 0.0300
1419700_a_at 1419700_a_at 12.32 0.1650 0.6475 0.8175
1435800_a_at 1435800_a_at 12.28 0.0450 0.6825 0.9000
1454613_at 1454613_at 12.47 -0.1075 -0.0500 -0.1025
1451240_a_at 1451240_a_at 13.00 0.3100 0.2800 0.2800
1450084_s_at 1450084_s_at 12.63 0.0825 0.0525 0.1725
devStage4_weeks AveExpr F P.Value adj.P.Val
1423641_s_at 0.1800 12.24 45350 3.574e-36 5.201e-32
1438940_x_at 0.3500 13.04 44957 3.865e-36 5.201e-32
1438657_x_at -0.4500 12.71 43486 5.210e-36 5.201e-32
1456736_x_at 0.0725 12.47 39509 1.233e-35 6.725e-32
1436884_x_at 0.0250 13.04 39269 1.302e-35 6.725e-32
1419700_a_at 0.6825 12.78 39121 1.347e-35 6.725e-32
1435800_a_at 1.0200 12.81 36668 2.410e-35 1.031e-31
1454613_at -0.3825 12.34 35835 2.962e-35 1.078e-31
1451240_a_at -0.3700 13.10 35481 3.239e-35 1.078e-31
1450084_s_at 0.2600 12.75 34411 4.265e-35 1.234e-31
topTable(wtEbFit, coef = c("devStageP2", "devStageP6", "devStageP10", "devStage4_weeks"))
ID devStageP2 devStageP6 devStageP10
1440645_at 1440645_at 0.3990 0.1952 0.9200
1416041_at 1416041_at 0.1580 0.4797 0.3327
1425222_x_at 1425222_x_at 0.8820 0.7995 1.5488
1451635_at 1451635_at 1.3025 1.1900 2.0160
1429028_at 1429028_at -2.4433 -3.4073 -4.3105
1422929_s_at 1422929_s_at -2.9117 -3.6182 -3.5472
1424852_at 1424852_at 0.4575 0.2298 0.5740
1425171_at 1425171_at 0.9980 3.0530 5.2787
1451617_at 1451617_at 0.7255 2.5128 4.9837
1451618_at 1451618_at 0.6028 2.8903 5.0507
devStage4_weeks AveExpr F P.Value adj.P.Val
1440645_at 3.961 6.528 425.4 1.588e-17 4.755e-13
1416041_at 5.115 9.383 195.5 1.522e-14 2.280e-10
1425222_x_at 5.532 7.028 173.4 4.348e-14 4.341e-10
1451635_at 6.188 8.319 157.3 1.013e-13 7.585e-10
1429028_at -4.602 8.045 148.8 1.646e-13 9.203e-10
1422929_s_at -3.661 7.278 146.9 1.844e-13 9.203e-10
1424852_at 3.979 7.454 143.2 2.290e-13 9.799e-10
1425171_at 6.079 9.620 138.8 3.002e-13 1.124e-09
1451617_at 6.685 8.817 136.5 3.485e-13 1.160e-09
1451618_at 6.288 9.431 134.2 4.032e-13 1.207e-09
colnames(coef(wtEbFit))
[1] "(Intercept)" "devStageP2" "devStageP6" "devStageP10"
[5] "devStage4_weeks"
dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit))))
# Use the hit list you stored above and your functions for extracting and plotting data to produce this plot for hits 3, 6, and 9 on the list.
library(reshape2)
prepareData <- function(genes)
melt(cbind(prDes, t(prDat[genes,])),
id.vars=colnames(prDes),
variable.name='gene', value.name='gExp')
makeStripplot <- function(x, ...)
stripplot(gExp ~ devStage | gene, x, subset = gType == 'wt',
jitter.data = TRUE, auto.key = TRUE,
type = c("p", "a"), grid = TRUE,
...)
makeStripplot(prepareData(dsHits$ID[c(3,6,9)]))
# Be the boss of topTable()
# How many probes have Benjamini-Hochberg ("BH") adjusted p-values for the F test conducted above that are less than 1e-05?
hits = topTable(wtEbFit, coef = c("devStageP2", "devStageP6", "devStageP10", "devStage4_weeks"),
p.value=1e-5, number=9999)
nrow(hits)
[1] 350
# What is the 63rd hit on this list? Provide it's Affy ID, F statistic, BH adjusted p-value, and the estimated effect for developmental stage "P6" in that order.
hits[63,c('F', 'adj.P.Val', 'devStageP6')]
F adj.P.Val devStageP6
1451633_a_at 64.01 1.049e-07 2.069
# Scatterplot the t statistics for the test that the P2 effect is zero against that for P10.
hits <- topTable(wtEbFit, coef = c('devStageP2', 'devStageP10'), number=99999)
smoothScatter(hits$devStageP2, hits$devStageP10, asp=1)
KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
# Create a densityplot of the associated p-values, so you can get a sense of which developmental stage, P2 or P10, is more clearly distinguished from baseline E16.
P2Hits <- topTable(wtEbFit, coef = 'devStageP2', number=99999)
P10Hits <- topTable(wtEbFit, coef = 'devStageP10', number=99999)
densityplot(~P10Hits$adj.P.Val + P2Hits$adj.P.Val)
# Is this what you'd expect?
# Yes, it makes sense that P10 is more distinguished than P2 from the baseline E16.
# If you require a BH adjusted p-value less than 1e-03, how many hits do you get for P2? How many for P10? How much overlap is there?
P2Hits <- topTable(wtEbFit, coef = 'devStageP2', p.value=1e-3, number=99999)
P10Hits <- topTable(wtEbFit, coef = 'devStageP10', p.value=1e-3, number=99999)
length(intersect(P2Hits$ID, P10Hits$ID))
[1] 52
# Now just focus on the P10 effect. Create a scatterplot matrix of raw p-values, BH adjusted p-values, and BY p-values.
xyplot(adj.P.Val ~ P.Value, P10Hits)
P10Hits <- topTable(wtEbFit, coef = 'devStageP10', adjust.method='BY', p.value=1e-3, number=99999)
xyplot(adj.P.Val ~ P.Value, P10Hits)
# Perform inference for some contrasts
colnames(wtDesMat)
[1] "(Intercept)" "devStageP2" "devStageP6" "devStageP10"
[5] "devStage4_weeks"
cont.matrix <- makeContrasts(P10VsP6 = devStageP10 - devStageP6,
fourweeksVsP10 = devStage4_weeks - devStageP10, levels = wtDesMat)
Warning: Renaming (Intercept) to Intercept
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
topTable(wtEbFitCont)
ID P10VsP6 fourweeksVsP10 AveExpr F P.Value
1440645_at 1440645_at 0.7247 3.041 6.528 632.7 2.224e-17
1416041_at 1416041_at -0.1470 4.782 9.383 302.4 1.473e-14
1425222_x_at 1425222_x_at 0.7492 3.983 7.028 235.4 1.300e-13
1424852_at 1424852_at 0.3442 3.405 7.454 225.1 1.910e-13
1420726_x_at 1420726_x_at 0.1732 3.551 7.190 203.5 4.555e-13
1451635_at 1451635_at 0.8260 4.172 8.319 200.0 5.289e-13
1429394_at 1429394_at -0.0980 2.410 7.848 167.5 2.416e-12
1455447_at 1455447_at -0.9765 -1.800 9.973 153.5 5.063e-12
1429791_at 1429791_at 0.2480 1.658 8.026 145.7 7.877e-12
1422612_at 1422612_at 0.4837 3.426 8.833 142.2 9.676e-12
adj.P.Val
1440645_at 6.662e-13
1416041_at 2.206e-10
1425222_x_at 1.297e-09
1424852_at 1.430e-09
1420726_x_at 2.640e-09
1451635_at 2.640e-09
1429394_at 1.034e-08
1455447_at 1.896e-08
1429791_at 2.621e-08
1422612_at 2.840e-08
# The top hits are probes where there is big change from P6 to P10, from P10 to 4_weeks, or both. Let's check that by plotting the data from the top 4 hits.
makeStripplot(prepareData(rownames(topTable(wtEbFitCont))[1:4]))
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
P10VsP6 fourweeksVsP10
-1 4 8
0 29945 29895
1 0 46
hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)]
makeStripplot(prepareData(hits1))
hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)]
makeStripplot(prepareData(hits2[1:4]))
intersect(hits1, hits2)
character(0)
hits3 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)]
makeStripplot(prepareData(hits3[1:4]))
intersect(hits1, hits3)
character(0)
intersect(hits2, hits3)
character(0)
cutoff <- 0.01
nHits <- 8
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
P10VsP6 fourweeksVsP10
-1 40 49
0 29897 29636
1 12 264
hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)]
makeStripplot(prepareData(hits1[1:nHits]))
hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)]
makeStripplot(prepareData(hits2[1:nHits]))
hits3 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0)]
makeStripplot(prepareData(hits3[1:nHits]))
hits4 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)]
makeStripplot(prepareData(hits4[1:nHits]))
vennDiagram(wtResCont)
hits5 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] != 0 & wtResCont[, "fourweeksVsP10"] != 0)]
makeStripplot(prepareData(hits5))
hits6 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0 & wtResCont[, "fourweeksVsP10"] < 0)]
makeStripplot(prepareData(hits6))
# Take-home exercise: See if you can find one or more probes that have some expression changes up to P6 and then hold steady all the way to 4_weeks. Here's some I found.
cont.matrix <- makeContrasts(
P2VsE16 = devStageP2,
P6VsP2 = devStageP6 - devStageP2,
P10VsP6 = devStageP10 - devStageP6,
fourweeksVsP10 = devStage4_weeks - devStageP10,
levels = wtDesMat)
Warning: Renaming (Intercept) to Intercept
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
topTable(wtEbFitCont)
ID P2VsE16 P6VsP2 P10VsP6 fourweeksVsP10 AveExpr
1440645_at 1440645_at 0.3990 -0.2037 0.7247 3.0413 6.528
1416041_at 1416041_at 0.1580 0.3218 -0.1470 4.7818 9.383
1425222_x_at 1425222_x_at 0.8820 -0.0825 0.7492 3.9830 7.028
1451635_at 1451635_at 1.3025 -0.1125 0.8260 4.1723 8.319
1429028_at 1429028_at -2.4433 -0.9640 -0.9033 -0.2913 8.045
1422929_s_at 1422929_s_at -2.9117 -0.7065 0.0710 -0.1140 7.278
1424852_at 1424852_at 0.4575 -0.2277 0.3442 3.4050 7.454
1425171_at 1425171_at 0.9980 2.0550 2.2257 0.8000 9.620
1451617_at 1451617_at 0.7255 1.7872 2.4710 1.7010 8.817
1451618_at 1451618_at 0.6028 2.2875 2.1605 1.2375 9.431
F P.Value adj.P.Val
1440645_at 425.4 1.588e-17 4.755e-13
1416041_at 195.5 1.522e-14 2.280e-10
1425222_x_at 173.4 4.348e-14 4.341e-10
1451635_at 157.3 1.013e-13 7.585e-10
1429028_at 148.8 1.646e-13 9.203e-10
1422929_s_at 146.9 1.844e-13 9.203e-10
1424852_at 143.2 2.290e-13 9.799e-10
1425171_at 138.8 3.002e-13 1.124e-09
1451617_at 136.5 3.485e-13 1.160e-09
1451618_at 134.2 4.032e-13 1.207e-09
wtResCont <- decideTests(wtEbFitCont, p.value = 0.05, method = "global")
summary(wtResCont)
P2VsE16 P6VsP2 P10VsP6 fourweeksVsP10
-1 471 28 116 129
0 29421 29871 29804 29154
1 57 50 29 666
hits <- rownames(prDat)[which(
wtResCont[, "P2VsE16"] != 0
& wtResCont[, "P6VsP2"] != 0
& wtResCont[, "P10VsP6"] == 0
& wtResCont[, "fourweeksVsP10"] == 0)]
length(hits)
[1] 6
wtResCont[hits,]
Contrasts
P2VsE16 P6VsP2 P10VsP6 fourweeksVsP10
1419740_at 1 1 0 0
1421821_at -1 -1 0 0
1424963_at 1 1 0 0
1435661_at -1 1 0 0
1436557_at -1 -1 0 0
1448201_at -1 -1 0 0
makeStripplot(prepareData(hits))