omaromeir — Feb 13, 2014, 10:19 PM
library(lattice)
library(limma)
prDat <- read.table("../data/GSE4051_data.tsv")
prDes <- readRDS("../data/GSE4051_design.rds")
source("../Seminar05/seminar05.R")
'data.frame': 29949 obs. of 39 variables:
'data.frame': 39 obs. of 4 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
'data.frame': 78 obs. of 6 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
$ gExp : num 10.93 10.74 10.67 10.68 9.61 ...
$ gene : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
'data.frame': 39 obs. of 6 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
$ gExp : num 7.04 7.48 7.37 6.94 6.16 ...
$ gene : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...
'data.frame': 39 obs. of 6 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
$ gExp : num 7.94 8.99 8.55 8.62 5.72 ...
$ gene : Factor w/ 1 level "1438786_a_at": 1 1 1 1 1 1 1 1 1 1 ...
m <- 1000
n <- 3
x <- matrix(rnorm(m * n), nrow = m)
obsVars <- apply(x, 1, var)
summary(obsVars)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.274 0.649 0.948 1.310 7.150
mean(obsVars < 1/3)
[1] 0.297
densityplot(~obsVars, n=200)
#take home exercise: Make the above simulation more realistic with two (or more) groups, different data-generating means and group differences, different data-generating gene-wise variances, etc.
#2 different groups here
m <- 1000
n <- 3
g <- 2
x <- matrix(rnorm(m * n, mean = rep(c(1, 2), each=(nrow(x)/g)), sd = rep(c(sqrt(1), sqrt(2)), each=(nrow(x)/g))), nrow = m)
gNames <- paste0("Group ", seq(1, g))
gMat <- cbind(rep(gNames , each=(nrow(x)/g)), x)
colnames(gMat) <- c("Group", "dat1", "dat2", "dat3")
obsVars <- apply(gMat[,grep("dat", colnames(gMat))], 1, var)
summary(obsVars)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.001 0.365 0.897 1.420 1.930 12.600
mean(obsVars < 1/3)
[1] 0.228
densityplot(~obsVars, n=200)
#Fit a linear model
wtDes <- subset(prDes, gType == "wt")
str(wtDes)
'data.frame': 20 obs. of 4 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 24 25 26 27 28 29 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 2 2 2 2 3 3 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 1 1 1 1 1 1 ...
wtDat <- subset(prDat, select = prDes$gType == "wt")
str(wtDat, max.level = 0)
'data.frame': 29949 obs. of 20 variables:
wtDesMat <- model.matrix(~devStage, wtDes)
str(wtDesMat)
num [1:20, 1:5] 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:20] "12" "13" "14" "15" ...
..$ : chr [1:5] "(Intercept)" "devStageP2" "devStageP6" "devStageP10" ...
- attr(*, "assign")= int [1:5] 0 1 1 1 1
- attr(*, "contrasts")=List of 1
..$ devStage: chr "contr.treatment"
wtFit <- lmFit(wtDat, wtDesMat)
wtEbFit <- eBayes(wtFit)
topTable(wtEbFit)
X.Intercept. devStageP2 devStageP6 devStageP10
1423641_s_at 12.18 -0.0175 0.0750 0.0675
1438940_x_at 12.86 0.0850 0.1325 0.3425
1438657_x_at 12.78 0.1400 0.1250 -0.1850
1456736_x_at 12.32 0.1625 0.3050 0.2075
1436884_x_at 12.93 0.1775 0.3225 0.0300
1419700_a_at 12.32 0.1650 0.6475 0.8175
1435800_a_at 12.28 0.0450 0.6825 0.9000
1454613_at 12.47 -0.1075 -0.0500 -0.1025
1451240_a_at 13.00 0.3100 0.2800 0.2800
1450084_s_at 12.63 0.0825 0.0525 0.1725
devStage4_weeks AveExpr F P.Value adj.P.Val
1423641_s_at 0.1800 12.24 45350 3.574e-36 5.201e-32
1438940_x_at 0.3500 13.04 44957 3.865e-36 5.201e-32
1438657_x_at -0.4500 12.71 43486 5.210e-36 5.201e-32
1456736_x_at 0.0725 12.47 39509 1.233e-35 6.725e-32
1436884_x_at 0.0250 13.04 39269 1.302e-35 6.725e-32
1419700_a_at 0.6825 12.78 39121 1.347e-35 6.725e-32
1435800_a_at 1.0200 12.81 36668 2.410e-35 1.031e-31
1454613_at -0.3825 12.34 35835 2.962e-35 1.078e-31
1451240_a_at -0.3700 13.10 35481 3.239e-35 1.078e-31
1450084_s_at 0.2600 12.75 34411 4.265e-35 1.234e-31
colnames(coef(wtEbFit))
[1] "(Intercept)" "devStageP2" "devStageP6" "devStageP10"
[5] "devStage4_weeks"
(dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit)))))
devStageP2 devStageP6 devStageP10 devStage4_weeks AveExpr
1440645_at 0.3990 0.1952 0.9200 3.961 6.528
1416041_at 0.1580 0.4797 0.3327 5.115 9.383
1425222_x_at 0.8820 0.7995 1.5488 5.532 7.028
1451635_at 1.3025 1.1900 2.0160 6.188 8.319
1429028_at -2.4433 -3.4073 -4.3105 -4.602 8.045
1422929_s_at -2.9117 -3.6182 -3.5472 -3.661 7.278
1424852_at 0.4575 0.2298 0.5740 3.979 7.454
1425171_at 0.9980 3.0530 5.2787 6.079 9.620
1451617_at 0.7255 2.5128 4.9837 6.685 8.817
1451618_at 0.6028 2.8903 5.0507 6.288 9.431
F P.Value adj.P.Val
1440645_at 425.4 1.588e-17 4.755e-13
1416041_at 195.5 1.522e-14 2.280e-10
1425222_x_at 173.4 4.348e-14 4.341e-10
1451635_at 157.3 1.013e-13 7.585e-10
1429028_at 148.8 1.646e-13 9.203e-10
1422929_s_at 146.9 1.844e-13 9.203e-10
1424852_at 143.2 2.290e-13 9.799e-10
1425171_at 138.8 3.002e-13 1.124e-09
1451617_at 136.5 3.485e-13 1.160e-09
1451618_at 134.2 4.032e-13 1.207e-09
stripplot(gExp ~ devStage | gene, subset= gType=="wt", prepareData(rownames(dsHits)[c(3, 6, 9)]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
#Does it look plausible to you that -- using only wild type data -- these probes show the most compelling evidence for expression change over development? yep
#Exercise: lm() on one or all 3 of these probes and check if the F stats and p-values are similar. Don't expect exact equality because you must remember that limma has moderated the estimated error variance.
mFit <- lm(formula = gExp ~ devStage, data=prepareData(rownames(dsHits)[3]), subset = gType=="wt")
summary(mFit)
Call:
lm(formula = gExp ~ devStage, data = prepareData(rownames(dsHits)[3]),
subset = gType == "wt")
Residuals:
Min 1Q Median 3Q Max
-0.4847 -0.2157 -0.0077 0.1781 0.8315
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.276 0.173 30.50 6.5e-15 ***
devStageP2 0.882 0.245 3.61 0.0026 **
devStageP6 0.800 0.245 3.27 0.0052 **
devStageP10 1.549 0.245 6.33 1.3e-05 ***
devStage4_weeks 5.532 0.245 22.61 5.3e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.346 on 15 degrees of freedom
Multiple R-squared: 0.977, Adjusted R-squared: 0.971
F-statistic: 159 on 4 and 15 DF, p-value: 4.31e-12
#The numbers are similar, though not exactly the same
#Be the boss of top table
newHits <- topTable(wtEbFit, p.value=1e-05, number=nrow(wtDat), coef = grep("devStage", colnames(coef(wtEbFit))))
str(newHits)
'data.frame': 350 obs. of 8 variables:
$ devStageP2 : num 0.399 0.158 0.882 1.303 -2.443 ...
$ devStageP6 : num 0.195 0.48 0.8 1.19 -3.407 ...
$ devStageP10 : num 0.92 0.333 1.549 2.016 -4.311 ...
$ devStage4_weeks: num 3.96 5.11 5.53 6.19 -4.6 ...
$ AveExpr : num 6.42 8.83 7.45 6.27 7.76 ...
$ F : num 425 195 173 157 149 ...
$ P.Value : num 1.59e-17 1.52e-14 4.35e-14 1.01e-13 1.65e-13 ...
$ adj.P.Val : num 4.76e-13 2.28e-10 4.34e-10 7.58e-10 9.20e-10 ...
newHits[63, c("F", "adj.P.Val", 'devStageP6')]
F adj.P.Val devStageP6
1451633_a_at 64.01 1.049e-07 2.069
p2Hits <- topTable(wtEbFit, number=nrow(wtDat), coef = "devStageP2", sort.by="none")
str(p2Hits)
'data.frame': 29949 obs. of 6 variables:
$ logFC : num -0.04825 -0.613 -0.38175 -0.00025 -0.1695 ...
$ AveExpr : num 7.22 9.3 9.73 8.44 8.47 ...
$ t : num -0.34501 -1.82865 -1.74759 -0.00177 -1.86842 ...
$ P.Value : num 0.7341 0.0841 0.0976 0.9986 0.0781 ...
$ adj.P.Val: num 0.792 0.168 0.184 0.999 0.161 ...
$ B : num -6.27 -4.77 -4.89 -6.33 -4.71 ...
p10Hits <- topTable(wtEbFit, number=nrow(wtDat), coef = "devStageP10", sort.by="none")
str(p10Hits)
'data.frame': 29949 obs. of 6 variables:
$ logFC : num -0.0927 -0.734 -0.32 0.006 -0.1232 ...
$ AveExpr : num 7.22 9.3 9.73 8.44 8.47 ...
$ t : num -0.6632 -2.1896 -1.4649 0.0424 -1.3586 ...
$ P.Value : num 0.516 0.042 0.16 0.967 0.191 ...
$ adj.P.Val: num 0.603 0.106 0.25 0.974 0.284 ...
$ B : num -6.55 -4.56 -5.72 -6.77 -5.86 ...
xyplot(p10Hits$t ~ p2Hits$t, scales= list(limit = c(-20, 20)), asp = 1, panel=function(...) {panel.smoothScatter(...); panel.abline(0,1,col=3)})
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
(loaded the KernSmooth namespace)
densityplot(~ p10Hits$adj.P.Val + p2Hits$adj.P.Val, auto.key=TRUE, type = 'p')
addmargins(table(p2Hits$adj.P.Val < 1e-03, p10Hits$adj.P.Val < 1e-03, dnn=c("p2", "p10")))
p10
p2 FALSE TRUE Sum
FALSE 29201 695 29896
TRUE 1 52 53
Sum 29202 747 29949
p10HitsBY <- topTable(wtEbFit, number=nrow(wtDat), coef = "devStageP10", adjust.method="BY", sort.by="none")
pVals <- data.frame(Raw = p10Hits$P.Value, BH = p10Hits$adj.P.Val, BY = p10HitsBY$adj.P.Val)
pairs(pVals)
#Relationship is linear between raw and BH
tail(pVals)
Raw BH BY
29944 2.663e-03 0.0225769 0.2457380
29945 4.634e-07 0.0000606 0.0006596
29946 5.670e-03 0.0354219 0.3855493
29947 2.167e-01 0.3114689 1.0000000
29948 5.803e-01 0.6608153 1.0000000
29949 9.915e-03 0.0484945 0.5278373
#Inference for some contrasts
colnames(wtDesMat)
[1] "(Intercept)" "devStageP2" "devStageP6" "devStageP10"
[5] "devStage4_weeks"
(cont.matrix <- makeContrasts(P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks - devStageP10, levels = wtDesMat))
Warning: Renaming (Intercept) to Intercept
Contrasts
Levels P10VsP6 fourweeksVsP10
Intercept 0 0
devStageP2 0 0
devStageP6 -1 0
devStageP10 1 -1
devStage4_weeks 0 1
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
latestHits <- topTable(wtEbFitCont)
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(rownames(latestHits)[1:4]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
P10VsP6 fourweeksVsP10
-1 4 8
0 29945 29895
1 0 46
(hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)])
[1] "1416635_at" "1437781_at" "1454752_at" "1455260_at"
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits1), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
(hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)])
[1] "1416021_a_at" "1423851_a_at" "1434500_at" "1435415_x_at"
[5] "1437502_x_at" "1448182_a_at" "1452679_at" "1455447_at"
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits2), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
intersect(hits1, hits2)
character(0)
(hits3 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)])
[1] "1416041_at" "1417280_at" "1418406_at" "1418710_at"
[5] "1418789_at" "1419069_at" "1420725_at" "1420726_x_at"
[9] "1421061_at" "1421818_at" "1422612_at" "1424852_at"
[13] "1424895_at" "1425222_x_at" "1426059_at" "1426223_at"
[17] "1427388_at" "1428763_at" "1429394_at" "1429791_at"
[21] "1430580_at" "1431174_at" "1433699_at" "1434297_at"
[25] "1434573_at" "1435436_at" "1435679_at" "1435727_s_at"
[29] "1436265_at" "1436287_at" "1440402_at" "1440645_at"
[33] "1441518_at" "1442243_at" "1443252_at" "1446484_at"
[37] "1449170_at" "1449393_at" "1451042_a_at" "1451635_at"
[41] "1452243_at" "1453385_at" "1455493_at" "1457878_at"
[45] "1458418_at" "1459904_at"
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits3[1:8]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
intersect(hits1, hits3)
character(0)
intersect(hits2, hits3)
character(0)
cutoff <- 0.01
nHits <- 8
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
P10VsP6 fourweeksVsP10
-1 40 49
0 29897 29636
1 12 264
hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)]
hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)]
hits3 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0)]
hits4 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)]
vennDiagram(wtResCont)
hits6 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0 & wtResCont[, "fourweeksVsP10"] < 0)]
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits6[1:8]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
#take home exercise
(cont.matrix <- makeContrasts(P2VsI = devStageP2 - Intercept, P6VsP2 = devStageP6 - devStageP2, P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks - devStageP10, levels = wtDesMat))
Warning: Renaming (Intercept) to Intercept
Contrasts
Levels P2VsI P6VsP2 P10VsP6 fourweeksVsP10
Intercept -1 0 0 0
devStageP2 1 -1 0 0
devStageP6 0 1 -1 0
devStageP10 0 0 1 -1
devStage4_weeks 0 0 0 1
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
toptable(wtEbFitCont)
logFC t P.Value adj.P.Val B
1423641_s_at -12.20 -94.90 1.020e-25 1.531e-21 40.19
1438940_x_at -12.78 -92.88 1.499e-25 1.531e-21 40.07
1438657_x_at -12.64 -92.76 1.534e-25 1.531e-21 40.06
1456736_x_at -12.16 -86.66 5.186e-25 2.808e-21 39.65
1436884_x_at -12.75 -86.66 5.194e-25 2.808e-21 39.65
1454613_at -12.57 -86.27 5.626e-25 2.808e-21 39.62
1419700_a_at -12.15 -84.09 8.913e-25 3.729e-21 39.45
1456349_x_at -11.58 -82.59 1.228e-24 3.729e-21 39.33
1435800_a_at -12.24 -81.75 1.476e-24 3.729e-21 39.26
1450084_s_at -12.55 -81.68 1.499e-24 3.729e-21 39.26
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
P2VsI P6VsP2 P10VsP6 fourweeksVsP10
-1 29903 0 32 37
0 46 29937 29907 29712
1 0 12 10 200
greatestHits <- rownames(prDat)[which(wtResCont[, "P2VsI"] < 0 & wtResCont[, "P6VsP2"] > 0 & wtResCont[, "P10VsP6"] == 0 & wtResCont[, "fourweeksVsP10"] == 0)]
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(greatestHits), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
#Found 8 possible hits!