seminar06.R

omaromeir — Feb 13, 2014, 10:19 PM

library(lattice)
library(limma)
prDat <- read.table("../data/GSE4051_data.tsv")
prDes <- readRDS("../data/GSE4051_design.rds")
source("../Seminar05/seminar05.R")
'data.frame':   29949 obs. of  39 variables:
'data.frame':   39 obs. of  4 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
'data.frame':   78 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  10.93 10.74 10.67 10.68 9.61 ...
 $ gene    : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
'data.frame':   39 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  7.04 7.48 7.37 6.94 6.16 ...
 $ gene    : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...
'data.frame':   39 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  7.94 8.99 8.55 8.62 5.72 ...
 $ gene    : Factor w/ 1 level "1438786_a_at": 1 1 1 1 1 1 1 1 1 1 ...

m <- 1000
n <- 3
x <- matrix(rnorm(m * n), nrow = m)

obsVars <- apply(x, 1, var)
summary(obsVars)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   0.274   0.649   0.948   1.310   7.150 
mean(obsVars < 1/3)
[1] 0.297
densityplot(~obsVars, n=200)

plot of chunk unnamed-chunk-1


#take home exercise: Make the above simulation more realistic with two (or more) groups, different data-generating means and group differences, different data-generating gene-wise variances, etc.

#2 different groups here
m <- 1000
n <- 3
g <- 2

x <- matrix(rnorm(m * n, mean = rep(c(1, 2), each=(nrow(x)/g)), sd = rep(c(sqrt(1), sqrt(2)), each=(nrow(x)/g))), nrow = m)
gNames  <- paste0("Group ", seq(1, g))
gMat <- cbind(rep(gNames , each=(nrow(x)/g)), x)
colnames(gMat) <- c("Group", "dat1", "dat2", "dat3")

obsVars <- apply(gMat[,grep("dat", colnames(gMat))], 1, var)
summary(obsVars)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.001   0.365   0.897   1.420   1.930  12.600 
mean(obsVars < 1/3)
[1] 0.228
densityplot(~obsVars, n=200)

plot of chunk unnamed-chunk-1


#Fit a linear model

wtDes <- subset(prDes, gType == "wt")
str(wtDes)
'data.frame':   20 obs. of  4 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 24 25 26 27 28 29 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 1 1 1 1 1 1 ...
wtDat <- subset(prDat, select = prDes$gType == "wt")
str(wtDat, max.level = 0)
'data.frame':   29949 obs. of  20 variables:

wtDesMat <- model.matrix(~devStage, wtDes)
str(wtDesMat)
 num [1:20, 1:5] 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:20] "12" "13" "14" "15" ...
  ..$ : chr [1:5] "(Intercept)" "devStageP2" "devStageP6" "devStageP10" ...
 - attr(*, "assign")= int [1:5] 0 1 1 1 1
 - attr(*, "contrasts")=List of 1
  ..$ devStage: chr "contr.treatment"

wtFit <- lmFit(wtDat, wtDesMat)
wtEbFit <- eBayes(wtFit)

topTable(wtEbFit)
             X.Intercept. devStageP2 devStageP6 devStageP10
1423641_s_at        12.18    -0.0175     0.0750      0.0675
1438940_x_at        12.86     0.0850     0.1325      0.3425
1438657_x_at        12.78     0.1400     0.1250     -0.1850
1456736_x_at        12.32     0.1625     0.3050      0.2075
1436884_x_at        12.93     0.1775     0.3225      0.0300
1419700_a_at        12.32     0.1650     0.6475      0.8175
1435800_a_at        12.28     0.0450     0.6825      0.9000
1454613_at          12.47    -0.1075    -0.0500     -0.1025
1451240_a_at        13.00     0.3100     0.2800      0.2800
1450084_s_at        12.63     0.0825     0.0525      0.1725
             devStage4_weeks AveExpr     F   P.Value adj.P.Val
1423641_s_at          0.1800   12.24 45350 3.574e-36 5.201e-32
1438940_x_at          0.3500   13.04 44957 3.865e-36 5.201e-32
1438657_x_at         -0.4500   12.71 43486 5.210e-36 5.201e-32
1456736_x_at          0.0725   12.47 39509 1.233e-35 6.725e-32
1436884_x_at          0.0250   13.04 39269 1.302e-35 6.725e-32
1419700_a_at          0.6825   12.78 39121 1.347e-35 6.725e-32
1435800_a_at          1.0200   12.81 36668 2.410e-35 1.031e-31
1454613_at           -0.3825   12.34 35835 2.962e-35 1.078e-31
1451240_a_at         -0.3700   13.10 35481 3.239e-35 1.078e-31
1450084_s_at          0.2600   12.75 34411 4.265e-35 1.234e-31
colnames(coef(wtEbFit))
[1] "(Intercept)"     "devStageP2"      "devStageP6"      "devStageP10"    
[5] "devStage4_weeks"

(dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit)))))
             devStageP2 devStageP6 devStageP10 devStage4_weeks AveExpr
1440645_at       0.3990     0.1952      0.9200           3.961   6.528
1416041_at       0.1580     0.4797      0.3327           5.115   9.383
1425222_x_at     0.8820     0.7995      1.5488           5.532   7.028
1451635_at       1.3025     1.1900      2.0160           6.188   8.319
1429028_at      -2.4433    -3.4073     -4.3105          -4.602   8.045
1422929_s_at    -2.9117    -3.6182     -3.5472          -3.661   7.278
1424852_at       0.4575     0.2298      0.5740           3.979   7.454
1425171_at       0.9980     3.0530      5.2787           6.079   9.620
1451617_at       0.7255     2.5128      4.9837           6.685   8.817
1451618_at       0.6028     2.8903      5.0507           6.288   9.431
                 F   P.Value adj.P.Val
1440645_at   425.4 1.588e-17 4.755e-13
1416041_at   195.5 1.522e-14 2.280e-10
1425222_x_at 173.4 4.348e-14 4.341e-10
1451635_at   157.3 1.013e-13 7.585e-10
1429028_at   148.8 1.646e-13 9.203e-10
1422929_s_at 146.9 1.844e-13 9.203e-10
1424852_at   143.2 2.290e-13 9.799e-10
1425171_at   138.8 3.002e-13 1.124e-09
1451617_at   136.5 3.485e-13 1.160e-09
1451618_at   134.2 4.032e-13 1.207e-09

stripplot(gExp ~ devStage | gene, subset= gType=="wt", prepareData(rownames(dsHits)[c(3, 6, 9)]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

plot of chunk unnamed-chunk-1


#Does it look plausible to you that -- using only wild type data -- these probes show the most compelling evidence for expression change over development? yep

#Exercise: lm() on one or all 3 of these probes and check if the F stats and p-values are similar. Don't expect exact equality because you must remember that limma has moderated the estimated error variance.
mFit <- lm(formula = gExp ~ devStage, data=prepareData(rownames(dsHits)[3]), subset = gType=="wt")
summary(mFit)

Call:
lm(formula = gExp ~ devStage, data = prepareData(rownames(dsHits)[3]), 
    subset = gType == "wt")

Residuals:
    Min      1Q  Median      3Q     Max 
-0.4847 -0.2157 -0.0077  0.1781  0.8315 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)        5.276      0.173   30.50  6.5e-15 ***
devStageP2         0.882      0.245    3.61   0.0026 ** 
devStageP6         0.800      0.245    3.27   0.0052 ** 
devStageP10        1.549      0.245    6.33  1.3e-05 ***
devStage4_weeks    5.532      0.245   22.61  5.3e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.346 on 15 degrees of freedom
Multiple R-squared:  0.977, Adjusted R-squared:  0.971 
F-statistic:  159 on 4 and 15 DF,  p-value: 4.31e-12
#The numbers are similar, though not exactly the same

#Be the boss of top table
newHits <- topTable(wtEbFit, p.value=1e-05, number=nrow(wtDat), coef = grep("devStage", colnames(coef(wtEbFit))))
str(newHits)
'data.frame':   350 obs. of  8 variables:
 $ devStageP2     : num  0.399 0.158 0.882 1.303 -2.443 ...
 $ devStageP6     : num  0.195 0.48 0.8 1.19 -3.407 ...
 $ devStageP10    : num  0.92 0.333 1.549 2.016 -4.311 ...
 $ devStage4_weeks: num  3.96 5.11 5.53 6.19 -4.6 ...
 $ AveExpr        : num  6.42 8.83 7.45 6.27 7.76 ...
 $ F              : num  425 195 173 157 149 ...
 $ P.Value        : num  1.59e-17 1.52e-14 4.35e-14 1.01e-13 1.65e-13 ...
 $ adj.P.Val      : num  4.76e-13 2.28e-10 4.34e-10 7.58e-10 9.20e-10 ...
newHits[63, c("F", "adj.P.Val", 'devStageP6')]
                 F adj.P.Val devStageP6
1451633_a_at 64.01 1.049e-07      2.069

p2Hits <- topTable(wtEbFit, number=nrow(wtDat), coef = "devStageP2", sort.by="none")
str(p2Hits)
'data.frame':   29949 obs. of  6 variables:
 $ logFC    : num  -0.04825 -0.613 -0.38175 -0.00025 -0.1695 ...
 $ AveExpr  : num  7.22 9.3 9.73 8.44 8.47 ...
 $ t        : num  -0.34501 -1.82865 -1.74759 -0.00177 -1.86842 ...
 $ P.Value  : num  0.7341 0.0841 0.0976 0.9986 0.0781 ...
 $ adj.P.Val: num  0.792 0.168 0.184 0.999 0.161 ...
 $ B        : num  -6.27 -4.77 -4.89 -6.33 -4.71 ...

p10Hits <- topTable(wtEbFit, number=nrow(wtDat), coef = "devStageP10", sort.by="none")
str(p10Hits)
'data.frame':   29949 obs. of  6 variables:
 $ logFC    : num  -0.0927 -0.734 -0.32 0.006 -0.1232 ...
 $ AveExpr  : num  7.22 9.3 9.73 8.44 8.47 ...
 $ t        : num  -0.6632 -2.1896 -1.4649 0.0424 -1.3586 ...
 $ P.Value  : num  0.516 0.042 0.16 0.967 0.191 ...
 $ adj.P.Val: num  0.603 0.106 0.25 0.974 0.284 ...
 $ B        : num  -6.55 -4.56 -5.72 -6.77 -5.86 ...

xyplot(p10Hits$t ~ p2Hits$t, scales= list(limit = c(-20, 20)), asp = 1, panel=function(...) {panel.smoothScatter(...); panel.abline(0,1,col=3)})
KernSmooth 2.23 loaded
Copyright M. P. Wand 1997-2009
(loaded the KernSmooth namespace)

plot of chunk unnamed-chunk-1


densityplot(~ p10Hits$adj.P.Val + p2Hits$adj.P.Val, auto.key=TRUE, type = 'p')

plot of chunk unnamed-chunk-1


addmargins(table(p2Hits$adj.P.Val < 1e-03, p10Hits$adj.P.Val < 1e-03, dnn=c("p2", "p10")))
       p10
p2      FALSE  TRUE   Sum
  FALSE 29201   695 29896
  TRUE      1    52    53
  Sum   29202   747 29949

p10HitsBY <- topTable(wtEbFit, number=nrow(wtDat), coef = "devStageP10", adjust.method="BY", sort.by="none")
pVals <- data.frame(Raw = p10Hits$P.Value, BH = p10Hits$adj.P.Val, BY = p10HitsBY$adj.P.Val)
pairs(pVals)

plot of chunk unnamed-chunk-1


#Relationship is linear between raw and BH
tail(pVals)
            Raw        BH        BY
29944 2.663e-03 0.0225769 0.2457380
29945 4.634e-07 0.0000606 0.0006596
29946 5.670e-03 0.0354219 0.3855493
29947 2.167e-01 0.3114689 1.0000000
29948 5.803e-01 0.6608153 1.0000000
29949 9.915e-03 0.0484945 0.5278373

#Inference for some contrasts
colnames(wtDesMat)
[1] "(Intercept)"     "devStageP2"      "devStageP6"      "devStageP10"    
[5] "devStage4_weeks"
(cont.matrix <- makeContrasts(P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks - devStageP10, levels = wtDesMat))
Warning: Renaming (Intercept) to Intercept
                 Contrasts
Levels            P10VsP6 fourweeksVsP10
  Intercept             0              0
  devStageP2            0              0
  devStageP6           -1              0
  devStageP10           1             -1
  devStage4_weeks       0              1
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
latestHits <- topTable(wtEbFitCont)
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(rownames(latestHits)[1:4]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

plot of chunk unnamed-chunk-1


cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
   P10VsP6 fourweeksVsP10
-1       4              8
0    29945          29895
1        0             46

(hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)])
[1] "1416635_at" "1437781_at" "1454752_at" "1455260_at"
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits1), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

plot of chunk unnamed-chunk-1

(hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)])
[1] "1416021_a_at" "1423851_a_at" "1434500_at"   "1435415_x_at"
[5] "1437502_x_at" "1448182_a_at" "1452679_at"   "1455447_at"  
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits2), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

plot of chunk unnamed-chunk-1

intersect(hits1, hits2)
character(0)
(hits3 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)])
 [1] "1416041_at"   "1417280_at"   "1418406_at"   "1418710_at"  
 [5] "1418789_at"   "1419069_at"   "1420725_at"   "1420726_x_at"
 [9] "1421061_at"   "1421818_at"   "1422612_at"   "1424852_at"  
[13] "1424895_at"   "1425222_x_at" "1426059_at"   "1426223_at"  
[17] "1427388_at"   "1428763_at"   "1429394_at"   "1429791_at"  
[21] "1430580_at"   "1431174_at"   "1433699_at"   "1434297_at"  
[25] "1434573_at"   "1435436_at"   "1435679_at"   "1435727_s_at"
[29] "1436265_at"   "1436287_at"   "1440402_at"   "1440645_at"  
[33] "1441518_at"   "1442243_at"   "1443252_at"   "1446484_at"  
[37] "1449170_at"   "1449393_at"   "1451042_a_at" "1451635_at"  
[41] "1452243_at"   "1453385_at"   "1455493_at"   "1457878_at"  
[45] "1458418_at"   "1459904_at"  
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits3[1:8]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

plot of chunk unnamed-chunk-1

intersect(hits1, hits3)
character(0)
intersect(hits2, hits3)
character(0)

cutoff <- 0.01
nHits <- 8
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
   P10VsP6 fourweeksVsP10
-1      40             49
0    29897          29636
1       12            264

hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)]
hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)]
hits3 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0)]
hits4 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)]
vennDiagram(wtResCont)

plot of chunk unnamed-chunk-1

hits6 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0 & wtResCont[, "fourweeksVsP10"] < 0)]
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(hits6[1:8]), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

plot of chunk unnamed-chunk-1


#take home exercise
(cont.matrix <- makeContrasts(P2VsI = devStageP2 - Intercept, P6VsP2 = devStageP6 - devStageP2, P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks - devStageP10, levels = wtDesMat))
Warning: Renaming (Intercept) to Intercept
                 Contrasts
Levels            P2VsI P6VsP2 P10VsP6 fourweeksVsP10
  Intercept          -1      0       0              0
  devStageP2          1     -1       0              0
  devStageP6          0      1      -1              0
  devStageP10         0      0       1             -1
  devStage4_weeks     0      0       0              1
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
toptable(wtEbFitCont)
              logFC      t   P.Value adj.P.Val     B
1423641_s_at -12.20 -94.90 1.020e-25 1.531e-21 40.19
1438940_x_at -12.78 -92.88 1.499e-25 1.531e-21 40.07
1438657_x_at -12.64 -92.76 1.534e-25 1.531e-21 40.06
1456736_x_at -12.16 -86.66 5.186e-25 2.808e-21 39.65
1436884_x_at -12.75 -86.66 5.194e-25 2.808e-21 39.65
1454613_at   -12.57 -86.27 5.626e-25 2.808e-21 39.62
1419700_a_at -12.15 -84.09 8.913e-25 3.729e-21 39.45
1456349_x_at -11.58 -82.59 1.228e-24 3.729e-21 39.33
1435800_a_at -12.24 -81.75 1.476e-24 3.729e-21 39.26
1450084_s_at -12.55 -81.68 1.499e-24 3.729e-21 39.26
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
   P2VsI P6VsP2 P10VsP6 fourweeksVsP10
-1 29903      0      32             37
0     46  29937   29907          29712
1      0     12      10            200

greatestHits <- rownames(prDat)[which(wtResCont[, "P2VsI"] < 0 & wtResCont[, "P6VsP2"] > 0 & wtResCont[, "P10VsP6"] == 0 & wtResCont[, "fourweeksVsP10"] == 0)]
stripplot(gExp ~ devStage | gene, subset= gType=="wt", asp=1, prepareData(greatestHits), jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)

plot of chunk unnamed-chunk-1


#Found 8 possible hits!