Seminar 6 - Fitting and interpreting linear models (high volume)

Load required libraries and import data:

library(limma)
library(lattice)
## Warning: package 'lattice' was built under R version 3.0.2
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.0.2
prDes <- read.table("GSE4051_design.tsv", header = T)
str(prDes)
## 'data.frame':    39 obs. of  4 variables:
##  $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
##  $ sidNum  : int  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "4_weeks","E16",..: 2 2 2 2 2 2 2 4 4 4 ...
##  $ gType   : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
prDat <- read.table("GSE4051_data.tsv", header = T)
str(prDat, max.level = 0)
## 'data.frame':    29949 obs. of  39 variables:

The difficult in estimating gene-wise data

# Simulation with only one group
m <- 1000
n <- 3
x <- matrix(rnorm(m * n), nrow = m)
obsVars <- apply(x, 1, var)
summary(obsVars)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.002   0.299   0.719   1.040   1.440   7.750
mean(obsVars < 1/3)
## [1] 0.272
densityplot(~obsVars, n = 200)

plot of chunk unnamed-chunk-3

# Simulation with two groups first group
m1 <- 500
n1 <- 4
x1 <- matrix(rnorm(m1 * n1, mean = 0, sd = 1), nrow = m1)
# second group
m2 <- 500
n2 <- 4
x2 <- matrix(rnorm(m2 * n2, mean = 1, sd = 2), nrow = m2)
# putting them together
final <- rbind(x1, x2)
obsVars1 <- apply(final, 1, var)
summary(obsVars1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.008   0.662   1.490   2.530   3.280  25.100
densityplot(~obsVars1, n = 200)

plot of chunk unnamed-chunk-4

In both cases we get many observed variances really small and others really high, even though the average in around 1 in the simulation with only one group, and it is about 1.5 on average in the secon dimulation setting.

Fit a linear model: explain gene expression in the wild type mice as a function of developmental stage (one-way ANOVA)

# Fit a linear model:limma
wtDes <- subset(prDes, gType == "wt")
wtDes$devStage <- factor(wtDes$devStage, c("E16", "P2", "P6", "P10", "4_weeks"))
str(wtDes)
## 'data.frame':    20 obs. of  4 variables:
##  $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 17 18 19 20 21 22 ...
##  $ sidNum  : int  20 21 22 23 24 25 26 27 28 29 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ gType   : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 2 2 2 2 2 2 ...
wtDat <- subset(prDat, select = prDes$gType == "wt")
str(wtDat, max.level = 0)
## 'data.frame':    29949 obs. of  20 variables:
wtDesMat <- model.matrix(~devStage, wtDes)
str(wtDesMat)
##  num [1:20, 1:5] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:20] "1" "2" "3" "4" ...
##   ..$ : chr [1:5] "(Intercept)" "devStageP2" "devStageP6" "devStageP10" ...
##  - attr(*, "assign")= int [1:5] 0 1 1 1 1
##  - attr(*, "contrasts")=List of 1
##   ..$ devStage: chr "contr.treatment"
wtFit <- lmFit(wtDat, wtDesMat)
wtEbFit <- eBayes(wtFit)
topTable(wtEbFit)
##                        ID X.Intercept. devStageP2 devStageP6 devStageP10
## 1423641_s_at 1423641_s_at        12.18    -0.0175     0.0750      0.0675
## 1438940_x_at 1438940_x_at        12.86     0.0850     0.1325      0.3425
## 1438657_x_at 1438657_x_at        12.78     0.1400     0.1250     -0.1850
## 1456736_x_at 1456736_x_at        12.32     0.1625     0.3050      0.2075
## 1436884_x_at 1436884_x_at        12.93     0.1775     0.3225      0.0300
## 1419700_a_at 1419700_a_at        12.32     0.1650     0.6475      0.8175
## 1435800_a_at 1435800_a_at        12.28     0.0450     0.6825      0.9000
## 1454613_at     1454613_at        12.47    -0.1075    -0.0500     -0.1025
## 1451240_a_at 1451240_a_at        13.00     0.3100     0.2800      0.2800
## 1450084_s_at 1450084_s_at        12.63     0.0825     0.0525      0.1725
##              devStage4_weeks AveExpr     F   P.Value adj.P.Val
## 1423641_s_at          0.1800   12.24 45350 3.574e-36 5.201e-32
## 1438940_x_at          0.3500   13.04 44957 3.865e-36 5.201e-32
## 1438657_x_at         -0.4500   12.71 43486 5.210e-36 5.201e-32
## 1456736_x_at          0.0725   12.47 39509 1.233e-35 6.725e-32
## 1436884_x_at          0.0250   13.04 39269 1.302e-35 6.725e-32
## 1419700_a_at          0.6825   12.78 39121 1.347e-35 6.725e-32
## 1435800_a_at          1.0200   12.81 36668 2.410e-35 1.031e-31
## 1454613_at           -0.3825   12.34 35835 2.962e-35 1.078e-31
## 1451240_a_at         -0.3700   13.10 35481 3.239e-35 1.078e-31
## 1450084_s_at          0.2600   12.75 34411 4.265e-35 1.234e-31
colnames(coef(wtEbFit))  # remind yourself of the coef names
## [1] "(Intercept)"     "devStageP2"      "devStageP6"      "devStageP10"    
## [5] "devStage4_weeks"
dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit))))
chosenHits <- c(dsHits[3, 1], dsHits[6, 1], dsHits[9, 1])
preparedata <- function(pID) {
    genexp <- prDat[pID, ]
    Dat <- data.frame(gene = row.names(genexp), genexp)
    newDat <- melt(Dat, id.vars = "gene")
    names(newDat) <- c("gene", "sidChar", "gExp")
    final <- merge(newDat, prDes, by = "sidChar")
    final$sidChar <- as.character(final$sidChar)
    final$devStage <- factor(final$devStage, c("E16", "P2", "P6", "P10", "4_weeks"))
    return(final)
}
dat <- preparedata(chosenHits)
makeStripplot <- function(x) {
    stripplot(gExp ~ devStage | gene, x, jitter.data = TRUE, auto.key = TRUE, 
        type = c("p", "a"), grid = TRUE)
}
makeStripplot(subset(dat, gType == "wt"))

plot of chunk unnamed-chunk-5

# Graphs show a high expression change over developmental stage
# Use lm() to check for similar result
chosenHits[1]
## [1] "1425222_x_at"
probe1 <- preparedata(chosenHits[1])
mod1 <- lm(formula = gExp ~ devStage, data = probe1, subset = gType == "wt")
summary(mod1)
## 
## Call:
## lm(formula = gExp ~ devStage, data = probe1, subset = gType == 
##     "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4847 -0.2157 -0.0077  0.1781  0.8315 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.276      0.173   30.50  6.5e-15 ***
## devStageP2         0.882      0.245    3.61   0.0026 ** 
## devStageP6         0.800      0.245    3.27   0.0052 ** 
## devStageP10        1.549      0.245    6.33  1.3e-05 ***
## devStage4_weeks    5.532      0.245   22.61  5.3e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.346 on 15 degrees of freedom
## Multiple R-squared:  0.977,  Adjusted R-squared:  0.971 
## F-statistic:  159 on 4 and 15 DF,  p-value: 4.31e-12
chosenHits[2]
## [1] "1422929_s_at"
probe2 <- preparedata(chosenHits[2])
mod2 <- lm(formula = gExp ~ devStage, data = probe2, subset = gType == "wt")
summary(mod2)
## 
## Call:
## lm(formula = gExp ~ devStage, data = probe2, subset = gType == 
##     "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5850 -0.0959 -0.0452  0.0606  0.6740 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.026      0.130    76.9  < 2e-16 ***
## devStageP2        -2.912      0.184   -15.8  9.4e-11 ***
## devStageP6        -3.618      0.184   -19.6  4.2e-12 ***
## devStageP10       -3.547      0.184   -19.2  5.5e-12 ***
## devStage4_weeks   -3.661      0.184   -19.9  3.5e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.261 on 15 degrees of freedom
## Multiple R-squared:  0.975,  Adjusted R-squared:  0.968 
## F-statistic:  144 on 4 and 15 DF,  p-value: 8.86e-12
chosenHits[3]
## [1] "1451617_at"
probe3 <- preparedata(chosenHits[3])
mod3 <- lm(formula = gExp ~ devStage, data = probe3, subset = gType == "wt")
summary(mod3)
## 
## Call:
## lm(formula = gExp ~ devStage, data = probe3, subset = gType == 
##     "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9730 -0.1863 -0.0371  0.2429  1.1810 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        5.835      0.259   22.54  5.6e-13 ***
## devStageP2         0.726      0.366    1.98    0.066 .  
## devStageP6         2.513      0.366    6.86  5.4e-06 ***
## devStageP10        4.984      0.366   13.61  7.6e-10 ***
## devStage4_weeks    6.685      0.366   18.26  1.2e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.518 on 15 degrees of freedom
## Multiple R-squared:  0.969,  Adjusted R-squared:  0.961 
## F-statistic:  119 on 4 and 15 DF,  p-value: 3.58e-11

Be the boss of topTable

colnames(topTable(wtEbFit))
##  [1] "ID"              "X.Intercept."    "devStageP2"     
##  [4] "devStageP6"      "devStageP10"     "devStage4_weeks"
##  [7] "AveExpr"         "F"               "P.Value"        
## [10] "adj.P.Val"
# How many probes have Benjamini-Hochberg ('BH') adjusted p-values for the F
# test conducted above that are less than 1e-05?
dsHitspval <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit))), 
    n = Inf)$adj.P.Val
sum(dsHitspval < 1e-05)
## [1] 350
dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit))), 
    n = Inf)
pr <- dsHits[63, c("F", "adj.P.Val", "devStageP6")]
P2tHits <- topTable(wtEbFit, coef = "devStageP2", n = Inf, sort = "none")$t
P10tHits <- topTable(wtEbFit, coef = "devStageP10", n = Inf, sort = "none")$t
xyplot(P10tHits ~ P2tHits, aspect = 1, xlim = c(-20, 16), ylim = c(-20, 16), 
    panel = function(...) {
        panel.smoothScatter(...)
        panel.abline(0, 1, col = 4)
    })
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## (loaded the KernSmooth namespace)

plot of chunk unnamed-chunk-7

P2apHits <- topTable(wtEbFit, coef = "devStageP2", n = Inf, sort = "none")$adj.P.Val
P10apHits <- topTable(wtEbFit, coef = "devStageP10", n = Inf, sort = "none")$adj.P.Val
densityplot(~P10apHits + P2apHits, auto.key = TRUE, plot.points = FALSE)

plot of chunk unnamed-chunk-7

cutoff <- 0.001
foo <- data.frame(P2 = P2apHits < cutoff, P10 = P10apHits < cutoff)
addmargins(with(foo, table(P2, P10)))
##        P10
## P2      FALSE  TRUE   Sum
##   FALSE 29201   695 29896
##   TRUE      1    52    53
##   Sum   29202   747 29949
P2pHits <- topTable(wtEbFit, coef = "devStageP2", n = Inf, sort = "none")$P.Val
P10pHits <- topTable(wtEbFit, coef = "devStageP10", n = Inf, sort = "none")$P.Val
P10pVals <- data.frame(raw = P10pHits, BH = P10apHits, BY = p.adjust(P10pHits, 
    method = "BY"))
splom(P10pVals)

plot of chunk unnamed-chunk-7

# contrast
cont.matrix <- makeContrasts(P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks - 
    devStageP10, levels = wtDesMat)
## Warning: Renaming (Intercept) to Intercept
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
## Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
dat <- topTable(wtEbFitCont)
makeStripplot(preparedata(rownames(dat)[1:4]))

plot of chunk unnamed-chunk-7

cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
##    P10VsP6 fourweeksVsP10
## -1       4              8
## 0    29945          29895
## 1        0             46
(hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)])
## [1] "1416635_at" "1437781_at" "1454752_at" "1455260_at"
makeStripplot(preparedata(hits1))

plot of chunk unnamed-chunk-7

(hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)])
## [1] "1416021_a_at" "1423851_a_at" "1434500_at"   "1435415_x_at"
## [5] "1437502_x_at" "1448182_a_at" "1452679_at"   "1455447_at"
intersect(hits1, hits2)
## character(0)
makeStripplot(preparedata(hits2))

plot of chunk unnamed-chunk-7

(hits3 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)])
##  [1] "1416041_at"   "1417280_at"   "1418406_at"   "1418710_at"  
##  [5] "1418789_at"   "1419069_at"   "1420725_at"   "1420726_x_at"
##  [9] "1421061_at"   "1421818_at"   "1422612_at"   "1424852_at"  
## [13] "1424895_at"   "1425222_x_at" "1426059_at"   "1426223_at"  
## [17] "1427388_at"   "1428763_at"   "1429394_at"   "1429791_at"  
## [21] "1430580_at"   "1431174_at"   "1433699_at"   "1434297_at"  
## [25] "1434573_at"   "1435436_at"   "1435679_at"   "1435727_s_at"
## [29] "1436265_at"   "1436287_at"   "1440402_at"   "1440645_at"  
## [33] "1441518_at"   "1442243_at"   "1443252_at"   "1446484_at"  
## [37] "1449170_at"   "1449393_at"   "1451042_a_at" "1451635_at"  
## [41] "1452243_at"   "1453385_at"   "1455493_at"   "1457878_at"  
## [45] "1458418_at"   "1459904_at"
makeStripplot(preparedata(hits3[1:4]))

plot of chunk unnamed-chunk-7

intersect(hits2, hits3)
## character(0)
intersect(hits1, hits3)
## character(0)
cutoff <- 0.01
nHits <- 8
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
##    P10VsP6 fourweeksVsP10
## -1      40             49
## 0    29897          29636
## 1       12            264
hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)]
makeStripplot(preparedata(hits1[1:nHits]))

plot of chunk unnamed-chunk-7

hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)]
makeStripplot(preparedata(hits2[1:nHits]))

plot of chunk unnamed-chunk-7

hits3 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0)]
makeStripplot(preparedata(hits3[1:nHits]))

plot of chunk unnamed-chunk-7

hits4 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)]
makeStripplot(preparedata(hits4[1:nHits]))

plot of chunk unnamed-chunk-7

vennDiagram(wtResCont)

plot of chunk unnamed-chunk-7

hits5 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] != 0 & wtResCont[, "fourweeksVsP10"] != 
    0)]
makeStripplot(preparedata(hits5))

plot of chunk unnamed-chunk-7

hits6 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0 & wtResCont[, "fourweeksVsP10"] < 
    0)]
makeStripplot(preparedata(hits6))

plot of chunk unnamed-chunk-7

Take Home Problem

# Take home
(cont.matrix <- makeContrasts(P2VsE16 = devStageP2 - Intercept, P6VsP2 = devStageP6 - 
    devStageP2, P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks - 
    devStageP10, levels = wtDesMat))
## Warning: Renaming (Intercept) to Intercept
##                  Contrasts
## Levels            P2VsE16 P6VsP2 P10VsP6 fourweeksVsP10
##   Intercept            -1      0       0              0
##   devStageP2            1     -1       0              0
##   devStageP6            0      1      -1              0
##   devStageP10           0      0       1             -1
##   devStage4_weeks       0      0       0              1
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
## Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
##    P2VsE16 P6VsP2 P10VsP6 fourweeksVsP10
## -1   29903      0      32             37
## 0       46  29937   29907          29712
## 1        0     12      10            200
hitsNew <- rownames(prDat)[which((wtResCont[, "P2VsE16"] != 0 & wtResCont[, 
    "P6VsP2"] != 0) & wtResCont[, "P10VsP6"] == 0 & wtResCont[, "fourweeksVsP10"] == 
    0)]
makeStripplot(subset(preparedata(hitsNew[1:6]), gType == "wt"))

plot of chunk unnamed-chunk-8