Load required libraries and import data:
library(limma)
library(lattice)
## Warning: package 'lattice' was built under R version 3.0.2
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.0.2
prDes <- read.table("GSE4051_design.tsv", header = T)
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
## $ sidNum : int 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "4_weeks","E16",..: 2 2 2 2 2 2 2 4 4 4 ...
## $ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
prDat <- read.table("GSE4051_data.tsv", header = T)
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
The difficult in estimating gene-wise data
# Simulation with only one group
m <- 1000
n <- 3
x <- matrix(rnorm(m * n), nrow = m)
obsVars <- apply(x, 1, var)
summary(obsVars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002 0.299 0.719 1.040 1.440 7.750
mean(obsVars < 1/3)
## [1] 0.272
densityplot(~obsVars, n = 200)
# Simulation with two groups first group
m1 <- 500
n1 <- 4
x1 <- matrix(rnorm(m1 * n1, mean = 0, sd = 1), nrow = m1)
# second group
m2 <- 500
n2 <- 4
x2 <- matrix(rnorm(m2 * n2, mean = 1, sd = 2), nrow = m2)
# putting them together
final <- rbind(x1, x2)
obsVars1 <- apply(final, 1, var)
summary(obsVars1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.008 0.662 1.490 2.530 3.280 25.100
densityplot(~obsVars1, n = 200)
In both cases we get many observed variances really small and others really high, even though the average in around 1 in the simulation with only one group, and it is about 1.5 on average in the secon dimulation setting.
Fit a linear model: explain gene expression in the wild type mice as a function of developmental stage (one-way ANOVA)
# Fit a linear model:limma
wtDes <- subset(prDes, gType == "wt")
wtDes$devStage <- factor(wtDes$devStage, c("E16", "P2", "P6", "P10", "4_weeks"))
str(wtDes)
## 'data.frame': 20 obs. of 4 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 17 18 19 20 21 22 ...
## $ sidNum : int 20 21 22 23 24 25 26 27 28 29 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 2 2 2 2 2 2 ...
wtDat <- subset(prDat, select = prDes$gType == "wt")
str(wtDat, max.level = 0)
## 'data.frame': 29949 obs. of 20 variables:
wtDesMat <- model.matrix(~devStage, wtDes)
str(wtDesMat)
## num [1:20, 1:5] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:20] "1" "2" "3" "4" ...
## ..$ : chr [1:5] "(Intercept)" "devStageP2" "devStageP6" "devStageP10" ...
## - attr(*, "assign")= int [1:5] 0 1 1 1 1
## - attr(*, "contrasts")=List of 1
## ..$ devStage: chr "contr.treatment"
wtFit <- lmFit(wtDat, wtDesMat)
wtEbFit <- eBayes(wtFit)
topTable(wtEbFit)
## ID X.Intercept. devStageP2 devStageP6 devStageP10
## 1423641_s_at 1423641_s_at 12.18 -0.0175 0.0750 0.0675
## 1438940_x_at 1438940_x_at 12.86 0.0850 0.1325 0.3425
## 1438657_x_at 1438657_x_at 12.78 0.1400 0.1250 -0.1850
## 1456736_x_at 1456736_x_at 12.32 0.1625 0.3050 0.2075
## 1436884_x_at 1436884_x_at 12.93 0.1775 0.3225 0.0300
## 1419700_a_at 1419700_a_at 12.32 0.1650 0.6475 0.8175
## 1435800_a_at 1435800_a_at 12.28 0.0450 0.6825 0.9000
## 1454613_at 1454613_at 12.47 -0.1075 -0.0500 -0.1025
## 1451240_a_at 1451240_a_at 13.00 0.3100 0.2800 0.2800
## 1450084_s_at 1450084_s_at 12.63 0.0825 0.0525 0.1725
## devStage4_weeks AveExpr F P.Value adj.P.Val
## 1423641_s_at 0.1800 12.24 45350 3.574e-36 5.201e-32
## 1438940_x_at 0.3500 13.04 44957 3.865e-36 5.201e-32
## 1438657_x_at -0.4500 12.71 43486 5.210e-36 5.201e-32
## 1456736_x_at 0.0725 12.47 39509 1.233e-35 6.725e-32
## 1436884_x_at 0.0250 13.04 39269 1.302e-35 6.725e-32
## 1419700_a_at 0.6825 12.78 39121 1.347e-35 6.725e-32
## 1435800_a_at 1.0200 12.81 36668 2.410e-35 1.031e-31
## 1454613_at -0.3825 12.34 35835 2.962e-35 1.078e-31
## 1451240_a_at -0.3700 13.10 35481 3.239e-35 1.078e-31
## 1450084_s_at 0.2600 12.75 34411 4.265e-35 1.234e-31
colnames(coef(wtEbFit)) # remind yourself of the coef names
## [1] "(Intercept)" "devStageP2" "devStageP6" "devStageP10"
## [5] "devStage4_weeks"
dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit))))
chosenHits <- c(dsHits[3, 1], dsHits[6, 1], dsHits[9, 1])
preparedata <- function(pID) {
genexp <- prDat[pID, ]
Dat <- data.frame(gene = row.names(genexp), genexp)
newDat <- melt(Dat, id.vars = "gene")
names(newDat) <- c("gene", "sidChar", "gExp")
final <- merge(newDat, prDes, by = "sidChar")
final$sidChar <- as.character(final$sidChar)
final$devStage <- factor(final$devStage, c("E16", "P2", "P6", "P10", "4_weeks"))
return(final)
}
dat <- preparedata(chosenHits)
makeStripplot <- function(x) {
stripplot(gExp ~ devStage | gene, x, jitter.data = TRUE, auto.key = TRUE,
type = c("p", "a"), grid = TRUE)
}
makeStripplot(subset(dat, gType == "wt"))
# Graphs show a high expression change over developmental stage
# Use lm() to check for similar result
chosenHits[1]
## [1] "1425222_x_at"
probe1 <- preparedata(chosenHits[1])
mod1 <- lm(formula = gExp ~ devStage, data = probe1, subset = gType == "wt")
summary(mod1)
##
## Call:
## lm(formula = gExp ~ devStage, data = probe1, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4847 -0.2157 -0.0077 0.1781 0.8315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.276 0.173 30.50 6.5e-15 ***
## devStageP2 0.882 0.245 3.61 0.0026 **
## devStageP6 0.800 0.245 3.27 0.0052 **
## devStageP10 1.549 0.245 6.33 1.3e-05 ***
## devStage4_weeks 5.532 0.245 22.61 5.3e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.346 on 15 degrees of freedom
## Multiple R-squared: 0.977, Adjusted R-squared: 0.971
## F-statistic: 159 on 4 and 15 DF, p-value: 4.31e-12
chosenHits[2]
## [1] "1422929_s_at"
probe2 <- preparedata(chosenHits[2])
mod2 <- lm(formula = gExp ~ devStage, data = probe2, subset = gType == "wt")
summary(mod2)
##
## Call:
## lm(formula = gExp ~ devStage, data = probe2, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5850 -0.0959 -0.0452 0.0606 0.6740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.026 0.130 76.9 < 2e-16 ***
## devStageP2 -2.912 0.184 -15.8 9.4e-11 ***
## devStageP6 -3.618 0.184 -19.6 4.2e-12 ***
## devStageP10 -3.547 0.184 -19.2 5.5e-12 ***
## devStage4_weeks -3.661 0.184 -19.9 3.5e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.261 on 15 degrees of freedom
## Multiple R-squared: 0.975, Adjusted R-squared: 0.968
## F-statistic: 144 on 4 and 15 DF, p-value: 8.86e-12
chosenHits[3]
## [1] "1451617_at"
probe3 <- preparedata(chosenHits[3])
mod3 <- lm(formula = gExp ~ devStage, data = probe3, subset = gType == "wt")
summary(mod3)
##
## Call:
## lm(formula = gExp ~ devStage, data = probe3, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9730 -0.1863 -0.0371 0.2429 1.1810
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.835 0.259 22.54 5.6e-13 ***
## devStageP2 0.726 0.366 1.98 0.066 .
## devStageP6 2.513 0.366 6.86 5.4e-06 ***
## devStageP10 4.984 0.366 13.61 7.6e-10 ***
## devStage4_weeks 6.685 0.366 18.26 1.2e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.518 on 15 degrees of freedom
## Multiple R-squared: 0.969, Adjusted R-squared: 0.961
## F-statistic: 119 on 4 and 15 DF, p-value: 3.58e-11
Be the boss of topTable
colnames(topTable(wtEbFit))
## [1] "ID" "X.Intercept." "devStageP2"
## [4] "devStageP6" "devStageP10" "devStage4_weeks"
## [7] "AveExpr" "F" "P.Value"
## [10] "adj.P.Val"
# How many probes have Benjamini-Hochberg ('BH') adjusted p-values for the F
# test conducted above that are less than 1e-05?
dsHitspval <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit))),
n = Inf)$adj.P.Val
sum(dsHitspval < 1e-05)
## [1] 350
dsHits <- topTable(wtEbFit, coef = grep("devStage", colnames(coef(wtEbFit))),
n = Inf)
pr <- dsHits[63, c("F", "adj.P.Val", "devStageP6")]
P2tHits <- topTable(wtEbFit, coef = "devStageP2", n = Inf, sort = "none")$t
P10tHits <- topTable(wtEbFit, coef = "devStageP10", n = Inf, sort = "none")$t
xyplot(P10tHits ~ P2tHits, aspect = 1, xlim = c(-20, 16), ylim = c(-20, 16),
panel = function(...) {
panel.smoothScatter(...)
panel.abline(0, 1, col = 4)
})
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## (loaded the KernSmooth namespace)
P2apHits <- topTable(wtEbFit, coef = "devStageP2", n = Inf, sort = "none")$adj.P.Val
P10apHits <- topTable(wtEbFit, coef = "devStageP10", n = Inf, sort = "none")$adj.P.Val
densityplot(~P10apHits + P2apHits, auto.key = TRUE, plot.points = FALSE)
cutoff <- 0.001
foo <- data.frame(P2 = P2apHits < cutoff, P10 = P10apHits < cutoff)
addmargins(with(foo, table(P2, P10)))
## P10
## P2 FALSE TRUE Sum
## FALSE 29201 695 29896
## TRUE 1 52 53
## Sum 29202 747 29949
P2pHits <- topTable(wtEbFit, coef = "devStageP2", n = Inf, sort = "none")$P.Val
P10pHits <- topTable(wtEbFit, coef = "devStageP10", n = Inf, sort = "none")$P.Val
P10pVals <- data.frame(raw = P10pHits, BH = P10apHits, BY = p.adjust(P10pHits,
method = "BY"))
splom(P10pVals)
# contrast
cont.matrix <- makeContrasts(P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks -
devStageP10, levels = wtDesMat)
## Warning: Renaming (Intercept) to Intercept
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
## Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
dat <- topTable(wtEbFitCont)
makeStripplot(preparedata(rownames(dat)[1:4]))
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
## P10VsP6 fourweeksVsP10
## -1 4 8
## 0 29945 29895
## 1 0 46
(hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)])
## [1] "1416635_at" "1437781_at" "1454752_at" "1455260_at"
makeStripplot(preparedata(hits1))
(hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)])
## [1] "1416021_a_at" "1423851_a_at" "1434500_at" "1435415_x_at"
## [5] "1437502_x_at" "1448182_a_at" "1452679_at" "1455447_at"
intersect(hits1, hits2)
## character(0)
makeStripplot(preparedata(hits2))
(hits3 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)])
## [1] "1416041_at" "1417280_at" "1418406_at" "1418710_at"
## [5] "1418789_at" "1419069_at" "1420725_at" "1420726_x_at"
## [9] "1421061_at" "1421818_at" "1422612_at" "1424852_at"
## [13] "1424895_at" "1425222_x_at" "1426059_at" "1426223_at"
## [17] "1427388_at" "1428763_at" "1429394_at" "1429791_at"
## [21] "1430580_at" "1431174_at" "1433699_at" "1434297_at"
## [25] "1434573_at" "1435436_at" "1435679_at" "1435727_s_at"
## [29] "1436265_at" "1436287_at" "1440402_at" "1440645_at"
## [33] "1441518_at" "1442243_at" "1443252_at" "1446484_at"
## [37] "1449170_at" "1449393_at" "1451042_a_at" "1451635_at"
## [41] "1452243_at" "1453385_at" "1455493_at" "1457878_at"
## [45] "1458418_at" "1459904_at"
makeStripplot(preparedata(hits3[1:4]))
intersect(hits2, hits3)
## character(0)
intersect(hits1, hits3)
## character(0)
cutoff <- 0.01
nHits <- 8
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
## P10VsP6 fourweeksVsP10
## -1 40 49
## 0 29897 29636
## 1 12 264
hits1 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] < 0)]
makeStripplot(preparedata(hits1[1:nHits]))
hits2 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] < 0)]
makeStripplot(preparedata(hits2[1:nHits]))
hits3 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0)]
makeStripplot(preparedata(hits3[1:nHits]))
hits4 <- rownames(prDat)[which(wtResCont[, "fourweeksVsP10"] > 0)]
makeStripplot(preparedata(hits4[1:nHits]))
vennDiagram(wtResCont)
hits5 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] != 0 & wtResCont[, "fourweeksVsP10"] !=
0)]
makeStripplot(preparedata(hits5))
hits6 <- rownames(prDat)[which(wtResCont[, "P10VsP6"] > 0 & wtResCont[, "fourweeksVsP10"] <
0)]
makeStripplot(preparedata(hits6))
Take Home Problem
# Take home
(cont.matrix <- makeContrasts(P2VsE16 = devStageP2 - Intercept, P6VsP2 = devStageP6 -
devStageP2, P10VsP6 = devStageP10 - devStageP6, fourweeksVsP10 = devStage4_weeks -
devStageP10, levels = wtDesMat))
## Warning: Renaming (Intercept) to Intercept
## Contrasts
## Levels P2VsE16 P6VsP2 P10VsP6 fourweeksVsP10
## Intercept -1 0 0 0
## devStageP2 1 -1 0 0
## devStageP6 0 1 -1 0
## devStageP10 0 0 1 -1
## devStage4_weeks 0 0 0 1
wtFitCont <- contrasts.fit(wtFit, cont.matrix)
## Warning: row names of contrasts don't match col names of coefficients
wtEbFitCont <- eBayes(wtFitCont)
cutoff <- 1e-04
wtResCont <- decideTests(wtEbFitCont, p.value = cutoff, method = "global")
summary(wtResCont)
## P2VsE16 P6VsP2 P10VsP6 fourweeksVsP10
## -1 29903 0 32 37
## 0 46 29937 29907 29712
## 1 0 12 10 200
hitsNew <- rownames(prDat)[which((wtResCont[, "P2VsE16"] != 0 & wtResCont[,
"P6VsP2"] != 0) & wtResCont[, "P10VsP6"] == 0 & wtResCont[, "fourweeksVsP10"] ==
0)]
makeStripplot(subset(preparedata(hitsNew[1:6]), gType == "wt"))