We will using data aggregation methods to fit linear models on a small dataset comprising a few (say, 9) probes.
library(lattice)
library(plyr)
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
Select 9 random probes from the dataset.
set.seed(1231411)
n <- 9
(luckyGenes <- rownames(prDat)[sample(1:nrow(prDat), size = n)])
## [1] "1457246_at" "1442791_x_at" "1423612_at" "1445125_at"
## [5] "1426780_at" "1429798_s_at" "1447506_at" "1446842_at"
## [9] "1431353_at"
We will need to first write a number of functions to generate the dataframe, make the stripplots and perform the two-sample tests
prepareData <- function(a) {
dat <- prDat[a, ]
dat <- data.frame(gExp = as.vector(t(as.matrix(dat))), gene = factor(rep(rownames(dat),
each = ncol(dat)), levels = rownames(dat)))
dat <- suppressWarnings(data.frame(prDes, dat))
return(dat)
}
makeStripplot <- function(x, ...) {
stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = TRUE,
auto.key = TRUE, type = c("p", "a"), par.settings = simpleTheme(col = c("dodgerblue",
"deeppink1"), pch = 16), grid = TRUE, ...)
}
Run.TTest <- function(t) {
datt <- subset(t, devStage %in% c("P2", "P10"))
tt <- t.test(gExp ~ devStage, datt)
return(c(tt$estimate, tt$statistic, tt$parameter, p.value = tt$p.value))
}
Fit.LM <- function(x) {
fitA <- lm(gExp ~ devStage, data = x, subset = gType == "wt")
return(fitA)
}
inference.testing <- function(o) {
obsDiff <- contrast.matrix %*% coef(o)
estSE <- contrast.matrix %*% vcov(o) %*% t(contrast.matrix)
testStat <- obsDiff/estSE
p.values <- 2 * pt(abs(testStat), df = df.residual(o), lower.tail = FALSE)
c(Estimate = obsDiff, Std_Error = estSE, Statistic = testStat, p.values = p.values)
}
Interaction.2covariates <- function(y) {
oFitBig <- lm(formula = gExp ~ gType * devStage, data = y)
oFitSmall <- lm(formula = gExp ~ gType + devStage, data = y)
anova.interaction <- anova(oFitSmall, oFitBig)
c(p.values = anova.interaction[2, 6])
}
Let's begin the analysis by preparing the dataset of the 9 selected probes and making stripplots for each probe.
makeStripplot(sumDat <- prepareData(luckyGenes))
head(sumDat)
## sidChar sidNum devStage gType gExp gene
## 1 Sample_20 20 E16 wt 10.910 1457246_at
## 2 Sample_21 21 E16 wt 10.850 1457246_at
## 3 Sample_22 22 E16 wt 10.330 1457246_at
## 4 Sample_23 23 E16 wt 10.650 1457246_at
## 5 Sample_16 16 E16 NrlKO 8.721 1457246_at
## 6 Sample_17 17 E16 NrlKO 10.300 1457246_at
Now we will run t-tests across all probes in the dataset comparing P2 vs. P10 expression levels
ddply(sumDat, ~gene, Run.TTest)
## gene mean in group P2 mean in group P10 t df p.value
## 1 1457246_at 9.210 9.763 -1.6226 13.389 0.127968
## 2 1442791_x_at 7.883 7.704 0.8871 11.844 0.392662
## 3 1423612_at 6.239 6.607 -3.5748 11.635 0.004001
## 4 1445125_at 8.801 8.517 2.6568 10.623 0.022918
## 5 1426780_at 7.336 7.561 -1.0043 8.296 0.343627
## 6 1429798_s_at 7.997 7.781 2.0527 9.138 0.069834
## 7 1447506_at 8.030 7.926 1.0391 13.939 0.316457
## 8 1446842_at 8.954 8.878 0.3956 11.568 0.699579
## 9 1431353_at 8.537 8.137 1.3583 10.903 0.201815
Based on the t-test p-values, only the probes “1423612_at” and “1445125_at” had expression levels that were significantly different at P2 vs P10 devStages.
We will also fit linear models for expression of each probe by devStage, using only wt samples
lm.list <- dlply(sumDat, ~gene, Fit.LM)
(lm.coef.df <- ldply(lm.list, coef))
## gene (Intercept) devStageP2 devStageP6 devStageP10
## 1 1457246_at 10.685 -1.85450 -1.49850 -1.3105
## 2 1442791_x_at 7.055 0.75525 0.56225 0.7080
## 3 1423612_at 6.193 0.06025 -0.01450 0.2820
## 4 1445125_at 8.133 0.65925 0.36025 0.4753
## 5 1426780_at 8.036 -0.79900 -0.62650 -0.5987
## 6 1429798_s_at 7.468 0.47600 0.38675 0.4650
## 7 1447506_at 7.785 0.18225 0.05025 0.1285
## 8 1446842_at 8.456 0.39875 0.27400 0.2110
## 9 1431353_at 7.797 0.82225 0.39975 0.4320
## devStage4_weeks
## 1 -0.7707
## 2 0.2430
## 3 0.4003
## 4 0.2273
## 5 -0.3092
## 6 0.2182
## 7 0.2395
## 8 0.1818
## 9 0.1215
We will need to make a contrast matrix in order to infer differences between devStage P2 and 4_weeks effect
contrast.matrix <- matrix(c(0, 1, 0, 0, -1), nrow = 1)
(infer.df <- ldply(lm.list, inference.testing))
## gene Estimate Std_Error Statistic p.values
## 1 1457246_at -1.08375 0.21863 -4.957 1.721e-04
## 2 1442791_x_at 0.51225 0.09938 5.154 1.176e-04
## 3 1423612_at -0.34000 0.02133 -15.941 8.202e-11
## 4 1445125_at 0.43200 0.02924 14.773 2.407e-10
## 5 1426780_at -0.48975 0.10297 -4.756 2.549e-04
## 6 1429798_s_at 0.25775 0.02875 8.966 2.058e-07
## 7 1447506_at -0.05725 0.01506 -3.801 1.739e-03
## 8 1446842_at 0.21700 0.09722 2.232 4.129e-02
## 9 1431353_at 0.70075 0.21716 3.227 5.644e-03
Given that the two-sided p-values are rather large (ie >0.05), we can conclude that there is no true difference in mean expression at P2 and 4 weeks in wildtype mice for these 9 probes.
We will now fit a linear model with two categorical covariates (gType and devStage) and test if their interaction is significant using anova
ddply(sumDat, ~gene, Interaction.2covariates)
## gene p.values
## 1 1457246_at 0.08581
## 2 1442791_x_at 0.57786
## 3 1423612_at 0.03346
## 4 1445125_at 0.50339
## 5 1426780_at 0.44001
## 6 1429798_s_at 0.36282
## 7 1447506_at 0.78203
## 8 1446842_at 0.93564
## 9 1431353_at 0.53866
Based on the high p-values from the anova, we can conclude that there is no interaction of gType and devStage for most of these probes except for 1423612_at (p-value = 0.033).