prDat <- read.table("../photoRecDataset/dataClean/GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("../photoRecDataset/dataClean/GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
library(lattice)
Helpful info and tips:
- One row of a data.frame is still a data.frame (!), which is a list, remember? unlist() and as.vector() can be handy when converting a list or array to an atomic vector.
- Recycling can help you.
(luckyGenes <- c("1419655_at", "1438815_at"))
## [1] "1419655_at" "1438815_at"
gDat <- subset(prDat, rownames(prDat) %in% luckyGenes)
str(prDat)
## 'data.frame': 29949 obs. of 39 variables:
## $ Sample_11: num 7.42 9.83 10 8.6 8.43 ...
## $ Sample_12: num 7.11 9.71 9.43 8.43 8.5 ...
## $ Sample_2 : num 7.35 9.66 9.91 8.4 8.37 ...
## $ Sample_9 : num 7.32 9.8 9.85 8.4 8.46 ...
## $ Sample_36: num 7.25 9.66 9.51 8.49 8.42 ...
## $ Sample_37: num 7.04 8.38 9.21 8.75 8.26 ...
## $ Sample_38: num 7.37 9.44 9.48 8.49 8.34 ...
## $ Sample_39: num 7.13 8.73 9.53 8.65 8.28 ...
## $ Sample_16: num 7.38 7.64 8.42 8.36 8.51 ...
## $ Sample_17: num 7.34 10.03 10.24 8.37 8.89 ...
## $ Sample_6 : num 7.24 9.71 10.17 8.84 8.54 ...
## $ Sample_20: num 7.24 9.48 10.01 8.36 8.59 ...
## $ Sample_21: num 7.41 10.02 10.04 8.37 8.62 ...
## $ Sample_22: num 7.17 9.85 9.91 8.4 8.52 ...
## $ Sample_23: num 7.07 10.13 9.91 8.49 8.64 ...
## $ Sample_13: num 7.33 9.33 9.75 8.43 8.48 ...
## $ Sample_15: num 7.12 9.15 9.84 8.32 8.21 ...
## $ Sample_18: num 7.21 9.49 10.03 8.55 8.5 ...
## $ Sample_19: num 7.21 9.21 9.59 8.31 8.31 ...
## $ Sample_32: num 7.54 9.53 9.92 8.78 8.57 ...
## $ Sample_33: num 7.01 8.97 9.22 8.42 8.53 ...
## $ Sample_34: num 6.81 8.83 9.39 8.1 8.32 ...
## $ Sample_35: num 7.15 9.22 10.06 8.35 8.45 ...
## $ Sample_14: num 7.09 9.56 9.88 8.57 8.59 ...
## $ Sample_3 : num 7.16 9.55 9.84 8.33 8.5 ...
## $ Sample_5 : num 7.08 9.32 9.24 8.3 8.48 ...
## $ Sample_8 : num 7.11 8.24 9.13 8.13 8.33 ...
## $ Sample_24: num 7.11 9.75 9.39 8.37 8.36 ...
## $ Sample_25: num 7.19 9.16 10.11 8.2 8.5 ...
## $ Sample_26: num 7.18 9.49 9.41 8.73 8.39 ...
## $ Sample_27: num 7.21 8.64 9.43 8.33 8.43 ...
## $ Sample_1 : num 7.15 9.87 9.68 8.28 8.5 ...
## $ Sample_10: num 7.28 10.29 9.91 8.42 8.68 ...
## $ Sample_4 : num 7.18 10.16 9.72 8.32 8.5 ...
## $ Sample_7 : num 7.15 8.95 9.3 8.17 8.41 ...
## $ Sample_28: num 7.34 8.27 9.47 8.38 8.4 ...
## $ Sample_29: num 7.66 10.03 9.88 8.56 8.69 ...
## $ Sample_30: num 7.26 9.27 10.54 8.15 8.55 ...
## $ Sample_31: num 7.31 9.26 10.1 8.37 8.49 ...
gDat <- data.frame(gExp = as.vector(t(as.matrix(gDat))), gene = factor(rep(rownames(gDat),
each = ncol(gDat)), levels = luckyGenes))
str(gDat)
## 'data.frame': 78 obs. of 2 variables:
## $ gExp: num 10.47 9.9 10.41 10.85 8.95 ...
## $ gene: Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
gDat <- data.frame(prDes, gDat)
str(gDat)
## 'data.frame': 78 obs. of 6 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ gExp : num 10.47 9.9 10.41 10.85 8.95 ...
## $ gene : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
head(gDat)
## sidChar sidNum devStage gType gExp gene
## 1 Sample_11 11 4_weeks NrlKO 10.470 1419655_at
## 2 Sample_12 12 4_weeks NrlKO 9.897 1419655_at
## 3 Sample_2 2 4_weeks NrlKO 10.410 1419655_at
## 4 Sample_9 9 4_weeks NrlKO 10.850 1419655_at
## 5 Sample_36 36 4_weeks wt 8.953 1419655_at
## 6 Sample_37 37 4_weeks wt 7.793 1419655_at
prepDat <- function(x) {
gDat <- subset(prDat, rownames(prDat) %in% x)
gDat <- data.frame(gExp = as.vector(t(as.matrix(gDat))), gene = factor(rep(rownames(gDat),
each = ncol(gDat)), levels = x))
gDat <- data.frame(prDes, gDat)
}
gDat <- prepDat(luckyGenes)
str(gDat)
## 'data.frame': 78 obs. of 6 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ gExp : num 10.47 9.9 10.41 10.85 8.95 ...
## $ gene : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
head(gDat)
## sidChar sidNum devStage gType gExp gene
## 1 Sample_11 11 4_weeks NrlKO 10.470 1419655_at
## 2 Sample_12 12 4_weeks NrlKO 9.897 1419655_at
## 3 Sample_2 2 4_weeks NrlKO 10.410 1419655_at
## 4 Sample_9 9 4_weeks NrlKO 10.850 1419655_at
## 5 Sample_36 36 4_weeks wt 8.953 1419655_at
## 6 Sample_37 37 4_weeks wt 7.793 1419655_at
tail(gDat)
## sidChar sidNum devStage gType gExp gene
## 73 Sample_4 4 P6 NrlKO 10.440 1438815_at
## 74 Sample_7 7 P6 NrlKO 10.100 1438815_at
## 75 Sample_28 28 P6 wt 9.910 1438815_at
## 76 Sample_29 29 P6 wt 10.140 1438815_at
## 77 Sample_30 30 P6 wt 10.090 1438815_at
## 78 Sample_31 31 P6 wt 9.827 1438815_at
library(lattice)
stripplot(gExp ~ devStage | gene, gDat, group = gType, jitter.data = T, auto.key = T,
type = c("p", "a"), grid = T)
test <- prepDat("1415672_at")
str(test)
## 'data.frame': 39 obs. of 6 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ gExp : num 10 9.43 9.91 9.85 9.51 ...
## $ gene : Factor w/ 1 level "1415672_at": 1 1 1 1 1 1 1 1 1 1 ...
test <- prepDat(c("1415672_at", "1415675_at", "1415674_a_at"))
str(test)
## 'data.frame': 117 obs. of 6 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ gExp : num 10 9.43 9.91 9.85 9.51 ...
## $ gene : Factor w/ 3 levels "1415672_at","1415675_at",..: 1 1 1 1 1 1 1 1 1 1 ...
Helpful info and tips:
- When a plot does not appear and you think it should, try surrounding it with print(). This may be necessary inside functions, loops, etc.
- If you can't or don't want to anticipate all the arguments you'd like to pass to, e.g. stripplot(), use the special function argument ... to leave yourself flexibility.
makeStripplot <- function(x, ...) {
library(lattice)
stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = T, auto.key = T,
grid = T, type = c("a", "p"), ...)
}
makeStripplot(gDat)
makeStripplot(gDat, xlab = "Developmental Stage")
makeStripplot(gDat, xlab = "Developmental Stage", pch = 17)
makeStripplot(gDat, xlab = "Developmental Stage", pch = 17, cex = 3)
makeStripplot(newDat <- prepDat("1460739_at"))
makeStripplot(newDat <- prepDat(c("1460739_at", "1460741_x_at")))
Let's test for a difference in expected gene expression for probeset “1456341_a_at” at developmental stage P2 vs. 4 weeks post-natal (ignoring genotype, i.e. lump the wild types and knockouts together). Let's assume a common variance in the two groups.
makeStripplot(prepDat("1456341_a_at"))
gDat <- prepDat("1456341_a_at")
str(gDat)
## 'data.frame': 39 obs. of 6 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ gExp : num 10.26 9.7 10.08 9.28 10.39 ...
## $ gene : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...
head(gDat)
## sidChar sidNum devStage gType gExp gene
## 1 Sample_11 11 4_weeks NrlKO 10.260 1456341_a_at
## 2 Sample_12 12 4_weeks NrlKO 9.704 1456341_a_at
## 3 Sample_2 2 4_weeks NrlKO 10.080 1456341_a_at
## 4 Sample_9 9 4_weeks NrlKO 9.281 1456341_a_at
## 5 Sample_36 36 4_weeks wt 10.390 1456341_a_at
## 6 Sample_37 37 4_weeks wt 9.798 1456341_a_at
t.test(gExp ~ devStage, subset(gDat, devStage == "P2" | devStage == "4_weeks"),
equal.var = T)
##
## Welch Two Sample t-test
##
## data: gExp by devStage
## t = -18.84, df = 13.98, p-value = 2.477e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.078 -3.244
## sample estimates:
## mean in group P2 mean in group 4_weeks
## 6.326 9.987
In other words, do “one-way ANOVA”. Focus on probeset “1438786_a_at”.
gDat <- prepDat("1438786_a_at")
str(gDat)
## 'data.frame': 39 obs. of 6 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ gExp : num 8.63 7.47 8.16 8.29 9.18 ...
## $ gene : Factor w/ 1 level "1438786_a_at": 1 1 1 1 1 1 1 1 1 1 ...
library(lattice)
makeStripplot(gDat)
gFit <- lm(gExp ~ devStage, gDat, gType == "wt")
summary(gFit)
##
## Call:
## lm(formula = gExp ~ devStage, data = gDat, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1565 -0.4400 0.0288 0.4915 1.2065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.523 0.379 22.50 5.7e-13 ***
## devStageP2 -1.450 0.536 -2.71 0.016 *
## devStageP6 -0.107 0.536 -0.20 0.845
## devStageP10 -1.201 0.536 -2.24 0.040 *
## devStage4_weeks 0.081 0.536 0.15 0.882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.758 on 15 degrees of freedom
## Multiple R-squared: 0.497, Adjusted R-squared: 0.363
## F-statistic: 3.71 on 4 and 15 DF, p-value: 0.0272
does the intercept look plausible given the plot? - yes (relative to reference - E16) How about the devStageP2 effect, etc.? - there is some effect at dev. stages P2 and P10 (relative to E16)
The “W” shape of the expression profile for “1438786_a_at” means that the expression values for developmental stages P2 and P10 are quite similar. We could formally test whether the P2 and P10 effects are equal or, equivalently, whether their difference is equal to zero.
First extract the parameter estimates from the linear model you fit above. You did save the fit, right? If not, edit your code to do so and re-run that bit. Hint: the coef() function will pull parameter estimates out of a wide array of fitted model objects in R.
coef(gFit)
## (Intercept) devStageP2 devStageP6 devStageP10
## 8.5227 -1.4503 -0.1068 -1.2013
## devStage4_weeks
## 0.0810
Now you need to construct the contrast matrix to form the difference between the P2 and P10 effects. I called mine contMat. Hint: it's OK for a matrix to have just one row.
contMat <- matrix(c(0, 1, 0, -1, 0), nrow = 1)
(obsDif <- contMat %*% coef(gFit))
## [,1]
## [1,] -0.249
Let's check that this really is the observed difference in sample mean for the wild type mice, P2 vs. P10.
(sampMeans <- aggregate(gExp ~ devStage, gDat, FUN = mean, subset = gType ==
"wt"))
## devStage gExp
## 1 E16 8.523
## 2 P2 7.072
## 3 P6 8.416
## 4 P10 7.322
## 5 4_weeks 8.604
with(sampMeans, gExp[devStage == "P2"] - gExp[devStage == "P10"])
## [1] -0.249
Now we need the (estimated) standard error for our contrast. The variance-covariance matrix of the parameters estimated in the original model can be obtained with vcov() and is equal to (XT X)−1 σ2.
vcov(gFit)
## (Intercept) devStageP2 devStageP6 devStageP10
## (Intercept) 0.1435 -0.1435 -0.1435 -0.1435
## devStageP2 -0.1435 0.2870 0.1435 0.1435
## devStageP6 -0.1435 0.1435 0.2870 0.1435
## devStageP10 -0.1435 0.1435 0.1435 0.2870
## devStage4_weeks -0.1435 0.1435 0.1435 0.1435
## devStage4_weeks
## (Intercept) -0.1435
## devStageP2 0.1435
## devStageP6 0.1435
## devStageP10 0.1435
## devStage4_weeks 0.2870
Let's check that this is really true. If we take the diagonal elements and take their square root, they should be exactly equal to the standard errors reported for out original model. Are they?
summary(gFit)$coefficients[, "Std. Error"]
## (Intercept) devStageP2 devStageP6 devStageP10
## 0.3788 0.5357 0.5357 0.5357
## devStage4_weeks
## 0.5357
sqrt(diag(vcov(gFit)))
## (Intercept) devStageP2 devStageP6 devStageP10
## 0.3788 0.5357 0.5357 0.5357
## devStage4_weeks
## 0.5357
summary(gFit)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5227 0.3788 22.4979 5.697e-13
## devStageP2 -1.4503 0.5357 -2.7070 1.623e-02
## devStageP6 -0.1068 0.5357 -0.1993 8.447e-01
## devStageP10 -1.2013 0.5357 -2.2422 4.049e-02
## devStage4_weeks 0.0810 0.5357 0.1512 8.818e-01
Returning to our test of the P2 vs. P10 contrast, recall that the variance-covariance matrix of a contrast obtained as CαHat is C(XT X)−1 CT σ2.
(estSE <- contMat %*% vcov(gFit) %*% t(contMat))
## [,1]
## [1,] 0.287
Now we form a test statistic as an observed effect divided by its estimated standard error:
(testStat <- obsDif/estSE)
## [,1]
## [1,] -0.8676
Under the null hypothesis that the contrast equals zero, i.e. there is no true difference in mean for expression at P2 and P10 in wild type mice for this gene, the test statistic has a t distribution with n−p=20−5=15 degrees of freedom. We compute a two-sided p-value and we're done.
2 * pt(abs(testStat), df = df.residual(gFit), lower.tail = FALSE)
## [,1]
## [1,] 0.3993
Not surprisingly, this p-value is rather large and we conclude there is no difference.
Let's focus on probeset “1448690_at”. Use your functions to prepare the data and plot it.
oDat <- prepDat("1448690_at")
makeStripplot(oDat)
oFitBig <- lm(gExp ~ gType + devStage + gType * devStage, oDat)
summary(oFitBig)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.67800 0.3987 21.76755 1.634e-19
## gTypeNrlKO -0.84233 0.6090 -1.38320 1.772e-01
## devStageP2 -1.02900 0.5638 -1.82512 7.830e-02
## devStageP6 -1.91450 0.5638 -3.39571 2.003e-03
## devStageP10 -2.19325 0.5638 -3.89012 5.387e-04
## devStage4_weeks -2.08200 0.5638 -3.69280 9.149e-04
## gTypeNrlKO:devStageP2 0.06983 0.8299 0.08415 9.335e-01
## gTypeNrlKO:devStageP6 0.16533 0.8299 0.19922 8.435e-01
## gTypeNrlKO:devStageP10 0.22583 0.8299 0.27212 7.875e-01
## gTypeNrlKO:devStage4_weeks 0.64608 0.8299 0.77852 4.426e-01
Is the intercept plausible? - Yes How about the various effects? Do the ones with small p-values, e.g. meeting a conventional cut-off of 0.05, look 'real' to you? - Yes, looking at the plot there is no gType effect (paralel lines). There is a visible difference in gene expression between stages P6, P10, and 4weeks as compared to E16. Although there is also a difference between P2 and E16 it is not statistically significant.
oFitSmall <- lm(gExp ~ devStage + gType, oDat)
summary(oFitSmall)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5803 0.3046 28.165 1.177e-24
## devStageP2 -1.0104 0.3924 -2.575 1.470e-02
## devStageP6 -1.8481 0.3924 -4.710 4.328e-05
## devStageP10 -2.0966 0.3924 -5.343 6.703e-06
## devStage4_weeks -1.7752 0.3924 -4.524 7.444e-05
## gTypeNrlKO -0.6144 0.2430 -2.528 1.643e-02
From the plot, the case for interaction seems very weak. This can be assessed with an F test that essentially looks at the reduction in the sum of squared residuals due to using a larger, more complicated model and determines if it is “big enough” given the number of additional parameters used. Recall the anova() function can take two fitted models, one nested within the other, and perform this test. (anova() can also be used on a single model to assess significance of terms, but remember the problem with the standard anova() function and unbalanced data. See references given in lecture for remedies.)
anova(oFitSmall, oFitBig)
## Analysis of Variance Table
##
## Model 1: gExp ~ devStage + gType
## Model 2: gExp ~ gType + devStage + gType * devStage
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 18.9
## 2 29 18.4 4 0.497 0.2 0.94
eDat <- prepDat("1429225_at")
makeStripplot(eDat)
From the plot it looks like there may be interaction between gType and devStage.
eFitBig <- lm(gExp ~ gType * devStage, eDat)
summary(eFitBig)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.3125 0.2617 27.9391 1.619e-22
## gTypeNrlKO -0.2602 0.3998 -0.6507 5.203e-01
## devStageP2 -1.1582 0.3701 -3.1292 3.973e-03
## devStageP6 -1.2495 0.3701 -3.3757 2.110e-03
## devStageP10 -1.0717 0.3701 -2.8955 7.125e-03
## devStage4_weeks -0.9088 0.3701 -2.4551 2.032e-02
## gTypeNrlKO:devStageP2 0.2804 0.5448 0.5147 6.107e-01
## gTypeNrlKO:devStageP6 0.7589 0.5448 1.3929 1.742e-01
## gTypeNrlKO:devStageP10 1.7914 0.5448 3.2880 2.648e-03
## gTypeNrlKO:devStage4_weeks 2.2389 0.5448 4.1094 2.970e-04
It looks like there is an interaction between gType and devStage and P6, P10, and 4weeks
eFitSmall <- lm(gExp ~ gType + devStage, eDat)
summary(eFitSmall)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8652 0.2722 25.2199 3.848e-23
## gTypeNrlKO 0.7836 0.2172 3.6085 1.007e-03
## devStageP2 -1.0926 0.3506 -3.1161 3.780e-03
## devStageP6 -0.9446 0.3506 -2.6940 1.101e-02
## devStageP10 -0.2506 0.3506 -0.7147 4.798e-01
## devStage4_weeks 0.1362 0.3506 0.3883 7.003e-01
anova(eFitSmall, eFitBig)
## Analysis of Variance Table
##
## Model 1: gExp ~ gType + devStage
## Model 2: gExp ~ gType * devStage
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 15.12
## 2 29 7.95 4 7.17 6.54 7e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The small p-value suggests that the Big model better fits the data. The interaction term is statistically significant.
We wrote functions to prepare and plot data for more than 1 gene. But when we started fitting models and conducting tests, we only worked with 1 gene at a time. Can you use data aggregation strategies from last week to do some of the same work for small sets of genes?
(luckyGenes <- c("1419655_at", "1438815_at", "1456341_a_at", "1448690_at"))
## [1] "1419655_at" "1438815_at" "1456341_a_at" "1448690_at"
nDat <- prepDat(luckyGenes)
makeStripplot(nDat)
library(plyr)
tRes <- dlply(nDat, ~gene, function(x) t.test(gExp ~ devStage, subset(nDat,
devStage == "P2" | devStage == "4_weeks")))
# get the p values for the t-tests
t.pVals <- ddply(nDat, ~gene, function(x) {
xt <- t.test(gExp ~ devStage, subset(x, devStage == "P2" | devStage == "4_weeks"))
round(c(t.pVal = xt$p.value), 4)
})
t.pVals
## gene t.pVal
## 1 1419655_at 0.1561
## 2 1438815_at 0.0000
## 3 1456341_a_at 0.0000
## 4 1448690_at 0.1671
nFit <- dlply(nDat, ~gene, function(x) lm(gExp ~ devStage, x, gType == "wt"))
str(nFit, max.level = 0)
## List of 4
## - attr(*, "split_type")= chr "data.frame"
## - attr(*, "split_labels")='data.frame': 4 obs. of 1 variable:
names(nFit)
## [1] "1419655_at" "1438815_at" "1456341_a_at" "1448690_at"
nFit$"1419655_at"
##
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
##
## Coefficients:
## (Intercept) devStageP2 devStageP6 devStageP10
## 10.75 -1.05 -1.60 -2.02
## devStage4_weeks
## -2.39
summary(nFit$"1438815_at")
##
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4445 -0.1247 -0.0514 0.1509 0.5052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.921 0.121 73.93 < 2e-16 ***
## devStageP2 0.744 0.171 4.36 0.00056 ***
## devStageP6 1.071 0.171 6.27 1.5e-05 ***
## devStageP10 0.737 0.171 4.32 0.00061 ***
## devStage4_weeks -0.681 0.171 -3.99 0.00118 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.241 on 15 degrees of freedom
## Multiple R-squared: 0.902, Adjusted R-squared: 0.876
## F-statistic: 34.4 on 4 and 15 DF, p-value: 2.14e-07
coef(nFit$"1456341_a_at")
## (Intercept) devStageP2 devStageP6 devStageP10
## 7.2100 -0.7468 1.3692 2.4277
## devStage4_weeks
## 2.9320
summary(nFit$"1448690_at")$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.678 0.3856 22.507 5.666e-13
## devStageP2 -1.029 0.5453 -1.887 7.866e-02
## devStageP6 -1.915 0.5453 -3.511 3.152e-03
## devStageP10 -2.193 0.5453 -4.022 1.108e-03
## devStage4_weeks -2.082 0.5453 -3.818 1.680e-03
lm.coefs <- ddply(nDat, ~gene, function(x) {
xlm <- lm(gExp ~ devStage, x, gType == "wt")
round(c(coef = xlm$coefficients), 4)
})
lm.coefs
## gene coef.(Intercept) coef.devStageP2 coef.devStageP6
## 1 1419655_at 10.755 -1.0540 -1.598
## 2 1438815_at 8.921 0.7435 1.071
## 3 1456341_a_at 7.210 -0.7468 1.369
## 4 1448690_at 8.678 -1.0290 -1.915
## coef.devStageP10 coef.devStage4_weeks
## 1 -2.016 -2.3895
## 2 0.737 -0.6807
## 3 2.428 2.9320
## 4 -2.193 -2.0820
lm.se <- ddply(nDat, ~gene, function(x) {
xlm <- lm(gExp ~ devStage, x, gType == "wt")
round(c(se = summary(xlm)$coefficients[, "Std. Error"]), 4)
})
lm.tVal <- ddply(nDat, ~gene, function(x) {
xlm <- lm(gExp ~ devStage, x, gType == "wt")
round(c(tVal = summary(xlm)$coefficients[, "t value"]), 4)
})
lm.tVal
## gene tVal.(Intercept) tVal.devStageP2 tVal.devStageP6
## 1 1419655_at 40.52 -2.808 -4.257
## 2 1438815_at 73.93 4.357 6.273
## 3 1456341_a_at 20.62 -1.510 2.769
## 4 1448690_at 22.51 -1.887 -3.511
## tVal.devStageP10 tVal.devStage4_weeks
## 1 -5.371 -6.366
## 2 4.319 -3.989
## 3 4.909 5.929
## 4 -4.022 -3.818
lm.pVals <- ddply(nDat, ~gene, function(x) {
xlm <- lm(gExp ~ devStage, x, gType == "wt")
round(c(pVal = summary(xlm)$coefficients[, "Pr(>|t|)"]), 6)
})
lm.pVals
## gene pVal.(Intercept) pVal.devStageP2 pVal.devStageP6
## 1 1419655_at 0 0.013247 0.000689
## 2 1438815_at 0 0.000563 0.000015
## 3 1456341_a_at 0 0.151825 0.014336
## 4 1448690_at 0 0.078657 0.003152
## pVal.devStageP10 pVal.devStage4_weeks
## 1 0.000078 0.000013
## 2 0.000608 0.001185
## 3 0.000189 0.000028
## 4 0.001108 0.001680
lm.pVals <- ddply(nDat, ~gene, function(x) {
xlm <- lm(gExp ~ devStage, x, gType == "wt")
c(pVal = summary(xlm)$coefficients[, "Pr(>|t|)"])
})
lm.pVals
## gene pVal.(Intercept) pVal.devStageP2 pVal.devStageP6
## 1 1419655_at 9.649e-17 0.0132472 6.888e-04
## 2 1438815_at 1.220e-20 0.0005635 1.493e-05
## 3 1456341_a_at 2.029e-12 0.1518253 1.434e-02
## 4 1448690_at 5.666e-13 0.0786567 3.152e-03
## pVal.devStageP10 pVal.devStage4_weeks
## 1 7.788e-05 1.268e-05
## 2 6.082e-04 1.185e-03
## 3 1.890e-04 2.768e-05
## 4 1.108e-03 1.680e-03
nFitBig <- ddply(nDat, ~gene, function(x) {
xlm <- lm(gExp ~ devStage * gType, x)
data.frame(terms = rep(names(xlm$coefficients), levels = ncol(xlm$coefficients)),
summary(xlm)$coef)
})
nFitBig
## gene terms Estimate Std..Error t.value
## 1 1419655_at (Intercept) 10.75500 0.2311 46.54420
## 2 1419655_at devStageP2 -1.05400 0.3268 -3.22538
## 3 1419655_at devStageP6 -1.59800 0.3268 -4.89009
## 4 1419655_at devStageP10 -2.01600 0.3268 -6.16923
## 5 1419655_at devStage4_weeks -2.38950 0.3268 -7.31219
## 6 1419655_at gTypeNrlKO -0.29967 0.3530 -0.84900
## 7 1419655_at devStageP2:gTypeNrlKO 1.09617 0.4810 2.27888
## 8 1419655_at devStageP6:gTypeNrlKO 1.36117 0.4810 2.82980
## 9 1419655_at devStageP10:gTypeNrlKO 2.27817 0.4810 4.73620
## 10 1419655_at devStage4_weeks:gTypeNrlKO 2.34092 0.4810 4.86665
## 11 1438815_at (Intercept) 8.92125 0.1244 71.74211
## 12 1438815_at devStageP2 0.74350 0.1759 4.22780
## 13 1438815_at devStageP6 1.07050 0.1759 6.08724
## 14 1438815_at devStageP10 0.73700 0.1759 4.19084
## 15 1438815_at devStage4_weeks -0.68075 0.1759 -3.87098
## 16 1438815_at gTypeNrlKO 0.24475 0.1900 1.28850
## 17 1438815_at devStageP2:gTypeNrlKO 0.01950 0.2589 0.07533
## 18 1438815_at devStageP6:gTypeNrlKO 0.19350 0.2589 0.74751
## 19 1438815_at devStageP10:gTypeNrlKO 0.19875 0.2589 0.76779
## 20 1438815_at devStage4_weeks:gTypeNrlKO 0.20575 0.2589 0.79484
## 21 1456341_a_at (Intercept) 7.21000 0.3335 21.61953
## 22 1456341_a_at devStageP2 -0.74675 0.4716 -1.58333
## 23 1456341_a_at devStageP6 1.36925 0.4716 2.90321
## 24 1456341_a_at devStageP10 2.42775 0.4716 5.14754
## 25 1456341_a_at devStage4_weeks 2.93200 0.4716 6.21670
## 26 1456341_a_at gTypeNrlKO -0.16133 0.5094 -0.31670
## 27 1456341_a_at devStageP2:gTypeNrlKO -0.11367 0.6942 -0.16373
## 28 1456341_a_at devStageP6:gTypeNrlKO 0.58608 0.6942 0.84423
## 29 1456341_a_at devStageP10:gTypeNrlKO 0.40983 0.6942 0.59035
## 30 1456341_a_at devStage4_weeks:gTypeNrlKO -0.14942 0.6942 -0.21523
## 31 1448690_at (Intercept) 8.67800 0.3987 21.76755
## 32 1448690_at devStageP2 -1.02900 0.5638 -1.82512
## 33 1448690_at devStageP6 -1.91450 0.5638 -3.39571
## 34 1448690_at devStageP10 -2.19325 0.5638 -3.89012
## 35 1448690_at devStage4_weeks -2.08200 0.5638 -3.69280
## 36 1448690_at gTypeNrlKO -0.84233 0.6090 -1.38320
## 37 1448690_at devStageP2:gTypeNrlKO 0.06983 0.8299 0.08415
## 38 1448690_at devStageP6:gTypeNrlKO 0.16533 0.8299 0.19922
## 39 1448690_at devStageP10:gTypeNrlKO 0.22583 0.8299 0.27212
## 40 1448690_at devStage4_weeks:gTypeNrlKO 0.64608 0.8299 0.77852
## Pr...t..
## 1 8.368e-29
## 2 3.110e-03
## 3 3.443e-05
## 4 1.002e-06
## 5 4.703e-08
## 6 4.028e-01
## 7 3.022e-02
## 8 8.369e-03
## 9 5.278e-05
## 10 3.675e-05
## 11 3.312e-34
## 12 2.148e-04
## 13 1.254e-06
## 14 2.377e-04
## 15 5.673e-04
## 16 2.078e-01
## 17 9.405e-01
## 18 4.608e-01
## 19 4.488e-01
## 20 4.332e-01
## 21 1.970e-19
## 22 1.242e-01
## 23 6.991e-03
## 24 1.683e-05
## 25 8.805e-07
## 26 7.537e-01
## 27 8.711e-01
## 28 4.055e-01
## 29 5.595e-01
## 30 8.311e-01
## 31 1.634e-19
## 32 7.830e-02
## 33 2.003e-03
## 34 5.387e-04
## 35 9.149e-04
## 36 1.772e-01
## 37 9.335e-01
## 38 8.435e-01
## 39 7.875e-01
## 40 4.426e-01
nFitSmall <- ddply(nDat, ~gene, function(x) {
xlm <- lm(gExp ~ devStage + gType, x)
data.frame(terms = rep(names(xlm$coefficients), levels = ncol(xlm$coefficients)),
summary(xlm)$coef)
})
nFitSmall
## gene terms Estimate Std..Error t.value Pr...t..
## 1 1419655_at (Intercept) 10.13061 0.2527 40.09569 1.416e-29
## 2 1419655_at devStageP2 -0.60998 0.3254 -1.87432 6.976e-02
## 3 1419655_at devStageP6 -1.02148 0.3254 -3.13876 3.563e-03
## 4 1419655_at devStageP10 -0.98098 0.3254 -3.01431 4.923e-03
## 5 1419655_at devStage4_weeks -1.32311 0.3254 -4.06557 2.792e-04
## 6 1419655_at gTypeNrlKO 1.15724 0.2016 5.74161 2.066e-06
## 7 1438815_at (Intercept) 8.86676 0.0959 92.46083 1.956e-41
## 8 1438815_at devStageP2 0.74417 0.1235 6.02461 8.973e-07
## 9 1438815_at devStageP6 1.15817 0.1235 9.37626 7.919e-11
## 10 1438815_at devStageP10 0.82729 0.1235 6.69757 1.257e-07
## 11 1438815_at devStage4_weeks -0.58696 0.1235 -4.75185 3.825e-05
## 12 1438815_at gTypeNrlKO 0.37188 0.0765 4.86122 2.775e-05
## 13 1456341_a_at (Intercept) 7.14534 0.2598 27.49879 2.513e-24
## 14 1456341_a_at devStageP2 -0.81436 0.3347 -2.43317 2.055e-02
## 15 1456341_a_at devStageP6 1.65151 0.3347 4.93444 2.237e-05
## 16 1456341_a_at devStageP10 2.62189 0.3347 7.83376 4.981e-09
## 17 1456341_a_at devStage4_weeks 2.84651 0.3347 8.50490 7.919e-10
## 18 1456341_a_at gTypeNrlKO -0.01046 0.2073 -0.05044 9.601e-01
## 19 1448690_at (Intercept) 8.58032 0.3046 28.16486 1.177e-24
## 20 1448690_at devStageP2 -1.01036 0.3924 -2.57482 1.470e-02
## 21 1448690_at devStageP6 -1.84811 0.3924 -4.70975 4.328e-05
## 22 1448690_at devStageP10 -2.09661 0.3924 -5.34303 6.703e-06
## 23 1448690_at devStage4_weeks -1.77524 0.3924 -4.52404 7.444e-05
## 24 1448690_at gTypeNrlKO -0.61440 0.2430 -2.52817 1.643e-02
compModels <- ddply(nDat, ~gene, function(x) {
lmBig <- lm(gExp ~ devStage * gType, x)
lmSmall <- lm(gExp ~ devStage + gType, x)
anova(lmSmall, lmBig)
})
compModels
## gene Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1419655_at 33 13.023 NA NA NA NA
## 2 1419655_at 29 6.194 4 6.82919 7.9939 0.0001798
## 3 1438815_at 33 1.876 NA NA NA NA
## 4 1438815_at 29 1.794 4 0.08231 0.3327 0.8536987
## 5 1456341_a_at 33 13.774 NA NA NA NA
## 6 1456341_a_at 29 12.901 4 0.87227 0.4902 0.7428873
## 7 1448690_at 33 18.933 NA NA NA NA
## 8 1448690_at 29 18.436 4 0.49660 0.1953 0.9388662
Factors that take on many levels can be unwieldy to deal with … do you care about the effect of each level and all of it’s potential interactions? or do you only care about the factor in a big picture way?
If it represents something like time or dose or temperature … factor treatment makes it awkward to pull out natural classes of “hits”, e.g. things that go up.
Consider making a quantitative covariate, age in days, and use that to explain changes in gene expression.
library(car)
prDes$age <- recode(prDes$devStage, "'E16'=-2; 'P2'=2; 'P6'=6; 'P10'=10; '4_weeks'=28",
as.factor.result = F)
str(prDes)
## 'data.frame': 39 obs. of 5 variables:
## $ sidChar : chr "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
## $ sidNum : num 11 12 2 9 36 37 38 39 16 17 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
## $ age : num 28 28 28 28 28 28 28 28 -2 -2 ...
head(prDes)
## sidChar sidNum devStage gType age
## 1 Sample_11 11 4_weeks NrlKO 28
## 2 Sample_12 12 4_weeks NrlKO 28
## 3 Sample_2 2 4_weeks NrlKO 28
## 4 Sample_9 9 4_weeks NrlKO 28
## 5 Sample_36 36 4_weeks wt 28
## 6 Sample_37 37 4_weeks wt 28
keepers <- c("1427275_at", "1441811_x_at", "1456341_a_at")
qDat <- prDat[keepers, ]
qDat <- data.frame(gExp = as.vector(t(as.matrix(qDat))), gene = factor(rep(rownames(qDat),
each = ncol(qDat)), levels = keepers))
qDat <- data.frame(prDes, qDat)
library(lattice)
stripplot(gExp ~ devStage | gene, qDat, subset = gType == "wt", xlab = "devStage",
type = c("a", "p"))
xyplot(gExp ~ age | gene, qDat, subset = gType == "wt", xlab = "age", type = c("a",
"p"))
xyplot(gExp ~ age | gene, qDat, subset = gType == "wt", xlab = "age", type = c("p",
"r"))
linear model looks ok for the middle probe but not so much for the others
Y =α_0 +α_1*x + ε - simple linear regression
library(plyr)
linFit <- dlply(qDat, ~gene, function(x) lm(gExp ~ age, x, subset = gType ==
"wt"))
summary(linFit[["1441811_x_at"]])
##
## Call:
## lm(formula = gExp ~ age, data = x, subset = gType == "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.551 -0.375 -0.084 0.310 0.868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.07337 0.13312 60.65 < 2e-16 ***
## age -0.06818 0.00977 -6.98 1.6e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.454 on 18 degrees of freedom
## Multiple R-squared: 0.73, Adjusted R-squared: 0.715
## F-statistic: 48.7 on 1 and 18 DF, p-value: 1.62e-06
summary(linFit[["1427275_at"]])
##
## Call:
## lm(formula = gExp ~ age, data = x, subset = gType == "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.574 -0.665 -0.124 0.582 1.593
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5397 0.2921 29.23 <2e-16 ***
## age -0.0188 0.0214 -0.88 0.39
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.997 on 18 degrees of freedom
## Multiple R-squared: 0.0409, Adjusted R-squared: -0.0123
## F-statistic: 0.769 on 1 and 18 DF, p-value: 0.392
summary(linFit[["1456341_a_at"]])
##
## Call:
## lm(formula = gExp ~ age, data = x, subset = gType == "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.658 -0.626 -0.159 0.342 2.259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.4228 0.3039 24.43 3.0e-15 ***
## age 0.1118 0.0223 5.01 9.1e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.04 on 18 degrees of freedom
## Multiple R-squared: 0.583, Adjusted R-squared: 0.559
## F-statistic: 25.1 on 1 and 18 DF, p-value: 9.05e-05
Y =α_0 +α_1*x +α_2*x2 + ε - polynomial regression
NOTE: This is a linear model, because it is linear in the alphas. It is easy but wrong to focus on the x’s and mistake this for a nonlinear model.
quadFit <- dlply(qDat, ~gene, function(x) lm(gExp ~ age + I(age^2), x, subset = gType ==
"wt"))
summary(quadFit[["1441811_x_at"]])
##
## Call:
## lm(formula = gExp ~ age + I(age^2), data = x, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7695 -0.2548 -0.0059 0.1366 0.8220
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.19077 0.14097 58.10 <2e-16 ***
## age -0.12384 0.03195 -3.88 0.0012 **
## I(age^2) 0.00201 0.00110 1.82 0.0866 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.428 on 17 degrees of freedom
## Multiple R-squared: 0.774, Adjusted R-squared: 0.747
## F-statistic: 29.1 on 2 and 17 DF, p-value: 3.23e-06
summary(quadFit[["1427275_at"]])
##
## Call:
## lm(formula = gExp ~ age + I(age^2), data = x, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.163 -0.555 0.095 0.408 0.958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.03640 0.21231 42.56 < 2e-16 ***
## age -0.25430 0.04812 -5.28 6.1e-05 ***
## I(age^2) 0.00849 0.00166 5.11 8.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.644 on 17 degrees of freedom
## Multiple R-squared: 0.622, Adjusted R-squared: 0.577
## F-statistic: 14 on 2 and 17 DF, p-value: 0.000257
summary(quadFit[["1456341_a_at"]])
##
## Call:
## lm(formula = gExp ~ age + I(age^2), data = x, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.621 -0.501 -0.005 0.396 1.865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.11248 0.31092 22.88 3.3e-14 ***
## age 0.25889 0.07048 3.67 0.0019 **
## I(age^2) -0.00530 0.00243 -2.18 0.0436 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.944 on 17 degrees of freedom
## Multiple R-squared: 0.674, Adjusted R-squared: 0.635
## F-statistic: 17.6 on 2 and 17 DF, p-value: 7.34e-05
anova() - for nested models AIC() - for models that are not nested. It is a measure of information lost going from one model to the next (the lower the number the better the model)
anova(linFit[["1441811_x_at"]], quadFit[["1441811_x_at"]])
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 3.72
## 2 17 3.11 1 0.606 3.31 0.087 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(linFit[["1427275_at"]], quadFit[["1427275_at"]])
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 17.90
## 2 17 7.06 1 10.8 26.1 8.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(linFit[["1456341_a_at"]], quadFit[["1456341_a_at"]])
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 19.4
## 2 17 15.1 1 4.23 4.75 0.044 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(linFit, quadFit)
## Error: no applicable method for 'anova' applied to an object of class
## "list"
choseModel <- function(x) {
linFit <- lm(gExp ~ age, x, subset = gType == "wt")
quadFit <- lm(gExp ~ age + I(age^2), x, subset = gType == "wt")
anova(linFit, quadFit)
}
compFits <- dlply(qDat, ~gene, choseModel)
compFits
## $`1427275_at`
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 17.90
## 2 17 7.06 1 10.8 26.1 8.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`1441811_x_at`
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 3.72
## 2 17 3.11 1 0.606 3.31 0.087 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## $`1456341_a_at`
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 19.4
## 2 17 15.1 1 4.23 4.75 0.044 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## gene
## 1 1427275_at
## 2 1441811_x_at
## 3 1456341_a_at
compFits["1427275_at"]
## $`1427275_at`
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 18 17.90
## 2 17 7.06 1 10.8 26.1 8.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a justyfication for going from linear to quadratic for probe “1427275_at”.
is the categorical covariate (devStage) better than the quantitative covariate (age)? factorial model
factFit <- dlply(qDat, ~gene, function(x) lm(gExp ~ devStage, x, subset = gType ==
"wt"))
factFit[["1427275_at"]]
##
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
##
## Coefficients:
## (Intercept) devStageP2 devStageP6 devStageP10
## 9.64 -1.22 -1.72 -2.31
## devStage4_weeks
## -1.07
summary(factFit[["1427275_at"]])$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.636 0.3398 28.363 1.903e-14
## devStageP2 -1.216 0.4805 -2.530 2.308e-02
## devStageP6 -1.721 0.4805 -3.581 2.731e-03
## devStageP10 -2.307 0.4805 -4.801 2.332e-04
## devStage4_weeks -1.067 0.4805 -2.220 4.224e-02
summary(factFit[["1427275_at"]])
##
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0215 -0.5538 0.0377 0.3896 1.0605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.64 0.34 28.36 1.9e-14 ***
## devStageP2 -1.22 0.48 -2.53 0.02308 *
## devStageP6 -1.72 0.48 -3.58 0.00273 **
## devStageP10 -2.31 0.48 -4.80 0.00023 ***
## devStage4_weeks -1.07 0.48 -2.22 0.04224 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.68 on 15 degrees of freedom
## Multiple R-squared: 0.629, Adjusted R-squared: 0.53
## F-statistic: 6.36 on 4 and 15 DF, p-value: 0.00337
AIC(linFit[["1427275_at"]], quadFit[["1427275_at"]], factFit[["1427275_at"]])
## df AIC
## linFit[["1427275_at"]] 3 60.54
## quadFit[["1427275_at"]] 4 43.93
## factFit[["1427275_at"]] 6 47.55
choseModelAIC <- function(x) {
linFit <- lm(gExp ~ age, x, subset = gType == "wt")
quadFit <- lm(gExp ~ age + I(age^2), x, subset = gType == "wt")
factFit <- lm(gExp ~ devStage, x, subset = gType == "wt")
AIC(linFit, quadFit, factFit)
}
comp3Models <- dlply(qDat, ~gene, choseModelAIC)
comp3Models
## $`1427275_at`
## df AIC
## linFit 3 60.54
## quadFit 4 43.93
## factFit 6 47.55
##
## $`1441811_x_at`
## df AIC
## linFit 3 29.10
## quadFit 4 27.55
## factFit 6 27.13
##
## $`1456341_a_at`
## df AIC
## linFit 3 62.12
## quadFit 4 59.19
## factFit 6 48.70
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## gene
## 1 1427275_at
## 2 1441811_x_at
## 3 1456341_a_at
For “1441811_x_at” there is not much benefit in going from linear model to either quadratic or factorial. Occam’s Razor and the KISS principle → stick w/ simple linear model.
For “1456341_a_at” it is maybe worth it to go from linear to quadratic. It is justified to go from quadratic to factorial (one way ANOVA).
jFit <- lm(gExp ~ gType * age, qDat, subset = gene == "1441811_x_at")
summary(jFit)
##
## Call:
## lm(formula = gExp ~ gType * age, data = qDat, subset = gene ==
## "1441811_x_at")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0538 -0.4119 -0.0249 0.3130 1.1442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.0734 0.1655 48.78 < 2e-16 ***
## gTypeNrlKO 0.1315 0.2407 0.55 0.59
## age -0.0682 0.0121 -5.61 2.5e-06 ***
## gTypeNrlKO:age 0.0102 0.0174 0.58 0.56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.565 on 35 degrees of freedom
## Multiple R-squared: 0.607, Adjusted R-squared: 0.573
## F-statistic: 18 on 3 and 35 DF, p-value: 3.05e-07
jFitAlt <- lm(gExp ~ gType/age - 1, qDat, subset = gene == "1441811_x_at")
summary(jFitAlt)
##
## Call:
## lm(formula = gExp ~ gType/age - 1, data = qDat, subset = gene ==
## "1441811_x_at")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0538 -0.4119 -0.0249 0.3130 1.1442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## gTypewt 8.0734 0.1655 48.78 < 2e-16 ***
## gTypeNrlKO 8.2048 0.1748 46.95 < 2e-16 ***
## gTypewt:age -0.0682 0.0121 -5.61 2.5e-06 ***
## gTypeNrlKO:age -0.0580 0.0125 -4.64 4.8e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.565 on 35 degrees of freedom
## Multiple R-squared: 0.995, Adjusted R-squared: 0.994
## F-statistic: 1.76e+03 on 4 and 35 DF, p-value: <2e-16
(contMat <- rbind(c(1, 0, 0, 0), c(1, 1, 0, 0), c(0, 0, 1, 0), c(0, 0, 1, 1)))
## [,1] [,2] [,3] [,4]
## [1,] 1 0 0 0
## [2,] 1 1 0 0
## [3,] 0 0 1 0
## [4,] 0 0 1 1
cbind(coefDefault = coef(jFit), coefAlt = coef(jFitAlt), matrixResult = as.vector(contMat %*%
coef(jFit)))
## coefDefault coefAlt matrixResult
## (Intercept) 8.07337 8.07337 8.07337
## gTypeNrlKO 0.13148 8.20485 8.20485
## age -0.06818 -0.06818 -0.06818
## gTypeNrlKO:age 0.01019 -0.05799 -0.05799
anova(lm(gExp ~ age, qDat, subset = gene == "1441811_x_at"), jFit)
## Analysis of Variance Table
##
## Model 1: gExp ~ age
## Model 2: gExp ~ gType * age
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 37 11.8
## 2 35 11.2 2 0.598 0.94 0.4
it is not clear that genotype effectst either the intercept or the slope
quadFit <- lm(gExp ~ gType * (age + I(age^2)), qDat, subset = gene == "1427275_at")
summary(quadFit)
##
## Call:
## lm(formula = gExp ~ gType * (age + I(age^2)), data = qDat, subset = gene ==
## "1427275_at")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.163 -0.558 0.082 0.420 1.968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.03640 0.23485 38.48 < 2e-16 ***
## gTypeNrlKO -0.78497 0.35025 -2.24 0.032 *
## age -0.25430 0.05323 -4.78 3.6e-05 ***
## I(age^2) 0.00849 0.00184 4.62 5.6e-05 ***
## gTypeNrlKO:age 0.14820 0.07823 1.89 0.067 .
## gTypeNrlKO:I(age^2) -0.00500 0.00267 -1.87 0.070 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.713 on 33 degrees of freedom
## Multiple R-squared: 0.476, Adjusted R-squared: 0.396
## F-statistic: 5.98 on 5 and 33 DF, p-value: 0.00048
# assess relevance of genotype with F test
anova(lm(gExp ~ age + I(age^2), qDat, subset = gene == "1427275_at"), lm(gExp ~
gType * (age + I(age^2)), qDat, subset = gene == "1427275_at"))
## Analysis of Variance Table
##
## Model 1: gExp ~ age + I(age^2)
## Model 2: gExp ~ gType * (age + I(age^2))
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 36 20.1
## 2 33 16.8 3 3.31 2.17 0.11
borderline evidence that genotype has some effect on the fit.