Seminar05- fitting linear models

Load the photoRec data and lattice

prDat <- read.table("../photoRecDataset/dataClean/GSE4051_data.tsv")
str(prDat, max.level = 0)
## 'data.frame':    29949 obs. of  39 variables:
prDes <- readRDS("../photoRecDataset/dataClean/GSE4051_design.rds")
str(prDes)
## 'data.frame':    39 obs. of  4 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
library(lattice)

Write a function to prepare a mini data set of selected genes

Helpful info and tips:

- One row of a data.frame is still a data.frame (!), which is a list, remember? unlist() and as.vector() can be handy when converting a list or array to an atomic vector.
- Recycling can help you.

pre-define the selected genes (not necessary for small set of genes)


(luckyGenes <- c("1419655_at", "1438815_at"))
## [1] "1419655_at" "1438815_at"

subset the expression data for the luckyGenes from prDat into a new data.frame

gDat <- subset(prDat, rownames(prDat) %in% luckyGenes)
str(prDat)
## 'data.frame':    29949 obs. of  39 variables:
##  $ Sample_11: num  7.42 9.83 10 8.6 8.43 ...
##  $ Sample_12: num  7.11 9.71 9.43 8.43 8.5 ...
##  $ Sample_2 : num  7.35 9.66 9.91 8.4 8.37 ...
##  $ Sample_9 : num  7.32 9.8 9.85 8.4 8.46 ...
##  $ Sample_36: num  7.25 9.66 9.51 8.49 8.42 ...
##  $ Sample_37: num  7.04 8.38 9.21 8.75 8.26 ...
##  $ Sample_38: num  7.37 9.44 9.48 8.49 8.34 ...
##  $ Sample_39: num  7.13 8.73 9.53 8.65 8.28 ...
##  $ Sample_16: num  7.38 7.64 8.42 8.36 8.51 ...
##  $ Sample_17: num  7.34 10.03 10.24 8.37 8.89 ...
##  $ Sample_6 : num  7.24 9.71 10.17 8.84 8.54 ...
##  $ Sample_20: num  7.24 9.48 10.01 8.36 8.59 ...
##  $ Sample_21: num  7.41 10.02 10.04 8.37 8.62 ...
##  $ Sample_22: num  7.17 9.85 9.91 8.4 8.52 ...
##  $ Sample_23: num  7.07 10.13 9.91 8.49 8.64 ...
##  $ Sample_13: num  7.33 9.33 9.75 8.43 8.48 ...
##  $ Sample_15: num  7.12 9.15 9.84 8.32 8.21 ...
##  $ Sample_18: num  7.21 9.49 10.03 8.55 8.5 ...
##  $ Sample_19: num  7.21 9.21 9.59 8.31 8.31 ...
##  $ Sample_32: num  7.54 9.53 9.92 8.78 8.57 ...
##  $ Sample_33: num  7.01 8.97 9.22 8.42 8.53 ...
##  $ Sample_34: num  6.81 8.83 9.39 8.1 8.32 ...
##  $ Sample_35: num  7.15 9.22 10.06 8.35 8.45 ...
##  $ Sample_14: num  7.09 9.56 9.88 8.57 8.59 ...
##  $ Sample_3 : num  7.16 9.55 9.84 8.33 8.5 ...
##  $ Sample_5 : num  7.08 9.32 9.24 8.3 8.48 ...
##  $ Sample_8 : num  7.11 8.24 9.13 8.13 8.33 ...
##  $ Sample_24: num  7.11 9.75 9.39 8.37 8.36 ...
##  $ Sample_25: num  7.19 9.16 10.11 8.2 8.5 ...
##  $ Sample_26: num  7.18 9.49 9.41 8.73 8.39 ...
##  $ Sample_27: num  7.21 8.64 9.43 8.33 8.43 ...
##  $ Sample_1 : num  7.15 9.87 9.68 8.28 8.5 ...
##  $ Sample_10: num  7.28 10.29 9.91 8.42 8.68 ...
##  $ Sample_4 : num  7.18 10.16 9.72 8.32 8.5 ...
##  $ Sample_7 : num  7.15 8.95 9.3 8.17 8.41 ...
##  $ Sample_28: num  7.34 8.27 9.47 8.38 8.4 ...
##  $ Sample_29: num  7.66 10.03 9.88 8.56 8.69 ...
##  $ Sample_30: num  7.26 9.27 10.54 8.15 8.55 ...
##  $ Sample_31: num  7.31 9.26 10.1 8.37 8.49 ...

transpose the gDat so that genes=variables (=columns). Also reshape the data so that gene expression is in one column and the genes are changed into a factor variable with n levels (where n is the number of genes)

gDat <- data.frame(gExp = as.vector(t(as.matrix(gDat))), gene = factor(rep(rownames(gDat), 
    each = ncol(gDat)), levels = luckyGenes))
str(gDat)
## 'data.frame':    78 obs. of  2 variables:
##  $ gExp: num  10.47 9.9 10.41 10.85 8.95 ...
##  $ gene: Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...

merge gene expression data with info on samples from prDes

gDat <- data.frame(prDes, gDat)
str(gDat)
## 'data.frame':    78 obs. of  6 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
##  $ gExp    : num  10.47 9.9 10.41 10.85 8.95 ...
##  $ gene    : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
head(gDat)
##     sidChar sidNum devStage gType   gExp       gene
## 1 Sample_11     11  4_weeks NrlKO 10.470 1419655_at
## 2 Sample_12     12  4_weeks NrlKO  9.897 1419655_at
## 3  Sample_2      2  4_weeks NrlKO 10.410 1419655_at
## 4  Sample_9      9  4_weeks NrlKO 10.850 1419655_at
## 5 Sample_36     36  4_weeks    wt  8.953 1419655_at
## 6 Sample_37     37  4_weeks    wt  7.793 1419655_at

put the above operations into a function named prepDat

prepDat <- function(x) {
    gDat <- subset(prDat, rownames(prDat) %in% x)
    gDat <- data.frame(gExp = as.vector(t(as.matrix(gDat))), gene = factor(rep(rownames(gDat), 
        each = ncol(gDat)), levels = x))
    gDat <- data.frame(prDes, gDat)
}

test the function

gDat <- prepDat(luckyGenes)
str(gDat)
## 'data.frame':    78 obs. of  6 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
##  $ gExp    : num  10.47 9.9 10.41 10.85 8.95 ...
##  $ gene    : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
head(gDat)
##     sidChar sidNum devStage gType   gExp       gene
## 1 Sample_11     11  4_weeks NrlKO 10.470 1419655_at
## 2 Sample_12     12  4_weeks NrlKO  9.897 1419655_at
## 3  Sample_2      2  4_weeks NrlKO 10.410 1419655_at
## 4  Sample_9      9  4_weeks NrlKO 10.850 1419655_at
## 5 Sample_36     36  4_weeks    wt  8.953 1419655_at
## 6 Sample_37     37  4_weeks    wt  7.793 1419655_at
tail(gDat)
##      sidChar sidNum devStage gType   gExp       gene
## 73  Sample_4      4       P6 NrlKO 10.440 1438815_at
## 74  Sample_7      7       P6 NrlKO 10.100 1438815_at
## 75 Sample_28     28       P6    wt  9.910 1438815_at
## 76 Sample_29     29       P6    wt 10.140 1438815_at
## 77 Sample_30     30       P6    wt 10.090 1438815_at
## 78 Sample_31     31       P6    wt  9.827 1438815_at

plot to check the data

library(lattice)
stripplot(gExp ~ devStage | gene, gDat, group = gType, jitter.data = T, auto.key = T, 
    type = c("p", "a"), grid = T)

plot of chunk unnamed-chunk-8

check for other genes

test <- prepDat("1415672_at")
str(test)
## 'data.frame':    39 obs. of  6 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
##  $ gExp    : num  10 9.43 9.91 9.85 9.51 ...
##  $ gene    : Factor w/ 1 level "1415672_at": 1 1 1 1 1 1 1 1 1 1 ...
test <- prepDat(c("1415672_at", "1415675_at", "1415674_a_at"))
str(test)
## 'data.frame':    117 obs. of  6 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
##  $ gExp    : num  10 9.43 9.91 9.85 9.51 ...
##  $ gene    : Factor w/ 3 levels "1415672_at","1415675_at",..: 1 1 1 1 1 1 1 1 1 1 ...

Write a function that makes a stripplot of the subset data

Want to make a function that will repeat the same plot for different genes

Helpful info and tips:

- When a plot does not appear and you think it should, try surrounding it with print(). This may be necessary inside functions, loops, etc.
- If you can't or don't want to anticipate all the arguments you'd like to pass to, e.g. stripplot(), use the special function argument ... to leave yourself flexibility.
makeStripplot <- function(x, ...) {
    library(lattice)
    stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = T, auto.key = T, 
        grid = T, type = c("a", "p"), ...)
}

test it

makeStripplot(gDat)

plot of chunk unnamed-chunk-11


makeStripplot(gDat, xlab = "Developmental Stage")

plot of chunk unnamed-chunk-11

makeStripplot(gDat, xlab = "Developmental Stage", pch = 17)

plot of chunk unnamed-chunk-11

makeStripplot(gDat, xlab = "Developmental Stage", pch = 17, cex = 3)

plot of chunk unnamed-chunk-11

You can use both functions together to create a minidatset and plot it all at once:

makeStripplot(newDat <- prepDat("1460739_at"))

plot of chunk unnamed-chunk-12

makeStripplot(newDat <- prepDat(c("1460739_at", "1460741_x_at")))

plot of chunk unnamed-chunk-12

Do a two-sample test

Let's test for a difference in expected gene expression for probeset “1456341_a_at” at developmental stage P2 vs. 4 weeks post-natal (ignoring genotype, i.e. lump the wild types and knockouts together). Let's assume a common variance in the two groups.

makeStripplot(prepDat("1456341_a_at"))

plot of chunk unnamed-chunk-13

gDat <- prepDat("1456341_a_at")
str(gDat)
## 'data.frame':    39 obs. of  6 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
##  $ gExp    : num  10.26 9.7 10.08 9.28 10.39 ...
##  $ gene    : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...
head(gDat)
##     sidChar sidNum devStage gType   gExp         gene
## 1 Sample_11     11  4_weeks NrlKO 10.260 1456341_a_at
## 2 Sample_12     12  4_weeks NrlKO  9.704 1456341_a_at
## 3  Sample_2      2  4_weeks NrlKO 10.080 1456341_a_at
## 4  Sample_9      9  4_weeks NrlKO  9.281 1456341_a_at
## 5 Sample_36     36  4_weeks    wt 10.390 1456341_a_at
## 6 Sample_37     37  4_weeks    wt  9.798 1456341_a_at

t.test(gExp ~ devStage, subset(gDat, devStage == "P2" | devStage == "4_weeks"), 
    equal.var = T)
## 
##  Welch Two Sample t-test
## 
## data:  gExp by devStage
## t = -18.84, df = 13.98, p-value = 2.477e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.078 -3.244
## sample estimates:
##      mean in group P2 mean in group 4_weeks 
##                 6.326                 9.987

Fit a linear model with one categorical covariate

In other words, do “one-way ANOVA”. Focus on probeset “1438786_a_at”.

gDat <- prepDat("1438786_a_at")
str(gDat)
## 'data.frame':    39 obs. of  6 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
##  $ gExp    : num  8.63 7.47 8.16 8.29 9.18 ...
##  $ gene    : Factor w/ 1 level "1438786_a_at": 1 1 1 1 1 1 1 1 1 1 ...
library(lattice)
makeStripplot(gDat)

plot of chunk unnamed-chunk-14

gFit <- lm(gExp ~ devStage, gDat, gType == "wt")
summary(gFit)
## 
## Call:
## lm(formula = gExp ~ devStage, data = gDat, subset = gType == 
##     "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1565 -0.4400  0.0288  0.4915  1.2065 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.523      0.379   22.50  5.7e-13 ***
## devStageP2        -1.450      0.536   -2.71    0.016 *  
## devStageP6        -0.107      0.536   -0.20    0.845    
## devStageP10       -1.201      0.536   -2.24    0.040 *  
## devStage4_weeks    0.081      0.536    0.15    0.882    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.758 on 15 degrees of freedom
## Multiple R-squared:  0.497,  Adjusted R-squared:  0.363 
## F-statistic: 3.71 on 4 and 15 DF,  p-value: 0.0272

does the intercept look plausible given the plot? - yes (relative to reference - E16) How about the devStageP2 effect, etc.? - there is some effect at dev. stages P2 and P10 (relative to E16)

perform inference for a contrast

The “W” shape of the expression profile for “1438786_a_at” means that the expression values for developmental stages P2 and P10 are quite similar. We could formally test whether the P2 and P10 effects are equal or, equivalently, whether their difference is equal to zero.

First extract the parameter estimates from the linear model you fit above. You did save the fit, right? If not, edit your code to do so and re-run that bit. Hint: the coef() function will pull parameter estimates out of a wide array of fitted model objects in R.

coef(gFit)
##     (Intercept)      devStageP2      devStageP6     devStageP10 
##          8.5227         -1.4503         -0.1068         -1.2013 
## devStage4_weeks 
##          0.0810

Now you need to construct the contrast matrix to form the difference between the P2 and P10 effects. I called mine contMat. Hint: it's OK for a matrix to have just one row.

contMat <- matrix(c(0, 1, 0, -1, 0), nrow = 1)
(obsDif <- contMat %*% coef(gFit))
##        [,1]
## [1,] -0.249

Let's check that this really is the observed difference in sample mean for the wild type mice, P2 vs. P10.

(sampMeans <- aggregate(gExp ~ devStage, gDat, FUN = mean, subset = gType == 
    "wt"))
##   devStage  gExp
## 1      E16 8.523
## 2       P2 7.072
## 3       P6 8.416
## 4      P10 7.322
## 5  4_weeks 8.604
with(sampMeans, gExp[devStage == "P2"] - gExp[devStage == "P10"])
## [1] -0.249

Now we need the (estimated) standard error for our contrast. The variance-covariance matrix of the parameters estimated in the original model can be obtained with vcov() and is equal to (XT X)−1 σ2.

vcov(gFit)
##                 (Intercept) devStageP2 devStageP6 devStageP10
## (Intercept)          0.1435    -0.1435    -0.1435     -0.1435
## devStageP2          -0.1435     0.2870     0.1435      0.1435
## devStageP6          -0.1435     0.1435     0.2870      0.1435
## devStageP10         -0.1435     0.1435     0.1435      0.2870
## devStage4_weeks     -0.1435     0.1435     0.1435      0.1435
##                 devStage4_weeks
## (Intercept)             -0.1435
## devStageP2               0.1435
## devStageP6               0.1435
## devStageP10              0.1435
## devStage4_weeks          0.2870

Let's check that this is really true. If we take the diagonal elements and take their square root, they should be exactly equal to the standard errors reported for out original model. Are they?

summary(gFit)$coefficients[, "Std. Error"]
##     (Intercept)      devStageP2      devStageP6     devStageP10 
##          0.3788          0.5357          0.5357          0.5357 
## devStage4_weeks 
##          0.5357
sqrt(diag(vcov(gFit)))
##     (Intercept)      devStageP2      devStageP6     devStageP10 
##          0.3788          0.5357          0.5357          0.5357 
## devStage4_weeks 
##          0.5357

Note for the future that you can get the typical matrix of inferential results from most fitted model objects for further computing like so:

summary(gFit)$coefficients
##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)       8.5227     0.3788 22.4979 5.697e-13
## devStageP2       -1.4503     0.5357 -2.7070 1.623e-02
## devStageP6       -0.1068     0.5357 -0.1993 8.447e-01
## devStageP10      -1.2013     0.5357 -2.2422 4.049e-02
## devStage4_weeks   0.0810     0.5357  0.1512 8.818e-01

Returning to our test of the P2 vs. P10 contrast, recall that the variance-covariance matrix of a contrast obtained as CαHat is C(XT X)−1 CT σ2.

(estSE <- contMat %*% vcov(gFit) %*% t(contMat))
##       [,1]
## [1,] 0.287

Now we form a test statistic as an observed effect divided by its estimated standard error:

(testStat <- obsDif/estSE)
##         [,1]
## [1,] -0.8676

Under the null hypothesis that the contrast equals zero, i.e. there is no true difference in mean for expression at P2 and P10 in wild type mice for this gene, the test statistic has a t distribution with n−p=20−5=15 degrees of freedom. We compute a two-sided p-value and we're done.

2 * pt(abs(testStat), df = df.residual(gFit), lower.tail = FALSE)
##        [,1]
## [1,] 0.3993

Not surprisingly, this p-value is rather large and we conclude there is no difference.

Fit a linear model with two categorical covariates

Let's focus on probeset “1448690_at”. Use your functions to prepare the data and plot it.

oDat <- prepDat("1448690_at")
makeStripplot(oDat)

plot of chunk unnamed-chunk-24

Fit a linear model with covariates gType and devStage and include their interactions.

oFitBig <- lm(gExp ~ gType + devStage + gType * devStage, oDat)
summary(oFitBig)$coef
##                            Estimate Std. Error  t value  Pr(>|t|)
## (Intercept)                 8.67800     0.3987 21.76755 1.634e-19
## gTypeNrlKO                 -0.84233     0.6090 -1.38320 1.772e-01
## devStageP2                 -1.02900     0.5638 -1.82512 7.830e-02
## devStageP6                 -1.91450     0.5638 -3.39571 2.003e-03
## devStageP10                -2.19325     0.5638 -3.89012 5.387e-04
## devStage4_weeks            -2.08200     0.5638 -3.69280 9.149e-04
## gTypeNrlKO:devStageP2       0.06983     0.8299  0.08415 9.335e-01
## gTypeNrlKO:devStageP6       0.16533     0.8299  0.19922 8.435e-01
## gTypeNrlKO:devStageP10      0.22583     0.8299  0.27212 7.875e-01
## gTypeNrlKO:devStage4_weeks  0.64608     0.8299  0.77852 4.426e-01

Is the intercept plausible? - Yes How about the various effects? Do the ones with small p-values, e.g. meeting a conventional cut-off of 0.05, look 'real' to you? - Yes, looking at the plot there is no gType effect (paralel lines). There is a visible difference in gene expression between stages P6, P10, and 4weeks as compared to E16. Although there is also a difference between P2 and E16 it is not statistically significant.

Fit a related, smaller model with the same covariates, but this time omit the interaction.

oFitSmall <- lm(gExp ~ devStage + gType, oDat)
summary(oFitSmall)$coef
##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)       8.5803     0.3046  28.165 1.177e-24
## devStageP2       -1.0104     0.3924  -2.575 1.470e-02
## devStageP6       -1.8481     0.3924  -4.710 4.328e-05
## devStageP10      -2.0966     0.3924  -5.343 6.703e-06
## devStage4_weeks  -1.7752     0.3924  -4.524 7.444e-05
## gTypeNrlKO       -0.6144     0.2430  -2.528 1.643e-02

Now let's determine if the interaction terms are truly necessary.

From the plot, the case for interaction seems very weak. This can be assessed with an F test that essentially looks at the reduction in the sum of squared residuals due to using a larger, more complicated model and determines if it is “big enough” given the number of additional parameters used. Recall the anova() function can take two fitted models, one nested within the other, and perform this test. (anova() can also be used on a single model to assess significance of terms, but remember the problem with the standard anova() function and unbalanced data. See references given in lecture for remedies.)

anova(oFitSmall, oFitBig)
## Analysis of Variance Table
## 
## Model 1: gExp ~ devStage + gType
## Model 2: gExp ~ gType + devStage + gType * devStage
##   Res.Df  RSS Df Sum of Sq   F Pr(>F)
## 1     33 18.9                        
## 2     29 18.4  4     0.497 0.2   0.94

With a p-value awfully close to one, we confirm that, no, there is no evidence for interaction in this particular case.

Let's look at at probeset “1429225_at” for more exciting result

eDat <- prepDat("1429225_at")
makeStripplot(eDat)

plot of chunk unnamed-chunk-28

From the plot it looks like there may be interaction between gType and devStage.

Fit a linear model with covariates gType and devStage and include their interactions.

eFitBig <- lm(gExp ~ gType * devStage, eDat)
summary(eFitBig)$coef
##                            Estimate Std. Error t value  Pr(>|t|)
## (Intercept)                  7.3125     0.2617 27.9391 1.619e-22
## gTypeNrlKO                  -0.2602     0.3998 -0.6507 5.203e-01
## devStageP2                  -1.1582     0.3701 -3.1292 3.973e-03
## devStageP6                  -1.2495     0.3701 -3.3757 2.110e-03
## devStageP10                 -1.0717     0.3701 -2.8955 7.125e-03
## devStage4_weeks             -0.9088     0.3701 -2.4551 2.032e-02
## gTypeNrlKO:devStageP2        0.2804     0.5448  0.5147 6.107e-01
## gTypeNrlKO:devStageP6        0.7589     0.5448  1.3929 1.742e-01
## gTypeNrlKO:devStageP10       1.7914     0.5448  3.2880 2.648e-03
## gTypeNrlKO:devStage4_weeks   2.2389     0.5448  4.1094 2.970e-04

It looks like there is an interaction between gType and devStage and P6, P10, and 4weeks

Now, fit a smaller model omitting the interaction term

eFitSmall <- lm(gExp ~ gType + devStage, eDat)
summary(eFitSmall)$coef
##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)       6.8652     0.2722 25.2199 3.848e-23
## gTypeNrlKO        0.7836     0.2172  3.6085 1.007e-03
## devStageP2       -1.0926     0.3506 -3.1161 3.780e-03
## devStageP6       -0.9446     0.3506 -2.6940 1.101e-02
## devStageP10      -0.2506     0.3506 -0.7147 4.798e-01
## devStage4_weeks   0.1362     0.3506  0.3883 7.003e-01

Now let's determine if the interaction terms are truly necessary.

anova(eFitSmall, eFitBig)
## Analysis of Variance Table
## 
## Model 1: gExp ~ gType + devStage
## Model 2: gExp ~ gType * devStage
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1     33 15.12                             
## 2     29  7.95  4      7.17 6.54  7e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The small p-value suggests that the Big model better fits the data. The interaction term is statistically significant.

Use aggregation techniques to fit linear models to more than one gene

We wrote functions to prepare and plot data for more than 1 gene. But when we started fitting models and conducting tests, we only worked with 1 gene at a time. Can you use data aggregation strategies from last week to do some of the same work for small sets of genes?

Use the prepDat function made above to generate dataset for 4 genes. Then plot the gene expression data vs. devStage using teh makeStripplot function from above

(luckyGenes <- c("1419655_at", "1438815_at", "1456341_a_at", "1448690_at"))
## [1] "1419655_at"   "1438815_at"   "1456341_a_at" "1448690_at"
nDat <- prepDat(luckyGenes)
makeStripplot(nDat)

plot of chunk unnamed-chunk-32

apply a t.test to all 4 genes using the plyr package

library(plyr)
tRes <- dlply(nDat, ~gene, function(x) t.test(gExp ~ devStage, subset(nDat, 
    devStage == "P2" | devStage == "4_weeks")))

# get the p values for the t-tests

t.pVals <- ddply(nDat, ~gene, function(x) {
    xt <- t.test(gExp ~ devStage, subset(x, devStage == "P2" | devStage == "4_weeks"))
    round(c(t.pVal = xt$p.value), 4)
})
t.pVals
##           gene t.pVal
## 1   1419655_at 0.1561
## 2   1438815_at 0.0000
## 3 1456341_a_at 0.0000
## 4   1448690_at 0.1671

Fit a linear model with categorical covariate to all 4 genes

nFit <- dlply(nDat, ~gene, function(x) lm(gExp ~ devStage, x, gType == "wt"))

str(nFit, max.level = 0)
## List of 4
##  - attr(*, "split_type")= chr "data.frame"
##  - attr(*, "split_labels")='data.frame': 4 obs. of  1 variable:
names(nFit)
## [1] "1419655_at"   "1438815_at"   "1456341_a_at" "1448690_at"
nFit$"1419655_at"
## 
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
## 
## Coefficients:
##     (Intercept)       devStageP2       devStageP6      devStageP10  
##           10.75            -1.05            -1.60            -2.02  
## devStage4_weeks  
##           -2.39
summary(nFit$"1438815_at")
## 
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4445 -0.1247 -0.0514  0.1509  0.5052 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.921      0.121   73.93  < 2e-16 ***
## devStageP2         0.744      0.171    4.36  0.00056 ***
## devStageP6         1.071      0.171    6.27  1.5e-05 ***
## devStageP10        0.737      0.171    4.32  0.00061 ***
## devStage4_weeks   -0.681      0.171   -3.99  0.00118 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.241 on 15 degrees of freedom
## Multiple R-squared:  0.902,  Adjusted R-squared:  0.876 
## F-statistic: 34.4 on 4 and 15 DF,  p-value: 2.14e-07
coef(nFit$"1456341_a_at")
##     (Intercept)      devStageP2      devStageP6     devStageP10 
##          7.2100         -0.7468          1.3692          2.4277 
## devStage4_weeks 
##          2.9320
summary(nFit$"1448690_at")$coefficients
##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)        8.678     0.3856  22.507 5.666e-13
## devStageP2        -1.029     0.5453  -1.887 7.866e-02
## devStageP6        -1.915     0.5453  -3.511 3.152e-03
## devStageP10       -2.193     0.5453  -4.022 1.108e-03
## devStage4_weeks   -2.082     0.5453  -3.818 1.680e-03

To get the coefficients for all 4 genes

lm.coefs <- ddply(nDat, ~gene, function(x) {
    xlm <- lm(gExp ~ devStage, x, gType == "wt")
    round(c(coef = xlm$coefficients), 4)
})
lm.coefs
##           gene coef.(Intercept) coef.devStageP2 coef.devStageP6
## 1   1419655_at           10.755         -1.0540          -1.598
## 2   1438815_at            8.921          0.7435           1.071
## 3 1456341_a_at            7.210         -0.7468           1.369
## 4   1448690_at            8.678         -1.0290          -1.915
##   coef.devStageP10 coef.devStage4_weeks
## 1           -2.016              -2.3895
## 2            0.737              -0.6807
## 3            2.428               2.9320
## 4           -2.193              -2.0820

To get standard errors

lm.se <- ddply(nDat, ~gene, function(x) {
    xlm <- lm(gExp ~ devStage, x, gType == "wt")
    round(c(se = summary(xlm)$coefficients[, "Std. Error"]), 4)
})

To get t-values

lm.tVal <- ddply(nDat, ~gene, function(x) {
    xlm <- lm(gExp ~ devStage, x, gType == "wt")
    round(c(tVal = summary(xlm)$coefficients[, "t value"]), 4)
})
lm.tVal
##           gene tVal.(Intercept) tVal.devStageP2 tVal.devStageP6
## 1   1419655_at            40.52          -2.808          -4.257
## 2   1438815_at            73.93           4.357           6.273
## 3 1456341_a_at            20.62          -1.510           2.769
## 4   1448690_at            22.51          -1.887          -3.511
##   tVal.devStageP10 tVal.devStage4_weeks
## 1           -5.371               -6.366
## 2            4.319               -3.989
## 3            4.909                5.929
## 4           -4.022               -3.818

To get p-values

lm.pVals <- ddply(nDat, ~gene, function(x) {
    xlm <- lm(gExp ~ devStage, x, gType == "wt")
    round(c(pVal = summary(xlm)$coefficients[, "Pr(>|t|)"]), 6)
})
lm.pVals
##           gene pVal.(Intercept) pVal.devStageP2 pVal.devStageP6
## 1   1419655_at                0        0.013247        0.000689
## 2   1438815_at                0        0.000563        0.000015
## 3 1456341_a_at                0        0.151825        0.014336
## 4   1448690_at                0        0.078657        0.003152
##   pVal.devStageP10 pVal.devStage4_weeks
## 1         0.000078             0.000013
## 2         0.000608             0.001185
## 3         0.000189             0.000028
## 4         0.001108             0.001680

To get p-values w/o round up

lm.pVals <- ddply(nDat, ~gene, function(x) {
    xlm <- lm(gExp ~ devStage, x, gType == "wt")
    c(pVal = summary(xlm)$coefficients[, "Pr(>|t|)"])
})
lm.pVals
##           gene pVal.(Intercept) pVal.devStageP2 pVal.devStageP6
## 1   1419655_at        9.649e-17       0.0132472       6.888e-04
## 2   1438815_at        1.220e-20       0.0005635       1.493e-05
## 3 1456341_a_at        2.029e-12       0.1518253       1.434e-02
## 4   1448690_at        5.666e-13       0.0786567       3.152e-03
##   pVal.devStageP10 pVal.devStage4_weeks
## 1        7.788e-05            1.268e-05
## 2        6.082e-04            1.185e-03
## 3        1.890e-04            2.768e-05
## 4        1.108e-03            1.680e-03

fit a linear model with two covariates (gType and devStage) and their interaction (big) for the 4 genes and return the summary for the coefficients.

nFitBig <- ddply(nDat, ~gene, function(x) {
    xlm <- lm(gExp ~ devStage * gType, x)
    data.frame(terms = rep(names(xlm$coefficients), levels = ncol(xlm$coefficients)), 
        summary(xlm)$coef)
})
nFitBig
##            gene                      terms Estimate Std..Error  t.value
## 1    1419655_at                (Intercept) 10.75500     0.2311 46.54420
## 2    1419655_at                 devStageP2 -1.05400     0.3268 -3.22538
## 3    1419655_at                 devStageP6 -1.59800     0.3268 -4.89009
## 4    1419655_at                devStageP10 -2.01600     0.3268 -6.16923
## 5    1419655_at            devStage4_weeks -2.38950     0.3268 -7.31219
## 6    1419655_at                 gTypeNrlKO -0.29967     0.3530 -0.84900
## 7    1419655_at      devStageP2:gTypeNrlKO  1.09617     0.4810  2.27888
## 8    1419655_at      devStageP6:gTypeNrlKO  1.36117     0.4810  2.82980
## 9    1419655_at     devStageP10:gTypeNrlKO  2.27817     0.4810  4.73620
## 10   1419655_at devStage4_weeks:gTypeNrlKO  2.34092     0.4810  4.86665
## 11   1438815_at                (Intercept)  8.92125     0.1244 71.74211
## 12   1438815_at                 devStageP2  0.74350     0.1759  4.22780
## 13   1438815_at                 devStageP6  1.07050     0.1759  6.08724
## 14   1438815_at                devStageP10  0.73700     0.1759  4.19084
## 15   1438815_at            devStage4_weeks -0.68075     0.1759 -3.87098
## 16   1438815_at                 gTypeNrlKO  0.24475     0.1900  1.28850
## 17   1438815_at      devStageP2:gTypeNrlKO  0.01950     0.2589  0.07533
## 18   1438815_at      devStageP6:gTypeNrlKO  0.19350     0.2589  0.74751
## 19   1438815_at     devStageP10:gTypeNrlKO  0.19875     0.2589  0.76779
## 20   1438815_at devStage4_weeks:gTypeNrlKO  0.20575     0.2589  0.79484
## 21 1456341_a_at                (Intercept)  7.21000     0.3335 21.61953
## 22 1456341_a_at                 devStageP2 -0.74675     0.4716 -1.58333
## 23 1456341_a_at                 devStageP6  1.36925     0.4716  2.90321
## 24 1456341_a_at                devStageP10  2.42775     0.4716  5.14754
## 25 1456341_a_at            devStage4_weeks  2.93200     0.4716  6.21670
## 26 1456341_a_at                 gTypeNrlKO -0.16133     0.5094 -0.31670
## 27 1456341_a_at      devStageP2:gTypeNrlKO -0.11367     0.6942 -0.16373
## 28 1456341_a_at      devStageP6:gTypeNrlKO  0.58608     0.6942  0.84423
## 29 1456341_a_at     devStageP10:gTypeNrlKO  0.40983     0.6942  0.59035
## 30 1456341_a_at devStage4_weeks:gTypeNrlKO -0.14942     0.6942 -0.21523
## 31   1448690_at                (Intercept)  8.67800     0.3987 21.76755
## 32   1448690_at                 devStageP2 -1.02900     0.5638 -1.82512
## 33   1448690_at                 devStageP6 -1.91450     0.5638 -3.39571
## 34   1448690_at                devStageP10 -2.19325     0.5638 -3.89012
## 35   1448690_at            devStage4_weeks -2.08200     0.5638 -3.69280
## 36   1448690_at                 gTypeNrlKO -0.84233     0.6090 -1.38320
## 37   1448690_at      devStageP2:gTypeNrlKO  0.06983     0.8299  0.08415
## 38   1448690_at      devStageP6:gTypeNrlKO  0.16533     0.8299  0.19922
## 39   1448690_at     devStageP10:gTypeNrlKO  0.22583     0.8299  0.27212
## 40   1448690_at devStage4_weeks:gTypeNrlKO  0.64608     0.8299  0.77852
##     Pr...t..
## 1  8.368e-29
## 2  3.110e-03
## 3  3.443e-05
## 4  1.002e-06
## 5  4.703e-08
## 6  4.028e-01
## 7  3.022e-02
## 8  8.369e-03
## 9  5.278e-05
## 10 3.675e-05
## 11 3.312e-34
## 12 2.148e-04
## 13 1.254e-06
## 14 2.377e-04
## 15 5.673e-04
## 16 2.078e-01
## 17 9.405e-01
## 18 4.608e-01
## 19 4.488e-01
## 20 4.332e-01
## 21 1.970e-19
## 22 1.242e-01
## 23 6.991e-03
## 24 1.683e-05
## 25 8.805e-07
## 26 7.537e-01
## 27 8.711e-01
## 28 4.055e-01
## 29 5.595e-01
## 30 8.311e-01
## 31 1.634e-19
## 32 7.830e-02
## 33 2.003e-03
## 34 5.387e-04
## 35 9.149e-04
## 36 1.772e-01
## 37 9.335e-01
## 38 8.435e-01
## 39 7.875e-01
## 40 4.426e-01

fit a model with two covariates w/o their interaction (small)

nFitSmall <- ddply(nDat, ~gene, function(x) {
    xlm <- lm(gExp ~ devStage + gType, x)
    data.frame(terms = rep(names(xlm$coefficients), levels = ncol(xlm$coefficients)), 
        summary(xlm)$coef)
})
nFitSmall
##            gene           terms Estimate Std..Error  t.value  Pr...t..
## 1    1419655_at     (Intercept) 10.13061     0.2527 40.09569 1.416e-29
## 2    1419655_at      devStageP2 -0.60998     0.3254 -1.87432 6.976e-02
## 3    1419655_at      devStageP6 -1.02148     0.3254 -3.13876 3.563e-03
## 4    1419655_at     devStageP10 -0.98098     0.3254 -3.01431 4.923e-03
## 5    1419655_at devStage4_weeks -1.32311     0.3254 -4.06557 2.792e-04
## 6    1419655_at      gTypeNrlKO  1.15724     0.2016  5.74161 2.066e-06
## 7    1438815_at     (Intercept)  8.86676     0.0959 92.46083 1.956e-41
## 8    1438815_at      devStageP2  0.74417     0.1235  6.02461 8.973e-07
## 9    1438815_at      devStageP6  1.15817     0.1235  9.37626 7.919e-11
## 10   1438815_at     devStageP10  0.82729     0.1235  6.69757 1.257e-07
## 11   1438815_at devStage4_weeks -0.58696     0.1235 -4.75185 3.825e-05
## 12   1438815_at      gTypeNrlKO  0.37188     0.0765  4.86122 2.775e-05
## 13 1456341_a_at     (Intercept)  7.14534     0.2598 27.49879 2.513e-24
## 14 1456341_a_at      devStageP2 -0.81436     0.3347 -2.43317 2.055e-02
## 15 1456341_a_at      devStageP6  1.65151     0.3347  4.93444 2.237e-05
## 16 1456341_a_at     devStageP10  2.62189     0.3347  7.83376 4.981e-09
## 17 1456341_a_at devStage4_weeks  2.84651     0.3347  8.50490 7.919e-10
## 18 1456341_a_at      gTypeNrlKO -0.01046     0.2073 -0.05044 9.601e-01
## 19   1448690_at     (Intercept)  8.58032     0.3046 28.16486 1.177e-24
## 20   1448690_at      devStageP2 -1.01036     0.3924 -2.57482 1.470e-02
## 21   1448690_at      devStageP6 -1.84811     0.3924 -4.70975 4.328e-05
## 22   1448690_at     devStageP10 -2.09661     0.3924 -5.34303 6.703e-06
## 23   1448690_at devStage4_weeks -1.77524     0.3924 -4.52404 7.444e-05
## 24   1448690_at      gTypeNrlKO -0.61440     0.2430 -2.52817 1.643e-02

test if the interaction term improves the model

compModels <- ddply(nDat, ~gene, function(x) {
    lmBig <- lm(gExp ~ devStage * gType, x)
    lmSmall <- lm(gExp ~ devStage + gType, x)
    anova(lmSmall, lmBig)
})
compModels
##           gene Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1   1419655_at     33 13.023 NA        NA     NA        NA
## 2   1419655_at     29  6.194  4   6.82919 7.9939 0.0001798
## 3   1438815_at     33  1.876 NA        NA     NA        NA
## 4   1438815_at     29  1.794  4   0.08231 0.3327 0.8536987
## 5 1456341_a_at     33 13.774 NA        NA     NA        NA
## 6 1456341_a_at     29 12.901  4   0.87227 0.4902 0.7428873
## 7   1448690_at     33 18.933 NA        NA     NA        NA
## 8   1448690_at     29 18.436  4   0.49660 0.1953 0.9388662

modeling with quantitative covariate

Factors that take on many levels can be unwieldy to deal with … do you care about the effect of each level and all of it’s potential interactions? or do you only care about the factor in a big picture way?

If it represents something like time or dose or temperature … factor treatment makes it awkward to pull out natural classes of “hits”, e.g. things that go up.

Consider making a quantitative covariate, age in days, and use that to explain changes in gene expression.

recode() is a function in add-on package 'car'

library(car)
prDes$age <- recode(prDes$devStage, "'E16'=-2; 'P2'=2; 'P6'=6; 'P10'=10; '4_weeks'=28", 
    as.factor.result = F)
str(prDes)
## 'data.frame':    39 obs. of  5 variables:
##  $ sidChar : chr  "Sample_11" "Sample_12" "Sample_2" "Sample_9" ...
##  $ sidNum  : num  11 12 2 9 36 37 38 39 16 17 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 5 5 5 5 5 5 5 5 1 1 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 1 1 1 1 2 2 ...
##  $ age     : num  28 28 28 28 28 28 28 28 -2 -2 ...
head(prDes)
##     sidChar sidNum devStage gType age
## 1 Sample_11     11  4_weeks NrlKO  28
## 2 Sample_12     12  4_weeks NrlKO  28
## 3  Sample_2      2  4_weeks NrlKO  28
## 4  Sample_9      9  4_weeks NrlKO  28
## 5 Sample_36     36  4_weeks    wt  28
## 6 Sample_37     37  4_weeks    wt  28

lets look at probesets '1427275_at', '1441811_x_at', '1456341_a_at'

keepers <- c("1427275_at", "1441811_x_at", "1456341_a_at")
qDat <- prDat[keepers, ]
qDat <- data.frame(gExp = as.vector(t(as.matrix(qDat))), gene = factor(rep(rownames(qDat), 
    each = ncol(qDat)), levels = keepers))
qDat <- data.frame(prDes, qDat)

library(lattice)
stripplot(gExp ~ devStage | gene, qDat, subset = gType == "wt", xlab = "devStage", 
    type = c("a", "p"))

plot of chunk unnamed-chunk-44

xyplot(gExp ~ age | gene, qDat, subset = gType == "wt", xlab = "age", type = c("a", 
    "p"))

plot of chunk unnamed-chunk-44


xyplot(gExp ~ age | gene, qDat, subset = gType == "wt", xlab = "age", type = c("p", 
    "r"))

plot of chunk unnamed-chunk-44

linear model looks ok for the middle probe but not so much for the others

lets fit linear model

Y =α_0 +α_1*x + ε - simple linear regression

library(plyr)
linFit <- dlply(qDat, ~gene, function(x) lm(gExp ~ age, x, subset = gType == 
    "wt"))
summary(linFit[["1441811_x_at"]])
## 
## Call:
## lm(formula = gExp ~ age, data = x, subset = gType == "wt")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.551 -0.375 -0.084  0.310  0.868 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.07337    0.13312   60.65  < 2e-16 ***
## age         -0.06818    0.00977   -6.98  1.6e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.454 on 18 degrees of freedom
## Multiple R-squared:  0.73,   Adjusted R-squared:  0.715 
## F-statistic: 48.7 on 1 and 18 DF,  p-value: 1.62e-06
summary(linFit[["1427275_at"]])
## 
## Call:
## lm(formula = gExp ~ age, data = x, subset = gType == "wt")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.574 -0.665 -0.124  0.582  1.593 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.5397     0.2921   29.23   <2e-16 ***
## age          -0.0188     0.0214   -0.88     0.39    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.997 on 18 degrees of freedom
## Multiple R-squared:  0.0409, Adjusted R-squared:  -0.0123 
## F-statistic: 0.769 on 1 and 18 DF,  p-value: 0.392
summary(linFit[["1456341_a_at"]])
## 
## Call:
## lm(formula = gExp ~ age, data = x, subset = gType == "wt")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.658 -0.626 -0.159  0.342  2.259 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.4228     0.3039   24.43  3.0e-15 ***
## age           0.1118     0.0223    5.01  9.1e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.04 on 18 degrees of freedom
## Multiple R-squared:  0.583,  Adjusted R-squared:  0.559 
## F-statistic: 25.1 on 1 and 18 DF,  p-value: 9.05e-05

now lets fit polynomial regression (still a linear model)

Y =α_0 +α_1*x +α_2*x2 + ε - polynomial regression

NOTE: This is a linear model, because it is linear in the alphas. It is easy but wrong to focus on the x’s and mistake this for a nonlinear model.

quadFit <- dlply(qDat, ~gene, function(x) lm(gExp ~ age + I(age^2), x, subset = gType == 
    "wt"))
summary(quadFit[["1441811_x_at"]])
## 
## Call:
## lm(formula = gExp ~ age + I(age^2), data = x, subset = gType == 
##     "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7695 -0.2548 -0.0059  0.1366  0.8220 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.19077    0.14097   58.10   <2e-16 ***
## age         -0.12384    0.03195   -3.88   0.0012 ** 
## I(age^2)     0.00201    0.00110    1.82   0.0866 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.428 on 17 degrees of freedom
## Multiple R-squared:  0.774,  Adjusted R-squared:  0.747 
## F-statistic: 29.1 on 2 and 17 DF,  p-value: 3.23e-06
summary(quadFit[["1427275_at"]])
## 
## Call:
## lm(formula = gExp ~ age + I(age^2), data = x, subset = gType == 
##     "wt")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.163 -0.555  0.095  0.408  0.958 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.03640    0.21231   42.56  < 2e-16 ***
## age         -0.25430    0.04812   -5.28  6.1e-05 ***
## I(age^2)     0.00849    0.00166    5.11  8.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.644 on 17 degrees of freedom
## Multiple R-squared:  0.622,  Adjusted R-squared:  0.577 
## F-statistic:   14 on 2 and 17 DF,  p-value: 0.000257
summary(quadFit[["1456341_a_at"]])
## 
## Call:
## lm(formula = gExp ~ age + I(age^2), data = x, subset = gType == 
##     "wt")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.621 -0.501 -0.005  0.396  1.865 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.11248    0.31092   22.88  3.3e-14 ***
## age          0.25889    0.07048    3.67   0.0019 ** 
## I(age^2)    -0.00530    0.00243   -2.18   0.0436 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.944 on 17 degrees of freedom
## Multiple R-squared:  0.674,  Adjusted R-squared:  0.635 
## F-statistic: 17.6 on 2 and 17 DF,  p-value: 7.34e-05

F- statistics is used for model comparison

anova() - for nested models AIC() - for models that are not nested. It is a measure of information lost going from one model to the next (the lower the number the better the model)

which model is better? simple linear or polynomial (quadratic)

anova(linFit[["1441811_x_at"]], quadFit[["1441811_x_at"]])
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## 1     18 3.72                           
## 2     17 3.11  1     0.606 3.31  0.087 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(linFit[["1427275_at"]], quadFit[["1427275_at"]])
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
##   Res.Df   RSS Df Sum of Sq    F  Pr(>F)    
## 1     18 17.90                              
## 2     17  7.06  1      10.8 26.1 8.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(linFit[["1456341_a_at"]], quadFit[["1456341_a_at"]])
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## 1     18 19.4                           
## 2     17 15.1  1      4.23 4.75  0.044 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(linFit, quadFit)
## Error: no applicable method for 'anova' applied to an object of class
## "list"

do all 3 genes simultaneously

choseModel <- function(x) {
    linFit <- lm(gExp ~ age, x, subset = gType == "wt")
    quadFit <- lm(gExp ~ age + I(age^2), x, subset = gType == "wt")
    anova(linFit, quadFit)
}

compFits <- dlply(qDat, ~gene, choseModel)
compFits
## $`1427275_at`
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
##   Res.Df   RSS Df Sum of Sq    F  Pr(>F)    
## 1     18 17.90                              
## 2     17  7.06  1      10.8 26.1 8.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $`1441811_x_at`
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## 1     18 3.72                           
## 2     17 3.11  1     0.606 3.31  0.087 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $`1456341_a_at`
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## 1     18 19.4                           
## 2     17 15.1  1      4.23 4.75  0.044 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##           gene
## 1   1427275_at
## 2 1441811_x_at
## 3 1456341_a_at
compFits["1427275_at"]
## $`1427275_at`
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ age + I(age^2)
##   Res.Df   RSS Df Sum of Sq    F  Pr(>F)    
## 1     18 17.90                              
## 2     17  7.06  1      10.8 26.1 8.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is a justyfication for going from linear to quadratic for probe “1427275_at”.

would one way ANOVA (linear model with one categorical covariate) fit even better?

for probe “1427275_at”

factFit <- dlply(qDat, ~gene, function(x) lm(gExp ~ devStage, x, subset = gType == 
    "wt"))
factFit[["1427275_at"]]
## 
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
## 
## Coefficients:
##     (Intercept)       devStageP2       devStageP6      devStageP10  
##            9.64            -1.22            -1.72            -2.31  
## devStage4_weeks  
##           -1.07
summary(factFit[["1427275_at"]])$coef
##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)        9.636     0.3398  28.363 1.903e-14
## devStageP2        -1.216     0.4805  -2.530 2.308e-02
## devStageP6        -1.721     0.4805  -3.581 2.731e-03
## devStageP10       -2.307     0.4805  -4.801 2.332e-04
## devStage4_weeks   -1.067     0.4805  -2.220 4.224e-02
summary(factFit[["1427275_at"]])
## 
## Call:
## lm(formula = gExp ~ devStage, data = x, subset = gType == "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0215 -0.5538  0.0377  0.3896  1.0605 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         9.64       0.34   28.36  1.9e-14 ***
## devStageP2         -1.22       0.48   -2.53  0.02308 *  
## devStageP6         -1.72       0.48   -3.58  0.00273 ** 
## devStageP10        -2.31       0.48   -4.80  0.00023 ***
## devStage4_weeks    -1.07       0.48   -2.22  0.04224 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.68 on 15 degrees of freedom
## Multiple R-squared:  0.629,  Adjusted R-squared:  0.53 
## F-statistic: 6.36 on 4 and 15 DF,  p-value: 0.00337
AIC(linFit[["1427275_at"]], quadFit[["1427275_at"]], factFit[["1427275_at"]])
##                         df   AIC
## linFit[["1427275_at"]]   3 60.54
## quadFit[["1427275_at"]]  4 43.93
## factFit[["1427275_at"]]  6 47.55

No, there is no justification in going from quadratic to one-way ANOVA (factorial) analysis for probe “1427275_at”

choseModelAIC <- function(x) {
    linFit <- lm(gExp ~ age, x, subset = gType == "wt")
    quadFit <- lm(gExp ~ age + I(age^2), x, subset = gType == "wt")
    factFit <- lm(gExp ~ devStage, x, subset = gType == "wt")
    AIC(linFit, quadFit, factFit)
}
comp3Models <- dlply(qDat, ~gene, choseModelAIC)
comp3Models
## $`1427275_at`
##         df   AIC
## linFit   3 60.54
## quadFit  4 43.93
## factFit  6 47.55
## 
## $`1441811_x_at`
##         df   AIC
## linFit   3 29.10
## quadFit  4 27.55
## factFit  6 27.13
## 
## $`1456341_a_at`
##         df   AIC
## linFit   3 62.12
## quadFit  4 59.19
## factFit  6 48.70
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##           gene
## 1   1427275_at
## 2 1441811_x_at
## 3 1456341_a_at

For “1441811_x_at” there is not much benefit in going from linear model to either quadratic or factorial. Occam’s Razor and the KISS principle → stick w/ simple linear model.

For “1456341_a_at” it is maybe worth it to go from linear to quadratic. It is justified to go from quadratic to factorial (one way ANOVA).

Increase complexity - 1 quantitative covariate (age) and one categorical covariate (gType)

Let's look at “1441811_x_at”

fit a linear model with interaction

jFit <- lm(gExp ~ gType * age, qDat, subset = gene == "1441811_x_at")
summary(jFit)
## 
## Call:
## lm(formula = gExp ~ gType * age, data = qDat, subset = gene == 
##     "1441811_x_at")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0538 -0.4119 -0.0249  0.3130  1.1442 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.0734     0.1655   48.78  < 2e-16 ***
## gTypeNrlKO       0.1315     0.2407    0.55     0.59    
## age             -0.0682     0.0121   -5.61  2.5e-06 ***
## gTypeNrlKO:age   0.0102     0.0174    0.58     0.56    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.565 on 35 degrees of freedom
## Multiple R-squared:  0.607,  Adjusted R-squared:  0.573 
## F-statistic:   18 on 3 and 35 DF,  p-value: 3.05e-07

do a different parametrization

jFitAlt <- lm(gExp ~ gType/age - 1, qDat, subset = gene == "1441811_x_at")
summary(jFitAlt)
## 
## Call:
## lm(formula = gExp ~ gType/age - 1, data = qDat, subset = gene == 
##     "1441811_x_at")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0538 -0.4119 -0.0249  0.3130  1.1442 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## gTypewt          8.0734     0.1655   48.78  < 2e-16 ***
## gTypeNrlKO       8.2048     0.1748   46.95  < 2e-16 ***
## gTypewt:age     -0.0682     0.0121   -5.61  2.5e-06 ***
## gTypeNrlKO:age  -0.0580     0.0125   -4.64  4.8e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.565 on 35 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.994 
## F-statistic: 1.76e+03 on 4 and 35 DF,  p-value: <2e-16

can switch between parametrizations via multiplication by an appropriate contrast matrix!

(contMat <- rbind(c(1, 0, 0, 0), c(1, 1, 0, 0), c(0, 0, 1, 0), c(0, 0, 1, 1)))
##      [,1] [,2] [,3] [,4]
## [1,]    1    0    0    0
## [2,]    1    1    0    0
## [3,]    0    0    1    0
## [4,]    0    0    1    1
cbind(coefDefault = coef(jFit), coefAlt = coef(jFitAlt), matrixResult = as.vector(contMat %*% 
    coef(jFit)))
##                coefDefault  coefAlt matrixResult
## (Intercept)        8.07337  8.07337      8.07337
## gTypeNrlKO         0.13148  8.20485      8.20485
## age               -0.06818 -0.06818     -0.06818
## gTypeNrlKO:age     0.01019 -0.05799     -0.05799

as always, you can assess the relevance of several terms at once – such as everything involving genotype – with an F test

anova(lm(gExp ~ age, qDat, subset = gene == "1441811_x_at"), jFit)
## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ gType * age
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     37 11.8                         
## 2     35 11.2  2     0.598 0.94    0.4

it is not clear that genotype effectst either the intercept or the slope

let's test model that includes quadratic age term for probe 1427275_at

quadFit <- lm(gExp ~ gType * (age + I(age^2)), qDat, subset = gene == "1427275_at")
summary(quadFit)
## 
## Call:
## lm(formula = gExp ~ gType * (age + I(age^2)), data = qDat, subset = gene == 
##     "1427275_at")
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.163 -0.558  0.082  0.420  1.968 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.03640    0.23485   38.48  < 2e-16 ***
## gTypeNrlKO          -0.78497    0.35025   -2.24    0.032 *  
## age                 -0.25430    0.05323   -4.78  3.6e-05 ***
## I(age^2)             0.00849    0.00184    4.62  5.6e-05 ***
## gTypeNrlKO:age       0.14820    0.07823    1.89    0.067 .  
## gTypeNrlKO:I(age^2) -0.00500    0.00267   -1.87    0.070 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.713 on 33 degrees of freedom
## Multiple R-squared:  0.476,  Adjusted R-squared:  0.396 
## F-statistic: 5.98 on 5 and 33 DF,  p-value: 0.00048

# assess relevance of genotype with F test

anova(lm(gExp ~ age + I(age^2), qDat, subset = gene == "1427275_at"), lm(gExp ~ 
    gType * (age + I(age^2)), qDat, subset = gene == "1427275_at"))
## Analysis of Variance Table
## 
## Model 1: gExp ~ age + I(age^2)
## Model 2: gExp ~ gType * (age + I(age^2))
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     36 20.1                         
## 2     33 16.8  3      3.31 2.17   0.11

borderline evidence that genotype has some effect on the fit.