Seminar 5 Low-volume Fitting and Interpreting Linear Models

We will using data aggregation methods to fit linear models on a small dataset comprising a few (say, 9) probes.

library(lattice)
library(plyr)
prDat <- read.table("GSE4051_data.tsv")
str(prDat, max.level = 0)

## 'data.frame':    29949 obs. of  39 variables:

prDes <- readRDS("GSE4051_design.rds")
str(prDes)

## 'data.frame':    39 obs. of  4 variables:
##  $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
##  $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...

Select 9 random probes from the dataset.

set.seed(1231411)
n <- 9
(luckyGenes <- rownames(prDat)[sample(1:nrow(prDat), size = n)])

## [1] "1457246_at"   "1442791_x_at" "1423612_at"   "1445125_at"  
## [5] "1426780_at"   "1429798_s_at" "1447506_at"   "1446842_at"  
## [9] "1431353_at"

We will need to first write a number of functions to generate the dataframe, make the stripplots and perform the two-sample tests

prepareData <- function(a) {
    dat <- prDat[a, ]
    dat <- data.frame(gExp = as.vector(t(as.matrix(dat))), gene = factor(rep(rownames(dat), 
        each = ncol(dat)), levels = rownames(dat)))
    dat <- suppressWarnings(data.frame(prDes, dat))
    return(dat)
}

makeStripplot <- function(x, ...) {
    stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = TRUE, 
        auto.key = TRUE, type = c("p", "a"), par.settings = simpleTheme(col = c("dodgerblue", 
            "deeppink1"), pch = 16), grid = TRUE, ...)
}

Run.TTest <- function(t) {
    datt <- subset(t, devStage %in% c("P2", "P10"))
    tt <- t.test(gExp ~ devStage, datt)
    return(c(tt$estimate, tt$statistic, tt$parameter, p.value = tt$p.value))
}

Fit.LM <- function(x) {
    fitA <- lm(gExp ~ devStage, data = x, subset = gType == "wt")
    return(fitA)
}

inference.testing <- function(o) {
    obsDiff <- contrast.matrix %*% coef(o)
    estSE <- contrast.matrix %*% vcov(o) %*% t(contrast.matrix)
    testStat <- obsDiff/estSE
    p.values <- 2 * pt(abs(testStat), df = df.residual(o), lower.tail = FALSE)
    c(Estimate = obsDiff, Std_Error = estSE, Statistic = testStat, p.values = p.values)
}

Interaction.2covariates <- function(y) {
    oFitBig <- lm(formula = gExp ~ gType * devStage, data = y)
    oFitSmall <- lm(formula = gExp ~ gType + devStage, data = y)
    anova.interaction <- anova(oFitSmall, oFitBig)
    c(p.values = anova.interaction[2, 6])
}

Let's begin the analysis by preparing the dataset of the 9 selected probes and making stripplots for each probe.

makeStripplot(sumDat <- prepareData(luckyGenes))

plot of chunk unnamed-chunk-1

head(sumDat)

##     sidChar sidNum devStage gType   gExp       gene
## 1 Sample_20     20      E16    wt 10.910 1457246_at
## 2 Sample_21     21      E16    wt 10.850 1457246_at
## 3 Sample_22     22      E16    wt 10.330 1457246_at
## 4 Sample_23     23      E16    wt 10.650 1457246_at
## 5 Sample_16     16      E16 NrlKO  8.721 1457246_at
## 6 Sample_17     17      E16 NrlKO 10.300 1457246_at

Now we will run t-tests across all probes in the dataset comparing P2 vs. P10 expression levels

ddply(sumDat, ~gene, Run.TTest)

##           gene mean in group P2 mean in group P10       t     df  p.value
## 1   1457246_at            9.210             9.763 -1.6226 13.389 0.127968
## 2 1442791_x_at            7.883             7.704  0.8871 11.844 0.392662
## 3   1423612_at            6.239             6.607 -3.5748 11.635 0.004001
## 4   1445125_at            8.801             8.517  2.6568 10.623 0.022918
## 5   1426780_at            7.336             7.561 -1.0043  8.296 0.343627
## 6 1429798_s_at            7.997             7.781  2.0527  9.138 0.069834
## 7   1447506_at            8.030             7.926  1.0391 13.939 0.316457
## 8   1446842_at            8.954             8.878  0.3956 11.568 0.699579
## 9   1431353_at            8.537             8.137  1.3583 10.903 0.201815

Based on the t-test p-values, only the probes “1423612_at” and “1445125_at” had expression levels that were significantly different at P2 vs P10 devStages.

We will also fit linear models for expression of each probe by devStage, using only wt samples

lm.list <- dlply(sumDat, ~gene, Fit.LM)
(lm.coef.df <- ldply(lm.list, coef))

##           gene (Intercept) devStageP2 devStageP6 devStageP10
## 1   1457246_at      10.685   -1.85450   -1.49850     -1.3105
## 2 1442791_x_at       7.055    0.75525    0.56225      0.7080
## 3   1423612_at       6.193    0.06025   -0.01450      0.2820
## 4   1445125_at       8.133    0.65925    0.36025      0.4753
## 5   1426780_at       8.036   -0.79900   -0.62650     -0.5987
## 6 1429798_s_at       7.468    0.47600    0.38675      0.4650
## 7   1447506_at       7.785    0.18225    0.05025      0.1285
## 8   1446842_at       8.456    0.39875    0.27400      0.2110
## 9   1431353_at       7.797    0.82225    0.39975      0.4320
##   devStage4_weeks
## 1         -0.7707
## 2          0.2430
## 3          0.4003
## 4          0.2273
## 5         -0.3092
## 6          0.2182
## 7          0.2395
## 8          0.1818
## 9          0.1215

We will need to make a contrast matrix in order to infer differences between devStage P2 and 4_weeks effect

contrast.matrix <- matrix(c(0, 1, 0, 0, -1), nrow = 1)
(infer.df <- ldply(lm.list, inference.testing))

##           gene Estimate Std_Error Statistic  p.values
## 1   1457246_at -1.08375   0.21863    -4.957 1.721e-04
## 2 1442791_x_at  0.51225   0.09938     5.154 1.176e-04
## 3   1423612_at -0.34000   0.02133   -15.941 8.202e-11
## 4   1445125_at  0.43200   0.02924    14.773 2.407e-10
## 5   1426780_at -0.48975   0.10297    -4.756 2.549e-04
## 6 1429798_s_at  0.25775   0.02875     8.966 2.058e-07
## 7   1447506_at -0.05725   0.01506    -3.801 1.739e-03
## 8   1446842_at  0.21700   0.09722     2.232 4.129e-02
## 9   1431353_at  0.70075   0.21716     3.227 5.644e-03

Given that the two-sided p-values are rather large (ie >0.05), we can conclude that there is no true difference in mean expression at P2 and 4 weeks in wildtype mice for these 9 probes.

We will now fit a linear model with two categorical covariates (gType and devStage) and test if their interaction is significant using anova

ddply(sumDat, ~gene, Interaction.2covariates)

##           gene p.values
## 1   1457246_at  0.08581
## 2 1442791_x_at  0.57786
## 3   1423612_at  0.03346
## 4   1445125_at  0.50339
## 5   1426780_at  0.44001
## 6 1429798_s_at  0.36282
## 7   1447506_at  0.78203
## 8   1446842_at  0.93564
## 9   1431353_at  0.53866

Based on the high p-values from the anova, we can conclude that there is no interaction of gType and devStage for most of these probes except for 1423612_at (p-value = 0.033).