Seminar 5 - Fitting and interpreting linear models (low volume)

Load the required libraries and import the data:

library(lattice)
library(reshape2)

## Warning: package 'reshape2' was built under R version 3.0.2

library(car)

## Loading required package: MASS
## Loading required package: nnet

prDat <- read.table("GSE4051_data.tsv", header = T)
str(prDat, max.level = 0)

## 'data.frame':    29949 obs. of  39 variables:

prDes <- read.table("GSE4051_design.tsv", header = T)
str(prDes)

## 'data.frame':    39 obs. of  4 variables:
##  $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
##  $ sidNum  : int  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "4_weeks","E16",..: 2 2 2 2 2 2 2 4 4 4 ...
##  $ gType   : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...

The following function, called preparedata, takes as input the probeset IDs and as output it gives a new dataframe. In this case two genes where chosen and a new dataframe, called dat, was created.

luckyGenes <- c("1419655_at", "1438815_at")
preparedata <- function(pID) {
    genexp <- prDat[pID, ]
    Dat <- data.frame(gene = row.names(genexp), genexp)
    newDat <- melt(Dat, id.vars = "gene")
    names(newDat) <- c("gene", "sidChar", "gExp")
    final <- merge(newDat, prDes, by = "sidChar")
    final$sidChar <- as.character(final$sidChar)
    final$devStage <- factor(final$devStage, c("E16", "P2", "P6", "P10", "4_weeks"))
    final$gType <- factor(final$gType, c("wt", "NrlKO"))
    return(final)
}
dat <- preparedata(luckyGenes)

The next function stripplot a mini dataset. The function was used twice. First, the “dat” dataframe was used as input, then a new mini dataframe was created and used as input.

makeStripplot <- function(x) {
    stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = TRUE, 
        auto.key = TRUE, type = c("p", "a"), grid = TRUE)
}

makeStripplot(dat)

plot of chunk unnamed-chunk-4

makeStripplot(nDat <- preparedata("1456341_a_at"))

plot of chunk unnamed-chunk-4

Test for a difference in expected gene expression for probeset “1456341_a_at” at developmental stage P2 versus 4 weeks post-natal.

sub <- subset(nDat, devStage == "P2" | devStage == "4_weeks")
t.test(sub$gExp ~ sub$devStage, var.equal = T)

## 
##  Two Sample t-test
## 
## data:  sub$gExp by sub$devStage
## t = -18.84, df = 14, p-value = 2.411e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.078 -3.244
## sample estimates:
##      mean in group P2 mean in group 4_weeks 
##                 6.326                 9.987

Fit a linear model with a categorical covariate:

makeStripplot(nDat1 <- preparedata("1438786_a_at"))

plot of chunk unnamed-chunk-6

mod1 <- lm(formula = gExp ~ devStage, data = nDat1, subset = gType == "wt")
summary(mod1)

## 
## Call:
## lm(formula = gExp ~ devStage, data = nDat1, subset = gType == 
##     "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1565 -0.4400  0.0288  0.4915  1.2065 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.523      0.379   22.50  5.7e-13 ***
## devStageP2        -1.450      0.536   -2.71    0.016 *  
## devStageP6        -0.107      0.536   -0.20    0.845    
## devStageP10       -1.201      0.536   -2.24    0.040 *  
## devStage4_weeks    0.081      0.536    0.15    0.882    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.758 on 15 degrees of freedom
## Multiple R-squared:  0.497,  Adjusted R-squared:  0.363 
## F-statistic: 3.71 on 4 and 15 DF,  p-value: 0.0272

The intercept is represented by the value of gExp for the developmental stage E16. The result given by fitting a linear model is also confirmed by the blu line in the plot (the one related to the gType “wt”). Also for the devStage effect the result are confirmed by the stripplot. The expression values for devStage P2 and P10 are quite similar.

Perform inference for a contrast:

coef(mod1)

##     (Intercept)      devStageP2      devStageP6     devStageP10 
##          8.5227         -1.4503         -0.1067         -1.2012 
## devStage4_weeks 
##          0.0810

contMat <- matrix(c(0, 1, 0, -1, 0), nrow = 1)
(obsDiff <- contMat %*% coef(mod1))

##        [,1]
## [1,] -0.249

sampMeans <- aggregate(gExp ~ devStage, nDat1, FUN = mean, subset = gType == 
    "wt")
sampMeans

##   devStage  gExp
## 1      E16 8.523
## 2       P2 7.072
## 3       P6 8.416
## 4      P10 7.322
## 5  4_weeks 8.604

with(sampMeans, gExp[devStage == "P2"] - gExp[devStage == "P10"])

## [1] -0.249

vcov(mod1)

##                 (Intercept) devStageP2 devStageP6 devStageP10
## (Intercept)          0.1435    -0.1435    -0.1435     -0.1435
## devStageP2          -0.1435     0.2870     0.1435      0.1435
## devStageP6          -0.1435     0.1435     0.2870      0.1435
## devStageP10         -0.1435     0.1435     0.1435      0.2870
## devStage4_weeks     -0.1435     0.1435     0.1435      0.1435
##                 devStage4_weeks
## (Intercept)             -0.1435
## devStageP2               0.1435
## devStageP6               0.1435
## devStageP10              0.1435
## devStage4_weeks          0.2870

summary(mod1)$coefficients[, "Std. Error"]

##     (Intercept)      devStageP2      devStageP6     devStageP10 
##          0.3788          0.5357          0.5357          0.5357 
## devStage4_weeks 
##          0.5357

sqrt(diag(vcov(mod1)))

##     (Intercept)      devStageP2      devStageP6     devStageP10 
##          0.3788          0.5357          0.5357          0.5357 
## devStage4_weeks 
##          0.5357

(estSe <- contMat %*% vcov(mod1) %*% t(contMat))

##       [,1]
## [1,] 0.287

(testStat <- obsDiff/estSe)

##         [,1]
## [1,] -0.8676

2 * pt(abs(testStat), df = df.residual(mod1), lower.tail = FALSE)

##        [,1]
## [1,] 0.3993

Fit a linear model with two categorical covariates:

makeStripplot(oDat <- preparedata("1448690_at"))

plot of chunk unnamed-chunk-9

oFitBig <- lm(gExp ~ devStage * gType, data = oDat)
summary(oFitBig)$coef

##                            Estimate Std. Error  t value  Pr(>|t|)
## (Intercept)                 8.67800     0.3987 21.76755 1.634e-19
## devStageP2                 -1.02900     0.5638 -1.82512 7.830e-02
## devStageP6                 -1.91450     0.5638 -3.39571 2.003e-03
## devStageP10                -2.19325     0.5638 -3.89012 5.387e-04
## devStage4_weeks            -2.08200     0.5638 -3.69280 9.149e-04
## gTypeNrlKO                 -0.84233     0.6090 -1.38320 1.772e-01
## devStageP2:gTypeNrlKO       0.06983     0.8299  0.08415 9.335e-01
## devStageP6:gTypeNrlKO       0.16533     0.8299  0.19922 8.435e-01
## devStageP10:gTypeNrlKO      0.22583     0.8299  0.27212 7.875e-01
## devStage4_weeks:gTypeNrlKO  0.64608     0.8299  0.77852 4.426e-01

Looking at both the stripplot and the results of the linear regression, the intercept value seems plausible. The case for interaction seems very weak. Let's compare this model with a smaller one that does not take into account the interaction.

oFitSmall <- lm(gExp ~ devStage + gType, data = oDat)
summary(oFitSmall)$coef

##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)       8.5803     0.3046  28.165 1.177e-24
## devStageP2       -1.0104     0.3924  -2.575 1.470e-02
## devStageP6       -1.8481     0.3924  -4.710 4.328e-05
## devStageP10      -2.0966     0.3924  -5.343 6.703e-06
## devStage4_weeks  -1.7752     0.3924  -4.524 7.444e-05
## gTypeNrlKO       -0.6144     0.2430  -2.528 1.643e-02

anova(oFitSmall, oFitBig)

## Analysis of Variance Table
## 
## Model 1: gExp ~ devStage + gType
## Model 2: gExp ~ devStage * gType
##   Res.Df  RSS Df Sum of Sq   F Pr(>F)
## 1     33 18.9                        
## 2     29 18.4  4     0.497 0.2   0.94

The p-value close to one indicated that there is no evidence for interaction in this case. The following is a gene where interaction is significant:

makeStripplot(oDat1 <- preparedata("1429225_at"))

plot of chunk unnamed-chunk-12

str(oDat1)

## 'data.frame':    39 obs. of  6 variables:
##  $ sidChar : chr  "Sample_1" "Sample_10" "Sample_11" "Sample_12" ...
##  $ gene    : Factor w/ 1 level "1429225_at": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gExp    : num  6.51 6.85 9.5 9.01 7.87 ...
##  $ sidNum  : int  1 10 11 12 13 14 15 16 17 18 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 3 3 5 5 4 2 4 1 1 4 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 2 2 2 2 2 2 ...

oFitBig1 <- lm(gExp ~ devStage * gType, data = oDat1)
summary(oFitBig1)$coef

##                            Estimate Std. Error t value  Pr(>|t|)
## (Intercept)                  7.3125     0.2617 27.9391 1.619e-22
## devStageP2                  -1.1582     0.3701 -3.1292 3.973e-03
## devStageP6                  -1.2495     0.3701 -3.3757 2.110e-03
## devStageP10                 -1.0717     0.3701 -2.8955 7.125e-03
## devStage4_weeks             -0.9087     0.3701 -2.4551 2.032e-02
## gTypeNrlKO                  -0.2602     0.3998 -0.6507 5.203e-01
## devStageP2:gTypeNrlKO        0.2804     0.5448  0.5147 6.107e-01
## devStageP6:gTypeNrlKO        0.7589     0.5448  1.3929 1.742e-01
## devStageP10:gTypeNrlKO       1.7914     0.5448  3.2880 2.648e-03
## devStage4_weeks:gTypeNrlKO   2.2389     0.5448  4.1094 2.970e-04

oFitSmall1 <- lm(gExp ~ devStage + gType, data = oDat1)
summary(oFitSmall1)$coef

##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)       6.8652     0.2722 25.2199 3.848e-23
## devStageP2       -1.0926     0.3506 -3.1161 3.780e-03
## devStageP6       -0.9446     0.3506 -2.6940 1.101e-02
## devStageP10      -0.2506     0.3506 -0.7147 4.798e-01
## devStage4_weeks   0.1362     0.3506  0.3883 7.003e-01
## gTypeNrlKO        0.7836     0.2172  3.6085 1.007e-03

anova(oFitSmall1, oFitBig1)

## Analysis of Variance Table
## 
## Model 1: gExp ~ devStage + gType
## Model 2: gExp ~ devStage * gType
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1     33 15.12                             
## 2     29  7.95  4      7.17 6.54  7e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Create a function that perform linear regression for a small dataset of genes at the same time.

library(plyr)
linReg <- ddply(dat, ~gene, function(z) {
    estCoefs <- coef(lm(gExp ~ devStage * gType, data = z))
    return(estCoefs)
})

linReg[1, ]

##         gene (Intercept) devStageP2 devStageP6 devStageP10 devStage4_weeks
## 1 1419655_at       10.76     -1.054     -1.598      -2.016          -2.389
##   gTypeNrlKO devStageP2:gTypeNrlKO devStageP6:gTypeNrlKO
## 1    -0.2997                 1.096                 1.361
##   devStageP10:gTypeNrlKO devStage4_weeks:gTypeNrlKO
## 1                  2.278                      2.341

To check whether the function works, one of the two luck genes was extracted, then, for this gene I performed linear regression and check whether the results are the same of the ones obtained used the previous function.

# Results from the function
linReg[1, ]

##         gene (Intercept) devStageP2 devStageP6 devStageP10 devStage4_weeks
## 1 1419655_at       10.76     -1.054     -1.598      -2.016          -2.389
##   gTypeNrlKO devStageP2:gTypeNrlKO devStageP6:gTypeNrlKO
## 1    -0.2997                 1.096                 1.361
##   devStageP10:gTypeNrlKO devStage4_weeks:gTypeNrlKO
## 1                  2.278                      2.341

# Result from simple linear regression
reg <- preparedata("1419655_at")
fit <- lm(gExp ~ devStage * gType, data = reg)
summary(fit)$coef

##                            Estimate Std. Error t value  Pr(>|t|)
## (Intercept)                 10.7550     0.2311  46.544 8.368e-29
## devStageP2                  -1.0540     0.3268  -3.225 3.110e-03
## devStageP6                  -1.5980     0.3268  -4.890 3.443e-05
## devStageP10                 -2.0160     0.3268  -6.169 1.002e-06
## devStage4_weeks             -2.3895     0.3268  -7.312 4.703e-08
## gTypeNrlKO                  -0.2997     0.3530  -0.849 4.028e-01
## devStageP2:gTypeNrlKO        1.0962     0.4810   2.279 3.022e-02
## devStageP6:gTypeNrlKO        1.3612     0.4810   2.830 8.369e-03
## devStageP10:gTypeNrlKO       2.2782     0.4810   4.736 5.278e-05
## devStage4_weeks:gTypeNrlKO   2.3409     0.4810   4.867 3.675e-05

As expected the estimates are the same.

Using the variable age instead of devStage. In this case, the gene examinated are the one also presented in class.

prDes$age <- recode(prDes$devStage, "'E16'=-2; 'P2'=2; 'P6'=6; 'P10'=10; '4_weeks'=28", 
    as.factor.result = FALSE)
datAge <- preparedata(c("1427275_at", "1441811_x_at", "1456341_a_at"))

makeStripplot(datAge)

plot of chunk unnamed-chunk-17

# focus on '1441811_x_at'
ageNew <- preparedata("1441811_x_at")
AgeFit <- lm(gExp ~ gType * age, ageNew)
summary(AgeFit)

## 
## Call:
## lm(formula = gExp ~ gType * age, data = ageNew)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0538 -0.4119 -0.0249  0.3130  1.1442 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.0734     0.1655   48.78  < 2e-16 ***
## gTypeNrlKO       0.1315     0.2407    0.55     0.59    
## age             -0.0682     0.0121   -5.61  2.5e-06 ***
## gTypeNrlKO:age   0.0102     0.0174    0.58     0.56    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.565 on 35 degrees of freedom
## Multiple R-squared:  0.607,  Adjusted R-squared:  0.573 
## F-statistic:   18 on 3 and 35 DF,  p-value: 3.05e-07

# anova
anova(lm(gExp ~ age, ageNew), AgeFit)

## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ gType * age
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     37 11.8                         
## 2     35 11.2  2     0.598 0.94    0.4

# focus on '1427275_at'
ageNew2 <- preparedata("1427275_at")
AgeFit2 <- lm(gExp ~ gType * (age + I(age^2)), ageNew2)
summary(AgeFit2)

## 
## Call:
## lm(formula = gExp ~ gType * (age + I(age^2)), data = ageNew2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.163 -0.558  0.082  0.420  1.968 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.03640    0.23485   38.48  < 2e-16 ***
## gTypeNrlKO          -0.78497    0.35025   -2.24    0.032 *  
## age                 -0.25430    0.05323   -4.78  3.6e-05 ***
## I(age^2)             0.00849    0.00184    4.62  5.6e-05 ***
## gTypeNrlKO:age       0.14820    0.07823    1.89    0.067 .  
## gTypeNrlKO:I(age^2) -0.00500    0.00267   -1.87    0.070 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.713 on 33 degrees of freedom
## Multiple R-squared:  0.476,  Adjusted R-squared:  0.396 
## F-statistic: 5.98 on 5 and 33 DF,  p-value: 0.00048

# anova
anova(lm(gExp ~ age + I(age^2), ageNew2), AgeFit2)

## Analysis of Variance Table
## 
## Model 1: gExp ~ age + I(age^2)
## Model 2: gExp ~ gType * (age + I(age^2))
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     36 20.1                         
## 2     33 16.8  3      3.31 2.17   0.11

Drop devStage 4 weks and repeat the previous analysis.

no4weeks <- subset(preparedata(c("1427275_at", "1441811_x_at")), age == -2 | 
    age == 2 | age == 6 | age == 10)
str(no4weeks)

## 'data.frame':    62 obs. of  7 variables:
##  $ sidChar : chr  "Sample_1" "Sample_1" "Sample_10" "Sample_10" ...
##  $ gene    : Factor w/ 2 levels "1427275_at","1441811_x_at": 1 2 1 2 1 2 1 2 1 2 ...
##  $ gExp    : num  7.83 7.83 8.34 8.92 7.42 ...
##  $ sidNum  : int  1 1 10 10 13 13 14 14 15 15 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 3 3 3 3 4 4 2 2 4 4 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 2 2 2 2 2 2 2 2 2 2 ...
##  $ age     : num  6 6 6 6 10 10 2 2 10 10 ...

makeStripplot(no4weeks)

plot of chunk unnamed-chunk-20

# focus on '1441811_x_at'
new <- subset(no4weeks, subset = gene == "1441811_x_at")
NewFit <- lm(gExp ~ gType * age, new)
summary(NewFit)

## 
## Call:
## lm(formula = gExp ~ gType * age, data = new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3659 -0.2987  0.0104  0.4191  1.1995 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     8.19724    0.19862   41.27   <2e-16 ***
## gTypeNrlKO      0.20833    0.29485    0.71   0.4859    
## age            -0.10729    0.03310   -3.24   0.0032 ** 
## gTypeNrlKO:age -0.00638    0.04839   -0.13   0.8961    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.592 on 27 degrees of freedom
## Multiple R-squared:  0.441,  Adjusted R-squared:  0.379 
## F-statistic:  7.1 on 3 and 27 DF,  p-value: 0.00115

# anova
anova(lm(gExp ~ age, new), NewFit)

## Analysis of Variance Table
## 
## Model 1: gExp ~ age
## Model 2: gExp ~ gType * age
##   Res.Df  RSS Df Sum of Sq    F Pr(>F)
## 1     29 9.73                         
## 2     27 9.47  2     0.261 0.37   0.69

# focus on '1427275_at'
new2 <- subset(no4weeks, subset = gene == "1427275_at")
NewFit2 <- lm(gExp ~ gType * (age + I(age^2)), new2)
summary(NewFit2)

## 
## Call:
## lm(formula = gExp ~ gType * (age + I(age^2)), data = new2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1404 -0.5368  0.0904  0.4397  0.9416 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          9.02868    0.21461   42.07   <2e-16 ***
## gTypeNrlKO          -0.74747    0.31595   -2.37   0.0261 *  
## age                 -0.26430    0.08613   -3.07   0.0051 ** 
## I(age^2)             0.00983    0.00983    1.00   0.3267    
## gTypeNrlKO:age       0.22602    0.12702    1.78   0.0873 .  
## gTypeNrlKO:I(age^2) -0.01506    0.01418   -1.06   0.2986    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.629 on 25 degrees of freedom
## Multiple R-squared:  0.598,  Adjusted R-squared:  0.518 
## F-statistic: 7.45 on 5 and 25 DF,  p-value: 0.000214

# anova
anova(lm(gExp ~ age + I(age^2), new2), NewFit2)

## Analysis of Variance Table
## 
## Model 1: gExp ~ age + I(age^2)
## Model 2: gExp ~ gType * (age + I(age^2))
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)  
## 1     28 12.88                           
## 2     25  9.89  3      2.99 2.52  0.081 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1