Stat 540 Seminar 5

To explore linear models on gene expression data.

Start by loading data/libraries:

library(lattice)
library(plyr)
prDat <- read.table("../GSE4051_data.tsv")
prDes <- read.table("../GSE4051_design.tsv", header = TRUE)

Functions we were asked to make

prepareData <- function(x) {
    miniDat <- subset(prDat, rownames(prDat) %in% x)
    miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))), gene = factor(rep(rownames(miniDat), 
        each = ncol(miniDat)), levels = x))
    miniDat <- data.frame(prDes, miniDat)
    miniDat$sidNum <- as.numeric(miniDat$sidNum)
    miniDat$devStage <- factor(miniDat$devStage, levels = unique(miniDat$devStage))  #to make plotting in order
    return(miniDat)
}

makeStripplot <- function(x, pch = 1, cex = 1) {
    return(stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = TRUE, 
        auto.key = TRUE, type = c("p", "a"), grid = TRUE, pch = pch, cex = cex))
}

Write a function to prepare a mini-dataset for a small number of genes

(luckyGenes <- c("1419655_at", "1438815_at"))

## [1] "1419655_at" "1438815_at"

jDat <- prepareData(luckyGenes)
str(jDat)

## 'data.frame':    78 obs. of  6 variables:
##  $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
##  $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ gType   : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
##  $ gExp    : num  10.93 10.74 10.67 10.68 9.61 ...
##  $ gene    : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...

head(jDat)

##     sidChar sidNum devStage gType   gExp       gene
## 1 Sample_20     20      E16    wt 10.930 1419655_at
## 2 Sample_21     21      E16    wt 10.740 1419655_at
## 3 Sample_22     22      E16    wt 10.670 1419655_at
## 4 Sample_23     23      E16    wt 10.680 1419655_at
## 5 Sample_16     16      E16 NrlKO  9.606 1419655_at
## 6 Sample_17     17      E16 NrlKO 10.840 1419655_at

tail(jDat)

##      sidChar sidNum devStage gType  gExp       gene
## 73 Sample_38     38  4_weeks    wt 8.211 1438815_at
## 74 Sample_39     39  4_weeks    wt 8.436 1438815_at
## 75 Sample_11     11  4_weeks NrlKO 8.465 1438815_at
## 76 Sample_12     12  4_weeks NrlKO 8.841 1438815_at
## 77  Sample_2      2  4_weeks NrlKO 8.506 1438815_at
## 78  Sample_9      9  4_weeks NrlKO 8.952 1438815_at

Write a function to stripplot a mini-dataset

makeStripplot(jDat)

plot of chunk unnamed-chunk-5

makeStripplot(jDat, pch = 17, cex = 3)

plot of chunk unnamed-chunk-5

You can use both of your functions together and create a minidatset and plot it all at once:

makeStripplot(newDat <- prepareData("1456341_a_at"))

plot of chunk unnamed-chunk-6

str(newDat)

## 'data.frame':    39 obs. of  6 variables:
##  $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
##  $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ gType   : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
##  $ gExp    : num  7.04 7.48 7.37 6.94 6.16 ...
##  $ gene    : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...

head(newDat)

##     sidChar sidNum devStage gType  gExp         gene
## 1 Sample_20     20      E16    wt 7.044 1456341_a_at
## 2 Sample_21     21      E16    wt 7.478 1456341_a_at
## 3 Sample_22     22      E16    wt 7.374 1456341_a_at
## 4 Sample_23     23      E16    wt 6.944 1456341_a_at
## 5 Sample_16     16      E16 NrlKO 6.161 1456341_a_at
## 6 Sample_17     17      E16 NrlKO 6.931 1456341_a_at

Do a two-sample t-test

Here's what I get:

mDat <- prepareData("1438786_a_at")
t.test(gExp ~ devStage, mDat, subset = devStage %in% c("P2", "4_weeks"), var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  gExp by devStage
## t = -5.666, df = 14, p-value = 5.821e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.9602 -0.8838
## sample estimates:
##      mean in group P2 mean in group 4_weeks 
##                 6.949                 8.371

Fit a linear model with a categorical covariate

mFit <- lm(gExp ~ devStage, mDat, subset = gType == "wt")
summary(mFit)

## 
## Call:
## lm(formula = gExp ~ devStage, data = mDat, subset = gType == 
##     "wt")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1565 -0.4400  0.0288  0.4915  1.2065 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.523      0.379   22.50  5.7e-13 ***
## devStageP2        -1.450      0.536   -2.71    0.016 *  
## devStageP6        -0.107      0.536   -0.20    0.845    
## devStageP10       -1.201      0.536   -2.24    0.040 *  
## devStage4_weeks    0.081      0.536    0.15    0.882    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.758 on 15 degrees of freedom
## Multiple R-squared:  0.497,  Adjusted R-squared:  0.363 
## F-statistic: 3.71 on 4 and 15 DF,  p-value: 0.0272

Perform inference for a contrast

Helpful link for making correct contrast matrix http://www.vsni.co.uk/software/genstat/htmlhelp/anova/Contrasts.htm

contMat <- matrix(c(0, 1, 0, -1, 0), nrow = 1)
obsDiff <- contMat %*% coef(mFit)
obsDiff

##        [,1]
## [1,] -0.249

Get p-value for contrast

estSe <- contMat %*% vcov(mFit) %*% t(contMat)
testStat <- obsDiff/estSe
2 * pt(abs(testStat), df = df.residual(mFit), lower.tail = FALSE)

##        [,1]
## [1,] 0.3993

Fit a linear model with two categorical covariates

makeStripplot(oDat <- prepareData("1448690_at"))

plot of chunk unnamed-chunk-11

oFitBig <- lm(gExp ~ gType * devStage, oDat)
summary(oFitBig)$coef

##                         Estimate Std. Error  t value  Pr(>|t|)
## (Intercept)              7.83567     0.4603 17.02145 1.234e-16
## gTypewt                  0.84233     0.6090  1.38320 1.772e-01
## devStageP2              -0.95917     0.6090 -1.57505 1.261e-01
## devStageP6              -1.74917     0.6090 -2.87232 7.542e-03
## devStageP10             -1.96742     0.6090 -3.23071 3.068e-03
## devStage4_weeks         -1.43592     0.6090 -2.35793 2.534e-02
## gTypewt:devStageP2      -0.06983     0.8299 -0.08415 9.335e-01
## gTypewt:devStageP6      -0.16533     0.8299 -0.19922 8.435e-01
## gTypewt:devStageP10     -0.22583     0.8299 -0.27212 7.875e-01
## gTypewt:devStage4_weeks -0.64608     0.8299 -0.77852 4.426e-01

oFitSmall <- lm(gExp ~ gType + devStage, oDat)
summary(oFitSmall)$coef

##                 Estimate Std. Error t value  Pr(>|t|)
## (Intercept)       7.9659     0.3182  25.035 4.849e-23
## gTypewt           0.6144     0.2430   2.528 1.643e-02
## devStageP2       -1.0104     0.3924  -2.575 1.470e-02
## devStageP6       -1.8481     0.3924  -4.710 4.328e-05
## devStageP10      -2.0966     0.3924  -5.343 6.703e-06
## devStage4_weeks  -1.7752     0.3924  -4.524 7.444e-05

Results slighly different due to R picking a differnt intercept to use for some reason. Overall interpretation seems the same.

Peform Anova:

anova(oFitSmall, oFitBig)

## Analysis of Variance Table
## 
## Model 1: gExp ~ gType + devStage
## Model 2: gExp ~ gType * devStage
##   Res.Df  RSS Df Sum of Sq   F Pr(>F)
## 1     33 18.9                        
## 2     29 18.4  4     0.497 0.2   0.94

Apply methods on multiple genes

Sample some data:

set.seed(540)
genes12 <- row.names(prDat)[sample(1:nrow(prDat), 12)]
jDat <- prepareData(genes12)
makeStripplot(jDat)

plot of chunk unnamed-chunk-13

Function for performing an anova on categorical covariates devStage and gType:

anovaDSGT <- function(x) {
    makeStripplot(oDat <- prepareData("1448690_at"))
    oFitBig <- lm(gExp ~ gType * devStage, x)
    oFitSmall <- lm(gExp ~ gType + devStage, x)
    return(as.numeric(anova(oFitSmall, oFitBig)$Pr[2]))
}

ddply(jDat, ~gene, anovaDSGT)

##            gene       V1
## 1    1417558_at 0.072126
## 2    1453147_at 0.251209
## 3    1450169_at 0.084658
## 4    1430577_at 0.148439
## 5    1459967_at 0.925715
## 6    1432182_at 0.499350
## 7    1446589_at 0.005162
## 8    1433117_at 0.740763
## 9  1458637_x_at 0.257101
## 10   1452944_at 0.747966
## 11 1416357_a_at 0.008065
## 12   1418200_at 0.348813

1416357_a_at shows some interaction as well as possibly 1450169_at.

Use quanitative time variable instead of devStage

library(car)

# method with age variable
prepareDataAge <- function(x) {
    miniDat <- subset(prDat, rownames(prDat) %in% x)
    miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))), gene = factor(rep(rownames(miniDat), 
        each = ncol(miniDat)), levels = x))
    miniDat <- data.frame(prDes, miniDat)
    miniDat$sidNum <- as.numeric(miniDat$sidNum)
    miniDat$age <- suppressWarnings(as.numeric(recode(prDes$devStage, "'E16'=-2; 'P2'=2; 'P6'=6; 'P10'=10", 
        as.factor.result = FALSE)))
    # remove week 4
    miniDat <- subset(miniDat, miniDat$devStage != "4_weeks")
    return(miniDat)
}

makexyplot <- function(x, pch = 1, cex = 1) {
    return(xyplot(gExp ~ age | gene, x, group = gType, jitter.data = TRUE, auto.key = TRUE, 
        type = c("p", "a"), grid = TRUE, pch = pch, cex = cex))
}
jDat <- prepareDataAge(genes12)
makexyplot(jDat)

plot of chunk unnamed-chunk-15