seminar05.R

omaromeir — Feb 7, 2014, 12:57 AM

library(lattice)
prDat <- read.table("../data/GSE4051_data.tsv")
str(prDat, max.level = 0)
'data.frame':   29949 obs. of  39 variables:

prDes  <- readRDS("../data/GSE4051_design.rds")
str(prDes)
'data.frame':   39 obs. of  4 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...

#g: the selected genes
prepareData  <- function(g){
  #prDes row + gExp of the gene from prDat + gene from the input
  pDat <- data.frame()
  for (i in 1:length(g)){
    pDat1 <- data.frame(prDes, gExp = as.vector(as.matrix(prDat[g[i], ])), gene = g[i])
    pDat <- rbind(pDat, pDat1)
  } 
  pDat
}

(magicGenes = c("1419655_at", "1438815_at"))
[1] "1419655_at" "1438815_at"
jDat  <- prepareData(magicGenes)
str(jDat)
'data.frame':   78 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  10.93 10.74 10.67 10.68 9.61 ...
 $ gene    : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
head(jDat)
     sidChar sidNum devStage gType   gExp       gene
12 Sample_20     20      E16    wt 10.930 1419655_at
13 Sample_21     21      E16    wt 10.740 1419655_at
14 Sample_22     22      E16    wt 10.670 1419655_at
15 Sample_23     23      E16    wt 10.680 1419655_at
9  Sample_16     16      E16 NrlKO  9.606 1419655_at
10 Sample_17     17      E16 NrlKO 10.840 1419655_at
tail(jDat)
      sidChar sidNum devStage gType  gExp       gene
71  Sample_38     38  4_weeks    wt 8.211 1438815_at
81  Sample_39     39  4_weeks    wt 8.436 1438815_at
110 Sample_11     11  4_weeks NrlKO 8.465 1438815_at
210 Sample_12     12  4_weeks NrlKO 8.841 1438815_at
310  Sample_2      2  4_weeks NrlKO 8.506 1438815_at
41   Sample_9      9  4_weeks NrlKO 8.952 1438815_at

#d: the data
makeStripplot  <- function(d){
  stripplot(gExp ~ devStage | gene, d, group = gType, jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
}

makeStripplot(jDat)

plot of chunk unnamed-chunk-1


#t test
tDat <- prepareData("1456341_a_at")
str(tDat)
'data.frame':   39 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  7.04 7.48 7.37 6.94 6.16 ...
 $ gene    : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...

t.test(gExp ~ devStage, subset(tDat, devStage == "P2" | devStage == "4_weeks" ))

    Welch Two Sample t-test

data:  gExp by devStage
t = -18.84, df = 13.98, p-value = 2.477e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -4.078 -3.244
sample estimates:
     mean in group P2 mean in group 4_weeks 
                6.326                 9.987 

#Linear model
makeStripplot(mDat <- prepareData("1438786_a_at"))

plot of chunk unnamed-chunk-1

str(mDat)
'data.frame':   39 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  7.94 8.99 8.55 8.62 5.72 ...
 $ gene    : Factor w/ 1 level "1438786_a_at": 1 1 1 1 1 1 1 1 1 1 ...

mFit <- lm(formula = gExp ~ devStage, data=mDat, subset = gType=="wt")

summary(mFit)

Call:
lm(formula = gExp ~ devStage, data = mDat, subset = gType == 
    "wt")

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1565 -0.4400  0.0288  0.4915  1.2065 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)        8.523      0.379   22.50  5.7e-13 ***
devStageP2        -1.450      0.536   -2.71    0.016 *  
devStageP6        -0.107      0.536   -0.20    0.845    
devStageP10       -1.201      0.536   -2.24    0.040 *  
devStage4_weeks    0.081      0.536    0.15    0.882    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.758 on 15 degrees of freedom
Multiple R-squared:  0.497, Adjusted R-squared:  0.363 
F-statistic: 3.71 on 4 and 15 DF,  p-value: 0.0272

#Questions: does the intercept look plausible given the plot? How about the devStageP2 effect, etc.?
#Yes, the P2 and P10 effect is clear using E16 as an intercept.

#Iference for a contrast

contMat = matrix(c(0, 1, 0, -1, 0), nrow=1)

(obsDiff <- contMat %*% coef(mFit))
       [,1]
[1,] -0.249

(sampMeans <- aggregate(gExp ~ devStage, mDat, FUN = mean,
                        subset = gType == "wt"))
  devStage  gExp
1      E16 8.523
2       P2 7.072
3       P6 8.416
4      P10 7.322
5  4_weeks 8.604
with(sampMeans, gExp[devStage == "P2"] - gExp[devStage == "P10"])
[1] -0.249

sqrt(diag(vcov(mFit)))
    (Intercept)      devStageP2      devStageP6     devStageP10 
         0.3788          0.5357          0.5357          0.5357 
devStage4_weeks 
         0.5357 
summary(mFit)$coefficients[ , "Std. Error"]
    (Intercept)      devStageP2      devStageP6     devStageP10 
         0.3788          0.5357          0.5357          0.5357 
devStage4_weeks 
         0.5357 

(estSe <- contMat %*% vcov(mFit) %*% t(contMat))
      [,1]
[1,] 0.287
(testStat <- obsDiff/estSe)
        [,1]
[1,] -0.8676

2 * pt(abs(testStat), df = df.residual(mFit), lower.tail = FALSE)
       [,1]
[1,] 0.3993

#Two categorical covariates

makeStripplot(oDat <- prepareData("1448690_at"))

plot of chunk unnamed-chunk-1


oFitBig <- lm(formula = gExp ~ gType * devStage, data=oDat)

oFitSmall <- lm(formula = gExp ~ gType + devStage, data=oDat)

anova(oFitSmall, oFitBig)
Analysis of Variance Table

Model 1: gExp ~ gType + devStage
Model 2: gExp ~ gType * devStage
  Res.Df  RSS Df Sum of Sq   F Pr(>F)
1     33 18.9                        
2     29 18.4  4     0.497 0.2   0.94

#Is the intercept plausible? How about the various effects? Do the ones with small p-values, e.g. meeting a conventional cut-off of 0.05, look 'real' to you?
#Intercept seems reasonable, the effects with p values smaller than 0.05 are clear in the plot.

#Looking at a more interesting gene

makeStripplot(iDat <- prepareData("1429225_at"))

plot of chunk unnamed-chunk-1


iFitBig <- lm(formula = gExp ~ gType * devStage, data=iDat)

iFitSmall <- lm(formula = gExp ~ gType + devStage, data=iDat)

anova(iFitSmall, iFitBig)
Analysis of Variance Table

Model 1: gExp ~ gType + devStage
Model 2: gExp ~ gType * devStage
  Res.Df   RSS Df Sum of Sq    F Pr(>F)    
1     33 15.12                             
2     29  7.95  4      7.17 6.54  7e-04 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Further experimentation

#adding age as a quantitative variable
library(car)

mDat$age <- recode(mDat$devStage, "'E16'=-2; 'P2'=2; ' P6'=6; 'P10'=10; '4_weeks'=28", as.factor.result = FALSE)

# xyplot(gExp ~ age, mDat, type=c("p", "smooth"))

#Quadratic model
# qFit = lm(formula=gExp~age + I(age^2), data=mDat)
# summary(qFit)