omaromeir — Feb 7, 2014, 12:57 AM
library(lattice)
prDat <- read.table("../data/GSE4051_data.tsv")
str(prDat, max.level = 0)
'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("../data/GSE4051_design.rds")
str(prDes)
'data.frame': 39 obs. of 4 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
#g: the selected genes
prepareData <- function(g){
#prDes row + gExp of the gene from prDat + gene from the input
pDat <- data.frame()
for (i in 1:length(g)){
pDat1 <- data.frame(prDes, gExp = as.vector(as.matrix(prDat[g[i], ])), gene = g[i])
pDat <- rbind(pDat, pDat1)
}
pDat
}
(magicGenes = c("1419655_at", "1438815_at"))
[1] "1419655_at" "1438815_at"
jDat <- prepareData(magicGenes)
str(jDat)
'data.frame': 78 obs. of 6 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
$ gExp : num 10.93 10.74 10.67 10.68 9.61 ...
$ gene : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
head(jDat)
sidChar sidNum devStage gType gExp gene
12 Sample_20 20 E16 wt 10.930 1419655_at
13 Sample_21 21 E16 wt 10.740 1419655_at
14 Sample_22 22 E16 wt 10.670 1419655_at
15 Sample_23 23 E16 wt 10.680 1419655_at
9 Sample_16 16 E16 NrlKO 9.606 1419655_at
10 Sample_17 17 E16 NrlKO 10.840 1419655_at
tail(jDat)
sidChar sidNum devStage gType gExp gene
71 Sample_38 38 4_weeks wt 8.211 1438815_at
81 Sample_39 39 4_weeks wt 8.436 1438815_at
110 Sample_11 11 4_weeks NrlKO 8.465 1438815_at
210 Sample_12 12 4_weeks NrlKO 8.841 1438815_at
310 Sample_2 2 4_weeks NrlKO 8.506 1438815_at
41 Sample_9 9 4_weeks NrlKO 8.952 1438815_at
#d: the data
makeStripplot <- function(d){
stripplot(gExp ~ devStage | gene, d, group = gType, jitter.data = TRUE, auto.key = TRUE, type = c('p', 'a'), grid = TRUE)
}
makeStripplot(jDat)
#t test
tDat <- prepareData("1456341_a_at")
str(tDat)
'data.frame': 39 obs. of 6 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
$ gExp : num 7.04 7.48 7.37 6.94 6.16 ...
$ gene : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...
t.test(gExp ~ devStage, subset(tDat, devStage == "P2" | devStage == "4_weeks" ))
Welch Two Sample t-test
data: gExp by devStage
t = -18.84, df = 13.98, p-value = 2.477e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-4.078 -3.244
sample estimates:
mean in group P2 mean in group 4_weeks
6.326 9.987
#Linear model
makeStripplot(mDat <- prepareData("1438786_a_at"))
str(mDat)
'data.frame': 39 obs. of 6 variables:
$ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
$ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
$ gExp : num 7.94 8.99 8.55 8.62 5.72 ...
$ gene : Factor w/ 1 level "1438786_a_at": 1 1 1 1 1 1 1 1 1 1 ...
mFit <- lm(formula = gExp ~ devStage, data=mDat, subset = gType=="wt")
summary(mFit)
Call:
lm(formula = gExp ~ devStage, data = mDat, subset = gType ==
"wt")
Residuals:
Min 1Q Median 3Q Max
-1.1565 -0.4400 0.0288 0.4915 1.2065
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.523 0.379 22.50 5.7e-13 ***
devStageP2 -1.450 0.536 -2.71 0.016 *
devStageP6 -0.107 0.536 -0.20 0.845
devStageP10 -1.201 0.536 -2.24 0.040 *
devStage4_weeks 0.081 0.536 0.15 0.882
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.758 on 15 degrees of freedom
Multiple R-squared: 0.497, Adjusted R-squared: 0.363
F-statistic: 3.71 on 4 and 15 DF, p-value: 0.0272
#Questions: does the intercept look plausible given the plot? How about the devStageP2 effect, etc.?
#Yes, the P2 and P10 effect is clear using E16 as an intercept.
#Iference for a contrast
contMat = matrix(c(0, 1, 0, -1, 0), nrow=1)
(obsDiff <- contMat %*% coef(mFit))
[,1]
[1,] -0.249
(sampMeans <- aggregate(gExp ~ devStage, mDat, FUN = mean,
subset = gType == "wt"))
devStage gExp
1 E16 8.523
2 P2 7.072
3 P6 8.416
4 P10 7.322
5 4_weeks 8.604
with(sampMeans, gExp[devStage == "P2"] - gExp[devStage == "P10"])
[1] -0.249
sqrt(diag(vcov(mFit)))
(Intercept) devStageP2 devStageP6 devStageP10
0.3788 0.5357 0.5357 0.5357
devStage4_weeks
0.5357
summary(mFit)$coefficients[ , "Std. Error"]
(Intercept) devStageP2 devStageP6 devStageP10
0.3788 0.5357 0.5357 0.5357
devStage4_weeks
0.5357
(estSe <- contMat %*% vcov(mFit) %*% t(contMat))
[,1]
[1,] 0.287
(testStat <- obsDiff/estSe)
[,1]
[1,] -0.8676
2 * pt(abs(testStat), df = df.residual(mFit), lower.tail = FALSE)
[,1]
[1,] 0.3993
#Two categorical covariates
makeStripplot(oDat <- prepareData("1448690_at"))
oFitBig <- lm(formula = gExp ~ gType * devStage, data=oDat)
oFitSmall <- lm(formula = gExp ~ gType + devStage, data=oDat)
anova(oFitSmall, oFitBig)
Analysis of Variance Table
Model 1: gExp ~ gType + devStage
Model 2: gExp ~ gType * devStage
Res.Df RSS Df Sum of Sq F Pr(>F)
1 33 18.9
2 29 18.4 4 0.497 0.2 0.94
#Is the intercept plausible? How about the various effects? Do the ones with small p-values, e.g. meeting a conventional cut-off of 0.05, look 'real' to you?
#Intercept seems reasonable, the effects with p values smaller than 0.05 are clear in the plot.
#Looking at a more interesting gene
makeStripplot(iDat <- prepareData("1429225_at"))
iFitBig <- lm(formula = gExp ~ gType * devStage, data=iDat)
iFitSmall <- lm(formula = gExp ~ gType + devStage, data=iDat)
anova(iFitSmall, iFitBig)
Analysis of Variance Table
Model 1: gExp ~ gType + devStage
Model 2: gExp ~ gType * devStage
Res.Df RSS Df Sum of Sq F Pr(>F)
1 33 15.12
2 29 7.95 4 7.17 6.54 7e-04 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Further experimentation
#adding age as a quantitative variable
library(car)
mDat$age <- recode(mDat$devStage, "'E16'=-2; 'P2'=2; ' P6'=6; 'P10'=10; '4_weeks'=28", as.factor.result = FALSE)
# xyplot(gExp ~ age, mDat, type=c("p", "smooth"))
#Quadratic model
# qFit = lm(formula=gExp~age + I(age^2), data=mDat)
# summary(qFit)