To explore linear models on gene expression data.
Start by loading data/libraries:
library(lattice)
library(plyr)
prDat <- read.table("../GSE4051_data.tsv")
prDes <- read.table("../GSE4051_design.tsv", header = TRUE)
prepareData <- function(x) {
miniDat <- subset(prDat, rownames(prDat) %in% x)
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))), gene = factor(rep(rownames(miniDat),
each = ncol(miniDat)), levels = x))
miniDat <- data.frame(prDes, miniDat)
miniDat$sidNum <- as.numeric(miniDat$sidNum)
miniDat$devStage <- factor(miniDat$devStage, levels = unique(miniDat$devStage)) #to make plotting in order
return(miniDat)
}
makeStripplot <- function(x, pch = 1, cex = 1) {
return(stripplot(gExp ~ devStage | gene, x, group = gType, jitter.data = TRUE,
auto.key = TRUE, type = c("p", "a"), grid = TRUE, pch = pch, cex = cex))
}
(luckyGenes <- c("1419655_at", "1438815_at"))
## [1] "1419655_at" "1438815_at"
jDat <- prepareData(luckyGenes)
str(jDat)
## 'data.frame': 78 obs. of 6 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
## $ gExp : num 10.93 10.74 10.67 10.68 9.61 ...
## $ gene : Factor w/ 2 levels "1419655_at","1438815_at": 1 1 1 1 1 1 1 1 1 1 ...
head(jDat)
## sidChar sidNum devStage gType gExp gene
## 1 Sample_20 20 E16 wt 10.930 1419655_at
## 2 Sample_21 21 E16 wt 10.740 1419655_at
## 3 Sample_22 22 E16 wt 10.670 1419655_at
## 4 Sample_23 23 E16 wt 10.680 1419655_at
## 5 Sample_16 16 E16 NrlKO 9.606 1419655_at
## 6 Sample_17 17 E16 NrlKO 10.840 1419655_at
tail(jDat)
## sidChar sidNum devStage gType gExp gene
## 73 Sample_38 38 4_weeks wt 8.211 1438815_at
## 74 Sample_39 39 4_weeks wt 8.436 1438815_at
## 75 Sample_11 11 4_weeks NrlKO 8.465 1438815_at
## 76 Sample_12 12 4_weeks NrlKO 8.841 1438815_at
## 77 Sample_2 2 4_weeks NrlKO 8.506 1438815_at
## 78 Sample_9 9 4_weeks NrlKO 8.952 1438815_at
makeStripplot(jDat)
makeStripplot(jDat, pch = 17, cex = 3)
You can use both of your functions together and create a minidatset and plot it all at once:
makeStripplot(newDat <- prepareData("1456341_a_at"))
str(newDat)
## 'data.frame': 39 obs. of 6 variables:
## $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
## $ gExp : num 7.04 7.48 7.37 6.94 6.16 ...
## $ gene : Factor w/ 1 level "1456341_a_at": 1 1 1 1 1 1 1 1 1 1 ...
head(newDat)
## sidChar sidNum devStage gType gExp gene
## 1 Sample_20 20 E16 wt 7.044 1456341_a_at
## 2 Sample_21 21 E16 wt 7.478 1456341_a_at
## 3 Sample_22 22 E16 wt 7.374 1456341_a_at
## 4 Sample_23 23 E16 wt 6.944 1456341_a_at
## 5 Sample_16 16 E16 NrlKO 6.161 1456341_a_at
## 6 Sample_17 17 E16 NrlKO 6.931 1456341_a_at
Here's what I get:
mDat <- prepareData("1438786_a_at")
t.test(gExp ~ devStage, mDat, subset = devStage %in% c("P2", "4_weeks"), var.equal = TRUE)
##
## Two Sample t-test
##
## data: gExp by devStage
## t = -5.666, df = 14, p-value = 5.821e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.9602 -0.8838
## sample estimates:
## mean in group P2 mean in group 4_weeks
## 6.949 8.371
mFit <- lm(gExp ~ devStage, mDat, subset = gType == "wt")
summary(mFit)
##
## Call:
## lm(formula = gExp ~ devStage, data = mDat, subset = gType ==
## "wt")
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1565 -0.4400 0.0288 0.4915 1.2065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.523 0.379 22.50 5.7e-13 ***
## devStageP2 -1.450 0.536 -2.71 0.016 *
## devStageP6 -0.107 0.536 -0.20 0.845
## devStageP10 -1.201 0.536 -2.24 0.040 *
## devStage4_weeks 0.081 0.536 0.15 0.882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.758 on 15 degrees of freedom
## Multiple R-squared: 0.497, Adjusted R-squared: 0.363
## F-statistic: 3.71 on 4 and 15 DF, p-value: 0.0272
Helpful link for making correct contrast matrix http://www.vsni.co.uk/software/genstat/htmlhelp/anova/Contrasts.htm
contMat <- matrix(c(0, 1, 0, -1, 0), nrow = 1)
obsDiff <- contMat %*% coef(mFit)
obsDiff
## [,1]
## [1,] -0.249
Get p-value for contrast
estSe <- contMat %*% vcov(mFit) %*% t(contMat)
testStat <- obsDiff/estSe
2 * pt(abs(testStat), df = df.residual(mFit), lower.tail = FALSE)
## [,1]
## [1,] 0.3993
makeStripplot(oDat <- prepareData("1448690_at"))
oFitBig <- lm(gExp ~ gType * devStage, oDat)
summary(oFitBig)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.83567 0.4603 17.02145 1.234e-16
## gTypewt 0.84233 0.6090 1.38320 1.772e-01
## devStageP2 -0.95917 0.6090 -1.57505 1.261e-01
## devStageP6 -1.74917 0.6090 -2.87232 7.542e-03
## devStageP10 -1.96742 0.6090 -3.23071 3.068e-03
## devStage4_weeks -1.43592 0.6090 -2.35793 2.534e-02
## gTypewt:devStageP2 -0.06983 0.8299 -0.08415 9.335e-01
## gTypewt:devStageP6 -0.16533 0.8299 -0.19922 8.435e-01
## gTypewt:devStageP10 -0.22583 0.8299 -0.27212 7.875e-01
## gTypewt:devStage4_weeks -0.64608 0.8299 -0.77852 4.426e-01
oFitSmall <- lm(gExp ~ gType + devStage, oDat)
summary(oFitSmall)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.9659 0.3182 25.035 4.849e-23
## gTypewt 0.6144 0.2430 2.528 1.643e-02
## devStageP2 -1.0104 0.3924 -2.575 1.470e-02
## devStageP6 -1.8481 0.3924 -4.710 4.328e-05
## devStageP10 -2.0966 0.3924 -5.343 6.703e-06
## devStage4_weeks -1.7752 0.3924 -4.524 7.444e-05
Results slighly different due to R picking a differnt intercept to use for some reason. Overall interpretation seems the same.
Peform Anova:
anova(oFitSmall, oFitBig)
## Analysis of Variance Table
##
## Model 1: gExp ~ gType + devStage
## Model 2: gExp ~ gType * devStage
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 18.9
## 2 29 18.4 4 0.497 0.2 0.94
Sample some data:
set.seed(540)
genes12 <- row.names(prDat)[sample(1:nrow(prDat), 12)]
jDat <- prepareData(genes12)
makeStripplot(jDat)
Function for performing an anova on categorical covariates devStage and gType:
anovaDSGT <- function(x) {
makeStripplot(oDat <- prepareData("1448690_at"))
oFitBig <- lm(gExp ~ gType * devStage, x)
oFitSmall <- lm(gExp ~ gType + devStage, x)
return(as.numeric(anova(oFitSmall, oFitBig)$Pr[2]))
}
ddply(jDat, ~gene, anovaDSGT)
## gene V1
## 1 1417558_at 0.072126
## 2 1453147_at 0.251209
## 3 1450169_at 0.084658
## 4 1430577_at 0.148439
## 5 1459967_at 0.925715
## 6 1432182_at 0.499350
## 7 1446589_at 0.005162
## 8 1433117_at 0.740763
## 9 1458637_x_at 0.257101
## 10 1452944_at 0.747966
## 11 1416357_a_at 0.008065
## 12 1418200_at 0.348813
1416357_a_at shows some interaction as well as possibly 1450169_at.
library(car)
# method with age variable
prepareDataAge <- function(x) {
miniDat <- subset(prDat, rownames(prDat) %in% x)
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))), gene = factor(rep(rownames(miniDat),
each = ncol(miniDat)), levels = x))
miniDat <- data.frame(prDes, miniDat)
miniDat$sidNum <- as.numeric(miniDat$sidNum)
miniDat$age <- suppressWarnings(as.numeric(recode(prDes$devStage, "'E16'=-2; 'P2'=2; 'P6'=6; 'P10'=10",
as.factor.result = FALSE)))
# remove week 4
miniDat <- subset(miniDat, miniDat$devStage != "4_weeks")
return(miniDat)
}
makexyplot <- function(x, pch = 1, cex = 1) {
return(xyplot(gExp ~ age | gene, x, group = gType, jitter.data = TRUE, auto.key = TRUE,
type = c("p", "a"), grid = TRUE, pch = pch, cex = cex))
}
jDat <- prepareDataAge(genes12)
makexyplot(jDat)