shannonerdelyi — Feb 5, 2014, 9:09 PM
###############################################################################
# STAT 540: Seminar 5
# Low Volume Linear Modeling
# Take home problem: Use data aggregation techniques to repeat the
# analysis for more than one gene
# Date: 05-02-2014
# Author: Shannon Erdelyi
###############################################################################
# librarys
library(lattice)
library(plyr)
# data
dir <- "/Users/shannonerdelyi/Dropbox/UBC/W2014/STAT 540/Data"
des <- read.table(paste(dir, "/GSE4051_design.tsv", sep=""), header=T)
dat <- read.table(paste(dir, "/GSE4051_data.tsv", sep=""), header=T)
# constants
set.seed(8)
n <- 10
(keepGenes <- rownames(dat)[sample(1:nrow(dat), n)])
[1] "1435552_at" "1424407_s_at" "1451485_at" "1444288_at"
[5] "1429021_at" "1448113_at" "1427929_a_at" "1457272_at"
[9] "1450193_at" "1443950_at"
###############################################################################
# Functions
###############################################################################
# Function to return a data frame with only the specified genes
makeDF <- function(geneNames){
miniDat <- dat[geneNames, ]
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))),
gene = factor(rep(rownames(miniDat), each = ncol(miniDat)),
levels = rownames(miniDat)))
miniDat <- suppressWarnings(data.frame(des, miniDat))
miniDat$devStage <- factor(miniDat$devStage,
levels(miniDat$devStage)[c(2, 4, 5, 3, 1)])
return(miniDat)
}
# Function to make a stripplot using the photoRec data
makeSP <- function(data, ...){
stripplot(gExp ~ devStage | gene, data,
group = gType, jitter.data = TRUE,
scales=list(tck=c(1, 0), rot=c(45, 0)),
auto.key = TRUE,
type = c('p', 'a'),
par.settings=simpleTheme(col=c("coral2","cadetblue3"), pch=16),
grid = TRUE, ...)
}
# Function to perform a t-test comparing P2 to 4_week devStages for one gene
doTwoSampleTests <- function(miniDf) {
tt <- t.test(gExp ~ devStage,
subset(miniDf, devStage %in% c("P2", "4_weeks")),
var.equal=T)
results <- with(tt, c(estimate, statistic, parameter, pval=p.value))
return(results)
}
# Function to fit a linear model with devStage (for wt's only)
fitOneWayANOVA <- function(miniDf) {
lm(gExp ~ devStage, miniDf, subset=gType=="wt")
}
# Function to fit a linear model with devStage and gType
# Test for importance of interaction term
interactionPval <- function(miniDf) {
fitBig <- lm(gExp ~ gType*devStage, miniDf)
fitSmall <- lm(gExp ~ gType + devStage, miniDf)
c(pval=anova(fitSmall, fitBig)[2,6])
}
###############################################################################
# Analysis
###############################################################################
# make a data frame
sDat <- makeDF(keepGenes)
str(sDat)
'data.frame': 390 obs. of 6 variables:
$ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
$ sidNum : int 20 21 22 23 16 17 6 24 25 26 ...
$ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
$ gType : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
$ gExp : num 10.59 10.5 10.67 10.58 9.17 ...
$ gene : Factor w/ 10 levels "1435552_at","1424407_s_at",..: 1 1 1 1 1 1 1 1 1 1 ...
head(sDat)
sidChar sidNum devStage gType gExp gene
1 Sample_20 20 E16 wt 10.590 1435552_at
2 Sample_21 21 E16 wt 10.500 1435552_at
3 Sample_22 22 E16 wt 10.670 1435552_at
4 Sample_23 23 E16 wt 10.580 1435552_at
5 Sample_16 16 E16 NrlKO 9.169 1435552_at
6 Sample_17 17 E16 NrlKO 10.680 1435552_at
# make a stripplot
makeSP(sDat)
# two sample t-tests (P2 vs. 4_weeks)
ddply(sDat, ~ gene, doTwoSampleTests)
gene mean in group P2 mean in group 4_weeks t df
1 1435552_at 10.377 10.928 -4.2877 14
2 1424407_s_at 10.079 11.031 -5.0149 14
3 1451485_at 11.355 12.383 -3.7756 14
4 1444288_at 8.604 8.924 -2.7656 14
5 1429021_at 6.243 6.692 -0.7367 14
6 1448113_at 10.339 8.965 3.6238 14
7 1427929_a_at 9.098 9.060 0.4730 14
8 1457272_at 10.396 10.261 1.0925 14
9 1450193_at 7.615 8.336 -4.5997 14
10 1443950_at 6.511 6.480 0.2556 14
pval
1 0.0007512
2 0.0001892
3 0.0020470
4 0.0151717
5 0.4734768
6 0.0027649
7 0.6434721
8 0.2930414
9 0.0004127
10 0.8019925
# linear model with categorical covariate
ow <- dlply(sDat, ~ gene, fitOneWayANOVA)
(owCoefs <- ldply(ow, coef))
gene (Intercept) devStageP2 devStageP6 devStageP10
1 1435552_at 10.585 -0.2270 0.0200 -0.1400
2 1424407_s_at 11.320 -1.5237 -1.9630 -1.9010
3 1451485_at 12.025 -0.9400 -1.2518 -0.7525
4 1444288_at 8.304 0.3798 0.3073 0.3685
5 1429021_at 7.773 -0.6570 -1.1225 -0.5307
6 1448113_at 11.172 -1.0725 -1.2755 -1.5918
7 1427929_a_at 8.633 0.4455 0.4230 0.5408
8 1457272_at 10.757 -0.3375 -0.1900 -0.8252
9 1450193_at 7.313 0.2240 0.5722 0.4588
10 1443950_at 6.265 0.1717 0.2760 0.5665
devStage4_weeks
1 0.27750
2 -0.51250
3 0.29500
4 0.67000
5 -0.71750
6 -2.34825
7 0.52500
8 -0.49000
9 1.09575
10 0.07025
# inference for a contrast (P2 vs P10)
cont <- matrix(c(0, 1, 0, -1, 0), nrow=1)
(inf <- ldply(ow, function(x){
est <- cont %*% coef(x)
se <- cont %*% vcov(x) %*% t(cont)
t <- est/se
p <- 2*pt(abs(t), df = df.residual(x), lower.tail = FALSE)
c(est=est, se=se, pval=p)
}))
gene est se pval
1 1435552_at -0.08700 0.08531 3.240e-01
2 1424407_s_at 0.37725 0.22262 1.108e-01
3 1451485_at -0.18750 0.36342 6.134e-01
4 1444288_at 0.01125 0.04925 8.224e-01
5 1429021_at -0.12625 0.43363 7.749e-01
6 1448113_at 0.51925 0.17867 1.086e-02
7 1427929_a_at -0.09525 0.01879 1.386e-04
8 1457272_at 0.48775 0.05784 4.471e-07
9 1450193_at -0.23475 0.05099 3.444e-04
10 1443950_at -0.39475 0.02796 4.551e-10
# importance of interaction term
ddply(sDat, ~ gene, interactionPval)
gene pval
1 1435552_at 0.380463
2 1424407_s_at 0.003549
3 1451485_at 0.380632
4 1444288_at 0.157013
5 1429021_at 0.294307
6 1448113_at 0.901769
7 1427929_a_at 0.377145
8 1457272_at 0.205546
9 1450193_at 0.659160
10 1443950_at 0.114901