Seminar 5: take home problem

shannonerdelyi — Feb 5, 2014, 9:09 PM

###############################################################################
# STAT 540: Seminar 5
# Low Volume Linear Modeling 
# Take home problem:  Use data aggregation techniques to repeat the 
#                     analysis for more than one gene
# Date: 05-02-2014
# Author: Shannon Erdelyi
###############################################################################

# librarys
library(lattice)
library(plyr)

# data
dir <- "/Users/shannonerdelyi/Dropbox/UBC/W2014/STAT 540/Data"
des <- read.table(paste(dir, "/GSE4051_design.tsv", sep=""), header=T)
dat <- read.table(paste(dir, "/GSE4051_data.tsv", sep=""), header=T)

# constants
set.seed(8)
n <- 10
(keepGenes <- rownames(dat)[sample(1:nrow(dat), n)])
 [1] "1435552_at"   "1424407_s_at" "1451485_at"   "1444288_at"  
 [5] "1429021_at"   "1448113_at"   "1427929_a_at" "1457272_at"  
 [9] "1450193_at"   "1443950_at"  

###############################################################################
# Functions
###############################################################################

# Function to return a data frame with only the specified genes
makeDF <- function(geneNames){
  miniDat <- dat[geneNames, ]
  miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))),
                        gene = factor(rep(rownames(miniDat), each = ncol(miniDat)),
                                      levels = rownames(miniDat)))
  miniDat <- suppressWarnings(data.frame(des, miniDat))
  miniDat$devStage <- factor(miniDat$devStage, 
                             levels(miniDat$devStage)[c(2, 4, 5, 3, 1)])
  return(miniDat)
}

# Function to make a stripplot using the photoRec data
makeSP <- function(data, ...){
  stripplot(gExp ~ devStage | gene, data,
            group = gType, jitter.data = TRUE,
            scales=list(tck=c(1, 0), rot=c(45, 0)),
            auto.key = TRUE, 
            type = c('p', 'a'), 
            par.settings=simpleTheme(col=c("coral2","cadetblue3"), pch=16),
            grid = TRUE, ...)
}

# Function to perform a t-test comparing P2 to 4_week devStages for one gene
doTwoSampleTests <- function(miniDf) {
  tt <- t.test(gExp ~ devStage, 
               subset(miniDf, devStage %in% c("P2", "4_weeks")),
               var.equal=T)
  results <- with(tt, c(estimate, statistic, parameter, pval=p.value))
  return(results)
}

# Function to fit a linear model with devStage (for wt's only)
fitOneWayANOVA <- function(miniDf) {
  lm(gExp ~ devStage, miniDf, subset=gType=="wt")
}

# Function to fit a linear model with devStage and gType
# Test for importance of interaction term
interactionPval <- function(miniDf) {
  fitBig <- lm(gExp ~ gType*devStage, miniDf)
  fitSmall <- lm(gExp ~ gType + devStage, miniDf)
  c(pval=anova(fitSmall, fitBig)[2,6])
}

###############################################################################
# Analysis
###############################################################################

# make a data frame
sDat <- makeDF(keepGenes)
str(sDat)
'data.frame':   390 obs. of  6 variables:
 $ sidChar : Factor w/ 39 levels "Sample_1","Sample_10",..: 13 14 15 16 8 9 36 17 18 19 ...
 $ sidNum  : int  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "NrlKO","wt": 2 2 2 2 1 1 1 2 2 2 ...
 $ gExp    : num  10.59 10.5 10.67 10.58 9.17 ...
 $ gene    : Factor w/ 10 levels "1435552_at","1424407_s_at",..: 1 1 1 1 1 1 1 1 1 1 ...
head(sDat)
    sidChar sidNum devStage gType   gExp       gene
1 Sample_20     20      E16    wt 10.590 1435552_at
2 Sample_21     21      E16    wt 10.500 1435552_at
3 Sample_22     22      E16    wt 10.670 1435552_at
4 Sample_23     23      E16    wt 10.580 1435552_at
5 Sample_16     16      E16 NrlKO  9.169 1435552_at
6 Sample_17     17      E16 NrlKO 10.680 1435552_at

# make a stripplot
makeSP(sDat)

plot of chunk unnamed-chunk-1


# two sample t-tests (P2 vs. 4_weeks)
ddply(sDat, ~ gene, doTwoSampleTests)
           gene mean in group P2 mean in group 4_weeks       t df
1    1435552_at           10.377                10.928 -4.2877 14
2  1424407_s_at           10.079                11.031 -5.0149 14
3    1451485_at           11.355                12.383 -3.7756 14
4    1444288_at            8.604                 8.924 -2.7656 14
5    1429021_at            6.243                 6.692 -0.7367 14
6    1448113_at           10.339                 8.965  3.6238 14
7  1427929_a_at            9.098                 9.060  0.4730 14
8    1457272_at           10.396                10.261  1.0925 14
9    1450193_at            7.615                 8.336 -4.5997 14
10   1443950_at            6.511                 6.480  0.2556 14
        pval
1  0.0007512
2  0.0001892
3  0.0020470
4  0.0151717
5  0.4734768
6  0.0027649
7  0.6434721
8  0.2930414
9  0.0004127
10 0.8019925

# linear model with categorical covariate 
ow <- dlply(sDat, ~ gene, fitOneWayANOVA)
(owCoefs <- ldply(ow, coef))
           gene (Intercept) devStageP2 devStageP6 devStageP10
1    1435552_at      10.585    -0.2270     0.0200     -0.1400
2  1424407_s_at      11.320    -1.5237    -1.9630     -1.9010
3    1451485_at      12.025    -0.9400    -1.2518     -0.7525
4    1444288_at       8.304     0.3798     0.3073      0.3685
5    1429021_at       7.773    -0.6570    -1.1225     -0.5307
6    1448113_at      11.172    -1.0725    -1.2755     -1.5918
7  1427929_a_at       8.633     0.4455     0.4230      0.5408
8    1457272_at      10.757    -0.3375    -0.1900     -0.8252
9    1450193_at       7.313     0.2240     0.5722      0.4588
10   1443950_at       6.265     0.1717     0.2760      0.5665
   devStage4_weeks
1          0.27750
2         -0.51250
3          0.29500
4          0.67000
5         -0.71750
6         -2.34825
7          0.52500
8         -0.49000
9          1.09575
10         0.07025

# inference for a contrast (P2 vs P10)
cont <- matrix(c(0, 1, 0, -1, 0), nrow=1)
(inf <- ldply(ow, function(x){
  est <- cont %*% coef(x)
  se <- cont %*% vcov(x) %*% t(cont)
  t <- est/se
  p <- 2*pt(abs(t), df = df.residual(x), lower.tail = FALSE)
  c(est=est, se=se, pval=p)
  }))
           gene      est      se      pval
1    1435552_at -0.08700 0.08531 3.240e-01
2  1424407_s_at  0.37725 0.22262 1.108e-01
3    1451485_at -0.18750 0.36342 6.134e-01
4    1444288_at  0.01125 0.04925 8.224e-01
5    1429021_at -0.12625 0.43363 7.749e-01
6    1448113_at  0.51925 0.17867 1.086e-02
7  1427929_a_at -0.09525 0.01879 1.386e-04
8    1457272_at  0.48775 0.05784 4.471e-07
9    1450193_at -0.23475 0.05099 3.444e-04
10   1443950_at -0.39475 0.02796 4.551e-10

# importance of interaction term
ddply(sDat, ~ gene, interactionPval)
           gene     pval
1    1435552_at 0.380463
2  1424407_s_at 0.003549
3    1451485_at 0.380632
4    1444288_at 0.157013
5    1429021_at 0.294307
6    1448113_at 0.901769
7  1427929_a_at 0.377145
8    1457272_at 0.205546
9    1450193_at 0.659160
10   1443950_at 0.114901