Seminar04.R

Abrar — Feb 7, 2014, 9:25 PM

library(lattice)
prDat <- read.table("../data/GSE4051_data.tsv")
str(prDat, max.level = 0)
'data.frame':   29949 obs. of  39 variables:
prDes <- readRDS("../data/GSE4051_design.rds")
str(prDes)
'data.frame':   39 obs. of  4 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...

# Two sample tests -- one gene
set.seed(987)
(theGene <- sample(1:nrow(prDat), 1))
[1] 14294
pDat <- data.frame(prDes, gExp = unlist(prDat[theGene, ]))
str(pDat)
'data.frame':   39 obs. of  5 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  9.88 10.59 10.28 10.22 8.75 ...

aggregate(gExp ~ gType, pDat, FUN = mean)
  gType  gExp
1    wt 9.758
2 NrlKO 9.553
stripplot(gType ~ gExp, pDat)

plot of chunk unnamed-chunk-1

t.test(gExp ~ gType, pDat)

    Welch Two Sample t-test

data:  gExp by gType
t = 1.479, df = 36.78, p-value = 0.1475
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.07586  0.48605
sample estimates:
   mean in group wt mean in group NrlKO 
              9.758               9.553 
ttRes <- t.test(gExp ~ gType, pDat)
str(ttRes)
List of 9
 $ statistic  : Named num 1.48
  ..- attr(*, "names")= chr "t"
 $ parameter  : Named num 36.8
  ..- attr(*, "names")= chr "df"
 $ p.value    : num 0.148
 $ conf.int   : atomic [1:2] -0.0759 0.4861
  ..- attr(*, "conf.level")= num 0.95
 $ estimate   : Named num [1:2] 9.76 9.55
  ..- attr(*, "names")= chr [1:2] "mean in group wt" "mean in group NrlKO"
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "difference in means"
 $ alternative: chr "two.sided"
 $ method     : chr "Welch Two Sample t-test"
 $ data.name  : chr "gExp by gType"
 - attr(*, "class")= chr "htest"
ttRes$statistic
    t 
1.479 
ttRes$p.value
[1] 0.1475

wtRes <- wilcox.test(gExp ~ gType, pDat)
Warning: cannot compute exact p-value with ties
wtRes$statistic
  W 
241 
wtRes$p.value
[1] 0.1559

ktRes <- ks.test(pDat$gExp[pDat$gType == "wt"],pDat$gExp[pDat$gType == "NrlKO"])
Warning: cannot compute exact p-value with ties
ktRes$statistic
     D 
0.2974 
ktRes$p.value
[1] 0.355

# Can you pull test statistics and/or p-values from the different approaches into an common object, like a readable table?
# Yes
cbind(c(ttRes$p.value,wtRes$p.value, ktRes$p.value), c(ttRes$statistic,wtRes$statistic, ktRes$statistic))
    [,1]     [,2]
t 0.1475   1.4794
W 0.1559 241.0000
D 0.3550   0.2974

# Are you getting the same message from the various approaches?
# Yes all show there is no difference in means


# Apply() for computing on rows and columns of matrices
kDat <- readRDS("../data/GSE4051_MINI.rds")
kMat <- as.matrix(kDat[c('crabHammer', 'eggBomb', 'poisonFang')])
str(kMat)
 num [1:39, 1:3] 10.22 10.02 9.64 9.65 8.58 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:39] "12" "13" "14" "15" ...
  ..$ : chr [1:3] "crabHammer" "eggBomb" "poisonFang"
median(kMat[ , 'eggBomb'])
[1] 6.757
apply(kMat, 2, median)
crabHammer    eggBomb poisonFang 
     9.611      6.757      7.350 
apply(kMat, 2, quantile, probs = 0.5)
crabHammer    eggBomb poisonFang 
     9.611      6.757      7.350 
apply(kMat, 2, quantile, probs = c(0.25, 0.75))
    crabHammer eggBomb poisonFang
25%      8.938   6.278      7.188
75%      9.830   7.095      7.476
apply(kMat, 1, min)
   12    13    14    15     9    10    11    28    29    30    31    24 
7.370 6.890 6.720 6.529 6.470 7.005 6.735 6.587 6.170 6.870 6.800 6.138 
   25    26    27    36    37    38    39    32    33    34    35    20 
6.166 6.269 6.264 6.530 7.100 6.269 6.211 6.286 6.347 6.270 6.188 7.005 
   21    22    23    16    17    18    19     5     6     7     8     1 
7.082 6.757 6.155 7.228 7.226 7.363 7.081 6.993 6.992 6.608 7.003 6.981 
    2     3     4 
7.165 7.075 6.558 
colnames(kMat)[apply(kMat, 1, which.min)]
 [1] "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"   
 [6] "poisonFang" "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[11] "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"   
[16] "eggBomb"    "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[21] "eggBomb"    "eggBomb"    "eggBomb"    "poisonFang" "eggBomb"   
[26] "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"    "poisonFang"
[31] "eggBomb"    "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[36] "poisonFang" "eggBomb"    "poisonFang" "eggBomb"   
all.equal(rowSums(kMat), apply(kMat, 1, sum))
[1] TRUE

# Computing on groups of observations with aggregate()
aggregate(eggBomb ~ devStage, kDat, FUN = mean)
  devStage eggBomb
1      E16   6.879
2       P2   6.408
3       P6   6.459
4      P10   7.143
5  4_weeks   7.063
aggregate(eggBomb ~ gType * devStage, kDat, FUN = mean)
   gType devStage eggBomb
1     wt      E16   6.900
2  NrlKO      E16   6.851
3     wt       P2   6.607
4  NrlKO       P2   6.209
5     wt       P6   6.646
6  NrlKO       P6   6.273
7     wt      P10   7.042
8  NrlKO      P10   7.243
9     wt  4_weeks   7.117
10 NrlKO  4_weeks   7.008
aggregate(eggBomb ~ gType * devStage, kDat, FUN = range)
   gType devStage eggBomb.1 eggBomb.2
1     wt      E16     6.529     7.462
2  NrlKO      E16     6.470     7.065
3     wt       P2     6.170     6.870
4  NrlKO       P2     6.138     6.269
5     wt       P6     6.211     7.574
6  NrlKO       P6     6.188     6.347
7     wt      P10     6.155     8.173
8  NrlKO      P10     7.081     7.438
9     wt  4_weeks     6.608     7.866
10 NrlKO  4_weeks     6.558     7.204


keepGenes <- c("1431708_a_at", "1424336_at", "1454696_at",
               "1416119_at", "1432141_x_at", "1429226_at" )
miniDat <- subset(prDat, rownames(prDat) %in% keepGenes)
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))),gene = factor(rep(rownames(miniDat), each = ncol(miniDat)), levels = keepGenes))
miniDat <- suppressWarnings(data.frame(prDes, miniDat))
str(miniDat)
'data.frame':   234 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  10.6 11 10.8 10.9 9.2 ...
 $ gene    : Factor w/ 6 levels "1431708_a_at",..: 4 4 4 4 4 4 4 4 4 4 ...
stripplot(gType ~ gExp | gene, miniDat,scales = list(x = list(relation = "free")),group = gType, auto.key = TRUE)

plot of chunk unnamed-chunk-1

someDat <- droplevels(subset(miniDat, gene == keepGenes[1]))
t.test(gExp ~ gType, someDat)

    Welch Two Sample t-test

data:  gExp by gType
t = 9.838, df = 36.89, p-value = 7.381e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.570 2.384
sample estimates:
   mean in group wt mean in group NrlKO 
              9.554               7.578 


# The plyr package
library(plyr)
d_ply(miniDat, ~ gene, function(x) t.test(gExp ~ gType, x), .print = TRUE)

    Welch Two Sample t-test

data:  gExp by gType
t = 9.838, df = 36.89, p-value = 7.381e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.570 2.384
sample estimates:
   mean in group wt mean in group NrlKO 
              9.554               7.578 


    Welch Two Sample t-test

data:  gExp by gType
t = 9.061, df = 36.49, p-value = 7.146e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.6239 0.9835
sample estimates:
   mean in group wt mean in group NrlKO 
              7.670               6.867 


    Welch Two Sample t-test

data:  gExp by gType
t = 8.077, df = 33.49, p-value = 2.278e-09
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.402 2.346
sample estimates:
   mean in group wt mean in group NrlKO 
              12.85               10.97 


    Welch Two Sample t-test

data:  gExp by gType
t = -0.184, df = 36.53, p-value = 0.8551
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.5079  0.4234
sample estimates:
   mean in group wt mean in group NrlKO 
              9.893               9.935 


    Welch Two Sample t-test

data:  gExp by gType
t = -0.1324, df = 36.31, p-value = 0.8954
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.2413  0.2118
sample estimates:
   mean in group wt mean in group NrlKO 
              7.092               7.107 


    Welch Two Sample t-test

data:  gExp by gType
t = 0.0983, df = 35.58, p-value = 0.9223
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.1019  0.1123
sample estimates:
   mean in group wt mean in group NrlKO 
              6.551               6.546 
ttRes <- dlply(miniDat, ~ gene, function(x) t.test(gExp ~ gType, x))
names(ttRes)
[1] "1431708_a_at" "1424336_at"   "1454696_at"   "1416119_at"  
[5] "1432141_x_at" "1429226_at"  
ttRes[["1454696_at"]]

    Welch Two Sample t-test

data:  gExp by gType
t = 8.077, df = 33.49, p-value = 2.278e-09
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.402 2.346
sample estimates:
   mean in group wt mean in group NrlKO 
              12.85               10.97 

ttRes <- ddply(miniDat, ~ gene, function(z) {
  zz <- t.test(gExp ~ gType, z)
  round(c(tStat = zz$statistic, pVal = zz$p.value), 4)
})
ttRes
          gene tStat.t   pVal
1 1431708_a_at  9.8380 0.0000
2   1424336_at  9.0607 0.0000
3   1454696_at  8.0771 0.0000
4   1416119_at -0.1840 0.8551
5 1432141_x_at -0.1324 0.8954
6   1429226_at  0.0983 0.9223