Seminar04.R

Abrar — Feb 7, 2014, 9:25 PM

library(lattice)
prDat <- read.table("../data/GSE4051_data.tsv")
str(prDat, max.level = 0)

'data.frame':   29949 obs. of  39 variables:

prDes <- readRDS("../data/GSE4051_design.rds")
str(prDes)

'data.frame':   39 obs. of  4 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...


# Two sample tests -- one gene
set.seed(987)
(theGene <- sample(1:nrow(prDat), 1))

[1] 14294

pDat <- data.frame(prDes, gExp = unlist(prDat[theGene, ]))
str(pDat)

'data.frame':   39 obs. of  5 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  9.88 10.59 10.28 10.22 8.75 ...


aggregate(gExp ~ gType, pDat, FUN = mean)

  gType  gExp
1    wt 9.758
2 NrlKO 9.553

stripplot(gType ~ gExp, pDat)

plot of chunk unnamed-chunk-1

t.test(gExp ~ gType, pDat)


    Welch Two Sample t-test

data:  gExp by gType
t = 1.479, df = 36.78, p-value = 0.1475
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.07586  0.48605
sample estimates:
   mean in group wt mean in group NrlKO 
              9.758               9.553

ttRes <- t.test(gExp ~ gType, pDat)
str(ttRes)

List of 9
 $ statistic  : Named num 1.48
  ..- attr(*, "names")= chr "t"
 $ parameter  : Named num 36.8
  ..- attr(*, "names")= chr "df"
 $ p.value    : num 0.148
 $ conf.int   : atomic [1:2] -0.0759 0.4861
  ..- attr(*, "conf.level")= num 0.95
 $ estimate   : Named num [1:2] 9.76 9.55
  ..- attr(*, "names")= chr [1:2] "mean in group wt" "mean in group NrlKO"
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "difference in means"
 $ alternative: chr "two.sided"
 $ method     : chr "Welch Two Sample t-test"
 $ data.name  : chr "gExp by gType"
 - attr(*, "class")= chr "htest"

ttRes$statistic

    t 
1.479

ttRes$p.value

[1] 0.1475


wtRes <- wilcox.test(gExp ~ gType, pDat)

Warning: cannot compute exact p-value with ties

wtRes$statistic

  W 
241

wtRes$p.value

[1] 0.1559


ktRes <- ks.test(pDat$gExp[pDat$gType == "wt"],pDat$gExp[pDat$gType == "NrlKO"])

Warning: cannot compute exact p-value with ties

ktRes$statistic

     D 
0.2974

ktRes$p.value

[1] 0.355


# Can you pull test statistics and/or p-values from the different approaches into an common object, like a readable table?
# Yes
cbind(c(ttRes$p.value,wtRes$p.value, ktRes$p.value), c(ttRes$statistic,wtRes$statistic, ktRes$statistic))

    [,1]     [,2]
t 0.1475   1.4794
W 0.1559 241.0000
D 0.3550   0.2974


# Are you getting the same message from the various approaches?
# Yes all show there is no difference in means


# Apply() for computing on rows and columns of matrices
kDat <- readRDS("../data/GSE4051_MINI.rds")
kMat <- as.matrix(kDat[c('crabHammer', 'eggBomb', 'poisonFang')])
str(kMat)

 num [1:39, 1:3] 10.22 10.02 9.64 9.65 8.58 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:39] "12" "13" "14" "15" ...
  ..$ : chr [1:3] "crabHammer" "eggBomb" "poisonFang"

median(kMat[ , 'eggBomb'])

[1] 6.757

apply(kMat, 2, median)

crabHammer    eggBomb poisonFang 
     9.611      6.757      7.350

apply(kMat, 2, quantile, probs = 0.5)

crabHammer    eggBomb poisonFang 
     9.611      6.757      7.350

apply(kMat, 2, quantile, probs = c(0.25, 0.75))

    crabHammer eggBomb poisonFang
25%      8.938   6.278      7.188
75%      9.830   7.095      7.476

apply(kMat, 1, min)

   12    13    14    15     9    10    11    28    29    30    31    24 
7.370 6.890 6.720 6.529 6.470 7.005 6.735 6.587 6.170 6.870 6.800 6.138 
   25    26    27    36    37    38    39    32    33    34    35    20 
6.166 6.269 6.264 6.530 7.100 6.269 6.211 6.286 6.347 6.270 6.188 7.005 
   21    22    23    16    17    18    19     5     6     7     8     1 
7.082 6.757 6.155 7.228 7.226 7.363 7.081 6.993 6.992 6.608 7.003 6.981 
    2     3     4 
7.165 7.075 6.558

colnames(kMat)[apply(kMat, 1, which.min)]

 [1] "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"   
 [6] "poisonFang" "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[11] "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"   
[16] "eggBomb"    "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[21] "eggBomb"    "eggBomb"    "eggBomb"    "poisonFang" "eggBomb"   
[26] "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"    "poisonFang"
[31] "eggBomb"    "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[36] "poisonFang" "eggBomb"    "poisonFang" "eggBomb"

all.equal(rowSums(kMat), apply(kMat, 1, sum))

[1] TRUE


# Computing on groups of observations with aggregate()
aggregate(eggBomb ~ devStage, kDat, FUN = mean)

  devStage eggBomb
1      E16   6.879
2       P2   6.408
3       P6   6.459
4      P10   7.143
5  4_weeks   7.063

aggregate(eggBomb ~ gType * devStage, kDat, FUN = mean)

   gType devStage eggBomb
1     wt      E16   6.900
2  NrlKO      E16   6.851
3     wt       P2   6.607
4  NrlKO       P2   6.209
5     wt       P6   6.646
6  NrlKO       P6   6.273
7     wt      P10   7.042
8  NrlKO      P10   7.243
9     wt  4_weeks   7.117
10 NrlKO  4_weeks   7.008

aggregate(eggBomb ~ gType * devStage, kDat, FUN = range)

   gType devStage eggBomb.1 eggBomb.2
1     wt      E16     6.529     7.462
2  NrlKO      E16     6.470     7.065
3     wt       P2     6.170     6.870
4  NrlKO       P2     6.138     6.269
5     wt       P6     6.211     7.574
6  NrlKO       P6     6.188     6.347
7     wt      P10     6.155     8.173
8  NrlKO      P10     7.081     7.438
9     wt  4_weeks     6.608     7.866
10 NrlKO  4_weeks     6.558     7.204



keepGenes <- c("1431708_a_at", "1424336_at", "1454696_at",
               "1416119_at", "1432141_x_at", "1429226_at" )
miniDat <- subset(prDat, rownames(prDat) %in% keepGenes)
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))),gene = factor(rep(rownames(miniDat), each = ncol(miniDat)), levels = keepGenes))
miniDat <- suppressWarnings(data.frame(prDes, miniDat))
str(miniDat)

'data.frame':   234 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  10.6 11 10.8 10.9 9.2 ...
 $ gene    : Factor w/ 6 levels "1431708_a_at",..: 4 4 4 4 4 4 4 4 4 4 ...

stripplot(gType ~ gExp | gene, miniDat,scales = list(x = list(relation = "free")),group = gType, auto.key = TRUE)

plot of chunk unnamed-chunk-1

someDat <- droplevels(subset(miniDat, gene == keepGenes[1]))
t.test(gExp ~ gType, someDat)


    Welch Two Sample t-test

data:  gExp by gType
t = 9.838, df = 36.89, p-value = 7.381e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.570 2.384
sample estimates:
   mean in group wt mean in group NrlKO 
              9.554               7.578



# The plyr package
library(plyr)
d_ply(miniDat, ~ gene, function(x) t.test(gExp ~ gType, x), .print = TRUE)


    Welch Two Sample t-test

data:  gExp by gType
t = 9.838, df = 36.89, p-value = 7.381e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.570 2.384
sample estimates:
   mean in group wt mean in group NrlKO 
              9.554               7.578 


    Welch Two Sample t-test

data:  gExp by gType
t = 9.061, df = 36.49, p-value = 7.146e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.6239 0.9835
sample estimates:
   mean in group wt mean in group NrlKO 
              7.670               6.867 


    Welch Two Sample t-test

data:  gExp by gType
t = 8.077, df = 33.49, p-value = 2.278e-09
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.402 2.346
sample estimates:
   mean in group wt mean in group NrlKO 
              12.85               10.97 


    Welch Two Sample t-test

data:  gExp by gType
t = -0.184, df = 36.53, p-value = 0.8551
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.5079  0.4234
sample estimates:
   mean in group wt mean in group NrlKO 
              9.893               9.935 


    Welch Two Sample t-test

data:  gExp by gType
t = -0.1324, df = 36.31, p-value = 0.8954
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.2413  0.2118
sample estimates:
   mean in group wt mean in group NrlKO 
              7.092               7.107 


    Welch Two Sample t-test

data:  gExp by gType
t = 0.0983, df = 35.58, p-value = 0.9223
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.1019  0.1123
sample estimates:
   mean in group wt mean in group NrlKO 
              6.551               6.546

ttRes <- dlply(miniDat, ~ gene, function(x) t.test(gExp ~ gType, x))
names(ttRes)

[1] "1431708_a_at" "1424336_at"   "1454696_at"   "1416119_at"  
[5] "1432141_x_at" "1429226_at"

ttRes[["1454696_at"]]


    Welch Two Sample t-test

data:  gExp by gType
t = 8.077, df = 33.49, p-value = 2.278e-09
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.402 2.346
sample estimates:
   mean in group wt mean in group NrlKO 
              12.85               10.97


ttRes <- ddply(miniDat, ~ gene, function(z) {
  zz <- t.test(gExp ~ gType, z)
  round(c(tStat = zz$statistic, pVal = zz$p.value), 4)
})
ttRes

          gene tStat.t   pVal
1 1431708_a_at  9.8380 0.0000
2   1424336_at  9.0607 0.0000
3   1454696_at  8.0771 0.0000
4   1416119_at -0.1840 0.8551
5 1432141_x_at -0.1324 0.8954
6   1429226_at  0.0983 0.9223