Seminar04_DataAggregation.R

omaromeir — Feb 7, 2014, 9:25 PM

library(lattice)
prDat = read.table("../data/GSE4051_data.tsv")
str(prDat, max.level=0)

'data.frame':   29949 obs. of  39 variables:

prDes = readRDS("../data/GSE4051_design.rds")
str(prDes)

'data.frame':   39 obs. of  4 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...


set.seed(987)
(theGene <- sample(1:nrow(prDat), 1))

[1] 14294

pDat  <- data.frame(prDes, gExp = unlist(prDat[theGene, ]))
str(pDat)

'data.frame':   39 obs. of  5 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  9.88 10.59 10.28 10.22 8.75 ...


aggregate(gExp ~ gType, pDat, FUN = mean)

  gType  gExp
1    wt 9.758
2 NrlKO 9.553

stripplot(gType ~ gExp, pDat)

plot of chunk unnamed-chunk-1

ttRes  <- t.test(gExp ~ gType, pDat)
str(ttRes)

List of 9
 $ statistic  : Named num 1.48
  ..- attr(*, "names")= chr "t"
 $ parameter  : Named num 36.8
  ..- attr(*, "names")= chr "df"
 $ p.value    : num 0.148
 $ conf.int   : atomic [1:2] -0.0759 0.4861
  ..- attr(*, "conf.level")= num 0.95
 $ estimate   : Named num [1:2] 9.76 9.55
  ..- attr(*, "names")= chr [1:2] "mean in group wt" "mean in group NrlKO"
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "difference in means"
 $ alternative: chr "two.sided"
 $ method     : chr "Welch Two Sample t-test"
 $ data.name  : chr "gExp by gType"
 - attr(*, "class")= chr "htest"

ttRes$statistic

    t 
1.479

ttRes$p.value

[1] 0.1475


# You try

myGene <- 14158
pDat2  <- data.frame(prDes, gExp = unlist(prDat[myGene, ]))
str(pDat2)

'data.frame':   39 obs. of  5 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  6.85 7.27 7.11 7.32 7.14 ...

aggregate(gExp ~ gType, pDat2, FUN = mean)

  gType  gExp
1    wt 7.366
2 NrlKO 7.240

stripplot(gType ~ gExp, pDat2)

plot of chunk unnamed-chunk-1

tt  <- t.test(gExp ~ gType, pDat2)
str(tt)

List of 9
 $ statistic  : Named num 1.28
  ..- attr(*, "names")= chr "t"
 $ parameter  : Named num 28.5
  ..- attr(*, "names")= chr "df"
 $ p.value    : num 0.213
 $ conf.int   : atomic [1:2] -0.0764 0.3287
  ..- attr(*, "conf.level")= num 0.95
 $ estimate   : Named num [1:2] 7.37 7.24
  ..- attr(*, "names")= chr [1:2] "mean in group wt" "mean in group NrlKO"
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "difference in means"
 $ alternative: chr "two.sided"
 $ method     : chr "Welch Two Sample t-test"
 $ data.name  : chr "gExp by gType"
 - attr(*, "class")= chr "htest"

tt$statistic

    t 
1.275

tt$p.value

[1] 0.2126


tt2  <- t.test(gExp ~ gType, pDat2, var.equal=TRUE)
str(tt2)

List of 9
 $ statistic  : Named num 1.26
  ..- attr(*, "names")= chr "t"
 $ parameter  : Named num 37
  ..- attr(*, "names")= chr "df"
 $ p.value    : num 0.217
 $ conf.int   : atomic [1:2] -0.0775 0.3298
  ..- attr(*, "conf.level")= num 0.95
 $ estimate   : Named num [1:2] 7.37 7.24
  ..- attr(*, "names")= chr [1:2] "mean in group wt" "mean in group NrlKO"
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "difference in means"
 $ alternative: chr "two.sided"
 $ method     : chr " Two Sample t-test"
 $ data.name  : chr "gExp by gType"
 - attr(*, "class")= chr "htest"


wt <- wilcox.test(gExp ~ gType, pDat2)

Warning: cannot compute exact p-value with ties

str(wt)

List of 7
 $ statistic  : Named num 236
  ..- attr(*, "names")= chr "W"
 $ parameter  : NULL
 $ p.value    : num 0.201
 $ null.value : Named num 0
  ..- attr(*, "names")= chr "location shift"
 $ alternative: chr "two.sided"
 $ method     : chr "Wilcoxon rank sum test with continuity correction"
 $ data.name  : chr "gExp by gType"
 - attr(*, "class")= chr "htest"

wt$statistic

  W 
236

wt$p.value

[1] 0.201


ks <- ks.test(pDat2$gExp[pDat2$gType == "wt"], pDat2$gExp[pDat2$gType == "NrlKO"])

Warning: cannot compute exact p-value with ties

str(ks)

List of 5
 $ statistic  : Named num 0.337
  ..- attr(*, "names")= chr "D"
 $ p.value    : num 0.219
 $ alternative: chr "two-sided"
 $ method     : chr "Two-sample Kolmogorov-Smirnov test"
 $ data.name  : chr "pDat2$gExp[pDat2$gType == \"wt\"] and pDat2$gExp[pDat2$gType == \"NrlKO\"]"
 - attr(*, "class")= chr "htest"

ks$statistic

     D 
0.3368

ks$p.value

[1] 0.2189


# Can you pull test statistics and/or p-values from the different approaches into an common object, like a readable table? Are you getting the same message from the various approaches?

rbind(c(tt$statistic, tt$p.value, wt$statistic, wt$p.value, ks$statistic, ks$p.value))

         t          W            D       
[1,] 1.275 0.2126 236 0.201 0.3368 0.2189

#According to p values all approaches show that there is no difference in means.


#apply() stuff from here on out
kDat <- readRDS("../data/GSE4051_MINI.rds")
kMat <- as.matrix(kDat[c('crabHammer', 'eggBomb', 'poisonFang')])
str(kMat)

 num [1:39, 1:3] 10.22 10.02 9.64 9.65 8.58 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:39] "12" "13" "14" "15" ...
  ..$ : chr [1:3] "crabHammer" "eggBomb" "poisonFang"

median(kMat[, "eggBomb"])

[1] 6.757

apply(kMat, 2, median)

crabHammer    eggBomb poisonFang 
     9.611      6.757      7.350


apply(kMat, 1, min)

   12    13    14    15     9    10    11    28    29    30    31    24 
7.370 6.890 6.720 6.529 6.470 7.005 6.735 6.587 6.170 6.870 6.800 6.138 
   25    26    27    36    37    38    39    32    33    34    35    20 
6.166 6.269 6.264 6.530 7.100 6.269 6.211 6.286 6.347 6.270 6.188 7.005 
   21    22    23    16    17    18    19     5     6     7     8     1 
7.082 6.757 6.155 7.228 7.226 7.363 7.081 6.993 6.992 6.608 7.003 6.981 
    2     3     4 
7.165 7.075 6.558

colnames(kMat)[apply(kMat, 1, which.min)]

 [1] "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"   
 [6] "poisonFang" "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[11] "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"   
[16] "eggBomb"    "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[21] "eggBomb"    "eggBomb"    "eggBomb"    "poisonFang" "eggBomb"   
[26] "eggBomb"    "eggBomb"    "eggBomb"    "eggBomb"    "poisonFang"
[31] "eggBomb"    "poisonFang" "eggBomb"    "eggBomb"    "eggBomb"   
[36] "poisonFang" "eggBomb"    "poisonFang" "eggBomb"


#aggregate() stuff
aggregate(eggBomb ~ devStage, kDat, FUN = mean)

  devStage eggBomb
1      E16   6.879
2       P2   6.408
3       P6   6.459
4      P10   7.143
5  4_weeks   7.063

aggregate(eggBomb ~ gType * devStage, kDat, FUN = mean)

   gType devStage eggBomb
1     wt      E16   6.900
2  NrlKO      E16   6.851
3     wt       P2   6.607
4  NrlKO       P2   6.209
5     wt       P6   6.646
6  NrlKO       P6   6.273
7     wt      P10   7.042
8  NrlKO      P10   7.243
9     wt  4_weeks   7.117
10 NrlKO  4_weeks   7.008


#two sample test
keepGenes <- c("1431708_a_at", "1424336_at", "1454696_at","1416119_at", "1432141_x_at", "1429226_at" )
miniDat <- subset(prDat, rownames(prDat) %in% keepGenes)
miniDat <- data.frame(gExp = as.vector(t(as.matrix(miniDat))),
                      gene = factor(rep(rownames(miniDat), each = ncol(miniDat)),
                                    levels = keepGenes))
miniDat <- suppressWarnings(data.frame(prDes, miniDat))
str(miniDat)

'data.frame':   234 obs. of  6 variables:
 $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
 $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
 $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
 $ gExp    : num  10.6 11 10.8 10.9 9.2 ...
 $ gene    : Factor w/ 6 levels "1431708_a_at",..: 4 4 4 4 4 4 4 4 4 4 ...

stripplot(gType ~ gExp | gene, miniDat, scales = list(x = list(relation = "free")), group = gType, auto.key = TRUE)

plot of chunk unnamed-chunk-1


someDat <- droplevels(subset(miniDat, gene == keepGenes[1]))
t.test(gExp ~ gType, someDat)


    Welch Two Sample t-test

data:  gExp by gType
t = 9.838, df = 36.89, p-value = 7.381e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.570 2.384
sample estimates:
   mean in group wt mean in group NrlKO 
              9.554               7.578


#plyr stuff
library(plyr)

d_ply(miniDat, ~ gene, function(x) t.test(gExp ~ gType, x), .print=TRUE)


    Welch Two Sample t-test

data:  gExp by gType
t = 9.838, df = 36.89, p-value = 7.381e-12
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.570 2.384
sample estimates:
   mean in group wt mean in group NrlKO 
              9.554               7.578 


    Welch Two Sample t-test

data:  gExp by gType
t = 9.061, df = 36.49, p-value = 7.146e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.6239 0.9835
sample estimates:
   mean in group wt mean in group NrlKO 
              7.670               6.867 


    Welch Two Sample t-test

data:  gExp by gType
t = 8.077, df = 33.49, p-value = 2.278e-09
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 1.402 2.346
sample estimates:
   mean in group wt mean in group NrlKO 
              12.85               10.97 


    Welch Two Sample t-test

data:  gExp by gType
t = -0.184, df = 36.53, p-value = 0.8551
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.5079  0.4234
sample estimates:
   mean in group wt mean in group NrlKO 
              9.893               9.935 


    Welch Two Sample t-test

data:  gExp by gType
t = -0.1324, df = 36.31, p-value = 0.8954
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.2413  0.2118
sample estimates:
   mean in group wt mean in group NrlKO 
              7.092               7.107 


    Welch Two Sample t-test

data:  gExp by gType
t = 0.0983, df = 35.58, p-value = 0.9223
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.1019  0.1123
sample estimates:
   mean in group wt mean in group NrlKO 
              6.551               6.546

ttRes <- dlply(miniDat, ~ gene, function(x) t.test(gExp ~ gType, x))
names(ttRes)

[1] "1431708_a_at" "1424336_at"   "1454696_at"   "1416119_at"  
[5] "1432141_x_at" "1429226_at"


ttRes <- ddply(miniDat, ~ gene, function(z) {
  zz <- t.test(gExp ~ gType, z)
  round(c(tStat = zz$statistic, pVal = zz$p.value), 4)
})
ttRes

          gene tStat.t   pVal
1 1431708_a_at  9.8380 0.0000
2   1424336_at  9.0607 0.0000
3   1454696_at  8.0771 0.0000
4   1416119_at -0.1840 0.8551
5 1432141_x_at -0.1324 0.8954
6   1429226_at  0.0983 0.9223