Session 5a

Practicing data aggregation, reshaping, and plotting

Load the iris data as usual

data(iris)

Create some treatment information to play around with

iris$treat <- "none"
iris$treat[iris$Sepal.Width < 3] <- "small"
iris$treat[iris$Sepal.Width >= 3] <- "large"
iris$treat <- as.factor(iris$treat)

Using another member of the apply family to aggregate data

by(iris$Sepal.Length, iris$Species, mean)

## iris$Species: setosa
## [1] 5.006
## -------------------------------------------------------- 
## iris$Species: versicolor
## [1] 5.936
## -------------------------------------------------------- 
## iris$Species: virginica
## [1] 6.588

by(iris$Sepal.Length, list(iris$Species, iris$treat), mean)

## : setosa
## : large
## [1] 5.029
## -------------------------------------------------------- 
## : versicolor
## : large
## [1] 6.219
## -------------------------------------------------------- 
## : virginica
## : large
## [1] 6.769
## -------------------------------------------------------- 
## : setosa
## : small
## [1] 4.45
## -------------------------------------------------------- 
## : versicolor
## : small
## [1] 5.803
## -------------------------------------------------------- 
## : virginica
## : small
## [1] 6.338

by(iris$Sepal.Length, list(iris$Species, iris$treat), sd)

## : setosa
## : large
## [1] 0.3402
## -------------------------------------------------------- 
## : versicolor
## : large
## [1] 0.5115
## -------------------------------------------------------- 
## : virginica
## : large
## [1] 0.5197
## -------------------------------------------------------- 
## : setosa
## : small
## [1] 0.07071
## -------------------------------------------------------- 
## : versicolor
## : small
## [1] 0.4687
## -------------------------------------------------------- 
## : virginica
## : small
## [1] 0.7067

We can write a custom function to collect mean and sd:

myMeanSD <- function(data) {
  my.mean = mean(data)
  my.sd   = sd(data)
  output <- data.frame(Meanvalue = my.mean, SDvalue = my.sd)
  return(output)
}

my.list <- by(iris$Sepal.Length, list(iris$Species, iris$treat), function(x) myMeanSD(x))

# get "my.list" into a data frame format
do.call(rbind.data.frame, my.list)

##   Meanvalue SDvalue
## 1     5.029 0.34019
## 2     6.219 0.51149
## 3     6.769 0.51969
## 4     4.450 0.07071
## 5     5.803 0.46871
## 6     6.338 0.70674

# This result table does not indicate species or treatment!!

Plyr

A great package to handle aggregation, split, and apply is “plyr”. This is another great package from Hadley Wickham

install.packages("plyr", dep = TRUE)

library(plyr)
my.df <- ddply(iris,
               .(Species, treat),
               summarise,
               my.mean = mean(Sepal.Length),
               standard_deviation = sd(Sepal.Length))
my.df

##      Species treat my.mean standard_deviation
## 1     setosa large   5.029            0.34019
## 2     setosa small   4.450            0.07071
## 3 versicolor large   6.219            0.51149
## 4 versicolor small   5.803            0.46871
## 5  virginica large   6.769            0.51969
## 6  virginica small   6.338            0.70674


# creating a graph of the results
library(ggplot2)
p <- ggplot(my.df, aes(x = Species, y = my.mean))
  p <- p + geom_point(aes(colour = treat))
  p <- p + theme_bw()
p

plot of chunk unnamed-chunk-5

A package to re-organise data

install.packages("reshape", dep = TRUE)

library(reshape)

## Attaching package: 'reshape'
## 
## The following object is masked from 'package:plyr':
## 
## rename, round_any


# melt the data into a common format
new.iris <- melt(iris, id = c("Species", "treat"))

# then cast the data into a new form
# here, we calculate the mean value for each group
# this is yet another way of aggregating data
cast.iris <- cast(new.iris,
                  Species ~ treat + variable,
                  fun.aggregate = mean)
head(cast.iris)

##      Species large_Sepal.Length large_Sepal.Width large_Petal.Length
## 1     setosa              5.029             3.462              1.467
## 2 versicolor              6.219             3.100              4.544
## 3  virginica              6.769             3.183              5.641
##   large_Petal.Width small_Sepal.Length small_Sepal.Width
## 1            0.2458              4.450             2.600
## 2            1.4875              5.803             2.615
## 3            2.1276              6.338             2.686
##   small_Petal.Length small_Petal.Width
## 1              1.350             0.250
## 2              4.126             1.250
## 3              5.429             1.886


# or, a more complex example
# we want columns for mean, sd of each parameter
# next to each other 

cast.iris <- cast(new.iris,
                  Species + treat ~ variable,
                  c(mean, sd))
head(cast.iris)

##      Species treat Sepal.Length_mean Sepal.Length_sd Sepal.Width_mean
## 1     setosa large             5.029         0.34019            3.462
## 2     setosa small             4.450         0.07071            2.600
## 3 versicolor large             6.219         0.51149            3.100
## 4 versicolor small             5.803         0.46871            2.615
## 5  virginica large             6.769         0.51969            3.183
## 6  virginica small             6.338         0.70674            2.686
##   Sepal.Width_sd Petal.Length_mean Petal.Length_sd Petal.Width_mean
## 1         0.3400             1.467         0.17544           0.2458
## 2         0.4243             1.350         0.07071           0.2500
## 3         0.1265             4.544         0.25290           1.4875
## 4         0.2476             4.126         0.49132           1.2500
## 5         0.2316             5.641         0.47473           2.1276
## 6         0.1711             5.429         0.63494           1.8857
##   Petal.Width_sd
## 1        0.10711
## 2        0.07071
## 3        0.14549
## 4        0.17277
## 5        0.25057
## 6        0.24756

How to create multiple complex graphs from “new.iris”. The “melted” data can be easily used for a split-apply approach to getting things done. Here a silly, complex graph for each parameter to showcase “plyr” and “ggplot2”. Play around with the options!!

my.plots <- dlply(new.iris,
    .(variable),
    function(x){
    # get a handle on the current variable
    my.label <- unique(x$variable)
    # create a plot
    ggplot(x, aes(x = Species, y = value)) + 
          stat_summary(aes(colour = treat), 
               fun.data = "mean_sdl", mult = 1, 
               position = position_dodge(width = 0.75)) + 
          theme_bw() + 
          facet_grid(Species ~ .) + 
          labs(y = my.label)})
# this creates a list of plots
# to show the plots on the screen
my.plots

## $Sepal.Length

plot of chunk unnamed-chunk-7

## 
## $Sepal.Width

plot of chunk unnamed-chunk-7

## 
## $Petal.Length

plot of chunk unnamed-chunk-7

## 
## $Petal.Width

plot of chunk unnamed-chunk-7

## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##       variable
## 1 Sepal.Length
## 2  Sepal.Width
## 3 Petal.Length
## 4  Petal.Width

print all graphs into a pdf file

# Set your working directory
setwd("~/Desktop")

pdf(file = "My_plots.pdf")
print(my.plots) # have to use "print" explicitly when working within an output file
dev.off() # don't forget to close the file when you are done