Load the iris data as usual
data(iris)
Create some treatment information to play around with
iris$treat <- "none"
iris$treat[iris$Sepal.Width < 3] <- "small"
iris$treat[iris$Sepal.Width >= 3] <- "large"
iris$treat <- as.factor(iris$treat)
Using another member of the apply family to aggregate data
by(iris$Sepal.Length, iris$Species, mean)
## iris$Species: setosa
## [1] 5.006
## --------------------------------------------------------
## iris$Species: versicolor
## [1] 5.936
## --------------------------------------------------------
## iris$Species: virginica
## [1] 6.588
by(iris$Sepal.Length, list(iris$Species, iris$treat), mean)
## : setosa
## : large
## [1] 5.029
## --------------------------------------------------------
## : versicolor
## : large
## [1] 6.219
## --------------------------------------------------------
## : virginica
## : large
## [1] 6.769
## --------------------------------------------------------
## : setosa
## : small
## [1] 4.45
## --------------------------------------------------------
## : versicolor
## : small
## [1] 5.803
## --------------------------------------------------------
## : virginica
## : small
## [1] 6.338
by(iris$Sepal.Length, list(iris$Species, iris$treat), sd)
## : setosa
## : large
## [1] 0.3402
## --------------------------------------------------------
## : versicolor
## : large
## [1] 0.5115
## --------------------------------------------------------
## : virginica
## : large
## [1] 0.5197
## --------------------------------------------------------
## : setosa
## : small
## [1] 0.07071
## --------------------------------------------------------
## : versicolor
## : small
## [1] 0.4687
## --------------------------------------------------------
## : virginica
## : small
## [1] 0.7067
We can write a custom function to collect mean and sd:
myMeanSD <- function(data) {
my.mean = mean(data)
my.sd = sd(data)
output <- data.frame(Meanvalue = my.mean, SDvalue = my.sd)
return(output)
}
my.list <- by(iris$Sepal.Length, list(iris$Species, iris$treat), function(x) myMeanSD(x))
# get "my.list" into a data frame format
do.call(rbind.data.frame, my.list)
## Meanvalue SDvalue
## 1 5.029 0.34019
## 2 6.219 0.51149
## 3 6.769 0.51969
## 4 4.450 0.07071
## 5 5.803 0.46871
## 6 6.338 0.70674
# This result table does not indicate species or treatment!!
A great package to handle aggregation, split, and apply is “plyr”. This is another great package from Hadley Wickham
install.packages("plyr", dep = TRUE)
library(plyr)
my.df <- ddply(iris,
.(Species, treat),
summarise,
my.mean = mean(Sepal.Length),
standard_deviation = sd(Sepal.Length))
my.df
## Species treat my.mean standard_deviation
## 1 setosa large 5.029 0.34019
## 2 setosa small 4.450 0.07071
## 3 versicolor large 6.219 0.51149
## 4 versicolor small 5.803 0.46871
## 5 virginica large 6.769 0.51969
## 6 virginica small 6.338 0.70674
# creating a graph of the results
library(ggplot2)
p <- ggplot(my.df, aes(x = Species, y = my.mean))
p <- p + geom_point(aes(colour = treat))
p <- p + theme_bw()
p
install.packages("reshape", dep = TRUE)
library(reshape)
## Attaching package: 'reshape'
##
## The following object is masked from 'package:plyr':
##
## rename, round_any
# melt the data into a common format
new.iris <- melt(iris, id = c("Species", "treat"))
# then cast the data into a new form
# here, we calculate the mean value for each group
# this is yet another way of aggregating data
cast.iris <- cast(new.iris,
Species ~ treat + variable,
fun.aggregate = mean)
head(cast.iris)
## Species large_Sepal.Length large_Sepal.Width large_Petal.Length
## 1 setosa 5.029 3.462 1.467
## 2 versicolor 6.219 3.100 4.544
## 3 virginica 6.769 3.183 5.641
## large_Petal.Width small_Sepal.Length small_Sepal.Width
## 1 0.2458 4.450 2.600
## 2 1.4875 5.803 2.615
## 3 2.1276 6.338 2.686
## small_Petal.Length small_Petal.Width
## 1 1.350 0.250
## 2 4.126 1.250
## 3 5.429 1.886
# or, a more complex example
# we want columns for mean, sd of each parameter
# next to each other
cast.iris <- cast(new.iris,
Species + treat ~ variable,
c(mean, sd))
head(cast.iris)
## Species treat Sepal.Length_mean Sepal.Length_sd Sepal.Width_mean
## 1 setosa large 5.029 0.34019 3.462
## 2 setosa small 4.450 0.07071 2.600
## 3 versicolor large 6.219 0.51149 3.100
## 4 versicolor small 5.803 0.46871 2.615
## 5 virginica large 6.769 0.51969 3.183
## 6 virginica small 6.338 0.70674 2.686
## Sepal.Width_sd Petal.Length_mean Petal.Length_sd Petal.Width_mean
## 1 0.3400 1.467 0.17544 0.2458
## 2 0.4243 1.350 0.07071 0.2500
## 3 0.1265 4.544 0.25290 1.4875
## 4 0.2476 4.126 0.49132 1.2500
## 5 0.2316 5.641 0.47473 2.1276
## 6 0.1711 5.429 0.63494 1.8857
## Petal.Width_sd
## 1 0.10711
## 2 0.07071
## 3 0.14549
## 4 0.17277
## 5 0.25057
## 6 0.24756
How to create multiple complex graphs from “new.iris”. The “melted” data can be easily used for a split-apply approach to getting things done. Here a silly, complex graph for each parameter to showcase “plyr” and “ggplot2”. Play around with the options!!
my.plots <- dlply(new.iris,
.(variable),
function(x){
# get a handle on the current variable
my.label <- unique(x$variable)
# create a plot
ggplot(x, aes(x = Species, y = value)) +
stat_summary(aes(colour = treat),
fun.data = "mean_sdl", mult = 1,
position = position_dodge(width = 0.75)) +
theme_bw() +
facet_grid(Species ~ .) +
labs(y = my.label)})
# this creates a list of plots
# to show the plots on the screen
my.plots
## $Sepal.Length
##
## $Sepal.Width
##
## $Petal.Length
##
## $Petal.Width
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## variable
## 1 Sepal.Length
## 2 Sepal.Width
## 3 Petal.Length
## 4 Petal.Width
# Set your working directory
setwd("~/Desktop")
pdf(file = "My_plots.pdf")
print(my.plots) # have to use "print" explicitly when working within an output file
dev.off() # don't forget to close the file when you are done