From: http://www.cookbook-r.com/Manipulating_data/Summarizing_data/#using-ddply
dfw <- read.table(header=T, text='
subject pretest posttest
1 59.4 64.5
2 46.4 52.4
3 46.0 49.7
4 49.0 48.7
5 32.5 37.4
6 45.2 49.5
7 60.3 59.9
8 54.3 54.1
9 45.4 49.6
10 38.9 48.5
')
# Treat subject ID as a factor
dfw$subject <- factor(dfw$subject)
# Convert to long format
library(reshape2)
dfw.long <- melt(dfw,
id.vars = "subject",
measure.vars = c("pretest","posttest"),
variable.name = "condition")
head(dfw.long)
## subject condition value
## 1 1 pretest 59.4
## 2 2 pretest 46.4
## 3 3 pretest 46.0
## 4 4 pretest 49.0
## 5 5 pretest 32.5
## 6 6 pretest 45.2
Collapse the data using summarySEwithin (defined at the bottom of this page; both of the helper functions below must be entered before the function is called here).
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
dfwc <- summarySEwithin(dfw.long, measurevar="value", withinvars="condition",
idvar="subject", na.rm=FALSE, conf.interval=.95)
dfwc
## condition N value sd se ci
## 1 pretest 10 47.74 2.262361 0.7154214 1.618396
## 2 posttest 10 51.43 2.262361 0.7154214 1.618396
library(ggplot2)
# Make the graph with the 95% confidence interval
ggplot(dfwc, aes(x=condition, y=value, group=1)) +
geom_line() +
geom_errorbar(width=.1, aes(ymin=value-ci, ymax=value+ci)) +
geom_point(shape=21, size=3, fill="white") +
ylim(40,60)
# Basic operation on a small dataset
d <- qplot(cyl, mpg, data=mtcars) ; d
d + stat_summary(fun.data = "mean_cl_boot", colour = "red")
stat_sum_single <- function(fun, geom="point", ...) {
stat_summary(fun.y=fun, colour="red", geom=geom, size = 3, ...)
}
stat_sum_df <- function(fun, geom="crossbar", ...) {
stat_summary(fun.data=fun, colour="red", geom=geom, width=0.2, ...)
}
d + stat_sum_df("mean_cl_normal", mapping = aes(group = cyl)) + stat_sum_single(mean)+ stat_sum_single(median)
d + stat_sum_df("mean_cl_normal", geom = "errorbar") + stat_sum_single(mean)
d + stat_sum_df("mean_cl_normal", geom = "pointrange")+ stat_sum_df("mean_cl_normal", geom = "smooth")
data <- data.frame(sex = c(rep(1, 1000), rep(2, 1000)),
treatment = rep(c(1, 2), 1000),
response1 = rnorm(2000, 0, 1),
response2 = rnorm(2000, 0, 1))
Summarizing data using “ddply”
data <- read.table(header=T, text='
subject sex condition before after change
1 F placebo 10.1 6.9 -3.2
2 F placebo 6.3 4.2 -2.1
3 M aspirin 12.4 6.3 -6.1
4 F placebo 8.1 6.1 -2.0
5 M aspirin 15.2 9.9 -5.3
6 F aspirin 10.9 7.0 -3.9
7 F aspirin 11.6 8.5 -3.1
8 M aspirin 9.5 3.0 -6.5
9 F placebo 11.5 9.0 -2.5
10 M placebo 11.9 11.0 -0.9
11 F aspirin 11.4 8.0 -3.4
12 M aspirin 10.0 4.4 -5.6
13 M aspirin 12.5 5.4 -7.1
14 M placebo 10.6 10.6 0.0
15 M aspirin 9.1 4.3 -4.8
16 F placebo 12.1 10.2 -1.9
17 F placebo 11.0 8.8 -2.2
18 F placebo 11.9 10.2 -1.7
19 M aspirin 9.1 3.6 -5.5
20 M placebo 13.5 12.4 -1.1
21 M aspirin 12.0 7.5 -4.5
22 F placebo 9.1 7.6 -1.5
23 M placebo 9.9 8.0 -1.9
24 F placebo 7.6 5.2 -2.4
25 F placebo 11.8 9.7 -2.1
26 F placebo 11.8 10.7 -1.1
27 F aspirin 10.1 7.9 -2.2
28 M aspirin 11.6 8.3 -3.3
29 F aspirin 11.3 6.8 -4.5
30 F placebo 10.3 8.3 -2.0
')
library(plyr)
# Run the functions length, mean, and sd on the value of "change" for each group,
# broken down by sex + condition
cdata <- ddply(data, c("sex", "condition"), summarise,
N = length(change),
mean = mean(change),
sd = sd(change),
se = sd / sqrt(N) )
cdata
## sex condition N mean sd se
## 1 F aspirin 5 -3.420000 0.8642916 0.3865230
## 2 F placebo 12 -2.058333 0.5247655 0.1514867
## 3 M aspirin 9 -5.411111 1.1307569 0.3769190
## 4 M placebo 4 -0.975000 0.7804913 0.3902456
Handling missing data
# Put some NA's in the data
dataNA <- data
dataNA$change[11:14] <- NA
cdata <- ddply(dataNA, c("sex", "condition"), summarise,
N = sum(!is.na(change)),
mean = mean(change, na.rm=TRUE),
sd = sd(change, na.rm=TRUE),
se = sd / sqrt(N) )
cdata
## sex condition N mean sd se
## 1 F aspirin 4 -3.425000 0.9979145 0.4989572
## 2 F placebo 12 -2.058333 0.5247655 0.1514867
## 3 M aspirin 7 -5.142857 1.0674848 0.4034713
## 4 M placebo 3 -1.300000 0.5291503 0.3055050
A function for mean, count, standard deviation, standard error of the mean, and confidence interval
summarySE(data, measurevar="change", groupvars=c("sex", "condition"))
## sex condition N change sd se ci
## 1 F aspirin 5 -3.420000 0.8642916 0.3865230 1.0731598
## 2 F placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
## 3 M aspirin 9 -5.411111 1.1307569 0.3769190 0.8691767
## 4 M placebo 4 -0.975000 0.7804913 0.3902456 1.2419358
# With a data set with NA's, use na.rm=TRUE
summarySE(dataNA, measurevar="change", groupvars=c("sex","condition"), na.rm=TRUE)
## sex condition N change sd se ci
## 1 F aspirin 4 -3.425000 0.9979145 0.4989572 1.5879046
## 2 F placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
## 3 M aspirin 7 -5.142857 1.0674848 0.4034713 0.9872588
## 4 M placebo 3 -1.300000 0.5291503 0.3055050 1.3144821
data <- data.frame(sex = c(rep(1, 1000), rep(2, 1000)),
treatment = rep(c(1, 2), 1000),
response1 = rnorm(2000, 0, 1),
response2 = rnorm(2000, 0, 1))
head(data)
## sex treatment response1 response2
## 1 1 1 -1.4947471 -0.5453899
## 2 1 2 -0.4105637 -2.0434089
## 3 1 1 -2.2265493 0.8195921
## 4 1 2 0.2457963 -0.1035008
## 5 1 1 -0.8078542 0.2195706
## 6 1 2 0.3053890 -0.4768785
library(reshape2)
melted <- melt(data, id.vars=c("sex", "treatment"))
head(melted)
## sex treatment variable value
## 1 1 1 response1 -1.4947471
## 2 1 2 response1 -0.4105637
## 3 1 1 response1 -2.2265493
## 4 1 2 response1 0.2457963
## 5 1 1 response1 -0.8078542
## 6 1 2 response1 0.3053890
library(plyr)
ddply(melted, c("sex", "treatment", "variable"), summarise,
mean = mean(value), sd = sd(value),
sem = sd(value)/sqrt(length(value)))
## sex treatment variable mean sd sem
## 1 1 1 response1 -0.052060066 1.0415435 0.04657924
## 2 1 1 response2 0.021437599 1.0213942 0.04567814
## 3 1 2 response1 -0.014597858 1.0203016 0.04562928
## 4 1 2 response2 0.005590416 1.0244686 0.04581563
## 5 2 1 response1 0.071343951 1.0181687 0.04553389
## 6 2 1 response2 -0.081254397 1.0249400 0.04583671
## 7 2 2 response1 0.037969467 0.9512362 0.04254057
## 8 2 2 response2 0.038899826 1.0214009 0.04567844
library(dplyr)
data %>%
group_by(sex,treatment) %>%
summarize(mean1 = mean(response1),
sd1 = sd(response1),
mean2 = mean(response2),
sd2 = sd(response2))
## Source: local data frame [4 x 6]
## Groups: sex
##
## sex treatment mean1 sd1 mean2 sd2
## 1 1 1 -0.05206007 1.0415435 0.021437599 1.021394
## 2 1 2 -0.01459786 1.0203016 0.005590416 1.024469
## 3 2 1 0.07134395 1.0181687 -0.081254397 1.024940
## 4 2 2 0.03796947 0.9512362 0.038899826 1.021401
EXTRA Topic
An example with highly skewed distributions:
set.seed(596)
mov <- movies[sample(nrow(movies), 1000), ]
m2 <- ggplot(mov, aes(x= factor(round(rating)), y=votes)) + geom_point()
m2 <- m2 + stat_summary(fun.data = "mean_cl_boot", geom = "crossbar",
colour = "red", width = 0.3) + xlab("rating")
m2
# Notice how the overplotting skews off visual perception of the mean
# supplementing the raw data with summary statistics is _very_ important
# Next, we'll look at votes on a log scale.
# Transforming the scale means the data are transformed
# first, after which statistics are computed:
m2 + scale_y_log10()
# Transforming the coordinate system occurs after the
# statistic has been computed. This means we're calculating the summary on the raw data
# and stretching the geoms onto the log scale. Compare the widths of the
# standard errors.
m2 + coord_trans(y="log10")