Manipulating and summarising dataframes

From: http://www.cookbook-r.com/Manipulating_data/Summarizing_data/#using-ddply

dfw <- read.table(header=T, text='
 subject pretest posttest
       1    59.4     64.5
       2    46.4     52.4
       3    46.0     49.7
       4    49.0     48.7
       5    32.5     37.4
       6    45.2     49.5
       7    60.3     59.9
       8    54.3     54.1
       9    45.4     49.6
      10    38.9     48.5
 ')

# Treat subject ID as a factor
dfw$subject <- factor(dfw$subject)

# Convert to long format
library(reshape2)
dfw.long <- melt(dfw,
                 id.vars = "subject",
                 measure.vars = c("pretest","posttest"),
                 variable.name = "condition")
head(dfw.long)

##   subject condition value
## 1       1   pretest  59.4
## 2       2   pretest  46.4
## 3       3   pretest  46.0
## 4       4   pretest  49.0
## 5       5   pretest  32.5
## 6       6   pretest  45.2

Collapse the data using summarySEwithin (defined at the bottom of this page; both of the helper functions below must be entered before the function is called here).

library(Rmisc)

## Loading required package: lattice
## Loading required package: plyr

dfwc <- summarySEwithin(dfw.long, measurevar="value", withinvars="condition",
                        idvar="subject", na.rm=FALSE, conf.interval=.95)
dfwc

##   condition  N value       sd        se       ci
## 1   pretest 10 47.74 2.262361 0.7154214 1.618396
## 2  posttest 10 51.43 2.262361 0.7154214 1.618396

library(ggplot2)

# Make the graph with the 95% confidence interval
ggplot(dfwc, aes(x=condition, y=value, group=1)) +
    geom_line() +
    geom_errorbar(width=.1, aes(ymin=value-ci, ymax=value+ci)) +
    geom_point(shape=21, size=3, fill="white") +
    ylim(40,60)

# Basic operation on a small dataset
d <- qplot(cyl, mpg, data=mtcars) ; d

d + stat_summary(fun.data = "mean_cl_boot", colour = "red")

stat_sum_single <- function(fun, geom="point", ...) {
  stat_summary(fun.y=fun, colour="red", geom=geom, size = 3, ...)
}

stat_sum_df <- function(fun, geom="crossbar", ...) {
  stat_summary(fun.data=fun, colour="red", geom=geom, width=0.2, ...)
}

d + stat_sum_df("mean_cl_normal", mapping = aes(group = cyl)) + stat_sum_single(mean)+ stat_sum_single(median)

d + stat_sum_df("mean_cl_normal", geom = "errorbar") + stat_sum_single(mean)

d + stat_sum_df("mean_cl_normal", geom = "pointrange")+ stat_sum_df("mean_cl_normal", geom = "smooth")

data <- data.frame(sex = c(rep(1, 1000), rep(2, 1000)),
                   treatment = rep(c(1, 2), 1000),
                   response1 = rnorm(2000, 0, 1),
                   response2 = rnorm(2000, 0, 1))

Summarizing data using “ddply”

data <- read.table(header=T, text='
 subject sex condition before after change
       1   F   placebo   10.1   6.9   -3.2
       2   F   placebo    6.3   4.2   -2.1
       3   M   aspirin   12.4   6.3   -6.1
       4   F   placebo    8.1   6.1   -2.0
       5   M   aspirin   15.2   9.9   -5.3
       6   F   aspirin   10.9   7.0   -3.9
       7   F   aspirin   11.6   8.5   -3.1
       8   M   aspirin    9.5   3.0   -6.5
       9   F   placebo   11.5   9.0   -2.5
      10   M   placebo   11.9  11.0   -0.9
      11   F   aspirin   11.4   8.0   -3.4
      12   M   aspirin   10.0   4.4   -5.6
      13   M   aspirin   12.5   5.4   -7.1
      14   M   placebo   10.6  10.6    0.0
      15   M   aspirin    9.1   4.3   -4.8
      16   F   placebo   12.1  10.2   -1.9
      17   F   placebo   11.0   8.8   -2.2
      18   F   placebo   11.9  10.2   -1.7
      19   M   aspirin    9.1   3.6   -5.5
      20   M   placebo   13.5  12.4   -1.1
      21   M   aspirin   12.0   7.5   -4.5
      22   F   placebo    9.1   7.6   -1.5
      23   M   placebo    9.9   8.0   -1.9
      24   F   placebo    7.6   5.2   -2.4
      25   F   placebo   11.8   9.7   -2.1
      26   F   placebo   11.8  10.7   -1.1
      27   F   aspirin   10.1   7.9   -2.2
      28   M   aspirin   11.6   8.3   -3.3
      29   F   aspirin   11.3   6.8   -4.5
      30   F   placebo   10.3   8.3   -2.0
 ')

library(plyr)

# Run the functions length, mean, and sd on the value of "change" for each group, 
# broken down by sex + condition
cdata <- ddply(data, c("sex", "condition"), summarise,
               N    = length(change),
               mean = mean(change),
               sd   = sd(change),
               se   = sd / sqrt(N) )
cdata

##   sex condition  N      mean        sd        se
## 1   F   aspirin  5 -3.420000 0.8642916 0.3865230
## 2   F   placebo 12 -2.058333 0.5247655 0.1514867
## 3   M   aspirin  9 -5.411111 1.1307569 0.3769190
## 4   M   placebo  4 -0.975000 0.7804913 0.3902456

Handling missing data

# Put some NA's in the data
dataNA <- data
dataNA$change[11:14] <- NA

cdata <- ddply(dataNA, c("sex", "condition"), summarise,
               N    = sum(!is.na(change)),
               mean = mean(change, na.rm=TRUE),
               sd   = sd(change, na.rm=TRUE),
               se   = sd / sqrt(N) )
cdata

##   sex condition  N      mean        sd        se
## 1   F   aspirin  4 -3.425000 0.9979145 0.4989572
## 2   F   placebo 12 -2.058333 0.5247655 0.1514867
## 3   M   aspirin  7 -5.142857 1.0674848 0.4034713
## 4   M   placebo  3 -1.300000 0.5291503 0.3055050

A function for mean, count, standard deviation, standard error of the mean, and confidence interval

summarySE(data, measurevar="change", groupvars=c("sex", "condition"))

##   sex condition  N    change        sd        se        ci
## 1   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
## 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
## 3   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
## 4   M   placebo  4 -0.975000 0.7804913 0.3902456 1.2419358

# With a data set with NA's, use na.rm=TRUE
summarySE(dataNA, measurevar="change", groupvars=c("sex","condition"), na.rm=TRUE)

##   sex condition  N    change        sd        se        ci
## 1   F   aspirin  4 -3.425000 0.9979145 0.4989572 1.5879046
## 2   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
## 3   M   aspirin  7 -5.142857 1.0674848 0.4034713 0.9872588
## 4   M   placebo  3 -1.300000 0.5291503 0.3055050 1.3144821

data <- data.frame(sex = c(rep(1, 1000), rep(2, 1000)),
                   treatment = rep(c(1, 2), 1000),
                   response1 = rnorm(2000, 0, 1),
                   response2 = rnorm(2000, 0, 1))
head(data)

##   sex treatment  response1  response2
## 1   1         1 -1.4947471 -0.5453899
## 2   1         2 -0.4105637 -2.0434089
## 3   1         1 -2.2265493  0.8195921
## 4   1         2  0.2457963 -0.1035008
## 5   1         1 -0.8078542  0.2195706
## 6   1         2  0.3053890 -0.4768785

library(reshape2)
melted <- melt(data, id.vars=c("sex", "treatment"))
head(melted)

##   sex treatment  variable      value
## 1   1         1 response1 -1.4947471
## 2   1         2 response1 -0.4105637
## 3   1         1 response1 -2.2265493
## 4   1         2 response1  0.2457963
## 5   1         1 response1 -0.8078542
## 6   1         2 response1  0.3053890

library(plyr)
ddply(melted, c("sex", "treatment", "variable"), summarise,
      mean = mean(value), sd = sd(value),
      sem = sd(value)/sqrt(length(value)))

##   sex treatment  variable         mean        sd        sem
## 1   1         1 response1 -0.052060066 1.0415435 0.04657924
## 2   1         1 response2  0.021437599 1.0213942 0.04567814
## 3   1         2 response1 -0.014597858 1.0203016 0.04562928
## 4   1         2 response2  0.005590416 1.0244686 0.04581563
## 5   2         1 response1  0.071343951 1.0181687 0.04553389
## 6   2         1 response2 -0.081254397 1.0249400 0.04583671
## 7   2         2 response1  0.037969467 0.9512362 0.04254057
## 8   2         2 response2  0.038899826 1.0214009 0.04567844

library(dplyr)
data %>%
group_by(sex,treatment) %>%
summarize(mean1 = mean(response1),
          sd1 = sd(response1),
          mean2 = mean(response2),
          sd2 = sd(response2))

## Source: local data frame [4 x 6]
## Groups: sex
## 
##   sex treatment       mean1       sd1        mean2      sd2
## 1   1         1 -0.05206007 1.0415435  0.021437599 1.021394
## 2   1         2 -0.01459786 1.0203016  0.005590416 1.024469
## 3   2         1  0.07134395 1.0181687 -0.081254397 1.024940
## 4   2         2  0.03796947 0.9512362  0.038899826 1.021401

EXTRA Topic

An example with highly skewed distributions:

set.seed(596)
mov <- movies[sample(nrow(movies), 1000), ]
m2 <- ggplot(mov, aes(x= factor(round(rating)), y=votes)) + geom_point()
m2 <- m2 + stat_summary(fun.data = "mean_cl_boot", geom = "crossbar",
                        colour = "red", width = 0.3) + xlab("rating")
m2

# Notice how the overplotting skews off visual perception of the mean
# supplementing the raw data with summary statistics is _very_ important

# Next, we'll look at votes on a log scale.

# Transforming the scale means the data are transformed
# first, after which statistics are computed:
m2 + scale_y_log10()

# Transforming the coordinate system occurs after the
# statistic has been computed. This means we're calculating the summary on the raw data
# and stretching the geoms onto the log scale.  Compare the widths of the
# standard errors.
m2 + coord_trans(y="log10")