Summarizing data by groups in R

References

Prepare data

## Load dataset directly from the internet (needs network access).
## skip = 4 discards the first four lines of the file; header = TRUE then
## takes the column names from the first line that is read.
## The argument is spelled out as `header = TRUE`: `head = T` depended on
## partial argument matching and on `T` never having been reassigned.
lbw <- read.table(
    "http://www.umass.edu/statdata/statdata/data/lowbwt.dat",
    header = TRUE,
    skip = 4
)

## Convert every column name of lbw to lower case
lbw <- setNames(lbw, tolower(names(lbw)))

## Recode variables with within(): the braces are evaluated using the
## columns of lbw, and every variable assigned inside is written back to
## the returned data frame.
lbw <- within(lbw, {

    ## race codes 1, 2, 3 are labelled White, Black, Other
    race.cat <- factor(
        race,
        levels = 1:3,
        labels = c("White", "Black", "Other")
    )

    ## Two-level version of ptl: "0" versus "1+" (ptl of 1 or more)
    preterm <- factor(
        ptl >= 1,
        levels = c(FALSE, TRUE),
        labels = c("0", "1+")
    )
    ## You can also use cut
    ## preterm  <- cut(ptl, breaks = c(-Inf, 0, Inf), labels = c("0","1+"))

    ## 0/1 indicator columns are turned into No/Yes factors
    smoke <- factor(smoke, levels = 0:1, labels = c("No", "Yes"))
    ht    <- factor(ht,    levels = 0:1, labels = c("No", "Yes"))
    ui    <- factor(ui,    levels = 0:1, labels = c("No", "Yes"))
    low   <- factor(low,   levels = 0:1, labels = c("No", "Yes"))

    ## ftv (frequency of visit) is binned into None (0), Normal (1-2) and
    ## Many (>= 3); relevel() then makes Normal the reference (first) level
    ftv.cat <- relevel(
        cut(
            ftv,
            breaks = c(-Inf, 0, 2, Inf),
            labels = c("None", "Normal", "Many")
        ),
        ref = "Normal"
    )
})

Explanation

These functions take data, split it by specified variables, and apply a specified summary function.

tapply()

## Average age within each level of ftv.cat
with(lbw, tapply(X = age, INDEX = ftv.cat, FUN = mean))
Normal   None   Many 
 24.69  21.95  24.67 
## Average age for every ftv.cat (rows) by smoke (columns) combination
with(lbw, tapply(X = age, INDEX = list(ftv.cat, smoke), FUN = mean))
          No   Yes
Normal 24.31 25.57
None   22.33 21.49
Many   25.50 23.83

by()

## Mean of age within each level of ftv.cat.
## INDICES is given as a single vector here; the printed group headers
## repeat that expression exactly as it is written (lbw$ftv.cat).
by(data = lbw$age, INDICES = lbw$ftv.cat, FUN = mean)
lbw$ftv.cat: Normal
[1] 24.69
--------------------------------------------------------------------------------------- 
lbw$ftv.cat: None
[1] 21.95
--------------------------------------------------------------------------------------- 
lbw$ftv.cat: Many
[1] 24.67
## summary() of age within each level of ftv.cat.
## FUN may return several values per group, as summary() does here.
by(data = lbw$age, INDICES = lbw$ftv.cat, FUN = summary)
lbw$ftv.cat: Normal
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   14.0    21.0    24.0    24.7    29.0    45.0 
--------------------------------------------------------------------------------------- 
lbw$ftv.cat: None
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     14      19      21      22      25      36 
--------------------------------------------------------------------------------------- 
lbw$ftv.cat: Many
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   16.0    20.0    23.0    24.7    31.2    33.0 
## summary() of age for every combination of ftv.cat and smoke.
## Several grouping variables are passed together as a list.
by(data = lbw$age, INDICES = list(lbw$ftv.cat, lbw$smoke), FUN = summary)
: Normal
: No
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   14.0    20.2    24.0    24.3    28.8    45.0 
--------------------------------------------------------------------------------------- 
: None
: No
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   14.0    19.0    22.0    22.3    25.0    36.0 
--------------------------------------------------------------------------------------- 
: Many
: No
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   16.0    20.2    26.0    25.5    31.8    33.0 
--------------------------------------------------------------------------------------- 
: Normal
: Yes
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   18.0    21.0    24.0    25.6    29.5    35.0 
--------------------------------------------------------------------------------------- 
: None
: Yes
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   14.0    18.0    20.0    21.5    25.0    34.0 
--------------------------------------------------------------------------------------- 
: Many
: Yes
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   17.0    20.2    23.0    23.8    27.2    32.0 
## Simpler expression of the previous call: with() evaluates by() using
## the columns of lbw, so the lbw$ prefix is not needed.
with(lbw, by(data = age, INDICES = list(ftv.cat, smoke), FUN = summary))
: Normal
: No
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   14.0    20.2    24.0    24.3    28.8    45.0 
--------------------------------------------------------------------------------------- 
: None
: No
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   14.0    19.0    22.0    22.3    25.0    36.0 
--------------------------------------------------------------------------------------- 
: Many
: No
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   16.0    20.2    26.0    25.5    31.8    33.0 
--------------------------------------------------------------------------------------- 
: Normal
: Yes
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   18.0    21.0    24.0    25.6    29.5    35.0 
--------------------------------------------------------------------------------------- 
: None
: Yes
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   14.0    18.0    20.0    21.5    25.0    34.0 
--------------------------------------------------------------------------------------- 
: Many
: Yes
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   17.0    20.2    23.0    23.8    27.2    32.0 

doBy::summaryBy()

library(doBy)
## Mean of age grouped by ftv.cat.
## Formula interface: summarized variable ~ grouping variable.
summaryBy(age ~ ftv.cat, data = lbw, FUN = mean)
  ftv.cat age.mean
1  Normal    24.69
2    None    21.95
3    Many    24.67
## summary() of age grouped by ftv.cat.
## Each value returned by FUN is printed as its own column.
summaryBy(age ~ ftv.cat, data = lbw, FUN = summary)
  ftv.cat age.Min. age.1st Qu. age.Median age.Mean age.3rd Qu. age.Max.
1  Normal       14          21         24     24.7        29.0       45
2    None       14          19         21     22.0        25.0       36
3    Many       16          20         23     24.7        31.2       33
## summary() of age grouped by ftv.cat and smoke.
## Additional grouping variables are joined with + on the right-hand side.
summaryBy(age ~ ftv.cat + smoke, data = lbw, FUN = summary)
  ftv.cat smoke age.Min. age.1st Qu. age.Median age.Mean age.3rd Qu. age.Max.
1  Normal    No       14        20.2         24     24.3        28.8       45
2  Normal   Yes       18        21.0         24     25.6        29.5       35
3    None    No       14        19.0         22     22.3        25.0       36
4    None   Yes       14        18.0         20     21.5        25.0       34
5    Many    No       16        20.2         26     25.5        31.8       33
6    Many   Yes       17        20.2         23     23.8        27.2       32

plyr::ddply()

library(plyr)
## Average age per ftv.cat level, reported in a column named mean.of.age
ddply(
    lbw,
    .variables = "ftv.cat",
    .fun = summarize,
    mean.of.age = mean(age)
)
  ftv.cat mean.of.age
1  Normal       24.69
2    None       21.95
3    Many       24.67
## Average age per ftv.cat and smoke combination, reported as mean.of.age
ddply(
    lbw,
    .variables = c("ftv.cat", "smoke"),
    .fun = summarize,
    mean.of.age = mean(age)
)
  ftv.cat smoke mean.of.age
1  Normal    No       24.31
2  Normal   Yes       25.57
3    None    No       22.33
4    None   Yes       21.49
5    Many    No       25.50
6    Many   Yes       23.83