Rob Schick
27 March 2014
plyr
dplyr is better
plyr to dplyr
dplyr
plyr package (Wickham 2011)
base::split(), base::*apply(), base::rbind()
plyr eliminates extra code & illuminates key components of the code
pieces <- split(ozonedf, list(ozonedf$lat, ozonedf$long))
# Base-R split/apply/combine: apply deseasf_df to every piece; the results are
# treated as fitted models below (resid() is called on each one).
models <- lapply(pieces, deseasf_df)
# Pair each model with its piece and attach the residuals to the location
# columns. SIMPLIFY = FALSE keeps the result a plain list of data frames; by
# default mapply() collapses equal-shaped results into a list-matrix, which
# rbind cannot stack into the intended data frame.
results <- mapply(function(model, df) {
  # rep(1, 72) repeats the first row of the piece 72 times
  # (presumably one row per residual -- confirm against deseasf_df).
  cbind(df[rep(1, 72), c("lat", "long")], resid(model))
}, models, pieces, SIMPLIFY = FALSE)
# Stack the per-piece data frames into a single data frame.
deseasdf <- do.call("rbind", results)
# plyr version: split ozonedf by lat and long, apply deseasf_df to each piece
# and keep the results as a list (data frame in, list out).
models <- dlply(ozonedf, .(lat, long), deseasf_df)
# Apply resid to every list element and combine the results into one data frame
# (list in, data frame out).
deseas <- ldply(models, resid)
| Input\Output | Array | Data frame | List | Discarded |
|---|---|---|---|---|
| Array | aaply | adply | alply | a_ply |
| Data frame | daply | ddply | dlply | d_ply |
| List | laply | ldply | llply | l_ply |
# calculate mean runs batted in (that's baseball speak) for each year
tail(ddply(baseball, .(year), summarise, mean_rbi = mean(rbi, na.rm = TRUE)))
year mean_rbi
132 2002 28.24
133 2003 24.44
134 2004 24.89
135 2005 21.33
136 2006 18.07
137 2007 18.83
plyr or standard R operations (most processing is done in parallel in C++)
inds <- unique(Batting$playerID)
# Baseline approach: total the runs (column R) for each player with an explicit
# for loop. The loop is kept on purpose: it is the slow reference case listed
# as "for(i in ...)" in the timing comparison.
totals <- data.frame(ID = inds, runs = rep(NA, length(inds)))
# seq_along() gives an empty sequence when inds is empty, whereas
# 1:length(inds) would iterate over c(1, 0).
for (i in seq_along(inds)) {
  totals$runs[i] <- sum(Batting[Batting$playerID == inds[i], "R"], na.rm = TRUE)
}
# Show the three players with the most runs. The rows must come from totals,
# the same object the ordering vector is computed from.
head(totals[order(-totals$runs), ], 3)
library(Lahman)
# Sum runs (R) per player through the formula interface of aggregate(),
# restricted to the two columns that are needed.
totals <- aggregate(
  . ~ playerID,
  data = Batting[, c("playerID", "R")],
  FUN = sum
)
# Row positions ordered from the most runs to the fewest.
ranks <- order(-totals$R)
# Print the top three players.
totals[ranks[seq_len(3)], ]
playerID R
6733 henderi01 2295
2869 cobbty01 2246
1398 bondsba01 2227
# plyr version: split Batting by playerID and sum R within each piece;
# na.rm = TRUE leaves missing values out of the sum.
runs <- ddply(Batting, .(playerID), summarise, total_runs = sum(R, na.rm = TRUE))
# Show the three rows with the largest total_runs.
head(runs[order(-runs$total_runs),],3)
playerID total_runs
6967 henderi01 2295
2973 cobbty01 2246
1446 bondsba01 2227
# dplyr version of the per-player run totals: group, sum, sort descending and
# keep the top three. na.rm = TRUE matches the for-loop and ddply versions, so
# a missing run count cannot turn a player's total into NA.
Batting %.%
  group_by(playerID) %.%
  summarise(total = sum(R, na.rm = TRUE)) %.%
  arrange(desc(total)) %.%
  head(3)
Source: local data frame [3 x 2]
playerID total
1 henderi01 2295
2 cobbty01 2246
3 bondsba01 2227
| Method | Proc Time (s) | Speed Up |
|---|---|---|
| for(i in ...) | 80.74 | |
| plyr | 8.191 | 9-10x faster |
| aggregate | 0.298 | 270x faster |
| dplyr | 0.044 | 1835x faster |
filter()
arrange()
select()
mutate()
summarise()
M$-Excel's filter tool:
filter(hflights_df, Month == 12, DayofMonth == 25)
filter(hflights_df, Month == 12, DayofMonth == 25)
hflights_df[which(hflights_df$Month == 12 & hflights_df$DayofMonth == 25),]
filter(): 0.03119 seconds
which(): 0.04858 seconds
Difference: 0.01739 seconds
arrange(hflights_df, desc(ArrDelay))
order() with less typing. Compare the first and second lines:
hflights[order(hflights$ArrDelay, hflights$DayofMonth, hflights$Month),]
arrange(hflights_df, ArrDelay, DayofMonth, Month)
select() allows you to rapidly zoom in on a useful subset of a data frame. This works similarly to base::subset().
select(hflights_df, Month, DayofMonth)
mutate(hflights_df,
gain = ArrDelay - DepDelay,
speed = Distance / AirTime * 60)
summarise(hflights_df,
delay = mean(DepDelay, na.rm = TRUE))
Source: local data frame [1 x 1]
delay
1 9.445
hflights package, which has all flights from Houston, TX in 2011 (n = 227496)
In dplyr, the group_by() function tells R how to break a dataset down into groups of rows
planes <- group_by(hflights_df, TailNum)
# Summarise each group of planes: number of rows, mean Distance and mean
# ArrDelay (missing values are left out of both means).
delay <- summarise(planes,
count = n(),
dist = mean(Distance, na.rm = TRUE),
delay = mean(ArrDelay, na.rm = TRUE))
# Keep only groups with more than 20 rows and a mean distance below 2000.
delay <- filter(delay, count > 20, dist < 2000)
The dplyr API is functional in the sense that function calls don't have side-effects & you must always save the results
a1 <- group_by(hflights, Year, Month, DayofMonth)
# Step-by-step form: the result of each verb is saved in its own object.
# Keep the columns Year through DayofMonth plus the two delay columns.
a2 <- select(a1, Year:DayofMonth, ArrDelay, DepDelay)
# Mean arrival and departure delay (missing values left out).
a3 <- summarise(a2,
arr = mean(ArrDelay, na.rm = TRUE),
dep = mean(DepDelay, na.rm = TRUE))
# Keep rows where either mean delay is above 30.
a4 <- filter(a3, arr > 30 | dep > 30)
# The same operations written as nested calls, with no intermediate objects.
# Read from the inside out: group_by, then select, then summarise, then filter.
filter(
summarise(
select(
group_by(hflights, Year, Month, DayofMonth),
Year:DayofMonth, ArrDelay, DepDelay
),
arr = mean(ArrDelay, na.rm = TRUE),
dep = mean(DepDelay, na.rm = TRUE)
),
arr > 30 | dep > 30
)
dplyr offers the %.% operator
hflights %.%
group_by() %.%
select() %.%
summarise() %.%
filter()
demoCode.RdemoCode.R)demoCode.R)
data %.%
group_by() %.%
select() %.%
summarise() %.%
mutate() %.%
filter() %.%
arrange()