May 6, 2015

Overview

  • Sequence of steps
  • Each step takes a data object
  • Each step returns a modified object

Cooking

  1. Preheat oven
  2. Combine
  • flour
  • sugar
  • baking powder
  • salt
  3. Add
  • eggs
  • milk
  • vanilla

Shell

# Three-step pipeline: keep dictionary lines containing "Jim", shuffle them,
# then print only the first line of the shuffled result.
grep Jim /usr/share/dict/words | shuf | head -n 1

Base R

# Nested-call form of a three-step sequence; it has to be read from the
# innermost call outwards: mtcars -> sample() -> subset() -> head().
# NOTE(review): sample() applied to a data frame permutes its columns, not
# its rows, so the two rows returned are always the first two of mtcars.
# Confirm whether a row shuffle (the analogue of `shuf`) was intended.
head(n = 2,
     subset(select=c(gear, cyl),
            sample(
                   mtcars
                   )
            )
     )
##               gear cyl
## Mazda RX4        4   6
## Mazda RX4 Wag    4   6

Magrittr

library(magrittr)
# Pipeline form, read top to bottom: each `%>%` passes the value on its left
# as the first argument of the call on its right.
# NOTE(review): sample() on a data frame permutes columns, not rows, so the
# rows kept by head() are always the first two of mtcars - confirm intent.
mtcars %>%
  sample %>%
  subset(select=c(gear, cyl)) %>%
  head(n=2)
##               gear cyl
## Mazda RX4        4   6
## Mazda RX4 Wag    4   6

Magrittr - placeholder

# Deliberately failing example: the pipe inserts mtcars as the FIRST argument
# of lm(), so the formula `cyl ~ gear` ends up in the `data` position and
# cannot be coerced to a data frame.
mtcars %>%
  lm(cyl ~ gear)
## Error in as.data.frame.default(data): cannot coerce class "formula" to a data.frame
# Working version: the `.` placeholder tells the pipe where to put the
# left-hand side, here the `data` argument instead of the first argument.
mtcars %>%
  lm(cyl ~ gear, data = .)
## 
## Call:
## lm(formula = cyl ~ gear, data = .)
## 
## Coefficients:
## (Intercept)         gear  
##      10.585       -1.193

Magrittr - braces

# A braced block on the right of `%>%` is evaluated with `.` bound to the
# piped-in value, so several statements can use it. Here the first two and
# last two rows of mtcars are stacked and then summarised.
# The intermediate names deliberately avoid `T`, which is R's built-in
# shorthand for TRUE and should not be reassigned.
mtcars %>%
  {
    first_rows <- head(., 2)
    last_rows <- tail(., 2)
    rbind(first_rows, last_rows)
  } %>%
  summary()
##       mpg            cyl           disp             hp       
##  Min.   :15.0   Min.   :4.0   Min.   :121.0   Min.   :109.0  
##  1st Qu.:19.5   1st Qu.:5.5   1st Qu.:150.2   1st Qu.:109.8  
##  Median :21.0   Median :6.0   Median :160.0   Median :110.0  
##  Mean   :19.6   Mean   :6.0   Mean   :185.5   Mean   :166.0  
##  3rd Qu.:21.1   3rd Qu.:6.5   3rd Qu.:195.2   3rd Qu.:166.2  
##  Max.   :21.4   Max.   :8.0   Max.   :301.0   Max.   :335.0  
##       drat             wt             qsec             vs      
##  Min.   :3.540   Min.   :2.620   Min.   :14.60   Min.   :0.00  
##  1st Qu.:3.810   1st Qu.:2.740   1st Qu.:15.99   1st Qu.:0.00  
##  Median :3.900   Median :2.828   Median :16.74   Median :0.00  
##  Mean   :3.862   Mean   :2.961   Mean   :16.67   Mean   :0.25  
##  3rd Qu.:3.953   3rd Qu.:3.049   3rd Qu.:17.41   3rd Qu.:0.25  
##  Max.   :4.110   Max.   :3.570   Max.   :18.60   Max.   :1.00  
##        am         gear           carb    
##  Min.   :1   Min.   :4.00   Min.   :2.0  
##  1st Qu.:1   1st Qu.:4.00   1st Qu.:3.5  
##  Median :1   Median :4.00   Median :4.0  
##  Mean   :1   Mean   :4.25   Mean   :4.5  
##  3rd Qu.:1   3rd Qu.:4.25   3rd Qu.:5.0  
##  Max.   :1   Max.   :5.00   Max.   :8.0

Magrittr - building pipelines

# Reusable pipeline: a chain that starts with `.` is not evaluated
# immediately; magrittr returns a one-argument function instead.
# outer_summary(x) stacks the first two and last two rows of `x` and
# returns summary() of those four rows.
# The intermediate names deliberately avoid `T`, which is R's built-in
# shorthand for TRUE and should not be reassigned.
outer_summary <- . %>% {
    first_rows <- head(., 2)
    last_rows <- tail(., 2)
    rbind(first_rows, last_rows)
  } %>%
  summary()

# Call it like any other function.
outer_summary(mtcars)
##       mpg            cyl           disp             hp       
##  Min.   :15.0   Min.   :4.0   Min.   :121.0   Min.   :109.0  
##  1st Qu.:19.5   1st Qu.:5.5   1st Qu.:150.2   1st Qu.:109.8  
##  Median :21.0   Median :6.0   Median :160.0   Median :110.0  
##  Mean   :19.6   Mean   :6.0   Mean   :185.5   Mean   :166.0  
##  3rd Qu.:21.1   3rd Qu.:6.5   3rd Qu.:195.2   3rd Qu.:166.2  
##  Max.   :21.4   Max.   :8.0   Max.   :301.0   Max.   :335.0  
##       drat             wt             qsec             vs      
##  Min.   :3.540   Min.   :2.620   Min.   :14.60   Min.   :0.00  
##  1st Qu.:3.810   1st Qu.:2.740   1st Qu.:15.99   1st Qu.:0.00  
##  Median :3.900   Median :2.828   Median :16.74   Median :0.00  
##  Mean   :3.862   Mean   :2.961   Mean   :16.67   Mean   :0.25  
##  3rd Qu.:3.953   3rd Qu.:3.049   3rd Qu.:17.41   3rd Qu.:0.25  
##  Max.   :4.110   Max.   :3.570   Max.   :18.60   Max.   :1.00  
##        am         gear           carb    
##  Min.   :1   Min.   :4.00   Min.   :2.0  
##  1st Qu.:1   1st Qu.:4.00   1st Qu.:3.5  
##  Median :1   Median :4.00   Median :4.0  
##  Mean   :1   Mean   :4.25   Mean   :4.5  
##  3rd Qu.:1   3rd Qu.:4.25   3rd Qu.:5.0  
##  Max.   :1   Max.   :5.00   Max.   :8.0

# The stored pipeline is an ordinary function, so it can itself be used as a
# pipeline step on a different data set.
iris %>% outer_summary
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.900   Min.   :3.000   Min.   :1.400   Min.   :0.200  
##  1st Qu.:5.050   1st Qu.:3.000   1st Qu.:1.400   1st Qu.:0.200  
##  Median :5.500   Median :3.200   Median :3.250   Median :1.000  
##  Mean   :5.525   Mean   :3.225   Mean   :3.325   Mean   :1.125  
##  3rd Qu.:5.975   3rd Qu.:3.425   3rd Qu.:5.175   3rd Qu.:1.925  
##  Max.   :6.200   Max.   :3.500   Max.   :5.400   Max.   :2.300  
##        Species 
##  setosa    :2  
##  versicolor:0  
##  virginica :2  
##                
##                
## 

Magrittr - Tee

# Shell analogy: tee writes the listing to the file `files` and also passes
# it along unchanged to head.
ls | tee files | head
# Tee operator: `%T>%` runs its right-hand side for the side effect only and
# forwards its LEFT-hand side, so colSums() receives the 100 x 2 matrix
# rather than whatever plot() returns.
rnorm(200) %>%
  matrix(ncol = 2) %T>%
  plot %>% # called for its side effect (drawing); its return value is not what we want to pass on.
  colSums

## [1] -20.173387  -3.261714

Magrittr - Exposition

# Rows of mtcars whose mpg exceeds 1.5 times the mean mpg; subset() lets the
# condition refer to columns by bare name.
subset(mtcars, mpg > mean(mpg) * 1.5)
##                 mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Fiat 128       32.4   4 78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic    30.4   4 75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla 33.9   4 71.1  65 4.22 1.835 19.90  1  1    4    1
## Lotus Europa   30.4   4 95.1 113 3.77 1.513 16.90  1  1    5    2
# Same selection written as a pipe: mtcars becomes subset()'s first argument.
mtcars %>% subset(mpg > mean(mpg) * 1.5)
##                 mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Fiat 128       32.4   4 78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic    30.4   4 75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla 33.9   4 71.1  65 4.22 1.835 19.90  1  1    4    1
## Lotus Europa   30.4   4 95.1 113 3.77 1.513 16.90  1  1    5    2

Magrittr - Exposition

# Exposition operator: cor() takes vectors and has no data argument, so
# `%$%` exposes the columns of the left-hand data frame by name to the
# right-hand expression, letting cor() refer to mpg and hp directly.
mtcars %>%
  subset(mpg > mean(mpg) * 1.5) %$%
  cor(mpg, hp)
## [1] -0.3475577

dplyr

  • magrittr co-evolved with dplyr
library(nycflights13)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Nested-call version; read it from the innermost filter() outwards:
#   1. drop rows of `flights` with a missing dep_delay,
#   2. group by year and month,
#   3. compute the mean dep_delay and the row count per group,
#   4. keep only groups with more than 10 rows.
# NOTE(review): the name says "hourly" but the grouping is by year and
# month - confirm which was intended.
hourly_delay <- filter(
  summarise(
    group_by(
      filter(
        flights,
        !is.na(dep_delay)
        ),
      year, month),
    delay = mean(dep_delay),
    n = n()
    ),
  n > 10
  )

# Piped version of the same computation; the steps now appear in the order
# they run: drop missing dep_delay, group by year and month, summarise to
# mean delay and row count, keep groups with more than 10 rows.
# NOTE(review): the name says "hourly" but the grouping is by year and
# month - confirm which was intended.
hourly_delay <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(year, month) %>%
  summarise(
    delay = mean(dep_delay),
    n = n()
  ) %>% filter(n > 10)

ggplot2

  • ggplot2 older than magrittr
  • core principles same, build up a plot object with discrete steps
library(ggplot2)
# A plot is built up step by step with `+`: start from the data, add the
# aesthetic mapping (hp on x, disp on y), then add a point layer.
ggplot(mtcars) +
  aes(x = hp, y = disp) +
  geom_point()

# Same plot with one more step appended: a smoothing layer on top of the
# points.
ggplot(mtcars) +
  aes(x = hp, y = disp) +
  geom_point() +
  geom_smooth()

ggplot2

# Data pipeline feeding a plot pipeline: keep rows with hp > 150, then pass
# the result to ggplot() as its data. `%>%` binds more tightly than `+`, so
# the piping finishes first and the `+` steps are added to the resulting
# plot object.
mtcars %>%
  filter(hp > 150) %>%
  ggplot() +
    aes(x = hp, y = disp) +
    geom_point() +
    geom_smooth()

Key insights

  • Each step simple and atomic
  • Common interface/API throughout
  • Small set of verbs; Hadley's APIs have become simpler (and less powerful) over time
  • plyr > dplyr
  • reshape > reshape2 > tidyr