The Blaggers’ Guide to…dplyr

dplyr has a number of functions which are useful for manipulating data.frames.
Here, I will look at:

Our data is a 12 item subset of the iris dataset

data(iris)
iris2 <- head(iris,12)
iris2
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa

Filter

Return rows with matching conditions.

filter(iris2, Sepal.Length > 5)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          5.4         3.9          1.7         0.4  setosa
## 3          5.4         3.7          1.5         0.2  setosa

Select

Select variables by name.

select(iris2, Sepal.Length)
##    Sepal.Length
## 1           5.1
## 2           4.9
## 3           4.7
## 4           4.6
## 5           5.0
## 6           5.4
## 7           4.6
## 8           5.0
## 9           4.4
## 10          4.9
## 11          5.4
## 12          4.8
select(iris2, contains("etal"))
##    Petal.Length Petal.Width
## 1           1.4         0.2
## 2           1.4         0.2
## 3           1.3         0.2
## 4           1.5         0.2
## 5           1.4         0.2
## 6           1.7         0.4
## 7           1.4         0.3
## 8           1.5         0.2
## 9           1.4         0.2
## 10          1.5         0.1
## 11          1.5         0.2
## 12          1.6         0.2
select(iris2, ends_with("Width"))
##    Sepal.Width Petal.Width
## 1          3.5         0.2
## 2          3.0         0.2
## 3          3.2         0.2
## 4          3.1         0.2
## 5          3.6         0.2
## 6          3.9         0.4
## 7          3.4         0.3
## 8          3.4         0.2
## 9          2.9         0.2
## 10         3.1         0.1
## 11         3.7         0.2
## 12         3.4         0.2

Rename

Change the name of a variable and keep the rest.

rename(iris2, Type=Species)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Type
## 1           5.1         3.5          1.4         0.2 setosa
## 2           4.9         3.0          1.4         0.2 setosa
## 3           4.7         3.2          1.3         0.2 setosa
## 4           4.6         3.1          1.5         0.2 setosa
## 5           5.0         3.6          1.4         0.2 setosa
## 6           5.4         3.9          1.7         0.4 setosa
## 7           4.6         3.4          1.4         0.3 setosa
## 8           5.0         3.4          1.5         0.2 setosa
## 9           4.4         2.9          1.4         0.2 setosa
## 10          4.9         3.1          1.5         0.1 setosa
## 11          5.4         3.7          1.5         0.2 setosa
## 12          4.8         3.4          1.6         0.2 setosa

Mutate

Add new variables and preserve existing variables.

mutate(iris2, Sepal.Measure = (Sepal.Length+Sepal.Width)/2)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Measure
## 1           5.1         3.5          1.4         0.2  setosa          4.30
## 2           4.9         3.0          1.4         0.2  setosa          3.95
## 3           4.7         3.2          1.3         0.2  setosa          3.95
## 4           4.6         3.1          1.5         0.2  setosa          3.85
## 5           5.0         3.6          1.4         0.2  setosa          4.30
## 6           5.4         3.9          1.7         0.4  setosa          4.65
## 7           4.6         3.4          1.4         0.3  setosa          4.00
## 8           5.0         3.4          1.5         0.2  setosa          4.20
## 9           4.4         2.9          1.4         0.2  setosa          3.65
## 10          4.9         3.1          1.5         0.1  setosa          4.00
## 11          5.4         3.7          1.5         0.2  setosa          4.55
## 12          4.8         3.4          1.6         0.2  setosa          4.10

Transmute

Add new variables and drop existing variables.

transmute(iris2, Petal.Measure = (Petal.Length + Petal.Width) / 2)
##    Petal.Measure
## 1           0.80
## 2           0.80
## 3           0.75
## 4           0.85
## 5           0.80
## 6           1.05
## 7           0.85
## 8           0.85
## 9           0.80
## 10          0.80
## 11          0.85
## 12          0.90

Distinct

Select distinct rows

# Duplicate the final row for the purpose of our example
iris2 <- bind_rows(iris2, iris2[12,])
iris2
## Source: local data frame [13 x 5]
## 
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           (dbl)       (dbl)        (dbl)       (dbl)  (fctr)
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa
## 13          4.8         3.4          1.6         0.2  setosa
distinct(iris2)
## Source: local data frame [12 x 5]
## 
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           (dbl)       (dbl)        (dbl)       (dbl)  (fctr)
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 12          4.8         3.4          1.6         0.2  setosa

Arrange

Arrange rows by variables

arrange(iris2, Sepal.Length, Sepal.Width)
## Source: local data frame [13 x 5]
## 
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           (dbl)       (dbl)        (dbl)       (dbl)  (fctr)
## 1           4.4         2.9          1.4         0.2  setosa
## 2           4.6         3.1          1.5         0.2  setosa
## 3           4.6         3.4          1.4         0.3  setosa
## 4           4.7         3.2          1.3         0.2  setosa
## 5           4.8         3.4          1.6         0.2  setosa
## 6           4.8         3.4          1.6         0.2  setosa
## 7           4.9         3.0          1.4         0.2  setosa
## 8           4.9         3.1          1.5         0.1  setosa
## 9           5.0         3.4          1.5         0.2  setosa
## 10          5.0         3.6          1.4         0.2  setosa
## 11          5.1         3.5          1.4         0.2  setosa
## 12          5.4         3.7          1.5         0.2  setosa
## 13          5.4         3.9          1.7         0.4  setosa

Group_by and summarise

Group by - take an existing table and convert to grouped table where operations are performed by group.

Summarise - Summarise multiple values to a single value.

group_by(iris2, Sepal.Length>5) %>%
  summarise(mean(Sepal.Width))
## Source: local data frame [2 x 2]
## 
##   Sepal.Length > 5 mean(Sepal.Width)
##              (lgl)             (dbl)
## 1            FALSE              3.25
## 2             TRUE              3.70