Data wrangling with dplyr

Examples of how to use dplyr for data wrangling.

load the dplyr library

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrr)

## Warning: package 'corrr' was built under R version 3.5.2

looking at the data

library(datasets)
# Bring the built-in iris data set into the workspace and preview its first rows.
data(iris)
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Compact overview of the data frame: dimensions, column types, first values.
str(object = iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Using the “select” and “filter” functions

# select() keeps only the named columns; here, the two sepal measurements.
sepal <- select(iris, Sepal.Length, Sepal.Width)
head(sepal, n = 3)

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2

now let’s use “filter”

# filter() keeps only the rows for which the condition is TRUE.
virginica <- filter(iris, Species == "virginica")
head(virginica, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 1          6.3         3.3          6.0         2.5 virginica
## 2          5.8         2.7          5.1         1.9 virginica
## 3          7.1         3.0          5.9         2.1 virginica
## 4          6.3         2.9          5.6         1.8 virginica
## 5          6.5         3.0          5.8         2.2 virginica
## 6          7.6         3.0          6.6         2.1 virginica

# Show the last six virginica rows.
tail(virginica, n = 6L)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 45          6.7         3.3          5.7         2.5 virginica
## 46          6.7         3.0          5.2         2.3 virginica
## 47          6.3         2.5          5.0         1.9 virginica
## 48          6.5         3.0          5.2         2.0 virginica
## 49          6.2         3.4          5.4         2.3 virginica
## 50          5.9         3.0          5.1         1.8 virginica

“select” and “filter” can be used together

# Filter rows first (Species is still needed for the condition), then keep
# only the sepal columns.
setosa.sepal <- select(
  filter(iris, Species == "setosa"),
  Sepal.Length, Sepal.Width
)
head(setosa.sepal, n = 6L)

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
## 6          5.4         3.9

“filter” can be used on multiple attributes, using “&” (and)

# Combine two conditions with `&`. Each comparison is its own expression, so
# the closing quote must come right after "setosa"; quoting the whole
# condition would compare Species against one long string and match no rows.
setosa.5.2 <- iris %>%
  filter(Species == "setosa" & Sepal.Length >= 5.2)
setosa.5.2

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.4         3.9          1.7         0.4  setosa
## 2           5.4         3.7          1.5         0.2  setosa
## 3           5.8         4.0          1.2         0.2  setosa
## 4           5.7         4.4          1.5         0.4  setosa
## 5           5.4         3.9          1.3         0.4  setosa
## 6           5.7         3.8          1.7         0.3  setosa
## 7           5.4         3.4          1.7         0.2  setosa
## 8           5.2         3.5          1.5         0.2  setosa
## 9           5.2         3.4          1.4         0.2  setosa
## 10          5.4         3.4          1.5         0.4  setosa
## 11          5.2         4.1          1.5         0.1  setosa
## 12          5.5         4.2          1.4         0.2  setosa
## 13          5.5         3.5          1.3         0.2  setosa
## 14          5.3         3.7          1.5         0.2  setosa

Using “|” (or)

# `|` keeps a row when either comparison is TRUE.
iris5.2.5.9 <- filter(iris, Sepal.Length == 5.9 | Sepal.Length == 5.2)
iris5.2.5.9

##   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1          5.2         3.5          1.5         0.2     setosa
## 2          5.2         3.4          1.4         0.2     setosa
## 3          5.2         4.1          1.5         0.1     setosa
## 4          5.2         2.7          3.9         1.4 versicolor
## 5          5.9         3.0          4.2         1.5 versicolor
## 6          5.9         3.2          4.8         1.8 versicolor
## 7          5.9         3.0          5.1         1.8  virginica

dplyr allows you to rename variables as you subset the data

# `new = old` inside select() renames the column while selecting it.
iris1 <- select(iris, sep.len = Sepal.Length, Sepal.Width)
head(iris1, n = 3)

##   sep.len Sepal.Width
## 1     5.1         3.5
## 2     4.9         3.0
## 3     4.7         3.2

Within “select” we have many helper options, such as starts_with(), ends_with(), and contains()

# Keep every column whose name begins with "Pe".
iris.pe <- select(iris, starts_with("Pe"))
head(iris.pe, n = 3)

##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2

# Keep every column whose name contains the substring "tal".
iris.tal <- select(iris, contains("tal"))
head(iris.tal, n = 3)

##   Petal.Length Petal.Width
## 1          1.4         0.2
## 2          1.4         0.2
## 3          1.3         0.2

One of my favorites: the “mutate” function, which adds and computes new variables. Let’s add a variable sepal.class (long = Sepal.Length is greater than 5; short = otherwise)

# Add a label column: "long" when Sepal.Length exceeds 5, "short" otherwise.
iris2 <- mutate(
  iris,
  sepal.class = ifelse(Sepal.Length > 5, "long", "short")
)
head(iris2, n = 12)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1           5.1         3.5          1.4         0.2  setosa        long
## 2           4.9         3.0          1.4         0.2  setosa       short
## 3           4.7         3.2          1.3         0.2  setosa       short
## 4           4.6         3.1          1.5         0.2  setosa       short
## 5           5.0         3.6          1.4         0.2  setosa       short
## 6           5.4         3.9          1.7         0.4  setosa        long
## 7           4.6         3.4          1.4         0.3  setosa       short
## 8           5.0         3.4          1.5         0.2  setosa       short
## 9           4.4         2.9          1.4         0.2  setosa       short
## 10          4.9         3.1          1.5         0.1  setosa       short
## 11          5.4         3.7          1.5         0.2  setosa        long
## 12          4.8         3.4          1.6         0.2  setosa       short

The mutate function can take many arguments, so you can add many variables with one command

# Add two product columns in a single mutate() call.
iris3 <- mutate(
  iris2,
  sepal.int = Sepal.Length * Sepal.Width,
  petal.int = Petal.Length * Petal.Width
)
head(iris3, n = 4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1          5.1         3.5          1.4         0.2  setosa        long
## 2          4.9         3.0          1.4         0.2  setosa       short
## 3          4.7         3.2          1.3         0.2  setosa       short
## 4          4.6         3.1          1.5         0.2  setosa       short
##   sepal.int petal.int
## 1     17.85      0.28
## 2     14.70      0.28
## 3     15.04      0.26
## 4     14.26      0.30

Let’s look at the correlation between these two new variables, using package corrr

# Correlate the two product columns with corrr::correlate().
correlate(select(iris3, sepal.int, petal.int))

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

## # A tibble: 2 x 3
##   rowname   sepal.int petal.int
##   <chr>         <dbl>     <dbl>
## 1 sepal.int    NA         0.455
## 2 petal.int     0.455    NA

Using the “arrange” function

# Sort the rows by Sepal.Length (ascending by default).
iris4 <- arrange(iris3, Sepal.Length)
head(iris4, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1          4.3         3.0          1.1         0.1  setosa       short
## 2          4.4         2.9          1.4         0.2  setosa       short
## 3          4.4         3.0          1.3         0.2  setosa       short
## 4          4.4         3.2          1.3         0.2  setosa       short
## 5          4.5         2.3          1.3         0.3  setosa       short
## 6          4.6         3.1          1.5         0.2  setosa       short
##   sepal.int petal.int
## 1     12.90      0.11
## 2     12.76      0.28
## 3     13.20      0.26
## 4     14.08      0.26
## 5     10.35      0.39
## 6     14.26      0.30

# The largest Sepal.Length values end up at the bottom of the sorted data.
tail(iris4, n = 6L)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          7.6         3.0          6.6         2.1 virginica
## 146          7.7         3.8          6.7         2.2 virginica
## 147          7.7         2.6          6.9         2.3 virginica
## 148          7.7         2.8          6.7         2.0 virginica
## 149          7.7         3.0          6.1         2.3 virginica
## 150          7.9         3.8          6.4         2.0 virginica
##     sepal.class sepal.int petal.int
## 145        long     22.80     13.86
## 146        long     29.26     14.74
## 147        long     20.02     15.87
## 148        long     21.56     13.40
## 149        long     23.10     14.03
## 150        long     30.02     12.80

arrange() can be used with multiple arguments

# Sort by sepal.class first; rows with the same class are ordered by Petal.Length.
iris5 <- arrange(iris2, sepal.class, Petal.Length)
head(iris5, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1          5.8         4.0          1.2         0.2  setosa        long
## 2          5.4         3.9          1.3         0.4  setosa        long
## 3          5.5         3.5          1.3         0.2  setosa        long
## 4          5.1         3.5          1.4         0.2  setosa        long
## 5          5.1         3.5          1.4         0.3  setosa        long
## 6          5.2         3.4          1.4         0.2  setosa        long

# Last six rows of the two-key sort.
tail(iris5, n = 6L)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 145          5.0         3.5          1.6         0.6     setosa
## 146          4.8         3.4          1.9         0.2     setosa
## 147          4.9         2.4          3.3         1.0 versicolor
## 148          5.0         2.3          3.3         1.0 versicolor
## 149          5.0         2.0          3.5         1.0 versicolor
## 150          4.9         2.5          4.5         1.7  virginica
##     sepal.class
## 145       short
## 146       short
## 147       short
## 148       short
## 149       short
## 150       short

The above manipulation can also be done with desc() to arrange in descending order

# Same grouping key, but Petal.Length now runs from largest to smallest.
iris6 <- arrange(iris2, sepal.class, desc(Petal.Length))
head(iris6, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width   Species sepal.class
## 1          7.7         2.6          6.9         2.3 virginica        long
## 2          7.7         3.8          6.7         2.2 virginica        long
## 3          7.7         2.8          6.7         2.0 virginica        long
## 4          7.6         3.0          6.6         2.1 virginica        long
## 5          7.9         3.8          6.4         2.0 virginica        long
## 6          7.3         2.9          6.3         1.8 virginica        long

# Last six rows of the descending sort.
tail(iris6, n = 6L)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 145          5.0         3.5          1.3         0.3  setosa       short
## 146          4.5         2.3          1.3         0.3  setosa       short
## 147          4.4         3.2          1.3         0.2  setosa       short
## 148          5.0         3.2          1.2         0.2  setosa       short
## 149          4.3         3.0          1.1         0.1  setosa       short
## 150          4.6         3.6          1.0         0.2  setosa       short

Using the summarize() function to summarize information

# Collapse the whole data set to one row: mean and standard deviation of
# Sepal.Length.
sum.info <- summarise(
  iris,
  M.sepl = mean(Sepal.Length),
  SD.sepl = sd(Sepal.Length)
)
sum.info

##     M.sepl   SD.sepl
## 1 5.843333 0.8280661

Using the group_by() function

# One summary row per species: mean and variance of Sepal.Length.
sum.info2 <- summarise(
  group_by(iris, Species),
  M.mean = mean(Sepal.Length),
  V.var = var(Sepal.Length)
)
sum.info2

## # A tibble: 3 x 3
##   Species    M.mean V.var
##   <fct>       <dbl> <dbl>
## 1 setosa       5.01 0.124
## 2 versicolor   5.94 0.266
## 3 virginica    6.59 0.404

Handling longitudinal or multilevel data. Looking at the sleep deprivation data stored in wide format

Data wrangling with dplyr

J. Mess