Examples of how to use dplyr for data wrangling.
load the dplyr library
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrr)
## Warning: package 'corrr' was built under R version 3.5.2
looking at the data
library(datasets)
# Bring the built-in iris data frame into the workspace and preview its
# first six rows.
data(iris)
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Print a compact summary of iris: its dimensions, each column's type,
# and the first few values of every column.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Using the “select” and “filter” functions
# Keep only the two sepal measurement columns, then preview three rows.
sepal <- select(iris, Sepal.Length, Sepal.Width)
head(sepal, n = 3)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
now let’s use “filter”
# Keep only the rows whose Species is exactly "virginica".
virginica <- filter(iris, Species == "virginica")
head(virginica)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.3 3.3 6.0 2.5 virginica
## 2 5.8 2.7 5.1 1.9 virginica
## 3 7.1 3.0 5.9 2.1 virginica
## 4 6.3 2.9 5.6 1.8 virginica
## 5 6.5 3.0 5.8 2.2 virginica
## 6 7.6 3.0 6.6 2.1 virginica
# Show the last six rows of the virginica subset.
tail(virginica)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 45 6.7 3.3 5.7 2.5 virginica
## 46 6.7 3.0 5.2 2.3 virginica
## 47 6.3 2.5 5.0 1.9 virginica
## 48 6.5 3.0 5.2 2.0 virginica
## 49 6.2 3.4 5.4 2.3 virginica
## 50 5.9 3.0 5.1 1.8 virginica
“select” and “filter” can be used together
# Rows are filtered first (setosa only), then columns are narrowed to the
# two sepal measurements.
setosa.sepal <- select(
  filter(iris, Species == "setosa"),
  Sepal.Length, Sepal.Width
)
head(setosa.sepal)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
“filter” can be used on multiple attributes, using “&” (and)
# Keep setosa flowers whose sepal length is at least 5.2.
# Each side of `&` must be its own expression. Quoting the whole condition
# ("setosa & Sepal.Length >= 5.2") compares Species against one long literal
# string, which matches no rows and silently returns an empty data frame.
# NOTE: any "<0 rows>" output rendered after this chunk came from that
# earlier quoted form; re-knit the document to refresh it.
setosa.5.2 <- iris %>%
  filter(Species == "setosa" & Sepal.Length >= 5.2)
setosa.5.2
## [1] Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <0 rows> (or 0-length row.names)
Using “|” (or)
# Keep rows whose sepal length is exactly one of the two listed values
# (set membership is the "or" of the individual equality tests).
iris5.2.5.9 <- filter(iris, Sepal.Length %in% c(5.2, 5.9))
iris5.2.5.9
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.2 3.5 1.5 0.2 setosa
## 2 5.2 3.4 1.4 0.2 setosa
## 3 5.2 4.1 1.5 0.1 setosa
## 4 5.2 2.7 3.9 1.4 versicolor
## 5 5.9 3.0 4.2 1.5 versicolor
## 6 5.9 3.2 4.8 1.8 versicolor
## 7 5.9 3.0 5.1 1.8 virginica
dplyr allows you to rename variables as you subset the data
# `new = old` inside select() renames the column while selecting it.
iris1 <- select(iris, sep.len = Sepal.Length, Sepal.Width)
head(iris1, n = 3)
## sep.len Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
within “select” we have many helper options, such as starts_with(), ends_with(), and contains()
# Select every column whose name begins with "Pe".
iris.pe <- select(iris, starts_with("Pe"))
head(iris.pe, n = 3)
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
# Select every column whose name contains the substring "tal".
iris.tal <- select(iris, contains("tal"))
head(iris.tal, n = 3)
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
One of my favorites: the “mutate” function, which adds and computes new variables. Let’s add a variable sepal.class (long = Sepal.Length is greater than 5; short = otherwise)
# Add sepal.class: "long" when Sepal.Length is strictly greater than 5,
# "short" otherwise (so a length of exactly 5.0 is "short").
# The cutoff is the literal 5, not the sample mean of Sepal.Length
# (about 5.84).
iris2 <- mutate(
  iris,
  sepal.class = ifelse(Sepal.Length > 5, "long", "short")
)
head(iris2, n = 12)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1 5.1 3.5 1.4 0.2 setosa long
## 2 4.9 3.0 1.4 0.2 setosa short
## 3 4.7 3.2 1.3 0.2 setosa short
## 4 4.6 3.1 1.5 0.2 setosa short
## 5 5.0 3.6 1.4 0.2 setosa short
## 6 5.4 3.9 1.7 0.4 setosa long
## 7 4.6 3.4 1.4 0.3 setosa short
## 8 5.0 3.4 1.5 0.2 setosa short
## 9 4.4 2.9 1.4 0.2 setosa short
## 10 4.9 3.1 1.5 0.1 setosa short
## 11 5.4 3.7 1.5 0.2 setosa long
## 12 4.8 3.4 1.6 0.2 setosa short
The mutate function can take many arguments, so you can add many variables with one command
# Add two product columns in a single mutate() call:
# length x width for the sepal and for the petal.
iris3 <- mutate(
  iris2,
  sepal.int = Sepal.Length * Sepal.Width,
  petal.int = Petal.Length * Petal.Width
)
head(iris3, n = 4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1 5.1 3.5 1.4 0.2 setosa long
## 2 4.9 3.0 1.4 0.2 setosa short
## 3 4.7 3.2 1.3 0.2 setosa short
## 4 4.6 3.1 1.5 0.2 setosa short
## sepal.int petal.int
## 1 17.85 0.28
## 2 14.70 0.28
## 3 15.04 0.26
## 4 14.26 0.30
Let’s look at the correlation between these two new variables, using package corrr
# Correlation table for the two product columns only.
correlate(select(iris3, sepal.int, petal.int))
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 2 x 3
## rowname sepal.int petal.int
## <chr> <dbl> <dbl>
## 1 sepal.int NA 0.455
## 2 petal.int 0.455 NA
Using the “arrange” function
# Sort rows by Sepal.Length in ascending order (arrange()'s default).
iris4 <- arrange(iris3, Sepal.Length)
head(iris4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1 4.3 3.0 1.1 0.1 setosa short
## 2 4.4 2.9 1.4 0.2 setosa short
## 3 4.4 3.0 1.3 0.2 setosa short
## 4 4.4 3.2 1.3 0.2 setosa short
## 5 4.5 2.3 1.3 0.3 setosa short
## 6 4.6 3.1 1.5 0.2 setosa short
## sepal.int petal.int
## 1 12.90 0.11
## 2 12.76 0.28
## 3 13.20 0.26
## 4 14.08 0.26
## 5 10.35 0.39
## 6 14.26 0.30
# Show the last six rows of iris4; after the ascending sort these hold the
# largest Sepal.Length values.
tail(iris4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 7.6 3.0 6.6 2.1 virginica
## 146 7.7 3.8 6.7 2.2 virginica
## 147 7.7 2.6 6.9 2.3 virginica
## 148 7.7 2.8 6.7 2.0 virginica
## 149 7.7 3.0 6.1 2.3 virginica
## 150 7.9 3.8 6.4 2.0 virginica
## sepal.class sepal.int petal.int
## 145 long 22.80 13.86
## 146 long 29.26 14.74
## 147 long 20.02 15.87
## 148 long 21.56 13.40
## 149 long 23.10 14.03
## 150 long 30.02 12.80
arrange() can be used with multiple arguments
# Sort by sepal.class first; ties within a class are ordered by
# ascending Petal.Length.
iris5 <- arrange(iris2, sepal.class, Petal.Length)
head(iris5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1 5.8 4.0 1.2 0.2 setosa long
## 2 5.4 3.9 1.3 0.4 setosa long
## 3 5.5 3.5 1.3 0.2 setosa long
## 4 5.1 3.5 1.4 0.2 setosa long
## 5 5.1 3.5 1.4 0.3 setosa long
## 6 5.2 3.4 1.4 0.2 setosa long
# Show the last six rows of iris5: the end of the "short" class, where
# Petal.Length is largest within that class.
tail(iris5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 5.0 3.5 1.6 0.6 setosa
## 146 4.8 3.4 1.9 0.2 setosa
## 147 4.9 2.4 3.3 1.0 versicolor
## 148 5.0 2.3 3.3 1.0 versicolor
## 149 5.0 2.0 3.5 1.0 versicolor
## 150 4.9 2.5 4.5 1.7 virginica
## sepal.class
## 145 short
## 146 short
## 147 short
## 148 short
## 149 short
## 150 short
The above manipulation can also be done with desc() to arrange in descending order
# Same grouping by sepal.class, but Petal.Length now runs from largest to
# smallest within each class.
iris6 <- arrange(iris2, sepal.class, desc(Petal.Length))
head(iris6)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 1 7.7 2.6 6.9 2.3 virginica long
## 2 7.7 3.8 6.7 2.2 virginica long
## 3 7.7 2.8 6.7 2.0 virginica long
## 4 7.6 3.0 6.6 2.1 virginica long
## 5 7.9 3.8 6.4 2.0 virginica long
## 6 7.3 2.9 6.3 1.8 virginica long
# Show the last six rows of iris6: the end of the "short" class, where
# Petal.Length is smallest because of the descending sort.
tail(iris6)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species sepal.class
## 145 5.0 3.5 1.3 0.3 setosa short
## 146 4.5 2.3 1.3 0.3 setosa short
## 147 4.4 3.2 1.3 0.2 setosa short
## 148 5.0 3.2 1.2 0.2 setosa short
## 149 4.3 3.0 1.1 0.1 setosa short
## 150 4.6 3.6 1.0 0.2 setosa short
Using the summarize() function to summarize information
# Collapse the whole data set to one row holding the mean and standard
# deviation of Sepal.Length.
sum.info <- summarize(
  iris,
  M.sepl = mean(Sepal.Length),
  SD.sepl = sd(Sepal.Length)
)
sum.info
## M.sepl SD.sepl
## 1 5.843333 0.8280661
Using the group_by() function
# One summary row per species: mean and variance of Sepal.Length.
sum.info2 <- summarize(
  group_by(iris, Species),
  M.mean = mean(Sepal.Length),
  V.var = var(Sepal.Length)
)
sum.info2
## # A tibble: 3 x 3
## Species M.mean V.var
## <fct> <dbl> <dbl>
## 1 setosa 5.01 0.124
## 2 versicolor 5.94 0.266
## 3 virginica 6.59 0.404
Handling longitudinal or multilevel data. Looking at the sleep deprivation data stored in wide format