"_if", "_at" and "_all" suffixes
can be combined with several dplyr functions
to perform a single operation simultaneously on multiple columns.
March 7, 2018
"_if", "_at" and "_all" suffixes
can be combined with several dplyr functions
to perform a single operation simultaneously on multiple columns.
Example:
data(airquality) airquality %>% head()
Ozone Solar.R Wind Temp Month Day 1 41 190 7.4 67 5 1 2 36 118 8.0 72 5 2 3 12 149 12.6 74 5 3 4 18 313 11.5 62 5 4 5 NA NA 14.3 56 5 5 6 28 NA 14.9 66 5 6
airquality %>% summarize_at(1:4, mean, na.rm = TRUE)
Ozone Solar.R Wind Temp 1 42.12931 185.9315 9.957516 77.88235
_all() - performs the function on all variables in a dataset
_at() - performs the function on variables that meet a specified naming or positional criteria
_if() - performs the function on columns that return TRUE based on a predicate statement
The verbs with scoped variants are:
mutate(), transmute() and summarise()
filter()
group_by()
rename() and select()
arrange()
Similar examples and more info at: http://dplyr.tidyverse.org/reference/scoped.html
data(iris) iris_grouped <- iris %>% group_by(Species) iris %>% head()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 5.1 3.5 1.4 0.2 setosa 2 4.9 3.0 1.4 0.2 setosa 3 4.7 3.2 1.3 0.2 setosa 4 4.6 3.1 1.5 0.2 setosa 5 5.0 3.6 1.4 0.2 setosa 6 5.4 3.9 1.7 0.4 setosa
summarise_all(.tbl, .funs, …)
iris_grouped %>% summarize_all(max) %>% data.frame()
Species Sepal.Length Sepal.Width Petal.Length Petal.Width 1 setosa 5.8 4.4 1.9 0.6 2 versicolor 7.0 3.4 5.1 1.8 3 virginica 7.9 3.8 6.9 2.5
# iris %>% summarize_all(max)
Error in summarise_impl(.data, dots) : Evaluation error: ‘max’ not meaningful for factors.
summarise_at(.tbl, .vars, .funs, …)
iris_grouped %>% summarize_at(c("Sepal.Length", "Sepal.Width"), max) %>% data.frame()
Species Sepal.Length Sepal.Width 1 setosa 5.8 4.4 2 versicolor 7.0 3.4 3 virginica 7.9 3.8
summarise_if(.tbl, .predicate, .funs, …)
iris %>% summarize_if(is.numeric, max) %>% data.frame()
Sepal.Length Sepal.Width Petal.Length Petal.Width 1 7.9 4.4 6.9 2.5
summarise_at(.tbl, .vars, .funs, …)
All will give the same results:
df <- iris_grouped %>% summarize_at(1:2, max)
df <- iris_grouped %>% summarize_at(c("Sepal.Length", "Sepal.Width"), max)
df <- iris_grouped %>% summarize_at(vars(matches("Sepal.*")), max)
iris_grouped %>% summarize_at(vars(starts_with("Sepal")), max) %>% data.frame()
Species Sepal.Length Sepal.Width 1 setosa 5.8 4.4 2 versicolor 7.0 3.4 3 virginica 7.9 3.8
mutate_all(.tbl, .funs, …)
iris_grouped %>% mutate_all(log) %>% head() %>% data.frame()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species 1 1.629241 1.252763 0.3364722 -1.6094379 setosa 2 1.589235 1.098612 0.3364722 -1.6094379 setosa 3 1.547563 1.163151 0.2623643 -1.6094379 setosa 4 1.526056 1.131402 0.4054651 -1.6094379 setosa 5 1.609438 1.280934 0.3364722 -1.6094379 setosa 6 1.686399 1.360977 0.5306283 -0.9162907 setosa
mutate_all(.tbl, .funs, …)
iris_grouped %>% mutate_all(funs(logs = log)) %>% head() %>% select(c(1,2,5,6,7)) %>% data.frame()
Sepal.Length Sepal.Width Species Sepal.Length_logs Sepal.Width_logs 1 5.1 3.5 setosa 1.629241 1.252763 2 4.9 3.0 setosa 1.589235 1.098612 3 4.7 3.2 setosa 1.547563 1.163151 4 4.6 3.1 setosa 1.526056 1.131402 5 5.0 3.6 setosa 1.609438 1.280934 6 5.4 3.9 setosa 1.686399 1.360977
mutate_all(.tbl, .funs, …)
iris_grouped %>% mutate_all(funs(meters = . / 100)) %>% head() %>% select(c(1,2,5,6,7)) %>% data.frame()
Sepal.Length Sepal.Width Species Sepal.Length_meters Sepal.Width_meters 1 5.1 3.5 setosa 0.051 0.035 2 4.9 3.0 setosa 0.049 0.030 3 4.7 3.2 setosa 0.047 0.032 4 4.6 3.1 setosa 0.046 0.031 5 5.0 3.6 setosa 0.050 0.036 6 5.4 3.9 setosa 0.054 0.039
airquality %>% summarize_if(is.numeric, mean)
Ozone Solar.R Wind Temp Month Day 1 NA NA 9.957516 77.88235 6.993464 15.80392
airquality %>% summarize_if(is.numeric, mean, na.rm = TRUE)
Ozone Solar.R Wind Temp Month Day 1 42.12931 185.9315 9.957516 77.88235 6.993464 15.80392
airquality %>% summarize_if(is.numeric, funs(meanVal = mean), na.rm = TRUE)
Ozone_meanVal Solar.R_meanVal Wind_meanVal Temp_meanVal Month_meanVal 1 42.12931 185.9315 9.957516 77.88235 6.993464 Day_meanVal 1 15.80392
quantile(iris$Sepal.Length[iris$Species == "setosa"], probs = 0.75)
75% 5.2
iris_grouped %>% summarise_all(funs(Q3 = quantile), probs = 0.75) %>% data.frame()
Species Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3 1 setosa 5.2 3.675 1.575 0.3 2 versicolor 6.3 3.000 4.600 1.5 3 virginica 6.9 3.175 5.875 2.3
summarize_all(.tbl, .funs, …)
Named functions:
# iris_grouped %>% summarise_all(min, max)
iris_grouped %>% summarise_all(c("min", "max")) %>% select(c(1,2,3,6)) %>% data.frame()
Species Sepal.Length_min Sepal.Width_min Sepal.Length_max 1 setosa 4.3 2.3 5.8 2 versicolor 4.9 2.0 7.0 3 virginica 4.9 2.2 7.9
Works similarly for user-defined functions:
# iris_grouped %>% summarise_all(min, max)
myMin <- function(x){min(x)}
myMax <- function(x){max(x)}
iris_grouped %>% summarise_all(c("myMin", "myMax")) %>% select(c(1,2,3,6)) %>% data.frame()
Species Sepal.Length_myMin Sepal.Width_myMin Sepal.Length_myMax 1 setosa 4.3 2.3 5.8 2 versicolor 4.9 2.0 7.0 3 virginica 4.9 2.2 7.9
# iris_grouped %>% summarise_all(min, max) iris_grouped %>% summarise_all(c(min, max)) %>% select(c(1,2,6)) %>% data.frame()
Species Sepal.Length_.Primitive..min.. Sepal.Length_.Primitive..max.. 1 setosa 4.3 5.8 2 versicolor 4.9 7.0 3 virginica 4.9 7.9
iris_grouped %>% summarise_all(c(mins = min, maxs = max)) %>% select(c(1,2,6)) %>% data.frame()
Species Sepal.Length_mins Sepal.Length_maxs 1 setosa 4.3 5.8 2 versicolor 4.9 7.0 3 virginica 4.9 7.9
Always re-uses parameters where applicable?
airquality %>% summarise_all(c(mins = min, maxs = max), na.rm = TRUE) %>% select(c(1:8)) %>% data.frame()
Ozone_mins Solar.R_mins Wind_mins Temp_mins Month_mins Day_mins 1 1 7 1.7 56 5 1 Ozone_maxs Solar.R_maxs 1 168 334
filter_if(.tbl, .predicate, .vars_predicate)
See all rows with NA's
airquality %>% filter_if(is.numeric, any_vars(is.na(.))) %>% head()
Ozone Solar.R Wind Temp Month Day 1 NA NA 14.3 56 5 5 2 28 NA 14.9 66 5 6 3 NA 194 8.6 69 5 10 4 7 NA 6.9 74 5 11 5 NA 66 16.6 57 5 25 6 NA 266 14.9 58 5 26
Finding which colums are causing duplicate rows by id
df <- data.frame(
id = as.integer(c(1, 2, 2, 3, 3)),
val1 = c("a", "a", "b", "c", "c"),
val2 = c("x", "y", "y", "z", "zz"),
val3 = c("one", "two", "two", "three", "three")
)
df
id val1 val2 val3 1 1 a x one 2 2 a y two 3 2 b y two 4 3 c z three 5 3 c zz three
check.distinct <- function(df, ...){
multDf <- df %>%
unique() %>%
group_by(...) %>%
filter(n() > 1)
temp <- multDf %>%
summarize_all(funs(length(unique(.)))) %>%
ungroup() %>%
summarize_all(max) %>%
select_if(funs(is.na(.) | . > 1))
multDf %>% select(names(temp))
}
check.distinct(df, id) %>% data.frame()
id val1 val2 1 2 a y 2 2 b y 3 3 c z 4 3 c zz
multDf <- df %>%
unique() %>%
group_by(id) %>%
filter(n() > 1)
multDf %>% data.frame()
id val1 val2 val3 1 2 a y two 2 2 b y two 3 3 c z three 4 3 c zz three
temp <- multDf %>% summarize_all(funs(length(unique(.))))
temp %>% data.frame()
id val1 val2 val3 1 2 2 1 1 2 3 1 2 1
temp <- temp %>% ungroup() %>% summarize_all(max)
temp %>% data.frame()
id val1 val2 val3 1 3 2 2 1
temp <- temp %>% ungroup() %>% select_if(funs(is.na(.) | . > 1))
temp %>% data.frame()
id val1 val2 1 3 2 2
multDf %>% select(names(temp)) %>% data.frame()
id val1 val2 1 2 a y 2 2 b y 3 3 c z 4 3 c zz