March 7, 2018

Operations on a selection of variables

"_if", "_at" and "_all" suffixes

can be combined with several dplyr functions

to perform a single operation simultaneously on multiple columns.

Example:

data(airquality)
airquality %>% head()
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
airquality %>% summarize_at(1:4, mean, na.rm = TRUE)
     Ozone  Solar.R     Wind     Temp
1 42.12931 185.9315 9.957516 77.88235

Suffixes

  • _all() - performs the function on all variables in a dataset

  • _at() - performs the function on variables that meet a specified naming or positional criteria

  • _if() - performs the function on columns that return TRUE based on a predicate statement

Eligible "verbs"

Examples: Sepal dataset

data(iris)
iris_grouped <- iris %>% group_by(Species)
iris %>% head()
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa

Examples: summarize_all()

summarise_all(.tbl, .funs, …)

iris_grouped %>% summarize_all(max) %>% data.frame()
     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
1     setosa          5.8         4.4          1.9         0.6
2 versicolor          7.0         3.4          5.1         1.8
3  virginica          7.9         3.8          6.9         2.5

Summarize_all()

# iris %>% summarize_all(max)

Error in summarise_impl(.data, dots) : Evaluation error: ‘max’ not meaningful for factors.

Summarize_at()

summarise_at(.tbl, .vars, .funs, …)

iris_grouped %>% summarize_at(c("Sepal.Length", "Sepal.Width"), max) %>% data.frame()
     Species Sepal.Length Sepal.Width
1     setosa          5.8         4.4
2 versicolor          7.0         3.4
3  virginica          7.9         3.8

Summarize_if()

summarise_if(.tbl, .predicate, .funs, …)

iris %>% summarize_if(is.numeric, max) %>% data.frame()
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1          7.9         4.4          6.9         2.5

Summarize_at() - Specifying columns

summarise_at(.tbl, .vars, .funs, …)

All will give the same results:

df <- iris_grouped %>% summarize_at(1:2, max)

df <- iris_grouped %>% summarize_at(c("Sepal.Length", "Sepal.Width"), max)

df <- iris_grouped %>% summarize_at(vars(matches("Sepal.*")), max)

iris_grouped %>% summarize_at(vars(starts_with("Sepal")), max) %>% data.frame()
     Species Sepal.Length Sepal.Width
1     setosa          5.8         4.4
2 versicolor          7.0         3.4
3  virginica          7.9         3.8

Mutate_all()

mutate_all(.tbl, .funs, …)

iris_grouped %>% mutate_all(log) %>% head() %>% data.frame()
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1     1.629241    1.252763    0.3364722  -1.6094379  setosa
2     1.589235    1.098612    0.3364722  -1.6094379  setosa
3     1.547563    1.163151    0.2623643  -1.6094379  setosa
4     1.526056    1.131402    0.4054651  -1.6094379  setosa
5     1.609438    1.280934    0.3364722  -1.6094379  setosa
6     1.686399    1.360977    0.5306283  -0.9162907  setosa

mutate_all() - preserving original columns

mutate_all(.tbl, .funs, …)

iris_grouped %>% 
  mutate_all(funs(logs = log)) %>% 
  head() %>% select(c(1,2,5,6,7)) %>% data.frame()
  Sepal.Length Sepal.Width Species Sepal.Length_logs Sepal.Width_logs
1          5.1         3.5  setosa          1.629241         1.252763
2          4.9         3.0  setosa          1.589235         1.098612
3          4.7         3.2  setosa          1.547563         1.163151
4          4.6         3.1  setosa          1.526056         1.131402
5          5.0         3.6  setosa          1.609438         1.280934
6          5.4         3.9  setosa          1.686399         1.360977

mutate_all() - user-defined function:

mutate_all(.tbl, .funs, …)

iris_grouped %>% 
  mutate_all(funs(meters = . / 100)) %>% 
  head() %>% select(c(1,2,5,6,7)) %>% data.frame()
  Sepal.Length Sepal.Width Species Sepal.Length_meters Sepal.Width_meters
1          5.1         3.5  setosa               0.051              0.035
2          4.9         3.0  setosa               0.049              0.030
3          4.7         3.2  setosa               0.047              0.032
4          4.6         3.1  setosa               0.046              0.031
5          5.0         3.6  setosa               0.050              0.036
6          5.4         3.9  setosa               0.054              0.039

Adding parameters:

airquality %>% summarize_if(is.numeric, mean)
  Ozone Solar.R     Wind     Temp    Month      Day
1    NA      NA 9.957516 77.88235 6.993464 15.80392
airquality %>% summarize_if(is.numeric, mean, na.rm = TRUE)
     Ozone  Solar.R     Wind     Temp    Month      Day
1 42.12931 185.9315 9.957516 77.88235 6.993464 15.80392
airquality %>% summarize_if(is.numeric, funs(meanVal = mean), na.rm = TRUE)
  Ozone_meanVal Solar.R_meanVal Wind_meanVal Temp_meanVal Month_meanVal
1      42.12931        185.9315     9.957516     77.88235      6.993464
  Day_meanVal
1    15.80392

Adding parameters:

quantile(iris$Sepal.Length[iris$Species == "setosa"], probs = 0.75)
75% 
5.2 
iris_grouped %>% summarise_all(funs(Q3 = quantile), probs = 0.75) %>% data.frame()
     Species Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3
1     setosa             5.2          3.675           1.575            0.3
2 versicolor             6.3          3.000           4.600            1.5
3  virginica             6.9          3.175           5.875            2.3

Sending more than one function to summarize_all()

summarize_all(.tbl, .funs, …)

Named functions:

# iris_grouped %>% summarise_all(min, max)

iris_grouped %>% summarise_all(c("min", "max")) %>% select(c(1,2,3,6)) %>% data.frame()
     Species Sepal.Length_min Sepal.Width_min Sepal.Length_max
1     setosa              4.3             2.3              5.8
2 versicolor              4.9             2.0              7.0
3  virginica              4.9             2.2              7.9

summarize_all() - Sending more than one function

Works similarly for user-defined functions:

# iris_grouped %>% summarise_all(min, max)

myMin <- function(x){min(x)}
myMax <- function(x){max(x)}

iris_grouped %>% summarise_all(c("myMin", "myMax")) %>% select(c(1,2,3,6)) %>% data.frame()
     Species Sepal.Length_myMin Sepal.Width_myMin Sepal.Length_myMax
1     setosa                4.3               2.3                5.8
2 versicolor                4.9               2.0                7.0
3  virginica                4.9               2.2                7.9

summarize_all() - Sending more than one function to summarize_all()

# iris_grouped %>% summarise_all(min, max)
iris_grouped %>% summarise_all(c(min, max)) %>% select(c(1,2,6)) %>% data.frame()
     Species Sepal.Length_.Primitive..min.. Sepal.Length_.Primitive..max..
1     setosa                            4.3                            5.8
2 versicolor                            4.9                            7.0
3  virginica                            4.9                            7.9
iris_grouped %>% summarise_all(c(mins = min, maxs = max)) %>% select(c(1,2,6)) %>% data.frame()
     Species Sepal.Length_mins Sepal.Length_maxs
1     setosa               4.3               5.8
2 versicolor               4.9               7.0
3  virginica               4.9               7.9

summarize_all() - Sending more than one function + parameters

Always re-uses parameters where applicable?

airquality %>% summarise_all(c(mins = min, maxs = max), na.rm = TRUE) %>% 
  select(c(1:8)) %>% data.frame()
  Ozone_mins Solar.R_mins Wind_mins Temp_mins Month_mins Day_mins
1          1            7       1.7        56          5        1
  Ozone_maxs Solar.R_maxs
1        168          334

filter_if()

filter_if(.tbl, .predicate, .vars_predicate)

  • .vars_predicate should use all_vars() or any_vars() and "."

See all rows with NA's

airquality %>% filter_if(is.numeric, any_vars(is.na(.))) %>% 
  head()
  Ozone Solar.R Wind Temp Month Day
1    NA      NA 14.3   56     5   5
2    28      NA 14.9   66     5   6
3    NA     194  8.6   69     5  10
4     7      NA  6.9   74     5  11
5    NA      66 16.6   57     5  25
6    NA     266 14.9   58     5  26

Example use case

Finding which colums are causing duplicate rows by id

df <- data.frame(
  id = as.integer(c(1, 2, 2, 3, 3)),
  val1 = c("a", "a", "b", "c", "c"),
  val2 = c("x", "y", "y", "z", "zz"),
  val3 = c("one", "two", "two", "three", "three")
)
df
  id val1 val2  val3
1  1    a    x   one
2  2    a    y   two
3  2    b    y   two
4  3    c    z three
5  3    c   zz three

Example use case

  check.distinct <- function(df, ...){
      multDf <- df %>% 
        unique() %>%
        group_by(...) %>%
        filter(n() > 1)
      
      temp <- multDf %>%
        summarize_all(funs(length(unique(.)))) %>%
        ungroup() %>%
        summarize_all(max) %>%
        select_if(funs(is.na(.) | . > 1))
      
      multDf %>% select(names(temp))
}

Example use case

check.distinct(df, id) %>% data.frame()
  id val1 val2
1  2    a    y
2  2    b    y
3  3    c    z
4  3    c   zz

Example use case

      multDf <- df %>% 
        unique() %>%
        group_by(id) %>%
        filter(n() > 1)
      multDf %>% data.frame()
  id val1 val2  val3
1  2    a    y   two
2  2    b    y   two
3  3    c    z three
4  3    c   zz three

Example use case

      temp <- multDf %>% summarize_all(funs(length(unique(.))))
      temp %>% data.frame()
  id val1 val2 val3
1  2    2    1    1
2  3    1    2    1
      temp <- temp %>% ungroup() %>% summarize_all(max) 
      temp %>% data.frame()
  id val1 val2 val3
1  3    2    2    1

Example use case

      temp <- temp %>% ungroup() %>% select_if(funs(is.na(.) | . > 1))
      temp %>% data.frame()
  id val1 val2
1  3    2    2
      multDf %>% select(names(temp)) %>% data.frame()
  id val1 val2
1  2    a    y
2  2    b    y
3  3    c    z
4  3    c   zz