Set up Parallel Processing
Load Required Libraries
Data Manipulation
- coalesce
- The complement of coalesce() is na_if(): it replaces a specified value with an NA.
recode(), a vectorised switch(), takes a numeric vector, character vector, or factor, and replaces elements based on their values.
- case_when()
gather

Set up Parallel Processing

library(parallel)

# Calculate the number of cores: leave one free for the rest of the system,
# but never request fewer than one worker (detectCores() can return 1 or NA).
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)

# Initiate cluster. Fork-based clusters are not supported on Windows, so use
# socket workers there; every other platform keeps the original FORK type.
# Call stopCluster(cl) once the parallel work is finished.
cl <- makeCluster(
  no_cores,
  type = if (.Platform$OS.type == "windows") "PSOCK" else "FORK"
)

Load Required Libraries

# Packages to attach for the rest of the script.
pkgs <- c("tidyverse")

# Attach each package quietly. lapply() is used only for its side effect, so
# the returned value is discarded instead of being stored in a variable.
invisible(suppressWarnings(suppressMessages(
  lapply(pkgs, library, character.only = TRUE)
)))

# Bind the dplyr implementations in the global environment so that another
# attached package exporting the same names cannot mask them.
summarise <- dplyr::summarise
select <- dplyr::select

Data Manipulation

# For the first six iris rows, take the larger of sepal and petal length,
# evaluated one row at a time.
iris %>%
  head() %>%
  rowwise() %>%
  mutate(Max.Len = max(c(Sepal.Length, Petal.Length)))

## Source: local data frame [6 x 6]
## Groups: <by row>
## 
## # A tibble: 6 x 6
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Max.Len
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>     <dbl>
## 1         5.10        3.50         1.40       0.200 setosa     5.10
## 2         4.90        3.00         1.40       0.200 setosa     4.90
## 3         4.70        3.20         1.30       0.200 setosa     4.70
## 4         4.60        3.10         1.50       0.200 setosa     4.60
## 5         5.00        3.60         1.40       0.200 setosa     5.00
## 6         5.40        3.90         1.70       0.400 setosa     5.40

# 3 x 3 integer tibble. Column names are set explicitly so they do not depend
# on tibble's version-specific name repair for unnamed matrices.
a <- as_tibble(matrix(1:9, 3, 3, dimnames = list(NULL, c("V1", "V2", "V3"))))
#a%>%rowwise()mutate(Meancol= mean(.))
#a%>%colMeans(.,na.rm=TRUE)
# NOTE: colMeans() returns one value per column, not per row. It can be stored
# as a new column here only because the tibble has as many rows as columns.
a %>% mutate(col = colMeans(., na.rm = TRUE))

## # A tibble: 3 x 4
##      V1    V2    V3   col
##   <int> <int> <int> <dbl>
## 1     1     4     7  2.00
## 2     2     5     8  5.00
## 3     3     6     9  8.00

# Row-wise mean of V1, V2 and V3. The values must be combined with c():
# mean() averages only its first argument, and a second positional argument
# is taken as `trim`, which previously made this return V1 unchanged.
a %>%
  rowwise() %>%
  mutate(col2 = mean(c(V1, V2, V3), na.rm = TRUE))

## Source: local data frame [3 x 4]
## Groups: <by row>
## 
## # A tibble: 3 x 4
##      V1    V2    V3  col2
##   <int> <int> <int> <dbl>
## 1     1     4     7  4.00
## 2     2     5     8  5.00
## 3     3     6     9  6.00

# Three slices of mtcars: rows 1-4, rows 5-8, and row 5 on its own.
one <- head(mtcars, 4)
two <- mtcars[seq(5, 8), ]
three <- mtcars[5, , drop = FALSE]
#bind_cols(one, two)
#bind_cols(list(one, two))
# Stack the first slice on top of the single-row slice.
bind_rows(one, three)

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2

# Every combination of x in 1:3 and y in 3:1 (nine rows).
df <- expand.grid(x = 1:3, y = 3:1)

# For each row, build the sequence from x to y and keep it in list-column `i`.
do(rowwise(df), i = seq(.$x, .$y))

## Source: local data frame [9 x 1]
## Groups: <by row>
## 
## # A tibble: 9 x 1
##   i        
## * <list>   
## 1 <int [3]>
## 2 <int [2]>
## 3 <int [1]>
## 4 <int [2]>
## 5 <int [1]>
## 6 <int [2]>
## 7 <int [1]>
## 8 <int [2]>
## 9 <int [3]>

#.Last.value %>% dplyr::summarize(n = length(i))

coalesce

# Two numeric vectors whose missing values sit at different positions.
x <- c(1, 2, NA, 4, NA, 6)
y <- c(NA, 2, 3, 4, 5, NA)

# Fill each NA in x with the value at the same position in y:
x %>% coalesce(y)

## [1] 1 2 3 4 5 6

# Or fill each NA in x with a constant:
x %>% coalesce(0)

## [1] 1 2 0 4 0 6

# Distinct values found in either vector, with the NA entry removed.
na.omit(union(x, y))

## [1] 1 2 4 6 3 5
## attr(,"na.action")
## [1] 3
## attr(,"class")
## [1] "omit"

# Concatenate both vectors, keeping duplicates and NAs.
union_all(x, y)

##  [1]  1  2 NA  4 NA  6 NA  2  3  4  5 NA

The complement of coalesce() is na_if(): it replaces a specified value with an NA.

# -99 acts as a sentinel here; turn every occurrence into NA.
x <- c(1, 5, 2, -99, -99, 10)
x %>% na_if(-99)

## [1]  1  5  2 NA NA 10

recode(), a vectorised switch(), takes a numeric vector, character vector, or factor, and replaces elements based on their values.

# Ten random draws from "a", "b", "c" and NA (no seed is set, so they vary).
x <- sample(c("a", "b", "c", NA), size = 10, replace = TRUE)

# Values without a replacement are left untouched by default
x %>% recode(a = "Apple")

##  [1] "Apple" NA      "c"     "c"     "c"     "b"     "c"     "b"    
##  [9] "b"     "c"

# .default sets the value used for everything that is not replaced:
x %>% recode(a = "Apple", .default = NA_character_)

##  [1] "Apple" NA      NA      NA      NA      NA      NA      NA     
##  [9] NA      NA

# .missing sets the value used where the input itself is NA
x %>% recode(a = "Apple", .default = NA_character_, .missing = "Unknown")

##  [1] "Apple"   "Unknown" NA        NA        NA        NA        NA       
##  [8] NA        NA        NA

case_when()

case_when() is a vectorised set of if and else-if statements. You provide it a set of test-result pairs as formulas: the left-hand side of each formula should return a logical vector, and the right-hand side should return either a single value or a vector the same length as the left-hand side. All results must be the same type of vector.

# Label 1..40: multiples of both 5 and 7 first, then multiples of 5, then
# multiples of 7; every other number is kept as text.
x <- seq_len(40)
case_when(
  x %% 5 == 0 & x %% 7 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  TRUE ~ as.character(x)
)

##  [1] "1"         "2"         "3"         "4"         "fizz"     
##  [6] "6"         "buzz"      "8"         "9"         "fizz"     
## [11] "11"        "12"        "13"        "buzz"      "fizz"     
## [16] "16"        "17"        "18"        "19"        "fizz"     
## [21] "buzz"      "22"        "23"        "24"        "fizz"     
## [26] "26"        "27"        "buzz"      "29"        "fizz"     
## [31] "31"        "32"        "33"        "34"        "fizz buzz"
## [36] "36"        "37"        "38"        "39"        "fizz"

# mutate_if() applies a function to every column that satisfies a predicate,
# which makes it handy for type conversion: here factors become character.
mutate_if(as_tibble(iris), is.factor, as.character)

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1         5.10        3.50         1.40       0.200 setosa 
##  2         4.90        3.00         1.40       0.200 setosa 
##  3         4.70        3.20         1.30       0.200 setosa 
##  4         4.60        3.10         1.50       0.200 setosa 
##  5         5.00        3.60         1.40       0.200 setosa 
##  6         5.40        3.90         1.70       0.400 setosa 
##  7         4.60        3.40         1.40       0.300 setosa 
##  8         5.00        3.40         1.50       0.200 setosa 
##  9         4.40        2.90         1.40       0.200 setosa 
## 10         4.90        3.10         1.50       0.100 setosa 
## # ... with 140 more rows

mutate_if(as_tibble(iris), is.double, as.integer)

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <int>       <int>        <int>       <int> <fct>  
##  1            5           3            1           0 setosa 
##  2            4           3            1           0 setosa 
##  3            4           3            1           0 setosa 
##  4            4           3            1           0 setosa 
##  5            5           3            1           0 setosa 
##  6            5           3            1           0 setosa 
##  7            4           3            1           0 setosa 
##  8            5           3            1           0 setosa 
##  9            4           2            1           0 setosa 
## 10            4           3            1           0 setosa 
## # ... with 140 more rows

head(iris, 3)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa

# A single grouped summary first: mean sepal length per species.
# The scoped variants of summarise() and mutate() then apply the same
# transformation to many variables at once.

dplyr::summarise(
  group_by(iris, Species),
  sepal = mean(Sepal.Length, na.rm = TRUE)
)

## # A tibble: 3 x 2
##   Species    sepal
##   <fct>      <dbl>
## 1 setosa      5.01
## 2 versicolor  5.94
## 3 virginica   6.59

# Mean of every non-grouping column, per species.
summarise_all(group_by(iris, Species), mean)

## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03

# The scoped verbs come in three flavours:
# * _all acts on every variable
# * _at acts on variables picked with a character vector or vars()
# * _if acts on variables picked by a predicate function:

summarise_at(starwars, vars(height:mass), mean, na.rm = TRUE)

## # A tibble: 1 x 2
##   height  mass
##    <dbl> <dbl>
## 1    174  97.3

summarise_at(starwars, c("height", "mass"), mean, na.rm = TRUE)

## # A tibble: 1 x 2
##   height  mass
##    <dbl> <dbl>
## 1    174  97.3

summarise_if(starwars, is.numeric, mean, na.rm = TRUE)

## # A tibble: 1 x 3
##   height  mass birth_year
##    <dbl> <dbl>      <dbl>
## 1    174  97.3       87.6

# mutate_if() converts every column matching a predicate, which is a
# convenient way to change column types in bulk.
#mutate_if(is.character, str_to_lower)
mutate_if(as_tibble(iris), is.factor, as.character)

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1         5.10        3.50         1.40       0.200 setosa 
##  2         4.90        3.00         1.40       0.200 setosa 
##  3         4.70        3.20         1.30       0.200 setosa 
##  4         4.60        3.10         1.50       0.200 setosa 
##  5         5.00        3.60         1.40       0.200 setosa 
##  6         5.40        3.90         1.70       0.400 setosa 
##  7         4.60        3.40         1.40       0.300 setosa 
##  8         5.00        3.40         1.50       0.200 setosa 
##  9         4.40        2.90         1.40       0.200 setosa 
## 10         4.90        3.10         1.50       0.100 setosa 
## # ... with 140 more rows

mutate_if(as_tibble(iris), is.double, as.integer)

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <int>       <int>        <int>       <int> <fct>  
##  1            5           3            1           0 setosa 
##  2            4           3            1           0 setosa 
##  3            4           3            1           0 setosa 
##  4            4           3            1           0 setosa 
##  5            5           3            1           0 setosa 
##  6            5           3            1           0 setosa 
##  7            4           3            1           0 setosa 
##  8            5           3            1           0 setosa 
##  9            4           2            1           0 setosa 
## 10            4           3            1           0 setosa 
## # ... with 140 more rows

# For every numeric variable, replace NA observations with 0.
# A plain function is passed instead of funs(), which is deprecated in
# current dplyr releases; the result is the same.
iris %>% mutate_if(is.numeric, function(x) ifelse(is.na(x), 0, x))

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica

# Same NA -> 0 replacement using coalesce(). The replacement value is passed
# through an explicit wrapper rather than as an argument literally named
# `...`, which depends on how coalesce() declares its parameters.
iris %>% mutate_if(is.numeric, function(x) coalesce(x, 0))

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica

pacman::p_load("lubridate", "dplyr", "magrittr")

# Small example frame: two character score columns and two UTC date-time
# columns built from seconds since the epoch.
test <- data.frame(
  aroma = c("5/10", "2/10", "3/10"),
  taste = c("2/20", "5/20", "15/20"),
  orderdt = .POSIXct(c(1343692800, 1360022400, 1381968000), tz = "UTC"),
  shipdt = .POSIXct(c(1343692800, 1360022400, 1381968000), tz = "UTC"),
  stringsAsFactors = FALSE
)

# Show the column types of the freshly built frame.
str(test)

## 'data.frame':    3 obs. of  4 variables:
##  $ aroma  : chr  "5/10" "2/10" "3/10"
##  $ taste  : chr  "2/20" "5/20" "15/20"
##  $ orderdt: POSIXct, format: "2012-07-31" "2013-02-05" ...
##  $ shipdt : POSIXct, format: "2012-07-31" "2013-02-05" ...

# Convert date-time columns to Date, then character columns to factors,
# and inspect the resulting column types.
test <- test %>%
  mutate_if(is.POSIXt, as.Date) %>%
  mutate_if(is.character, as.factor)
glimpse(test)

## Observations: 3
## Variables: 4
## $ aroma   <fct> 5/10, 2/10, 3/10
## $ taste   <fct> 2/20, 5/20, 15/20
## $ orderdt <date> 2012-07-31, 2013-02-05, 2013-10-17
## $ shipdt  <date> 2012-07-31, 2013-02-05, 2013-10-17

# ---------------------------------------------------------------------------
# If you want to apply multiple transformations, pass a named list of
# functions (funs() is deprecated since dplyr 0.8.0). The list names become
# the suffixes of the output columns, e.g. Sepal.Length_min.
by_species <- iris %>% group_by(Species)

by_species %>% summarise_all(list(min = min, max = max))

## # A tibble: 3 x 9
##   Species    Sepal.Length_min Sepal.Width_min Petal.Length_min
##   <fct>                 <dbl>           <dbl>            <dbl>
## 1 setosa                 4.30            2.30             1.00
## 2 versicolor             4.90            2.00             3.00
## 3 virginica              4.90            2.20             4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## #   Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## #   Petal.Width_max <dbl>

# Note that the output variable names now include the function name, in order
# to keep things distinct.

# You can express more complex inline transformations with a formula lambda,
# where . stands for the column being transformed (this replaces the
# deprecated funs()).
by_species %>%
  mutate_all(~ . / 2.54) %>%
  head(3)

## # A tibble: 3 x 5
## # Groups:   Species [1]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1         2.01        1.38        0.551      0.0787 setosa 
## 2         1.93        1.18        0.551      0.0787 setosa 
## 3         1.85        1.26        0.512      0.0787 setosa

# Function names will be included if .funs has names or multiple inputs.
# A named list of lambdas replaces the deprecated funs(); the name "cm" is
# appended to each new column.
by_species %>%
  mutate_all(list(cm = ~ . / 2.54)) %>%
  head(3)

## # A tibble: 3 x 9
## # Groups:   Species [1]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1         5.10        3.50         1.40       0.200 setosa 
## 2         4.90        3.00         1.40       0.200 setosa 
## 3         4.70        3.20         1.30       0.200 setosa 
## # ... with 4 more variables: Sepal.Length_cm <dbl>, Sepal.Width_cm <dbl>,
## #   Petal.Length_cm <dbl>, Petal.Width_cm <dbl>

# Per-species median of every column, suffixed with "_med" (named list instead
# of the deprecated funs()).
by_species %>% summarise_all(list(med = median))

## # A tibble: 3 x 5
##   Species    Sepal.Length_med Sepal.Width_med Petal.Length_med
##   <fct>                 <dbl>           <dbl>            <dbl>
## 1 setosa                 5.00            3.40             1.50
## 2 versicolor             5.90            2.80             4.35
## 3 virginica              6.50            3.00             5.55
## # ... with 1 more variable: Petal.Width_med <dbl>

# Per-species third quartile: probs = 0.75 is forwarded to quantile() for each
# column (named list instead of the deprecated funs()).
by_species %>% summarise_all(list(Q3 = quantile), probs = 0.75)

## # A tibble: 3 x 5
##   Species    Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3
##   <fct>                <dbl>          <dbl>           <dbl>          <dbl>
## 1 setosa                5.20           3.68            1.58          0.300
## 2 versicolor            6.30           3.00            4.60          1.50 
## 3 virginica             6.90           3.18            5.88          2.30

# Functions can also be given by name as a character vector.
summarise_all(by_species, c("min", "max"))

## # A tibble: 3 x 9
##   Species    Sepal.Length_min Sepal.Width_min Petal.Length_min
##   <fct>                 <dbl>           <dbl>            <dbl>
## 1 setosa                 4.30            2.30             1.00
## 2 versicolor             4.90            2.00             3.00
## 3 virginica              4.90            2.20             4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## #   Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## #   Petal.Width_max <dbl>

# Add a "_cm" copy of every numeric column (named list of lambdas instead of
# the deprecated funs()).
by_species %>%
  mutate_if(is.numeric, list(cm = ~ . / 2.54)) %>%
  head(3)

## # A tibble: 3 x 9
## # Groups:   Species [1]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1         5.10        3.50         1.40       0.200 setosa 
## 2         4.90        3.00         1.40       0.200 setosa 
## 3         4.70        3.20         1.30       0.200 setosa 
## # ... with 4 more variables: Sepal.Length_cm <dbl>, Sepal.Width_cm <dbl>,
## #   Petal.Length_cm <dbl>, Petal.Width_cm <dbl>

library(tidyverse)
# Load the built-in iris data set and preview its first three rows.
data(iris)
head(iris, n = 3)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa

# Per-species summary of Petal.Length: min, max, mean, sd, group size and
# standard error of the mean. A named list of functions/lambdas replaces the
# deprecated funs(); since a single column is summarised, the list names are
# used directly as the output column names.
Iris_summary2 <- iris %>%   # data frame to be summarised
  group_by(Species) %>%     # the grouping variable
  summarise_at(
    "Petal.Length",
    list(
      min = min,
      max = max,
      mean = ~ mean(., na.rm = TRUE),
      sd = sd,
      n = ~ n(),
      SE_PL = ~ sd(.) / sqrt(n())
    )
  )
Iris_summary2

## # A tibble: 3 x 7
##   Species      min   max  mean    sd     n  SE_PL
##   <fct>      <dbl> <dbl> <dbl> <dbl> <int>  <dbl>
## 1 setosa      1.00  1.90  1.46 0.174    50 0.0246
## 2 versicolor  3.00  5.10  4.26 0.470    50 0.0665
## 3 virginica   4.50  6.90  5.55 0.552    50 0.0780

gather

# From http://stackoverflow.com/questions/1181060
# Ten days of simulated prices, one column per stock. tibble() replaces the
# deprecated data_frame().
stocks <- tibble(
  time = as.Date('2009-01-01') + 0:9,
  X = rnorm(10, 0, 1),
  Y = rnorm(10, 0, 2),
  Z = rnorm(10, 0, 4)
)

# Stack every column except time into stock/price pairs.
gather(stocks, stock, price, -time) %>% head(3)

## # A tibble: 3 x 3
##   time       stock price
##   <date>     <chr> <dbl>
## 1 2009-01-01 X     2.05 
## 2 2009-01-02 X     0.382
## 3 2009-01-03 X     0.564

# Same reshape written as a pipeline, with the key/value arguments named.
stocks %>%
  gather(key = stock, value = price, -time) %>%
  head(3)

## # A tibble: 3 x 3
##   time       stock price
##   <date>     <chr> <dbl>
## 1 2009-01-01 X     2.05 
## 2 2009-01-02 X     0.382
## 3 2009-01-03 X     0.564

# First observation of each Species in iris (rows 1, 51 and 101), via base R.
mini_iris <- iris[c(1, 51, 101), ]
# Stack the four measurement columns into flower_att/measurement pairs.
#gather(mini_iris, key = flower_att, value = measurement,Species)
mini_iris %>%
  gather(
    key = flower_att, value = measurement,
    Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
  ) %>%
  head(3)

##      Species   flower_att measurement
## 1     setosa Sepal.Length         5.1
## 2 versicolor Sepal.Length         7.0
## 3  virginica Sepal.Length         6.3

# Gather everything except Species.
mini_iris %>%
  gather(key = flower_att, value = measurement, -Species) %>%
  head(3)

##      Species   flower_att measurement
## 1     setosa Sepal.Length         5.1
## 2 versicolor Sepal.Length         7.0
## 3  virginica Sepal.Length         6.3

# Same result but less verbose: exclude Species instead of listing columns.
head(gather(mini_iris, flower_att, measurement, -Species), 3)

##      Species   flower_att measurement
## 1     setosa Sepal.Length         5.1
## 2 versicolor Sepal.Length         7.0
## 3  virginica Sepal.Length         6.3

# Repeat the iris example with dplyr: keep the first row of each Species group.
mini_iris <- dplyr::slice(group_by(iris, Species), 1)

# Reshape the (still grouped) result into flower_att/measurement pairs.
gather(mini_iris, key = flower_att, value = measurement, -Species) %>%
  head(3)

## # A tibble: 3 x 3
## # Groups:   Species [3]
##   Species    flower_att   measurement
##   <fct>      <chr>              <dbl>
## 1 setosa     Sepal.Length        5.10
## 2 versicolor Sepal.Length        7.00
## 3 virginica  Sepal.Length        6.30

# summary() of the first column of warpbreaks for every combination of the
# remaining (wool, tension) columns.
by(data = warpbreaks[, 1], INDICES = warpbreaks[, -1], FUN = summary)

## wool: A
## tension: L
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   26.00   51.00   44.56   54.00   70.00 
## -------------------------------------------------------- 
## wool: B
## tension: L
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   20.00   29.00   28.22   31.00   44.00 
## -------------------------------------------------------- 
## wool: A
## tension: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12      18      21      24      30      36 
## -------------------------------------------------------- 
## wool: B
## tension: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   21.00   28.00   28.78   39.00   42.00 
## -------------------------------------------------------- 
## wool: A
## tension: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   18.00   24.00   24.56   28.00   43.00 
## -------------------------------------------------------- 
## wool: B
## tension: H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   15.00   17.00   18.78   21.00   28.00

# Tukey five-number summary of the first column of warpbreaks for every
# combination of the remaining (wool, tension) columns.
by(data = warpbreaks[, 1], INDICES = warpbreaks[, -1], FUN = fivenum)

## wool: A
## tension: L
## [1] 25 26 51 54 70
## -------------------------------------------------------- 
## wool: B
## tension: L
## [1] 14 20 29 31 44
## -------------------------------------------------------- 
## wool: A
## tension: M
## [1] 12 18 21 30 36
## -------------------------------------------------------- 
## wool: B
## tension: M
## [1] 16 21 28 39 42
## -------------------------------------------------------- 
## wool: A
## tension: H
## [1] 10 18 24 28 43
## -------------------------------------------------------- 
## wool: B
## tension: H
## [1] 13 15 17 21 28

# Iris_summary2 <- iris %>% # the names of the new data frame and the data frame to be summarised
#   group_by(Species) %>%   # the grouping variable
#   summarise_at("Petal.Length",funs(fivenum(.)))
# Iris_summary2

# Five-number summary of Petal.Length per Species, returned as a named list.
with(iris, tapply(Petal.Length, Species, fivenum))

## $setosa
## [1] 1.0 1.4 1.5 1.6 1.9
## 
## $versicolor
## [1] 3.00 4.00 4.35 4.60 5.10
## 
## $virginica
## [1] 4.50 5.10 5.55 5.90 6.90

# summary() of breaks for each tension level.
with(warpbreaks, tapply(breaks, tension, summary))

## $L
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   26.00   29.50   36.39   49.25   70.00 
## 
## $M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.00   18.25   27.00   26.39   33.75   42.00 
## 
## $H
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   15.25   20.50   21.67   25.50   43.00

# summary() of the first two columns of warpbreaks, split by the third column.
by(data = warpbreaks[, 1:2], INDICES = warpbreaks[, 3], FUN = summary)

## warpbreaks[, 3]: L
##      breaks      wool 
##  Min.   :14.00   A:9  
##  1st Qu.:26.00   B:9  
##  Median :29.50        
##  Mean   :36.39        
##  3rd Qu.:49.25        
##  Max.   :70.00        
## -------------------------------------------------------- 
## warpbreaks[, 3]: M
##      breaks      wool 
##  Min.   :12.00   A:9  
##  1st Qu.:18.25   B:9  
##  Median :27.00        
##  Mean   :26.39        
##  3rd Qu.:33.75        
##  Max.   :42.00        
## -------------------------------------------------------- 
## warpbreaks[, 3]: H
##      breaks      wool 
##  Min.   :10.00   A:9  
##  1st Qu.:15.25   B:9  
##  Median :20.50        
##  Mean   :21.67        
##  3rd Qu.:25.50        
##  Max.   :43.00

# Five-number summary of the first column of warpbreaks, split by the third.
by(data = warpbreaks[, 1], INDICES = warpbreaks[, 3], FUN = fivenum)

## warpbreaks[, 3]: L
## [1] 14.0 26.0 29.5 51.0 70.0
## -------------------------------------------------------- 
## warpbreaks[, 3]: M
## [1] 12 18 27 35 42
## -------------------------------------------------------- 
## warpbreaks[, 3]: H
## [1] 10.0 15.0 20.5 26.0 43.0

# Keep the by() result, then stack its per-level vectors into one matrix with
# one row per level.
b <- by(data = warpbreaks[, 1], INDICES = warpbreaks[, 3], FUN = fivenum)
do.call("rbind", b)

##   [,1] [,2] [,3] [,4] [,5]
## L   14   26 29.5   51   70
## M   12   18 27.0   35   42
## H   10   15 20.5   26   43

# Inspect the class of the object returned by by().
class(b)

## [1] "by"

# Inspect the class of the result of row-binding the elements of b.
class(do.call(rbind,b))

## [1] "matrix"

# Calls the data.frame method of as.matrix() directly on the row-bound result.
# NOTE(review): in the recorded output the L/M/H row names are replaced by row
# indices -- confirm that losing them is intended.
as.matrix.data.frame(do.call(rbind,b))

##      [,1] [,2] [,3] [,4] [,5]
## [1,]   14   26 29.5   51   70
## [2,]   12   18 27.0   35   42
## [3,]   10   15 20.5   26   43

# Convert the row-bound matrix to a data frame; the generic dispatches to the
# matrix method, giving columns V1..V5 and keeping the row names.
as.data.frame(do.call(rbind, b))

##   V1 V2   V3 V4 V5
## L 14 26 29.5 51 70
## M 12 18 27.0 35 42
## H 10 15 20.5 26 43

# Coerce the per-level summary tables to one wide data frame by calling the
# list method of as.data.frame() directly on the by() result.
as.data.frame.list(
  by(data = warpbreaks[, 1:2], INDICES = warpbreaks[, 3], FUN = summary)
)

##    L.Var1     L.Var2          L.Freq M.Var1     M.Var2          M.Freq
## 1             breaks Min.   :14.00              breaks Min.   :12.00  
## 2             breaks 1st Qu.:26.00              breaks 1st Qu.:18.25  
## 3             breaks Median :29.50              breaks Median :27.00  
## 4             breaks Mean   :36.39              breaks Mean   :26.39  
## 5             breaks 3rd Qu.:49.25              breaks 3rd Qu.:33.75  
## 6             breaks Max.   :70.00              breaks Max.   :42.00  
## 7               wool           A:9                wool           A:9  
## 8               wool           B:9                wool           B:9  
## 9               wool            <NA>              wool            <NA>
## 10              wool            <NA>              wool            <NA>
## 11              wool            <NA>              wool            <NA>
## 12              wool            <NA>              wool            <NA>
##    H.Var1     H.Var2          H.Freq
## 1             breaks Min.   :10.00  
## 2             breaks 1st Qu.:15.25  
## 3             breaks Median :20.50  
## 4             breaks Mean   :21.67  
## 5             breaks 3rd Qu.:25.50  
## 6             breaks Max.   :43.00  
## 7               wool           A:9  
## 8               wool           B:9  
## 9               wool            <NA>
## 10              wool            <NA>
## 11              wool            <NA>
## 12              wool            <NA>

# Calls the model.matrix method of as.data.frame() directly on the by() result.
# In the recorded output the single column is labelled with the text of this
# by() call, so rewording the call changes the column header.
as.data.frame.model.matrix(by(warpbreaks[, 1:2],   warpbreaks[, 3],summary))

##                                                                                      by(warpbreaks[, 1:2], warpbreaks[, 3], summary)
## L Min.   :14.00  , 1st Qu.:26.00  , Median :29.50  , Mean   :36.39  , 3rd Qu.:49.25  , Max.   :70.00  , A:9  , B:9  , NA, NA, NA, NA
## M Min.   :12.00  , 1st Qu.:18.25  , Median :27.00  , Mean   :26.39  , 3rd Qu.:33.75  , Max.   :42.00  , A:9  , B:9  , NA, NA, NA, NA
## H Min.   :10.00  , 1st Qu.:15.25  , Median :20.50  , Mean   :21.67  , 3rd Qu.:25.50  , Max.   :43.00  , A:9  , B:9  , NA, NA, NA, NA

complete {tidyr} R Documentation

# complete {tidyr}  R Documentation
# Complete a data frame with missing combinations of data.
# 
# Description
# 
# Turns implicit missing values into explicit missing values. This is a wrapper around expand(), left_join() and replace_na() that's useful for completing missing combinations of data.
# 
# Usage
# 
# complete(data, ..., fill = list())

library(dplyr, warn.conflicts = FALSE)
# Three observed (group, item) rows; the (group 2, item 1) pairing is absent.
# tibble() replaces the deprecated data_frame().
df <- tibble(
  group = c(1:2, 1),
  item_id = c(1:2, 2),
  item_name = c("a", "b", "b"),
  value1 = 1:3,
  value2 = 4:6
)
# Complete every group x (item_id, item_name) pairing; nesting() restricts the
# items to the id/name pairs that occur in the data.
df %>% complete(group, nesting(item_id, item_name))

## # A tibble: 4 x 5
##   group item_id item_name value1 value2
##   <dbl>   <dbl> <chr>      <int>  <int>
## 1  1.00    1.00 a              1      4
## 2  1.00    2.00 b              3      6
## 3  2.00    1.00 a             NA     NA
## 4  2.00    2.00 b              2      5

# You can also choose to fill in missing values: here value1 gets 0 in the
# rows that complete() adds.
df %>%
  complete(
    group,
    nesting(item_id, item_name),
    fill = list(value1 = 0)
  )

## # A tibble: 4 x 5
##   group item_id item_name value1 value2
##   <dbl>   <dbl> <chr>      <dbl>  <int>
## 1  1.00    1.00 a           1.00      4
## 2  1.00    2.00 b           3.00      6
## 3  2.00    1.00 a           0        NA
## 4  2.00    2.00 b           2.00      5

# Several columns can be filled at once by naming each one in the fill list.
complete(
  df,
  group,
  nesting(item_id, item_name),
  fill = list(value1 = 0, value2 = 0)
)

## # A tibble: 4 x 5
##   group item_id item_name value1 value2
##   <dbl>   <dbl> <chr>      <dbl>  <dbl>
## 1  1.00    1.00 a           1.00   4.00
## 2  1.00    2.00 b           3.00   6.00
## 3  2.00    1.00 a           0      0   
## 4  2.00    2.00 b           2.00   5.00

# Without nesting(), all three columns are crossed independently.
complete(df, group, item_id, item_name)

## # A tibble: 8 x 5
##   group item_id item_name value1 value2
##   <dbl>   <dbl> <chr>      <int>  <int>
## 1  1.00    1.00 a              1      4
## 2  1.00    1.00 b             NA     NA
## 3  1.00    2.00 a             NA     NA
## 4  1.00    2.00 b              3      6
## 5  2.00    1.00 a             NA     NA
## 6  2.00    1.00 b             NA     NA
## 7  2.00    2.00 a             NA     NA
## 8  2.00    2.00 b              2      5

Replace missing values

# replace_na {tidyr}    R Documentation
# Replace missing values
# 
# Description
# 
# Replace missing values
# 
# Usage
# 
# replace_na(data, replace = list(), ...)
# Arguments
# 
# data  
# A data frame.
# replace   
# A named list giving the value to replace NA with for each column.
# ...   
# Additional arguments for methods. Currently unused.
# Examples

library(dplyr)
# Two-column example with one NA in each column. tibble() replaces the
# deprecated data_frame().
df <- tibble(x = c(1, 2, NA), y = c("a", NA, "b"))
df

## # A tibble: 3 x 2
##       x y    
##   <dbl> <chr>
## 1  1.00 a    
## 2  2.00 <NA> 
## 3 NA    b

# Replace NA with a per-column default: 0 for x, "unknown" for y.
replace_na(df, list(x = 0, y = "unknown"))

## # A tibble: 3 x 2
##       x y      
##   <dbl> <chr>  
## 1  1.00 a      
## 2  2.00 unknown
## 3  0    b

Expand data frame to include all combinations of values

# expand {tidyr}    R Documentation
# Expand data frame to include all combinations of values
# 
# Description
# 
# expand() is often useful in conjunction with left_join if you want to convert implicit missing values to explicit missing values. Or you can use it in conjunction with anti_join() to figure out which combinations are missing.
# 
# Usage
# 
# expand(data, ...)
# 
# crossing(...)
# 
# crossing_(x)
# 
# nesting(...)
# 
# nesting_(x)

library(dplyr)
# Every vs x cyl combination, including ones that never occur in mtcars.
mtcars %>% expand(vs, cyl)

## # A tibble: 6 x 2
##      vs   cyl
##   <dbl> <dbl>
## 1  0     4.00
## 2  0     6.00
## 3  0     8.00
## 4  1.00  4.00
## 5  1.00  6.00
## 6  1.00  8.00

# Only the vs/cyl pairs that actually appear in mtcars.
mtcars %>% expand(nesting(vs, cyl))

## # A tibble: 5 x 2
##      vs   cyl
##   <dbl> <dbl>
## 1  0     4.00
## 2  0     6.00
## 3  0     8.00
## 4  1.00  4.00
## 5  1.00  6.00

# Implicit missings ---------------------------------------------------------
# Quarterly returns in which 2012 Q4 (and all of 2011) has no row.
# tibble() replaces the deprecated data_frame().
df <- tibble(
  year   = c(2010, 2010, 2010, 2010, 2012, 2012, 2012),
  qtr    = c(   1,    2,    3,    4,    1,    2,    3),
  return = rnorm(7)
)
# Cross the observed years with the observed quarters.
df %>% expand(year, qtr)

## # A tibble: 8 x 2
##    year   qtr
##   <dbl> <dbl>
## 1  2010  1.00
## 2  2010  2.00
## 3  2010  3.00
## 4  2010  4.00
## 5  2012  1.00
## 6  2012  2.00
## 7  2012  3.00
## 8  2012  4.00

# Supply the full year range explicitly so the unobserved 2011 is included.
expand(df, year = 2010:2012, qtr)

## # A tibble: 12 x 2
##     year   qtr
##    <int> <dbl>
##  1  2010  1.00
##  2  2010  2.00
##  3  2010  3.00
##  4  2010  4.00
##  5  2011  1.00
##  6  2011  2.00
##  7  2011  3.00
##  8  2011  4.00
##  9  2012  1.00
## 10  2012  2.00
## 11  2012  3.00
## 12  2012  4.00

# Base R comparison: expand.grid() crosses the vectors as given, so the
# repeated quarter values are kept and the unnamed argument becomes Var2.
qtr <- c(1, 2, 3, 4, 1, 2, 3)
expand.grid(year = 2010:2012, qtr)

##    year Var2
## 1  2010    1
## 2  2011    1
## 3  2012    1
## 4  2010    2
## 5  2011    2
## 6  2012    2
## 7  2010    3
## 8  2011    3
## 9  2012    3
## 10 2010    4
## 11 2011    4
## 12 2012    4
## 13 2010    1
## 14 2011    1
## 15 2012    1
## 16 2010    2
## 17 2011    2
## 18 2012    2
## 19 2010    3
## 20 2011    3
## 21 2012    3

# Let full_seq() derive the complete run of years from the data.
expand(df, year = full_seq(year, 1), qtr)

## # A tibble: 12 x 2
##     year   qtr
##    <dbl> <dbl>
##  1  2010  1.00
##  2  2010  2.00
##  3  2010  3.00
##  4  2010  4.00
##  5  2011  1.00
##  6  2011  2.00
##  7  2011  3.00
##  8  2011  4.00
##  9  2012  1.00
## 10  2012  2.00
## 11  2012  3.00
## 12  2012  4.00

# Same grid, but complete() keeps the return column and fills the gaps with NA.
complete(df, year = full_seq(year, 1), qtr)

## # A tibble: 12 x 3
##     year   qtr  return
##    <dbl> <dbl>   <dbl>
##  1  2010  1.00   0.846
##  2  2010  2.00 - 0.197
##  3  2010  3.00   0.283
##  4  2010  4.00   0.173
##  5  2011  1.00  NA    
##  6  2011  2.00  NA    
##  7  2011  3.00  NA    
##  8  2011  4.00  NA    
##  9  2012  1.00   0.218
## 10  2012  2.00 - 0.506
## 11  2012  3.00   0.262
## 12  2012  4.00  NA

# Nesting -------------------------------------------------------------------
# Each person was given one of two treatments, repeated three times
# But some of the replications haven't happened yet, so we have
# incomplete data (tibble() replaces the deprecated data_frame()):
experiment <- tibble(
  name = rep(c("Alex", "Robert", "Sam"), c(3, 2, 1)),
  trt  = rep(c("a", "b", "a"), c(3, 2, 1)),
  rep = c(1, 2, 3, 1, 2, 1),
  measurment_1 = runif(6),
  measurment_2 = runif(6)
)

# We can figure out the complete set of data with expand()
# Each person only gets one treatment, so we nest name and trt together:
all <- experiment %>% expand(nesting(name, trt), rep)
all

## # A tibble: 9 x 3
##   name   trt     rep
##   <chr>  <chr> <dbl>
## 1 Alex   a      1.00
## 2 Alex   a      2.00
## 3 Alex   a      3.00
## 4 Robert b      1.00
## 5 Robert b      2.00
## 6 Robert b      3.00
## 7 Sam    a      1.00
## 8 Sam    a      2.00
## 9 Sam    a      3.00

# We can use anti_join to figure out which observations are missing.
# The join keys are spelled out so the result does not depend on whichever
# columns the two tables happen to share.
all %>% anti_join(experiment, by = c("name", "trt", "rep"))

## # A tibble: 3 x 3
##   name   trt     rep
##   <chr>  <chr> <dbl>
## 1 Robert b      3.00
## 2 Sam    a      2.00
## 3 Sam    a      3.00

# And use right_join to add in the appropriate missing values to the
# original data (join keys given explicitly rather than inferred).
experiment %>% right_join(all, by = c("name", "trt", "rep"))

## # A tibble: 9 x 5
##   name   trt     rep measurment_1 measurment_2
##   <chr>  <chr> <dbl>        <dbl>        <dbl>
## 1 Alex   a      1.00      0.0669        0.0381
## 2 Alex   a      2.00      0.134         0.724 
## 3 Alex   a      3.00      0.353         0.827 
## 4 Robert b      1.00      0.00973       0.391 
## 5 Robert b      2.00      0.874         0.0731
## 6 Robert b      3.00     NA            NA     
## 7 Sam    a      1.00      0.675         0.178 
## 8 Sam    a      2.00     NA            NA     
## 9 Sam    a      3.00     NA            NA

# Or use the complete() short-hand, which expands and joins in one step.
complete(experiment, nesting(name, trt), rep)

## # A tibble: 9 x 5
##   name   trt     rep measurment_1 measurment_2
##   <chr>  <chr> <dbl>        <dbl>        <dbl>
## 1 Alex   a      1.00      0.0669        0.0381
## 2 Alex   a      2.00      0.134         0.724 
## 3 Alex   a      3.00      0.353         0.827 
## 4 Robert b      1.00      0.00973       0.391 
## 5 Robert b      2.00      0.874         0.0731
## 6 Robert b      3.00     NA            NA     
## 7 Sam    a      1.00      0.675         0.178 
## 8 Sam    a      2.00     NA            NA     
## 9 Sam    a      3.00     NA            NA

# The scoped variants of summarise() and mutate() apply one transformation to
# many variables at once; here, the mean of every column per Species.

summarise_all(group_by(iris, Species), mean)

## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03

# There are three variants.
# * _all affects every variable
# * _at affects variables selected with a character vector or vars()
# * _if affects variables selected with a predicate function:

# _at with vars(): select the height-to-mass column range.
summarise_at(starwars, vars(height:mass), mean, na.rm = TRUE)

## # A tibble: 1 x 2
##   height  mass
##    <dbl> <dbl>
## 1    174  97.3

# _at with a character vector of column names.
summarise_at(starwars, c("height", "mass"), mean, na.rm = TRUE)

## # A tibble: 1 x 2
##   height  mass
##    <dbl> <dbl>
## 1    174  97.3

# _if with a predicate: every numeric column.
summarise_if(starwars, is.numeric, mean, na.rm = TRUE)

## # A tibble: 1 x 3
##   height  mass birth_year
##    <dbl> <dbl>      <dbl>
## 1    174  97.3       87.6

# mutate_if is particularly useful for transforming variables from
# one type to another, e.g. factor columns to character.
mutate_if(as_tibble(iris), is.factor, as.character)

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1         5.10        3.50         1.40       0.200 setosa 
##  2         4.90        3.00         1.40       0.200 setosa 
##  3         4.70        3.20         1.30       0.200 setosa 
##  4         4.60        3.10         1.50       0.200 setosa 
##  5         5.00        3.60         1.40       0.200 setosa 
##  6         5.40        3.90         1.70       0.400 setosa 
##  7         4.60        3.40         1.40       0.300 setosa 
##  8         5.00        3.40         1.50       0.200 setosa 
##  9         4.40        2.90         1.40       0.200 setosa 
## 10         4.90        3.10         1.50       0.100 setosa 
## # ... with 140 more rows

# Double columns to integer.
mutate_if(as_tibble(iris), is.double, as.integer)

## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <int>       <int>        <int>       <int> <fct>  
##  1            5           3            1           0 setosa 
##  2            4           3            1           0 setosa 
##  3            4           3            1           0 setosa 
##  4            4           3            1           0 setosa 
##  5            5           3            1           0 setosa 
##  6            5           3            1           0 setosa 
##  7            4           3            1           0 setosa 
##  8            5           3            1           0 setosa 
##  9            4           2            1           0 setosa 
## 10            4           3            1           0 setosa 
## # ... with 140 more rows

# ---------------------------------------------------------------------------
# If you want to apply multiple transformations, pass a named list of
# functions (funs() is deprecated since dplyr 0.8.0). The list names become
# the suffixes of the output columns, e.g. Sepal.Length_min.
by_species <- iris %>% group_by(Species)

by_species %>% summarise_all(list(min = min, max = max))

## # A tibble: 3 x 9
##   Species    Sepal.Length_min Sepal.Width_min Petal.Length_min
##   <fct>                 <dbl>           <dbl>            <dbl>
## 1 setosa                 4.30            2.30             1.00
## 2 versicolor             4.90            2.00             3.00
## 3 virginica              4.90            2.20             4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## #   Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## #   Petal.Width_max <dbl>

# Divide every non-grouping column by 2.54 in place, using a formula lambda
# (. is the column) instead of the deprecated funs().
by_species %>% mutate_all(~ . / 2.54)

## # A tibble: 150 x 5
## # Groups:   Species [3]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1         2.01        1.38        0.551      0.0787 setosa 
##  2         1.93        1.18        0.551      0.0787 setosa 
##  3         1.85        1.26        0.512      0.0787 setosa 
##  4         1.81        1.22        0.591      0.0787 setosa 
##  5         1.97        1.42        0.551      0.0787 setosa 
##  6         2.13        1.54        0.669      0.157  setosa 
##  7         1.81        1.34        0.551      0.118  setosa 
##  8         1.97        1.34        0.591      0.0787 setosa 
##  9         1.73        1.14        0.551      0.0787 setosa 
## 10         1.93        1.22        0.591      0.0394 setosa 
## # ... with 140 more rows

# Function names will be included if .funs has names or multiple inputs.
# A named list of lambdas replaces the deprecated funs(); the name "cm" is
# appended to each new column.
by_species %>% mutate_all(list(cm = ~ . / 2.54))

## # A tibble: 150 x 9
## # Groups:   Species [3]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1         5.10        3.50         1.40       0.200 setosa 
##  2         4.90        3.00         1.40       0.200 setosa 
##  3         4.70        3.20         1.30       0.200 setosa 
##  4         4.60        3.10         1.50       0.200 setosa 
##  5         5.00        3.60         1.40       0.200 setosa 
##  6         5.40        3.90         1.70       0.400 setosa 
##  7         4.60        3.40         1.40       0.300 setosa 
##  8         5.00        3.40         1.50       0.200 setosa 
##  9         4.40        2.90         1.40       0.200 setosa 
## 10         4.90        3.10         1.50       0.100 setosa 
## # ... with 140 more rows, and 4 more variables: Sepal.Length_cm <dbl>,
## #   Sepal.Width_cm <dbl>, Petal.Length_cm <dbl>, Petal.Width_cm <dbl>

# Per-species median of every column, suffixed with "_med" (named list instead
# of the deprecated funs()).
by_species %>% summarise_all(list(med = median))

## # A tibble: 3 x 5
##   Species    Sepal.Length_med Sepal.Width_med Petal.Length_med
##   <fct>                 <dbl>           <dbl>            <dbl>
## 1 setosa                 5.00            3.40             1.50
## 2 versicolor             5.90            2.80             4.35
## 3 virginica              6.50            3.00             5.55
## # ... with 1 more variable: Petal.Width_med <dbl>

# Per-species third quartile: probs = 0.75 is forwarded to quantile() for each
# column (named list instead of the deprecated funs()).
by_species %>% summarise_all(list(Q3 = quantile), probs = 0.75)

## # A tibble: 3 x 5
##   Species    Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3
##   <fct>                <dbl>          <dbl>           <dbl>          <dbl>
## 1 setosa                5.20           3.68            1.58          0.300
## 2 versicolor            6.30           3.00            4.60          1.50 
## 3 virginica             6.90           3.18            5.88          2.30

# Functions can also be given by name as a character vector.
summarise_all(by_species, c("min", "max"))

## # A tibble: 3 x 9
##   Species    Sepal.Length_min Sepal.Width_min Petal.Length_min
##   <fct>                 <dbl>           <dbl>            <dbl>
## 1 setosa                 4.30            2.30             1.00
## 2 versicolor             4.90            2.00             3.00
## 3 virginica              4.90            2.20             4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## #   Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## #   Petal.Width_max <dbl>

dplyr::do

library(dplyr)
library(tidyverse)


# Foods eaten by each person, one row per (person, food) pair.
person <- c('Grace', 'Grace', 'Grace', 'Rob', 'Rob', 'Rob')
foods <- c('apple', 'banana', 'cucumber', 'spaghetti', 'cucumber', 'banana')

eaten <- data.frame(person, foods, stringsAsFactors = FALSE)

#eaten %>% group_by(person) %>% do(function(x) combn(x$foods, m = 2))

# Group mtcars by cyl, then take the first two rows of every group with do().
by_cyl <- mtcars %>% group_by(cyl)
#by_cyl
#mtcars
by_cyl %>% do(head(., 2))

## # A tibble: 6 x 11
## # Groups:   cyl [3]
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  22.8  4.00   108  93.0  3.85  2.32  18.6  1.00  1.00  4.00  1.00
## 2  24.4  4.00   147  62.0  3.69  3.19  20.0  1.00  0     4.00  2.00
## 3  21.0  6.00   160 110    3.90  2.62  16.5  0     1.00  4.00  4.00
## 4  21.0  6.00   160 110    3.90  2.88  17.0  0     1.00  4.00  4.00
## 5  18.7  8.00   360 175    3.15  3.44  17.0  0     0     3.00  2.00
## 6  14.3  8.00   360 245    3.21  3.57  15.8  0     0     3.00  4.00

# Contrast with do(): head() is not group-aware, so this returns only the
# first two rows of the whole table.
mtcars %>%
  group_by(cyl) %>%
  head(2)

## # A tibble: 2 x 11
## # Groups:   cyl [1]
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  21.0  6.00   160   110  3.90  2.62  16.5     0  1.00  4.00  4.00
## 2  21.0  6.00   160   110  3.90  2.88  17.0     0  1.00  4.00  4.00

# Fit one linear model per cylinder group. Naming the do() argument stores
# each fitted lm object in a list-column called `mod`.
models <- do(by_cyl, mod = lm(mpg ~ disp, data = .))
models

## Source: local data frame [3 x 2]
## Groups: <by row>
## 
## # A tibble: 3 x 2
##     cyl mod     
## * <dbl> <list>  
## 1  4.00 <S3: lm>
## 2  6.00 <S3: lm>
## 3  8.00 <S3: lm>

# R-squared of each per-group fit: one value per row of `models`.
models %>%
  summarise(rsq = summary(mod)$r.squared)

## # A tibble: 3 x 1
##      rsq
##    <dbl>
## 1 0.648 
## 2 0.0106
## 3 0.270

# Stack the coefficients of every model into a single `coef` column
# (intercept and slope for each group).
do(models, data.frame(coef = coef(.$mod)))

## Source: local data frame [6 x 1]
## Groups: <by row>
## 
## # A tibble: 6 x 1
##        coef
## *     <dbl>
## 1  40.9    
## 2 - 0.135  
## 3  19.1    
## 4   0.00361
## 5  22.0    
## 6 - 0.0196

# Full coefficient table per model: term name plus the estimate, standard
# error, t value and p-value columns taken from summary().
do(
  models,
  data.frame(var = names(coef(.$mod)), coef(summary(.$mod)))
)

## Source: local data frame [6 x 5]
## Groups: <by row>
## 
## # A tibble: 6 x 5
##   var          Estimate Std..Error t.value   Pr...t..
## * <fct>           <dbl>      <dbl>   <dbl>      <dbl>
## 1 (Intercept)  40.9        3.59     11.4   0.00000120
## 2 disp        - 0.135      0.0332  - 4.07  0.00278   
## 3 (Intercept)  19.1        2.91      6.55  0.00124   
## 4 disp          0.00361    0.0156    0.232 0.826     
## 5 (Intercept)  22.0        3.35      6.59  0.0000259 
## 6 disp        - 0.0196     0.00932 - 2.11  0.0568

# Fit mpg ~ disp separately within each cylinder group; the fitted lm objects
# land in the list-column `mod`.
models <- do(by_cyl, mod = lm(mpg ~ disp, data = .))
models

## Source: local data frame [3 x 2]
## Groups: <by row>
## 
## # A tibble: 3 x 2
##     cyl mod     
## * <dbl> <list>  
## 1  4.00 <S3: lm>
## 2  6.00 <S3: lm>
## 3  8.00 <S3: lm>

# Pull the R-squared out of each stored model, one row per model.
models %>%
  summarise(rsq = summary(mod)$r.squared)

## # A tibble: 3 x 1
##      rsq
##    <dbl>
## 1 0.648 
## 2 0.0106
## 3 0.270

# One row per coefficient across all models, collected in column `coef`.
do(models, data.frame(coef = coef(.$mod)))

## Source: local data frame [6 x 1]
## Groups: <by row>
## 
## # A tibble: 6 x 1
##        coef
## *     <dbl>
## 1  40.9    
## 2 - 0.135  
## 3  19.1    
## 4   0.00361
## 5  22.0    
## 6 - 0.0196

# Coefficient summary for every model: `var` holds the term name and the
# remaining columns come from coef(summary(.)).
do(
  models,
  data.frame(var = names(coef(.$mod)), coef(summary(.$mod)))
)

## Source: local data frame [6 x 5]
## Groups: <by row>
## 
## # A tibble: 6 x 5
##   var          Estimate Std..Error t.value   Pr...t..
## * <fct>           <dbl>      <dbl>   <dbl>      <dbl>
## 1 (Intercept)  40.9        3.59     11.4   0.00000120
## 2 disp        - 0.135      0.0332  - 4.07  0.00278   
## 3 (Intercept)  19.1        2.91      6.55  0.00124   
## 4 disp          0.00361    0.0156    0.232 0.826     
## 5 (Intercept)  22.0        3.35      6.59  0.0000259 
## 6 disp        - 0.0196     0.00932 - 2.11  0.0568

# Several named arguments to do() yield several list-columns: here a linear
# and a quadratic fit of mpg on disp for each cylinder group.
models <- do(
  by_cyl,
  mod_linear = lm(mpg ~ disp, data = .),
  mod_quad = lm(mpg ~ poly(disp, 2), data = .)
)
models

## Source: local data frame [3 x 3]
## Groups: <by row>
## 
## # A tibble: 3 x 3
##     cyl mod_linear mod_quad
## * <dbl> <list>     <list>  
## 1  4.00 <S3: lm>   <S3: lm>
## 2  6.00 <S3: lm>   <S3: lm>
## 3  8.00 <S3: lm>   <S3: lm>

# Compare the linear and quadratic fit within each group via anova(); the
# resulting tables are stored in the list-column `aov`.
compare <- do(models, aov = anova(.$mod_linear, .$mod_quad))
# Not run (left disabled in the original):
# compare %>% summarise(p.value = aov$`Pr(>F)`)

# Optional example: runs only when the nycflights13 data package is installed.
# requireNamespace() checks availability without warning when it is missing;
# the package is then attached so that `flights` is visible, as before.
if (requireNamespace("nycflights13", quietly = TRUE)) {
  library(nycflights13)

  # do() can run any arbitrary computation per group, such as fitting a linear
  # model. Here: how arrival delay varies with departure time, fitted
  # separately for each carrier.
  carriers <- group_by(flights, carrier)
  # Inside a braced block only the last value is auto-printed, so the
  # intermediate results are printed explicitly.
  print(group_size(carriers))

  mods <- do(carriers, mod = lm(arr_delay ~ dep_time, data = .))
  print(mods %>% do(as.data.frame(coef(.$mod))))
  print(mods %>% summarise(rsq = summary(mod)$r.squared))

  # Longer example (slow): one GAM per destination, restricted to
  # destinations with more than 100 flights.
  by_dest <- flights %>%
    group_by(dest) %>%
    filter(n() > 100)
  library(mgcv)
  by_dest %>% do(smooth = gam(arr_delay ~ s(dep_time) + month, data = .))
}

## debug at <text>#24: carriers <- group_by(flights, carrier)
## debug at <text>#25: group_size(carriers)
## debug at <text>#27: mods <- do(carriers, mod = lm(arr_delay ~ dep_time, data = .))
## debug at <text>#28: mods %>% do(as.data.frame(coef(.$mod)))
## debug at <text>#29: mods %>% summarise(rsq = summary(mod)$r.squared)
## debug at <text>#37: by_dest <- flights %>% group_by(dest) %>% filter(n() > 100)
## debug at <text>#38: library(mgcv)
## debug at <text>#39: by_dest %>% do(smooth = gam(arr_delay ~ s(dep_time) + month, 
##     data = .))

## Source: local data frame [93 x 2]
## Groups: <by row>
## 
## # A tibble: 93 x 2
##    dest  smooth   
##  * <chr> <list>   
##  1 ABQ   <S3: gam>
##  2 ACK   <S3: gam>
##  3 ALB   <S3: gam>
##  4 ATL   <S3: gam>
##  5 AUS   <S3: gam>
##  6 AVL   <S3: gam>
##  7 BDL   <S3: gam>
##  8 BGR   <S3: gam>
##  9 BHM   <S3: gam>
## 10 BNA   <S3: gam>
## # ... with 83 more rows

# Shut down the worker cluster `cl` created at the top of the script.
parallel::stopCluster(cl)

Dplyr Tutorial 1

Nana Boateng

February 04, 2018