library(parallel)
# Worker count: leave one core free for the interactive session, but never go
# below one. detectCores() may return NA (unknown) or 1, in which case a plain
# `detectCores() - 1` would be NA or 0 and makeCluster() would fail.
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
# Initiate cluster. FORK clusters are not available on Windows, so fall back
# to PSOCK there. Call stopCluster(cl) once the cluster is no longer needed.
cl <- makeCluster(
  no_cores,
  type = if (.Platform$OS.type == "windows") "PSOCK" else "FORK"
)
# Bind summarise()/select() to the dplyr implementations under global names
# (presumably so that same-named functions from other packages cannot mask
# them).
summarise <- dplyr::summarise
# Packages to attach. Named `pkgs` rather than `list` so that a character
# vector does not shadow base::list in the global environment (passing `list`
# as a function value, e.g. lapply(x, list), would otherwise break).
pkgs <- c("tidyverse")
# Attach each package quietly; R keeps what library() returned for each one.
R <- suppressWarnings(suppressMessages(
  sapply(pkgs, library, character.only = TRUE)
))
select <- dplyr::select
# For the first six iris rows, add Max.Len: the larger of sepal length and
# petal length, computed one row at a time.
iris %>%
  head() %>%
  rowwise() %>%
  mutate(Max.Len = max(Sepal.Length, Petal.Length))
## Source: local data frame [6 x 6]
## Groups: <by row>
##
## # A tibble: 6 x 6
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Max.Len
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 5.10 3.50 1.40 0.200 setosa 5.10
## 2 4.90 3.00 1.40 0.200 setosa 4.90
## 3 4.70 3.20 1.30 0.200 setosa 4.70
## 4 4.60 3.10 1.50 0.200 setosa 4.60
## 5 5.00 3.60 1.40 0.200 setosa 5.00
## 6 5.40 3.90 1.70 0.400 setosa 5.40
# 3 x 3 integer tibble. as_tibble() replaces the deprecated as.tibble() alias
# (and matches the rest of this file); the column names V1..V3 are set
# explicitly so they do not depend on tibble's version-specific auto-naming
# of unnamed matrix columns.
a <- as_tibble(matrix(1:9, 3, 3, dimnames = list(NULL, c("V1", "V2", "V3"))))
#a%>%rowwise()mutate(Meancol= mean(.))
#a%>%colMeans(.,na.rm=TRUE)
# NOTE: colMeans() yields one value per *column*; it only fits as a new column
# here because the table happens to be square (3 rows, 3 columns).
a %>% mutate(col = colMeans(., na.rm = TRUE))
## # A tibble: 3 x 4
## V1 V2 V3 col
## <int> <int> <int> <dbl>
## 1 1 4 7 2.00
## 2 2 5 8 5.00
## 3 3 6 9 8.00
# Row-wise mean across V1..V3. mean() averages only its first argument (its
# second positional argument is `trim`), so the three values must be combined
# with c(); the earlier mean(V1, V2, V3) simply returned V1.
a %>%
  rowwise() %>%
  mutate(col2 = mean(c(V1, V2, V3), na.rm = TRUE))
## Source: local data frame [3 x 4]
## Groups: <by row>
##
## # A tibble: 3 x 4
## V1 V2 V3 col2
## <int> <int> <int> <dbl>
## 1 1 4 7 4.00
## 2 2 5 8 5.00
## 3 3 6 9 6.00
# Slices of mtcars used by the binding examples: rows 1-4, rows 5-8, row 5.
one <- head(mtcars, 4)
two <- mtcars[seq(5, 8), ]
three <- mtcars[5, ]
#bind_cols(one, two)
#bind_cols(list(one, two))
# Stack the first slice on top of the single-row slice.
one %>% bind_rows(three)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Every (x, y) pairing of 1:3 with 3:1, then, row by row, the integer
# sequence running from x to y stored in a list column named i.
df <- expand.grid(x = 1:3, y = 3:1)
df %>%
  rowwise() %>%
  do(i = seq(.$x, .$y))
## Source: local data frame [9 x 1]
## Groups: <by row>
##
## # A tibble: 9 x 1
## i
## * <list>
## 1 <int [3]>
## 2 <int [2]>
## 3 <int [1]>
## 4 <int [2]>
## 5 <int [1]>
## 6 <int [2]>
## 7 <int [1]>
## 8 <int [2]>
## 9 <int [3]>
#.Last.value %>% dplyr::summarize(n = length(i))
# Two vectors whose missing values sit in different positions.
x <- c(1, 2, NA, 4, NA, 6)
y <- c(NA, 2, 3, 4, 5, NA)
# Take the first non-missing value at each position:
x %>% coalesce(y)
## [1] 1 2 3 4 5 6
# Fall back to a constant wherever x is missing:
x %>% coalesce(0)
## [1] 1 2 0 4 0 6
# Distinct values found in either vector, with the NA dropped afterwards.
na.omit(union(x, y))
## [1] 1 2 4 6 3 5
## attr(,"na.action")
## [1] 3
## attr(,"class")
## [1] "omit"
# Plain concatenation: duplicates and NAs are kept.
union_all(x, y)
## [1] 1 2 NA 4 NA 6 NA 2 3 4 5 NA
# Turn a sentinel value (-99) into NA.
x <- c(1, 5, 2, -99, -99, 10)
x %>% na_if(-99)
## [1] 1 5 2 NA NA 10
# Ten random draws from "a", "b", "c" and NA (unseeded, so values vary by run).
x <- sample(c("a", "b", "c", NA), size = 10, replace = TRUE)
# Values without a replacement are left untouched by default
x %>% recode(a = "Apple")
## [1] "Apple" NA "c" "c" "c" "b" "c" "b"
## [9] "b" "c"
# .default sets the result for every value that has no replacement:
x %>% recode(a = "Apple", .default = NA_character_)
## [1] "Apple" NA NA NA NA NA NA NA
## [9] NA NA
# .missing sets the result for inputs that are NA
x %>% recode(a = "Apple", .default = NA_character_, .missing = "Unknown")
## [1] "Apple" "Unknown" NA NA NA NA NA
## [8] NA NA NA
# case_when() is a vectorised set of if and else-if statements. You provide it
# a set of test-result pairs as formulas: the left-hand side of each formula
# should return a logical vector, and the right-hand side should return either
# a single value or a vector the same length as the left-hand side. All
# results must be the same type of vector.
# Label 1..40: multiples of 35 first, then of 5, then of 7; every other
# number is kept as its own text. The first matching condition wins.
x <- seq_len(40)
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0  ~ "fizz",
  x %% 7 == 0  ~ "buzz",
  TRUE         ~ as.character(x)
)
## [1] "1" "2" "3" "4" "fizz"
## [6] "6" "buzz" "8" "9" "fizz"
## [11] "11" "12" "13" "buzz" "fizz"
## [16] "16" "17" "18" "19" "fizz"
## [21] "buzz" "22" "23" "24" "fizz"
## [26] "26" "27" "buzz" "29" "fizz"
## [31] "31" "32" "33" "34" "fizz buzz"
## [36] "36" "37" "38" "39" "fizz"
# mutate_if() converts every column that satisfies a predicate; here each
# factor column of iris becomes character.
mutate_if(as_tibble(iris), is.factor, as.character)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## 4 4.60 3.10 1.50 0.200 setosa
## 5 5.00 3.60 1.40 0.200 setosa
## 6 5.40 3.90 1.70 0.400 setosa
## 7 4.60 3.40 1.40 0.300 setosa
## 8 5.00 3.40 1.50 0.200 setosa
## 9 4.40 2.90 1.40 0.200 setosa
## 10 4.90 3.10 1.50 0.100 setosa
## # ... with 140 more rows
# Convert every double column of iris to integer.
mutate_if(as_tibble(iris), is.double, as.integer)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <int> <int> <int> <int> <fct>
## 1 5 3 1 0 setosa
## 2 4 3 1 0 setosa
## 3 4 3 1 0 setosa
## 4 4 3 1 0 setosa
## 5 5 3 1 0 setosa
## 6 5 3 1 0 setosa
## 7 4 3 1 0 setosa
## 8 5 3 1 0 setosa
## 9 4 2 1 0 setosa
## 10 4 3 1 0 setosa
## # ... with 140 more rows
# First three rows of the raw data, for reference.
head(iris, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
# Summarising one column per species with plain summarise(), followed by the
# scoped variant that applies the same function to every non-grouping column:
dplyr::summarise(
  group_by(iris, Species),
  sepal = mean(Sepal.Length, na.rm = TRUE)
)
## # A tibble: 3 x 2
## Species sepal
## <fct> <dbl>
## 1 setosa 5.01
## 2 versicolor 5.94
## 3 virginica 6.59
summarise_all(group_by(iris, Species), mean)
## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46 0.246
## 2 versicolor 5.94 2.77 4.26 1.33
## 3 virginica 6.59 2.97 5.55 2.03
# The scoped verbs come in three flavours:
# * _all acts on every variable
# * _at acts on variables picked with a character vector or vars()
# * _if acts on variables picked with a predicate function:
summarise_at(starwars, vars(height:mass), mean, na.rm = TRUE)
## # A tibble: 1 x 2
## height mass
## <dbl> <dbl>
## 1 174 97.3
summarise_at(starwars, c("height", "mass"), mean, na.rm = TRUE)
## # A tibble: 1 x 2
## height mass
## <dbl> <dbl>
## 1 174 97.3
summarise_if(starwars, is.numeric, mean, na.rm = TRUE)
## # A tibble: 1 x 3
## height mass birth_year
## <dbl> <dbl> <dbl>
## 1 174 97.3 87.6
# mutate_if() converts every column that satisfies a predicate; here each
# factor column of iris becomes character.
#mutate_if(is.character, str_to_lower)
mutate_if(as_tibble(iris), is.factor, as.character)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## 4 4.60 3.10 1.50 0.200 setosa
## 5 5.00 3.60 1.40 0.200 setosa
## 6 5.40 3.90 1.70 0.400 setosa
## 7 4.60 3.40 1.40 0.300 setosa
## 8 5.00 3.40 1.50 0.200 setosa
## 9 4.40 2.90 1.40 0.200 setosa
## 10 4.90 3.10 1.50 0.100 setosa
## # ... with 140 more rows
# Convert every double column of iris to integer.
mutate_if(as_tibble(iris), is.double, as.integer)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <int> <int> <int> <int> <fct>
## 1 5 3 1 0 setosa
## 2 4 3 1 0 setosa
## 3 4 3 1 0 setosa
## 4 4 3 1 0 setosa
## 5 5 3 1 0 setosa
## 6 5 3 1 0 setosa
## 7 4 3 1 0 setosa
## 8 5 3 1 0 setosa
## 9 4 2 1 0 setosa
## 10 4 3 1 0 setosa
## # ... with 140 more rows
# In each numeric column, swap NA observations for 0 and keep everything else.
iris %>%
  mutate_if(
    is.numeric,
    funs(ifelse(is.na(.), 0, .))
  )
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
# Apply coalesce() to every numeric column of iris. The extra `... = 0`
# argument is presumably meant to be forwarded to coalesce() as the
# replacement for NA -- NOTE(review): confirm this spelling still works with
# the installed dplyr. The printed result below shows the iris values
# unchanged.
iris%>% mutate_if(is.numeric, coalesce, ... = 0)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
pacman::p_load("lubridate", "dplyr", "magrittr")
# Small example frame: two character score columns and two POSIXct (UTC)
# timestamp columns built from epoch seconds.
test <- data.frame(
  aroma = c("5/10", "2/10", "3/10"),
  taste = c("2/20", "5/20", "15/20"),
  orderdt = structure(
    c(1343692800, 1360022400, 1381968000),
    class = c("POSIXct", "POSIXt"),
    tzone = "UTC"
  ),
  shipdt = structure(
    c(1343692800, 1360022400, 1381968000),
    class = c("POSIXct", "POSIXt"),
    tzone = "UTC"
  ),
  stringsAsFactors = FALSE
)
str(test)
## 'data.frame': 3 obs. of 4 variables:
## $ aroma : chr "5/10" "2/10" "3/10"
## $ taste : chr "2/20" "5/20" "15/20"
## $ orderdt: POSIXct, format: "2012-07-31" "2013-02-05" ...
## $ shipdt : POSIXct, format: "2012-07-31" "2013-02-05" ...
# Date-time columns become Date, then character columns become factors.
test <- test %>%
  mutate_if(is.POSIXt, as.Date) %>%
  mutate_if(is.character, as.factor)
glimpse(test)
## Observations: 3
## Variables: 4
## $ aroma <fct> 5/10, 2/10, 3/10
## $ taste <fct> 2/20, 5/20, 15/20
## $ orderdt <date> 2012-07-31, 2013-02-05, 2013-10-17
## $ shipdt <date> 2012-07-31, 2013-02-05, 2013-10-17
# ---------------------------------------------------------------------------
# Several transformations at once: wrap them in funs()
by_species <- group_by(iris, Species)
summarise_all(by_species, funs(min, max))
## # A tibble: 3 x 9
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 4.30 2.30 1.00
## 2 versicolor 4.90 2.00 3.00
## 3 virginica 4.90 2.20 4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## # Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## # Petal.Width_max <dbl>
# The output column names above carry the function name as a suffix so the
# results stay distinct.
# Inline expressions are written with `.` standing for the current column.
by_species %>%
  mutate_all(funs(. / 2.54)) %>%
  head(3)
## # A tibble: 3 x 5
## # Groups: Species [1]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 2.01 1.38 0.551 0.0787 setosa
## 2 1.93 1.18 0.551 0.0787 setosa
## 3 1.85 1.26 0.512 0.0787 setosa
# Naming the entry in funs() (or giving several) adds suffixed columns
# instead of overwriting the originals.
by_species %>%
  mutate_all(funs(cm = . / 2.54)) %>%
  head(3)
## # A tibble: 3 x 9
## # Groups: Species [1]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## # ... with 4 more variables: Sepal.Length_cm <dbl>, Sepal.Width_cm <dbl>,
## # Petal.Length_cm <dbl>, Petal.Width_cm <dbl>
# Named single function: result columns get the "_med" suffix.
summarise_all(by_species, funs(med = median))
## # A tibble: 3 x 5
## Species Sepal.Length_med Sepal.Width_med Petal.Length_med
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 5.00 3.40 1.50
## 2 versicolor 5.90 2.80 4.35
## 3 virginica 6.50 3.00 5.55
## # ... with 1 more variable: Petal.Width_med <dbl>
# Extra arguments (probs = 0.75) are handed on to the summary function.
summarise_all(by_species, funs(Q3 = quantile), probs = 0.75)
## # A tibble: 3 x 5
## Species Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.20 3.68 1.58 0.300
## 2 versicolor 6.30 3.00 4.60 1.50
## 3 virginica 6.90 3.18 5.88 2.30
# Functions can also be given by name as a character vector.
summarise_all(by_species, c("min", "max"))
## # A tibble: 3 x 9
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 4.30 2.30 1.00
## 2 versicolor 4.90 2.00 3.00
## 3 virginica 4.90 2.20 4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## # Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## # Petal.Width_max <dbl>
# Same "_cm" conversion as with mutate_all(), limited to numeric columns.
by_species %>%
  mutate_if(is.numeric, funs(cm = . / 2.54)) %>%
  head(3)
## # A tibble: 3 x 9
## # Groups: Species [1]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## # ... with 4 more variables: Sepal.Length_cm <dbl>, Sepal.Width_cm <dbl>,
## # Petal.Length_cm <dbl>, Petal.Width_cm <dbl>
library(tidyverse)
# Reload the built-in iris data and show its first three rows.
data("iris")
head(iris, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
# Per-species summary of Petal.Length: min, max, mean, sd, group size and the
# standard error (sd divided by the square root of the group size).
Iris_summary2 <- iris %>%
  group_by(Species) %>%
  summarise_at(
    "Petal.Length",
    funs(min, max, mean(., na.rm = TRUE), sd, n(), SE_PL = sd(.) / sqrt(n()))
  )
Iris_summary2
## # A tibble: 3 x 7
## Species min max mean sd n SE_PL
## <fct> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 setosa 1.00 1.90 1.46 0.174 50 0.0246
## 2 versicolor 3.00 5.10 4.26 0.470 50 0.0665
## 3 virginica 4.50 6.90 5.55 0.552 50 0.0780
# From http://stackoverflow.com/questions/1181060
# Ten days of simulated prices for three stocks (unseeded, so the printed
# values differ between runs). Built with tibble(); data_frame() is a
# deprecated alias for it.
stocks <- tibble(
  time = as.Date("2009-01-01") + 0:9,
  X = rnorm(10, 0, 1),
  Y = rnorm(10, 0, 2),
  Z = rnorm(10, 0, 4)
)
# Wide -> long: every column except time becomes a (stock, price) pair.
gather(stocks, stock, price, -time) %>% head(3)
## # A tibble: 3 x 3
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X 2.05
## 2 2009-01-02 X 0.382
## 3 2009-01-03 X 0.564
# The same reshape written as a pipeline.
stocks %>% gather(stock, price, -time) %>% head(3)
## # A tibble: 3 x 3
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X 2.05
## 2 2009-01-02 X 0.382
## 3 2009-01-03 X 0.564
# First observation of each species, picked by row number (base R).
mini_iris <- iris[c(1, 51, 101), ]
# Gather the four measurement columns, naming each one explicitly.
#gather(mini_iris, key = flower_att, value = measurement,Species)
mini_iris %>%
  gather(
    key = flower_att, value = measurement,
    Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
  ) %>%
  head(3)
## Species flower_att measurement
## 1 setosa Sepal.Length 5.1
## 2 versicolor Sepal.Length 7.0
## 3 virginica Sepal.Length 6.3
head(gather(mini_iris, key = flower_att, value = measurement, -Species), 3)
## Species flower_att measurement
## 1 setosa Sepal.Length 5.1
## 2 versicolor Sepal.Length 7.0
## 3 virginica Sepal.Length 6.3
# Shorter spelling with the same result: gather everything except Species.
head(gather(mini_iris, key = flower_att, value = measurement, -Species), 3)
## Species flower_att measurement
## 1 setosa Sepal.Length 5.1
## 2 versicolor Sepal.Length 7.0
## 3 virginica Sepal.Length 6.3
# The same example with dplyr: keep the first row of every species group,
# then gather all measurement columns.
mini_iris <- dplyr::slice(group_by(iris, Species), 1)
head(gather(mini_iris, key = flower_att, value = measurement, -Species), 3)
## # A tibble: 3 x 3
## # Groups: Species [3]
## Species flower_att measurement
## <fct> <chr> <dbl>
## 1 setosa Sepal.Length 5.10
## 2 versicolor Sepal.Length 7.00
## 3 virginica Sepal.Length 6.30
# summary() of the first warpbreaks column for every combination of the
# remaining columns (wool x tension in the output below).
by(data = warpbreaks[, 1], INDICES = warpbreaks[, -1], FUN = summary)
## wool: A
## tension: L
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 26.00 51.00 44.56 54.00 70.00
## --------------------------------------------------------
## wool: B
## tension: L
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 20.00 29.00 28.22 31.00 44.00
## --------------------------------------------------------
## wool: A
## tension: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 18 21 24 30 36
## --------------------------------------------------------
## wool: B
## tension: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 21.00 28.00 28.78 39.00 42.00
## --------------------------------------------------------
## wool: A
## tension: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 18.00 24.00 24.56 28.00 43.00
## --------------------------------------------------------
## wool: B
## tension: H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 15.00 17.00 18.78 21.00 28.00
# Tukey five-number summary of the first warpbreaks column for every
# combination of the remaining columns (wool x tension in the output below).
by(data = warpbreaks[, 1], INDICES = warpbreaks[, -1], FUN = fivenum)
## wool: A
## tension: L
## [1] 25 26 51 54 70
## --------------------------------------------------------
## wool: B
## tension: L
## [1] 14 20 29 31 44
## --------------------------------------------------------
## wool: A
## tension: M
## [1] 12 18 21 30 36
## --------------------------------------------------------
## wool: B
## tension: M
## [1] 16 21 28 39 42
## --------------------------------------------------------
## wool: A
## tension: H
## [1] 10 18 24 28 43
## --------------------------------------------------------
## wool: B
## tension: H
## [1] 13 15 17 21 28
# Iris_summary2 <- iris %>% # the names of the new data frame and the data frame to be summarised
# group_by(Species) %>% # the grouping variable
# summarise_at("Petal.Length",funs(fivenum(.)))
# Iris_summary2
# Base-R route instead: five-number summary of petal length per species.
with(iris, tapply(Petal.Length, Species, fivenum))
## $setosa
## [1] 1.0 1.4 1.5 1.6 1.9
##
## $versicolor
## [1] 3.00 4.00 4.35 4.60 5.10
##
## $virginica
## [1] 4.50 5.10 5.55 5.90 6.90
# summary() of breaks for each tension level.
with(warpbreaks, tapply(breaks, tension, summary))
## $L
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 26.00 29.50 36.39 49.25 70.00
##
## $M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 18.25 27.00 26.39 33.75 42.00
##
## $H
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.00 15.25 20.50 21.67 25.50 43.00
# Summarise the first two columns of warpbreaks (breaks and wool in the output
# below) separately for each level of the third column. The index is passed as
# a bare vector, so by() labels each group with the deparsed expression
# `warpbreaks[, 3]`, as the printed output shows.
by(warpbreaks[, 1:2], warpbreaks[, 3],summary)
## warpbreaks[, 3]: L
## breaks wool
## Min. :14.00 A:9
## 1st Qu.:26.00 B:9
## Median :29.50
## Mean :36.39
## 3rd Qu.:49.25
## Max. :70.00
## --------------------------------------------------------
## warpbreaks[, 3]: M
## breaks wool
## Min. :12.00 A:9
## 1st Qu.:18.25 B:9
## Median :27.00
## Mean :26.39
## 3rd Qu.:33.75
## Max. :42.00
## --------------------------------------------------------
## warpbreaks[, 3]: H
## breaks wool
## Min. :10.00 A:9
## 1st Qu.:15.25 B:9
## Median :20.50
## Mean :21.67
## 3rd Qu.:25.50
## Max. :43.00
# Five-number summary of the first column for each level of the third. The
# index expression stays spelled `warpbreaks[, 3]` because by() prints its
# deparsed text as the group label.
by(data = warpbreaks[, 1], INDICES = warpbreaks[, 3], FUN = fivenum)
## warpbreaks[, 3]: L
## [1] 14.0 26.0 29.5 51.0 70.0
## --------------------------------------------------------
## warpbreaks[, 3]: M
## [1] 12 18 27 35 42
## --------------------------------------------------------
## warpbreaks[, 3]: H
## [1] 10.0 15.0 20.5 26.0 43.0
# Keep the result and stack the per-group vectors into one matrix
# (one row per group, one column per summary statistic).
b <- by(data = warpbreaks[, 1], INDICES = warpbreaks[, 3], FUN = fivenum)
do.call("rbind", b)
## [,1] [,2] [,3] [,4] [,5]
## L 14 26 29.5 51 70
## M 12 18 27.0 35 42
## H 10 15 20.5 26 43
# Classes before and after stacking.
class(b)
## [1] "by"
class(do.call("rbind", b))
## [1] "matrix"
# Two direct conversions of the stacked matrix; the output below shows the
# first dropping the group labels and the second keeping them as row names.
as.matrix.data.frame(do.call("rbind", b))
## [,1] [,2] [,3] [,4] [,5]
## [1,] 14 26 29.5 51 70
## [2,] 12 18 27.0 35 42
## [3,] 10 15 20.5 26 43
as.data.frame.matrix(do.call("rbind", b))
# Experiment: flatten the per-group summary tables into one data frame by
# calling the as.data.frame.list() method directly. In the output below each
# group contributes Var1/Var2/Freq columns prefixed with its level (L, M, H).
as.data.frame.list(by(warpbreaks[, 1:2], warpbreaks[, 3],summary))
## L.Var1 L.Var2 L.Freq M.Var1 M.Var2 M.Freq
## 1 breaks Min. :14.00 breaks Min. :12.00
## 2 breaks 1st Qu.:26.00 breaks 1st Qu.:18.25
## 3 breaks Median :29.50 breaks Median :27.00
## 4 breaks Mean :36.39 breaks Mean :26.39
## 5 breaks 3rd Qu.:49.25 breaks 3rd Qu.:33.75
## 6 breaks Max. :70.00 breaks Max. :42.00
## 7 wool A:9 wool A:9
## 8 wool B:9 wool B:9
## 9 wool <NA> wool <NA>
## 10 wool <NA> wool <NA>
## 11 wool <NA> wool <NA>
## 12 wool <NA> wool <NA>
## H.Var1 H.Var2 H.Freq
## 1 breaks Min. :10.00
## 2 breaks 1st Qu.:15.25
## 3 breaks Median :20.50
## 4 breaks Mean :21.67
## 5 breaks 3rd Qu.:25.50
## 6 breaks Max. :43.00
## 7 wool A:9
## 8 wool B:9
## 9 wool <NA>
## 10 wool <NA>
## 11 wool <NA>
## 12 wool <NA>
# Same experiment with the as.data.frame.model.matrix() method: the output
# below has one row per group and a single column whose name is the deparsed
# by() call, so the exact expression text is part of the result.
as.data.frame.model.matrix(by(warpbreaks[, 1:2], warpbreaks[, 3],summary))
## by(warpbreaks[, 1:2], warpbreaks[, 3], summary)
## L Min. :14.00 , 1st Qu.:26.00 , Median :29.50 , Mean :36.39 , 3rd Qu.:49.25 , Max. :70.00 , A:9 , B:9 , NA, NA, NA, NA
## M Min. :12.00 , 1st Qu.:18.25 , Median :27.00 , Mean :26.39 , 3rd Qu.:33.75 , Max. :42.00 , A:9 , B:9 , NA, NA, NA, NA
## H Min. :10.00 , 1st Qu.:15.25 , Median :20.50 , Mean :21.67 , 3rd Qu.:25.50 , Max. :43.00 , A:9 , B:9 , NA, NA, NA, NA
# complete {tidyr} R Documentation
# Complete a data frame with missing combinations of data.
#
# Description
#
# Turns implicit missing values into explicit missing values. This is a wrapper around expand(), left_join() and replace_na that's useful for completing missing combinations of data.
#
# Usage
#
# complete(data, ..., fill = list())
library(dplyr, warn.conflicts = FALSE)
# Example data with an implicit gap: group 2 never appears together with
# item 1 ("a"). Built with tibble(); data_frame() is a deprecated alias for it.
df <- tibble(
  group = c(1:2, 1),
  item_id = c(1:2, 2),
  item_name = c("a", "b", "b"),
  value1 = 1:3,
  value2 = 4:6
)
# Add a row for every group x (item_id, item_name) pairing; nesting() limits
# the item pairings to those that actually occur in the data.
df %>% complete(group, nesting(item_id, item_name))
## # A tibble: 4 x 5
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <int> <int>
## 1 1.00 1.00 a 1 4
## 2 1.00 2.00 b 3 6
## 3 2.00 1.00 a NA NA
## 4 2.00 2.00 b 2 5
# `fill` supplies the value used in the newly created rows, per column.
complete(df, group, nesting(item_id, item_name), fill = list(value1 = 0))
## # A tibble: 4 x 5
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <dbl> <int>
## 1 1.00 1.00 a 1.00 4
## 2 1.00 2.00 b 3.00 6
## 3 2.00 1.00 a 0 NA
## 4 2.00 2.00 b 2.00 5
# Filling both value columns at once.
complete(
  df, group, nesting(item_id, item_name),
  fill = list(value1 = 0, value2 = 0)
)
## # A tibble: 4 x 5
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1.00 1.00 a 1.00 4.00
## 2 1.00 2.00 b 3.00 6.00
## 3 2.00 1.00 a 0 0
## 4 2.00 2.00 b 2.00 5.00
# Without nesting(), every combination of the three columns is generated.
complete(df, group, item_id, item_name)
## # A tibble: 8 x 5
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <int> <int>
## 1 1.00 1.00 a 1 4
## 2 1.00 1.00 b NA NA
## 3 1.00 2.00 a NA NA
## 4 1.00 2.00 b 3 6
## 5 2.00 1.00 a NA NA
## 6 2.00 1.00 b NA NA
## 7 2.00 2.00 a NA NA
## 8 2.00 2.00 b 2 5
# replace_na {tidyr} R Documentation
# Replace missing values
#
# Description
#
# Replace missing values
#
# Usage
#
# replace_na(data, replace = list(), ...)
# Arguments
#
# data
# A data frame.
# replace
# A named list given the value to replace NA with for each column.
# ...
# Additional arguments for methods. Currently unused.
# Examples
library(dplyr)
# Two-column example with one missing value per column. Built with tibble();
# data_frame() is a deprecated alias for it.
df <- tibble(x = c(1, 2, NA), y = c("a", NA, "b"))
df
## # A tibble: 3 x 2
## x y
## <dbl> <chr>
## 1 1.00 a
## 2 2.00 <NA>
## 3 NA b
# Replace NA column by column: 0 for x, "unknown" for y.
df %>% replace_na(list(x = 0, y = "unknown"))
## # A tibble: 3 x 2
## x y
## <dbl> <chr>
## 1 1.00 a
## 2 2.00 unknown
## 3 0 b
# expand {tidyr} R Documentation
# Expand data frame to include all combinations of values
#
# Description
#
# expand() is often useful in conjunction with left_join if you want to convert implicit missing values to explicit missing values. Or you can use it in conjunction with anti_join() to figure out which combinations are missing.
#
# Usage
#
# expand(data, ...)
#
# crossing(...)
#
# crossing_(x)
#
# nesting(...)
#
# nesting_(x)
library(dplyr)
# Every vs/cyl combination, including ones that never occur in mtcars
mtcars %>% expand(vs, cyl)
## # A tibble: 6 x 2
## vs cyl
## <dbl> <dbl>
## 1 0 4.00
## 2 0 6.00
## 3 0 8.00
## 4 1.00 4.00
## 5 1.00 6.00
## 6 1.00 8.00
# nesting() restricts the result to vs/cyl pairs that are present in the data
mtcars %>% expand(nesting(vs, cyl))
## # A tibble: 5 x 2
## vs cyl
## <dbl> <dbl>
## 1 0 4.00
## 2 0 6.00
## 3 0 8.00
## 4 1.00 4.00
## 5 1.00 6.00
# Implicit missings ---------------------------------------------------------
# Quarterly returns with gaps: 2011 is absent and 2012 has no fourth quarter.
# Built with tibble(); data_frame() is a deprecated alias for it. The returns
# are unseeded random draws, so they differ between runs.
df <- tibble(
  year = c(2010, 2010, 2010, 2010, 2012, 2012, 2012),
  qtr = c(1, 2, 3, 4, 1, 2, 3),
  return = rnorm(7)
)
# All combinations of the years and quarters that occur in the data.
df %>% expand(year, qtr)
## # A tibble: 8 x 2
## year qtr
## <dbl> <dbl>
## 1 2010 1.00
## 2 2010 2.00
## 3 2010 3.00
## 4 2010 4.00
## 5 2012 1.00
## 6 2012 2.00
## 7 2012 3.00
## 8 2012 4.00
# Supply the full year range explicitly so 2011 is included as well.
expand(df, year = 2010:2012, qtr)
## # A tibble: 12 x 2
## year qtr
## <int> <dbl>
## 1 2010 1.00
## 2 2010 2.00
## 3 2010 3.00
## 4 2010 4.00
## 5 2011 1.00
## 6 2011 2.00
## 7 2011 3.00
## 8 2011 4.00
## 9 2012 1.00
## 10 2012 2.00
## 11 2012 3.00
## 12 2012 4.00
# Base-R comparison: expand.grid() crosses the vectors exactly as given, so
# the repeated quarters stay in and the unnamed argument is labelled Var2.
qtr <- c(1, 2, 3, 4, 1, 2, 3)
expand.grid(year = 2010:2012, qtr)
## year Var2
## 1 2010 1
## 2 2011 1
## 3 2012 1
## 4 2010 2
## 5 2011 2
## 6 2012 2
## 7 2010 3
## 8 2011 3
## 9 2012 3
## 10 2010 4
## 11 2011 4
## 12 2012 4
## 13 2010 1
## 14 2011 1
## 15 2012 1
## 16 2010 2
## 17 2011 2
## 18 2012 2
## 19 2010 3
## 20 2011 3
## 21 2012 3
# full_seq() fills in the year range from the data, in steps of 1.
expand(df, year = full_seq(year, 1), qtr)
## # A tibble: 12 x 2
## year qtr
## <dbl> <dbl>
## 1 2010 1.00
## 2 2010 2.00
## 3 2010 3.00
## 4 2010 4.00
## 5 2011 1.00
## 6 2011 2.00
## 7 2011 3.00
## 8 2011 4.00
## 9 2012 1.00
## 10 2012 2.00
## 11 2012 3.00
## 12 2012 4.00
# complete() builds the same grid but keeps the data columns, with NA in
# `return` for the rows that were not observed.
complete(df, year = full_seq(year, 1), qtr)
## # A tibble: 12 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2010 1.00 0.846
## 2 2010 2.00 - 0.197
## 3 2010 3.00 0.283
## 4 2010 4.00 0.173
## 5 2011 1.00 NA
## 6 2011 2.00 NA
## 7 2011 3.00 NA
## 8 2011 4.00 NA
## 9 2012 1.00 0.218
## 10 2012 2.00 - 0.506
## 11 2012 3.00 0.262
## 12 2012 4.00 NA
# Nesting -------------------------------------------------------------------
# Each person was given one of two treatments, repeated three times.
# Some of the replications haven't happened yet, so the data are incomplete.
# tibble() replaces the deprecated data_frame() alias.
experiment <- tibble(
  name = rep(c("Alex", "Robert", "Sam"), c(3, 2, 1)),
  trt = rep(c("a", "b", "a"), c(3, 2, 1)),
  rep = c(1, 2, 3, 1, 2, 1),
  measurment_1 = runif(6),
  measurment_2 = runif(6)
)
# expand() gives the complete design. Each person only gets one treatment,
# so name and trt are nested together before crossing with rep.
all <- experiment %>% expand(nesting(name, trt), rep)
all
## # A tibble: 9 x 3
## name trt rep
## <chr> <chr> <dbl>
## 1 Alex a 1.00
## 2 Alex a 2.00
## 3 Alex a 3.00
## 4 Robert b 1.00
## 5 Robert b 2.00
## 6 Robert b 3.00
## 7 Sam a 1.00
## 8 Sam a 2.00
## 9 Sam a 3.00
# anti_join() keeps the rows of the full design that have no match in the
# observed data, i.e. the missing observations. The keys are spelled out so
# the join does not depend on whichever column names happen to be shared.
all %>% anti_join(experiment, by = c("name", "trt", "rep"))
## # A tibble: 3 x 3
## name trt rep
## <chr> <chr> <dbl>
## 1 Robert b 3.00
## 2 Sam a 2.00
## 3 Sam a 3.00
# right_join() against the full design adds the missing rows to the original
# data, with NA measurements. Keys are explicit rather than inferred.
experiment %>% right_join(all, by = c("name", "trt", "rep"))
## # A tibble: 9 x 5
## name trt rep measurment_1 measurment_2
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alex a 1.00 0.0669 0.0381
## 2 Alex a 2.00 0.134 0.724
## 3 Alex a 3.00 0.353 0.827
## 4 Robert b 1.00 0.00973 0.391
## 5 Robert b 2.00 0.874 0.0731
## 6 Robert b 3.00 NA NA
## 7 Sam a 1.00 0.675 0.178
## 8 Sam a 2.00 NA NA
## 9 Sam a 3.00 NA NA
# complete() is the short-hand for the expand-then-join steps.
complete(experiment, nesting(name, trt), rep)
## # A tibble: 9 x 5
## name trt rep measurment_1 measurment_2
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alex a 1.00 0.0669 0.0381
## 2 Alex a 2.00 0.134 0.724
## 3 Alex a 3.00 0.353 0.827
## 4 Robert b 1.00 0.00973 0.391
## 5 Robert b 2.00 0.874 0.0731
## 6 Robert b 3.00 NA NA
## 7 Sam a 1.00 0.675 0.178
## 8 Sam a 2.00 NA NA
## 9 Sam a 3.00 NA NA
# Scoped variants of summarise() and mutate() apply one transformation to
# several variables at once; here, the mean of every column per species.
summarise_all(group_by(iris, Species), mean)
## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 1.46 0.246
## 2 versicolor 5.94 2.77 4.26 1.33
## 3 virginica 6.59 2.97 5.55 2.03
# Three scoped variants exist:
# * _all acts on every variable
# * _at acts on variables picked with a character vector or vars()
# * _if acts on variables picked with a predicate function
summarise_at(starwars, vars(height:mass), mean, na.rm = TRUE)
## # A tibble: 1 x 2
## height mass
## <dbl> <dbl>
## 1 174 97.3
summarise_at(starwars, c("height", "mass"), mean, na.rm = TRUE)
## # A tibble: 1 x 2
## height mass
## <dbl> <dbl>
## 1 174 97.3
summarise_if(starwars, is.numeric, mean, na.rm = TRUE)
## # A tibble: 1 x 3
## height mass birth_year
## <dbl> <dbl> <dbl>
## 1 174 97.3 87.6
# mutate_if() is handy for type conversion: turn every factor column into
# character.
mutate_if(as_tibble(iris), is.factor, as.character)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## 4 4.60 3.10 1.50 0.200 setosa
## 5 5.00 3.60 1.40 0.200 setosa
## 6 5.40 3.90 1.70 0.400 setosa
## 7 4.60 3.40 1.40 0.300 setosa
## 8 5.00 3.40 1.50 0.200 setosa
## 9 4.40 2.90 1.40 0.200 setosa
## 10 4.90 3.10 1.50 0.100 setosa
## # ... with 140 more rows
# Same idea for doubles: coerce every double column to integer.
mutate_if(as_tibble(iris), is.double, as.integer)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <int> <int> <int> <int> <fct>
## 1 5 3 1 0 setosa
## 2 4 3 1 0 setosa
## 3 4 3 1 0 setosa
## 4 4 3 1 0 setosa
## 5 5 3 1 0 setosa
## 6 5 3 1 0 setosa
## 7 4 3 1 0 setosa
## 8 5 3 1 0 setosa
## 9 4 2 1 0 setosa
## 10 4 3 1 0 setosa
## # ... with 140 more rows
# ---------------------------------------------------------------------------
# If you want to apply multiple transformations, pass a named list of
# functions (funs() is deprecated since dplyr 0.8.0). The list names become
# the column suffixes, e.g. Sepal.Length_min.
by_species <- iris %>% group_by(Species)
by_species %>% summarise_all(list(min = min, max = max))
## # A tibble: 3 x 9
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 4.30 2.30 1.00
## 2 versicolor 4.90 2.00 3.00
## 3 virginica 4.90 2.20 4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## # Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## # Petal.Width_max <dbl>
# Divide every non-grouping column by 2.54 in place. A lambda formula is used
# instead of the deprecated funs().
by_species %>% mutate_all(~ . / 2.54)
## # A tibble: 150 x 5
## # Groups: Species [3]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 2.01 1.38 0.551 0.0787 setosa
## 2 1.93 1.18 0.551 0.0787 setosa
## 3 1.85 1.26 0.512 0.0787 setosa
## 4 1.81 1.22 0.591 0.0787 setosa
## 5 1.97 1.42 0.551 0.0787 setosa
## 6 2.13 1.54 0.669 0.157 setosa
## 7 1.81 1.34 0.551 0.118 setosa
## 8 1.97 1.34 0.591 0.0787 setosa
## 9 1.73 1.14 0.551 0.0787 setosa
## 10 1.93 1.22 0.591 0.0394 setosa
## # ... with 140 more rows
# Function names are appended to the column names when .funs is named or has
# several entries, so this adds *_cm columns next to the originals.
# A named list of lambdas replaces the deprecated funs().
by_species %>% mutate_all(list(cm = ~ . / 2.54))
## # A tibble: 150 x 9
## # Groups: Species [3]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.10 3.50 1.40 0.200 setosa
## 2 4.90 3.00 1.40 0.200 setosa
## 3 4.70 3.20 1.30 0.200 setosa
## 4 4.60 3.10 1.50 0.200 setosa
## 5 5.00 3.60 1.40 0.200 setosa
## 6 5.40 3.90 1.70 0.400 setosa
## 7 4.60 3.40 1.40 0.300 setosa
## 8 5.00 3.40 1.50 0.200 setosa
## 9 4.40 2.90 1.40 0.200 setosa
## 10 4.90 3.10 1.50 0.100 setosa
## # ... with 140 more rows, and 4 more variables: Sepal.Length_cm <dbl>,
## # Sepal.Width_cm <dbl>, Petal.Length_cm <dbl>, Petal.Width_cm <dbl>
# Per-species median of every column, suffixed with _med. A named list
# replaces the deprecated funs().
by_species %>% summarise_all(list(med = median))
## # A tibble: 3 x 5
## Species Sepal.Length_med Sepal.Width_med Petal.Length_med
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 5.00 3.40 1.50
## 2 versicolor 5.90 2.80 4.35
## 3 virginica 6.50 3.00 5.55
## # ... with 1 more variable: Petal.Width_med <dbl>
# Per-species upper quartile of every column, suffixed with _Q3; `probs` is
# forwarded to quantile(). A named list replaces the deprecated funs().
by_species %>% summarise_all(list(Q3 = quantile), probs = 0.75)
## # A tibble: 3 x 5
## Species Sepal.Length_Q3 Sepal.Width_Q3 Petal.Length_Q3 Petal.Width_Q3
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.20 3.68 1.58 0.300
## 2 versicolor 6.30 3.00 4.60 1.50
## 3 virginica 6.90 3.18 5.88 2.30
# Function names given as strings work as well.
summarise_all(by_species, c("min", "max"))
## # A tibble: 3 x 9
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 4.30 2.30 1.00
## 2 versicolor 4.90 2.00 3.00
## 3 virginica 4.90 2.20 4.50
## # ... with 5 more variables: Petal.Width_min <dbl>,
## # Sepal.Length_max <dbl>, Sepal.Width_max <dbl>, Petal.Length_max <dbl>,
## # Petal.Width_max <dbl>
library(dplyr)
library(tidyverse)
# NOTE: a leftover browser() debugging breakpoint was removed from this spot;
# it dropped the script into the interactive debugger.
# Toy data: which foods each person ate, one row per (person, food) pair.
person <- rep(c("Grace", "Rob"), each = 3)
foods <- c("apple", "banana", "cucumber", "spaghetti", "cucumber", "banana")
eaten <- data.frame(person = person, foods = foods, stringsAsFactors = FALSE)
#eaten %>% group_by(person) %>% do(function(x) combn(x$foods, m = 2))
# Group mtcars by cylinder count, then take the first two rows of each group.
by_cyl <- mtcars %>% group_by(cyl)
#by_cyl
#mtcars
by_cyl %>% do(head(., 2))
## # A tibble: 6 x 11
## # Groups: cyl [3]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 22.8 4.00 108 93.0 3.85 2.32 18.6 1.00 1.00 4.00 1.00
## 2 24.4 4.00 147 62.0 3.69 3.19 20.0 1.00 0 4.00 2.00
## 3 21.0 6.00 160 110 3.90 2.62 16.5 0 1.00 4.00 4.00
## 4 21.0 6.00 160 110 3.90 2.88 17.0 0 1.00 4.00 4.00
## 5 18.7 8.00 360 175 3.15 3.44 17.0 0 0 3.00 2.00
## 6 14.3 8.00 360 245 3.21 3.57 15.8 0 0 3.00 4.00
# By contrast, head() on the grouped tibble returns the first two rows overall.
head(group_by(mtcars, cyl), 2)
## # A tibble: 2 x 11
## # Groups: cyl [1]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21.0 6.00 160 110 3.90 2.62 16.5 0 1.00 4.00 4.00
## 2 21.0 6.00 160 110 3.90 2.88 17.0 0 1.00 4.00 4.00
# Fit one linear model of mpg on disp per cylinder group; each fit is stored
# in the list-column `mod`.
models <- do(by_cyl, mod = lm(mpg ~ disp, data = .))
models
## Source: local data frame [3 x 2]
## Groups: <by row>
##
## # A tibble: 3 x 2
## cyl mod
## * <dbl> <list>
## 1 4.00 <S3: lm>
## 2 6.00 <S3: lm>
## 3 8.00 <S3: lm>
# R-squared of each per-group fit.
models %>% summarise(rsq = summary(mod)$r.squared)
## # A tibble: 3 x 1
## rsq
## <dbl>
## 1 0.648
## 2 0.0106
## 3 0.270
# Coefficients of each fit, stacked into a single column.
do(models, data.frame(coef = coef(.$mod)))
## Source: local data frame [6 x 1]
## Groups: <by row>
##
## # A tibble: 6 x 1
## coef
## * <dbl>
## 1 40.9
## 2 - 0.135
## 3 19.1
## 4 0.00361
## 5 22.0
## 6 - 0.0196
# Full coefficient table of each fit, with the term names in `var`.
do(
  models,
  data.frame(
    var = names(coef(.$mod)),
    coef(summary(.$mod))
  )
)
## Source: local data frame [6 x 5]
## Groups: <by row>
##
## # A tibble: 6 x 5
## var Estimate Std..Error t.value Pr...t..
## * <fct> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 40.9 3.59 11.4 0.00000120
## 2 disp - 0.135 0.0332 - 4.07 0.00278
## 3 (Intercept) 19.1 2.91 6.55 0.00124
## 4 disp 0.00361 0.0156 0.232 0.826
## 5 (Intercept) 22.0 3.35 6.59 0.0000259
## 6 disp - 0.0196 0.00932 - 2.11 0.0568
# One linear model of mpg on disp per cylinder group, kept in the
# list-column `mod`.
models <- do(by_cyl, mod = lm(mpg ~ disp, data = .))
models
## Source: local data frame [3 x 2]
## Groups: <by row>
##
## # A tibble: 3 x 2
## cyl mod
## * <dbl> <list>
## 1 4.00 <S3: lm>
## 2 6.00 <S3: lm>
## 3 8.00 <S3: lm>
# Goodness of fit (R-squared) for each group's model.
models %>% summarise(rsq = summary(mod)$r.squared)
## # A tibble: 3 x 1
## rsq
## <dbl>
## 1 0.648
## 2 0.0106
## 3 0.270
# Intercept and slope of each model, stacked into one column.
do(models, data.frame(coef = coef(.$mod)))
## Source: local data frame [6 x 1]
## Groups: <by row>
##
## # A tibble: 6 x 1
## coef
## * <dbl>
## 1 40.9
## 2 - 0.135
## 3 19.1
## 4 0.00361
## 5 22.0
## 6 - 0.0196
# Coefficient summary table per model, labelled by term name in `var`.
do(
  models,
  data.frame(
    var = names(coef(.$mod)),
    coef(summary(.$mod))
  )
)
## Source: local data frame [6 x 5]
## Groups: <by row>
##
## # A tibble: 6 x 5
## var Estimate Std..Error t.value Pr...t..
## * <fct> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 40.9 3.59 11.4 0.00000120
## 2 disp - 0.135 0.0332 - 4.07 0.00278
## 3 (Intercept) 19.1 2.91 6.55 0.00124
## 4 disp 0.00361 0.0156 0.232 0.826
## 5 (Intercept) 22.0 3.35 6.59 0.0000259
## 6 disp - 0.0196 0.00932 - 2.11 0.0568
# Two competing fits per cylinder group: linear and quadratic in disp.
models <- do(
  by_cyl,
  mod_linear = lm(mpg ~ disp, data = .),
  mod_quad = lm(mpg ~ poly(disp, 2), data = .)
)
models
## Source: local data frame [3 x 3]
## Groups: <by row>
##
## # A tibble: 3 x 3
## cyl mod_linear mod_quad
## * <dbl> <list> <list>
## 1 4.00 <S3: lm> <S3: lm>
## 2 6.00 <S3: lm> <S3: lm>
## 3 8.00 <S3: lm> <S3: lm>
# Compare the two fits of each group with anova(); results go in list-column `aov`.
compare <- do(models, aov = anova(.$mod_linear, .$mod_quad))
# compare %>% summarise(p.value = aov$`Pr(>F)`)
# Optional demo on the nycflights13 data; skipped when the package is missing.
# `if (require(...))` is kept on purpose: it attaches the package on success
# and returns FALSE instead of erroring when it is not installed.
if (require("nycflights13")) {
  # do() can run any arbitrary computation per group, such as fitting a
  # linear model. Here: arrival delay as a function of departure time,
  # fitted separately for each carrier.
  carriers <- group_by(flights, carrier)

  # Inside a braced block only the last value is auto-printed, so the
  # intermediate results are printed explicitly.
  print(group_size(carriers))

  mods <- do(carriers, mod = lm(arr_delay ~ dep_time, data = .))
  print(mods %>% do(as.data.frame(coef(.$mod))))
  print(mods %>% summarise(rsq = summary(mod)$r.squared))

  # Longer example (flagged "NOT RUN" in the help page it was copied from);
  # it shows the progress bar in action. One GAM is fitted per destination
  # that has more than 100 flights.
  by_dest <- flights %>% group_by(dest) %>% filter(n() > 100)
  library(mgcv)
  by_dest %>% do(smooth = gam(arr_delay ~ s(dep_time) + month, data = .))
}
## debug at <text>#24: carriers <- group_by(flights, carrier)
## debug at <text>#25: group_size(carriers)
## debug at <text>#27: mods <- do(carriers, mod = lm(arr_delay ~ dep_time, data = .))
## debug at <text>#28: mods %>% do(as.data.frame(coef(.$mod)))
## debug at <text>#29: mods %>% summarise(rsq = summary(mod)$r.squared)
## debug at <text>#37: by_dest <- flights %>% group_by(dest) %>% filter(n() > 100)
## debug at <text>#38: library(mgcv)
## debug at <text>#39: by_dest %>% do(smooth = gam(arr_delay ~ s(dep_time) + month,
## data = .))
## Source: local data frame [93 x 2]
## Groups: <by row>
##
## # A tibble: 93 x 2
## dest smooth
## * <chr> <list>
## 1 ABQ <S3: gam>
## 2 ACK <S3: gam>
## 3 ALB <S3: gam>
## 4 ATL <S3: gam>
## 5 AUS <S3: gam>
## 6 AVL <S3: gam>
## 7 BDL <S3: gam>
## 8 BGR <S3: gam>
## 9 BHM <S3: gam>
## 10 BNA <S3: gam>
## # ... with 83 more rows
# Release the worker processes of the cluster `cl` started at the top of the file.
parallel::stopCluster(cl)