Pada bagian ini saya belajar tentang Iteratons with purrr.
library(tidyverse)
library(caret)
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
Misal kita ingin mencari median dari setiap kolom dari dataset diatas.
median(df$a)
## [1] 0.5761079
median(df$b)
## [1] -0.1103244
median(df$c)
## [1] 0.3987047
median(df$d)
## [1] 0.1792801
secara praktis kita bisa menggunakan copy paste seperti diatas, tapi sebaiknya jangan menggunakan copy paste bila lebih dari dua kali. Lebih baik gunakan sebagai berikut:
output <- vector("double", ncol(df)) # 1. output
for (i in seq_along(df)) { # 2. sequence
output[[i]] <- median(df[[i]]) # 3. body
}
output
## [1] 0.5761079 -0.1103244 0.3987047 0.1792801
Modifying an Existing Object
rescale1<-function(x){
range1<-range(x, na.rm=T)
(x-range1[1])/(range1[2]-range1[1])
}
df$a <- rescale1(df$a)
df$b <- rescale1(df$b)
df$c <- rescale1(df$c)
df$d <- rescale1(df$d)
df
## # A tibble: 10 x 4
## a b c d
## <dbl> <dbl> <dbl> <dbl>
## 1 0.648 0.760 0.617 0.127
## 2 0.314 0.331 0.618 0.431
## 3 0 0.807 0.182 1
## 4 0.754 0.256 0.908 0.474
## 5 0.899 0.892 1 0.303
## 6 0.733 0.848 0.467 0.446
## 7 1 1 0.628 0
## 8 0.327 0 0.483 0.335
## 9 0.800 0.730 0 0.607
## 10 0.378 0.497 0.382 0.321
Karena output masih sama dengan input (banyak baris dan kolom sama)
for (i in seq_along(df)) {
df[[i]]<-rescale1(df[[i]])
}
df
## # A tibble: 10 x 4
## a b c d
## <dbl> <dbl> <dbl> <dbl>
## 1 0.648 0.760 0.617 0.127
## 2 0.314 0.331 0.618 0.431
## 3 0 0.807 0.182 1
## 4 0.754 0.256 0.908 0.474
## 5 0.899 0.892 1 0.303
## 6 0.733 0.848 0.467 0.446
## 7 1 1 0.628 0
## 8 0.327 0 0.483 0.335
## 9 0.800 0.730 0 0.607
## 10 0.378 0.497 0.382 0.321
Unknown Output Length
Terkadang kita tidak mengetahui panjang dari outputnya. Misal kita ingin membuat suatu barisan distribusi normal dengan rata-rata pada variabel means dan dengan banyak datanya random sampling dari 100.
set.seed(100)
means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
n <- sample(100, 1)
output <- c(output, rnorm(n, means[[i]]))
}
str(output)
## num [1:210] -0.6505 -1.5859 -0.0407 -0.331 -0.9531 ...
Kode diatas adalah tidak efisien (kurang paham juga, kenapa tidak efisien). Solusi terbaiknya adalah dengan menyimpan hasil pada list dan kemudian gabungkan menjadi satu vektor.
set.seed(100)
out <- vector("list", length(means))
for (i in seq_along(means)) {
n <- sample(100, 1)
out[[i]] <- rnorm(n, means[[i]])
}
str(out)
## List of 3
## $ : num [1:74] -0.6505 -1.5859 -0.0407 -0.331 -0.9531 ...
## $ : num [1:78] 2.4494 0.0426 1.8998 1.7998 1.5189 ...
## $ : num [1:58] 2.25 1.66 1.89 1.9 2.26 ...
str(unlist(out))
## num [1:210] -0.6505 -1.5859 -0.0407 -0.331 -0.9531 ...
Unknown Sequence Length
Terkadang kita tidak mengetahui panjang dari input. Hal ini dapat teratasi dengan while loops.
contoh:
Hasil keduanya adalah equivalen
x<-runif(100)
# For loops --------------------------------------
for (i in seq_along(x)) {
# body
}
# while loops ------------------------------------
i <- 1
while (i <= length(x)) {
# body
i<-i+1
}
ini contoh penggunaan while dibukunya, bingung juga hehehhehe.
flip <- function() {sample(c("T", "H"), 1)}
flips <- 0
nheads <- 0
while (nheads < 3) {
if (flip() == "H") {
nheads <- nheads + 1
} else {
nheads <- 0
}
flips <- flips + 1
}
flips
## [1] 15
Sebaiknya lakukan loops pada function dengan kata lain loopsnya ada dalam function, jangan langsung menggunakan loops pada code, karena R adalah bahasa pemrograman fungsional.
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[[i]] <- mean(df[[i]])
}
output
## [1] 0.849816019 0.007359621 0.622279241 -0.136471099
Seperti dibawah ini.
col_mean <- function(df) {
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[i] <- mean(df[[i]])
}
output
}
output
## [1] 0.849816019 0.007359621 0.622279241 -0.136471099
Jika ingin menghitung median dan standar deviasi tinggal copy paste code diatas.
col_median <- function(df) {
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[i] <- median(df[[i]])
}
output
}
output
## [1] 0.849816019 0.007359621 0.622279241 -0.136471099
col_sd <- function(df) {
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[i] <- sd(df[[i]])
}
output
}
output
## [1] 0.849816019 0.007359621 0.622279241 -0.136471099
Tapi karena kita menggunakan copy paste lebih dari sekali, lebih baik gunakan function. Jadi nati function dalam function, seperti dibawah:
col_summary <- function(df, fun) {
out <- vector("double", length(df))
for (i in seq_along(df)) {
out[i] <- fun(df[[i]])
}
out
}
col_summary(df, median)
## [1] 1.03281408 -0.09431933 0.56830976 -0.31375560
col_summary(df, sd)
## [1] 0.7716724 0.8653501 0.9127150 0.8555120
col_summary(df, mean)
## [1] 0.849816019 0.007359621 0.622279241 -0.136471099
map_dbl(df, mean)
## a b c d
## 0.849816019 0.007359621 0.622279241 -0.136471099
map_dbl(df, median)
## a b c d
## 1.03281408 -0.09431933 0.56830976 -0.31375560
df%>%
map_dbl(mean, trim=0.5)
## a b c d
## 1.03281408 -0.09431933 0.56830976 -0.31375560
df%>%
map_dbl(median)
## a b c d
## 1.03281408 -0.09431933 0.56830976 -0.31375560
From this
models <- mtcars %>%
split(.$cyl) %>%
map(function(df) lm(mpg ~ wt, data = df))
To this
models <- mtcars %>%
split(.$cyl) %>%
map(~lm(mpg ~ wt, data = .))
models %>%
map(summary) %>%
map_dbl(~.$r.squared)
## 4 6 8
## 0.5086326 0.4645102 0.4229655
models %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
x1 <- list(
c(0.27, 0.37, 0.57, 0.91, 0.20),
c(0.90, 0.94, 0.66, 0.63, 0.06),
c(0.21, 0.18, 0.69, 0.38, 0.77)
)
x2 <- list(
c(0.50, 0.72, 0.99, 0.38, 0.78),
c(0.93, 0.21, 0.65, 0.13, 0.27),
c(0.39, 0.01, 0.38, 0.87, 0.34)
)
threshold <- function(x, cutoff = 0.8) x[x > cutoff]
x1 %>% sapply(threshold) %>% str()
## List of 3
## $ : num 0.91
## $ : num [1:2] 0.9 0.94
## $ : num(0)
x2 %>% sapply(threshold) %>% str()
## num [1:3] 0.99 0.93 0.87
safe_log <- safely(log)
str(safe_log(10))
## List of 2
## $ result: num 2.3
## $ error : NULL
#> List of 2
#> $ result: num 2.3
#> $ error : NULL
str(safe_log("a"))
## List of 2
## $ result: NULL
## $ error :List of 2
## ..$ message: chr "non-numeric argument to mathematical function"
## ..$ call : language .Primitive("log")(x, base)
## ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
x <- list(1, 10, "a")
y <- x %>% map(safely(log))
str(y)
## List of 3
## $ :List of 2
## ..$ result: num 0
## ..$ error : NULL
## $ :List of 2
## ..$ result: num 2.3
## ..$ error : NULL
## $ :List of 2
## ..$ result: NULL
## ..$ error :List of 2
## .. ..$ message: chr "non-numeric argument to mathematical function"
## .. ..$ call : language .Primitive("log")(x, base)
## .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
y <- y %>% transpose()
str(y)
## List of 2
## $ result:List of 3
## ..$ : num 0
## ..$ : num 2.3
## ..$ : NULL
## $ error :List of 3
## ..$ : NULL
## ..$ : NULL
## ..$ :List of 2
## .. ..$ message: chr "non-numeric argument to mathematical function"
## .. ..$ call : language .Primitive("log")(x, base)
## .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
is_ok <- y$error %>% map_lgl(is_null)
x[!is_ok]
## [[1]]
## [1] "a"
#> [[1]]
#> [1] "a"
y$result[is_ok] %>% flatten_dbl()
## [1] 0.000000 2.302585
#> [1] 0.0 2.3
x <- list(1, 10, "a")
x %>% map_dbl(possibly(log, NA_real_))
## [1] 0.000000 2.302585 NA
x <- list(1, -1)
x %>% map(quietly(log)) %>% str()
## List of 2
## $ :List of 4
## ..$ result : num 0
## ..$ output : chr ""
## ..$ warnings: chr(0)
## ..$ messages: chr(0)
## $ :List of 4
## ..$ result : num NaN
## ..$ output : chr ""
## ..$ warnings: chr "NaNs produced"
## ..$ messages: chr(0)
mu <- list(5, 10, -3)
mu %>%
map(rnorm, n = 5) %>%
str()
## List of 3
## $ : num [1:5] 4.47 5.75 4.69 6.27 5.98
## $ : num [1:5] 8.91 10.45 9.11 8.83 11.5
## $ : num [1:5] -3.12 -1.78 -3.85 -4.75 -2.24
sigma <- list(1, 5, 10)
seq_along(mu) %>%
map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
str()
## List of 3
## $ : num [1:5] 4.54 4.99 5.64 4.85 5.69
## $ : num [1:5] 8.29 5.88 16.92 8.29 8.87
## $ : num [1:5] 10.417 -13.239 -3.859 -14.839 0.877
map2(mu, sigma, rnorm, n = 5) %>% str()
## List of 3
## $ : num [1:5] 3.87 4.36 7.45 5.6 5.93
## $ : num [1:5] 12.27 22.18 4.72 14.81 8.05
## $ : num [1:5] 6.9 2.09 -21.89 -5.3 1.21
map2 <- function(x, y, f, ...) {
out <- vector("list", length(x))
for (i in seq_along(x)) {
out[[i]] <- f(x[[i]], y[[i]], ...)
}
out
}
n <- list(1, 3, 5)
args1 <- list(n, mu, sigma)
args1 %>%
pmap(rnorm) %>%
str()
## List of 3
## $ : num 3.64
## $ : num [1:3] 17.52 12.43 -1.83
## $ : num [1:5] -3.48 -1.18 -6.06 -15.96 -16.77
args2 <- list(mean = mu, sd = sigma, n = n)
args2 %>%
pmap(rnorm) %>%
str()
## List of 3
## $ : num 7.11
## $ : num [1:3] 12.41 7.03 12.12
## $ : num [1:5] -0.569 9.663 16.062 11.642 2
params <- tribble(
~mean, ~sd, ~n,
5, 1, 1,
10, 5, 3,
-3, 10, 5
)
params %>%
pmap(rnorm)
## [[1]]
## [1] 5.259037
##
## [[2]]
## [1] 3.534351 12.720815 11.674402
##
## [[3]]
## [1] -4.183738 -26.497384 -8.851210 4.602829 9.856654
f <- c("runif", "rnorm", "rpois")
param <- list(
list(min = -1, max = 1),
list(sd = 5),
list(lambda = 10)
)
invoke_map(f, param, n = 5) %>% str()
## List of 3
## $ : num [1:5] -0.421 -0.511 0.178 -0.238 0.411
## $ : num [1:5] -5.98 5.1 -2.39 -3.3 3.39
## $ : int [1:5] 9 8 14 13 6
sim <- tribble(
~f, ~params,
"runif", list(min = -1, max = 1),
"rnorm", list(sd = 5),
"rpois", list(lambda = 10)
)
sim %>%
mutate(sim = invoke_map(f, params, n = 10))
## # A tibble: 3 x 3
## f params sim
## <chr> <list> <list>
## 1 runif <named list [2]> <dbl [10]>
## 2 rnorm <named list [1]> <dbl [10]>
## 3 rpois <named list [1]> <int [10]>
x <- list(1, "a", 3)
x %>%
walk(print)
## [1] 1
## [1] "a"
## [1] 3
library(ggplot2)
plots <- mtcars %>%
split(.$cyl) %>%
map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- stringr::str_c(names(plots), ".pdf")
pwalk(list(paths, plots), ggsave, path = tempdir())
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
iris %>%
keep(is.factor) %>%
str()
## 'data.frame': 150 obs. of 1 variable:
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
#> 'data.frame': 150 obs. of 1 variable:
#> $ Species: Factor w/ 3 levels "setosa","versicolor",..: ...
iris %>%
discard(is.factor) %>%
str()
## 'data.frame': 150 obs. of 4 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#> 'data.frame': 150 obs. of 4 variables:
x <- list(1:5, letters, list(10))
x %>%
some(is_character)
## [1] TRUE
#> [1] TRUE
x %>%
every(is_vector)
## [1] TRUE
#> [1] TRUEx <- list(1:5, letters, list(10))
x %>%
some(is_character)
## [1] TRUE
#> [1] TRUE
x %>%
every(is_vector)
## [1] TRUE
#> [1] TRUE
x <- sample(10)
x
## [1] 8 3 7 2 1 6 9 10 4 5
#> [1] 8 7 5 6 9 2 10 1 3 4
x %>%
detect(~ . > 5)
## [1] 8
#> [1] 8
x %>%
detect_index(~ . > 5)
## [1] 1
#> [1] 1
x %>%
head_while(~ . > 5)
## [1] 8
#> [1] 8 7
x %>%
tail_while(~ . > 5)
## integer(0)
dfs <- list(
age = tibble(name = "John", age = 30),
sex = tibble(name = c("John", "Mary"), sex = c("M", "F")),
trt = tibble(name = "Mary", treatment = "A")
)
dfs %>% reduce(full_join)
## Joining, by = "name"
## Joining, by = "name"
## # A tibble: 2 x 4
## name age sex treatment
## <chr> <dbl> <chr> <chr>
## 1 John 30 M <NA>
## 2 Mary NA F A
#> Joining, by = "name"
#> Joining, by = "name"
#> # A tibble: 2 × 4
#> name age sex treatment
#> <chr> <dbl> <chr> <chr>
#> 1 John 30 M <NA>
#> 2 Mary NA F A
vs <- list(
c(1, 3, 5, 6, 10),
c(1, 2, 3, 7, 8, 10),
c(1, 2, 3, 4, 8, 9, 10)
)
vs %>% reduce(intersect)
## [1] 1 3 10
#> [1] 1 3 10
x <- sample(10)
x
## [1] 6 7 1 8 5 4 9 10 2 3
#> [1] 6 9 8 5 2 4 7 1 10 3
x %>% accumulate(`+`)
## [1] 6 13 14 22 27 31 40 50 52 55
#> [1] 6 15 23 28 30 34 41 42 52 55