Iterations with purrr

Pada bagian ini saya belajar tentang Iteratons with purrr.

library(tidyverse)
library(caret)

For Loops

df <- tibble(
 a = rnorm(10),
 b = rnorm(10),
 c = rnorm(10),
 d = rnorm(10)
)

Misal kita ingin mencari median dari setiap kolom dari dataset diatas.

median(df$a)

## [1] 0.5761079

median(df$b)

## [1] -0.1103244

median(df$c)

## [1] 0.3987047

median(df$d)

## [1] 0.1792801

secara praktis kita bisa menggunakan copy paste seperti diatas, tapi sebaiknya jangan menggunakan copy paste bila lebih dari dua kali. Lebih baik gunakan sebagai berikut:

output <- vector("double", ncol(df)) # 1. output
for (i in seq_along(df)) { # 2. sequence
 output[[i]] <- median(df[[i]]) # 3. body
}
output

## [1]  0.5761079 -0.1103244  0.3987047  0.1792801

Modifying an Existing Object

rescale1<-function(x){
  range1<-range(x, na.rm=T)
  (x-range1[1])/(range1[2]-range1[1])
}
df$a <- rescale1(df$a)
df$b <- rescale1(df$b)
df$c <- rescale1(df$c)
df$d <- rescale1(df$d)
df

## # A tibble: 10 x 4
##        a     b     c     d
##    <dbl> <dbl> <dbl> <dbl>
##  1 0.648 0.760 0.617 0.127
##  2 0.314 0.331 0.618 0.431
##  3 0     0.807 0.182 1    
##  4 0.754 0.256 0.908 0.474
##  5 0.899 0.892 1     0.303
##  6 0.733 0.848 0.467 0.446
##  7 1     1     0.628 0    
##  8 0.327 0     0.483 0.335
##  9 0.800 0.730 0     0.607
## 10 0.378 0.497 0.382 0.321

Karena output masih sama dengan input (banyak baris dan kolom sama)

for (i in seq_along(df)) {
df[[i]]<-rescale1(df[[i]])  
}

df

## # A tibble: 10 x 4
##        a     b     c     d
##    <dbl> <dbl> <dbl> <dbl>
##  1 0.648 0.760 0.617 0.127
##  2 0.314 0.331 0.618 0.431
##  3 0     0.807 0.182 1    
##  4 0.754 0.256 0.908 0.474
##  5 0.899 0.892 1     0.303
##  6 0.733 0.848 0.467 0.446
##  7 1     1     0.628 0    
##  8 0.327 0     0.483 0.335
##  9 0.800 0.730 0     0.607
## 10 0.378 0.497 0.382 0.321

Unknown Output Length

Terkadang kita tidak mengetahui panjang dari outputnya. Misal kita ingin membuat suatu barisan distribusi normal dengan rata-rata pada variabel means dan dengan banyak datanya random sampling dari 100.

set.seed(100)
means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
 n <- sample(100, 1)
 output <- c(output, rnorm(n, means[[i]]))
}
str(output)

##  num [1:210] -0.6505 -1.5859 -0.0407 -0.331 -0.9531 ...

Kode diatas adalah tidak efisien (kurang paham juga, kenapa tidak efisien). Solusi terbaiknya adalah dengan menyimpan hasil pada list dan kemudian gabungkan menjadi satu vektor.

set.seed(100)
out <- vector("list", length(means))
for (i in seq_along(means)) {
 n <- sample(100, 1)
 out[[i]] <- rnorm(n, means[[i]])
}
str(out)

## List of 3
##  $ : num [1:74] -0.6505 -1.5859 -0.0407 -0.331 -0.9531 ...
##  $ : num [1:78] 2.4494 0.0426 1.8998 1.7998 1.5189 ...
##  $ : num [1:58] 2.25 1.66 1.89 1.9 2.26 ...

str(unlist(out))

##  num [1:210] -0.6505 -1.5859 -0.0407 -0.331 -0.9531 ...

Unknown Sequence Length

Terkadang kita tidak mengetahui panjang dari input. Hal ini dapat teratasi dengan while loops.

contoh:

Hasil keduanya adalah equivalen

x<-runif(100)
# For loops --------------------------------------
for (i in seq_along(x)) {
 # body
}


# while loops ------------------------------------
i <- 1
while (i <= length(x)) {
 # body
  i<-i+1
}

ini contoh penggunaan while dibukunya, bingung juga hehehhehe.

flip <- function() {sample(c("T", "H"), 1)}
flips <- 0
nheads <- 0

while (nheads < 3) {
  if (flip() == "H") {
    nheads <- nheads + 1
    } else {
      nheads <- 0
      }
  flips <- flips + 1
  }

flips

## [1] 15

For Loops Versus Functional

Sebaiknya lakukan loops pada function dengan kata lain loopsnya ada dalam function, jangan langsung menggunakan loops pada code, karena R adalah bahasa pemrograman fungsional.

df <- tibble(
 a = rnorm(10),
 b = rnorm(10),
 c = rnorm(10),
 d = rnorm(10)
)

output <- vector("double", length(df))
for (i in seq_along(df)) {
  output[[i]] <- mean(df[[i]])
  }
output

## [1]  0.849816019  0.007359621  0.622279241 -0.136471099

Seperti dibawah ini.

col_mean <- function(df) {
  output <- vector("double", length(df))
  for (i in seq_along(df)) {
    output[i] <- mean(df[[i]])
    }
  output
}

output

## [1]  0.849816019  0.007359621  0.622279241 -0.136471099

Jika ingin menghitung median dan standar deviasi tinggal copy paste code diatas.

col_median <- function(df) {
  output <- vector("double", length(df))
  for (i in seq_along(df)) {
    output[i] <- median(df[[i]])
    }
  output
}
output

## [1]  0.849816019  0.007359621  0.622279241 -0.136471099

col_sd <- function(df) {
  output <- vector("double", length(df))
  for (i in seq_along(df)) {
    output[i] <- sd(df[[i]])
    }
  output
}
output

## [1]  0.849816019  0.007359621  0.622279241 -0.136471099

Tapi karena kita menggunakan copy paste lebih dari sekali, lebih baik gunakan function. Jadi nati function dalam function, seperti dibawah:

col_summary <- function(df, fun) {
  out <- vector("double", length(df))
  for (i in seq_along(df)) {
    out[i] <- fun(df[[i]])
    }
  out
}

col_summary(df, median)

## [1]  1.03281408 -0.09431933  0.56830976 -0.31375560

col_summary(df, sd)

## [1] 0.7716724 0.8653501 0.9127150 0.8555120

col_summary(df, mean)

## [1]  0.849816019  0.007359621  0.622279241 -0.136471099

The Map Functions

map_dbl(df, mean)

##            a            b            c            d 
##  0.849816019  0.007359621  0.622279241 -0.136471099

map_dbl(df, median)

##           a           b           c           d 
##  1.03281408 -0.09431933  0.56830976 -0.31375560

df%>%
  map_dbl(mean, trim=0.5)

##           a           b           c           d 
##  1.03281408 -0.09431933  0.56830976 -0.31375560

df%>%
  map_dbl(median)

##           a           b           c           d 
##  1.03281408 -0.09431933  0.56830976 -0.31375560

Shorcuts

From this

models <- mtcars %>%
 split(.$cyl) %>%
 map(function(df) lm(mpg ~ wt, data = df))

To this

models <- mtcars %>%
 split(.$cyl) %>%
 map(~lm(mpg ~ wt, data = .))

models %>%
 map(summary) %>%
 map_dbl(~.$r.squared)

##         4         6         8 
## 0.5086326 0.4645102 0.4229655

models %>%
 map(summary) %>%
 map_dbl("r.squared")

##         4         6         8 
## 0.5086326 0.4645102 0.4229655

Base R

x1 <- list(
 c(0.27, 0.37, 0.57, 0.91, 0.20),
 c(0.90, 0.94, 0.66, 0.63, 0.06),
 c(0.21, 0.18, 0.69, 0.38, 0.77)
)
x2 <- list(
 c(0.50, 0.72, 0.99, 0.38, 0.78),
 c(0.93, 0.21, 0.65, 0.13, 0.27),
 c(0.39, 0.01, 0.38, 0.87, 0.34)
)
threshold <- function(x, cutoff = 0.8) x[x > cutoff]
x1 %>% sapply(threshold) %>% str()

## List of 3
##  $ : num 0.91
##  $ : num [1:2] 0.9 0.94
##  $ : num(0)

x2 %>% sapply(threshold) %>% str()

##  num [1:3] 0.99 0.93 0.87

Dealing with Failure

safe_log <- safely(log)
str(safe_log(10))

## List of 2
##  $ result: num 2.3
##  $ error : NULL

#> List of 2
#> $ result: num 2.3
#> $ error : NULL
str(safe_log("a"))

## List of 2
##  $ result: NULL
##  $ error :List of 2
##   ..$ message: chr "non-numeric argument to mathematical function"
##   ..$ call   : language .Primitive("log")(x, base)
##   ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

x <- list(1, 10, "a")
y <- x %>% map(safely(log))
str(y)

## List of 3
##  $ :List of 2
##   ..$ result: num 0
##   ..$ error : NULL
##  $ :List of 2
##   ..$ result: num 2.3
##   ..$ error : NULL
##  $ :List of 2
##   ..$ result: NULL
##   ..$ error :List of 2
##   .. ..$ message: chr "non-numeric argument to mathematical function"
##   .. ..$ call   : language .Primitive("log")(x, base)
##   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

y <- y %>% transpose()
str(y)

## List of 2
##  $ result:List of 3
##   ..$ : num 0
##   ..$ : num 2.3
##   ..$ : NULL
##  $ error :List of 3
##   ..$ : NULL
##   ..$ : NULL
##   ..$ :List of 2
##   .. ..$ message: chr "non-numeric argument to mathematical function"
##   .. ..$ call   : language .Primitive("log")(x, base)
##   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

is_ok <- y$error %>% map_lgl(is_null)
x[!is_ok]

## [[1]]
## [1] "a"

#> [[1]]
#> [1] "a"
y$result[is_ok] %>% flatten_dbl()

## [1] 0.000000 2.302585

#> [1] 0.0 2.3

x <- list(1, 10, "a")
x %>% map_dbl(possibly(log, NA_real_))

## [1] 0.000000 2.302585       NA

x <- list(1, -1)
x %>% map(quietly(log)) %>% str()

## List of 2
##  $ :List of 4
##   ..$ result  : num 0
##   ..$ output  : chr ""
##   ..$ warnings: chr(0) 
##   ..$ messages: chr(0) 
##  $ :List of 4
##   ..$ result  : num NaN
##   ..$ output  : chr ""
##   ..$ warnings: chr "NaNs produced"
##   ..$ messages: chr(0)

Mapping over Multiple Arguments

mu <- list(5, 10, -3)
mu %>%
 map(rnorm, n = 5) %>%
 str()

## List of 3
##  $ : num [1:5] 4.47 5.75 4.69 6.27 5.98
##  $ : num [1:5] 8.91 10.45 9.11 8.83 11.5
##  $ : num [1:5] -3.12 -1.78 -3.85 -4.75 -2.24

sigma <- list(1, 5, 10)
seq_along(mu) %>%
 map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
 str()

## List of 3
##  $ : num [1:5] 4.54 4.99 5.64 4.85 5.69
##  $ : num [1:5] 8.29 5.88 16.92 8.29 8.87
##  $ : num [1:5] 10.417 -13.239 -3.859 -14.839 0.877

map2(mu, sigma, rnorm, n = 5) %>% str()

## List of 3
##  $ : num [1:5] 3.87 4.36 7.45 5.6 5.93
##  $ : num [1:5] 12.27 22.18 4.72 14.81 8.05
##  $ : num [1:5] 6.9 2.09 -21.89 -5.3 1.21

map2 <- function(x, y, f, ...) {
 out <- vector("list", length(x))
 for (i in seq_along(x)) {
 out[[i]] <- f(x[[i]], y[[i]], ...)
 }
 out
}

n <- list(1, 3, 5)
args1 <- list(n, mu, sigma)
args1 %>%
 pmap(rnorm) %>%
 str()

## List of 3
##  $ : num 3.64
##  $ : num [1:3] 17.52 12.43 -1.83
##  $ : num [1:5] -3.48 -1.18 -6.06 -15.96 -16.77

args2 <- list(mean = mu, sd = sigma, n = n)
args2 %>%
 pmap(rnorm) %>%
 str()

## List of 3
##  $ : num 7.11
##  $ : num [1:3] 12.41 7.03 12.12
##  $ : num [1:5] -0.569 9.663 16.062 11.642 2

params <- tribble(
 ~mean, ~sd, ~n,
 5, 1, 1,
 10, 5, 3,
 -3, 10, 5
)
params %>%
 pmap(rnorm)

## [[1]]
## [1] 5.259037
## 
## [[2]]
## [1]  3.534351 12.720815 11.674402
## 
## [[3]]
## [1]  -4.183738 -26.497384  -8.851210   4.602829   9.856654

Invoking Different Functions

f <- c("runif", "rnorm", "rpois")
param <- list(
 list(min = -1, max = 1),
 list(sd = 5),
 list(lambda = 10)
)

invoke_map(f, param, n = 5) %>% str()

## List of 3
##  $ : num [1:5] -0.421 -0.511 0.178 -0.238 0.411
##  $ : num [1:5] -5.98 5.1 -2.39 -3.3 3.39
##  $ : int [1:5] 9 8 14 13 6

sim <- tribble(
 ~f, ~params,
 "runif", list(min = -1, max = 1),
 "rnorm", list(sd = 5),
 "rpois", list(lambda = 10)
)
sim %>%
 mutate(sim = invoke_map(f, params, n = 10))

## # A tibble: 3 x 3
##   f     params           sim       
##   <chr> <list>           <list>    
## 1 runif <named list [2]> <dbl [10]>
## 2 rnorm <named list [1]> <dbl [10]>
## 3 rpois <named list [1]> <int [10]>

Walk

x <- list(1, "a", 3)
x %>%
  walk(print)

## [1] 1
## [1] "a"
## [1] 3

library(ggplot2)
plots <- mtcars %>%
 split(.$cyl) %>%
 map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- stringr::str_c(names(plots), ".pdf")
pwalk(list(paths, plots), ggsave, path = tempdir())

## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image

Other Pattern of for Loops

iris %>%
 keep(is.factor) %>%
 str()

## 'data.frame':    150 obs. of  1 variable:
##  $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

#> 'data.frame': 150 obs. of 1 variable:
#> $ Species: Factor w/ 3 levels "setosa","versicolor",..: ...
iris %>%
 discard(is.factor) %>%
 str()

## 'data.frame':    150 obs. of  4 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...

#> 'data.frame': 150 obs. of 4 variables:

x <- list(1:5, letters, list(10))
x %>%
 some(is_character)

## [1] TRUE

#> [1] TRUE
x %>%
 every(is_vector)

## [1] TRUE

#> [1] TRUEx <- list(1:5, letters, list(10))
x %>%
 some(is_character)

## [1] TRUE

#> [1] TRUE
x %>%
 every(is_vector)

## [1] TRUE

#> [1] TRUE

x <- sample(10)
x

##  [1]  8  3  7  2  1  6  9 10  4  5

#> [1] 8 7 5 6 9 2 10 1 3 4
x %>%
 detect(~ . > 5)

## [1] 8

#> [1] 8
x %>%
 detect_index(~ . > 5)

## [1] 1

#> [1] 1

x %>%
 head_while(~ . > 5)

## [1] 8

#> [1] 8 7
x %>%
 tail_while(~ . > 5)

## integer(0)

dfs <- list(
 age = tibble(name = "John", age = 30),
 sex = tibble(name = c("John", "Mary"), sex = c("M", "F")),
 trt = tibble(name = "Mary", treatment = "A")
)
dfs %>% reduce(full_join)

## Joining, by = "name"
## Joining, by = "name"

## # A tibble: 2 x 4
##   name    age sex   treatment
##   <chr> <dbl> <chr> <chr>    
## 1 John     30 M     <NA>     
## 2 Mary     NA F     A

#> Joining, by = "name"
#> Joining, by = "name"
#> # A tibble: 2 × 4
#> name age sex treatment
#> <chr> <dbl> <chr> <chr>
#> 1 John 30 M <NA>
#> 2 Mary NA F A

vs <- list(
 c(1, 3, 5, 6, 10),
 c(1, 2, 3, 7, 8, 10),
 c(1, 2, 3, 4, 8, 9, 10)
)
vs %>% reduce(intersect)

## [1]  1  3 10

#> [1] 1 3 10

x <- sample(10)
x

##  [1]  6  7  1  8  5  4  9 10  2  3

#> [1] 6 9 8 5 2 4 7 1 10 3
x %>% accumulate(`+`)

##  [1]  6 13 14 22 27 31 40 50 52 55

#> [1] 6 15 23 28 30 34 41 42 52 55