r4ds_ch17.R

# 17章　purrrでイテレーション

# 「nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading
# 「personal/sakai · master · nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading/tree/master/personal/sakai
# 「R for Data Science」 http://r4ds.had.co.nz/iteration.html
# 「r4ds-exercise-solutions/iteration.Rmd at master · jrnold/r4ds-exercise-solutions · GitHub」 https://github.com/jrnold/r4ds-exercise-solutions/blob/master/iteration.Rmd
# 「R for Data Science Solutions」 https://jrnold.github.io/r4ds-exercise-solutions/iteration.html

# 「RPubs - r4ds_ch17」 http://rpubs.com/tocci36/r4ds_ch17

# 17.1　はじめに

# 命令型プログラミングと関数型プログラミング

# 17.1.1　用意するもの

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.3.3

## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --

## √ ggplot2 2.2.1     √ purrr   0.2.4
## √ tibble  1.3.4     √ dplyr   0.7.4
## √ tidyr   0.7.2     √ stringr 1.2.0
## √ readr   1.1.1     √ forcats 0.2.0

## Warning: package 'tibble' was built under R version 3.3.3

## Warning: package 'tidyr' was built under R version 3.3.3

## Warning: package 'readr' was built under R version 3.3.3

## Warning: package 'purrr' was built under R version 3.3.3

## Warning: package 'dplyr' was built under R version 3.3.3

## Warning: package 'stringr' was built under R version 3.3.3

## Warning: package 'forcats' was built under R version 3.3.3

## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# 17.2　forループ p278
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)

median(df$a)

## [1] -0.1644697

median(df$b)

## [1] 0.4789213

median(df$c)

## [1] -0.0915009

median(df$d)

## [1] -0.5395308

output <- vector("double", ncol(df)) # 1. 出力
for (i in seq_along(df)) { # 2. シーケンス
  output[[i]] <- median(df[[i]]) # 3. 本体
}
output

## [1] -0.1644697  0.4789213 -0.0915009 -0.5395308

#?seq_along
# お馴染みの1:length(x)よりも安全で、ベクトルの長さが0の場合にもseq_along()はうまく処理してくれる
y <- vector("double", 0)
seq_along(y)

## integer(0)

1:length(y)

## [1] 1 0

# 練習問題 p279 
# 1. 次のforループを書きなさい。
#   a. mtcars の各列の平均を計算する。
output <- vector("double", ncol(mtcars))
names(output) <- names(mtcars)
for (i in names(mtcars)) {
  output[i] <- mean(mtcars[[i]])
}
output

##        mpg        cyl       disp         hp       drat         wt 
##  20.090625   6.187500 230.721875 146.687500   3.596563   3.217250 
##       qsec         vs         am       gear       carb 
##  17.848750   0.437500   0.406250   3.687500   2.812500

#   b. nycflights13::flights の各列の型を決定する。
data("flights", package = "nycflights13")
output <- vector("list", ncol(flights))
names(output) <- names(flights)
for (i in names(flights)) {
  output[[i]] <- class(flights[[i]])
}
output

## $year
## [1] "integer"
## 
## $month
## [1] "integer"
## 
## $day
## [1] "integer"
## 
## $dep_time
## [1] "integer"
## 
## $sched_dep_time
## [1] "integer"
## 
## $dep_delay
## [1] "numeric"
## 
## $arr_time
## [1] "integer"
## 
## $sched_arr_time
## [1] "integer"
## 
## $arr_delay
## [1] "numeric"
## 
## $carrier
## [1] "character"
## 
## $flight
## [1] "integer"
## 
## $tailnum
## [1] "character"
## 
## $origin
## [1] "character"
## 
## $dest
## [1] "character"
## 
## $air_time
## [1] "numeric"
## 
## $distance
## [1] "numeric"
## 
## $hour
## [1] "numeric"
## 
## $minute
## [1] "numeric"
## 
## $time_hour
## [1] "POSIXct" "POSIXt"

#?data

#   c. irisの各列の重複しない値の個数を計算する。
data(iris)
iris_uniq <- vector("double", ncol(iris))
names(iris_uniq) <- names(iris)
for (i in names(iris)) {
  iris_uniq[i] <- length(unique(iris[[i]]))
}
iris_uniq

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##           35           23           43           22            3

#   d. μ＝－10, 0, 10, 100のそれぞれについて正規乱数を10個生成する。

# number to draw
n <- 10
# values of the mean
mu <- c(-10, 0, 10, 100)
normals <- vector("list", length(mu))
for (i in seq_along(normals)) {
  normals[[i]] <- rnorm(n, mean = mu[i])
}
normals

## [[1]]
##  [1] -10.963594  -8.529333 -11.118950 -10.061983 -11.085086 -10.045363
##  [7]  -9.117913  -8.914534  -9.597413  -8.094066
## 
## [[2]]
##  [1] -1.93294838 -1.96698278  0.59356476 -0.22537187 -0.89193862
##  [6] -0.70415888 -0.10903386  0.25735643 -0.02202842 -1.48645574
## 
## [[3]]
##  [1] 10.606188  8.502946  9.366019 10.474606 11.661852 10.804707  8.892807
##  [8]  9.271668  8.577027  9.325091
## 
## [[4]]
##  [1]  98.44949 100.68337  99.70210 100.24529  99.41596  99.96408  98.92164
##  [8] 101.89987  99.94767 100.71988

# However, we don’t need a for loop for this since rnorm recycles means.
matrix(rnorm(n * length(mu), mean = mu), ncol = n)

##             [,1]         [,2]        [,3]        [,4]        [,5]
## [1,]  -8.6129467  -9.29461817 -10.7997606 -9.71443638  -9.9490001
## [2,]  -0.5187033   0.04363637  -0.2811731  0.06133795  -0.1028223
## [3,]   9.8103163   9.16962203  10.8023348 10.73869343  10.8062561
## [4,] 100.5999376 100.86246700  99.2161077 98.41637552 101.7372860
##            [,6]       [,7]       [,8]        [,9]         [,10]
## [1,] -9.7244334 -9.2638062 -9.2018730 -11.0339366 -1.136711e+01
## [2,] -0.2587833 -0.6339773  0.3070537  -0.1181517 -4.967254e-04
## [3,] 12.1025541  9.0554173  8.8167438  10.3911799  1.049595e+01
## [4,] 99.7411432 97.8955094 98.4828931 101.4362403  1.003480e+02

# ループを書く前に、出力、シーケンス、本体について考えること。


# 2. 次の3例で、既存のベクトルに関する関数を利用してforループを解消しなさい。
out <- ""
for (x in letters) {
  out <- stringr::str_c(out, x)
}

stringr::str_c(letters, collapse = "")

## [1] "abcdefghijklmnopqrstuvwxyz"

x <- sample(100)
sd <- 0
for (i in seq_along(x)) {
  sd <- sd + (x[i] - mean(x)) ^ 2
}
sd <- sqrt(sd / (length(x) - 1))

sd(x)

## [1] 29.01149

# Or
sqrt(sum((x - mean(x)) ^ 2) / (length(x) - 1))

## [1] 29.01149

x <- runif(100)
out <- vector("numeric", length(x))

all.equal(cumsum(x),out)

## [1] "Mean relative difference: 1"

out[1] <- x[1]
for (i in 2:length(x)) {
  out[i] <- out[i - 1] + x[i]
}

# TODO

# 3. 関数を書くのとforループのスキルを組み合わせて次を行う。
#   a. 童謡「Alice the Camel」の歌詞を印刷（print()）するforループを書きなさい＊1。
humps <- c("five", "four", "three", "two", "one", "no")
for (i in humps) {
  cat(str_c("Alice the camel has ", rep(i, 3), " humps.",
            collapse = "\n"), "\n")
  if (i == "no") {
    cat("Now Alice is a horse.\n")
  } else {
    cat("So go, Alice, go.\n")
  }
  cat("\n")
}

## Alice the camel has five humps.
## Alice the camel has five humps.
## Alice the camel has five humps. 
## So go, Alice, go.
## 
## Alice the camel has four humps.
## Alice the camel has four humps.
## Alice the camel has four humps. 
## So go, Alice, go.
## 
## Alice the camel has three humps.
## Alice the camel has three humps.
## Alice the camel has three humps. 
## So go, Alice, go.
## 
## Alice the camel has two humps.
## Alice the camel has two humps.
## Alice the camel has two humps. 
## So go, Alice, go.
## 
## Alice the camel has one humps.
## Alice the camel has one humps.
## Alice the camel has one humps. 
## So go, Alice, go.
## 
## Alice the camel has no humps.
## Alice the camel has no humps.
## Alice the camel has no humps. 
## Now Alice is a horse.

#   b. 童謡「Ten in the Bed」を関数にしなさい。人が何人でも、寝方がどのようでも作成できるよ
#   うにしなさい＊2。

numbers <- c("ten", "nine", "eight", "seven", "six", "five",
             "four", "three", "two", "one")
for (i in numbers) {
  cat(str_c("There were ", i, " in the bed\n"))
  cat("and the little one said\n")
  if (i == "one") {
    cat("I'm lonely...")
  } else {
    cat("Roll over, roll over\n")
    cat("So they all rolled over and one fell out.\n")
  }
  cat("\n")
}

## There were ten in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were nine in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were eight in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were seven in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were six in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were five in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were four in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were three in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were two in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
## 
## There were one in the bed
## and the little one said
## I'm lonely...

#   c. 「99 Bottles of Beer on the Wall」を関数に変換しなさい。場所がどこでも、どんな液体を
#   含んだ容器がいくつでもよいように一般化しなさい＊3。

bottles <- function(i) {
  if (i > 2) {
    bottles <- str_c(i - 1, " bottles")
  } else if (i == 2) {
    bottles <- "1 bottle"
  } else {
    bottles <- "no more bottles"
  }
  bottles
}

beer_bottles <- function(n) {
  # should test whether n >= 1.
  for (i in seq(n, 1)) {
    cat(str_c(bottles(i), " of beer on the wall, ", bottles(i), " of beer.\n"))
    cat(str_c("Take one down and pass it around, ", bottles(i - 1),
              " of beer on the wall.\n\n"))
  }
  cat("No more bottles of beer on the wall, no more bottles of beer.\n")
  cat(str_c("Go to the store and buy some more, ", bottles(n), " of beer on the wall.\n"))
}
beer_bottles(3)

## 2 bottles of beer on the wall, 2 bottles of beer.
## Take one down and pass it around, 1 bottle of beer on the wall.
## 
## 1 bottle of beer on the wall, 1 bottle of beer.
## Take one down and pass it around, no more bottles of beer on the wall.
## 
## no more bottles of beer on the wall, no more bottles of beer.
## Take one down and pass it around, no more bottles of beer on the wall.
## 
## No more bottles of beer on the wall, no more bottles of beer.
## Go to the store and buy some more, 2 bottles of beer on the wall.

# 4. 前もって出力を割り当てず、ステップごとにベクトルの長さを伸ばすforループをよく見かける。
output <- vector("integer", 0)
for (i in seq_along(x)) {
  output <- c(output, lengths(x[[i]]))
}
output

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

# これは性能にどう影響するか。実験を設計して実施しなさい。
add_to_vector <- function(n) {
  output <- vector("integer", 0)
  for (i in seq_len(n)) {
    output <- c(output, i)
  }
  output  
}
microbenchmark::microbenchmark(add_to_vector(10000), times = 3)

## Unit: milliseconds
##                  expr      min       lq     mean   median       uq
##  add_to_vector(10000) 99.94955 100.2007 108.7802 100.4518 113.1956
##       max neval
##  125.9393     3

#??microbenchmark

add_to_vector_2 <- function(n) {
  output <- vector("integer", n)
  for (i in seq_len(n)) {
    output[[i]] <- i
  }
  output
}
microbenchmark::microbenchmark(add_to_vector_2(10000), times = 3)

## Unit: milliseconds
##                    expr      min       lq     mean   median       uq
##  add_to_vector_2(10000) 9.366771 9.470339 9.562295 9.573906 9.660056
##       max neval
##  9.746207     3

# あらかじめ割り当てられたベクトルは約100倍高速です！あなたは異なる答えを得るかもしれませんが、ベクトルが長く、オブジェクトが大きければ大きいほど、事前割り振りは追加よりも優れています。


# 17.3　forループのバリエーション

# forループには4つのバリエーションがあります。
# ● 新たなオブジェクトを作らず、既存オブジェクトを変更する。
# ● 添字ではなく、名前や値についてループする。
# ● 長さが不明な出力を扱う。
# ● 長さのわからないシーケンスを扱う。


# 17.3.1　既存オブジェクトの変更

df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}
df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)

for (i in seq_along(df)) {
  df[[i]] <- rescale01(df[[i]])
}

# 17.3.2　ループパターン

# 要素を使ってループ：for (x in xs)

# 名前を使ってループ：(nm in names(xs))
results <- vector("list", length(x))
names(results) <- names(x)

# 数値の添字では、その位置で名前と値が両方取得できるので、最もよく使われます。
for (i in seq_along(x)) {
  name <- names(x)[[i]]
  value <- x[[i]]
}

# 17.3.3　出力長不明

means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
  n <- sample(100, 1)
  output <- c(output, rnorm(n, means[[i]]))
}
str(output)

##  num [1:111] 0.434 0.102 1.23 1.101 0.842 ...

out <- vector("list", length(means))
for (i in seq_along(means)) {
  n <- sample(100, 1)
  out[[i]] <- rnorm(n, means[[i]])
}
str(out)

## List of 3
##  $ : num [1:67] 0.634 -1.114 -0.636 0.559 -1.457 ...
##  $ : num [1:18] 0.307 0.345 0.349 1.89 0.975 ...
##  $ : num [1:80] 3.757 0.155 2.605 2.206 0.736 ...

str(unlist(out))

##  num [1:165] 0.634 -1.114 -0.636 0.559 -1.457 ...

# より安全なのはpurrr::flatten_dbl()を使うことです。これは、入力がdoubleのリストでないとエラーを投げます。
# ● 長い文字列を生成する場合。繰り返しのたびに前の文字列にpaste()するのではなく、出力を文
# 字ベクトルに保存しておき、paste(output, collapse = "")でベクトルの要素を連結して文字
# 列にする。
# ● 大きなデータフレームを生成する場合。繰り返しのたびにrbind()で逐次作る代わりに、出力を
# リストに保存し、dplyr::bind_rows(output)を使って出力を組み合わせて1つのデータフレー
# ムにまとめる。

# 17.3.4　シーケンス長不明

#while (condition) {
  # body
#}

#for (i in seq_along(x)) {
#  # body
#}
# 次と等価
i <- 1
while (i <= length(x)) {
  # body
  i <- i + 1
}

flip <- function() sample(c("T", "H"), 1)
flips <- 0
nheads <- 0
while (nheads < 3) {
  if (flip() == "H") {
    nheads <- nheads + 1
  } else {
    nheads <- 0
  }
  flips <- flips + 1
}
#Flips
# 日本語版誤植？ http://r4ds.had.co.nz/iteration.html では小文字で正しい。
flips

## [1] 9

# 練習問題 p284

# 1. 読み込むCSVファイルが格納されているディレクトリがあるとする。パスは、
files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)
# というベクトルで与えられてお
# り、read_csv()で各ファイルを読むとする。それらを読み込んで1つのデータフレームにする
# forループを書きなさい。
df <- vector("list", length(files))
for (fname in seq_along(files)) {
  df[[i]] <- read_csv(files[[i]])
}
df <- bind_rows(df)
#?bind_rows

# 2. for (nm in names(x))を使ったが、xに名前がなかったらどうなるか。要素の一部だけに名前
# がある場合はどうなるか。名前が重複していたらどうなるか。
x <- 1:3
print(names(x))

## NULL

#> NULL
for (nm in names(x)) {
  print(nm)
  print(x[[nm]])
}

# Note that the length of NULL is zero:
length(NULL)

## [1] 0

x <- c(a = 1, 2, c = 3)
names(x)

## [1] "a" ""  "c"

# for (nm in names(x)) {
#   print(nm)
#   print(x[[nm]])
# }
# [1] "a"
# [1] 1
# [1] ""
# x[[nm]] でエラー:  添え字が許される範囲外です 

# 3. データフレームの数値の列の平均を名前とともに出力する関数を書きなさい。例えば、show_
# mean(iris)が次のような出力をする。
#show_mean(iris)
#> Sepal.Length: 5.84
#> Sepal.Width: 3.06
#> Petal.Length: 3.76
#> Petal.Width: 1.20
# （追加問題：変数名の長さが変わっても数値がきちんと並ぶようにするため、どんな関数を使っ
# たか。）
show_mean <- function(df, digits = 2) {
  # Get max length of any variable in the dataset
  maxstr <- max(str_length(names(df)))
  for (nm in names(df)) {
    if (is.numeric(df[[nm]])) {
      cat(str_c(str_pad(str_c(nm, ":"), maxstr + 1L, side = "right"),
                format(mean(df[[nm]]), digits = digits, nsmall = digits),
                sep = " "),
          "\n")
    }
  }
}
show_mean(iris)

## Sepal.Length: 5.84 
## Sepal.Width:  3.06 
## Petal.Length: 3.76 
## Petal.Width:  1.20

# 4. 次のコードは何をするか。どのような作業をしているか。
mtcars2 <- mtcars
trans <- list(
  disp = function(x) x * 0.0163871,
  am = function(x) {
    factor(x, labels = c("auto", "manual"))
  }
)
for (var in names(trans)) {
  mtcars2[[var]] <- trans[[var]](mtcars[[var]])
}

glimpse(mtcars)

## Observations: 32
## Variables: 11
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl  <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1...
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...

glimpse(mtcars2)

## Observations: 32
## Variables: 11
## $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl  <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp <dbl> 2.621936, 2.621936, 1.769807, 4.227872, 5.899356, 3.68709...
## $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am   <fctr> manual, manual, manual, auto, auto, auto, auto, auto, au...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...

# 代入した変数を編集した方がいい
# factor化してる

# 17.4　forループと関数型
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)

# 各列の平均を計算することは、forループを使って書けます。
output <- vector("double", length(df))
for (i in seq_along(df)) {
  output[[i]] <- mean(df[[i]])
}
output

## [1] -0.63392925 -0.38588948  0.03254921 -0.14172525

col_mean <- function(df) {
  output <- vector("double", length(df))
  for (i in seq_along(df)) {
    output[i] <- mean(df[[i]])
  }
  output
}

col_median <- function(df) {
  output <- vector("double", length(df))
  for (i in seq_along(df)) {
    output[i] <- median(df[[i]])
  }
  output
}
col_sd <- function(df) {
  output <- vector("double", length(df))
  for (i in seq_along(df)) {
    output[i] <- sd(df[[i]])
  }
  output
}

f1 <- function(x) abs(x - mean(x)) ^ 1
f2 <- function(x) abs(x - mean(x)) ^ 2
f3 <- function(x) abs(x - mean(x)) ^ 3

col_summary <- function(df, fun) {
  out <- vector("double", length(df))
  for (i in seq_along(df)) {
    out[i] <- fun(df[[i]])
  }
  out
}
col_summary(df, median)

## [1] -0.82168690 -0.69558981  0.05413290  0.05417785

col_summary(df, mean)

## [1] -0.63392925 -0.38588948  0.03254921 -0.14172525

# 練習問題 p287

# 1. apply()のドキュメントを読みなさい。第2の例では、どんなforループ2つを一般化しているか。
# 2. col_summary()を数値列にだけ適用するよう変更しなさい。数値列に対してTRUEとなる論理ベ
# クトルを返す関数is_numeric()を使うとよい。
col_summary <- function(df, fun) {
  out <- vector("double", length(df))
  for (i in seq_along(df)) {
    out[i] <- fun(df[[i]])
  }
  out
}

# The adapted version is,

col_summary2 <- function(df, fun) {
  # test whether each colum is numeric
  numeric_cols <- vector("logical", length(df))
  for (i in seq_along(df)) {
    numeric_cols[[i]] <- is.numeric(df[[i]])
  }
  # indexes of numeric columns
  idxs <- seq_along(df)[numeric_cols]
  # number of numeric columns
  n <- sum(numeric_cols)
  out <- vector("double", n)
  for (i in idxs) {
    out[i] <- fun(df[[i]])
  }
  out
}

# numericの発音は？

# magrittr::%$%

# Let’s test that it works,

df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = letters[1:10],
  d = rnorm(10)
)
col_summary2(df, mean)

## [1] -0.8799454  0.6079939  0.0000000 -0.4561139

# 17.5　マップ関数



# 17.5.1　ショートカット

models <- mtcars %>%
  split(.$cyl) %>%
  map(function(df) lm(mpg ~ wt, data = df))

models <- mtcars %>%
  split(.$cyl) %>%
  map(~lm(mpg ~ wt, data = .))

# ※こっちの方がいいかも
models3 <- mtcars %>%
  split(.$cyl) %>%
  map(~lm(.$mpg ~ .$wt))

models %>%
  map(summary) %>%
  map_dbl(~.$r.squared)

##         4         6         8 
## 0.5086326 0.4645102 0.4229655

x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
x %>% map_dbl(2)

## [1] 2 5 8

# 17.5.2　基本R
x1 <- list(
  c(0.27, 0.37, 0.57, 0.91, 0.20),
  c(0.90, 0.94, 0.66, 0.63, 0.06),
  c(0.21, 0.18, 0.69, 0.38, 0.77)
)
x2 <- list(
  c(0.50, 0.72, 0.99, 0.38, 0.78),
  c(0.93, 0.21, 0.65, 0.13, 0.27),
  c(0.39, 0.01, 0.38, 0.87, 0.34)
)
threshold <- function(x, cutoff = 0.8) x[x > cutoff]
x1 %>% sapply(threshold) %>% str()

## List of 3
##  $ : num 0.91
##  $ : num [1:2] 0.9 0.94
##  $ : num(0)

x2 %>% sapply(threshold) %>% str()

##  num [1:3] 0.99 0.93 0.87

# 練習問題 p291

# 17.6　失敗の処理

safe_log <- safely(log)
str(safe_log(10))

## List of 2
##  $ result: num 2.3
##  $ error : NULL

str(safe_log("a"))

## List of 2
##  $ result: NULL
##  $ error :List of 2
##   ..$ message: chr " 数学関数に数値でない引数が渡されました "
##   ..$ call   : language log(x = x, base = base)
##   ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

x <- list(1, 10, "a")
y <- x %>% map(safely(log))
str(y)

## List of 3
##  $ :List of 2
##   ..$ result: num 0
##   ..$ error : NULL
##  $ :List of 2
##   ..$ result: num 2.3
##   ..$ error : NULL
##  $ :List of 2
##   ..$ result: NULL
##   ..$ error :List of 2
##   .. ..$ message: chr " 数学関数に数値でない引数が渡されました "
##   .. ..$ call   : language log(x = x, base = base)
##   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

y <- y %>% transpose()
str(y)

## List of 2
##  $ result:List of 3
##   ..$ : num 0
##   ..$ : num 2.3
##   ..$ : NULL
##  $ error :List of 3
##   ..$ : NULL
##   ..$ : NULL
##   ..$ :List of 2
##   .. ..$ message: chr " 数学関数に数値でない引数が渡されました "
##   .. ..$ call   : language log(x = x, base = base)
##   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

is_ok <- y$error %>% map_lgl(is_null)
x[!is_ok]

## [[1]]
## [1] "a"

y$result[is_ok] %>% flatten_dbl()

## [1] 0.000000 2.302585

x <- list(1, 10, "a")
x %>% map_dbl(possibly(log, NA_real_))

## [1] 0.000000 2.302585       NA

x <- list(1, -1)
x %>% map(quietly(log)) %>% str()

## List of 2
##  $ :List of 4
##   ..$ result  : num 0
##   ..$ output  : chr ""
##   ..$ warnings: chr(0) 
##   ..$ messages: chr(0) 
##  $ :List of 4
##   ..$ result  : num NaN
##   ..$ output  : chr ""
##   ..$ warnings: chr " 計算結果が NaN になりました "
##   ..$ messages: chr(0)

# bloom
# 山川さんがRStudioでコメントを閉じるのをやってたがどうやるんだろう？ 
# 「RStudioの折りたたみ機能 - もうカツ丼はいいよな」 http://rion778.hatenablog.com/entry/2015/05/31/175055

# map_dfc(), map_dfr()

# 17.7から次回

# 17.7　複数引数へのマップ
mu <- c(5, 10, -3)
mu %>%
  map(rnorm, n = 5) %>%
  str()

## List of 3
##  $ : num [1:5] 4.61 4.82 4.74 4.98 4.28
##  $ : num [1:5] 9.68 8.52 9.15 8.97 10.9
##  $ : num [1:5] -1.97 -4.73 -1.78 -3.1 -4

sigma <- list(1, 5, 10)
#sigma <- list(1, 5, 10, 100)
seq_along(mu) %>%
  map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
  str()

## List of 3
##  $ : num [1:5] 3.01 3.9 5.49 3.4 3.26
##  $ : num [1:5] 4.32 7.66 9.43 19.18 2.52
##  $ : num [1:5] -14.31 -4.96 -20.85 14.86 9.68

map2(mu, sigma, rnorm, n = 5) %>% str()

## List of 3
##  $ : num [1:5] 4.61 5.12 5.1 6.49 3.87
##  $ : num [1:5] 2.15 10.82 16.23 11.29 17.1
##  $ : num [1:5] 0.634 -13.626 6.525 -19.402 -9.738

map2 <- function(x, y, f, ...) {
  out <- vector("list", length(x))
  for (i in seq_along(x)) {
    out[[i]] <- f(x[[i]], y[[i]], ...)
  }
  out
}

n <- list(1, 3, 5)
args1 <- list(n, mu, sigma)
args1 %>%
  pmap(rnorm) %>%
  str()

## List of 3
##  $ : num 5.44
##  $ : num [1:3] 6.62 -3.76 16.26
##  $ : num [1:5] 5.31 -8.92 -9.99 -4.43 -4.99

args2 <- list(mean = mu, sd = sigma, n = n)
args2 %>%
  pmap(rnorm) %>%
  str()

## List of 3
##  $ : num 5.55
##  $ : num [1:3] 9.04 13.89 3.51
##  $ : num [1:5] 14.27 4.815 1.145 0.718 -10.613

params <- tribble(
  ~mean, ~sd, ~n,
  5, 1, 1,
  10, 5, 3,
  -3, 10, 5
)
params %>%
  pmap(rnorm)

## [[1]]
## [1] 7.129727
## 
## [[2]]
## [1] 12.173793  8.756864 -1.652082
## 
## [[3]]
## [1] -3.2136615  9.2915577 -0.1430352  1.4359142 -3.3225850

# 17.7.1　さまざまな関数を呼び出す
f <- c("runif", "rnorm", "rpois")
param <- list(
  list(min = -1, max = 1),
  list(sd = 5),
  list(lambda = 10)
)

invoke_map(f, param, n = 5) %>% str()

## List of 3
##  $ : num [1:5] -0.945 -0.57 0.436 0.601 -0.186
##  $ : num [1:5] -7.47 -4.49 1.98 -3.48 3.12
##  $ : int [1:5] 7 7 11 11 8

sim <- tribble(
  ~f, ~params,
  "runif", list(min = -1, max = 1),
  "rnorm", list(sd = 5),
  "rpois", list(lambda = 10)
)
sim %>%
  mutate(sim = invoke_map(f, params, n = 10))

## Warning: package 'bindrcpp' was built under R version 3.3.3

## # A tibble: 3 x 3
##       f     params        sim
##   <chr>     <list>     <list>
## 1 runif <list [2]> <dbl [10]>
## 2 rnorm <list [1]> <dbl [10]>
## 3 rpois <list [1]> <int [10]>

# 17.8　ウォーク
x <- list(1, "a", 3)
x %>%
  walk(print)

## [1] 1
## [1] "a"
## [1] 3

library(ggplot2)
plots <- mtcars %>%
  split(.$cyl) %>%
  map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- stringr::str_c(names(plots), ".pdf")
pwalk(list(paths, plots), ggsave, path = tempdir())

## Saving 7 x 5 in image

## Saving 7 x 5 in image
## Saving 7 x 5 in image

# 17.9　forループの他のパターン

# 17.9.1　述語関数
iris %>%
  keep(is.factor) %>%
  str()

## 'data.frame':    150 obs. of  1 variable:
##  $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

iris %>%
  discard(is.factor) %>%
  str()

## 'data.frame':    150 obs. of  4 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...

x <- list(1:5, letters, list(10))
x %>%
  some(is_character)

## [1] TRUE

x %>%
  every(is_vector)

## [1] TRUE

x <- sample(10)
x

##  [1]  1  8  6 10  2  5  9  7  4  3

x %>%
  detect(~ . > 5)

## [1] 8

x %>%
  detect_index(~ . > 5)

## [1] 2

x %>%
  head_while(~ . > 5)

## integer(0)

x %>%
  tail_while(~ . > 5)

## integer(0)

# 17.9.2　reduceとaccumulate
dfs <- list(
  age = tibble(name = "John", age = 30),
  sex = tibble(name = c("John", "Mary"), sex = c("M", "F")),
  trt = tibble(name = "Mary", treatment = "A")
)
dfs %>% reduce(full_join)

## Joining, by = "name"

## Joining, by = "name"

## # A tibble: 2 x 4
##    name   age   sex treatment
##   <chr> <dbl> <chr>     <chr>
## 1  John    30     M      <NA>
## 2  Mary    NA     F         A

vs <- list(
  c(1, 3, 5, 6, 10),
  c(1, 2, 3, 7, 8, 10),
  c(1, 2, 3, 4, 8, 9, 10)
)
vs %>% reduce(intersect)

## [1]  1  3 10

x <- sample(10)
x

##  [1]  4 10  3  5  6  7  9  1  8  2

x %>% accumulate(`+`)

##  [1]  4 14 17 22 28 35 44 45 53 55

1:3 %>% accumulate(`*`)

## [1] 1 2 6

#?accumulate_right
# integer numeric?
typeof(1:3)

## [1] "integer"

class(1:3)

## [1] "integer"

# 練習問題 p300
# 1. forループを使って、every()の自分のバージョンを作りなさい。purrr::every()と比較しな
# さい。purrrのバージョンは、読者の作ったのとは異なる動作を何かしているか。

# Use ... to pass arguments to the function
every2 <- function(.x, .p, ...) {
  for (i in .x) {
    if (!.p(i, ...)) {
      # If any is FALSE we know not all of then were TRUE
      return(FALSE)
    }
  }
  # if nothing was FALSE, then it is TRUE
  TRUE  
}

every2(1:3, function(x) {x > 1})

## [1] FALSE

every2(1:3, function(x) {x > 0})

## [1] TRUE

# 2. col_sum()を修正して、データフレームのすべての数値列に要約関数を適用するよう強化しなさい。
col_sum2 <- function(df, f, ...) {
  map(keep(df, is.numeric), f, ...)
}
col_sum2(iris, mean)

## $Sepal.Length
## [1] 5.843333
## 
## $Sepal.Width
## [1] 3.057333
## 
## $Petal.Length
## [1] 3.758
## 
## $Petal.Width
## [1] 1.199333

# 3. col_sum()と等価な基本R実装の試作は次のようになった。
col_sum3 <- function(df, f) {
  is_num <- sapply(df, is.numeric)
  df_num <- df[, is_num]
  sapply(df_num, f)
}
# しかし、これでは次に示すようにバグがある。
df <- tibble(
  x = 1:3,
  y = 3:1,
  z = c("a", "b", "c")
)
# OK
col_sum3(df, mean)

## x y 
## 2 2

# 問題あり:数値ベクトルを返すとは限らない
col_sum3(df[1:2], mean)

## x y 
## 2 2

col_sum3(df[1], mean)

## x 
## 2

#col_sum3(df[0], mean)
#  Error: Unsupported index type: list 
# 何がバグの原因か。

# The problem is that sapply does not always return numeric vectors.
# If no columns are selected, instead of returning an empty numeric vector,
# it returns an empty list. This causes an error since we can’t use a list with [.
# 問題は、sapplyは常に数値ベクトルを返すとは限らないことです。
# 列が選択されていない場合、空の数値ベクトルを返すのではなく、空のリストを返します。
# [でリストを使うことができないので、エラーを引き起こします。
sapply(df[0], is.numeric)

## named list()

sapply(df[1], is.numeric)

##    x 
## TRUE

sapply(df[1:2], is.numeric)

##    x    y 
## TRUE TRUE

# if (ncol(df) == 0)...

# purrrチートシートを見るといい
# 「Cheatsheets – RStudio」 https://www.rstudio.com/resources/cheatsheets/

# ~p301

r4ds_ch17.R

JP16828

Tue May 22 17:10:57 2018