# 17章 purrrでイテレーション
# 「nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading
# 「personal/sakai · master · nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading/tree/master/personal/sakai
# 「R for Data Science」 http://r4ds.had.co.nz/iteration.html
# 「r4ds-exercise-solutions/iteration.Rmd at master · jrnold/r4ds-exercise-solutions · GitHub」 https://github.com/jrnold/r4ds-exercise-solutions/blob/master/iteration.Rmd
# 「R for Data Science Solutions」 https://jrnold.github.io/r4ds-exercise-solutions/iteration.html
# 「RPubs - r4ds_ch17」 http://rpubs.com/tocci36/r4ds_ch17
# 17.1 はじめに
# 命令型プログラミングと関数型プログラミング
# 17.1.1 用意するもの
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## √ ggplot2 2.2.1 √ purrr 0.2.4
## √ tibble 1.3.4 √ dplyr 0.7.4
## √ tidyr 0.7.2 √ stringr 1.2.0
## √ readr 1.1.1 √ forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.3.3
## Warning: package 'tidyr' was built under R version 3.3.3
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'purrr' was built under R version 3.3.3
## Warning: package 'dplyr' was built under R version 3.3.3
## Warning: package 'stringr' was built under R version 3.3.3
## Warning: package 'forcats' was built under R version 3.3.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# 17.2 forループ p278
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
median(df$a)
## [1] -0.1644697
median(df$b)
## [1] 0.4789213
median(df$c)
## [1] -0.0915009
median(df$d)
## [1] -0.5395308
output <- vector("double", ncol(df)) # 1. 出力
for (i in seq_along(df)) { # 2. シーケンス
output[[i]] <- median(df[[i]]) # 3. 本体
}
output
## [1] -0.1644697 0.4789213 -0.0915009 -0.5395308
#?seq_along
# お馴染みの1:length(x)よりも安全で、ベクトルの長さが0の場合にもseq_along()はうまく処理してくれる
y <- vector("double", 0)
seq_along(y)
## integer(0)
1:length(y)
## [1] 1 0
# 練習問題 p279
# 1. 次のforループを書きなさい。
# a. mtcars の各列の平均を計算する。
output <- vector("double", ncol(mtcars))
names(output) <- names(mtcars)
for (i in names(mtcars)) {
output[i] <- mean(mtcars[[i]])
}
output
## mpg cyl disp hp drat wt
## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250
## qsec vs am gear carb
## 17.848750 0.437500 0.406250 3.687500 2.812500
# b. nycflights13::flights の各列の型を決定する。
data("flights", package = "nycflights13")
output <- vector("list", ncol(flights))
names(output) <- names(flights)
for (i in names(flights)) {
output[[i]] <- class(flights[[i]])
}
output
## $year
## [1] "integer"
##
## $month
## [1] "integer"
##
## $day
## [1] "integer"
##
## $dep_time
## [1] "integer"
##
## $sched_dep_time
## [1] "integer"
##
## $dep_delay
## [1] "numeric"
##
## $arr_time
## [1] "integer"
##
## $sched_arr_time
## [1] "integer"
##
## $arr_delay
## [1] "numeric"
##
## $carrier
## [1] "character"
##
## $flight
## [1] "integer"
##
## $tailnum
## [1] "character"
##
## $origin
## [1] "character"
##
## $dest
## [1] "character"
##
## $air_time
## [1] "numeric"
##
## $distance
## [1] "numeric"
##
## $hour
## [1] "numeric"
##
## $minute
## [1] "numeric"
##
## $time_hour
## [1] "POSIXct" "POSIXt"
#?data
# c. irisの各列の重複しない値の個数を計算する。
data(iris)
iris_uniq <- vector("double", ncol(iris))
names(iris_uniq) <- names(iris)
for (i in names(iris)) {
iris_uniq[i] <- length(unique(iris[[i]]))
}
iris_uniq
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 35 23 43 22 3
# d. μ=-10, 0, 10, 100のそれぞれについて正規乱数を10個生成する。
# number to draw
n <- 10
# values of the mean
mu <- c(-10, 0, 10, 100)
normals <- vector("list", length(mu))
for (i in seq_along(normals)) {
normals[[i]] <- rnorm(n, mean = mu[i])
}
normals
## [[1]]
## [1] -10.963594 -8.529333 -11.118950 -10.061983 -11.085086 -10.045363
## [7] -9.117913 -8.914534 -9.597413 -8.094066
##
## [[2]]
## [1] -1.93294838 -1.96698278 0.59356476 -0.22537187 -0.89193862
## [6] -0.70415888 -0.10903386 0.25735643 -0.02202842 -1.48645574
##
## [[3]]
## [1] 10.606188 8.502946 9.366019 10.474606 11.661852 10.804707 8.892807
## [8] 9.271668 8.577027 9.325091
##
## [[4]]
## [1] 98.44949 100.68337 99.70210 100.24529 99.41596 99.96408 98.92164
## [8] 101.89987 99.94767 100.71988
# However, we don’t need a for loop for this since rnorm recycles means.
matrix(rnorm(n * length(mu), mean = mu), ncol = n)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -8.6129467 -9.29461817 -10.7997606 -9.71443638 -9.9490001
## [2,] -0.5187033 0.04363637 -0.2811731 0.06133795 -0.1028223
## [3,] 9.8103163 9.16962203 10.8023348 10.73869343 10.8062561
## [4,] 100.5999376 100.86246700 99.2161077 98.41637552 101.7372860
## [,6] [,7] [,8] [,9] [,10]
## [1,] -9.7244334 -9.2638062 -9.2018730 -11.0339366 -1.136711e+01
## [2,] -0.2587833 -0.6339773 0.3070537 -0.1181517 -4.967254e-04
## [3,] 12.1025541 9.0554173 8.8167438 10.3911799 1.049595e+01
## [4,] 99.7411432 97.8955094 98.4828931 101.4362403 1.003480e+02
# ループを書く前に、出力、シーケンス、本体について考えること。
# 2. 次の3例で、既存のベクトルに関する関数を利用してforループを解消しなさい。
out <- ""
for (x in letters) {
out <- stringr::str_c(out, x)
}
stringr::str_c(letters, collapse = "")
## [1] "abcdefghijklmnopqrstuvwxyz"
x <- sample(100)
sd <- 0
for (i in seq_along(x)) {
sd <- sd + (x[i] - mean(x)) ^ 2
}
sd <- sqrt(sd / (length(x) - 1))
sd(x)
## [1] 29.01149
# Or
sqrt(sum((x - mean(x)) ^ 2) / (length(x) - 1))
## [1] 29.01149
x <- runif(100)
out <- vector("numeric", length(x))
all.equal(cumsum(x),out)
## [1] "Mean relative difference: 1"
out[1] <- x[1]
for (i in 2:length(x)) {
out[i] <- out[i - 1] + x[i]
}
# TODO
# 3. 関数を書くのとforループのスキルを組み合わせて次を行う。
# a. 童謡「Alice the Camel」の歌詞を印刷(print())するforループを書きなさい*1。
humps <- c("five", "four", "three", "two", "one", "no")
for (i in humps) {
cat(str_c("Alice the camel has ", rep(i, 3), " humps.",
collapse = "\n"), "\n")
if (i == "no") {
cat("Now Alice is a horse.\n")
} else {
cat("So go, Alice, go.\n")
}
cat("\n")
}
## Alice the camel has five humps.
## Alice the camel has five humps.
## Alice the camel has five humps.
## So go, Alice, go.
##
## Alice the camel has four humps.
## Alice the camel has four humps.
## Alice the camel has four humps.
## So go, Alice, go.
##
## Alice the camel has three humps.
## Alice the camel has three humps.
## Alice the camel has three humps.
## So go, Alice, go.
##
## Alice the camel has two humps.
## Alice the camel has two humps.
## Alice the camel has two humps.
## So go, Alice, go.
##
## Alice the camel has one humps.
## Alice the camel has one humps.
## Alice the camel has one humps.
## So go, Alice, go.
##
## Alice the camel has no humps.
## Alice the camel has no humps.
## Alice the camel has no humps.
## Now Alice is a horse.
# b. 童謡「Ten in the Bed」を関数にしなさい。人が何人でも、寝方がどのようでも作成できるよ
# うにしなさい*2。
numbers <- c("ten", "nine", "eight", "seven", "six", "five",
"four", "three", "two", "one")
for (i in numbers) {
cat(str_c("There were ", i, " in the bed\n"))
cat("and the little one said\n")
if (i == "one") {
cat("I'm lonely...")
} else {
cat("Roll over, roll over\n")
cat("So they all rolled over and one fell out.\n")
}
cat("\n")
}
## There were ten in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were nine in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were eight in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were seven in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were six in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were five in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were four in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were three in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were two in the bed
## and the little one said
## Roll over, roll over
## So they all rolled over and one fell out.
##
## There were one in the bed
## and the little one said
## I'm lonely...
# c. 「99 Bottles of Beer on the Wall」を関数に変換しなさい。場所がどこでも、どんな液体を
# 含んだ容器がいくつでもよいように一般化しなさい*3。
bottles <- function(i) {
if (i > 2) {
bottles <- str_c(i - 1, " bottles")
} else if (i == 2) {
bottles <- "1 bottle"
} else {
bottles <- "no more bottles"
}
bottles
}
beer_bottles <- function(n) {
# should test whether n >= 1.
for (i in seq(n, 1)) {
cat(str_c(bottles(i), " of beer on the wall, ", bottles(i), " of beer.\n"))
cat(str_c("Take one down and pass it around, ", bottles(i - 1),
" of beer on the wall.\n\n"))
}
cat("No more bottles of beer on the wall, no more bottles of beer.\n")
cat(str_c("Go to the store and buy some more, ", bottles(n), " of beer on the wall.\n"))
}
beer_bottles(3)
## 2 bottles of beer on the wall, 2 bottles of beer.
## Take one down and pass it around, 1 bottle of beer on the wall.
##
## 1 bottle of beer on the wall, 1 bottle of beer.
## Take one down and pass it around, no more bottles of beer on the wall.
##
## no more bottles of beer on the wall, no more bottles of beer.
## Take one down and pass it around, no more bottles of beer on the wall.
##
## No more bottles of beer on the wall, no more bottles of beer.
## Go to the store and buy some more, 2 bottles of beer on the wall.
# 4. 前もって出力を割り当てず、ステップごとにベクトルの長さを伸ばすforループをよく見かける。
output <- vector("integer", 0)
for (i in seq_along(x)) {
output <- c(output, lengths(x[[i]]))
}
output
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# これは性能にどう影響するか。実験を設計して実施しなさい。
add_to_vector <- function(n) {
output <- vector("integer", 0)
for (i in seq_len(n)) {
output <- c(output, i)
}
output
}
microbenchmark::microbenchmark(add_to_vector(10000), times = 3)
## Unit: milliseconds
## expr min lq mean median uq
## add_to_vector(10000) 99.94955 100.2007 108.7802 100.4518 113.1956
## max neval
## 125.9393 3
#??microbenchmark
add_to_vector_2 <- function(n) {
output <- vector("integer", n)
for (i in seq_len(n)) {
output[[i]] <- i
}
output
}
microbenchmark::microbenchmark(add_to_vector_2(10000), times = 3)
## Unit: milliseconds
## expr min lq mean median uq
## add_to_vector_2(10000) 9.366771 9.470339 9.562295 9.573906 9.660056
## max neval
## 9.746207 3
# あらかじめ割り当てられたベクトルは約100倍高速です!あなたは異なる答えを得るかもしれませんが、ベクトルが長く、オブジェクトが大きければ大きいほど、事前割り振りは追加よりも優れています。
# 17.3 forループのバリエーション
# forループには4つのバリエーションがあります。
# ● 新たなオブジェクトを作らず、既存オブジェクトを変更する。
# ● 添字ではなく、名前や値についてループする。
# ● 長さが不明な出力を扱う。
# ● 長さのわからないシーケンスを扱う。
# 17.3.1 既存オブジェクトの変更
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
}
df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)
for (i in seq_along(df)) {
df[[i]] <- rescale01(df[[i]])
}
# 17.3.2 ループパターン
# 要素を使ってループ:for (x in xs)
# 名前を使ってループ:(nm in names(xs))
results <- vector("list", length(x))
names(results) <- names(x)
# 数値の添字では、その位置で名前と値が両方取得できるので、最もよく使われます。
for (i in seq_along(x)) {
name <- names(x)[[i]]
value <- x[[i]]
}
# 17.3.3 出力長不明
means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
n <- sample(100, 1)
output <- c(output, rnorm(n, means[[i]]))
}
str(output)
## num [1:111] 0.434 0.102 1.23 1.101 0.842 ...
out <- vector("list", length(means))
for (i in seq_along(means)) {
n <- sample(100, 1)
out[[i]] <- rnorm(n, means[[i]])
}
str(out)
## List of 3
## $ : num [1:67] 0.634 -1.114 -0.636 0.559 -1.457 ...
## $ : num [1:18] 0.307 0.345 0.349 1.89 0.975 ...
## $ : num [1:80] 3.757 0.155 2.605 2.206 0.736 ...
str(unlist(out))
## num [1:165] 0.634 -1.114 -0.636 0.559 -1.457 ...
# より安全なのはpurrr::flatten_dbl()を使うことです。これは、入力がdoubleのリストでないとエラーを投げます。
# ● 長い文字列を生成する場合。繰り返しのたびに前の文字列にpaste()するのではなく、出力を文
# 字ベクトルに保存しておき、paste(output, collapse = "")でベクトルの要素を連結して文字
# 列にする。
# ● 大きなデータフレームを生成する場合。繰り返しのたびにrbind()で逐次作る代わりに、出力を
# リストに保存し、dplyr::bind_rows(output)を使って出力を組み合わせて1つのデータフレー
# ムにまとめる。
# 17.3.4 シーケンス長不明
#while (condition) {
# body
#}
#for (i in seq_along(x)) {
# # body
#}
# 次と等価
i <- 1
while (i <= length(x)) {
# body
i <- i + 1
}
flip <- function() sample(c("T", "H"), 1)
flips <- 0
nheads <- 0
while (nheads < 3) {
if (flip() == "H") {
nheads <- nheads + 1
} else {
nheads <- 0
}
flips <- flips + 1
}
#Flips
# 日本語版誤植? http://r4ds.had.co.nz/iteration.html では小文字で正しい。
flips
## [1] 9
# 練習問題 p284
# 1. 読み込むCSVファイルが格納されているディレクトリがあるとする。パスは、
files <- dir("data/", pattern = "\\.csv$", full.names = TRUE)
# というベクトルで与えられてお
# り、read_csv()で各ファイルを読むとする。それらを読み込んで1つのデータフレームにする
# forループを書きなさい。
df <- vector("list", length(files))
for (fname in seq_along(files)) {
df[[i]] <- read_csv(files[[i]])
}
df <- bind_rows(df)
#?bind_rows
# 2. for (nm in names(x))を使ったが、xに名前がなかったらどうなるか。要素の一部だけに名前
# がある場合はどうなるか。名前が重複していたらどうなるか。
x <- 1:3
print(names(x))
## NULL
#> NULL
for (nm in names(x)) {
print(nm)
print(x[[nm]])
}
# Note that the length of NULL is zero:
length(NULL)
## [1] 0
x <- c(a = 1, 2, c = 3)
names(x)
## [1] "a" "" "c"
# for (nm in names(x)) {
# print(nm)
# print(x[[nm]])
# }
# [1] "a"
# [1] 1
# [1] ""
# x[[nm]] でエラー: 添え字が許される範囲外です
# 3. データフレームの数値の列の平均を名前とともに出力する関数を書きなさい。例えば、show_
# mean(iris)が次のような出力をする。
#show_mean(iris)
#> Sepal.Length: 5.84
#> Sepal.Width: 3.06
#> Petal.Length: 3.76
#> Petal.Width: 1.20
# (追加問題:変数名の長さが変わっても数値がきちんと並ぶようにするため、どんな関数を使っ
# たか。)
show_mean <- function(df, digits = 2) {
# Get max length of any variable in the dataset
maxstr <- max(str_length(names(df)))
for (nm in names(df)) {
if (is.numeric(df[[nm]])) {
cat(str_c(str_pad(str_c(nm, ":"), maxstr + 1L, side = "right"),
format(mean(df[[nm]]), digits = digits, nsmall = digits),
sep = " "),
"\n")
}
}
}
show_mean(iris)
## Sepal.Length: 5.84
## Sepal.Width: 3.06
## Petal.Length: 3.76
## Petal.Width: 1.20
# 4. 次のコードは何をするか。どのような作業をしているか。
mtcars2 <- mtcars
trans <- list(
disp = function(x) x * 0.0163871,
am = function(x) {
factor(x, labels = c("auto", "manual"))
}
)
for (var in names(trans)) {
mtcars2[[var]] <- trans[[var]](mtcars[[var]])
}
glimpse(mtcars)
## Observations: 32
## Variables: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1...
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...
glimpse(mtcars2)
## Observations: 32
## Variables: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp <dbl> 2.621936, 2.621936, 1.769807, 4.227872, 5.899356, 3.68709...
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am <fctr> manual, manual, manual, auto, auto, auto, auto, auto, au...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...
# 代入した変数を編集した方がいい
# factor化してる
# 17.4 forループと関数型
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
# 各列の平均を計算することは、forループを使って書けます。
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[[i]] <- mean(df[[i]])
}
output
## [1] -0.63392925 -0.38588948 0.03254921 -0.14172525
col_mean <- function(df) {
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[i] <- mean(df[[i]])
}
output
}
col_median <- function(df) {
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[i] <- median(df[[i]])
}
output
}
col_sd <- function(df) {
output <- vector("double", length(df))
for (i in seq_along(df)) {
output[i] <- sd(df[[i]])
}
output
}
f1 <- function(x) abs(x - mean(x)) ^ 1
f2 <- function(x) abs(x - mean(x)) ^ 2
f3 <- function(x) abs(x - mean(x)) ^ 3
col_summary <- function(df, fun) {
out <- vector("double", length(df))
for (i in seq_along(df)) {
out[i] <- fun(df[[i]])
}
out
}
col_summary(df, median)
## [1] -0.82168690 -0.69558981 0.05413290 0.05417785
col_summary(df, mean)
## [1] -0.63392925 -0.38588948 0.03254921 -0.14172525
# 練習問題 p287
# 1. apply()のドキュメントを読みなさい。第2の例では、どんなforループ2つを一般化しているか。
# 2. col_summary()を数値列にだけ適用するよう変更しなさい。数値列に対してTRUEとなる論理ベ
# クトルを返す関数is_numeric()を使うとよい。
col_summary <- function(df, fun) {
out <- vector("double", length(df))
for (i in seq_along(df)) {
out[i] <- fun(df[[i]])
}
out
}
# The adapted version is,
col_summary2 <- function(df, fun) {
# test whether each colum is numeric
numeric_cols <- vector("logical", length(df))
for (i in seq_along(df)) {
numeric_cols[[i]] <- is.numeric(df[[i]])
}
# indexes of numeric columns
idxs <- seq_along(df)[numeric_cols]
# number of numeric columns
n <- sum(numeric_cols)
out <- vector("double", n)
for (i in idxs) {
out[i] <- fun(df[[i]])
}
out
}
# numericの発音は?
# magrittr::%$%
# Let’s test that it works,
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = letters[1:10],
d = rnorm(10)
)
col_summary2(df, mean)
## [1] -0.8799454 0.6079939 0.0000000 -0.4561139
# 17.5 マップ関数
# 17.5.1 ショートカット
models <- mtcars %>%
split(.$cyl) %>%
map(function(df) lm(mpg ~ wt, data = df))
models <- mtcars %>%
split(.$cyl) %>%
map(~lm(mpg ~ wt, data = .))
# ※こっちの方がいいかも
models3 <- mtcars %>%
split(.$cyl) %>%
map(~lm(.$mpg ~ .$wt))
models %>%
map(summary) %>%
map_dbl(~.$r.squared)
## 4 6 8
## 0.5086326 0.4645102 0.4229655
x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
x %>% map_dbl(2)
## [1] 2 5 8
# 17.5.2 基本R
x1 <- list(
c(0.27, 0.37, 0.57, 0.91, 0.20),
c(0.90, 0.94, 0.66, 0.63, 0.06),
c(0.21, 0.18, 0.69, 0.38, 0.77)
)
x2 <- list(
c(0.50, 0.72, 0.99, 0.38, 0.78),
c(0.93, 0.21, 0.65, 0.13, 0.27),
c(0.39, 0.01, 0.38, 0.87, 0.34)
)
threshold <- function(x, cutoff = 0.8) x[x > cutoff]
x1 %>% sapply(threshold) %>% str()
## List of 3
## $ : num 0.91
## $ : num [1:2] 0.9 0.94
## $ : num(0)
x2 %>% sapply(threshold) %>% str()
## num [1:3] 0.99 0.93 0.87
# 練習問題 p291
# 17.6 失敗の処理
safe_log <- safely(log)
str(safe_log(10))
## List of 2
## $ result: num 2.3
## $ error : NULL
str(safe_log("a"))
## List of 2
## $ result: NULL
## $ error :List of 2
## ..$ message: chr " 数学関数に数値でない引数が渡されました "
## ..$ call : language log(x = x, base = base)
## ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
x <- list(1, 10, "a")
y <- x %>% map(safely(log))
str(y)
## List of 3
## $ :List of 2
## ..$ result: num 0
## ..$ error : NULL
## $ :List of 2
## ..$ result: num 2.3
## ..$ error : NULL
## $ :List of 2
## ..$ result: NULL
## ..$ error :List of 2
## .. ..$ message: chr " 数学関数に数値でない引数が渡されました "
## .. ..$ call : language log(x = x, base = base)
## .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
y <- y %>% transpose()
str(y)
## List of 2
## $ result:List of 3
## ..$ : num 0
## ..$ : num 2.3
## ..$ : NULL
## $ error :List of 3
## ..$ : NULL
## ..$ : NULL
## ..$ :List of 2
## .. ..$ message: chr " 数学関数に数値でない引数が渡されました "
## .. ..$ call : language log(x = x, base = base)
## .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
is_ok <- y$error %>% map_lgl(is_null)
x[!is_ok]
## [[1]]
## [1] "a"
y$result[is_ok] %>% flatten_dbl()
## [1] 0.000000 2.302585
x <- list(1, 10, "a")
x %>% map_dbl(possibly(log, NA_real_))
## [1] 0.000000 2.302585 NA
x <- list(1, -1)
x %>% map(quietly(log)) %>% str()
## List of 2
## $ :List of 4
## ..$ result : num 0
## ..$ output : chr ""
## ..$ warnings: chr(0)
## ..$ messages: chr(0)
## $ :List of 4
## ..$ result : num NaN
## ..$ output : chr ""
## ..$ warnings: chr " 計算結果が NaN になりました "
## ..$ messages: chr(0)
# bloom
# 山川さんがRStudioでコメントを閉じるのをやってたがどうやるんだろう?
# 「RStudioの折りたたみ機能 - もうカツ丼はいいよな」 http://rion778.hatenablog.com/entry/2015/05/31/175055
# map_dfc(), map_dfr()
# 17.7から次回
# 17.7 複数引数へのマップ
mu <- c(5, 10, -3)
mu %>%
map(rnorm, n = 5) %>%
str()
## List of 3
## $ : num [1:5] 4.61 4.82 4.74 4.98 4.28
## $ : num [1:5] 9.68 8.52 9.15 8.97 10.9
## $ : num [1:5] -1.97 -4.73 -1.78 -3.1 -4
sigma <- list(1, 5, 10)
#sigma <- list(1, 5, 10, 100)
seq_along(mu) %>%
map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
str()
## List of 3
## $ : num [1:5] 3.01 3.9 5.49 3.4 3.26
## $ : num [1:5] 4.32 7.66 9.43 19.18 2.52
## $ : num [1:5] -14.31 -4.96 -20.85 14.86 9.68
map2(mu, sigma, rnorm, n = 5) %>% str()
## List of 3
## $ : num [1:5] 4.61 5.12 5.1 6.49 3.87
## $ : num [1:5] 2.15 10.82 16.23 11.29 17.1
## $ : num [1:5] 0.634 -13.626 6.525 -19.402 -9.738
map2 <- function(x, y, f, ...) {
out <- vector("list", length(x))
for (i in seq_along(x)) {
out[[i]] <- f(x[[i]], y[[i]], ...)
}
out
}
n <- list(1, 3, 5)
args1 <- list(n, mu, sigma)
args1 %>%
pmap(rnorm) %>%
str()
## List of 3
## $ : num 5.44
## $ : num [1:3] 6.62 -3.76 16.26
## $ : num [1:5] 5.31 -8.92 -9.99 -4.43 -4.99
args2 <- list(mean = mu, sd = sigma, n = n)
args2 %>%
pmap(rnorm) %>%
str()
## List of 3
## $ : num 5.55
## $ : num [1:3] 9.04 13.89 3.51
## $ : num [1:5] 14.27 4.815 1.145 0.718 -10.613
params <- tribble(
~mean, ~sd, ~n,
5, 1, 1,
10, 5, 3,
-3, 10, 5
)
params %>%
pmap(rnorm)
## [[1]]
## [1] 7.129727
##
## [[2]]
## [1] 12.173793 8.756864 -1.652082
##
## [[3]]
## [1] -3.2136615 9.2915577 -0.1430352 1.4359142 -3.3225850
# 17.7.1 さまざまな関数を呼び出す
f <- c("runif", "rnorm", "rpois")
param <- list(
list(min = -1, max = 1),
list(sd = 5),
list(lambda = 10)
)
invoke_map(f, param, n = 5) %>% str()
## List of 3
## $ : num [1:5] -0.945 -0.57 0.436 0.601 -0.186
## $ : num [1:5] -7.47 -4.49 1.98 -3.48 3.12
## $ : int [1:5] 7 7 11 11 8
sim <- tribble(
~f, ~params,
"runif", list(min = -1, max = 1),
"rnorm", list(sd = 5),
"rpois", list(lambda = 10)
)
sim %>%
mutate(sim = invoke_map(f, params, n = 10))
## Warning: package 'bindrcpp' was built under R version 3.3.3
## # A tibble: 3 x 3
## f params sim
## <chr> <list> <list>
## 1 runif <list [2]> <dbl [10]>
## 2 rnorm <list [1]> <dbl [10]>
## 3 rpois <list [1]> <int [10]>
# 17.8 ウォーク
x <- list(1, "a", 3)
x %>%
walk(print)
## [1] 1
## [1] "a"
## [1] 3
library(ggplot2)
plots <- mtcars %>%
split(.$cyl) %>%
map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- stringr::str_c(names(plots), ".pdf")
pwalk(list(paths, plots), ggsave, path = tempdir())
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
# 17.9 forループの他のパターン
# 17.9.1 述語関数
iris %>%
keep(is.factor) %>%
str()
## 'data.frame': 150 obs. of 1 variable:
## $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris %>%
discard(is.factor) %>%
str()
## 'data.frame': 150 obs. of 4 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
x <- list(1:5, letters, list(10))
x %>%
some(is_character)
## [1] TRUE
x %>%
every(is_vector)
## [1] TRUE
x <- sample(10)
x
## [1] 1 8 6 10 2 5 9 7 4 3
x %>%
detect(~ . > 5)
## [1] 8
x %>%
detect_index(~ . > 5)
## [1] 2
x %>%
head_while(~ . > 5)
## integer(0)
x %>%
tail_while(~ . > 5)
## integer(0)
# 17.9.2 reduceとaccumulate
dfs <- list(
age = tibble(name = "John", age = 30),
sex = tibble(name = c("John", "Mary"), sex = c("M", "F")),
trt = tibble(name = "Mary", treatment = "A")
)
dfs %>% reduce(full_join)
## Joining, by = "name"
## Joining, by = "name"
## # A tibble: 2 x 4
## name age sex treatment
## <chr> <dbl> <chr> <chr>
## 1 John 30 M <NA>
## 2 Mary NA F A
vs <- list(
c(1, 3, 5, 6, 10),
c(1, 2, 3, 7, 8, 10),
c(1, 2, 3, 4, 8, 9, 10)
)
vs %>% reduce(intersect)
## [1] 1 3 10
x <- sample(10)
x
## [1] 4 10 3 5 6 7 9 1 8 2
x %>% accumulate(`+`)
## [1] 4 14 17 22 28 35 44 45 53 55
1:3 %>% accumulate(`*`)
## [1] 1 2 6
#?accumulate_right
# integer numeric?
typeof(1:3)
## [1] "integer"
class(1:3)
## [1] "integer"
# 練習問題 p300
# 1. forループを使って、every()の自分のバージョンを作りなさい。purrr::every()と比較しな
# さい。purrrのバージョンは、読者の作ったのとは異なる動作を何かしているか。
# Use ... to pass arguments to the function
every2 <- function(.x, .p, ...) {
for (i in .x) {
if (!.p(i, ...)) {
# If any is FALSE we know not all of then were TRUE
return(FALSE)
}
}
# if nothing was FALSE, then it is TRUE
TRUE
}
every2(1:3, function(x) {x > 1})
## [1] FALSE
every2(1:3, function(x) {x > 0})
## [1] TRUE
# 2. col_sum()を修正して、データフレームのすべての数値列に要約関数を適用するよう強化しなさい。
col_sum2 <- function(df, f, ...) {
map(keep(df, is.numeric), f, ...)
}
col_sum2(iris, mean)
## $Sepal.Length
## [1] 5.843333
##
## $Sepal.Width
## [1] 3.057333
##
## $Petal.Length
## [1] 3.758
##
## $Petal.Width
## [1] 1.199333
# 3. col_sum()と等価な基本R実装の試作は次のようになった。
col_sum3 <- function(df, f) {
is_num <- sapply(df, is.numeric)
df_num <- df[, is_num]
sapply(df_num, f)
}
# しかし、これでは次に示すようにバグがある。
df <- tibble(
x = 1:3,
y = 3:1,
z = c("a", "b", "c")
)
# OK
col_sum3(df, mean)
## x y
## 2 2
# 問題あり:数値ベクトルを返すとは限らない
col_sum3(df[1:2], mean)
## x y
## 2 2
col_sum3(df[1], mean)
## x
## 2
#col_sum3(df[0], mean)
# Error: Unsupported index type: list
# 何がバグの原因か。
# The problem is that sapply does not always return numeric vectors.
# If no columns are selected, instead of returning an empty numeric vector,
# it returns an empty list. This causes an error since we can’t use a list with [.
# 問題は、sapplyは常に数値ベクトルを返すとは限らないことです。
# 列が選択されていない場合、空の数値ベクトルを返すのではなく、空のリストを返します。
# [でリストを使うことができないので、エラーを引き起こします。
sapply(df[0], is.numeric)
## named list()
sapply(df[1], is.numeric)
## x
## TRUE
sapply(df[1:2], is.numeric)
## x y
## TRUE TRUE
# if (ncol(df) == 0)...
# purrrチートシートを見るといい
# 「Cheatsheets – RStudio」 https://www.rstudio.com/resources/cheatsheets/
# ~p301