purrr

0.1 purrr package

purrr 패키지는 R에서 깔끔하게 반복 작업 처리하는 패키지이다.

0.1.1 packages

library(tidyverse)  
library(tibble)

# pivot_longer(), pivot_wider() : 데이터형태-롱(gather),와이드(spread)

0.1.2 apply(), map() 비교

a <- c(1, 2, 3, 4, 5)

for(i in 1:5){
  print(a[i]+1)
}

## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6

lapply(a, function(x){x+1})   # a는 1:5 함수에 입력, 입력+1 결과가 list 형식으로 출력

## [[1]]
## [1] 2
## 
## [[2]]
## [1] 3
## 
## [[3]]
## [1] 4
## 
## [[4]]
## [1] 5
## 
## [[5]]
## [1] 6

map(a, function(x){x+1})      # lapply(a, function(x){x+1}) 동일함   # 기본 리스트로 출력

## [[1]]
## [1] 2
## 
## [[2]]
## [1] 3
## 
## [[3]]
## [1] 4
## 
## [[4]]
## [1] 5
## 
## [[5]]
## [1] 6

sapply(a, function(x){x+1})

## [1] 2 3 4 5 6

map_dbl(a, function(x){x+1})

## [1] 2 3 4 5 6

map_dbl(a, ~.x+1)

## [1] 2 3 4 5 6

# function(x){x+1} 은  ~.x+1  
# purrr에서는 '여기서부터 함수야'하고 알려주려는 목적으로  ~ 기호 사용.  x는  .x 가 됨

0.1.3 map_(), map2_(), pmap_*()

map_() : 변수 하나만 가지고 계산 map2_() : 작업 대상이 변수 두 개일 때 사용 pmap_*() : 자료가 세 개 이상일 때 사용

a <- c(1:5)
b <- c(1, 22, 333, 4444, 55555)
c <- c('abc', 'def', 'ghi')
d <- c(5, 4, 3, 2, 1)

map_int(b, str_length)

## [1] 1 2 3 4 5

map_lgl(b, is.numeric)

## [1] TRUE TRUE TRUE TRUE TRUE

map_chr(c, ~ paste0(.x, 'z'))

## [1] "abcz" "defz" "ghiz"

map2(a, d, sum)         # 리스트로 출력

## [[1]]
## [1] 6
## 
## [[2]]
## [1] 6
## 
## [[3]]
## [1] 6
## 
## [[4]]
## [1] 6
## 
## [[5]]
## [1] 6

map2_dbl(a, d, sum)     # 벡터 형태로 나타내고 싶으시면

## [1] 6 6 6 6 6

map2_dbl(a, d, ~.x+.y)  # 함수로 계산 function(x, y){x+y}. 벡터 형태 출력

## [1] 6 6 6 6 6

pmap_dbl(list(a, b, d), ~..2-..1+..3)   # 자리별로 b-a+d 를 계산하는 코드

## [1]     5    24   333  4442 55551

0.1.4 데이터를 반복적으로 골라낼 때

e <- list(
  list(-1, x=1, y=c(1), z='a'),
  list(-2, x=4, y=c(2, 3), z='b'),
  list(-3, x=9, y=c(4, 5, 6)))
e

## [[1]]
## [[1]][[1]]
## [1] -1
## 
## [[1]]$x
## [1] 1
## 
## [[1]]$y
## [1] 1
## 
## [[1]]$z
## [1] "a"
## 
## 
## [[2]]
## [[2]][[1]]
## [1] -2
## 
## [[2]]$x
## [1] 4
## 
## [[2]]$y
## [1] 2 3
## 
## [[2]]$z
## [1] "b"
## 
## 
## [[3]]
## [[3]][[1]]
## [1] -3
## 
## [[3]]$x
## [1] 9
## 
## [[3]]$y
## [1] 4 5 6

map_dbl(e, 1)     # 첫번재 요소 출력

## [1] -1 -2 -3

map_dbl(e, 2)     # 두번째 요소 출력

## [1] 1 4 9

map_dbl(e, 'x')   # 'x'라는 이름 붙은 요소 출력

## [1] 1 4 9

map_dbl(e, 'y')   # 'y'라는 데이터가 붙어 있지만 데이터 개수가 달라서 벡터로 출력하려고 하면 에러남. 이 때는 map()을 써서 리스트로 받아야 합니다.

## Error: Result 2 must be a single double, not a double vector of length 2

map(e, 'y')

## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2 3
## 
## [[3]]
## [1] 4 5 6

map(e, list('y', 1))  # y 가운데 첫 번째 값(1, 2, 4)만 뽑아 내고 싶을 때

## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 4

map_chr(e, 'z')

## Error: Result 3 must be a single string, not NULL of length 0

map_chr(e, 'z', .default=NA)

## [1] "a" "b" NA

0.1.5 map_df(), modify()

f <- tibble(a=c(17, 23, 4, 10, 11), 
            b=c(24, 5, 6, 12, 18), 
            c=c(1, 7, 13, 19, 25), 
            d=c(8, 14, 20, 21, 2), 
            e=c(15, 16, 22, 3, 9))
f

## # A tibble: 5 x 5
##       a     b     c     d     e
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    17    24     1     8    15
## 2    23     5     7    14    16
## 3     4     6    13    20    22
## 4    10    12    19    21     3
## 5    11    18    25     2     9

f %>% mutate(sum=a+b+c+d+e)

## # A tibble: 5 x 6
##       a     b     c     d     e   sum
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    17    24     1     8    15    65
## 2    23     5     7    14    16    65
## 3     4     6    13    20    22    65
## 4    10    12    19    21     3    65
## 5    11    18    25     2     9    65

f %>%
  rowwise() %>%      # rowwise()는 Group input by rows
  mutate(max=max(a, b, c, d, e))

## Source: local data frame [5 x 6]
## Groups: <by row>
## 
## # A tibble: 5 x 6
##       a     b     c     d     e   max
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    17    24     1     8    15    24
## 2    23     5     7    14    16    23
## 3     4     6    13    20    22    22
## 4    10    12    19    21     3    21
## 5    11    18    25     2     9    25

map(f, sum)

## $a
## [1] 65
## 
## $b
## [1] 65
## 
## $c
## [1] 65
## 
## $d
## [1] 65
## 
## $e
## [1] 65

map_df(f, sum)     # 데이터 프레임(Data Frame) 형태

## # A tibble: 1 x 5
##       a     b     c     d     e
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    65    65    65    65    65

map_df(f, ~.x+1)   # +1 함수 계산

## # A tibble: 5 x 5
##       a     b     c     d     e
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    18    25     2     9    16
## 2    24     6     8    15    17
## 3     5     7    14    21    23
## 4    11    13    20    22     4
## 5    12    19    26     3    10

modify(f, ~.x+1)   # map() 함수 사촌 가운데 modify()는 원래 자료형 그대로 결과 출력

## # A tibble: 5 x 5
##       a     b     c     d     e
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    18    25     2     9    16
## 2    24     6     8    15    17
## 3     5     7    14    21    23
## 4    11    13    20    22     4
## 5    12    19    26     3    10

0.1.6 실제 사례

kbo <- read.csv('kbo.csv') %>% as_tibble()   # tbl_df(kbo) -> kbo
str(kbo)

## Classes 'tbl_df', 'tbl' and 'data.frame':    303 obs. of  8 variables:
##  $ 연도      : int  1982 1982 1982 1982 1982 1982 1983 1983 1983 1983 ...
##  $ X10년대   : Factor w/ 4 levels "a","b","c","d": 1 1 1 1 1 1 1 1 1 1 ...
##  $ 구단      : Factor w/ 24 levels "KIA","kt","KT",..: 5 15 7 21 11 13 15 21 5 7 ...
##  $ 타석당득점: num  0.137 0.141 0.129 0.125 0.115 0.102 0.116 0.113 0.109 0.111 ...
##  $ 타율      : num  0.282 0.266 0.283 0.261 0.256 0.24 0.263 0.267 0.256 0.259 ...
##  $ 출루율    : num  0.354 0.347 0.349 0.328 0.344 0.304 0.333 0.332 0.323 0.324 ...
##  $ 장타력    : num  0.41 0.392 0.412 0.408 0.373 0.345 0.393 0.385 0.354 0.362 ...
##  $ OPS       : num  0.764 0.739 0.762 0.736 0.717 0.648 0.726 0.717 0.677 0.687 ...

lm(타석당득점 ~ 타율, kbo) %>% summary() %>% 
  .$r.squared

## [1] 0.7757327

lm(타석당득점 ~ 출루율, kbo) %>% summary() %>% 
  .$r.squared

## [1] 0.7698036

lm(타석당득점 ~ 장타력, kbo) %>% summary() %>% 
  .$r.squared

## [1] 0.8762027

lm(타석당득점 ~ OPS, kbo) %>% summary() %>% 
  .$r.squared

## [1] 0.9158364

kbo %>% 
  select(타율, 출루율, 장타력, OPS) %>%
  map(~lm(kbo$타석당득점~.x))

## $타율
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Coefficients:
## (Intercept)           .x  
##     -0.1259       0.9222  
## 
## 
## $출루율
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Coefficients:
## (Intercept)           .x  
##     -0.1647       0.8328  
## 
## 
## $장타력
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Coefficients:
## (Intercept)           .x  
##    -0.03826      0.39921  
## 
## 
## $OPS
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Coefficients:
## (Intercept)           .x  
##    -0.09679      0.29370

kbo %>% 
  select(타율, 출루율, 장타력, OPS) %>%
  map(~lm(kbo$타석당득점~.x)) %>%
  map(summary)

## $타율
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0257544 -0.0053930  0.0005455  0.0042343  0.0216233 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.125931   0.007615  -16.54   <2e-16 ***
## .x           0.922209   0.028581   32.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007795 on 301 degrees of freedom
## Multiple R-squared:  0.7757, Adjusted R-squared:  0.775 
## F-statistic:  1041 on 1 and 301 DF,  p-value: < 2.2e-16
## 
## 
## $출루율
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0249185 -0.0052528 -0.0002377  0.0047366  0.0232427 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.164745   0.008966  -18.38   <2e-16 ***
## .x           0.832831   0.026250   31.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007897 on 301 degrees of freedom
## Multiple R-squared:  0.7698, Adjusted R-squared:  0.769 
## F-statistic:  1007 on 1 and 301 DF,  p-value: < 2.2e-16
## 
## 
## $장타력
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0136000 -0.0043552 -0.0002445  0.0037635  0.0227674 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.038256   0.003431  -11.15   <2e-16 ***
## .x           0.399205   0.008649   46.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.005791 on 301 degrees of freedom
## Multiple R-squared:  0.8762, Adjusted R-squared:  0.8758 
## F-statistic:  2130 on 1 and 301 DF,  p-value: < 2.2e-16
## 
## 
## $OPS
## 
## Call:
## lm(formula = kbo$타석당득점 ~ .x)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.010735 -0.003197 -0.000217  0.003359  0.020748 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.096792   0.003787  -25.56   <2e-16 ***
## .x           0.293700   0.005132   57.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.004775 on 301 degrees of freedom
## Multiple R-squared:  0.9158, Adjusted R-squared:  0.9156 
## F-statistic:  3275 on 1 and 301 DF,  p-value: < 2.2e-16

kbo %>% 
  select(타율, 출루율, 장타력, OPS) %>%
  map(~lm(kbo$타석당득점~.x)) %>%
  map(summary) %>%
  map('r.squared')               #  R²만 뽑아냄

## $타율
## [1] 0.7757327
## 
## $출루율
## [1] 0.7698036
## 
## $장타력
## [1] 0.8762027
## 
## $OPS
## [1] 0.9158364

kbo %>% 
  select(타율, 출루율, 장타력, OPS) %>%
  map(~lm(kbo$타석당득점~.x)) %>%
  map(summary) %>%
  map_df('r.squared')       # 출력 형태 데이터 프레임

## # A tibble: 1 x 4
##    타율 출루율 장타력   OPS
##   <dbl>  <dbl>  <dbl> <dbl>
## 1 0.776  0.770  0.876 0.916

# 컴퓨터는 롱 폼 처리에 더 익숙
# tidyverse 생태계에는 이 두 가지 폼 사이를 오갈 수 있도록 tidyr 패키지
# pivot_longer(),  cf) pivot_wider()
# pivot_longer() 함수는  tidyr를 업데이트하면서 따라 새로 생긴 함수. 옛날 방식으로 gather() 함수를 사용  

kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값')  # 타율 ops 를 롱폼으로 기록과, 값 컬럼으로 변형

## # A tibble: 1,212 x 6
##     연도 X10년대 구단  타석당득점 기록      값
##    <int> <fct>   <fct>      <dbl> <chr>  <dbl>
##  1  1982 a       MBC        0.137 타율   0.282
##  2  1982 a       MBC        0.137 출루율 0.354
##  3  1982 a       MBC        0.137 장타력 0.41 
##  4  1982 a       MBC        0.137 OPS    0.764
##  5  1982 a       삼성       0.141 타율   0.266
##  6  1982 a       삼성       0.141 출루율 0.347
##  7  1982 a       삼성       0.141 장타력 0.392
##  8  1982 a       삼성       0.141 OPS    0.739
##  9  1982 a       OB         0.129 타율   0.283
## 10  1982 a       OB         0.129 출루율 0.349
## # … with 1,202 more rows

kbo %>%
  gather(key=기록, value=값, 타율:OPS)  # gather() 함수를 사용

## # A tibble: 1,212 x 6
##     연도 X10년대 구단  타석당득점 기록     값
##    <int> <fct>   <fct>      <dbl> <chr> <dbl>
##  1  1982 a       MBC        0.137 타율  0.282
##  2  1982 a       삼성       0.141 타율  0.266
##  3  1982 a       OB         0.129 타율  0.283
##  4  1982 a       해태       0.125 타율  0.261
##  5  1982 a       롯데       0.115 타율  0.256
##  6  1982 a       삼미       0.102 타율  0.24 
##  7  1983 a       삼성       0.116 타율  0.263
##  8  1983 a       해태       0.113 타율  0.267
##  9  1983 a       MBC        0.109 타율  0.256
## 10  1983 a       OB         0.111 타율  0.259
## # … with 1,202 more rows

kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값') %>%
  group_by(기록) %>%
  nest()                     # nest() 함수를 써서 이 내용을 중첩. 데이터가 모두 어디론가 숨어버림

## # A tibble: 4 x 2
## # Groups:   기록 [4]
##   기록             data
##   <chr>  <list<df[,5]>>
## 1 타율        [303 × 5]
## 2 출루율      [303 × 5]
## 3 장타력      [303 × 5]
## 4 OPS         [303 × 5]

kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값') %>%
  group_by(기록) %>%
  nest() -> kbo2

kbo2$data            # 데이터 프레임을 중첩하면서 생긴 data 열 내용은 열어 보면 됨

## <list_of<
##   tbl_df<
##     연도      : integer
##     X10년대   : factor<10564>
##     구단      : factor<3ff7b>
##     타석당득점: double
##     값        : double
##   >
## >[4]>
## [[1]]
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.282
##  2  1982 a       삼성       0.141 0.266
##  3  1982 a       OB         0.129 0.283
##  4  1982 a       해태       0.125 0.261
##  5  1982 a       롯데       0.115 0.256
##  6  1982 a       삼미       0.102 0.24 
##  7  1983 a       삼성       0.116 0.263
##  8  1983 a       해태       0.113 0.267
##  9  1983 a       MBC        0.109 0.256
## 10  1983 a       OB         0.111 0.259
## # … with 293 more rows
## 
## [[2]]
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.354
##  2  1982 a       삼성       0.141 0.347
##  3  1982 a       OB         0.129 0.349
##  4  1982 a       해태       0.125 0.328
##  5  1982 a       롯데       0.115 0.344
##  6  1982 a       삼미       0.102 0.304
##  7  1983 a       삼성       0.116 0.333
##  8  1983 a       해태       0.113 0.332
##  9  1983 a       MBC        0.109 0.323
## 10  1983 a       OB         0.111 0.324
## # … with 293 more rows
## 
## [[3]]
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.41 
##  2  1982 a       삼성       0.141 0.392
##  3  1982 a       OB         0.129 0.412
##  4  1982 a       해태       0.125 0.408
##  5  1982 a       롯데       0.115 0.373
##  6  1982 a       삼미       0.102 0.345
##  7  1983 a       삼성       0.116 0.393
##  8  1983 a       해태       0.113 0.385
##  9  1983 a       MBC        0.109 0.354
## 10  1983 a       OB         0.111 0.362
## # … with 293 more rows
## 
## [[4]]
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.764
##  2  1982 a       삼성       0.141 0.739
##  3  1982 a       OB         0.129 0.762
##  4  1982 a       해태       0.125 0.736
##  5  1982 a       롯데       0.115 0.717
##  6  1982 a       삼미       0.102 0.648
##  7  1983 a       삼성       0.116 0.726
##  8  1983 a       해태       0.113 0.717
##  9  1983 a       MBC        0.109 0.677
## 10  1983 a       OB         0.111 0.687
## # … with 293 more rows

kbo2$data %>%
  set_names(., kbo2$기록)

## <list_of<
##   tbl_df<
##     연도      : integer
##     X10년대   : factor<10564>
##     구단      : factor<3ff7b>
##     타석당득점: double
##     값        : double
##   >
## >[4]>
## $타율
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.282
##  2  1982 a       삼성       0.141 0.266
##  3  1982 a       OB         0.129 0.283
##  4  1982 a       해태       0.125 0.261
##  5  1982 a       롯데       0.115 0.256
##  6  1982 a       삼미       0.102 0.24 
##  7  1983 a       삼성       0.116 0.263
##  8  1983 a       해태       0.113 0.267
##  9  1983 a       MBC        0.109 0.256
## 10  1983 a       OB         0.111 0.259
## # … with 293 more rows
## 
## $출루율
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.354
##  2  1982 a       삼성       0.141 0.347
##  3  1982 a       OB         0.129 0.349
##  4  1982 a       해태       0.125 0.328
##  5  1982 a       롯데       0.115 0.344
##  6  1982 a       삼미       0.102 0.304
##  7  1983 a       삼성       0.116 0.333
##  8  1983 a       해태       0.113 0.332
##  9  1983 a       MBC        0.109 0.323
## 10  1983 a       OB         0.111 0.324
## # … with 293 more rows
## 
## $장타력
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.41 
##  2  1982 a       삼성       0.141 0.392
##  3  1982 a       OB         0.129 0.412
##  4  1982 a       해태       0.125 0.408
##  5  1982 a       롯데       0.115 0.373
##  6  1982 a       삼미       0.102 0.345
##  7  1983 a       삼성       0.116 0.393
##  8  1983 a       해태       0.113 0.385
##  9  1983 a       MBC        0.109 0.354
## 10  1983 a       OB         0.111 0.362
## # … with 293 more rows
## 
## $OPS
## # A tibble: 303 x 5
##     연도 X10년대 구단  타석당득점    값
##    <int> <fct>   <fct>      <dbl> <dbl>
##  1  1982 a       MBC        0.137 0.764
##  2  1982 a       삼성       0.141 0.739
##  3  1982 a       OB         0.129 0.762
##  4  1982 a       해태       0.125 0.736
##  5  1982 a       롯데       0.115 0.717
##  6  1982 a       삼미       0.102 0.648
##  7  1983 a       삼성       0.116 0.726
##  8  1983 a       해태       0.113 0.717
##  9  1983 a       MBC        0.109 0.677
## 10  1983 a       OB         0.111 0.687
## # … with 293 more rows

kbo2$data %>%
  set_names(., kbo2$기록) %>%
  map_df(~lm(타석당득점~값, data=.) %>%     # map_df()를 써서 반복 작업을 처리
           summary() %>%
           .$r.squared)

## # A tibble: 1 x 4
##    타율 출루율 장타력   OPS
##   <dbl>  <dbl>  <dbl> <dbl>
## 1 0.776  0.770  0.876 0.916

kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값') %>%
  group_by(기록) %>%
  nest() %>%
  mutate(model=map(data, ~lm(타석당득점~값, data=.) %>% 
                     summary() %>% 
                     .$r.squared)) %>%
  unnest(model)

## # A tibble: 4 x 3
## # Groups:   기록 [4]
##   기록             data model
##   <chr>  <list<df[,5]>> <dbl>
## 1 타율        [303 × 5] 0.776
## 2 출루율      [303 × 5] 0.770
## 3 장타력      [303 × 5] 0.876
## 4 OPS         [303 × 5] 0.916

kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값') %>%
  group_by(기록) %>%
  nest() %>%
  mutate(model=map(data, ~lm(타석당득점~값, data=.) %>% 
                     summary() %>% 
                     .$r.squared)) %>%
  unnest(model) %>%
  select(-data)

## # A tibble: 4 x 2
## # Groups:   기록 [4]
##   기록   model
##   <chr>  <dbl>
## 1 타율   0.776
## 2 출루율 0.770
## 3 장타력 0.876
## 4 OPS    0.916

kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값') %>%
  group_by(X10년대, 기록) %>%
  nest() %>%
  mutate(model=map(data, ~lm(타석당득점~값, data=.) %>% 
                     summary() %>% 
                     .$r.squared)) %>%
  unnest(model) %>%
  select(-data)

## # A tibble: 16 x 3
## # Groups:   X10년대, 기록 [16]
##    X10년대 기록   model
##    <fct>   <chr>  <dbl>
##  1 a       타율   0.673
##  2 a       출루율 0.649
##  3 a       장타력 0.797
##  4 a       OPS    0.829
##  5 b       타율   0.828
##  6 b       출루율 0.831
##  7 b       장타력 0.883
##  8 b       OPS    0.929
##  9 c       타율   0.626
## 10 c       출루율 0.680
## 11 c       장타력 0.800
## 12 c       OPS    0.872
## 13 d       타율   0.815
## 14 d       출루율 0.722
## 15 d       장타력 0.896
## 16 d       OPS    0.936

# pivot_wider() 와이드 폼으로 바꾸려면 맨 아래 를 추가하면 됩니다.
kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값') %>%
  group_by(X10년대, 기록) %>%
  nest() %>%
  mutate(model=map(data, ~lm(타석당득점~값, data=.) %>% 
                     summary() %>% 
                     .$r.squared)) %>%
  unnest(model) %>%
  select(-data) %>%
  pivot_wider(names_from=기록, values_from=model)     # 동일한 결과  spread(key=기록, value=model)

## # A tibble: 4 x 5
## # Groups:   X10년대 [4]
##   X10년대  타율 출루율 장타력   OPS
##   <fct>   <dbl>  <dbl>  <dbl> <dbl>
## 1 a       0.673  0.649  0.797 0.829
## 2 b       0.828  0.831  0.883 0.929
## 3 c       0.626  0.680  0.800 0.872
## 4 d       0.815  0.722  0.896 0.936

# 그래프 추가
kbo %>% 
  pivot_longer(cols=타율:OPS, names_to='기록', values_to='값') %>%
  group_by(X10년대, 기록) %>%
  nest() %>%
  mutate(model=map(data, ~lm(타석당득점~값, data=.) %>% 
                     summary() %>% 
                     .$r.squared)) %>%
  unnest(model) %>%
  select(-data) %>%
  ggplot(aes(X10년대, model, fill=기록)) +
  geom_bar(stat='identity', position=position_dodge2(reverse=TRUE)) +
  scale_fill_viridis_d()

purrr

updragon

2020 3 3

0.1 purrr package

0.1.1 packages

0.1.2 apply(), map() 비교

0.1.3 map_(), map2_(), pmap_*()

0.1.4 데이터를 반복적으로 골라낼 때

0.1.5 map_df(), modify()

0.1.6 실제 사례