문제

여러 그룹에 대한 데이터를 묶은 데이터프레임에서 조건에 맞는 그룹에 mutate를 사용하는 방법.

mtcars 데이터를 cyl에 따라 그룹으로 묶고, cyl=4 혹은 cyl=8인 그룹의 disp에 100을 곱하는(\(\text{disp} \times 100\)) 방법에 대해 생각해보자.

suppressPackageStartupMessages(library(tidyverse))
# Build the working tibble: cyl as a factor, keeping only cyl, disp and qsec.
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
print(dat, n = 10)
## # A tibble: 32 x 3
##       cyl  disp  qsec
##    <fctr> <dbl> <dbl>
##  1      6 160.0 16.46
##  2      6 160.0 17.02
##  3      4 108.0 18.61
##  4      6 258.0 19.44
##  5      8 360.0 17.02
##  6      6 225.0 20.22
##  7      8 360.0 15.84
##  8      4 146.7 20.00
##  9      4 140.8 22.90
## 10      6 167.6 18.30
## # ... with 22 more rows

1. 직접 데이터 리스트를 수정

suppressPackageStartupMessages(library(magrittr))
# Multiply disp by 100 in place for the rows whose cyl is 4 or 8, using
# magrittr's compound assignment pipe.
dat$disp[dat$cyl%in%c(4,8)]%<>%multiply_by(100)
# which is equivalent to dat$disp[dat$cyl%in%c(4,8)]<-dat$disp[dat$cyl%in%c(4,8)]*100
head(dat)
## # A tibble: 6 x 3
##      cyl  disp  qsec
##   <fctr> <dbl> <dbl>
## 1      6   160 16.46
## 2      6   160 17.02
## 3      4 10800 18.61
## 4      6   258 19.44
## 5      8 36000 17.02
## 6      6   225 20.22

2. 데이터 프레임을 분리해서 계산한 다음에 다시 합침

# Reset dat, split off the rows to scale (cyl 4 or 8), multiply their disp by
# 100, and stack them back underneath the untouched rows.
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
dat_4_8 <- dat %>%
  filter(cyl %in% c(4, 8)) %>%
  mutate(disp = disp * 100)
dat <- bind_rows(dat %>% filter(!cyl %in% c(4, 8)), dat_4_8)
print(dat, n = 10) # the order is different, but this is not a big deal.
## # A tibble: 32 x 3
##       cyl    disp  qsec
##    <fctr>   <dbl> <dbl>
##  1      6   160.0 16.46
##  2      6   160.0 17.02
##  3      6   258.0 19.44
##  4      6   225.0 20.22
##  5      6   167.6 18.30
##  6      6   167.6 18.90
##  7      6   145.0 15.50
##  8      4 10800.0 18.61
##  9      8 36000.0 17.02
## 10      8 36000.0 15.84
## # ... with 22 more rows

3. purrr이용

# Reset dat, then visit each (cyl, disp) pair with pmap_dbl() and multiply
# disp by 100 when cyl is "4" or "8". cyl is converted to character first, so
# the resulting column is <chr> rather than a factor.
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
dat <- dat %>%
  mutate(cyl = as.character(cyl)) %>%
  mutate(disp = purrr::pmap_dbl(
    list(cyl, disp),
    function(cyl, disp) {
      if (cyl %in% c("4", "8")) disp * 100 else disp
    }
  ))

head(dat, 10)
## # A tibble: 10 x 3
##      cyl    disp  qsec
##    <chr>   <dbl> <dbl>
##  1     6   160.0 16.46
##  2     6   160.0 17.02
##  3     4 10800.0 18.61
##  4     6   258.0 19.44
##  5     8 36000.0 17.02
##  6     6   225.0 20.22
##  7     8 36000.0 15.84
##  8     4 14670.0 20.00
##  9     4 14080.0 22.90
## 10     6   167.6 18.30

4. ifelse 이용

사실 간단한 경우가 아니면 ifelse를 이용하는 것은 한계가 있어 보인다.

# Reset dat, then scale disp with a vectorised ifelse() inside mutate():
# rows with cyl 4 or 8 get disp * 100, all other rows keep disp.
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
dat %>%
  mutate(disp = ifelse(cyl %in% c(4, 8), disp * 100, disp)) %>%
  head(10)
## # A tibble: 10 x 3
##       cyl    disp  qsec
##    <fctr>   <dbl> <dbl>
##  1      6   160.0 16.46
##  2      6   160.0 17.02
##  3      4 10800.0 18.61
##  4      6   258.0 19.44
##  5      8 36000.0 17.02
##  6      6   225.0 20.22
##  7      8 36000.0 15.84
##  8      4 14670.0 20.00
##  9      4 14080.0 22.90
## 10      6   167.6 18.30

5. Group_by?

# Reset dat, group on a temporary logical column abc (TRUE for cyl 4 or 8),
# scale disp by 100 in the TRUE rows with if_else(), then drop the grouping
# and the helper column again.
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)

dat %>%
  group_by(abc = cyl %in% c(4, 8)) %>%
  mutate(disp = if_else(abc, disp * 100, disp)) %>%
  ungroup() %>%
  select(-abc) %>%
  head(10)
## # A tibble: 10 x 3
##       cyl    disp  qsec
##    <fctr>   <dbl> <dbl>
##  1      6   160.0 16.46
##  2      6   160.0 17.02
##  3      4 10800.0 18.61
##  4      6   258.0 19.44
##  5      8 36000.0 17.02
##  6      6   225.0 20.22
##  7      8 36000.0 15.84
##  8      4 14670.0 20.00
##  9      4 14080.0 22.90
## 10      6   167.6 18.30
# Reset dat, flag rows with cyl 4 or 8 as abc, then scale arithmetically: the
# logical abc is treated as 1/0, so only one of the two terms is non-zero per
# row. The factor is 100 (it was 10), as required by the task statement.
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
dat %>%
  group_by(abc = cyl %in% c(4, 8)) %>%
  mutate(disp = disp * 100 * abc + disp * (!abc)) %>%
  ungroup() %>%
  select(-abc) %>%
  head(10)
## # A tibble: 10 x 3
##       cyl    disp  qsec
##    <fctr>   <dbl> <dbl>
##  1      6   160.0 16.46
##  2      6   160.0 17.02
##  3      4 10800.0 18.61
##  4      6   258.0 19.44
##  5      8 36000.0 17.02
##  6      6   225.0 20.22
##  7      8 36000.0 15.84
##  8      4 14670.0 20.00
##  9      4 14080.0 22.90
## 10      6   167.6 18.30

7. case_when

# Reset dat, then scale disp with case_when(): rows with cyl 4 or 8 get
# disp * 100, and the TRUE fallback leaves every other row unchanged.
# The factor is 100 (it was 10), as required by the task statement.
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)

dat %>%
  mutate(disp = case_when(
    cyl %in% c(4, 8) ~ disp * 100,
    TRUE ~ disp
  )) %>%
  head(10)
## # A tibble: 10 x 3
##       cyl    disp  qsec
##    <fctr>   <dbl> <dbl>
##  1      6   160.0 16.46
##  2      6   160.0 17.02
##  3      4 10800.0 18.61
##  4      6   258.0 19.44
##  5      8 36000.0 17.02
##  6      6   225.0 20.22
##  7      8 36000.0 15.84
##  8      4 14670.0 20.00
##  9      4 14080.0 22.90
## 10      6   167.6 18.30
벤치마크

첫 번째 방법이 제일 빠르긴 하지만, 변수 하나가 아니라 여러 변수에 걸쳐 작업할 경우 코드를 작성하는 것이 굉장히 번거로울 수 있다.

library(microbenchmark)
# One shared input tibble for the benchmark, bound to a separate name per
# method (dat1 ... dat7).
# as_tibble() is used instead of as_data_frame(), which tibble has deprecated.
dat1 <- mtcars %>%
  as_tibble() %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
dat7 <- dat6 <- dat5 <- dat4 <- dat3 <- dat2 <- dat1

# list
# Method 1: overwrite the matching elements of disp directly.
# Takes a data frame with cyl and disp columns and returns it with disp
# multiplied by 100 wherever cyl is 4 or 8; other rows are left as they are.
f1 <- function(dat1) {
  hit <- dat1$cyl %in% c(4, 8)
  dat1$disp[hit] <- dat1$disp[hit] * 100
  dat1
}

# bind rows
# Method 2: split, transform, and re-stack.
# Rows with cyl 4 or 8 get disp * 100 and are appended after the remaining
# rows, so the returned row order differs from the input order.
f2 <- function(dat2) {
  scaled <- dat2 %>%
    filter(cyl %in% c(4, 8)) %>%
    mutate(disp = disp * 100)
  untouched <- dat2 %>% filter(!cyl %in% c(4, 8))
  bind_rows(untouched, scaled)
}

# purrr
# Method 3: element-wise mapping with purrr.
# cyl is converted to character, then pmap_dbl() walks the (cyl, disp) pairs
# and returns disp * 100 for cyl "4" or "8" and disp otherwise. The returned
# data keeps cyl as a character column.
f3 <- function(dat3) {
  scale_one <- function(cyl, disp) {
    if (cyl %in% c("4", "8")) disp * 100 else disp
  }
  dat3 %>%
    mutate(cyl = as.character(cyl)) %>%
    mutate(disp = purrr::pmap_dbl(list(cyl, disp), scale_one))
}

# ifelse
# Method 4: vectorised base ifelse() inside mutate().
# Returns dat4 with disp multiplied by 100 where cyl is 4 or 8.
f4 <- function(dat4) {
  mutate(dat4, disp = ifelse(cyl %in% c(4, 8), disp * 100, disp))
}

# Method 5: group on a temporary logical flag and use dplyr::if_else().
# abc is TRUE for cyl 4 or 8; it is removed again (after ungrouping) before
# the result is returned.
f5 <- function(dat5) {
  flagged <- group_by(dat5, abc = cyl %in% c(4, 8))
  scaled <- mutate(flagged, disp = if_else(abc, disp * 100, disp))
  select(ungroup(scaled), -abc)
}

# Method 6: arithmetic on a logical mask.
# abc is TRUE for cyl 4 or 8 and is treated as 1/0 in the arithmetic, so each
# row keeps exactly one of the two terms: disp * 100 when abc is TRUE, plain
# disp otherwise. The helper column is dropped before returning.
# The factor is 100 (it was 10) so that this function produces the same
# values as f1-f5 and the benchmark compares equivalent work.
f6 <- function(dat6) {
  dat6 %>%
    mutate(abc = cyl %in% c(4, 8)) %>%
    mutate(disp = disp * 100 * abc + disp * (!abc)) %>%
    select(-abc)
}

# Method 7: dplyr::case_when() inside mutate().
# Rows with cyl 4 or 8 get disp * 100; the TRUE fallback keeps disp for all
# other rows.
# The factor is 100 (it was 10) so that this function produces the same
# values as f1-f5, and the pipeline is the last expression so the result is
# returned visibly instead of as the invisible value of an assignment.
f7 <- function(dat7) {
  dat7 %>%
    mutate(disp = case_when(
      cyl %in% c(4, 8) ~ disp * 100,
      TRUE ~ disp
    ))
}

# Time every method on its own input and plot the timing distributions.
bench <- microbenchmark::microbenchmark(
  f1(dat1),
  f2(dat2),
  f3(dat3),
  f4(dat4),
  f5(dat5),
  f6(dat6),
  f7(dat7)
)

ggplot2::autoplot(bench)