문제
여러 그룹에 대한 데이터를 묶은 데이터프레임에서 조건에 맞는 그룹에 mutate를 사용하는 방법.
mtcars 데이터를 cyl에 따라 그룹을 묶고, cyl=4 혹은 cyl=8인 그룹의 disp를 \(\times 100\) 하는 방법에 대해 생각해보자
# Load the tidyverse quietly, then build the working tibble: mtcars reduced to
# cyl (converted to a factor), disp and qsec.
suppressPackageStartupMessages(library(tidyverse))
# as_tibble() is used instead of as_data_frame(), which is deprecated since
# tibble 2.0.0.
dat <- as_tibble(mtcars) %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
print(dat, n = 10)
## # A tibble: 32 x 3
## cyl disp qsec
## <fctr> <dbl> <dbl>
## 1 6 160.0 16.46
## 2 6 160.0 17.02
## 3 4 108.0 18.61
## 4 6 258.0 19.44
## 5 8 360.0 17.02
## 6 6 225.0 20.22
## 7 8 360.0 15.84
## 8 4 146.7 20.00
## 9 4 140.8 22.90
## 10 6 167.6 18.30
## # ... with 22 more rows
# magrittr provides the compound-assignment pipe %<>% and multiply_by().
suppressPackageStartupMessages(library(magrittr))
# Multiply disp by 100 in place, only on the rows whose cyl is 4 or 8.
dat$disp[dat$cyl %in% c(4, 8)] %<>% multiply_by(100)
# which is equivalent to
# dat$disp[dat$cyl %in% c(4, 8)] <- dat$disp[dat$cyl %in% c(4, 8)] * 100
head(dat)
## # A tibble: 6 x 3
## cyl disp qsec
## <fctr> <dbl> <dbl>
## 1 6 160 16.46
## 2 6 160 17.02
## 3 4 10800 18.61
## 4 6 258 19.44
## 5 8 36000 17.02
## 6 6 225 20.22
# Rebuild the working tibble from scratch (as_tibble() replaces the deprecated
# as_data_frame()).
dat <- as_tibble(mtcars) %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
# Split off the cyl 4/8 rows, scale their disp by 100, and stack them back
# underneath the rows that were left untouched.
dat_4_8 <- dat %>%
  filter(cyl %in% c(4, 8)) %>%
  mutate(disp = disp * 100)
dat <- bind_rows(dat %>% filter(!cyl %in% c(4, 8)), dat_4_8)
print(dat, n = 10) # the order is different, but this is not a big deal.
## # A tibble: 32 x 3
## cyl disp qsec
## <fctr> <dbl> <dbl>
## 1 6 160.0 16.46
## 2 6 160.0 17.02
## 3 6 258.0 19.44
## 4 6 225.0 20.22
## 5 6 167.6 18.30
## 6 6 167.6 18.90
## 7 6 145.0 15.50
## 8 4 10800.0 18.61
## 9 8 36000.0 17.02
## 10 8 36000.0 15.84
## # ... with 22 more rows
purrr 이용
dat<-as_data_frame(mtcars)%>%mutate(cyl=as.factor(cyl))%>%select(cyl,disp,qsec)
# Row-wise version: cyl is turned into character, then pmap_dbl() walks over
# (cyl, disp) pairs and scales disp by 100 when cyl is "4" or "8".
dat <- dat %>%
  mutate(cyl = as.character(cyl)) %>%
  mutate(
    disp = purrr::pmap_dbl(
      list(cyl, disp),
      # The list is unnamed, so arguments are matched by position.
      function(cyl_chr, disp_val) {
        if (cyl_chr %in% c("4", "8")) {
          disp_val * 100
        } else {
          disp_val
        }
      }
    )
  )
head(dat, 10)
## # A tibble: 10 x 3
## cyl disp qsec
## <chr> <dbl> <dbl>
## 1 6 160.0 16.46
## 2 6 160.0 17.02
## 3 4 10800.0 18.61
## 4 6 258.0 19.44
## 5 8 36000.0 17.02
## 6 6 225.0 20.22
## 7 8 36000.0 15.84
## 8 4 14670.0 20.00
## 9 4 14080.0 22.90
## 10 6 167.6 18.30
ifelse 이용
사실 간단한 경우가 아니면 ifelse를 이용하는 것은 한계가 있어 보인다..?
# Rebuild the working tibble (as_tibble() replaces the deprecated
# as_data_frame()).
dat <- as_tibble(mtcars) %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
# Vectorised conditional: scale disp by 100 where cyl is 4 or 8, otherwise
# keep it as is. The result is only printed, not assigned back to dat.
dat %>%
  mutate(disp = ifelse(cyl %in% c(4, 8), disp * 100, disp)) %>%
  head(10)
## # A tibble: 10 x 3
## cyl disp qsec
## <fctr> <dbl> <dbl>
## 1 6 160.0 16.46
## 2 6 160.0 17.02
## 3 4 10800.0 18.61
## 4 6 258.0 19.44
## 5 8 36000.0 17.02
## 6 6 225.0 20.22
## 7 8 36000.0 15.84
## 8 4 14670.0 20.00
## 9 4 14080.0 22.90
## 10 6 167.6 18.30
Group_by?
dat<-as_data_frame(mtcars)%>%mutate(cyl=as.factor(cyl))%>%select(cyl,disp,qsec)
# Group on a temporary logical flag (abc: is cyl 4 or 8?), scale disp by 100
# where the flag is TRUE, then drop the grouping and the helper column.
dat %>%
  group_by(abc = cyl %in% c(4, 8)) %>%
  mutate(disp = if_else(abc, disp * 100, disp)) %>%
  ungroup() %>%
  select(-abc) %>%
  head(10)
## # A tibble: 10 x 3
## cyl disp qsec
## <fctr> <dbl> <dbl>
## 1 6 160.0 16.46
## 2 6 160.0 17.02
## 3 4 10800.0 18.61
## 4 6 258.0 19.44
## 5 8 36000.0 17.02
## 6 6 225.0 20.22
## 7 8 36000.0 15.84
## 8 4 14670.0 20.00
## 9 4 14080.0 22.90
## 10 6 167.6 18.30
# Rebuild the working tibble (as_tibble() replaces the deprecated
# as_data_frame()).
dat <- as_tibble(mtcars) %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
# Arithmetic mask: the logical flag abc is coerced to 0/1, so exactly one of
# the two terms survives per row. The factor is 100, matching the stated task
# and the other approaches.
dat %>%
  group_by(abc = cyl %in% c(4, 8)) %>%
  mutate(disp = disp * 100 * abc + disp * (!abc)) %>%
  ungroup() %>%
  select(-abc) %>%
  head(10)
## # A tibble: 10 x 3
## cyl disp qsec
## <fctr> <dbl> <dbl>
## 1 6 160.0 16.46
## 2 6 160.0 17.02
## 3 4 10800.0 18.61
## 4 6 258.0 19.44
## 5 8 36000.0 17.02
## 6 6 225.0 20.22
## 7 8 36000.0 15.84
## 8 4 14670.0 20.00
## 9 4 14080.0 22.90
## 10 6 167.6 18.30
# Rebuild the working tibble (as_tibble() replaces the deprecated
# as_data_frame()).
dat <- as_tibble(mtcars) %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
# case_when(): the first matching condition wins; TRUE is the fallback that
# leaves disp unchanged. The factor is 100, matching the stated task and the
# other approaches.
dat %>%
  mutate(
    disp = case_when(
      cyl %in% c(4, 8) ~ disp * 100,
      TRUE ~ disp
    )
  ) %>%
  head(10)
## # A tibble: 10 x 3
## cyl disp qsec
## <fctr> <dbl> <dbl>
## 1 6 160.0 16.46
## 2 6 160.0 17.02
## 3 4 10800.0 18.61
## 4 6 258.0 19.44
## 5 8 36000.0 17.02
## 6 6 225.0 20.22
## 7 8 36000.0 15.84
## 8 4 14670.0 20.00
## 9 4 14080.0 22.90
## 10 6 167.6 18.30
첫 번째 방법이 제일 빠르긴 하지만, 변수 하나가 아니라 여러 변수에 걸쳐 작업을 할 경우 코드를 작성하는 것이 굉장히 번거로울 수 있다.
library(microbenchmark)
# One identical input tibble per benchmarked function (dat1 ... dat7).
# as_tibble() replaces as_data_frame(), which is deprecated since tibble 2.0.0.
dat1 <- as_tibble(mtcars) %>%
  mutate(cyl = as.factor(cyl)) %>%
  select(cyl, disp, qsec)
dat7 <- dat6 <- dat5 <- dat4 <- dat3 <- dat2 <- dat1
# list
# Base R version: logical-index assignment on the disp column.
# Takes a data frame with cyl and disp columns and returns it with disp
# multiplied by 100 on the rows where cyl is 4 or 8.
f1 <- function(dat1) {
  is_4_or_8 <- dat1$cyl %in% c(4, 8)
  dat1$disp[is_4_or_8] <- dat1$disp[is_4_or_8] * 100
  dat1
}
# bind rows
# Split / transform / recombine: the cyl 4/8 rows get disp * 100 and are
# stacked underneath the untouched rows, so row order differs from the input.
f2 <- function(dat2) {
  scaled <- dat2 %>%
    filter(cyl %in% c(4, 8)) %>%
    mutate(disp = disp * 100)
  untouched <- dat2 %>% filter(!cyl %in% c(4, 8))
  bind_rows(untouched, scaled)
}
# purrr
# Row-wise version: cyl is converted to character, then pmap_dbl() visits each
# (cyl, disp) pair and multiplies disp by 100 when cyl is "4" or "8".
# Note that the returned cyl column is character, not factor.
f3 <- function(dat3) {
  dat3 %>%
    mutate(cyl = as.character(cyl)) %>%
    mutate(
      disp = purrr::pmap_dbl(
        list(cyl, disp),
        # The list is unnamed, so arguments are matched by position.
        function(cyl_chr, disp_val) {
          if (cyl_chr %in% c("4", "8")) {
            disp_val * 100
          } else {
            disp_val
          }
        }
      )
    )
}
# ifelse
# Vectorised conditional inside mutate(): disp * 100 where cyl is 4 or 8,
# disp unchanged elsewhere.
f4 <- function(dat4) {
  mutate(dat4, disp = ifelse(cyl %in% c(4, 8), disp * 100, disp))
}
# group_by + if_else
# Groups on a temporary logical flag (abc: is cyl 4 or 8?), multiplies disp by
# 100 where the flag is TRUE, then removes the grouping and the helper column.
f5 <- function(dat5) {
  flagged <- group_by(dat5, abc = cyl %in% c(4, 8))
  scaled <- mutate(flagged, disp = if_else(abc, disp * 100, disp))
  select(ungroup(scaled), -abc)
}
# arithmetic mask
# Adds a temporary logical flag (abc: is cyl 4 or 8?) and uses it as a 0/1
# multiplier so that exactly one of the two terms survives per row; the helper
# column is dropped before returning.
# The factor is 100 (it was 10), so f6 computes the same result as f1-f5.
f6 <- function(dat6) {
  dat6 %>%
    mutate(abc = cyl %in% c(4, 8)) %>%
    mutate(disp = disp * 100 * abc + disp * (!abc)) %>%
    select(-abc)
}
# case_when
# Multiplies disp by 100 where cyl is 4 or 8; the TRUE branch is the fallback
# that leaves disp unchanged.
# The factor is 100 (it was 10), so f7 computes the same result as f1-f5, and
# the pipeline is the last expression so the result is returned visibly, like
# the other functions.
f7 <- function(dat7) {
  dat7 %>%
    mutate(
      disp = case_when(
        cyl %in% c(4, 8) ~ disp * 100,
        TRUE ~ disp
      )
    )
}
# Time every approach on its own copy of the input and plot the timing
# distributions.
bench <- microbenchmark::microbenchmark(
  f1(dat1),
  f2(dat2),
  f3(dat3),
  f4(dat4),
  f5(dat5),
  f6(dat6),
  f7(dat7)
)
ggplot2::autoplot(bench)