이전 시리즈에서 저장한 R file 불러오기

load("./r_Intro/gss_2018.RData") # "." is the current working directory

\(~\)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: 패키지 'ggplot2'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tibble'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tidyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.2에서 작성되었습니다
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

1. Data Type 변환

1-1. 변수 하나만 변환

class(gss_2018_3$ABANY) # check the current type of the variable (ABANY) first
## [1] "integer"

numeric으로 바꿔서 저장

gss_2018_3$ABANY <- as.numeric(gss_2018_3$ABANY)  # overwrite the column with its numeric version
class(gss_2018_3$ABANY) # confirm the new type
## [1] "numeric"

integer로

gss_2018_3$ABANY <- as.integer(gss_2018_3$ABANY) # overwrite the column with its integer version
class(gss_2018_3$ABANY) # confirm the new type
## [1] "integer"

character로

gss_2018_3$ABANY <- as.character(gss_2018_3$ABANY) # overwrite the column with its character version
class(gss_2018_3$ABANY) # confirm the new type
## [1] "character"

factor로

gss_2018_3$ABANY <- factor(gss_2018_3$ABANY) # convert to factor
class(gss_2018_3$ABANY) # confirm the new type
## [1] "factor"

만약 기존 변수가 factor로 되어있다면? character 변환 후 numeric으로!

# Go through character first: as.numeric() applied directly to a factor
# returns the internal level codes rather than the labelled values.
gss_2018_3$ABANY <- as.numeric(as.character(gss_2018_3$ABANY)) # factor -> character -> numeric
class(gss_2018_3$ABANY) # confirm the new type
## [1] "numeric"

만약 변수값 중 “,”가 포함된 숫자를 인지 못한다면? e.g. 45,345

# Strip thousands separators (e.g. "45,345" -> "45345"): coerce to character,
# then delete every literal comma. The column is still character afterwards.
gss_2018_3$ABANY <- gsub(",", "", as.character(gss_2018_3$ABANY), fixed = TRUE)
gsub 참고: (https://rbasall.tistory.com/m/82)

\(~\)

1-2. 변수 여러개 동시 변환1: 변수 기준으로 선택하기: dplyr::mutate_at

library(dplyr)

numeric

# Convert the ABANY and ABDEFECT columns to numeric in a single call.
gss_2018_3 <- gss_2018_3 %>%
  mutate_at(vars(ABANY, ABDEFECT), as.numeric)

factor

# Convert the ABANY and ABDEFECT columns to factor in a single call.
gss_2018_3 <- gss_2018_3 %>%
  mutate_at(vars(ABANY, ABDEFECT), as.factor)

character

# Convert the ABANY and ABDEFECT columns to character in a single call.
gss_2018_3 <- gss_2018_3 %>%
  mutate_at(vars(ABANY, ABDEFECT), as.character)

integer

# Convert the ABANY and ABDEFECT columns to integer in a single call.
gss_2018_3 <- gss_2018_3 %>%
  mutate_at(vars(ABANY, ABDEFECT), as.integer)

\(~\)

1-3. 변수 여러개 동시 변환2: Data Type 기준으로 선택하기: purrr::map_if

library(purrr)

factor 변수들을 numeric 변수들로

# Convert every factor column to numeric.
# - as.character() first, so the labelled values are recovered instead of the
#   factor's internal level codes.
# - Assigning into gss_2018_3[] keeps the data frame: map_if() on its own
#   returns a plain list, which would replace the data frame.
gss_2018_3[] <- gss_2018_3 %>%
  map_if(is.factor, ~ as.numeric(as.character(.x)))

integer 변수들을 numeric 변수들로

# Convert every integer column to numeric (double).
# Assigning into gss_2018_3[] keeps the data frame: map_if() on its own
# returns a plain list, which would replace the data frame.
gss_2018_3[] <- gss_2018_3 %>%
  map_if(is.integer, as.numeric)

character 변수들을 numeric 변수들로

# Convert every character column to numeric.
# Assigning into gss_2018_3[] keeps the data frame: map_if() on its own
# returns a plain list, which would replace the data frame.
gss_2018_3[] <- gss_2018_3 %>%
  map_if(is.character, as.numeric)

\(~\)

2. Label 만들기: factor 변수, ordered 변수

2-1. Nominal Variable

# Build a nominal factor in one step: the values 0 and 1 are the levels,
# displayed with the labels "앞" and "뒤".
v7 <- factor(c(0, 1), levels = 0:1, labels = c("앞", "뒤"))
v7
## [1] 앞 뒤
## Levels: 앞 뒤

2-2. Ordered Variable

# Build an ordered factor: levels 1..3 are labelled First < Second < Third.
v8 <- factor(
  c(1, 2, 3, 3, 2, 1, 1, 2, 3, 3, 2, 1),
  levels = 1:3,
  labels = c("First", "Second", "Third"),
  ordered = TRUE
)
v8
##  [1] First  Second Third  Third  Second First  First  Second Third  Third 
## [11] Second First 
## Levels: First < Second < Third

2-3. Reordering Factors

예시로 연령(범주) factor vector 하나 만든 뒤

# Example age-band factor. cut() uses right-closed intervals by default, so the
# bins are (0, 19], (19, 29], ..., (79, 89], labelled "10s" ... "80s".
age_factor <- cut(
  gss_2018_3$AGE,
  breaks = c(0, seq(19, 89, by = 10)),
  labels = paste0(seq(10, 80, by = 10), "s")
)

fct_relevel: level 재지정

levels(age_factor)
## [1] "10s" "20s" "30s" "40s" "50s" "60s" "70s" "80s"
# Move "20s" and "60s" to the front; the remaining levels keep their order.
age_factor2 <- age_factor %>% fct_relevel("20s", "60s")
levels(age_factor2)
## [1] "20s" "60s" "10s" "30s" "40s" "50s" "70s" "80s"
# Place "50s" directly after the second level.
age_factor3 <- age_factor %>% fct_relevel("50s", after = 2)

levels(age_factor3)
## [1] "10s" "20s" "50s" "30s" "40s" "60s" "70s" "80s"

fct_infreq(): n이 많은 level 순서대로

# Order the levels by how often each one occurs, most frequent first.
age_factor4 <- age_factor %>% fct_infreq()

levels(age_factor4)
## [1] "30s" "50s" "40s" "60s" "20s" "70s" "80s" "10s"

fct_rev(): 반대 순서로

# Reverse the existing level order.
age_factor5 <- age_factor %>% fct_rev()

levels(age_factor5)
## [1] "80s" "70s" "60s" "50s" "40s" "30s" "20s" "10s"
# Least frequent level first: order by frequency, then reverse.
age_factor6 <- age_factor %>%
  fct_infreq() %>%
  fct_rev()

levels(age_factor6)
## [1] "10s" "80s" "70s" "20s" "60s" "40s" "50s" "30s"

fct_lump(): 상위 몇 개만 남기고 나머지는 “Other”로

# Keep the 3 most frequent levels and pool all the others into "Other".
age_factor7 <- age_factor %>% fct_lump(n = 3)

levels(age_factor7)
## [1] "30s"   "40s"   "50s"   "Other"

fct_recode

# Rename levels (new name = old name); levels given the same new name are
# merged. "10s" is not listed, so it is left unchanged.
age_factor8 <- age_factor %>%
  fct_recode(
    "20-40" = "20s", "20-40" = "30s", "20-40" = "40s",
    "50-80" = "50s", "50-80" = "60s", "50-80" = "70s", "50-80" = "80s"
  )
levels(age_factor8)
## [1] "10s"   "20-40" "50-80"

fct_collapse()

# Merge groups of levels, one vector of old levels per new name.
# "10s" is not listed, so it is left unchanged.
age_factor9 <- age_factor %>%
  fct_collapse(
    "20-40" = c("20s", "30s"),
    "40-80" = c("40s", "50s", "60s", "70s", "80s")
  )
levels(age_factor9)
## [1] "10s"   "20-40" "40-80"

\(~\)

3. 변수 Recode 하기

참고: (https://m.blog.naver.com/bsj104/221575842321)

3-1. 기존 값을 새로운 값으로 대체하라

# Recode ABANY into a new column ABANY2 by position: 1 -> 0, 2 -> 1, NA -> NA.
# Any other value has no match and therefore also becomes NA.
gss_2018_3$ABANY2 <- with(
  gss_2018_3,
  c(0, 1, NA)[match(ABANY, c(1, 2, NA))]
)
summary(gss_2018_3$ABANY2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.4987  1.0000  1.0000     824

\(~\)

3-2. 연속형을 구간으로 나눠라

gss_2018_3$AGE <- as.numeric(gss_2018_3$AGE) # make AGE continuous (numeric) first
summary(gss_2018_3$AGE) # inspect the range to decide where the bin boundaries should go
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.00   34.00   48.00   48.97   63.00   89.00       7
# Bin AGE into decade codes. cut() uses right-closed intervals by default:
# (0, 19] -> 1, (19, 29] -> 2, ..., (79, 89] -> 8.
# labels = FALSE returns the integer bin codes instead of a factor; the
# argument is spelled out in full (no partial matching, TRUE/FALSE not T/F).
gss_2018_3$AGE2 <- cut(
  gss_2018_3$AGE,
  breaks = c(0, 19, 29, 39, 49, 59, 69, 79, 89),
  labels = FALSE
)
summary(gss_2018_3$AGE2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   3.000   4.000   4.439   6.000   8.000       7

\(~\)

3-3. 만약 구간 나눈 뒤 라벨 붙여서 factor로 만들고 싶다면?

summary(gss_2018_3$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.00   34.00   48.00   48.97   63.00   89.00       7
# Same bins as a labelled factor: each interval gets the matching decade label.
gss_2018_3$AGE3 <- cut(gss_2018_3$AGE
                         , breaks = c(0, 19, 29, 39, 49, 59, 69, 79, 89) # right-closed bins: (0, 19] -> "10s", (19, 29] -> "20s", ...
                         , labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s"))
summary(gss_2018_3$AGE3)
##  10s  20s  30s  40s  50s  60s  70s  80s NA's 
##   48  350  450  365  410  361  239  118    7

\(~\)

3-4: ifelse 활용하기! (평소에 매우 유용)

ifelse(data$v1 == 1, "일", 0) : v1변수 내 값이 1이라면 "일"로 바꾸고, 나머지는 다 0으로 바꿔라

ifelse(data$v2 <= 5, 1, NA) : v2변수 내 값이 5이하라면 1로 바꾸고, 나머지는 결측치(NA)로 바꿔라

# Recode AGE into decade codes with nested ifelse(): the first threshold that
# AGE falls below decides the code (<20 -> 0, <30 -> 1, ..., <90 -> 7).
# Ages of 90 or more fall through to NA; a missing AGE stays NA.
gss_2018_3$AGE4 <- with(
  gss_2018_3,
  ifelse(AGE < 20, 0,
  ifelse(AGE < 30, 1,
  ifelse(AGE < 40, 2,
  ifelse(AGE < 50, 3,
  ifelse(AGE < 60, 4,
  ifelse(AGE < 70, 5,
  ifelse(AGE < 80, 6,
  ifelse(AGE < 90, 7, NA))))))))
)
summary(gss_2018_3$AGE4)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   2.000   3.000   3.439   5.000   7.000       7

이후 Factor로 바꾸고 싶을땐?

# Turn the 0..7 codes into a labelled factor. The codes come from the
# AGE < 20, < 30, ..., < 90 thresholds, so code 0 is the under-20 group ("10s")
# and code 7 is ages 80-89 ("80s"). Labels must start at "10s", not "20s".
gss_2018_3$AGE4 <- factor(
  gss_2018_3$AGE4,
  levels = 0:7,
  labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s")
)
class(gss_2018_3$AGE4)
## [1] "factor"
summary(gss_2018_3$AGE4)
##  10s  20s  30s  40s  50s  60s  70s  80s NA's 
##   48  350  450  365  410  361  239  118    7

3-5: case_when()

ifelse()와 유사함

# Same recode as the nested ifelse() version, written with case_when():
# conditions are tried top to bottom and the first one that is TRUE wins.
gss_2018_3$AGE5 <- with(
  gss_2018_3,
  case_when(
    AGE < 20 ~ 0,
    AGE < 30 ~ 1,
    AGE < 40 ~ 2,
    AGE < 50 ~ 3,
    AGE < 60 ~ 4,
    AGE < 70 ~ 5,
    AGE < 80 ~ 6,
    AGE < 90 ~ 7,
    # Fallback for everything else (including missing AGE). Unlike ifelse(),
    # the NA is written as a numeric NA to match the other branches.
    TRUE ~ NA_real_
  )
)
summary(gss_2018_3$AGE5)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   2.000   3.000   3.439   5.000   7.000       7