load("./r_Intro/gss_2018.RData") # "." refers to the current working directory
\(~\)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: 패키지 'ggplot2'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tibble'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'tidyr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 4.1.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.1.2에서 작성되었습니다
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Walk ABANY through each basic vector type, checking class() after every step.
# Start by inspecting the current type of the variable.
class(gss_2018_3[["ABANY"]])
## [1] "integer"
# integer -> numeric (double)
gss_2018_3[["ABANY"]] <- as.numeric(gss_2018_3[["ABANY"]])
class(gss_2018_3[["ABANY"]])
## [1] "numeric"
# numeric -> integer
gss_2018_3[["ABANY"]] <- as.integer(gss_2018_3[["ABANY"]])
class(gss_2018_3[["ABANY"]])
## [1] "integer"
# integer -> character
gss_2018_3[["ABANY"]] <- as.character(gss_2018_3[["ABANY"]])
class(gss_2018_3[["ABANY"]])
## [1] "character"
# character -> factor
gss_2018_3[["ABANY"]] <- factor(gss_2018_3[["ABANY"]])
class(gss_2018_3[["ABANY"]])
## [1] "factor"
# factor -> numeric: go through character first so the level labels are
# converted rather than the internal integer level codes.
gss_2018_3[["ABANY"]] <- as.numeric(as.character(gss_2018_3[["ABANY"]]))
class(gss_2018_3[["ABANY"]])
## [1] "numeric"
# Stripping thousands separators: convert to character first, then use gsub()
# to replace every "," with "" and store the result (ABANY stays character).
gss_2018_3[["ABANY"]] <- as.character(gss_2018_3[["ABANY"]])
gss_2018_3[["ABANY"]] <- gsub(",", "", gss_2018_3[["ABANY"]])
\(~\)
library(dplyr)
# Convert several columns in one call. across() replaces the superseded
# mutate_at()/vars() pair; the selected columns (ABANY, ABDEFECT) and the
# conversion functions are unchanged.
# Each step overwrites both columns: numeric -> factor -> character -> integer.
gss_2018_3 <- mutate(gss_2018_3, across(c(ABANY, ABDEFECT), as.numeric))
gss_2018_3 <- mutate(gss_2018_3, across(c(ABANY, ABDEFECT), as.factor))
gss_2018_3 <- mutate(gss_2018_3, across(c(ABANY, ABDEFECT), as.character))
gss_2018_3 <- mutate(gss_2018_3, across(c(ABANY, ABDEFECT), as.integer))
\(~\)
library(purrr)
# Convert every column of a given type to numeric.
# modify_if() is used instead of map_if() because map_if() always returns a
# plain list, which would silently turn the data frame into a list;
# modify_if() returns an object of the same type as its input.
# NOTE(review): as.numeric() on a factor yields the integer level codes, not
# the level labels -- use as.numeric(as.character(x)) if labels are wanted.
gss_2018_3 <- gss_2018_3 %>% modify_if(is.factor, as.numeric)
gss_2018_3 <- gss_2018_3 %>% modify_if(is.integer, as.numeric)
gss_2018_3 <- gss_2018_3 %>% modify_if(is.character, as.numeric)
\(~\)
# Unordered factor built directly from the values 0 and 1: the levels are
# 0 and 1, shown with the labels "앞" and "뒤".
v7 <- factor(c(0, 1), levels = 0:1, labels = c("앞", "뒤"))
v7
## [1] 앞 뒤
## Levels: 앞 뒤
# Ordered factor: codes 1..3 become First < Second < Third.
v8 <- factor(
  rep(c(1, 2, 3, 3, 2, 1), times = 2),
  levels = 1:3,
  labels = c("First", "Second", "Third"),
  ordered = TRUE
)
v8
## [1] First Second Third Third Second First First Second Third Third
## [11] Second First
## Levels: First < Second < Third
# Bin AGE into decade groups; intervals are right-closed: (0, 19], (19, 29], ...
age_factor <- cut(
  gss_2018_3$AGE,
  breaks = c(0, 19, 29, 39, 49, 59, 69, 79, 89),
  labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s")
)
levels(age_factor)
## [1] "10s" "20s" "30s" "40s" "50s" "60s" "70s" "80s"

# fct_relevel(): move the named levels to the front.
age_factor2 <- age_factor %>% fct_relevel("20s", "60s")
levels(age_factor2)
## [1] "20s" "60s" "10s" "30s" "40s" "50s" "70s" "80s"

# fct_relevel(after = 2): place "50s" right after the second level.
age_factor3 <- age_factor %>% fct_relevel("50s", after = 2)
levels(age_factor3)
## [1] "10s" "20s" "50s" "30s" "40s" "60s" "70s" "80s"

# fct_infreq(): order levels by descending frequency.
age_factor4 <- age_factor %>% fct_infreq()
levels(age_factor4)
## [1] "30s" "50s" "40s" "60s" "20s" "70s" "80s" "10s"

# fct_rev(): reverse the current level order.
age_factor5 <- age_factor %>% fct_rev()
levels(age_factor5)
## [1] "80s" "70s" "60s" "50s" "40s" "30s" "20s" "10s"

# Combine both: frequency order, then reversed (least frequent first).
age_factor6 <- age_factor %>%
  fct_infreq() %>%
  fct_rev()
levels(age_factor6)
## [1] "10s" "80s" "70s" "20s" "60s" "40s" "50s" "30s"

# fct_lump(n = 3): keep the 3 most frequent levels, pool the rest as "Other".
age_factor7 <- age_factor %>% fct_lump(n = 3)
levels(age_factor7)
## [1] "30s" "40s" "50s" "Other"

# fct_recode(): rename levels one by one; levels given the same new name merge.
age_factor8 <- age_factor %>%
  fct_recode(
    "20-40" = "20s",
    "20-40" = "30s",
    "20-40" = "40s",
    "50-80" = "50s",
    "50-80" = "60s",
    "50-80" = "70s",
    "50-80" = "80s"
  )
levels(age_factor8)
## [1] "10s" "20-40" "50-80"

# fct_collapse(): merge groups of levels, one vector of old levels per new name.
age_factor9 <- age_factor %>%
  fct_collapse(
    "20-40" = c("20s", "30s"),
    "40-80" = c("40s", "50s", "60s", "70s", "80s")
  )
levels(age_factor9)
## [1] "10s" "20-40" "40-80"
\(~\)
참고: (https://m.blog.naver.com/bsj104/221575842321)
# Recode ABANY into a new variable ABANY2 through a positional lookup:
# match() finds each value's position in c(1, 2, NA), and that position picks
# the replacement from c(0, 1, NA), so 1 -> 0, 2 -> 1, NA -> NA.
# Values not in the lookup table do not match and therefore also become NA.
gss_2018_3[["ABANY2"]] <- c(0, 1, NA)[
  match(gss_2018_3[["ABANY"]], c(1, 2, NA))
]
summary(gss_2018_3[["ABANY2"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.4987 1.0000 1.0000 824
\(~\)
# Make sure AGE is continuous (numeric) before binning.
gss_2018_3$AGE <- as.numeric(gss_2018_3$AGE)
# Look at the distribution to decide where the bin boundaries should go.
summary(gss_2018_3$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.00 34.00 48.00 48.97 63.00 89.00 7
# Bin into decades as integer codes 1..8 (labels = FALSE returns codes rather
# than a factor). Intervals are right-closed: 0 < 10s <= 19, 19 < 20s <= 29, ...
# `labels = FALSE` is spelled out in full: `label=F` relied on partial
# argument matching and on `F`, which is an ordinary reassignable variable.
gss_2018_3$AGE2 <- cut(
  gss_2018_3$AGE,
  breaks = c(0, 19, 29, 39, 49, 59, 69, 79, 89),
  labels = FALSE
)
summary(gss_2018_3$AGE2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 3.000 4.000 4.439 6.000 8.000 7
\(~\)
# Re-check the AGE distribution before binning.
summary(gss_2018_3[["AGE"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.00 34.00 48.00 48.97 63.00 89.00 7
# Same decade bins as a labeled factor.
# Intervals are right-closed: 0 < 10s <= 19, 19 < 20s <= 29, ...
gss_2018_3[["AGE3"]] <- cut(
  gss_2018_3[["AGE"]],
  breaks = c(0, 19, 29, 39, 49, 59, 69, 79, 89),
  labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s")
)
summary(gss_2018_3[["AGE3"]])
## 10s 20s 30s 40s 50s 60s 70s 80s NA's
## 48 350 450 365 410 361 239 118 7
\(~\)
# Bin AGE into decade codes with nested ifelse():
# 0 = under 20, 1 = 20-29, 2 = 30-39, ..., 7 = 80-89.
# Ages of 90 or above fall through to NA; a missing AGE stays NA.
gss_2018_3$AGE4 <- ifelse(gss_2018_3$AGE < 20, 0
                  , ifelse(gss_2018_3$AGE < 30, 1
                  , ifelse(gss_2018_3$AGE < 40, 2
                  , ifelse(gss_2018_3$AGE < 50, 3
                  , ifelse(gss_2018_3$AGE < 60, 4
                  , ifelse(gss_2018_3$AGE < 70, 5
                  , ifelse(gss_2018_3$AGE < 80, 6
                  , ifelse(gss_2018_3$AGE < 90, 7, NA))))))))
summary(gss_2018_3$AGE4)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 3.000 3.439 5.000 7.000 7
# Turn the codes into a labeled factor. Code 0 is "under 20", so the labels
# run from "10s" to "80s"; the earlier "20s".."90s" labels were shifted by one
# decade. `levels` / `labels` are spelled out instead of relying on partial
# argument matching.
gss_2018_3$AGE4 <- factor(
  gss_2018_3$AGE4,
  levels = 0:7,
  labels = c("10s", "20s", "30s", "40s", "50s", "60s", "70s", "80s")
)
class(gss_2018_3$AGE4)
## [1] "factor"
summary(gss_2018_3$AGE4)
## 10s 20s 30s 40s 50s 60s 70s 80s NA's
## 48 350 450 365 410 361 239 118 7
# Same decade coding as a single case_when(): the first condition that is TRUE
# wins, so each row gets the code of the lowest matching upper bound.
# The fallback must be a typed NA (NA_real_) so it matches the numeric codes;
# unlike in ifelse(), a bare NA cannot simply be used here.
gss_2018_3$AGE5 <- with(
  gss_2018_3,
  case_when(
    AGE < 20 ~ 0,
    AGE < 30 ~ 1,
    AGE < 40 ~ 2,
    AGE < 50 ~ 3,
    AGE < 60 ~ 4,
    AGE < 70 ~ 5,
    AGE < 80 ~ 6,
    AGE < 90 ~ 7,
    TRUE ~ NA_real_
  )
)
summary(gss_2018_3$AGE5)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 3.000 3.439 5.000 7.000 7