library(foreign)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readxl)
# Read the Koweps SPSS (.sav) file into a data frame named `welfare`.
# Uses left-to-right-readable `<-` assignment and the non-reassignable `TRUE`.
welfare <- read.spss(
  "C:\\Users\\chosun\\Downloads\\Koweps_hpc10_2015_beta1.sav",
  to.data.frame = TRUE
)
## Warning in
## read.spss("C:\\Users\\chosun\\Downloads\\Koweps_hpc10_2015_beta1.sav", :
## C:\Users\chosun\Downloads\Koweps_hpc10_2015_beta1.sav: Compression bias (0) is
## not the usual value of 100
# Give the survey columns used in this analysis readable names
# (new_name = original_column).
welfare <- welfare %>%
  rename(
    sex = h10_g3,            # sex
    birth = h10_g4,          # year of birth
    marriage = h10_g10,      # marital status
    religion = h10_g11,      # religion
    income = p1002_8aq1,     # monthly wage
    code_job = h10_eco9,     # occupation code
    code_region = h10_reg7   # region code
  )
# 9-4 연령대 파생변수 만들기
# Age is derived from the birth year as 2015 - birth + 1.
welfare[["age"]] <- 2015 - welfare[["birth"]] + 1
summary(welfare[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 28.00 50.00 48.43 70.00 109.00
# Age group: below 30 -> "young", 30 through 59 -> "middle", above 59 -> "old".
# A missing age stays missing because no branch matches it.
welfare <- welfare %>%
  mutate(
    ageg = case_when(
      age < 30 ~ "young",
      age <= 59 ~ "middle",
      age > 59 ~ "old"
    )
  )
table(welfare[["ageg"]])
##
## middle old young
## 6049 6281 4334
# Bar chart of how many respondents fall into each age group.
# Built with ggplot() + geom_bar() because qplot() is deprecated
# as of ggplot2 3.4.0.
ggplot(data = welfare, aes(x = ageg)) +
  geom_bar()
# 연령대에 따른 월급 차이
# Average income per age group; rows with no recorded income are excluded.
ageg_income <- welfare %>%
  group_by(ageg) %>%
  filter(!is.na(income)) %>%
  summarise(mean_income = mean(income))

# Bars in the default (alphabetical) order of the age-group labels.
ggplot(ageg_income, aes(x = ageg, y = mean_income)) +
  geom_col()

# Same chart with the x axis forced into young -> middle -> old order.
ggplot(ageg_income, aes(x = ageg, y = mean_income)) +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  geom_col()

# Same chart with the bars sorted by ascending mean income.
ggplot(ageg_income, aes(x = reorder(ageg, mean_income), y = mean_income)) +
  geom_col()
# 9-5 연령대 및 성별 월급 차이
# Mean income for every (age group, sex) combination, ignoring rows without
# an income. `.groups = "drop"` returns an ungrouped table and avoids the
# summarise() regrouping message.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income), .groups = "drop")

# `sex` can still be a numeric code at this point in the script. A numeric
# fill is treated as continuous (gradient legend, no per-sex grouping), so it
# is wrapped in factor() to get one discrete colour per sex.
# Stacked bars:
ggplot(data = sex_income,
       aes(x = ageg, y = mean_income, fill = factor(sex))) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  labs(fill = "sex")

# Side-by-side bars; dodging needs the discrete fill to separate the sexes.
ggplot(data = sex_income,
       aes(x = ageg, y = mean_income, fill = factor(sex))) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  labs(fill = "sex")
# 나이 및 성별 월급 차이
# Mean income for every (age, sex) combination, ignoring rows without an
# income. `.groups = "drop"` returns an ungrouped table and avoids the
# summarise() regrouping message.
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income), .groups = "drop")

# One line per sex. `sex` can still be a numeric code here; mapped directly
# it would be a continuous colour and all points would be joined into a
# single line, so it is converted with factor() to get one line per sex.
ggplot(data = sex_age,
       aes(x = age, y = mean_income, colour = factor(sex))) +
  geom_line() +
  labs(colour = "sex")
# 9-6 직업 변수 검토
# Check the storage type of the occupation code column ...
class(welfare[["code_job"]])
## [1] "numeric"
# ... and count how many rows carry each code.
table(welfare[["code_job"]])
##
## 111 120 131 132 133 134 135 139 141 149 151 152 153 159 211 212
## 2 16 10 11 9 3 7 10 35 20 26 18 15 16 8 4
## 213 221 222 223 224 231 232 233 234 235 236 237 239 241 242 243
## 3 17 31 12 4 41 5 3 6 48 14 2 29 12 4 63
## 244 245 246 247 248 251 252 253 254 259 261 271 272 273 274 281
## 4 33 59 77 38 14 111 24 67 109 4 15 11 4 36 17
## 283 284 285 286 289 311 312 313 314 320 330 391 392 399 411 412
## 8 10 26 16 5 140 260 220 84 75 15 4 13 87 47 12
## 421 422 423 429 431 432 441 442 510 521 522 530 611 612 613 620
## 124 71 5 14 20 33 154 197 192 353 5 106 1320 11 40 2
## 630 710 721 722 730 741 742 743 751 752 753 761 762 771 772 773
## 20 29 30 22 16 27 3 34 34 5 49 69 27 11 61 86
## 774 780 791 792 799 811 812 819 821 822 823 831 832 841 842 843
## 7 17 5 21 45 16 1 6 9 9 23 5 17 32 10 4
## 851 852 853 854 855 861 862 863 864 871 873 874 875 876 881 882
## 19 13 7 33 9 3 14 17 31 2 257 34 37 2 2 3
## 891 892 899 910 921 922 930 941 942 951 952 953 991 992 999 1011
## 8 19 16 102 31 74 289 325 99 125 122 73 45 12 141 2
## 1012
## 17
library(readxl)
# Read the second sheet of the codebook workbook; its first row holds the
# column names. `TRUE` is spelled out because `T` can be reassigned.
list_job <- read_excel(
  "C:\\Users\\chosun\\Downloads\\Koweps_Codebook.xlsx",
  col_names = TRUE,
  sheet = 2
)
head(list_job)
## # A tibble: 6 × 2
## code_job job
## <dbl> <chr>
## 1 111 의회의원 고위공무원 및 공공단체임원
## 2 112 기업고위임원
## 3 120 행정 및 경영지원 관리자
## 4 131 연구 교육 및 법률 관련 관리자
## 5 132 보험 및 금융 관리자
## 6 133 보건 및 사회복지 관련 관리자
# welfare에 직업명 결합
# Attach the job name to every row by matching on the occupation code.
welfare <- welfare %>%
  left_join(list_job, by = "code_job")
# Preview the first ten rows that have an occupation code.
welfare %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  slice_head(n = 10)
## code_job job
## 1 942 경비원 및 검표원
## 2 762 전기공
## 3 530 방문 노점 및 통신 판매 관련 종사자
## 4 999 기타 서비스관련 단순 종사원
## 5 312 경영관련 사무원
## 6 254 문리 기술 및 예능 강사
## 7 510 영업 종사자
## 8 530 방문 노점 및 통신 판매 관련 종사자
## 9 286 스포츠 및 레크레이션 관련 전문가
## 10 521 매장 판매 종사자
# 직업별 월급 차이
# Mean income per job, using only rows that have both a job name and an income.
job_income <- welfare %>%
  filter(!is.na(job) & !is.na(income)) %>%
  group_by(job) %>%
  summarise(mean_income = mean(income))

# The ten jobs with the highest mean income.
top10 <- job_income %>%
  arrange(desc(mean_income)) %>%
  head(10)

# Horizontal bar chart. After coord_flip() the first category is drawn at the
# bottom, so ordering by ascending mean_income puts the highest-paid job on
# top, matching the other ranking charts in this script.
ggplot(data = top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()
# 9-7 성별 직업 빈도 - 성별 나누기
# Inspect the raw sex column before recoding.
class(welfare$sex)
## [1] "numeric"
table(welfare$sex)
##
## 1 2
## 7578 9086
# Treat code 9 as missing, then confirm how many values are missing.
welfare$sex <- ifelse(welfare$sex == 9, NA, welfare$sex)
table(is.na(welfare$sex))
##
## FALSE
## 16664
# Replace the numeric codes with labels: 1 -> "male", any other code -> "female".
welfare$sex <- ifelse(welfare$sex == 1, "male", "female")
table(welfare$sex)
##
## female male
## 9086 7578
# Bar chart of the counts per sex. Built with ggplot() + geom_bar() because
# qplot() is deprecated as of ggplot2 3.4.0.
ggplot(data = welfare, aes(x = sex)) +
  geom_bar()
# 성별 직업 빈도 분석하기
# Ten most frequent jobs among rows labelled "male".
job_male <- welfare %>%
  filter(sex == "male", !is.na(job)) %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)

# Ten most frequent jobs among rows labelled "female".
job_female <- welfare %>%
  filter(sex == "female", !is.na(job)) %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)

# Horizontal bar charts, jobs ordered by their count.
ggplot(job_male, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()
ggplot(job_female, aes(x = reorder(job, n), y = n)) +
  geom_col() +
  coord_flip()
# 9-8 종교 유무에 따른 이혼율 - 종교 유무
# Recode religion: code 1 -> "yes", any other code -> "no".
welfare$religion <- ifelse(welfare$religion == 1, "yes", "no")
# Bar chart of the counts per religion label. Built with ggplot() + geom_bar()
# because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(data = welfare, aes(x = religion)) +
  geom_bar()
# 혼인 상태
# Collapse marital status into two labels: code 1 -> "marriage",
# code 3 -> "divorce"; every other code becomes NA.
welfare$group_marriage <- ifelse(welfare$marriage == 1, "marriage",
                                 ifelse(welfare$marriage == 3, "divorce", NA))
# Bar chart of the counts per label (NA included). Built with
# ggplot() + geom_bar() because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(data = welfare, aes(x = group_marriage)) +
  geom_bar()
# 종교 유무에 따른 이혼율
# Share of "marriage" vs "divorce" within each religion label.
# `.groups = "drop_last"` keeps the table grouped by religion only, which is
# what makes sum(n) a per-religion total; stating it explicitly also avoids
# the summarise() regrouping message. The result is ungrouped at the end so
# later steps do not inherit the grouping.
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(tot_group = sum(n),
         pct = round(n / tot_group * 100, 1)) %>%
  ungroup()

# Keep only the divorce percentage for each religion label.
divorce <- religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(religion, pct)

ggplot(data = divorce, aes(x = religion, y = pct)) +
  geom_col()
# 해석: 종교가 있는 사람들의 이혼율이 종교가 없는 사람들보다 낮다.
# 연령대 및 종교 유무에 따른 이혼율
# Share of "marriage" vs "divorce" within each (age group, religion) pair,
# leaving out the "young" age group and rows without a marriage label.
# `.groups = "drop_last"` keeps the grouping by ageg and religion so that
# sum(n) is the total of each pair; stating it explicitly also avoids the
# summarise() regrouping message. The result is ungrouped at the end.
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage) & ageg != "young") %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(tot_group = sum(n),
         pct = round(n / tot_group * 100, 1)) %>%
  ungroup()

# Keep only the divorce percentage for each (age group, religion) pair.
df_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)

# Side-by-side bars: one bar per religion label within each age group.
ggplot(data = df_divorce, aes(x = ageg, y = pct, fill = religion)) +
  geom_col(position = "dodge")
# 9-9 지역별 연령대 비율 - 지역 코드 목록
# Count how many rows carry each region code.
table(welfare[["code_region"]])
##
## 1 2 3 4 5 6 7
## 2486 3711 2785 2036 1467 1257 2922
# Lookup table that maps region codes 1 through 7 to region names.
list_region <- data.frame(
  code_region = seq_len(7),
  region = c(
    "서울",
    "수도권(인천/경기)",
    "부산/경남/울산",
    "대구/경북",
    "대전/충남",
    "강원/충북",
    "광주/전남/전북/제주도"
  )
)
# Attach the region name to every row by matching on the region code.
welfare <- welfare %>%
  left_join(list_region, by = "code_region")
# 지역별 연령대
# Percentage of each age group within every region.
# `.groups = "drop_last"` keeps the table grouped by region only, so sum(n)
# is the per-region total; stating it explicitly also avoids the summarise()
# regrouping message. The result is ungrouped at the end so later steps do
# not inherit the grouping.
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(tot_group = sum(n),
         pct = round(n / tot_group * 100, 2)) %>%
  ungroup()

# Horizontal stacked bars: one bar per region, split by age group.
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip()
# 노년층 비율 내림차순 정렬
# Regions sorted by ascending share of the "old" age group. After
# coord_flip() the first axis limit is drawn at the bottom, so the region
# with the largest share ends up at the top of the chart.
list_order_old <- region_ageg %>%
  filter(ageg == "old") %>%
  arrange(pct)
# Fix the age-group level order used for the fill. The argument is spelled
# `levels` in full rather than relying on partial matching of `level`.
region_ageg$ageg <- factor(region_ageg$ageg,
                           levels = c("old", "middle", "young"))
# NOTE: the name `order` also exists as a base R function; the variable name
# is kept because later code in this script refers to it.
order <- list_order_old$region
order
## [1] "수도권(인천/경기)" "서울" "대전/충남"
## [4] "부산/경남/울산" "광주/전남/전북/제주도" "강원/충북"
## [7] "대구/경북"
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)
# 연령대 순으로 막대 색깔 나열하기
# Set the age-group level order (old, middle, young) used for the fill.
# The argument is spelled `levels` in full rather than relying on partial
# matching of `level`.
region_ageg$ageg <- factor(region_ageg$ageg,
                           levels = c("old", "middle", "young"))
# Horizontal stacked bars with regions placed in the order held in `order`.
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)