ch9

library(foreign) 
library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) 
library(readxl)

# Read the Koweps SPSS (.sav) file and convert it to a data frame.
# Uses left assignment and a spelled-out TRUE: `T` is an ordinary variable
# that can be reassigned, TRUE cannot.
welfare <- read.spss(
  "C:\\Users\\chosun\\Downloads\\Koweps_hpc10_2015_beta1.sav",
  to.data.frame = TRUE
)

## Warning in
## read.spss("C:\\Users\\chosun\\Downloads\\Koweps_hpc10_2015_beta1.sav", :
## C:\Users\chosun\Downloads\Koweps_hpc10_2015_beta1.sav: Compression bias (0) is
## not the usual value of 100

# Give the raw survey columns used in this chapter readable names
# (new_name = old_name).
welfare <- welfare %>%
  rename(
    sex = h10_g3,          # sex
    birth = h10_g4,        # year of birth
    marriage = h10_g10,    # marital status
    religion = h10_g11,    # religion
    income = p1002_8aq1,   # monthly wage
    code_job = h10_eco9,   # occupation code
    code_region = h10_reg7 # region code
  )

9-4 연령대 파생변수 만들기

# Derive age from the birth year relative to 2015; the +1 means a person
# born in 2015 gets age 1.
welfare <- welfare %>%
  mutate(age = 2015 - birth + 1)
summary(welfare[["age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   28.00   50.00   48.43   70.00  109.00

# Bucket age into three groups: under 30 -> "young", 30 to 59 -> "middle",
# above 59 -> "old". A missing age matches no condition and stays NA.
welfare <- welfare %>%
  mutate(
    ageg = case_when(
      age < 30 ~ "young",
      age <= 59 ~ "middle",
      age > 59 ~ "old"
    )
  )
table(welfare[["ageg"]])

## 
## middle    old  young 
##   6049   6281   4334

# Bar chart of the number of respondents in each age group.
# ggplot() + geom_bar() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0.
ggplot(data = welfare, aes(x = ageg)) +
  geom_bar()

## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

연령대에 따른 월급 차이

# Average monthly wage within each age group; respondents without a
# recorded wage are left out before averaging.
ageg_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg) %>%
  summarise(
    mean_income = mean(income)
  )

# Bar chart of the group means (groups appear in alphabetical order).
ggplot(ageg_income, aes(ageg, mean_income)) +
  geom_col()

# Same chart with the age groups fixed in young -> middle -> old order.
ggplot(ageg_income, aes(ageg, mean_income)) +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  geom_col()

# Alternative ordering: bars sorted by ascending mean wage.
ggplot(ageg_income, aes(reorder(ageg, mean_income), mean_income)) +
  geom_col()

9-5 연령대 및 성별 월급 차이

# Mean monthly wage for every age group / sex combination, using only
# respondents with a recorded wage.
# `.groups = "drop"` returns an ungrouped table and avoids the
# "summarise() has grouped output" message.
sex_income <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(ageg, sex) %>%
  summarise(mean_income = mean(income), .groups = "drop")

## `summarise()` has grouped output by 'ageg'. You can override using the
## `.groups` argument.

# Mean wage by age group, split by sex.
# `sex` is still the numeric survey code here (it is recoded to
# "male"/"female" only later in this document), so it is wrapped in factor():
# a numeric fill is a continuous scale and defines no groups, which means
# position = "dodge" cannot separate the bars.

# Stacked bars.
ggplot(data = sex_income,
       aes(x = ageg, y = mean_income, fill = factor(sex))) +
  geom_col() +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  labs(fill = "sex")

# Side-by-side bars.
ggplot(data = sex_income,
       aes(x = ageg, y = mean_income, fill = factor(sex))) +
  geom_col(position = "dodge") +
  scale_x_discrete(limits = c("young", "middle", "old")) +
  labs(fill = "sex")

나이 및 성별 월급 차이

# Mean monthly wage for every age / sex combination, using only respondents
# with a recorded wage.
# `.groups = "drop"` returns an ungrouped table and avoids the
# "summarise() has grouped output" message.
sex_age <- welfare %>%
  filter(!is.na(income)) %>%
  group_by(age, sex) %>%
  summarise(mean_income = mean(income), .groups = "drop")

## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

# Wage curve over age, one line per sex.
# `sex` is numeric at this point, so it is converted with factor(): with a
# numeric colour there is no grouping and geom_line() would draw a single
# zig-zag line through both sexes.
ggplot(data = sex_age,
       aes(x = age, y = mean_income, colour = factor(sex))) +
  geom_line() +
  labs(colour = "sex")

9-6 직업 변수 검토

# Inspect the occupation-code column: first its class, then the number of
# respondents per code.
class(welfare[["code_job"]])

## [1] "numeric"

table(welfare[["code_job"]])

## 
##  111  120  131  132  133  134  135  139  141  149  151  152  153  159  211  212 
##    2   16   10   11    9    3    7   10   35   20   26   18   15   16    8    4 
##  213  221  222  223  224  231  232  233  234  235  236  237  239  241  242  243 
##    3   17   31   12    4   41    5    3    6   48   14    2   29   12    4   63 
##  244  245  246  247  248  251  252  253  254  259  261  271  272  273  274  281 
##    4   33   59   77   38   14  111   24   67  109    4   15   11    4   36   17 
##  283  284  285  286  289  311  312  313  314  320  330  391  392  399  411  412 
##    8   10   26   16    5  140  260  220   84   75   15    4   13   87   47   12 
##  421  422  423  429  431  432  441  442  510  521  522  530  611  612  613  620 
##  124   71    5   14   20   33  154  197  192  353    5  106 1320   11   40    2 
##  630  710  721  722  730  741  742  743  751  752  753  761  762  771  772  773 
##   20   29   30   22   16   27    3   34   34    5   49   69   27   11   61   86 
##  774  780  791  792  799  811  812  819  821  822  823  831  832  841  842  843 
##    7   17    5   21   45   16    1    6    9    9   23    5   17   32   10    4 
##  851  852  853  854  855  861  862  863  864  871  873  874  875  876  881  882 
##   19   13    7   33    9    3   14   17   31    2  257   34   37    2    2    3 
##  891  892  899  910  921  922  930  941  942  951  952  953  991  992  999 1011 
##    8   19   16  102   31   74  289  325   99  125  122   73   45   12  141    2 
## 1012 
##   17

library(readxl)
# Load the occupation lookup table (code_job -> job) from the second sheet
# of the codebook workbook; the first row holds the column names.
# TRUE is spelled out because `T` can be reassigned.
list_job <- read_excel(
  "C:\\Users\\chosun\\Downloads\\Koweps_Codebook.xlsx",
  sheet = 2,
  col_names = TRUE
)
head(list_job)

## # A tibble: 6 × 2
##   code_job job                                
##      <dbl> <chr>                              
## 1      111 의회의원 고위공무원 및 공공단체임원
## 2      112 기업고위임원                       
## 3      120 행정 및 경영지원 관리자            
## 4      131 연구 교육 및 법률 관련 관리자      
## 5      132 보험 및 금융 관리자                
## 6      133 보건 및 사회복지 관련 관리자

welfare에 직업명 결합

# Attach the occupation name to each respondent by matching on code_job.
welfare <- welfare %>%
  left_join(list_job, by = "code_job")

# Spot-check the join on the first ten rows that have an occupation code.
welfare %>%
  filter(!is.na(code_job)) %>%
  select(code_job, job) %>%
  head(n = 10)

##    code_job                                job
## 1       942                   경비원 및 검표원
## 2       762                             전기공
## 3       530 방문 노점 및 통신 판매 관련 종사자
## 4       999        기타 서비스관련 단순 종사원
## 5       312                    경영관련 사무원
## 6       254             문리 기술 및 예능 강사
## 7       510                        영업 종사자
## 8       530 방문 노점 및 통신 판매 관련 종사자
## 9       286   스포츠 및 레크레이션 관련 전문가
## 10      521                   매장 판매 종사자

직업별 월급 차이

# Average monthly wage per occupation, restricted to rows where both the
# occupation name and the wage are known.
job_income <- welfare %>%
  filter(!is.na(job), !is.na(income)) %>%
  group_by(job) %>%
  summarise(
    mean_income = mean(income)
  )

# The ten occupations with the highest mean wage.
top10 <- job_income %>%
  arrange(desc(mean_income)) %>%
  head(10)

# Horizontal bar chart. After coord_flip() the first factor level is drawn
# at the bottom, so ordering by ascending mean_income puts the best-paid
# occupation at the top (same convention as the job frequency charts).
ggplot(data = top10, aes(x = reorder(job, mean_income), y = mean_income)) +
  geom_col() +
  coord_flip()

9-7 성별 직업 빈도표: 성별 나누기

# Inspect the sex column before recoding: its class, then the count per code.
class(welfare[["sex"]])

## [1] "numeric"

table(welfare[["sex"]])

## 
##    1    2 
## 7578 9086

# Turn the code 9 into NA, then count missing vs. non-missing values.
welfare <- welfare %>%
  mutate(sex = ifelse(sex == 9, NA, sex))
table(is.na(welfare[["sex"]]))

## 
## FALSE 
## 16664

# Replace the numeric code with a label: 1 becomes "male", any other
# non-missing code becomes "female".
welfare <- welfare %>%
  mutate(sex = ifelse(sex == 1, "male", "female"))
table(welfare[["sex"]])

## 
## female   male 
##   9086   7578

# Bar chart of the number of respondents per sex.
# ggplot() + geom_bar() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0.
ggplot(data = welfare, aes(x = sex)) +
  geom_bar()

성별 직업 빈도 분석하기

# Ten most frequent occupations among male respondents with a known job.
job_male <- welfare %>%
  filter(!is.na(job), sex == "male") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  slice_head(n = 10)

# Ten most frequent occupations among female respondents with a known job.
job_female <- welfare %>%
  filter(!is.na(job), sex == "female") %>%
  group_by(job) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  slice_head(n = 10)

# Horizontal bar charts of the top-10 occupations; bars are ordered by
# count so the most frequent occupation ends up at the top after the flip.

# Male respondents.
ggplot(job_male, aes(reorder(job, n), n)) +
  coord_flip() +
  geom_col()

# Female respondents.
ggplot(job_female, aes(reorder(job, n), n)) +
  coord_flip() +
  geom_col()

9-8 종교 유무에 따른 이혼율: 종교 유무

# Recode religion: code 1 becomes "yes", any other non-missing code "no".
welfare$religion <- ifelse(welfare$religion == 1, "yes", "no")

# Bar chart of respondents with / without a religion.
# ggplot() + geom_bar() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0.
ggplot(data = welfare, aes(x = religion)) +
  geom_bar()

혼인 상태

# Collapse marital status into two groups: code 1 -> "marriage",
# code 3 -> "divorce"; every other code becomes NA and is excluded from the
# divorce-rate calculations.
welfare$group_marriage <- ifelse(welfare$marriage == 1, "marriage",
                                 ifelse(welfare$marriage == 3, "divorce", NA))

# Bar chart of the resulting groups (NA is drawn as its own bar).
# ggplot() + geom_bar() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0.
ggplot(data = welfare, aes(x = group_marriage)) +
  geom_bar()

종교 유무에 따른 이혼율

# Share of married vs. divorced respondents within each religion group.
# `.groups = "drop_last"` keeps the table grouped by `religion` after the
# count, so sum(n) is the per-religion total; this was previously implicit
# and only hinted at by a console message. The result is ungrouped at the end
# so later steps do not inherit the grouping.
religion_marriage <- welfare %>%
  filter(!is.na(group_marriage)) %>%
  group_by(religion, group_marriage) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  ) %>%
  ungroup()

## `summarise()` has grouped output by 'religion'. You can override using the
## `.groups` argument.

# Keep only the divorce rows: one divorce percentage per religion group.
divorce <- religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(religion, pct)

# Divorce rate (%) by religion.
ggplot(divorce, aes(religion, pct)) +
  geom_col()

종교가 있는 사람들의 이혼율이 종교가 없는 사람들보다 낮다.

연령대 및 종교 유무에 따른 이혼율

# Share of married vs. divorced respondents within each age group /
# religion combination; the "young" age group is excluded.
# `.groups = "drop_last"` keeps the table grouped by `ageg` and `religion`
# after the count, so sum(n) is the total of each age group / religion cell.
# The result is ungrouped at the end.
ageg_religion_marriage <- welfare %>%
  filter(!is.na(group_marriage) & ageg != "young") %>%
  group_by(ageg, religion, group_marriage) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 1)
  ) %>%
  ungroup()

## `summarise()` has grouped output by 'ageg', 'religion'. You can override using
## the `.groups` argument.

# Keep only the divorce rows: one divorce percentage per age group and
# religion group.
df_divorce <- ageg_religion_marriage %>%
  filter(group_marriage == "divorce") %>%
  select(ageg, religion, pct)

# Divorce rate (%) by age group, with religion groups side by side.
ggplot(df_divorce, aes(ageg, pct, fill = religion)) +
  geom_col(position = "dodge")

9-9 지역별 연령대 비율: 지역 코드 목록

# Number of respondents per region code.
table(welfare[["code_region"]])

## 
##    1    2    3    4    5    6    7 
## 2486 3711 2785 2036 1467 1257 2922

# Lookup table that maps the seven region codes (1 to 7) to region names.
list_region <- data.frame(
  code_region = seq_len(7),
  region = c(
    "서울",
    "수도권(인천/경기)",
    "부산/경남/울산",
    "대구/경북",
    "대전/충남",
    "강원/충북",
    "광주/전남/전북/제주도"
  )
)

# Attach the region name to each respondent by matching on code_region.
welfare <- welfare %>%
  left_join(list_region, by = "code_region")

지역별 연령대

# Percentage of each age group within every region.
# `.groups = "drop_last"` keeps the table grouped by `region` after the
# count, so sum(n) is the per-region total; this was previously implicit and
# only hinted at by a console message. The result is ungrouped at the end.
region_ageg <- welfare %>%
  group_by(region, ageg) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    tot_group = sum(n),
    pct = round(n / tot_group * 100, 2)
  ) %>%
  ungroup()

## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

# Horizontal stacked bars: age-group composition (%) of each region.
ggplot(region_ageg, aes(region, pct, fill = ageg)) +
  coord_flip() +
  geom_col()

노년층 비율 내림차순 정렬 (pct 오름차순으로 정렬해 두면 coord_flip() 적용 후 그래프에서는 위에서부터 내림차순으로 표시된다)

# Regions sorted by ascending share of the "old" age group. Used as axis
# limits below: after coord_flip() the first limit is drawn at the bottom,
# so the region with the largest share ends up at the top.
list_order_old <- region_ageg %>%
  filter(ageg == "old") %>%
  arrange(pct)

# Fix the stacking / legend order of the age groups.
# The argument is spelled `levels`; `level` only worked through partial
# argument matching.
region_ageg$ageg <- factor(region_ageg$ageg,
                           levels = c("old", "middle", "young"))

# Region names in plotting order. NOTE(review): the name `order` shadows
# base::order() as a variable; it is kept because later chunks refer to it.
order <- list_order_old$region
order

## [1] "수도권(인천/경기)"     "서울"                  "대전/충남"            
## [4] "부산/경남/울산"        "광주/전남/전북/제주도" "강원/충북"            
## [7] "대구/경북"

# Horizontal stacked bars with the regions arranged according to `order`.
ggplot(region_ageg, aes(region, pct, fill = ageg)) +
  scale_x_discrete(limits = order) +
  coord_flip() +
  geom_col()

연령대 순으로 막대 색깔 나열하기

# Fix the stacking / legend order of the age groups.
# The argument is spelled `levels`; `level` only worked through partial
# argument matching.
region_ageg$ageg <- factor(region_ageg$ageg,
                           levels = c("old", "middle", "young"))

# Horizontal stacked bars with the regions arranged according to `order`
# and the fill colours following the factor levels set above.
ggplot(data = region_ageg, aes(x = region, y = pct, fill = ageg)) +
  geom_col() +
  coord_flip() +
  scale_x_discrete(limits = order)

ch9

2023-11-16