# Load the CFPS 2016 Stata files, reading their text with the GB2312 encoding.
# Child file.
cfps2016child <- read_dta(
  file = "../cfps/data/2016AllData/cfps2016child_201807.dta",
  encoding = "GB2312"
)
# Adult file (currently not loaded).
# cfps2016adult <- read_dta(
#   file = "../cfps/data/2016AllData/cfps2016adult_201808.dta",
#   encoding = "GB2312"
# )
# Family "famconf" file.
cfps2016familyconf <- read_dta(
  file = "../cfps/data/2016AllData/cfps2016famconf_201804.dta",
  encoding = "GB2312"
)
# Family "famecon" file.
cfps2016familycon <- read_dta(
  file = "../cfps/data/2016AllData/cfps2016famecon_201807.dta",
  encoding = "GB2312"
)
# cfps2016familycon %>% sjPlot::view_df()

儿童变量选取,并处理

# Tabulate wd402 to inspect which values (including negative codes) occur.
count(cfps2016child, wd402)
## # A tibble: 106 x 2
##             wd402     n
##         <dbl+lbl> <int>
##  1  -8 [不适用]    7186
##  2  -2 [拒绝回答]     5
##  3  -1 [不知道]      44
##  4   0               40
##  5   1                2
##  6   9                1
##  7  30                1
##  8  50                3
##  9  69                1
## 10 100                6
## # ... with 96 more rows
# Build the child-level analysis sample: keep the variables of interest and
# drop rows that carry survey missing codes in any of them.
child <- cfps2016child %>%
  dplyr::select(
    fid16,
    wf601m, # gives up watching TV for the child's study
    wf602m, # often talks with the child about school
    wf603m, # asks the child to finish homework
    wf604m, # checks the child's homework
    wf605m, # stops the child from watching TV
    wf606m, # restricts the programmes the child watches
    wh9, # expected level of education
    pd501b, # school education spending, past 12 months (yuan)
    pd503r, # extracurricular tutoring fees, past 12 months (yuan)
    pd577r, # other expenses, past 12 months (yuan)
    # pd5ckp, # confirmation of total education spending
    pa301, # child's current hukou status
    wz301, # parents care about the child's education
    wz302, # parents actively communicate with the child
    cfps_age, cfps_gender, # urban16
    wd4 # whether saving for education
  ) %>%
  # Involvement items: -1 don't know, -8 not applicable, 79 situation unknown.
  filter(
    !wf601m %in% c(-1, -8, 79),
    !wf602m %in% c(-1, -8, 79),
    !wf603m %in% c(-1, -8, 79),
    !wf604m %in% c(-1, -8, 79),
    !wf605m %in% c(-1, -8, 79),
    !wf606m %in% c(-1, -8, 79)
  ) %>%
  filter(!wh9 %in% c(-1, -8, -2)) %>%
  filter(!pd501b %in% c(-1, -8, -2)) %>%
  # pd503r is a yuan amount like pd501b/pd577r, so it uses the same missing
  # codes: -2 must be excluded (it would otherwise reach log() later) and 79
  # is a legitimate amount, not the "situation unknown" code of the wf items.
  filter(!pd503r %in% c(-1, -8, -2)) %>%
  filter(!pd577r %in% c(-1, -8, -2)) %>%
  # filter(pd5ckp == 5) %>%
  filter(pa301 %in% c(1, 3)) %>%
  filter(!wz301 %in% c(-8, 79)) %>%
  # wz302 gets the same screening as wz301 (wz301 used to be filtered twice
  # while wz302 was never screened).
  filter(!wz302 %in% c(-8, 79), cfps_age != -1, wd4 %in% c(1, 5)) %>%
  # filter(cfps_age %in% c(10 ~15)) %>%
  drop_na()

儿童表变量重编码

# Log-transform a spending amount: log(x) where x > 0, 0 where x <= 0, and NA
# where x is NA. The input is converted to a plain double first, so log() is
# only evaluated on positive values (no NaN warnings) and the result does not
# inherit the value labels of the raw column.
log_positive <- function(x) {
  x <- as.numeric(x)
  out <- rep(0, length(x))
  out[is.na(x)] <- NA_real_
  positive <- !is.na(x) & x > 0
  out[positive] <- log(x[positive])
  out
}

# Add log-spending columns and recode the expected education level (wh9)
# into years of schooling.
child_c <- child %>%
  mutate(
    pd501b_c = log_positive(pd501b),
    pd503r_c = log_positive(pd503r),
    pd577r_c = log_positive(pd577r),
    # NOTE(review): code 1 is not mapped and therefore becomes NA; confirm
    # against the codebook that this is intended.
    wh9_c = case_when(
      wh9 == 9 ~ 0,
      wh9 == 2 ~ 6,
      wh9 == 3 ~ 9,
      wh9 == 4 ~ 12, # senior high school
      wh9 == 5 ~ 15, # junior college
      wh9 == 6 ~ 16, # bachelor's degree
      wh9 == 7 ~ 19,
      wh9 == 8 ~ 23,
      TRUE ~ NA_real_
    )
  )
## Warning in log(pd503r): 产生了NaNs
# Reverse-code a 1-5 item: 1 -> 5, 2 -> 4, 3 -> 3, 4 -> 2, 5 -> 1.
# Any other value (including NA) is returned as NA_real_.
change <- function(x) {
  original <- 1:5
  reversed <- c(5, 4, 3, 2, 1)
  # match() yields NA for values outside 1-5, which indexes to NA_real_.
  reversed[match(x, original)]
}

# Reverse-code the six wf60x items and the two wz30x items with change().
child_change <- mutate(
  child_c,
  wf601m_c = change(wf601m),
  wf602m_c = change(wf602m),
  wf603m_c = change(wf603m),
  wf604m_c = change(wf604m),
  wf605m_c = change(wf605m),
  wf606m_c = change(wf606m),
  wz301_c = change(wz301),
  wz302_c = change(wz302)
)
# wf_average: mean of the six reverse-coded wf60x items.
child_change <- mutate(
  child_change,
  wf_average = (
    wf601m_c + wf602m_c + wf603m_c + wf604m_c + wf605m_c + wf606m_c
  ) / 6
)
head(child_change)
## # A tibble: 6 x 30
##    fid16  wf601m  wf602m  wf603m  wf604m  wf605m  wf606m     wh9 pd501b
##   <dbl+> <dbl+l> <dbl+l> <dbl+l> <dbl+l> <dbl+l> <dbl+l> <dbl+l> <dbl+>
## 1 110011 2 [经常(~ 4 [很少(~ 5 [从不]~  5 [从不]~  2 [经常(~ 5 [从不]~  4 [高中]~    3000
## 2 110020 3 [偶尔(~ 2 [经常(~ 2 [经常(~ 3 [偶尔(~ 3 [偶尔(~ 3 [偶尔(~ 6 [大学本~   3500
## 3 110043 3 [偶尔(~ 4 [很少(~ 3 [偶尔(~ 2 [经常(~ 3 [偶尔(~ 4 [很少(~ 5 [大专]~    3000
## 4 120032 5 [从不]~  2 [经常(~ 5 [从不]~  4 [很少(~ 4 [很少(~ 4 [很少(~ 6 [大学本~     98
## 5 120072 3 [偶尔(~ 2 [经常(~ 2 [经常(~ 3 [偶尔(~ 2 [经常(~ 4 [很少(~ 4 [高中]~     250
## 6 120095 5 [从不]~  4 [很少(~ 1 [很经常~ 5 [从不]~  1 [很经常~ 5 [从不]~  6 [大学本~   1000
## # ... with 21 more variables: pd503r <dbl+lbl>, pd577r <dbl+lbl>,
## #   pa301 <dbl+lbl>, wz301 <dbl+lbl>, wz302 <dbl+lbl>, cfps_age <dbl+lbl>,
## #   cfps_gender <dbl+lbl>, wd4 <dbl+lbl>, pd501b_c <dbl+lbl>,
## #   pd503r_c <dbl+lbl>, pd577r_c <dbl+lbl>, wh9_c <dbl>, wf601m_c <dbl>,
## #   wf602m_c <dbl>, wf603m_c <dbl>, wf604m_c <dbl>, wf605m_c <dbl>,
## #   wf606m_c <dbl>, wz301_c <dbl>, wz302_c <dbl>, wf_average <dbl>

家庭表变量选取

# family %>% count(tb4_a16_f)

# Put an education code on a comparable scale. This script recodes code 9 to
# 0 years of schooling (see change_2), so comparing raw codes would wrongly
# rank 9 above every other level; it is ranked lowest here instead.
# NOTE(review): confirm the meaning of code 9 against the codebook.
edu_rank <- function(code) {
  code <- as.numeric(code)
  code[code %in% 9] <- 0
  code
}

# Parents' highest education per family member row, plus the higher of the
# two parental codes (max_edu keeps the raw category code).
familyconf <- cfps2016familyconf %>%
  dplyr::select(
    fid16,
    tb4_a16_m, # mother's highest education level
    tb4_a16_f # father's highest education level
  ) %>%
  filter(
    !tb4_a16_m %in% c(-8, -1, -2, -9),
    !tb4_a16_f %in% c(-8, -1, -2, -9)
  ) %>%
  mutate(
    max_edu = if_else(
      edu_rank(tb4_a16_f) > edu_rank(tb4_a16_m),
      tb4_a16_f,
      tb4_a16_m
    )
  ) %>%
  drop_na()

家庭表变量重编码

# Recode an education category code into years of schooling:
# 9 -> 0, 2 -> 6, 3 -> 9, 4 -> 12, 5 -> 15, 6 -> 16, 7 -> 19, 8 -> 23.
# Every other value (including NA) is returned as NA_real_.
# NOTE(review): code 1 is not mapped and becomes NA; confirm against the
# codebook that this is intended.
change_2 <- function(x) {
  codes <- c(9, 2, 3, 4, 5, 6, 7, 8)
  years <- c(0, 6, 9, 12, 15, 16, 19, 23)
  # match() yields NA for unmapped codes, which indexes to NA_real_.
  years[match(x, codes)]
}
# Add years-of-schooling versions of both parents' education codes and drop
# rows where any column is NA (including codes change_2() does not map).
family_change <- drop_na(
  mutate(
    familyconf,
    tb4_a16_m_c = change_2(tb4_a16_m),
    tb4_a16_f_c = change_2(tb4_a16_f)
  )
)

# Rows per family id, to check for repeated fid16 values.
count(family_change, fid16)
## # A tibble: 9,044 x 2
##     fid16     n
##     <dbl> <int>
##  1 100051     1
##  2 100160     1
##  3 100376     1
##  4 100435     3
##  5 100453     3
##  6 100551     2
##  7 100724     1
##  8 100810     1
##  9 100879     1
## 10 101021     1
## # ... with 9,034 more rows
# Keep the first row for each fid16, retaining all columns.
bbbb <- dplyr::distinct(family_change, fid16, .keep_all = TRUE)
# Tabulate fincome1 to inspect its values.
count(cfps2016familycon, fincome1)
## # A tibble: 3,999 x 2
##     fincome1     n
##    <dbl+lbl> <int>
##  1         5     3
##  2        50     1
##  3        85     1
##  4       160     2
##  5       200     2
##  6       230     1
##  7       300     1
##  8       400     3
##  9       429     1
## 10       500     7
## # ... with 3,989 more rows
# Family income table: keep the family id and finc, dropping rows whose finc
# is one of the negative codes -8, -1, -2 or is NA.
familycon <- cfps2016familycon %>%
  dplyr::select(fid16, finc) %>% # finc: income
  filter(!finc %in% c(-8, -1, -2)) %>%
  drop_na()


# Rows per family id, to check for repeated fid16 values.
count(familycon, fid16)
## # A tibble: 13,842 x 2
##        fid16     n
##    <dbl+lbl> <int>
##  1    100051     1
##  2    100160     1
##  3    100286     1
##  4    100376     1
##  5    100435     1
##  6    100453     1
##  7    100531     1
##  8    100551     1
##  9    100569     1
## 10    100724     1
## # ... with 13,832 more rows
# Keep the first row for each fid16, retaining all columns.
aaaa <- dplyr::distinct(familycon, fid16, .keep_all = TRUE)
library(visdat)
## Warning: package 'visdat' was built under R version 3.6.1
# Visual overview of the child and family tables with visdat.
# adult %>% vis_dat()
vis_dat(child)

vis_dat(family_change)

#合并

# One row per family: keep the first occurrence of each fid16.
fam <- family_change[!duplicated(family_change$fid16), ]
# Rows per fid16 in the de-duplicated table. count() is given the column
# itself: group_by("fid16") would group on a constant string instead of the
# column and leave the result grouped.
bbb <- fam %>% count(fid16)
# Attach the de-duplicated family table to the child table, then drop every
# row that has an NA in any column (including children without a family match).
all <- child_change %>%
  left_join(bbbb, by = "fid16") %>%
  drop_na()
## Warning: Column `fid16` has different attributes on LHS and RHS of join
#all <-child_change %>% left_join(fam,by ="fid16")%>%
 # drop_na()
#all %>% is.na()
 #all_1 <-all %>% left_join(aaaa,by ="fid16")%>%
 #  drop_na()
#all_1 %>% colnames
#ready_cor <- all_1 %>%
  #dplyr::select(finc_c,max_edu,wf_average,wz302_c,wz301_c,cfps_gender,pd503r_c,wh9_c,pd501b_c,
  #              pd577r_c)
#cor_m <- all_1 %>%
 # cor(starts_with("wf60"))
 #write.csv(ready_cor,"all_5.csv")