3. Check the variables for missing values
wave1 %>% tabyl(age) # age: no missing values
##  age   n     percent
##   14  41 0.006611837
##   15 708 0.114175133
##   16 722 0.116432833
##   17 667 0.107563296
##   18  35 0.005644251
##   24  24 0.003870343
##   25 577 0.093049508
##   26 678 0.109337204
##   27 647 0.104338010
##   28  87 0.014029995
##   34  22 0.003547815
##   35 502 0.080954685
##   36 618 0.099661345
##   37 772 0.124496049
##   38 101 0.016287696
wave1 %>% tabyl(sex_gen) # sex_gen: no missing values
##  sex_gen    n   percent
##        1 3029 0.4884696
##        2 3172 0.5115304
wave1 %>% tabyl(relstat) # relstat: 34 cases coded -7, needs cleaning
##  relstat    n      percent
##       -7   34 0.0054829866
##        1 2448 0.3947750363
##        2 1012 0.1631994840
##        3  660 0.1064344461
##        4 1735 0.2797935817
##        5   23 0.0037090792
##        6  146 0.0235445896
##        7   63 0.0101596517
##        8   76 0.0122560877
##        9    3 0.0004837929
##       10    1 0.0001612643
wave1 %>% tabyl(nkids) # nkids: no missing values
##  nkids    n      percent
##      0 4091 0.6597323012
##      1  856 0.1380422512
##      2  877 0.1414288018
##      3  287 0.0462828576
##      4   62 0.0099983874
##      5   19 0.0030640219
##      6    6 0.0009675859
##      7    1 0.0001612643
##     10    2 0.0003225286
wave1 %>% tabyl(hlt1) # hlt1: 10 cases coded -1 or -2, needs cleaning
##  hlt1    n      percent
##    -2    7 0.0011288502
##    -1    3 0.0004837929
##     1  107 0.0172552814
##     2  580 0.0935333011
##     3 1172 0.1890017739
##     4 2785 0.4491211095
##     5 1547 0.2494758910
wave1 %>% tabyl(sd30) # sd30: 28 cases coded -1 or -2, needs cleaning
##  sd30    n      percent
##    -2   19 0.0030640219
##    -1    9 0.0014513788
##     1 1875 0.3023705854
##     2 2008 0.3238187389
##     3  388 0.0625705531
##     4    5 0.0008063216
##     5  147 0.0237058539
##     6   65 0.0104821803
##     7 1685 0.2717303661
wave1 %>% tabyl(sat1i4) # sat1i4: 9 cases coded -1 or -2, needs cleaning
##  sat1i4    n      percent
##      -2    3 0.0004837929
##      -1    6 0.0009675859
##       0   28 0.0045154007
##       1   21 0.0033865506
##       2   55 0.0088695372
##       3   70 0.0112885019
##       4   80 0.0129011450
##       5  237 0.0382196420
##       6  271 0.0437026286
##       7  486 0.0783744557
##       8  901 0.1452991453
##       9 1181 0.1904531527
##      10 2862 0.4615384615
 
4. Recoding
# Build the analysis sample from wave1: keep age and nkids unchanged, set the
# negative codes to NA, convert the categorical items to factors, and keep
# only the rows without any missing value.
wave1b <- wave1 %>%
  transmute(
    age,
    nkids,

    # negative codes of sat1i4 are treated as missing
    famsat = if_else(sat1i4 < 0, NA_real_, as.numeric(sat1i4)),

    # sex_gen as a categorical variable, unused levels removed
    gender = fct_drop(as_factor(sex_gen)),

    # relationship status as a categorical variable
    relstat = as_factor(relstat),

    # copy of relstat in which "-7 Incomplete data" is missing; the result is
    # turned back into a factor and its unused levels are dropped
    relstat_new1 = fct_drop(as_factor(
      if_else(
        relstat == "-7 Incomplete data",
        NA_character_,
        as.character(relstat)
      )
    )),

    # negative codes of hlt1 are treated as missing
    health = if_else(hlt1 < 0, NA_real_, as.numeric(hlt1)),

    # negative codes of sd30 are missing, 7 means "no religion" ("No"),
    # 1 to 6 mean "have religion" ("Yes"); "No" is the reference level
    religion = fct_relevel(
      as_factor(
        case_when(
          sd30 < 0 ~ NA_character_,
          sd30 == 7 ~ "No",
          sd30 %in% 1:6 ~ "Yes"
        )
      ),
      "No", "Yes"
    )
  ) %>%
  drop_na() # remove every observation that has a missing value
# sample size changes from 6201 to 6126