3. Check the variables for missing values
wave1 %>% tabyl(age) # frequency table of age: no missing values
## age n percent
## 14 41 0.006611837
## 15 708 0.114175133
## 16 722 0.116432833
## 17 667 0.107563296
## 18 35 0.005644251
## 24 24 0.003870343
## 25 577 0.093049508
## 26 678 0.109337204
## 27 647 0.104338010
## 28 87 0.014029995
## 34 22 0.003547815
## 35 502 0.080954685
## 36 618 0.099661345
## 37 772 0.124496049
## 38 101 0.016287696
wave1 %>% tabyl(sex_gen) # frequency table of sex_gen: no missing values
## sex_gen n percent
## 1 3029 0.4884696
## 2 3172 0.5115304
wave1 %>% tabyl(relstat) # 34 cases are coded -7 and need cleaning
## relstat n percent
## -7 34 0.0054829866
## 1 2448 0.3947750363
## 2 1012 0.1631994840
## 3 660 0.1064344461
## 4 1735 0.2797935817
## 5 23 0.0037090792
## 6 146 0.0235445896
## 7 63 0.0101596517
## 8 76 0.0122560877
## 9 3 0.0004837929
## 10 1 0.0001612643
wave1 %>% tabyl(nkids) # frequency table of nkids: no missing values
## nkids n percent
## 0 4091 0.6597323012
## 1 856 0.1380422512
## 2 877 0.1414288018
## 3 287 0.0462828576
## 4 62 0.0099983874
## 5 19 0.0030640219
## 6 6 0.0009675859
## 7 1 0.0001612643
## 10 2 0.0003225286
wave1 %>% tabyl(hlt1) # 10 cases are coded -1 or -2 and need cleaning
## hlt1 n percent
## -2 7 0.0011288502
## -1 3 0.0004837929
## 1 107 0.0172552814
## 2 580 0.0935333011
## 3 1172 0.1890017739
## 4 2785 0.4491211095
## 5 1547 0.2494758910
wave1 %>% tabyl(sd30) # 28 cases are coded -1 or -2 and need cleaning
## sd30 n percent
## -2 19 0.0030640219
## -1 9 0.0014513788
## 1 1875 0.3023705854
## 2 2008 0.3238187389
## 3 388 0.0625705531
## 4 5 0.0008063216
## 5 147 0.0237058539
## 6 65 0.0104821803
## 7 1685 0.2717303661
wave1 %>% tabyl(sat1i4) # 9 cases are coded -1 or -2 and need cleaning
## sat1i4 n percent
## -2 3 0.0004837929
## -1 6 0.0009675859
## 0 28 0.0045154007
## 1 21 0.0033865506
## 2 55 0.0088695372
## 3 70 0.0112885019
## 4 80 0.0129011450
## 5 237 0.0382196420
## 6 271 0.0437026286
## 7 486 0.0783744557
## 8 901 0.1452991453
## 9 1181 0.1904531527
## 10 2862 0.4615384615
4. Recoding
# Build the analysis sample from wave1: age and nkids are carried over as they
# are, the other variables are recoded so that negative codes become NA, and
# every row with at least one NA is removed at the end.
wave1b <- wave1 %>%
  transmute(
    age,
    nkids,
    # family satisfaction: negative codes are missing, otherwise numeric
    famsat = if_else(sat1i4 < 0, NA_real_, as.numeric(sat1i4)),
    # sex as a categorical variable, without unused levels
    gender = fct_drop(as_factor(sex_gen)),
    # relationship status as a categorical variable
    relstat = as_factor(relstat),
    # cleaned relationship status: "-7 Incomplete data" is set to NA, the
    # result is turned back into a factor and unused levels are dropped
    relstat_new1 = fct_drop(
      as_factor(na_if(as.character(relstat), "-7 Incomplete data"))
    ),
    # self-rated health: negative codes are missing, otherwise numeric
    health = if_else(hlt1 < 0, NA_real_, as.numeric(hlt1)),
    # religion: codes 1-6 mean "Yes", code 7 means "No", negative codes are
    # missing; "No" is placed first so that it is the reference level
    religion = fct_relevel(
      as_factor(
        case_when(
          sd30 %in% 1:6 ~ "Yes",
          sd30 == 7 ~ "No",
          sd30 < 0 ~ NA_character_
        )
      ),
      "No", "Yes"
    )
  ) %>%
  drop_na() # remove every observation that has a missing value
# the sample size drops from 6201 to 6126