Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).
library(skimr)
library(tidyverse) # Recoding and cleaning
library(haven) # Import data.
library(janitor) # Tabulation
library(ggplot2) # For plotting

# Import the Stata files for waves 1-5 with haven::read_dta().
# The wave1 import used to sit on the same line as the comment above,
# which commented it out, so `wave1` was never created.
wave1 <- read_dta("anchor1_50percent_Eng.dta")
wave2 <- read_dta("anchor2_50percent_Eng.dta")
wave3 <- read_dta("anchor3_50percent_Eng.dta")
wave4 <- read_dta("anchor4_50percent_Eng.dta")
wave5 <- read_dta("anchor5_50percent_Eng.dta")
wave6 <- read_dta("anchor6_50percent_Eng.dta")
Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).
#check coding across 6 waves
# Frequency table of the gender variable (sex_gen) of one wave.
#
# df: a wave data set with a `sex_gen` column.
# Returns the table() of `sex_gen` after conversion with as_factor().
sex_fun <- function(df) {
  gender <- df$sex_gen
  table(as_factor(gender))
}
# Tabulate the gender codes of waves 1-6, one column per wave.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = sex_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff                   0     0     0     0     0     0
## -7 Incomplete data                    0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## 1 Male                             3029  2197  1905  1668  1493  1342
## 2 Female                           3172  2339  2050  1813  1626  1477
# Gender is coded the same way in all six waves.
# Frequency table of the labor force status variable (lfs) of one wave.
#
# df: a wave data set with an `lfs` column.
# Returns the table() of `lfs` after conversion with as_factor().
lfs_fun <- function(df) {
  status <- df$lfs
  table(as_factor(status))
}
# Tabulate the labor force status codes of waves 1-6, one column per wave.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = lfs_fun)
##                                                        wave1 wave2 wave3 wave4
## -7 Incomplete data                                        12    22    24     7
## -3 Does not apply                                          0     0     0     0
## 1 nw, education                                         2229  1441  1093   725
## 2 nw, parental leave                                     237   146   148   116
## 3 nw, homemaker                                          253   120    97    85
## 4 nw, unemployed                                         297   235   180   156
## 5 nw, military service                                     9     8    33    30
## 6 nw, retired                                             19    19    21    22
## 7 nw, other                                               33    15    21    26
## 8 w, vocational training                                 308   387   371   381
## 9 w, full-time employment                               1929  1337  1206  1159
## 10 w, part-time employment                               468   419   405   415
## 11 w, marginal employment (geringfügige Beschäftigung)   142   137   129   144
## 12 w, self-employed                                      202   164   159   153
## 13 w, other                                               63    86    68    62
##                                                        wave5 wave6
## -7 Incomplete data                                         4     1
## -3 Does not apply                                          0     0
## 1 nw, education                                          499   425
## 2 nw, parental leave                                      90    78
## 3 nw, homemaker                                           69    50
## 4 nw, unemployed                                         133   124
## 5 nw, military service                                    44    16
## 6 nw, retired                                             22    27
## 7 nw, other                                               28    19
## 8 w, vocational training                                 324   232
## 9 w, full-time employment                               1127  1109
## 10 w, part-time employment                               409   388
## 11 w, marginal employment (geringfügige Beschäftigung)   146   146
## 12 w, self-employed                                      167   156
## 13 w, other                                               57    48
# Labor force status is coded the same way in all six waves.
# Frequency table of the life satisfaction variable (sat6) of one wave.
#
# df: a wave data set with a `sat6` column.
# Returns the table() of `sat6` after conversion with as_factor().
sat_fun <- function(df) {
  life_sat <- df$sat6
  table(as_factor(life_sat))
}
# Tabulate the life satisfaction codes of waves 1-6, one column per wave.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = sat_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251
# Life satisfaction is coded the same way in all six waves.
# Frequency table of the family satisfaction variable (sat1i4) of one wave.
#
# df: a wave data set with a `sat1i4` column.
# Returns the table() of `sat1i4` after conversion with as_factor().
fam_fun <- function(df) {
  family_sat <- df$sat1i4
  table(as_factor(family_sat))
}
# Tabulate the family satisfaction codes (sat1i4) of waves 1-6, one column
# per wave. This call previously passed sat_fun, which tabulates sat6.
# NOTE(review): the table printed below is identical to the sat6 table, so it
# came from the old call and must be regenerated by re-running this line.
sapply(mget(paste0("wave", 1:6)), fam_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
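## 10 Very satisfied                   749   528   390   308   245   251
# Same coding for family satisfaction. (Caveat: the table above repeats the sat6 counts, so the coding of sat1i4 has not actually been checked here.)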
# Frequency table of the work satisfaction variable (sat1i1) of one wave.
#
# df: a wave data set with a `sat1i1` column.
# Returns the table() of `sat1i1` after conversion with as_factor().
work_fun <- function(df) {
  job_sat <- df$sat1i1
  table(as_factor(job_sat))
}
# Tabulate the work satisfaction codes (sat1i1) of waves 1-6, one column
# per wave. This call previously passed sat_fun, which tabulates sat6.
# NOTE(review): the table printed below is identical to the sat6 table, so it
# came from the old call and must be regenerated by re-running this line.
sapply(mget(paste0("wave", 1:6)), work_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
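## 10 Very satisfied                   749   528   390   308   245   251
# Same coding for work satisfaction. (Caveat: the table above repeats the sat6 counts, so the coding of sat1i1 has not actually been checked here.)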
# Summary statistics of the age variable of one wave.
#
# df: a wave data set with an `age` column.
# Returns the result of summary() applied to `age`.
age_fun <- function(df) {
  ages <- df$age
  summary(ages)
}
# Summarise age in waves 1-6, one column per wave.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = age_fun)
##            wave1   wave2    wave3    wave4    wave5    wave6
## Min.    14.00000 15.0000 16.00000 17.00000 18.00000 19.00000
## 1st Qu. 17.00000 17.0000 18.00000 19.00000 20.00000 21.00000
## Median  26.00000 27.0000 28.00000 29.00000 30.00000 31.00000
## Mean    25.83728 26.4235 27.24526 28.46596 29.54569 30.66761
## 3rd Qu. 35.00000 36.0000 37.00000 38.00000 39.00000 40.00000
## Max.    38.00000 39.0000 40.00000 41.00000 42.00000 43.00000
# No implausible age values in any wave.
# Summary statistics of the number of biological children (nkidsbio)
# of one wave.
#
# df: a wave data set with an `nkidsbio` column.
# Returns the result of summary() applied to `nkidsbio`.
kid_fun <- function(df) {
  n_children <- df$nkidsbio
  summary(n_children)
}
# Summarise the number of children in waves 1-6, one column per wave.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = kid_fun)
##             wave1  wave2      wave3      wave4      wave5      wave6
## Min.    -7.000000 -7.000  0.0000000  0.0000000  0.0000000  0.0000000
## 1st Qu.  0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Median   0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Mean     0.600387  0.625  0.6558786  0.7199081  0.7608208  0.8031217
## 3rd Qu.  1.000000  1.000  1.0000000  1.0000000  2.0000000  2.0000000
## Max.    10.000000 10.000 10.0000000 10.0000000 10.0000000 10.0000000
# In waves 1 and 2, some respondents have a negative number of children (minimum of -7).
Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).
# Reduce one wave to the analysis variables and recode missing-value codes.
#
# df: a wave data set with the columns id, wave, age, sex_gen, lfs,
#     nkidsbio, sat6, sat1i4 and sat1i1.
# Returns a data frame holding only id, wave, age, sex, lfs, kidno, sat,
# family_sat and work_sat:
#   - sex and lfs are factors built from the value labels;
#   - the lfs level "-7 Incomplete data" becomes NA;
#   - negative values of nkidsbio, sat6, sat1i4 and sat1i1 become NA and
#     the variables are returned as plain numerics.
clean_fun <- function(df) {
  # Negative codes flag missing answers: turn them into NA, keep the rest.
  negative_to_na <- function(x) {
    case_when(
      x < 0 ~ NA_real_,
      TRUE ~ as.numeric(x)
    )
  }

  df %>%
    transmute(
      id,
      wave,
      age,
      sex = as_factor(sex_gen),
      lfs = as_factor(lfs),
      # Blank out the "incomplete data" level via character, then
      # convert back to a factor.
      lfs = as_factor(
        case_when(
          lfs == "-7 Incomplete data" ~ NA_character_,
          TRUE ~ as.character(lfs)
        )
      ),
      kidno = negative_to_na(nkidsbio),
      sat = negative_to_na(sat6),
      family_sat = negative_to_na(sat1i4),
      work_sat = negative_to_na(sat1i1)
    )
}
# Clean waves 1-5; results go to new objects so the raw waves are kept.
wave1a <- wave1 %>% clean_fun()
wave2a <- wave2 %>% clean_fun()
wave3a <- wave3 %>% clean_fun()
wave4a <- wave4 %>% clean_fun()
wave5a <- wave5 %>% clean_fun()
wave6a <- clean_fun(wave6)
Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).
skim(wave1a)
| Name | wave1a | 
| Number of rows | 6201 | 
| Number of columns | 9 | 
| _______________________ | |
| Column type frequency: | |
| factor | 2 | 
| numeric | 7 | 
| ________________________ | |
| Group variables | None | 
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts | 
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | 2 F: 3172, 1 M: 3029, -10: 0, -7 : 0 | 
| lfs | 12 | 1 | FALSE | 13 | 1 n: 2229, 9 w: 1929, 10 : 468, 8 w: 308 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 375526333.49 | 2.15848e+08 | 174000 | 186678000 | 378910000 | 561330000 | 749731000 | ▇▇▇▇▇ | 
| wave | 0 | 1 | 1.00 | 0.00000e+00 | 1 | 1 | 1 | 1 | 1 | ▁▁▇▁▁ | 
| age | 0 | 1 | 25.84 | 8.36000e+00 | 14 | 17 | 26 | 35 | 38 | ▇▁▇▁▇ | 
| kidno | 4 | 1 | 0.61 | 1.00000e+00 | 0 | 0 | 0 | 1 | 10 | ▇▁▁▁▁ | 
| sat | 5 | 1 | 7.61 | 1.75000e+00 | 0 | 7 | 8 | 9 | 10 | ▁▁▂▇▅ | 
| family_sat | 9 | 1 | 8.61 | 1.88000e+00 | 0 | 8 | 9 | 10 | 10 | ▁▁▁▃▇ | 
| work_sat | 23 | 1 | 7.23 | 2.22000e+00 | 0 | 6 | 8 | 9 | 10 | ▁▁▃▇▆ | 
skim(wave2a)
| Name | wave2a | 
| Number of rows | 4536 | 
| Number of columns | 9 | 
| _______________________ | |
| Column type frequency: | |
| factor | 2 | 
| numeric | 7 | 
| ________________________ | |
| Group variables | None | 
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts | 
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | 2 F: 2339, 1 M: 2197, -10: 0, -7 : 0 | 
| lfs | 22 | 1 | FALSE | 13 | 1 n: 1441, 9 w: 1337, 10 : 419, 8 w: 387 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 374873331.79 | 216507934.68 | 423000 | 186121000 | 379368000 | 563072250 | 749731000 | ▇▇▇▇▇ | 
| wave | 0 | 1 | 2.00 | 0.00 | 2 | 2 | 2 | 2 | 2 | ▁▁▇▁▁ | 
| age | 0 | 1 | 26.42 | 8.54 | 15 | 17 | 27 | 36 | 39 | ▇▁▆▁▆ | 
| kidno | 4 | 1 | 0.63 | 1.02 | 0 | 0 | 0 | 1 | 10 | ▇▁▁▁▁ | 
| sat | 6 | 1 | 7.73 | 1.63 | 0 | 7 | 8 | 9 | 10 | ▁▁▂▇▅ | 
| family_sat | 9 | 1 | 8.48 | 1.81 | 0 | 8 | 9 | 10 | 10 | ▁▁▁▃▇ | 
| work_sat | 14 | 1 | 7.17 | 2.23 | 0 | 6 | 8 | 9 | 10 | ▁▁▃▇▅ | 
skim(wave3a)
| Name | wave3a | 
| Number of rows | 3955 | 
| Number of columns | 9 | 
| _______________________ | |
| Column type frequency: | |
| factor | 2 | 
| numeric | 7 | 
| ________________________ | |
| Group variables | None | 
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts | 
|---|---|---|---|---|---|
| sex | 0 | 1.00 | FALSE | 2 | 2 F: 2050, 1 M: 1905, -10: 0, -7 : 0 | 
| lfs | 24 | 0.99 | FALSE | 13 | 9 w: 1206, 1 n: 1093, 10 : 405, 8 w: 371 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 375488344.88 | 217691561.30 | 174000 | 186174500 | 379202000 | 563958500 | 749731000 | ▇▇▇▇▇ | 
| wave | 0 | 1 | 3.00 | 0.00 | 3 | 3 | 3 | 3 | 3 | ▁▁▇▁▁ | 
| age | 0 | 1 | 27.25 | 8.51 | 16 | 18 | 28 | 37 | 40 | ▇▁▆▁▆ | 
| kidno | 0 | 1 | 0.66 | 1.03 | 0 | 0 | 0 | 1 | 10 | ▇▁▁▁▁ | 
| sat | 5 | 1 | 7.59 | 1.66 | 0 | 7 | 8 | 9 | 10 | ▁▁▂▇▅ | 
| family_sat | 1 | 1 | 8.34 | 1.78 | 0 | 8 | 9 | 10 | 10 | ▁▁▂▅▇ | 
| work_sat | 16 | 1 | 7.22 | 2.14 | 0 | 6 | 8 | 9 | 10 | ▁▁▃▇▅ | 
skim(wave4a)
| Name | wave4a | 
| Number of rows | 3481 | 
| Number of columns | 9 | 
| _______________________ | |
| Column type frequency: | |
| factor | 2 | 
| numeric | 7 | 
| ________________________ | |
| Group variables | None | 
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts | 
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | 2 F: 1813, 1 M: 1668, -10: 0, -7 : 0 | 
| lfs | 7 | 1 | FALSE | 13 | 9 w: 1159, 1 n: 725, 10 : 415, 8 w: 381 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 374317614.48 | 218152720.53 | 423000 | 183307000 | 377376000 | 565701000 | 749731000 | ▇▇▇▇▇ | 
| wave | 0 | 1.00 | 4.00 | 0.00 | 4 | 4 | 4 | 4 | 4 | ▁▁▇▁▁ | 
| age | 0 | 1.00 | 28.47 | 8.56 | 17 | 19 | 29 | 38 | 41 | ▇▁▆▁▇ | 
| kidno | 0 | 1.00 | 0.72 | 1.07 | 0 | 0 | 0 | 1 | 10 | ▇▁▁▁▁ | 
| sat | 4 | 1.00 | 7.53 | 1.70 | 0 | 7 | 8 | 9 | 10 | ▁▁▂▇▅ | 
| family_sat | 4 | 1.00 | 8.31 | 1.78 | 0 | 8 | 9 | 10 | 10 | ▁▁▂▅▇ | 
| work_sat | 28 | 0.99 | 7.26 | 2.17 | 0 | 6 | 8 | 9 | 10 | ▁▁▂▇▅ | 
skim(wave5a)
| Name | wave5a | 
| Number of rows | 3119 | 
| Number of columns | 9 | 
| _______________________ | |
| Column type frequency: | |
| factor | 2 | 
| numeric | 7 | 
| ________________________ | |
| Group variables | None | 
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts | 
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | 2 F: 1626, 1 M: 1493, -10: 0, -7 : 0 | 
| lfs | 4 | 1 | FALSE | 13 | 9 w: 1127, 1 n: 499, 10 : 409, 8 w: 324 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 376790387.30 | 218227325.96 | 423000 | 184491500 | 381315000 | 567148500 | 749731000 | ▇▇▇▇▇ | 
| wave | 0 | 1.00 | 5.00 | 0.00 | 5 | 5 | 5 | 5 | 5 | ▁▁▇▁▁ | 
| age | 0 | 1.00 | 29.55 | 8.54 | 18 | 20 | 30 | 39 | 42 | ▇▁▆▁▇ | 
| kidno | 0 | 1.00 | 0.76 | 1.09 | 0 | 0 | 0 | 2 | 10 | ▇▁▁▁▁ | 
| sat | 3 | 1.00 | 7.53 | 1.62 | 0 | 7 | 8 | 9 | 10 | ▁▁▂▇▃ | 
| family_sat | 2 | 1.00 | 8.23 | 1.76 | 0 | 7 | 9 | 10 | 10 | ▁▁▂▆▇ | 
| work_sat | 21 | 0.99 | 7.32 | 2.09 | 0 | 7 | 8 | 9 | 10 | ▁▁▂▇▅ | 
skim(wave6a)
| Name | wave6a | 
| Number of rows | 2819 | 
| Number of columns | 9 | 
| _______________________ | |
| Column type frequency: | |
| factor | 2 | 
| numeric | 7 | 
| ________________________ | |
| Group variables | None | 
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts | 
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | 2 F: 1477, 1 M: 1342, -10: 0, -7 : 0 | 
| lfs | 1 | 1 | FALSE | 13 | 9 w: 1109, 1 n: 425, 10 : 388, 8 w: 232 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 375146854.56 | 217789668.51 | 423000 | 184944000 | 378192000 | 564950000 | 749197000 | ▇▇▇▇▇ | 
| wave | 0 | 1.00 | 6.00 | 0.00 | 6 | 6 | 6 | 6 | 6 | ▁▁▇▁▁ | 
| age | 0 | 1.00 | 30.67 | 8.51 | 19 | 21 | 31 | 40 | 43 | ▇▁▆▁▇ | 
| kidno | 0 | 1.00 | 0.80 | 1.10 | 0 | 0 | 0 | 2 | 10 | ▇▁▁▁▁ | 
| sat | 4 | 1.00 | 7.59 | 1.61 | 0 | 7 | 8 | 9 | 10 | ▁▁▂▇▃ | 
| family_sat | 1 | 1.00 | 8.26 | 1.78 | 0 | 8 | 9 | 10 | 10 | ▁▁▂▅▇ | 
| work_sat | 18 | 0.99 | 7.27 | 2.13 | 0 | 6 | 8 | 9 | 10 | ▁▁▂▇▅ | 
# in wave6, 18 respondents had missing information in work satisfaction.
# wave 3 has the highest missing in labor force status, 24 respondents