No. 1

Question

Now, you are going to compile a dataset that has the variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), work satisfaction (sat1i1).

  • please import the data of waves 1-6 (i.e., from “data anchor1_50percent_Eng.dta” to “data anchor6_50percent_Eng.dta”)

Answer

library(skimr)
library(tidyverse) # Recoding and cleaning
library(haven) # Import data.
library(janitor) # Tabulation
library(ggplot2) # For plotting
# Import the six anchor waves from their Stata files; each wave is kept as its
# own object (wave1 ... wave6) because later steps look them up by name.
wave1 <- haven::read_dta(file = "anchor1_50percent_Eng.dta")
wave2 <- haven::read_dta(file = "anchor2_50percent_Eng.dta")
wave3 <- haven::read_dta(file = "anchor3_50percent_Eng.dta")
wave4 <- haven::read_dta(file = "anchor4_50percent_Eng.dta")
wave5 <- haven::read_dta(file = "anchor5_50percent_Eng.dta")
wave6 <- haven::read_dta(file = "anchor6_50percent_Eng.dta")

No. 2

Question

Now, you are going to compile a dataset that has the variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), work satisfaction (sat1i1).

  • check whether the coding and levels are consistent across 6 waves for these variables:

Answer

#check coding across 6 waves

# Frequency table of gender (sex_gen) for a single wave.
# `df` is one wave's data frame; the codes are converted to a factor first so
# the table rows carry the factor levels rather than bare numbers.
sex_fun <- function(df) {
  gender_codes <- df$sex_gen
  table(as_factor(gender_codes))
}
# Fetch wave1 ... wave6 by name and tabulate gender in each of them; sapply
# lines the six tables up side by side, one column per wave.
sapply(X = mget(x = paste0("wave", seq_len(6))), FUN = sex_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff                   0     0     0     0     0     0
## -7 Incomplete data                    0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## 1 Male                             3029  2197  1905  1668  1493  1342
## 2 Female                           3172  2339  2050  1813  1626  1477
#same coding for gender

# Frequency table of labor force status (lfs) for a single wave.
# `df` is one wave's data frame; the codes are converted to a factor before
# counting.
lfs_fun <- function(df) {
  status_codes <- df$lfs
  table(as_factor(status_codes))
}
# Tabulate labor force status in wave1 ... wave6 (looked up by name) and show
# the counts with one column per wave.
sapply(X = mget(x = paste0("wave", seq_len(6))), FUN = lfs_fun)
##                                                        wave1 wave2 wave3 wave4
## -7 Incomplete data                                        12    22    24     7
## -3 Does not apply                                          0     0     0     0
## 1 nw, education                                         2229  1441  1093   725
## 2 nw, parental leave                                     237   146   148   116
## 3 nw, homemaker                                          253   120    97    85
## 4 nw, unemployed                                         297   235   180   156
## 5 nw, military service                                     9     8    33    30
## 6 nw, retired                                             19    19    21    22
## 7 nw, other                                               33    15    21    26
## 8 w, vocational training                                 308   387   371   381
## 9 w, full-time employment                               1929  1337  1206  1159
## 10 w, part-time employment                               468   419   405   415
## 11 w, marginal employment (geringfügige Beschäftigung)   142   137   129   144
## 12 w, self-employed                                      202   164   159   153
## 13 w, other                                               63    86    68    62
##                                                        wave5 wave6
## -7 Incomplete data                                         4     1
## -3 Does not apply                                          0     0
## 1 nw, education                                          499   425
## 2 nw, parental leave                                      90    78
## 3 nw, homemaker                                           69    50
## 4 nw, unemployed                                         133   124
## 5 nw, military service                                    44    16
## 6 nw, retired                                             22    27
## 7 nw, other                                               28    19
## 8 w, vocational training                                 324   232
## 9 w, full-time employment                               1127  1109
## 10 w, part-time employment                               409   388
## 11 w, marginal employment (geringfügige Beschäftigung)   146   146
## 12 w, self-employed                                      167   156
## 13 w, other                                               57    48
#same coding for labor force participation

# Frequency table of life satisfaction (sat6) for a single wave.
# `df` is one wave's data frame; the codes are converted to a factor before
# counting.
sat_fun <- function(df) {
  life_sat_codes <- df$sat6
  table(as_factor(life_sat_codes))
}
# Tabulate life satisfaction in wave1 ... wave6 (looked up by name) and show
# the counts with one column per wave.
sapply(X = mget(x = paste0("wave", seq_len(6))), FUN = sat_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251
#same coding for life satisfaction

# Frequency table of family satisfaction (sat1i4) for a single wave.
# `df` is one wave's data frame; the codes are converted to a factor before
# counting.
fam_fun <- function(df) {
  family_sat_codes <- df$sat1i4
  table(as_factor(family_sat_codes))
}
# Tabulate family satisfaction (sat1i4) in wave1 ... wave6, one column per
# wave. This call used to pass sat_fun, which tabulated sat6 a second time;
# the printed table below still shows that sat6 output and has to be
# regenerated with this corrected call.
sapply(mget(paste0("wave", 1:6)), fam_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251
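#same coding for family satisfaction
# NOTE: the table above was produced with sat_fun, so it shows sat6 (it is
# identical to the life-satisfaction table), not sat1i4. Rerun the check with
# fam_fun before relying on this conclusion.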

# Frequency table of work satisfaction (sat1i1) for a single wave.
# `df` is one wave's data frame; the codes are converted to a factor before
# counting.
work_fun <- function(df) {
  work_sat_codes <- df$sat1i1
  table(as_factor(work_sat_codes))
}
# Tabulate work satisfaction (sat1i1) in wave1 ... wave6, one column per
# wave. This call used to pass sat_fun, which tabulated sat6 a third time;
# the printed table below still shows that sat6 output and has to be
# regenerated with this corrected call.
sapply(mget(paste0("wave", 1:6)), work_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251
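#same coding for work satisfaction
# NOTE: the table above was produced with sat_fun, so it shows sat6 (it is
# identical to the life-satisfaction table), not sat1i1. Rerun the check with
# work_fun before relying on this conclusion.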

# Summary statistics (min, quartiles, median, mean, max) of age in one wave.
# `df` is one wave's data frame.
age_fun <- function(df) {
  ages <- df$age
  summary(ages)
}
# Summarise age in wave1 ... wave6 (looked up by name); the six summaries are
# shown next to each other, one column per wave.
sapply(X = mget(x = paste0("wave", seq_len(6))), FUN = age_fun)
##            wave1   wave2    wave3    wave4    wave5    wave6
## Min.    14.00000 15.0000 16.00000 17.00000 18.00000 19.00000
## 1st Qu. 17.00000 17.0000 18.00000 19.00000 20.00000 21.00000
## Median  26.00000 27.0000 28.00000 29.00000 30.00000 31.00000
## Mean    25.83728 26.4235 27.24526 28.46596 29.54569 30.66761
## 3rd Qu. 35.00000 36.0000 37.00000 38.00000 39.00000 40.00000
## Max.    38.00000 39.0000 40.00000 41.00000 42.00000 43.00000
# no meaningless age

# Summary statistics (min, quartiles, median, mean, max) of the number of
# biological children (nkidsbio) in one wave. `df` is one wave's data frame.
kid_fun <- function(df) {
  child_counts <- df$nkidsbio
  summary(child_counts)
}
# Summarise the number of children in wave1 ... wave6 (looked up by name); the
# six summaries are shown next to each other, one column per wave.
sapply(X = mget(x = paste0("wave", seq_len(6))), FUN = kid_fun)
##             wave1  wave2      wave3      wave4      wave5      wave6
## Min.    -7.000000 -7.000  0.0000000  0.0000000  0.0000000  0.0000000
## 1st Qu.  0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Median   0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Mean     0.600387  0.625  0.6558786  0.7199081  0.7608208  0.8031217
## 3rd Qu.  1.000000  1.000  1.0000000  1.0000000  2.0000000  2.0000000
## Max.    10.000000 10.000 10.0000000 10.0000000 10.0000000 10.0000000
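# in waves 1 and 2 the minimum of nkidsbio is -7, i.e. some respondents have a
# negative number of children; these negative values are presumably
# missing-value codes and must be recoded to NA.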

No. 3

Question

Now, you are going to compile a dataset that has the variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), work satisfaction (sat1i1).

  • clean the variables across 6 waves.

Answer

# Reduce one raw wave to the analysis variables and recode missing values.
#
# Args:
#   df: one wave of the anchor data; must contain the columns id, wave, age,
#       sex_gen, lfs, nkidsbio, sat6, sat1i4 and sat1i1.
#
# Returns:
#   A data frame with the columns id, wave, age, sex, lfs, kidno, sat,
#   family_sat and work_sat. In every recoded column the negative codes
#   (the missing-value codes seen in the tabulations, e.g. -7
#   "Incomplete data") are NA.
clean_fun <- function(df) {
  # Numeric variable: negative codes become NA, all other values are kept.
  negative_to_na <- function(x) {
    values <- as.numeric(x)
    values[!is.na(values) & values < 0] <- NA_real_
    values
  }

  # Labelled categorical variable -> factor. Missing codes are identified by
  # their numeric value rather than by one hard-coded label string, and the
  # factor is not round-tripped through character, so the levels keep the
  # order of the value labels in every wave (a character round trip orders
  # them by first appearance in the data, which differs between waves).
  labelled_to_factor <- function(x) {
    codes <- as.numeric(x)
    value_labels <- attr(x, "labels")
    missing_levels <- names(value_labels)[which(value_labels < 0)]
    fct <- as_factor(x)
    fct[!is.na(codes) & codes < 0] <- NA
    # Remove only the missing-code levels; substantive levels stay even when
    # they are empty so that all waves share the same level set.
    levels(fct)[levels(fct) %in% missing_levels] <- NA
    fct
  }

  df %>%
    transmute(
      id,
      wave,
      age,
      sex = labelled_to_factor(sex_gen),
      lfs = labelled_to_factor(lfs),
      kidno = negative_to_na(nkidsbio),
      sat = negative_to_na(sat6),
      family_sat = negative_to_na(sat1i4),
      work_sat = negative_to_na(sat1i1)
    )
}

# Run the cleaning step on every wave; the cleaned copies get an "a" suffix
# and the raw waves are left untouched.
wave1a <- clean_fun(df = wave1)
wave2a <- clean_fun(df = wave2)
wave3a <- clean_fun(df = wave3)
wave4a <- clean_fun(df = wave4)
wave5a <- clean_fun(df = wave5)
wave6a <- clean_fun(df = wave6)

No. 4

Question

Now, you are going to compile a dataset that has the variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), work satisfaction (sat1i1).

  • please have a quick look at the six cleaned datasets. How many respondents had missing information on work satisfaction in wave 6? Which wave has the highest number of missing values in labor force status (lfs)?

Answer

# Overview of cleaned wave 1: missing counts and distribution per variable.
skimr::skim(data = wave1a)
Data summary
Name wave1a
Number of rows 6201
Number of columns 9
_______________________
Column type frequency:
factor 2
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 2 F: 3172, 1 M: 3029, -10: 0, -7 : 0
lfs 12 1 FALSE 13 1 n: 2229, 9 w: 1929, 10 : 468, 8 w: 308

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 375526333.49 2.15848e+08 174000 186678000 378910000 561330000 749731000 ▇▇▇▇▇
wave 0 1 1.00 0.00000e+00 1 1 1 1 1 ▁▁▇▁▁
age 0 1 25.84 8.36000e+00 14 17 26 35 38 ▇▁▇▁▇
kidno 4 1 0.61 1.00000e+00 0 0 0 1 10 ▇▁▁▁▁
sat 5 1 7.61 1.75000e+00 0 7 8 9 10 ▁▁▂▇▅
family_sat 9 1 8.61 1.88000e+00 0 8 9 10 10 ▁▁▁▃▇
work_sat 23 1 7.23 2.22000e+00 0 6 8 9 10 ▁▁▃▇▆
# Overview of cleaned wave 2: missing counts and distribution per variable.
skimr::skim(data = wave2a)
Data summary
Name wave2a
Number of rows 4536
Number of columns 9
_______________________
Column type frequency:
factor 2
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 2 F: 2339, 1 M: 2197, -10: 0, -7 : 0
lfs 22 1 FALSE 13 1 n: 1441, 9 w: 1337, 10 : 419, 8 w: 387

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 374873331.79 216507934.68 423000 186121000 379368000 563072250 749731000 ▇▇▇▇▇
wave 0 1 2.00 0.00 2 2 2 2 2 ▁▁▇▁▁
age 0 1 26.42 8.54 15 17 27 36 39 ▇▁▆▁▆
kidno 4 1 0.63 1.02 0 0 0 1 10 ▇▁▁▁▁
sat 6 1 7.73 1.63 0 7 8 9 10 ▁▁▂▇▅
family_sat 9 1 8.48 1.81 0 8 9 10 10 ▁▁▁▃▇
work_sat 14 1 7.17 2.23 0 6 8 9 10 ▁▁▃▇▅
# Overview of cleaned wave 3: missing counts and distribution per variable.
skimr::skim(data = wave3a)
Data summary
Name wave3a
Number of rows 3955
Number of columns 9
_______________________
Column type frequency:
factor 2
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1.00 FALSE 2 2 F: 2050, 1 M: 1905, -10: 0, -7 : 0
lfs 24 0.99 FALSE 13 9 w: 1206, 1 n: 1093, 10 : 405, 8 w: 371

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 375488344.88 217691561.30 174000 186174500 379202000 563958500 749731000 ▇▇▇▇▇
wave 0 1 3.00 0.00 3 3 3 3 3 ▁▁▇▁▁
age 0 1 27.25 8.51 16 18 28 37 40 ▇▁▆▁▆
kidno 0 1 0.66 1.03 0 0 0 1 10 ▇▁▁▁▁
sat 5 1 7.59 1.66 0 7 8 9 10 ▁▁▂▇▅
family_sat 1 1 8.34 1.78 0 8 9 10 10 ▁▁▂▅▇
work_sat 16 1 7.22 2.14 0 6 8 9 10 ▁▁▃▇▅
# Overview of cleaned wave 4: missing counts and distribution per variable.
skimr::skim(data = wave4a)
Data summary
Name wave4a
Number of rows 3481
Number of columns 9
_______________________
Column type frequency:
factor 2
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 2 F: 1813, 1 M: 1668, -10: 0, -7 : 0
lfs 7 1 FALSE 13 9 w: 1159, 1 n: 725, 10 : 415, 8 w: 381

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 374317614.48 218152720.53 423000 183307000 377376000 565701000 749731000 ▇▇▇▇▇
wave 0 1.00 4.00 0.00 4 4 4 4 4 ▁▁▇▁▁
age 0 1.00 28.47 8.56 17 19 29 38 41 ▇▁▆▁▇
kidno 0 1.00 0.72 1.07 0 0 0 1 10 ▇▁▁▁▁
sat 4 1.00 7.53 1.70 0 7 8 9 10 ▁▁▂▇▅
family_sat 4 1.00 8.31 1.78 0 8 9 10 10 ▁▁▂▅▇
work_sat 28 0.99 7.26 2.17 0 6 8 9 10 ▁▁▂▇▅
# Overview of cleaned wave 5: missing counts and distribution per variable.
skimr::skim(data = wave5a)
Data summary
Name wave5a
Number of rows 3119
Number of columns 9
_______________________
Column type frequency:
factor 2
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 2 F: 1626, 1 M: 1493, -10: 0, -7 : 0
lfs 4 1 FALSE 13 9 w: 1127, 1 n: 499, 10 : 409, 8 w: 324

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 376790387.30 218227325.96 423000 184491500 381315000 567148500 749731000 ▇▇▇▇▇
wave 0 1.00 5.00 0.00 5 5 5 5 5 ▁▁▇▁▁
age 0 1.00 29.55 8.54 18 20 30 39 42 ▇▁▆▁▇
kidno 0 1.00 0.76 1.09 0 0 0 2 10 ▇▁▁▁▁
sat 3 1.00 7.53 1.62 0 7 8 9 10 ▁▁▂▇▃
family_sat 2 1.00 8.23 1.76 0 7 9 10 10 ▁▁▂▆▇
work_sat 21 0.99 7.32 2.09 0 7 8 9 10 ▁▁▂▇▅
# Overview of cleaned wave 6: missing counts and distribution per variable.
skimr::skim(data = wave6a)
Data summary
Name wave6a
Number of rows 2819
Number of columns 9
_______________________
Column type frequency:
factor 2
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 2 F: 1477, 1 M: 1342, -10: 0, -7 : 0
lfs 1 1 FALSE 13 9 w: 1109, 1 n: 425, 10 : 388, 8 w: 232

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 375146854.56 217789668.51 423000 184944000 378192000 564950000 749197000 ▇▇▇▇▇
wave 0 1.00 6.00 0.00 6 6 6 6 6 ▁▁▇▁▁
age 0 1.00 30.67 8.51 19 21 31 40 43 ▇▁▆▁▇
kidno 0 1.00 0.80 1.10 0 0 0 2 10 ▇▁▁▁▁
sat 4 1.00 7.59 1.61 0 7 8 9 10 ▁▁▂▇▃
family_sat 1 1.00 8.26 1.78 0 8 9 10 10 ▁▁▂▅▇
work_sat 18 0.99 7.27 2.13 0 6 8 9 10 ▁▁▂▇▅
# In wave 6, 18 respondents have missing information on work satisfaction.
# Wave 3 has the highest number of missing values in labor force status (24 respondents).