Exercise 7

No. 1

Question

Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).

Please import the data for waves 1-6 (i.e., the data files from “anchor1_50percent_Eng.dta” to “anchor6_50percent_Eng.dta”).

Answer

library(skimr)
library(tidyverse) # Recoding and cleaning
library(haven) # Import data.
library(janitor) # Tabulation
library(ggplot2) # For plotting

# Load the six Stata files, one object per wave (wave1 ... wave6).
wave1 <- haven::read_dta(file = "anchor1_50percent_Eng.dta")
wave2 <- haven::read_dta(file = "anchor2_50percent_Eng.dta")
wave3 <- haven::read_dta(file = "anchor3_50percent_Eng.dta")
wave4 <- haven::read_dta(file = "anchor4_50percent_Eng.dta")
wave5 <- haven::read_dta(file = "anchor5_50percent_Eng.dta")
wave6 <- haven::read_dta(file = "anchor6_50percent_Eng.dta")

No. 2

Question

Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).

Check whether the coding and levels of these variables are consistent across the 6 waves:

Answer

#check coding across 6 waves

# Frequency table of gender (sex_gen) for a single wave, using the value labels.
sex_fun <- function(df) {
  gender <- df$sex_gen
  table(as_factor(gender))
}
# One column of counts per wave, so the label sets can be compared side by side.
sapply(
  list(
    wave1 = wave1, wave2 = wave2, wave3 = wave3,
    wave4 = wave4, wave5 = wave5, wave6 = wave6
  ),
  sex_fun
)

##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff                   0     0     0     0     0     0
## -7 Incomplete data                    0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## 1 Male                             3029  2197  1905  1668  1493  1342
## 2 Female                           3172  2339  2050  1813  1626  1477

#same coding for gender

# Frequency table of labor force status (lfs) for a single wave, using the
# value labels.
lfs_fun <- function(df) {
  status <- df$lfs
  table(as_factor(status))
}
# Tabulate every wave object (wave1 ... wave6) and line the results up by wave.
sapply(mget(sprintf("wave%d", 1:6)), lfs_fun)

##                                                        wave1 wave2 wave3 wave4
## -7 Incomplete data                                        12    22    24     7
## -3 Does not apply                                          0     0     0     0
## 1 nw, education                                         2229  1441  1093   725
## 2 nw, parental leave                                     237   146   148   116
## 3 nw, homemaker                                          253   120    97    85
## 4 nw, unemployed                                         297   235   180   156
## 5 nw, military service                                     9     8    33    30
## 6 nw, retired                                             19    19    21    22
## 7 nw, other                                               33    15    21    26
## 8 w, vocational training                                 308   387   371   381
## 9 w, full-time employment                               1929  1337  1206  1159
## 10 w, part-time employment                               468   419   405   415
## 11 w, marginal employment (geringfügige Beschäftigung)   142   137   129   144
## 12 w, self-employed                                      202   164   159   153
## 13 w, other                                               63    86    68    62
##                                                        wave5 wave6
## -7 Incomplete data                                         4     1
## -3 Does not apply                                          0     0
## 1 nw, education                                          499   425
## 2 nw, parental leave                                      90    78
## 3 nw, homemaker                                           69    50
## 4 nw, unemployed                                         133   124
## 5 nw, military service                                    44    16
## 6 nw, retired                                             22    27
## 7 nw, other                                               28    19
## 8 w, vocational training                                 324   232
## 9 w, full-time employment                               1127  1109
## 10 w, part-time employment                               409   388
## 11 w, marginal employment (geringfügige Beschäftigung)   146   146
## 12 w, self-employed                                      167   156
## 13 w, other                                               57    48

#same coding for labor force participation

# Frequency table of life satisfaction (sat6) for a single wave, using the
# value labels.
sat_fun <- function(df) {
  life_sat <- df$sat6
  table(as_factor(life_sat))
}
# One column of counts per wave.
sapply(
  list(
    wave1 = wave1, wave2 = wave2, wave3 = wave3,
    wave4 = wave4, wave5 = wave5, wave6 = wave6
  ),
  sat_fun
)

##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251

#same coding for life satisfaction

# Frequency table of family satisfaction (sat1i4) for a single wave, using the
# value labels.
fam_fun <- function(df) {
  table(as_factor(df$sat1i4))
}
# Apply fam_fun (not sat_fun) so the table actually reports sat1i4. The output
# previously shown for this step was a duplicate of the sat6 table and must be
# regenerated before drawing conclusions about the coding of sat1i4.
sapply(mget(paste0("wave", 1:6)), fam_fun)

##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251

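#same coding for family satisfaction -- NOTE: the table above is identical to the sat6 table because sat_fun (not fam_fun) was applied; re-run with fam_fun to verify this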

# Frequency table of work satisfaction (sat1i1) for a single wave, using the
# value labels.
work_fun <- function(df) {
  table(as_factor(df$sat1i1))
}
# Apply work_fun (not sat_fun) so the table actually reports sat1i1. The output
# previously shown for this step was a duplicate of the sat6 table and must be
# regenerated before drawing conclusions about the coding of sat1i1.
sapply(mget(paste0("wave", 1:6)), work_fun)

##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251

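#same coding for work satisfaction -- NOTE: the table above is identical to the sat6 table because sat_fun (not work_fun) was applied; re-run with work_fun to verify this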

# Summary statistics (min, quartiles, mean, max) of age for a single wave.
age_fun <- function(df) {
  ages <- df$age
  summary(ages)
}
# One column of summary statistics per wave.
sapply(mget(sprintf("wave%d", 1:6)), age_fun)

##            wave1   wave2    wave3    wave4    wave5    wave6
## Min.    14.00000 15.0000 16.00000 17.00000 18.00000 19.00000
## 1st Qu. 17.00000 17.0000 18.00000 19.00000 20.00000 21.00000
## Median  26.00000 27.0000 28.00000 29.00000 30.00000 31.00000
## Mean    25.83728 26.4235 27.24526 28.46596 29.54569 30.66761
## 3rd Qu. 35.00000 36.0000 37.00000 38.00000 39.00000 40.00000
## Max.    38.00000 39.0000 40.00000 41.00000 42.00000 43.00000

# no meaningless age

# Summary statistics of the number of biological children (nkidsbio) for a
# single wave; a negative minimum reveals missing-value codes.
kid_fun <- function(df) {
  n_kids <- df$nkidsbio
  summary(n_kids)
}
# One column of summary statistics per wave.
sapply(
  list(
    wave1 = wave1, wave2 = wave2, wave3 = wave3,
    wave4 = wave4, wave5 = wave5, wave6 = wave6
  ),
  kid_fun
)

##             wave1  wave2      wave3      wave4      wave5      wave6
## Min.    -7.000000 -7.000  0.0000000  0.0000000  0.0000000  0.0000000
## 1st Qu.  0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Median   0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Mean     0.600387  0.625  0.6558786  0.7199081  0.7608208  0.8031217
## 3rd Qu.  1.000000  1.000  1.0000000  1.0000000  2.0000000  2.0000000
## Max.    10.000000 10.000 10.0000000 10.0000000 10.0000000 10.0000000

#in wave1 and 2, there are respondents who have no. of children <0.

No. 3

Question

Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).

Clean the variables across the 6 waves.

Answer

# Clean one wave and keep only the analysis variables.
#
# Argument:
#   df  one imported wave; must contain id, wave, age, sex_gen, lfs, nkidsbio,
#       sat6, sat1i4 and sat1i1.
#
# Returns a tibble with the columns
#   id, wave, age  copied unchanged
#   sex            sex_gen as a factor built from its value labels
#   lfs            labor force status as a factor; negative codes become NA
#   kidno          number of children, numeric; negative codes become NA
#   sat            life satisfaction (sat6), numeric; negative codes become NA
#   family_sat     family satisfaction (sat1i4), numeric; negative codes -> NA
#   work_sat       work satisfaction (sat1i1), numeric; negative codes -> NA
clean_fun <- function(df) {
  # All negative values in these variables are missing-data codes (e.g.
  # "-7 Incomplete data", "-2 No answer", "-1 Don't know"), so map them to NA
  # and return a plain numeric vector.
  negative_to_na <- function(x) {
    x <- as.numeric(x)
    x[which(x < 0)] <- NA_real_
    x
  }

  df %>%
    transmute(
      id,
      wave,
      age,
      sex = as_factor(sex_gen),
      # Blank out every negative code (same rule as for the numeric variables,
      # rather than matching one label string), then drop the levels that are
      # no longer used. Staying a factor throughout keeps the levels in the
      # order of the underlying codes, so the order is the same in every wave
      # instead of depending on which value happens to appear first.
      lfs = droplevels(replace(as_factor(lfs), which(lfs < 0), NA)),
      kidno = negative_to_na(nkidsbio),
      sat = negative_to_na(sat6),
      family_sat = negative_to_na(sat1i4),
      work_sat = negative_to_na(sat1i1)
    )
}

# Run the cleaning step on every wave; the cleaned copies get an "a" suffix.
wave1a <- wave1 %>% clean_fun()
wave2a <- wave2 %>% clean_fun()
wave3a <- wave3 %>% clean_fun()
wave4a <- wave4 %>% clean_fun()
wave5a <- wave5 %>% clean_fun()
wave6a <- wave6 %>% clean_fun()

No. 4

Question

Now, you are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), life satisfaction (sat6), family satisfaction (sat1i4), and work satisfaction (sat1i1).

Please have a quick look at the six cleaned datasets. How many respondents had missing information on work satisfaction in wave 6? Which wave has the highest number of missing values in labor force status (lfs)?

Answer

# Overview of the cleaned wave 1 data (missing counts and distributions).
skimr::skim(wave1a)

Data summary
Name	wave1a
Number of rows	6201
Number of columns	9
_______________________
Column type frequency:
factor	2
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
sex	0	1	FALSE	2	2 F: 3172, 1 M: 3029, -10: 0, -7 : 0
lfs	12	1	FALSE	13	1 n: 2229, 9 w: 1929, 10 : 468, 8 w: 308

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1	375526333.49	2.15848e+08	174000	186678000	378910000	561330000	749731000	▇▇▇▇▇
wave	0	1	1.00	0.00000e+00	1	1	1	1	1	▁▁▇▁▁
age	0	1	25.84	8.36000e+00	14	17	26	35	38	▇▁▇▁▇
kidno	4	1	0.61	1.00000e+00	0	0	0	1	10	▇▁▁▁▁
sat	5	1	7.61	1.75000e+00	0	7	8	9	10	▁▁▂▇▅
family_sat	9	1	8.61	1.88000e+00	0	8	9	10	10	▁▁▁▃▇
work_sat	23	1	7.23	2.22000e+00	0	6	8	9	10	▁▁▃▇▆

# Overview of the cleaned wave 2 data (missing counts and distributions).
skimr::skim(wave2a)

Data summary
Name	wave2a
Number of rows	4536
Number of columns	9
_______________________
Column type frequency:
factor	2
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
sex	0	1	FALSE	2	2 F: 2339, 1 M: 2197, -10: 0, -7 : 0
lfs	22	1	FALSE	13	1 n: 1441, 9 w: 1337, 10 : 419, 8 w: 387

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1	374873331.79	216507934.68	423000	186121000	379368000	563072250	749731000	▇▇▇▇▇
wave	0	1	2.00	0.00	2	2	2	2	2	▁▁▇▁▁
age	0	1	26.42	8.54	15	17	27	36	39	▇▁▆▁▆
kidno	4	1	0.63	1.02	0	0	0	1	10	▇▁▁▁▁
sat	6	1	7.73	1.63	0	7	8	9	10	▁▁▂▇▅
family_sat	9	1	8.48	1.81	0	8	9	10	10	▁▁▁▃▇
work_sat	14	1	7.17	2.23	0	6	8	9	10	▁▁▃▇▅

# Overview of the cleaned wave 3 data (missing counts and distributions).
skimr::skim(wave3a)

Data summary
Name	wave3a
Number of rows	3955
Number of columns	9
_______________________
Column type frequency:
factor	2
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
sex	0	1.00	FALSE	2	2 F: 2050, 1 M: 1905, -10: 0, -7 : 0
lfs	24	0.99	FALSE	13	9 w: 1206, 1 n: 1093, 10 : 405, 8 w: 371

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1	375488344.88	217691561.30	174000	186174500	379202000	563958500	749731000	▇▇▇▇▇
wave	0	1	3.00	0.00	3	3	3	3	3	▁▁▇▁▁
age	0	1	27.25	8.51	16	18	28	37	40	▇▁▆▁▆
kidno	0	1	0.66	1.03	0	0	0	1	10	▇▁▁▁▁
sat	5	1	7.59	1.66	0	7	8	9	10	▁▁▂▇▅
family_sat	1	1	8.34	1.78	0	8	9	10	10	▁▁▂▅▇
work_sat	16	1	7.22	2.14	0	6	8	9	10	▁▁▃▇▅

# Overview of the cleaned wave 4 data (missing counts and distributions).
skimr::skim(wave4a)

Data summary
Name	wave4a
Number of rows	3481
Number of columns	9
_______________________
Column type frequency:
factor	2
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
sex	0	1	FALSE	2	2 F: 1813, 1 M: 1668, -10: 0, -7 : 0
lfs	7	1	FALSE	13	9 w: 1159, 1 n: 725, 10 : 415, 8 w: 381

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	374317614.48	218152720.53	423000	183307000	377376000	565701000	749731000	▇▇▇▇▇
wave	0	1.00	4.00	0.00	4	4	4	4	4	▁▁▇▁▁
age	0	1.00	28.47	8.56	17	19	29	38	41	▇▁▆▁▇
kidno	0	1.00	0.72	1.07	0	0	0	1	10	▇▁▁▁▁
sat	4	1.00	7.53	1.70	0	7	8	9	10	▁▁▂▇▅
family_sat	4	1.00	8.31	1.78	0	8	9	10	10	▁▁▂▅▇
work_sat	28	0.99	7.26	2.17	0	6	8	9	10	▁▁▂▇▅

# Overview of the cleaned wave 5 data (missing counts and distributions).
skimr::skim(wave5a)

Data summary
Name	wave5a
Number of rows	3119
Number of columns	9
_______________________
Column type frequency:
factor	2
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
sex	0	1	FALSE	2	2 F: 1626, 1 M: 1493, -10: 0, -7 : 0
lfs	4	1	FALSE	13	9 w: 1127, 1 n: 499, 10 : 409, 8 w: 324

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	376790387.30	218227325.96	423000	184491500	381315000	567148500	749731000	▇▇▇▇▇
wave	0	1.00	5.00	0.00	5	5	5	5	5	▁▁▇▁▁
age	0	1.00	29.55	8.54	18	20	30	39	42	▇▁▆▁▇
kidno	0	1.00	0.76	1.09	0	0	0	2	10	▇▁▁▁▁
sat	3	1.00	7.53	1.62	0	7	8	9	10	▁▁▂▇▃
family_sat	2	1.00	8.23	1.76	0	7	9	10	10	▁▁▂▆▇
work_sat	21	0.99	7.32	2.09	0	7	8	9	10	▁▁▂▇▅

# Overview of the cleaned wave 6 data (missing counts and distributions).
skimr::skim(wave6a)

Data summary
Name	wave6a
Number of rows	2819
Number of columns	9
_______________________
Column type frequency:
factor	2
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
sex	0	1	FALSE	2	2 F: 1477, 1 M: 1342, -10: 0, -7 : 0
lfs	1	1	FALSE	13	9 w: 1109, 1 n: 425, 10 : 388, 8 w: 232

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	375146854.56	217789668.51	423000	184944000	378192000	564950000	749197000	▇▇▇▇▇
wave	0	1.00	6.00	0.00	6	6	6	6	6	▁▁▇▁▁
age	0	1.00	30.67	8.51	19	21	31	40	43	▇▁▆▁▇
kidno	0	1.00	0.80	1.10	0	0	0	2	10	▇▁▁▁▁
sat	4	1.00	7.59	1.61	0	7	8	9	10	▁▁▂▇▃
family_sat	1	1.00	8.26	1.78	0	8	9	10	10	▁▁▂▅▇
work_sat	18	0.99	7.27	2.13	0	6	8	9	10	▁▁▂▇▅

# in wave6, 18 respondents had missing information in work satisfaction.
# wave 3 has the highest missing in labor force status, 24 respondents

Exercise 7

Mengni Chen

22-10-2025

No. 1

Question

Answer

No. 2

Question

Answer

No. 3

Question

Answer

No. 4

Question

Answer