You are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), and life satisfaction (sat6).

No. 1

Question

  • Step 1: Import the data for waves 1–6 and check whether the coding and levels of each variable are consistent across the six waves.

Answer

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(Hmisc) # Weighted statistics.
library(skimr) #provide summary for multiple variables
library(splitstackshape) #transform wide data (with stacked variables) to long data
# Option 1: load every wave with its own explicit read_dta() call.
wave1 <- read_dta(file = "anchor1_50percent_Eng.dta")
wave2 <- read_dta(file = "anchor2_50percent_Eng.dta")
wave3 <- read_dta(file = "anchor3_50percent_Eng.dta")
wave4 <- read_dta(file = "anchor4_50percent_Eng.dta")
wave5 <- read_dta(file = "anchor5_50percent_Eng.dta")
wave6 <- read_dta(file = "anchor6_50percent_Eng.dta")

# Option 2: create the same six objects with a loop.
for (i in seq_len(6)) {
  assign(
    # assign() works like <-, but the object name ("wave1" ... "wave6") is
    # built at run time by pasting "wave" and the loop index together.
    x = paste0("wave", i),
    value = read_dta(file = paste0("anchor", i, "_50percent_Eng.dta"))
  )
}
#check coding across 6 waves

# Frequency table of gender (sex_gen) for one wave, with the labelled codes
# converted to a factor so the table rows show the value labels.
sex_fun <- function(df) {
  gender <- df$sex_gen
  table(as_factor(gender))
}
# Run sex_fun on wave1 ... wave6 and show the counts side by side.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = sex_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff                   0     0     0     0     0     0
## -7 Incomplete data                    0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## 1 Male                             3029  2197  1905  1668  1493  1342
## 2 Female                           3172  2339  2050  1813  1626  1477
#same coding for gender

# Frequency table of labor force status (lfs) for one wave, with the labelled
# codes converted to a factor so the table rows show the value labels.
lfs_fun <- function(df) {
  labor_status <- df$lfs
  table(as_factor(labor_status))
}
# Run lfs_fun on wave1 ... wave6 and show the counts side by side.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = lfs_fun)
##                                                        wave1 wave2 wave3 wave4
## -7 Incomplete data                                        12    22    24     7
## -3 Does not apply                                          0     0     0     0
## 1 nw, education                                         2229  1441  1093   725
## 2 nw, parental leave                                     237   146   148   116
## 3 nw, homemaker                                          253   120    97    85
## 4 nw, unemployed                                         297   235   180   156
## 5 nw, military service                                     9     8    33    30
## 6 nw, retired                                             19    19    21    22
## 7 nw, other                                               33    15    21    26
## 8 w, vocational training                                 308   387   371   381
## 9 w, full-time employment                               1929  1337  1206  1159
## 10 w, part-time employment                               468   419   405   415
## 11 w, marginal employment (geringfügige Beschäftigung)   142   137   129   144
## 12 w, self-employed                                      202   164   159   153
## 13 w, other                                               63    86    68    62
##                                                        wave5 wave6
## -7 Incomplete data                                         4     1
## -3 Does not apply                                          0     0
## 1 nw, education                                          499   425
## 2 nw, parental leave                                      90    78
## 3 nw, homemaker                                           69    50
## 4 nw, unemployed                                         133   124
## 5 nw, military service                                    44    16
## 6 nw, retired                                             22    27
## 7 nw, other                                               28    19
## 8 w, vocational training                                 324   232
## 9 w, full-time employment                               1127  1109
## 10 w, part-time employment                               409   388
## 11 w, marginal employment (geringfügige Beschäftigung)   146   146
## 12 w, self-employed                                      167   156
## 13 w, other                                               57    48
#same coding for labor force participation

# Frequency table of life satisfaction (sat6) for one wave, with the labelled
# codes converted to a factor so the table rows show the value labels.
sat_fun <- function(df) {
  satisfaction <- df$sat6
  table(as_factor(satisfaction))
}
# Run sat_fun on wave1 ... wave6 and show the counts side by side.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = sat_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251
#same coding for life satisfaction

# Six-number summary (min, quartiles, mean, max) of age for one wave.
age_fun <- function(df) {
  ages <- df$age
  summary(ages)
}
# Run age_fun on wave1 ... wave6 and show the summaries side by side.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = age_fun)
##            wave1   wave2    wave3    wave4    wave5    wave6
## Min.    14.00000 15.0000 16.00000 17.00000 18.00000 19.00000
## 1st Qu. 17.00000 17.0000 18.00000 19.00000 20.00000 21.00000
## Median  26.00000 27.0000 28.00000 29.00000 30.00000 31.00000
## Mean    25.83728 26.4235 27.24526 28.46596 29.54569 30.66761
## 3rd Qu. 35.00000 36.0000 37.00000 38.00000 39.00000 40.00000
## Max.    38.00000 39.0000 40.00000 41.00000 42.00000 43.00000
# no implausible age values (ages range from 14 to 43 across the six waves)

# Six-number summary (min, quartiles, mean, max) of the number of biological
# children (nkidsbio) for one wave.
kid_fun <- function(df) {
  n_children <- df$nkidsbio
  summary(n_children)
}
# Run kid_fun on wave1 ... wave6 and show the summaries side by side.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = kid_fun)
##             wave1  wave2      wave3      wave4      wave5      wave6
## Min.    -7.000000 -7.000  0.0000000  0.0000000  0.0000000  0.0000000
## 1st Qu.  0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Median   0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Mean     0.600387  0.625  0.6558786  0.7199081  0.7608208  0.8031217
## 3rd Qu.  1.000000  1.000  1.0000000  1.0000000  2.0000000  2.0000000
## Max.    10.000000 10.000 10.0000000 10.0000000 10.0000000 10.0000000
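# In waves 1 and 2 the minimum of nkidsbio is -7, i.e. some respondents have a negative number of children. Negative values are missing-value codes, not real counts, and are set to NA in step 2.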

No. 2

Question

  • Step 2: clean the variables across 6 waves and drop all missing values

Answer

# Clean one wave of the anchor data.
#
# Keeps id, age and wave, converts sex_gen and lfs to factors, recodes the
# missing-value codes of lfs, nkidsbio and sat6 to NA, and finally drops every
# row that still contains an NA.
#
# Args:
#   df: one wave as read by read_dta(); must contain the columns id, age,
#       wave, sex_gen, lfs, nkidsbio and sat6.
# Returns:
#   A data frame with the columns id, age, wave, sex, lfs, kidno and sat and
#   no missing values.
clean_fun <- function(df) {
  df %>%
    transmute(
      id = zap_label(id), # remove the variable label of id
      age = zap_label(age), # remove the variable label of age
      wave = as.numeric(wave),
      # sex_gen had no observations on its missing codes in any wave (step 1),
      # so it is converted to a factor without further recoding.
      sex = as_factor(sex_gen),
      lfs = as_factor(lfs),
      # Set "-7 Incomplete data" to NA on the factor itself instead of going
      # through character: rebuilding a factor from character orders the
      # levels by first appearance, which would differ from wave to wave.
      # droplevels() then removes the levels that are not used, so the
      # remaining levels keep the order of the original codes.
      lfs = droplevels(replace(lfs, lfs %in% "-7 Incomplete data", NA)),
      kidno = case_when(
        nkidsbio < 0 ~ NA_real_, # negative codes of nkidsbio mean missing
        TRUE ~ as.numeric(nkidsbio)
      ),
      sat = case_when(
        sat6 < 0 ~ NA_real_, # negative codes of sat6 mean missing
        TRUE ~ as.numeric(sat6)
      )
    ) %>%
    drop_na()
}

# Clean each raw wave; the cleaned copies carry the suffix "a".
wave1a <- wave1 %>% clean_fun()
wave2a <- wave2 %>% clean_fun()
wave3a <- wave3 %>% clean_fun()
wave4a <- wave4 %>% clean_fun()
wave5a <- wave5 %>% clean_fun()
wave6a <- wave6 %>% clean_fun()

No. 3

Question

  • Step 3: Combine the six waves to create a panel dataset.

Answer

# Stack the six cleaned waves on top of each other (row-wise) to obtain the
# long panel data: one row per person and wave.
allwave <- do.call(
  rbind,
  list(wave1a, wave2a, wave3a, wave4a, wave5a, wave6a)
)

No. 4

Question

Now transform the panel data into wide format (person-level data)

Answer

# Reshape the panel from long (one row per person and wave) to wide (one row
# per person).
allwave_wide <- allwave %>%
  pivot_wider(
    id_cols = id, # column that identifies a person
    names_from = wave, # the wave number becomes part of each new column name
    values_from = c(age, sex, lfs, kidno, sat) # variables spread over waves
  )

No. 5

Question

Now transform the wide data back to panel data

Answer

# Stack the wide columns back into long format.
#   var.stubs: prefixes of the variable groups to stack.
#   sep:       character that separates the variable name from the wave
#              number in the wide column names.
# Rows with a missing sex are then dropped. sex has no missing values in any
# wave (step 1), so an NA here means there is no observation for that person
# in that wave: the person did not take part, or the row was removed during
# cleaning.
allwave_tolong <- drop_na(
  merged.stack(
    allwave_wide,
    var.stubs = c("age", "sex", "lfs", "kidno", "sat"),
    sep = "_"
  ),
  sex
)

# Look at the first 20 observations.
head(allwave_tolong, n = 20)
##         id .time_1 age      sex
##  1: 174000       1  25   1 Male
##  2: 174000       3  27   1 Male
##  3: 309000       1  27 2 Female
##  4: 423000       1  16   1 Male
##  5: 423000       2  17   1 Male
##  6: 423000       3  18   1 Male
##  7: 423000       4  19   1 Male
##  8: 423000       5  20   1 Male
##  9: 423000       6  22   1 Male
## 10: 828000       1  16   1 Male
## 11: 828000       2  17   1 Male
## 12: 828000       3  18   1 Male
## 13: 828000       4  19   1 Male
## 14: 828000       5  20   1 Male
## 15: 828000       6  21   1 Male
## 16: 945000       1  25   1 Male
## 17: 945000       2  26   1 Male
## 18: 945000       3  27   1 Male
## 19: 945000       4  28   1 Male
## 20: 945000       5  29   1 Male
##                                                        lfs kidno sat
##  1: 11 w, marginal employment (geringfügige Beschäftigung)     0   6
##  2:                              9 w, full-time employment     0   7
##  3:                                        3 nw, homemaker     1   8
##  4:                                        1 nw, education     0   9
##  5:                                        1 nw, education     0   7
##  6:                                        1 nw, education     0   7
##  7:                                            13 w, other     0   7
##  8:                                       4 nw, unemployed     0   7
##  9:                                       4 nw, unemployed     0   7
## 10:                                        1 nw, education     0   8
## 11:                                        1 nw, education     0   9
## 12:                                        1 nw, education     0   8
## 13:                                        1 nw, education     0   7
## 14:                                        1 nw, education     0   9
## 15:                                        1 nw, education     0   7
## 16:                              9 w, full-time employment     0   8
## 17:                              9 w, full-time employment     0   9
## 18:                                        1 nw, education     0   8
## 19:                                        1 nw, education     0   7
## 20:                              9 w, full-time employment     0   8