You are going to compile a dataset that have variables: 1) id, 2) age, 3) sex_gen, 4) no. of children ever born (nkids), 5) satisfaction with work (sat1i1), 6) satisfaction with family (sat1i4).

No. 1

Question

  • Step 1: please import data of wave1-6 and check whether the coding and levels are consistent across 6 waves

Answer

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(Hmisc) # Weighted statistics.
library(skimr) #provide summary for multiple variables
library(splitstackshape) #transform wide data (with stacked variables) to long data
wave1 <- read_dta("anchor1_50percent_Eng.dta")
wave2 <- read_dta("anchor2_50percent_Eng.dta")
wave3 <- read_dta("anchor3_50percent_Eng.dta")
wave4 <- read_dta("anchor4_50percent_Eng.dta")
wave5 <- read_dta("anchor5_50percent_Eng.dta")
wave6 <- read_dta("anchor6_50percent_Eng.dta")
#check coding across 6 waves

sex_fun <- function(df) {
  table(as_factor(df$sex_gen))
        }
sapply(mget(paste0("wave", 1:6)), sex_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff                   0     0     0     0     0     0
## -7 Incomplete data                    0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## 1 Male                             3029  2197  1905  1668  1493  1342
## 2 Female                           3172  2339  2050  1813  1626  1477
#same coding for gender

child_fun <- function(df) {
  table(as_factor(df$nkids))
        }
sapply(mget(paste0("wave", 1:6)), child_fun)
##                    wave1 wave2 wave3 wave4 wave5 wave6
## -7 Incomplete data     0     4     0     0     0     0
## 0                   4091  2951  2521  2120  1844  1598
## 1                    856   613   558   494   443   421
## 2                    877   665   587   572   549   529
## 3                    287   226   224   225   217   212
## 4                     62    56    47    48    42    40
## 5                     19    13    13    13    17    16
## 6                      6     5     2     6     4     1
## 7                      1     2     2     2     2     1
## 10                     2     1     1     1     1     1
#same coding for no. of children

worksat_fun <- function(df) {
  table(as_factor(df$sat1i1))
        }
sapply(mget(paste0("wave", 1:6)), worksat_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                         13    10     9    25    17    16
## -1 Don't know                        10     4     7     3     4     2
## 0 Very dissatisfied                 103    90    58    55    46    40
## 1                                    50    39    31    28    21    25
## 2                                    89    92    63    66    54    56
## 3                                   208   127   142   102    78    87
## 4                                   219   169   114   123    93    75
## 5                                   593   382   330   247   225   192
## 6                                   494   346   325   273   249   239
## 7                                  1154   892   774   648   586   509
## 8                                  1490  1166  1043   966   885   822
## 9                                   773   622   554   480   469   415
## 10 Very satisfied                  1005   597   505   465   392   341
familysat_fun <- function(df) {
  table(as_factor(df$sat1i1))
        }
sapply(mget(paste0("wave", 1:6)), familysat_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                         13    10     9    25    17    16
## -1 Don't know                        10     4     7     3     4     2
## 0 Very dissatisfied                 103    90    58    55    46    40
## 1                                    50    39    31    28    21    25
## 2                                    89    92    63    66    54    56
## 3                                   208   127   142   102    78    87
## 4                                   219   169   114   123    93    75
## 5                                   593   382   330   247   225   192
## 6                                   494   346   325   273   249   239
## 7                                  1154   892   774   648   586   509
## 8                                  1490  1166  1043   966   885   822
## 9                                   773   622   554   480   469   415
## 10 Very satisfied                  1005   597   505   465   392   341
#coding of all variables across years are the same

No. 2

Question

  • Step 2: clean the variables across 6 waves and drop all missing values

Answer

clean_fun <- function(df) {
df %>% 
  transmute(
    id, 
    age, 
    wave=as.numeric(wave),
    sex=as_factor(sex_gen), #make sex_gen as a factor
    kidno=case_when(nkids<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(nkids)),
    worksat=case_when(sat1i1<0 ~ as.numeric(NA), #specify when sat1i1 is missing
                   TRUE ~ as.numeric(sat1i1)),
    famsat=case_when(sat1i1<0 ~ as.numeric(NA), #specify when sat1i4 is missing
                   TRUE ~ as.numeric(sat1i4))
                   ) %>% drop_na()
}

wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)

No. 3

Question

  • Step 3: combine the six waves together to generate a panel data(i.e. a long format data), named it as “allwave”

Answer

allwave <- rbind(wave1a, wave2a, wave3a, wave4a, wave5a, wave6a) #rbind is combine dataset by rows

No. 4

Question

  • Now transform the “allwave” panel data to a wide data (i.e. a wide format data), and name it “allwave_towide”.

Answer

allwave_towide <- pivot_wider(
  allwave, #dataset to transform
  id_cols = id, #to identify the id column, in this case, "id" is the column
  names_from = wave, #i want use "wave" to create columns that reflect variables in each wave
  values_from = c(age, sex, kidno, worksat, famsat) #specify where the value comes from
)

No. 5

Question

  • Now transform the wide data “allwave_towide” back to panel data (i.e. long format data), named it “allwave_tolong”
  • How many person-year observations do you have in the “allwave_tolong” data?

Answer

allwave_tolong<- merged.stack(allwave_towide, #dataset for transfrom
                            var.stubs = c("age", "sex","kidno", "worksat", "famsat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = "_")   
#sep is to specify the character that separates the "variable name" from the "times" in the source

#use head() to have a look at the first several observations
head(allwave_tolong, 20)
##         id .time_1 age      sex kidno worksat famsat
##  1: 174000       1  25   1 Male     0       7      7
##  2: 174000       2  NA     <NA>    NA      NA     NA
##  3: 174000       3  27   1 Male     0       8      7
##  4: 174000       4  NA     <NA>    NA      NA     NA
##  5: 174000       5  NA     <NA>    NA      NA     NA
##  6: 174000       6  NA     <NA>    NA      NA     NA
##  7: 309000       1  27 2 Female     1       0     10
##  8: 309000       2  NA     <NA>    NA      NA     NA
##  9: 309000       3  NA     <NA>    NA      NA     NA
## 10: 309000       4  NA     <NA>    NA      NA     NA
## 11: 309000       5  NA     <NA>    NA      NA     NA
## 12: 309000       6  NA     <NA>    NA      NA     NA
## 13: 423000       1  16   1 Male     0       8     10
## 14: 423000       2  17   1 Male     0       8      9
## 15: 423000       3  18   1 Male     0       9      9
## 16: 423000       4  19   1 Male     0       7     10
## 17: 423000       5  20   1 Male     0       7     10
## 18: 423000       6  22   1 Male     0       9     10
## 19: 828000       1  16   1 Male     0       9      9
## 20: 828000       2  17   1 Male     0       8      9
allwave_tolong_v1 <- allwave_tolong %>% drop_na(sex)
#drop the observations which are missing in some waves
#Why i use drop_na(sex): sex has no missing values across all waves (step1), if it is missing, then it means the person does not participate in that wave

#we have 23983 person-year observations

No. 6

Question

  • 6.1 I want to focus on respondents who participated in all six waves. You need to join the six wave data together. Which way of join will you use? Please write down your r code to join the data.
  • 6.2 Then make it as a panel data (i.e. long format)
  • 6.3 How many people participated in all six waves?

Answer for 6.1

wave1b <- wave1a %>% 
rename(age.1=age, wave.1=wave,  sex.1=sex, kidno.1=kidno, worksat.1=worksat, famsat.1=famsat) 
#rename variables, with variable name.1 to specify it is from wave1

wave2b <- wave2a %>% 
rename(age.2=age, wave.2=wave,  sex.2=sex, kidno.2=kidno, worksat.2=worksat, famsat.2=famsat)
#rename variables, with variable name.2 to specify it is from wave2

wave3b <- wave3a %>% 
rename(age.3=age, wave.3=wave,  sex.3=sex, kidno.3=kidno, worksat.3=worksat, famsat.3=famsat)

wave4b <- wave4a %>% 
rename(age.4=age, wave.4=wave,  sex.4=sex, kidno.4=kidno, worksat.4=worksat, famsat.4=famsat)

wave5b <- wave5a %>% 
rename(age.5=age, wave.5=wave,  sex.5=sex, kidno.5=kidno, worksat.5=worksat, famsat.5=famsat)

wave6b <- wave6a %>% 
rename(age.6=age, wave.6=wave,  sex.6=sex, kidno.6=kidno, worksat.6=worksat, famsat.6=famsat)

joindata <- inner_join(wave1b, wave2b, by = "id") %>%  # inner join wave1b and wave2b
  inner_join(wave3b, by = "id") %>% # inner join with wave3b
  inner_join(wave4b, by = "id") %>% # inner join with wave4b
  inner_join(wave5b, by = "id") %>% # inner join with wave5b
  inner_join(wave6b, by = "id") # inner join with wave6b

Answer for 6.2

joindata_tolong<- merged.stack(joindata, #dataset for transfrom
                            var.stubs = c("age", "wave", "sex","kidno", "worksat", "famsat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = ".")   

Answer for 6.3

table(joindata_tolong$wave)
## 
##    1    2    3    4    5    6 
## 2385 2385 2385 2385 2385 2385