You are going to compile a dataset that have variables: 1) id, 2) age, 3) sex_gen, 4) no. of children ever born (nkids), 5) satisfaction with work (sat1i1), 6) satisfaction with family (sat1i4).
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(Hmisc) # Weighted statistics.
library(skimr) #provide summary for multiple variables
library(splitstackshape) #transform wide data (with stacked variables) to long data
wave1 <- read_dta("anchor1_50percent_Eng.dta")
wave2 <- read_dta("anchor2_50percent_Eng.dta")
wave3 <- read_dta("anchor3_50percent_Eng.dta")
wave4 <- read_dta("anchor4_50percent_Eng.dta")
wave5 <- read_dta("anchor5_50percent_Eng.dta")
wave6 <- read_dta("anchor6_50percent_Eng.dta")
#check coding across 6 waves
sex_fun <- function(df) {
table(as_factor(df$sex_gen))
}
sapply(mget(paste0("wave", 1:6)), sex_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff 0 0 0 0 0 0
## -7 Incomplete data 0 0 0 0 0 0
## -4 Filter error / Incorrect entry 0 0 0 0 0 0
## -3 Does not apply 0 0 0 0 0 0
## 1 Male 3029 2197 1905 1668 1493 1342
## 2 Female 3172 2339 2050 1813 1626 1477
#same coding for gender
child_fun <- function(df) {
table(as_factor(df$nkids))
}
sapply(mget(paste0("wave", 1:6)), child_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -7 Incomplete data 0 4 0 0 0 0
## 0 4091 2951 2521 2120 1844 1598
## 1 856 613 558 494 443 421
## 2 877 665 587 572 549 529
## 3 287 226 224 225 217 212
## 4 62 56 47 48 42 40
## 5 19 13 13 13 17 16
## 6 6 5 2 6 4 1
## 7 1 2 2 2 2 1
## 10 2 1 1 1 1 1
#same coding for no. of children
worksat_fun <- function(df) {
table(as_factor(df$sat1i1))
}
sapply(mget(paste0("wave", 1:6)), worksat_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value 0 0 0 0 0 0
## -4 Filter error / Incorrect entry 0 0 0 0 0 0
## -3 Does not apply 0 0 0 0 0 0
## -2 No answer 13 10 9 25 17 16
## -1 Don't know 10 4 7 3 4 2
## 0 Very dissatisfied 103 90 58 55 46 40
## 1 50 39 31 28 21 25
## 2 89 92 63 66 54 56
## 3 208 127 142 102 78 87
## 4 219 169 114 123 93 75
## 5 593 382 330 247 225 192
## 6 494 346 325 273 249 239
## 7 1154 892 774 648 586 509
## 8 1490 1166 1043 966 885 822
## 9 773 622 554 480 469 415
## 10 Very satisfied 1005 597 505 465 392 341
familysat_fun <- function(df) {
table(as_factor(df$sat1i1))
}
sapply(mget(paste0("wave", 1:6)), familysat_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value 0 0 0 0 0 0
## -4 Filter error / Incorrect entry 0 0 0 0 0 0
## -3 Does not apply 0 0 0 0 0 0
## -2 No answer 13 10 9 25 17 16
## -1 Don't know 10 4 7 3 4 2
## 0 Very dissatisfied 103 90 58 55 46 40
## 1 50 39 31 28 21 25
## 2 89 92 63 66 54 56
## 3 208 127 142 102 78 87
## 4 219 169 114 123 93 75
## 5 593 382 330 247 225 192
## 6 494 346 325 273 249 239
## 7 1154 892 774 648 586 509
## 8 1490 1166 1043 966 885 822
## 9 773 622 554 480 469 415
## 10 Very satisfied 1005 597 505 465 392 341
#coding of all variables across years are the same
clean_fun <- function(df) {
df %>%
transmute(
id,
age,
wave=as.numeric(wave),
sex=as_factor(sex_gen), #make sex_gen as a factor
kidno=case_when(nkids<0 ~ as.numeric(NA), #specify when hlt1 is missing
TRUE ~ as.numeric(nkids)),
worksat=case_when(sat1i1<0 ~ as.numeric(NA), #specify when sat1i1 is missing
TRUE ~ as.numeric(sat1i1)),
famsat=case_when(sat1i1<0 ~ as.numeric(NA), #specify when sat1i4 is missing
TRUE ~ as.numeric(sat1i4))
) %>% drop_na()
}
wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)
allwave <- rbind(wave1a, wave2a, wave3a, wave4a, wave5a, wave6a) #rbind is combine dataset by rows
allwave_towide <- pivot_wider(
allwave, #dataset to transform
id_cols = id, #to identify the id column, in this case, "id" is the column
names_from = wave, #i want use "wave" to create columns that reflect variables in each wave
values_from = c(age, sex, kidno, worksat, famsat) #specify where the value comes from
)
allwave_tolong<- merged.stack(allwave_towide, #dataset for transfrom
var.stubs = c("age", "sex","kidno", "worksat", "famsat"),
#var.stubs is to specify the prefixes of the variable groups
sep = "_")
#sep is to specify the character that separates the "variable name" from the "times" in the source
#use head() to have a look at the first several observations
head(allwave_tolong, 20)
## id .time_1 age sex kidno worksat famsat
## 1: 174000 1 25 1 Male 0 7 7
## 2: 174000 2 NA <NA> NA NA NA
## 3: 174000 3 27 1 Male 0 8 7
## 4: 174000 4 NA <NA> NA NA NA
## 5: 174000 5 NA <NA> NA NA NA
## 6: 174000 6 NA <NA> NA NA NA
## 7: 309000 1 27 2 Female 1 0 10
## 8: 309000 2 NA <NA> NA NA NA
## 9: 309000 3 NA <NA> NA NA NA
## 10: 309000 4 NA <NA> NA NA NA
## 11: 309000 5 NA <NA> NA NA NA
## 12: 309000 6 NA <NA> NA NA NA
## 13: 423000 1 16 1 Male 0 8 10
## 14: 423000 2 17 1 Male 0 8 9
## 15: 423000 3 18 1 Male 0 9 9
## 16: 423000 4 19 1 Male 0 7 10
## 17: 423000 5 20 1 Male 0 7 10
## 18: 423000 6 22 1 Male 0 9 10
## 19: 828000 1 16 1 Male 0 9 9
## 20: 828000 2 17 1 Male 0 8 9
allwave_tolong_v1 <- allwave_tolong %>% drop_na(sex)
#drop the observations which are missing in some waves
#Why i use drop_na(sex): sex has no missing values across all waves (step1), if it is missing, then it means the person does not participate in that wave
#we have 23983 person-year observations
wave1b <- wave1a %>%
rename(age.1=age, wave.1=wave, sex.1=sex, kidno.1=kidno, worksat.1=worksat, famsat.1=famsat)
#rename variables, with variable name.1 to specify it is from wave1
wave2b <- wave2a %>%
rename(age.2=age, wave.2=wave, sex.2=sex, kidno.2=kidno, worksat.2=worksat, famsat.2=famsat)
#rename variables, with variable name.2 to specify it is from wave2
wave3b <- wave3a %>%
rename(age.3=age, wave.3=wave, sex.3=sex, kidno.3=kidno, worksat.3=worksat, famsat.3=famsat)
wave4b <- wave4a %>%
rename(age.4=age, wave.4=wave, sex.4=sex, kidno.4=kidno, worksat.4=worksat, famsat.4=famsat)
wave5b <- wave5a %>%
rename(age.5=age, wave.5=wave, sex.5=sex, kidno.5=kidno, worksat.5=worksat, famsat.5=famsat)
wave6b <- wave6a %>%
rename(age.6=age, wave.6=wave, sex.6=sex, kidno.6=kidno, worksat.6=worksat, famsat.6=famsat)
joindata <- inner_join(wave1b, wave2b, by = "id") %>% # inner join wave1b and wave2b
inner_join(wave3b, by = "id") %>% # inner join with wave3b
inner_join(wave4b, by = "id") %>% # inner join with wave4b
inner_join(wave5b, by = "id") %>% # inner join with wave5b
inner_join(wave6b, by = "id") # inner join with wave6b
joindata_tolong<- merged.stack(joindata, #dataset for transfrom
var.stubs = c("age", "wave", "sex","kidno", "worksat", "famsat"),
#var.stubs is to specify the prefixes of the variable groups
sep = ".")
table(joindata_tolong$wave)
##
## 1 2 3 4 5 6
## 2385 2385 2385 2385 2385 2385