You are going to compile a dataset that has the following variables: id, age, gender (sex_gen), labor force status (lfs), number of children (nkidsbio), and life satisfaction (sat6).
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(Hmisc) # Weighted statistics.
library(skimr) #provide summary for multiple variables
library(splitstackshape) #transform wide data (with stacked variables) to long data
# Load the six anchor waves; each wave is stored in its own Stata file.
wave1 <- read_dta(file = "anchor1_50percent_Eng.dta")
wave2 <- read_dta(file = "anchor2_50percent_Eng.dta")
wave3 <- read_dta(file = "anchor3_50percent_Eng.dta")
wave4 <- read_dta(file = "anchor4_50percent_Eng.dta")
wave5 <- read_dta(file = "anchor5_50percent_Eng.dta")
wave6 <- read_dta(file = "anchor6_50percent_Eng.dta")
# Alternatively, read the same six files in a loop.
for (i in seq_len(6)) {
  assign(
    # Name of the object to create: wave1, wave2, ..., wave6.
    x = paste0("wave", i),
    # assign() binds the data to that name, much like `<-` would.
    value = read_dta(file = paste0("anchor", i, "_50percent_Eng.dta"))
  )
}
#check coding across 6 waves
# Count how often each labelled value of sex_gen occurs in one wave.
sex_fun <- function(df) {
  sex_codes <- df$sex_gen
  table(as_factor(sex_codes))
}
# One column of counts per wave, so the coding can be compared across waves.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = sex_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff 0 0 0 0 0 0
## -7 Incomplete data 0 0 0 0 0 0
## -4 Filter error / Incorrect entry 0 0 0 0 0 0
## -3 Does not apply 0 0 0 0 0 0
## 1 Male 3029 2197 1905 1668 1493 1342
## 2 Female 3172 2339 2050 1813 1626 1477
#same coding for gender
# Count how often each labelled value of lfs occurs in one wave.
lfs_fun <- function(df) {
  lfs_codes <- df$lfs
  table(as_factor(lfs_codes))
}
# One column of counts per wave, so the coding can be compared across waves.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = lfs_fun)
## wave1 wave2 wave3 wave4
## -7 Incomplete data 12 22 24 7
## -3 Does not apply 0 0 0 0
## 1 nw, education 2229 1441 1093 725
## 2 nw, parental leave 237 146 148 116
## 3 nw, homemaker 253 120 97 85
## 4 nw, unemployed 297 235 180 156
## 5 nw, military service 9 8 33 30
## 6 nw, retired 19 19 21 22
## 7 nw, other 33 15 21 26
## 8 w, vocational training 308 387 371 381
## 9 w, full-time employment 1929 1337 1206 1159
## 10 w, part-time employment 468 419 405 415
## 11 w, marginal employment (geringfügige Beschäftigung) 142 137 129 144
## 12 w, self-employed 202 164 159 153
## 13 w, other 63 86 68 62
## wave5 wave6
## -7 Incomplete data 4 1
## -3 Does not apply 0 0
## 1 nw, education 499 425
## 2 nw, parental leave 90 78
## 3 nw, homemaker 69 50
## 4 nw, unemployed 133 124
## 5 nw, military service 44 16
## 6 nw, retired 22 27
## 7 nw, other 28 19
## 8 w, vocational training 324 232
## 9 w, full-time employment 1127 1109
## 10 w, part-time employment 409 388
## 11 w, marginal employment (geringfügige Beschäftigung) 146 146
## 12 w, self-employed 167 156
## 13 w, other 57 48
#same coding for labor force participation
# Count how often each labelled value of sat6 occurs in one wave.
sat_fun <- function(df) {
  sat_codes <- df$sat6
  table(as_factor(sat_codes))
}
# One column of counts per wave, so the coding can be compared across waves.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = sat_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value 0 0 0 0 0 0
## -4 Filter error / Incorrect entry 0 0 0 0 0 0
## -3 Does not apply 0 0 0 0 0 0
## -2 No answer 2 5 4 4 3 4
## -1 Don't know 3 1 1 0 0 0
## 0 Very dissatisfied 26 15 9 13 4 5
## 1 18 5 10 12 7 5
## 2 45 27 38 34 22 20
## 3 110 58 57 56 60 42
## 4 133 84 88 72 75 81
## 5 395 249 221 205 188 130
## 6 508 316 291 282 219 223
## 7 1178 898 863 726 691 592
## 8 1877 1417 1282 1138 1042 974
## 9 1157 933 701 631 563 492
## 10 Very satisfied 749 528 390 308 245 251
#same coding for life satisfaction
# Summary statistics (min, quartiles, mean, max) of age in one wave.
age_fun <- function(df) {
  ages <- df$age
  summary(ages)
}
# One column of age summaries per wave.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = age_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## Min. 14.00000 15.0000 16.00000 17.00000 18.00000 19.00000
## 1st Qu. 17.00000 17.0000 18.00000 19.00000 20.00000 21.00000
## Median 26.00000 27.0000 28.00000 29.00000 30.00000 31.00000
## Mean 25.83728 26.4235 27.24526 28.46596 29.54569 30.66761
## 3rd Qu. 35.00000 36.0000 37.00000 38.00000 39.00000 40.00000
## Max. 38.00000 39.0000 40.00000 41.00000 42.00000 43.00000
# No implausible ages: the range is 14-38 in wave 1 and moves up by one year in each later wave.
# Summary statistics (min, quartiles, mean, max) of nkidsbio in one wave.
kid_fun <- function(df) {
  kid_counts <- df$nkidsbio
  summary(kid_counts)
}
# One column of nkidsbio summaries per wave.
sapply(X = mget(paste0("wave", seq_len(6))), FUN = kid_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## Min. -7.000000 -7.000 0.0000000 0.0000000 0.0000000 0.0000000
## 1st Qu. 0.000000 0.000 0.0000000 0.0000000 0.0000000 0.0000000
## Median 0.000000 0.000 0.0000000 0.0000000 0.0000000 0.0000000
## Mean 0.600387 0.625 0.6558786 0.7199081 0.7608208 0.8031217
## 3rd Qu. 1.000000 1.000 1.0000000 1.0000000 2.0000000 2.0000000
## Max. 10.000000 10.000 10.0000000 10.0000000 10.0000000 10.0000000
# In waves 1 and 2 the minimum of nkidsbio is -7, so some respondents have a negative number of children; these values need to be recoded as missing.
# Keep the analysis variables of one wave and recode missing values to NA.
#
# Args:
#   df: one wave as returned by read_dta(); the columns id, age, wave,
#       sex_gen, lfs, nkidsbio and sat6 are used.
# Returns:
#   A data frame with the columns id, age, wave, sex, lfs, kidno and sat.
#   Rows with an NA in any of these columns are dropped.
clean_fun <- function(df) {
  df %>%
    transmute(
      id = zap_label(id), # remove label of id
      age = zap_label(age), # remove label of age
      wave = as.numeric(wave),
      sex = as_factor(sex_gen), # make sex_gen a factor
      # Make lfs a factor and turn "-7 Incomplete data" into NA. factor()
      # rebuilds the levels from the values that actually occur and keeps the
      # level order produced by as_factor(), so the order no longer depends on
      # which status happens to appear first in the rows of a wave.
      lfs = factor(as_factor(lfs), exclude = c("-7 Incomplete data", NA)),
      kidno = case_when(
        nkidsbio < 0 ~ as.numeric(NA), # negative nkidsbio codes are missing
        TRUE ~ as.numeric(nkidsbio)
      ),
      sat = case_when(
        sat6 < 0 ~ as.numeric(NA), # negative sat6 codes are missing
        TRUE ~ as.numeric(sat6)
      )
    ) %>%
    drop_na() # keep only rows that are complete on every column
}
# Clean every wave with the same rules.
wave1a <- clean_fun(df = wave1)
wave2a <- clean_fun(df = wave2)
wave3a <- clean_fun(df = wave3)
wave4a <- clean_fun(df = wave4)
wave5a <- clean_fun(df = wave5)
wave6a <- clean_fun(df = wave6)
# Stack the six cleaned waves row-wise into one data set.
allwave <- do.call(
  rbind,
  list(wave1a, wave2a, wave3a, wave4a, wave5a, wave6a)
)
Now transform the panel data into wide format (person-level data, one row per person).
# Reshape to wide format: one row per id, with a separate column for every
# combination of variable and wave.
allwave_wide <- allwave %>%
  pivot_wider(
    id_cols = id, # column that identifies a person
    names_from = wave, # wave values become part of the new column names
    values_from = c(age, sex, lfs, kidno, sat) # columns that supply the values
  )
Now transform the wide data back into long (panel) format.
# Stack the wave-specific columns of the wide data back into long format.
allwave_tolong <- drop_na(
  merged.stack(
    allwave_wide, # dataset to transform
    # var.stubs: prefixes of the column groups that are stacked
    var.stubs = c("age", "sex", "lfs", "kidno", "sat"),
    # sep: character that separates the variable name from the wave number
    sep = "_"
  ),
  # Drop the id-wave combinations that have no observation. The cleaned waves
  # contain no missing sex (clean_fun() drops incomplete rows), so an NA in
  # sex means that the id has no row for that wave.
  sex
)
# Look at the first 20 rows.
head(allwave_tolong, n = 20)
## id .time_1 age sex
## 1: 174000 1 25 1 Male
## 2: 174000 3 27 1 Male
## 3: 309000 1 27 2 Female
## 4: 423000 1 16 1 Male
## 5: 423000 2 17 1 Male
## 6: 423000 3 18 1 Male
## 7: 423000 4 19 1 Male
## 8: 423000 5 20 1 Male
## 9: 423000 6 22 1 Male
## 10: 828000 1 16 1 Male
## 11: 828000 2 17 1 Male
## 12: 828000 3 18 1 Male
## 13: 828000 4 19 1 Male
## 14: 828000 5 20 1 Male
## 15: 828000 6 21 1 Male
## 16: 945000 1 25 1 Male
## 17: 945000 2 26 1 Male
## 18: 945000 3 27 1 Male
## 19: 945000 4 28 1 Male
## 20: 945000 5 29 1 Male
## lfs kidno sat
## 1: 11 w, marginal employment (geringfügige Beschäftigung) 0 6
## 2: 9 w, full-time employment 0 7
## 3: 3 nw, homemaker 1 8
## 4: 1 nw, education 0 9
## 5: 1 nw, education 0 7
## 6: 1 nw, education 0 7
## 7: 13 w, other 0 7
## 8: 4 nw, unemployed 0 7
## 9: 4 nw, unemployed 0 7
## 10: 1 nw, education 0 8
## 11: 1 nw, education 0 9
## 12: 1 nw, education 0 8
## 13: 1 nw, education 0 7
## 14: 1 nw, education 0 9
## 15: 1 nw, education 0 7
## 16: 9 w, full-time employment 0 8
## 17: 9 w, full-time employment 0 9
## 18: 1 nw, education 0 8
## 19: 1 nw, education 0 7
## 20: 9 w, full-time employment 0 8