Copy the following code to obtain dataset3. With dataset3, please do the following regression.
#Run the following to prepare your dataset
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(ggplot2) # Allows us to create nice figures.
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
#import data
dataset1 <- read_dta("anchor1_50percent_Eng.dta")
#a dataset of age, sex, life satisfaction, no.of children ever born
#create dataset2
dataset2 <- dataset1 %>%
transmute(
age,
sex_gen=as_factor(sex_gen) %>% fct_drop(),
nkidsbio=case_when(nkidsbio<0 ~ as.numeric(NA),
TRUE ~ as.numeric(nkidsbio)),
sat6=case_when(sat6<0 ~ as.numeric(NA),
TRUE ~ as.numeric(sat6)), #specify when sat should be considered missing
relstat=as_factor(relstat),
relstat=case_when(relstat=="-7 Incomplete data" ~ as.character(NA),
TRUE ~ as.character(relstat)) %>%
as_factor() %>%
fct_drop()
) %>%
drop_na()
#create dataset3
dataset3 <- dataset2 %>%
mutate(
marital=case_when(
relstat %in% c("1 Never married single","2 Never married LAT","3 Never married COHAB") ~ "Nevermarried",
relstat %in% c("4 Married COHAB","5 Married noncohabiting") ~ 'Married',
relstat %in% c("6 Divorced/separated single","7 Divorced/separated LAT","8 Divorced/separated COHAB") ~ 'Divorced',
relstat %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widow'
) %>% as_factor(),
parenthood=case_when(
nkidsbio>0 ~ "Have kids",
nkidsbio==0 ~ "No kids") %>%
as_factor()
)%>%
filter(marital!= "Widow")
model1 <- lm_robust(formula = sat6 ~ age + sex_gen + marital + parenthood, data = dataset3)
summary(model1)
##
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen + marital + parenthood,
## data = dataset3)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 8.52022 0.087294 97.6037 0.000e+00 8.34910 8.69135
## age -0.04259 0.003852 -11.0543 3.843e-28 -0.05014 -0.03503
## sex_gen2 Female 0.03697 0.044375 0.8332 4.048e-01 -0.05002 0.12396
## maritalMarried 0.67765 0.080028 8.4677 3.099e-17 0.52077 0.83453
## maritalDivorced -0.41739 0.147118 -2.8371 4.568e-03 -0.70579 -0.12898
## parenthoodHave kids 0.01415 0.081160 0.1744 8.616e-01 -0.14495 0.17326
## DF
## (Intercept) 6148
## age 6148
## sex_gen2 Female 6148
## maritalMarried 6148
## maritalDivorced 6148
## parenthoodHave kids 6148
##
## Multiple R-squared: 0.04032 , Adjusted R-squared: 0.03954
## F-statistic: 42.91 on 5 and 6148 DF, p-value: < 2.2e-16
model2 <- lm_robust(formula = sat6 ~ age + sex_gen*marital + parenthood, data = dataset3)
summary(model2)
##
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen * marital + parenthood,
## data = dataset3)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.521746 0.087984 96.85557 0.000e+00
## age -0.042555 0.003853 -11.04338 4.330e-28
## sex_gen2 Female 0.032771 0.053094 0.61722 5.371e-01
## maritalMarried 0.683116 0.098286 6.95026 4.023e-12
## maritalDivorced -0.509612 0.249568 -2.04198 4.120e-02
## parenthoodHave kids 0.011517 0.081296 0.14167 8.873e-01
## sex_gen2 Female:maritalMarried -0.005617 0.097550 -0.05758 9.541e-01
## sex_gen2 Female:maritalDivorced 0.142108 0.288261 0.49298 6.220e-01
## CI Lower CI Upper DF
## (Intercept) 8.34927 8.69423 6146
## age -0.05011 -0.03500 6146
## sex_gen2 Female -0.07131 0.13685 6146
## maritalMarried 0.49044 0.87579 6146
## maritalDivorced -0.99885 -0.02037 6146
## parenthoodHave kids -0.14785 0.17089 6146
## sex_gen2 Female:maritalMarried -0.19685 0.18562 6146
## sex_gen2 Female:maritalDivorced -0.42298 0.70720 6146
##
## Multiple R-squared: 0.04039 , Adjusted R-squared: 0.0393
## F-statistic: 30.68 on 7 and 6146 DF, p-value: < 2.2e-16
Combine model 1 to model 2, output the results to a html file and label the names of independent variables to make readers easy to understand.
#first create Regression.html to see the order of variables
htmlreg(list(model1, model2), include.ci = FALSE,
caption = "My regression table",
caption.above = TRUE,
single.row = TRUE,
digits = 3,
file = "Regression.html")
## The table was written to the file 'Regression.html'.
#first create Regression_readable names.html to change the variable names accroding to the order of the variables shown in Regression.html
htmlreg(list(model1, model2), include.ci = FALSE,
custom.coef.names = c("Intercept",
"Age",
"Female",
"Married",
"Divorced",
"Have kids",
"Female x married",
"Female x divorced"),
caption = "My regression table",
caption.above = TRUE,
single.row = TRUE,
digits = 3,
file = "Regression_readable names.html")
## The table was written to the file 'Regression_readable names.html'.
Save the result of Model 2 as a dataset
result_m2<- model2 %>% broom::tidy()
result_m2
## term estimate std.error statistic
## 1 (Intercept) 8.521746003 0.087984058 96.85556900
## 2 age -0.042555021 0.003853442 -11.04337806
## 3 sex_gen2 Female 0.032770627 0.053094178 0.61721694
## 4 maritalMarried 0.683116214 0.098286482 6.95025602
## 5 maritalDivorced -0.509611544 0.249567598 -2.04197799
## 6 parenthoodHave kids 0.011517126 0.081296432 0.14166828
## 7 sex_gen2 Female:maritalMarried -0.005617192 0.097550190 -0.05758258
## 8 sex_gen2 Female:maritalDivorced 0.142108391 0.288261430 0.49298441
## p.value conf.low conf.high df outcome
## 1 0.000000e+00 8.34926645 8.69422556 6146 sat6
## 2 4.329806e-28 -0.05010912 -0.03500092 6146 sat6
## 3 5.371145e-01 -0.07131255 0.13685380 6146 sat6
## 4 4.022887e-12 0.49044030 0.87579212 6146 sat6
## 5 4.119631e-02 -0.99885140 -0.02037169 6146 sat6
## 6 8.873467e-01 -0.14785234 0.17088659 6146 sat6
## 7 9.540830e-01 -0.19684971 0.18561533 6146 sat6
## 8 6.220412e-01 -0.42298492 0.70720170 6146 sat6
#or you can have the following code
result_m2<- tidy(model2)
Plot the result in a point chart, and also display the confidence interval of each coefficient
ggplot(data = result_m2,
aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
coord_flip() +
geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dashed line
to make your graph a nicer one
result_m2$predictor <- c("Intercept",
"Age",
"Female",
"Married",
"Divorced",
"Have kids",
"Female x married",
"Female x divorced") #create a variable named "predictor" in the result_m2 dataset.
#generate a variable "predictor" in the dataset "result_m2" where the IVs are labelled in a way easy to understand
result_m2$predictor <- as_factor(result_m2$predictor)
ggplot(data = result_m2,
aes(x = predictor, y = estimate, ymin = conf.low, ymax = conf.high)) + #now i change x=term to x= predictor
geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
coord_flip() +
geom_hline(yintercept = 0, color = "red", lty = "dashed")+ # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dahed line
ylab("Regression coefficient with CI") #change the name of yaxis
#you can also test what if you don't make result_m2$predictor as a factor.
#if you don't want to include intercept
ggplot(data = result_m2[-1,], #remove row 1, as intercetp is located in row 1
aes(x = predictor, y = estimate, ymin = conf.low, ymax = conf.high)) + #now i change x=term to x= predictor
geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
coord_flip() +
geom_hline(yintercept = 0, color = "red", lty = "dashed")+ # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dahed line
ylab("Regression coefficient with CI") #change the name of yaxis
Now, you are going to compile a dataset that have variables: id, age, gender(sex_gen), labor force status(lfs), number of children(nkidsbio), and life satisfaction (sat6).
wave1 <- read_dta("anchor1_50percent_Eng.dta")
wave2 <- read_dta("anchor2_50percent_Eng.dta")
wave3 <- read_dta("anchor3_50percent_Eng.dta")
wave4 <- read_dta("anchor4_50percent_Eng.dta")
wave5 <- read_dta("anchor5_50percent_Eng.dta")
wave6 <- read_dta("anchor6_50percent_Eng.dta")
Now, you are going to compile a dataset that have variables: id, age, gender(sex_gen), labor force status(lfs), number of children(nkidsbio), and life satisfaction (sat6).
#check coding across 6 waves
sex_fun <- function(df) {
table(as_factor(df$sex_gen))
}
sapply(mget(paste0("wave", 1:6)), sex_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff 0 0 0 0 0 0
## -7 Incomplete data 0 0 0 0 0 0
## -4 Filter error / Incorrect entry 0 0 0 0 0 0
## -3 Does not apply 0 0 0 0 0 0
## 1 Male 3029 2197 1905 1668 1493 1342
## 2 Female 3172 2339 2050 1813 1626 1477
#same coding for gender
lfs_fun <- function(df) {
table(as_factor(df$lfs))
}
sapply(mget(paste0("wave", 1:6)), lfs_fun)
## wave1 wave2 wave3 wave4
## -7 Incomplete data 12 22 24 7
## -3 Does not apply 0 0 0 0
## 1 nw, education 2229 1441 1093 725
## 2 nw, parental leave 237 146 148 116
## 3 nw, homemaker 253 120 97 85
## 4 nw, unemployed 297 235 180 156
## 5 nw, military service 9 8 33 30
## 6 nw, retired 19 19 21 22
## 7 nw, other 33 15 21 26
## 8 w, vocational training 308 387 371 381
## 9 w, full-time employment 1929 1337 1206 1159
## 10 w, part-time employment 468 419 405 415
## 11 w, marginal employment (geringfügige Beschäftigung) 142 137 129 144
## 12 w, self-employed 202 164 159 153
## 13 w, other 63 86 68 62
## wave5 wave6
## -7 Incomplete data 4 1
## -3 Does not apply 0 0
## 1 nw, education 499 425
## 2 nw, parental leave 90 78
## 3 nw, homemaker 69 50
## 4 nw, unemployed 133 124
## 5 nw, military service 44 16
## 6 nw, retired 22 27
## 7 nw, other 28 19
## 8 w, vocational training 324 232
## 9 w, full-time employment 1127 1109
## 10 w, part-time employment 409 388
## 11 w, marginal employment (geringfügige Beschäftigung) 146 146
## 12 w, self-employed 167 156
## 13 w, other 57 48
#same coding for labor force participation
sat_fun <- function(df) {
table(as_factor(df$sat6))
}
sapply(mget(paste0("wave", 1:6)), sat_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value 0 0 0 0 0 0
## -4 Filter error / Incorrect entry 0 0 0 0 0 0
## -3 Does not apply 0 0 0 0 0 0
## -2 No answer 2 5 4 4 3 4
## -1 Don't know 3 1 1 0 0 0
## 0 Very dissatisfied 26 15 9 13 4 5
## 1 18 5 10 12 7 5
## 2 45 27 38 34 22 20
## 3 110 58 57 56 60 42
## 4 133 84 88 72 75 81
## 5 395 249 221 205 188 130
## 6 508 316 291 282 219 223
## 7 1178 898 863 726 691 592
## 8 1877 1417 1282 1138 1042 974
## 9 1157 933 701 631 563 492
## 10 Very satisfied 749 528 390 308 245 251
#same coding for life satisfaction
age_fun <- function(df) {
summary(df$age)
}
sapply(mget(paste0("wave", 1:6)), age_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## Min. 14.00000 15.0000 16.00000 17.00000 18.00000 19.00000
## 1st Qu. 17.00000 17.0000 18.00000 19.00000 20.00000 21.00000
## Median 26.00000 27.0000 28.00000 29.00000 30.00000 31.00000
## Mean 25.83728 26.4235 27.24526 28.46596 29.54569 30.66761
## 3rd Qu. 35.00000 36.0000 37.00000 38.00000 39.00000 40.00000
## Max. 38.00000 39.0000 40.00000 41.00000 42.00000 43.00000
# no meaningless age
kid_fun <- function(df) {
summary(df$nkidsbio)
}
sapply(mget(paste0("wave", 1:6)), kid_fun)
## wave1 wave2 wave3 wave4 wave5 wave6
## Min. -7.000000 -7.000 0.0000000 0.0000000 0.0000000 0.0000000
## 1st Qu. 0.000000 0.000 0.0000000 0.0000000 0.0000000 0.0000000
## Median 0.000000 0.000 0.0000000 0.0000000 0.0000000 0.0000000
## Mean 0.600387 0.625 0.6558786 0.7199081 0.7608208 0.8031217
## 3rd Qu. 1.000000 1.000 1.0000000 1.0000000 2.0000000 2.0000000
## Max. 10.000000 10.000 10.0000000 10.0000000 10.0000000 10.0000000
#in wave1 and 2, there are respondents who have no. of children <0.
Now, you are going to compile a dataset that have variables: id, age, gender(sex_gen), labor force status(lfs), number of children(nkidsbio), and life satisfaction (sat6).
clean_fun <- function(df) {
df %>%
transmute(
id,
wave,
age,
sex=as_factor(sex_gen), #make sex_gen as a factor
lfs=as_factor(lfs), #make lfs as a factor
lfs=case_when(lfs== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for lfs
TRUE ~ as.character(lfs))%>%
as_factor(), #make lfs as a factor again
kidno=case_when(nkidsbio<0 ~ as.numeric(NA), #specify when hlt1 is missing
TRUE ~ as.numeric(nkidsbio)),
sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
TRUE ~ as.numeric(sat6))
) %>% drop_na()
}
wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)