Now, the question is “does first birth affect women’s life satisfaction?” use different models and select the approriate model Prepare the dataset
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(ggplot2)
library(plm)
library(lmtest)
##Import 6 waves of women data
for (i in 1:6) {
assign(paste0("women", i), #assign is similar to <-; paste0 is to combine wave and i into a name, i ranges from 1 to 6.
read_dta(paste0("wave", i, "_women.dta"))
)
}
##Clean 6 waves of women data
clean_fun <- function(df) { df %>%
transmute(
id=zap_label(id), #remove label of id
age=zap_label(age), #remove label of age
wave=as.numeric(wave),
relstat=as_factor(relstat), #make relstat as a factor
relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
hlt=case_when(hlt1<0 ~ as.numeric(NA), #specify when hlt1 is missing
TRUE ~ as.numeric(hlt1)),
nkidsbio=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
TRUE ~ as.numeric(nkidsbio)),
sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
TRUE ~ as.numeric(sat6)),
)%>% drop_na() }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)
women1b <- women1a %>% filter(nkidsbio==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, hlt.1=hlt, nkidsbio.1=nkidsbio, sat.1=sat ) #rename variables
women2b <- women2a %>%
rename(wave.2=wave, age.2=age, relstat.2=relstat, hlt.2=hlt, nkidsbio.2=nkidsbio, sat.2=sat )
women3b <- women3a %>%
rename(wave.3=wave, age.3=age, relstat.3=relstat, hlt.3=hlt, nkidsbio.3=nkidsbio, sat.3=sat )
women4b <- women4a %>%
rename(wave.4=wave, age.4=age, relstat.4=relstat, hlt.4=hlt, nkidsbio.4=nkidsbio, sat.4=sat )
women5b <- women5a %>%
rename(wave.5=wave, age.5=age, relstat.5=relstat, hlt.5=hlt, nkidsbio.5=nkidsbio, sat.5=sat )
women6b <- women6a %>%
rename(wave.6=wave, age.6=age, relstat.6=relstat, hlt.6=hlt, nkidsbio.6=nkidsbio, sat.6=sat )
women_wide <- left_join(women1b, women2b, by = "id") %>% # left join women1b and women2b
left_join(women3b, by = "id") %>% # left join with women3b
left_join(women4b, by = "id") %>% # left join with women4b
left_join(women5b, by = "id") %>% # left join with women5b
left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them
women_long<- merged.stack(women_wide, #dataset for transfrom
var.stubs = c("age", "wave", "relstat", "hlt","nkidsbio", "sat"),
#var.stubs is to specify the prefixes of the variable groups
sep = ".") %>%
#sep is to specify the character that separates the "variable name" from the "times" in the source
drop_na(wave)
#drop the observations which did not join the wave
women_long <- women_long %>%
group_by(id) %>%
mutate(
firstkid=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio>0 ~ 1,
TRUE ~ 0),
firstkid2=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio==2 ~ 1,
TRUE ~ 0)
) #to identify individual whose first childbearing is twins
twinid <- women_long$id[women_long$firstkid2==1]
women_long <- women_long[!(women_long$id %in% twinid),] #remove respondents whose first childbearing is twins
women_long <- filter(women_long, nkidsbio<2) # remove repeated event of childbearing, only focus on having first child
panel_data <- pdata.frame(women_long, index=c("id", "wave")) #define panel data
Run a random effect model to investigate the impact of first birth on women’s subjective wellbeing, and get a robust standard error estimation results
model1 <- plm(sat ~ nkidsbio, data=women_long, model="random") #random effect model
summary(model1) #results of random effect
## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = sat ~ nkidsbio, data = women_long, model = "random")
##
## Unbalanced Panel: n = 3753, T = 1-6, N = 14809
##
## Effects:
## var std.dev share
## idiosyncratic 1.534 1.238 0.57
## individual 1.156 1.075 0.43
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2449 0.5009 0.5745 0.5250 0.5745 0.5745
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.8497 -0.5199 0.1618 -0.0026 0.7840 4.1609
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 7.646763 0.021375 357.7462 < 2.2e-16 ***
## nkidsbio 0.171767 0.060486 2.8398 0.004514 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 29620
## Residual Sum of Squares: 22888
## R-Squared: 0.22738
## Adj. R-Squared: 0.22733
## Chisq: 8.06443 on 1 DF, p-value: 0.0045143
model1.1<- coeftest(model1, vcov. = vcovHC, type = "HC1") #get results with robust standard error
Run a fixed effect model and a pooled regression model to compare the results in three models
model2 <- plm(sat ~ nkidsbio, data=women_long, model="within") #fixed effect model
summary(model2) #results of fixed effect
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = sat ~ nkidsbio, data = women_long, model = "within")
##
## Unbalanced Panel: n = 3753, T = 1-6, N = 14809
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -8.00000 -0.50000 0.00000 0.59092 5.66667
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## nkidsbio 0.181837 0.068196 2.6664 0.007679 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 16966
## Residual Sum of Squares: 16956
## R-Squared: 0.00064269
## Adj. R-Squared: -0.33862
## F-statistic: 7.10949 on 1 and 11055 DF, p-value: 0.0076788
model2.1<- coeftest(model2, vcov. = vcovHC, type = "HC1") #get results with robust standard error
model3 <- plm(sat ~ nkidsbio, data=women_long, model="pooling") #pooled OLS
summary(model3) #results of pooled OLS
## Pooling Model
##
## Call:
## plm(formula = sat ~ nkidsbio, data = women_long, model = "pooling")
##
## Unbalanced Panel: n = 3753, T = 1-6, N = 14809
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -7.7834 -0.6381 0.3619 1.3619 2.3619
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## (Intercept) 7.638097 0.013859 551.1472 < 2e-16 ***
## nkidsbio 0.145319 0.059330 2.4493 0.01432 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 39832
## Residual Sum of Squares: 39816
## R-Squared: 0.00040499
## Adj. R-Squared: 0.00033749
## F-statistic: 5.99919 on 1 and 14807 DF, p-value: 0.014324
model3.1<- coeftest(model3, vcov. = vcovHC, type = "HC1") #get results with robust standard error
texreg::htmlreg(list(model1.1, model2.1, model3.1),
custom.model.names=c("Random effect", "Fixed effect", "Pooled OLS"),
include.ci = FALSE, omit.coef = "factor", center=TRUE,file = "compare2.html")
## The table was written to the file 'compare2.html'.
Do a BPLM test to see whether we should select a random effect or pooled OLS regression
plmtest(model3, type=c("bp"))
##
## Lagrange Multiplier Test - (Breusch-Pagan)
##
## data: sat ~ nkidsbio
## chisq = 5119.3, df = 1, p-value < 2.2e-16
## alternative hypothesis: significant effects
#BPLM test is rejected. The test suggests that a random effect model should be used.
Do a Hausman test to see whether we should select a random effect or fixed effect model
phtest(model2, model1)
##
## Hausman Test
##
## data: sat ~ nkidsbio
## chisq = 0.10219, df = 1, p-value = 0.7492
## alternative hypothesis: one model is inconsistent
#Hausam test is not rejected. The test suggests that a random effect model should be used.