Based on pairfam wave1 dataset, compile a dataset consists of variables as following:
age, sex(sex_gen), life satisfaction (sat6), relstat (relationship), no.of children(nkidsbio), weight (cdweight)
Drop observations with missing values
#Question1a: Based on pairfam wave1 dataset, compile a dataset consists of variables as following:age, sex(sex_gen), life satisfaction (sat6), relstat (relationship), no.of children(nkidsbio), weight (cdweight)
#Question1b: Drop observations with missing values
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(Hmisc) # Weighting
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
dataset1 <- read_dta("anchor1_50percent_Eng.dta")
#a dataset of age, sex, life satisfaction, no.of children ever born, weight
dataset2 <- dataset1 %>%
transmute(
age=zap_labels(age), #remove labels of age
sex_gen=as_factor(sex_gen) %>% fct_drop(), #treat sex_gen as a categorical, and drop unused levels
nkidsbio=case_when(nkidsbio<0 ~ as.numeric(NA),TRUE ~ as.numeric(nkidsbio)) %>%
zap_label(), #specify when nkidsbio should be missing and then remove the label of nkidsbio
sat6=case_when(sat6<0 ~ as.numeric(NA), TRUE ~ as.numeric(sat6)) %>% #specify when sat should be considered missing
zap_label(), #remove labels of sat6
relstat=as_factor(relstat), #treat relationship status as categorical
relstat=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), TRUE ~ as.character(relstat)) %>% #specify when relstat should be considered missing
as_factor() %>% #make relstat as a factor again, as the ~ as.character(NA) has made it a character variable
fct_drop(), #drop unused levels in relstat
cdweight=zap_label(cdweight)) %>% #remove labels of cdweight
drop_na() #remove all observations that are missing
#Question2a:Generate a new variable "marital" for relationship status with categories of "Never married", "Married", "Separate/Divorced", "Widowed".
#Question2b:Generate a new variable "parenthood" based on nkidsbio with categories of "Have kids", "No kids".
dataset3 <- dataset2 %>%
mutate(
marital=case_when(
relstat %in% c("1 Never married single","2 Never married LAT","3 Never married COHAB") ~ "Nevermarried",
# when relstat has any of the three situations, I assign "Nevermarried" to new variable "marital1"
relstat %in% c("4 Married COHAB","5 Married noncohabiting") ~ 'Married',
# when relstat has any of the two situations, I assign "Married" to new variable "marital1
relstat %in% c("6 Divorced/separated single","7 Divorced/separated LAT","8 Divorced/separated COHAB") ~ 'Divorced',
# when relstat has any of the three situations, I assign "Divorced" to new variable "marital1"
relstat %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widow'
# when relstat has any of the two situations, I assign "Widow" to new variable "marital1"
) %>% as_factor(),
#after case_when, marital is a character variable, so make it a facto nor
parenthood=case_when(
nkidsbio>0 ~ "Have kids",
# when nkidsbio should be "Have kids"
nkidsbio==0 ~ "No kids") %>% # when nkidsbio should be "No kids"
as_factor() #make parenthood a categorical variable
)
#Question3a: Model1: regress life satisfaction on marital status and parenthood while not considering the weight
#Question3b: Model2: regress life satisfaction on marital status and parenthood while considering the weight
#Question3c: Model3: regress life satisfaction on marital status and parenthood while considering the weight and robust standard error
#Question3d: The coefficient for marital status is _____. The coefficient for parenthood is _____.
#check variables first
table(dataset3$marital) # there are only 4 observations in "Widow"
##
## Nevermarried Married Divorced Widow
## 4114 1756 284 4
table(dataset3$parenthood)
##
## No kids Have kids
## 4105 2053
#Therefore, I decided to drop the 4 widow observations
dataset4 <- filter(dataset3, marital!= "Widow") #exclude respondents who are widow
regression1 <- lm(data = dataset4,
formula = sat6 ~ marital + parenthood)
#ols regression without considering weight and robust standard errors
summary(regression1)
##
## Call:
## lm(formula = sat6 ~ marital + parenthood, data = dataset4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0205 -0.6253 0.3747 1.3747 3.3752
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.62532 0.02768 275.519 < 2e-16 ***
## maritalMarried 0.39515 0.07162 5.518 3.58e-08 ***
## maritalDivorced -0.74063 0.11734 -6.312 2.95e-10 ***
## parenthoodHave kids -0.25987 0.06964 -3.732 0.000192 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.732 on 6150 degrees of freedom
## Multiple R-squared: 0.01892, Adjusted R-squared: 0.01844
## F-statistic: 39.52 on 3 and 6150 DF, p-value: < 2.2e-16
regression2 <- lm(data = dataset4,
formula = sat6 ~ marital + parenthood,
weights = cdweight)
#ols regression with considering weight
summary(regression2)
##
## Call:
## lm(formula = sat6 ~ marital + parenthood, data = dataset4, weights = cdweight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -13.0649 -0.6243 0.3895 1.2512 4.3363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.52851 0.02822 266.748 < 2e-16 ***
## maritalMarried 0.42713 0.06929 6.164 7.52e-10 ***
## maritalDivorced -0.87381 0.12471 -7.007 2.70e-12 ***
## parenthoodHave kids -0.21093 0.06884 -3.064 0.00219 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.781 on 6150 degrees of freedom
## Multiple R-squared: 0.02103, Adjusted R-squared: 0.02055
## F-statistic: 44.04 on 3 and 6150 DF, p-value: < 2.2e-16
regression3 <- lm_robust(data = dataset4,
formula = sat6 ~ marital + parenthood,
weights = cdweight)
#ols regression with considering weight and robust standard errors
summary(regression3)
##
## Call:
## lm_robust(formula = sat6 ~ marital + parenthood, data = dataset4,
## weights = cdweight)
##
## Weighted, Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 7.5285 0.03204 234.992 0.000e+00 7.4657 7.59131
## maritalMarried 0.4271 0.08673 4.925 8.663e-07 0.2571 0.59715
## maritalDivorced -0.8738 0.17238 -5.069 4.116e-07 -1.2117 -0.53588
## parenthoodHave kids -0.2109 0.08680 -2.430 1.513e-02 -0.3811 -0.04076
## DF
## (Intercept) 6150
## maritalMarried 6150
## maritalDivorced 6150
## parenthoodHave kids 6150
##
## Multiple R-squared: 0.02103 , Adjusted R-squared: 0.02055
## F-statistic: 27.01 on 3 and 6150 DF, p-value: < 2.2e-16
#Question4: Model 4: Add age and sex to Model3 as controls. What happen to the estimate for marital status and parenthood.
regression4 <- lm_robust(data = dataset4,
formula = sat6 ~ marital + parenthood + age + sex_gen,
weights = cdweight)
summary(regression4)
##
## Call:
## lm_robust(formula = sat6 ~ marital + parenthood + age + sex_gen,
## data = dataset4, weights = cdweight)
##
## Weighted, Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 8.53299 0.101602 83.9848 0.000e+00 8.33382 8.73217
## maritalMarried 0.70646 0.092586 7.6303 2.699e-14 0.52495 0.88796
## maritalDivorced -0.55662 0.177203 -3.1411 1.691e-03 -0.90400 -0.20924
## parenthoodHave kids 0.04590 0.092878 0.4942 6.212e-01 -0.13617 0.22797
## age -0.04473 0.004602 -9.7191 3.606e-22 -0.05375 -0.03571
## sex_gen2 Female 0.03982 0.051325 0.7759 4.379e-01 -0.06079 0.14044
## DF
## (Intercept) 6148
## maritalMarried 6148
## maritalDivorced 6148
## parenthoodHave kids 6148
## age 6148
## sex_gen2 Female 6148
##
## Multiple R-squared: 0.04644 , Adjusted R-squared: 0.04567
## F-statistic: 35.11 on 5 and 6148 DF, p-value: < 2.2e-16
Make a regression table of the results of Models 1, 2, 3 and 4.
#Question5: Make a regression table of the results of Models 1, 2, 3 and 4.
texreg::screenreg(list(regression1, regression2, regression3, regression4),
include.ci = FALSE, digits = 3)
##
## ===========================================================================
## Model 1 Model 2 Model 3 Model 4
## ---------------------------------------------------------------------------
## (Intercept) 7.625 *** 7.529 *** 7.529 *** 8.533 ***
## (0.028) (0.028) (0.032) (0.102)
## maritalMarried 0.395 *** 0.427 *** 0.427 *** 0.706 ***
## (0.072) (0.069) (0.087) (0.093)
## maritalDivorced -0.741 *** -0.874 *** -0.874 *** -0.557 **
## (0.117) (0.125) (0.172) (0.177)
## parenthoodHave kids -0.260 *** -0.211 ** -0.211 * 0.046
## (0.070) (0.069) (0.087) (0.093)
## age -0.045 ***
## (0.005)
## sex_gen2 Female 0.040
## (0.051)
## ---------------------------------------------------------------------------
## R^2 0.019 0.021 0.021 0.046
## Adj. R^2 0.018 0.021 0.021 0.046
## Num. obs. 6154 6154 6154 6154
## RMSE 1.781 1.758
## ===========================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
# include.ci=FALSE means don't include CI, digit=3 to specify the digits of your result.