The beginning steps are the same steps from assignment 2, in which I create an age at first marriage variable.
#set up age at first marriage variable
library(car)
## Loading required package: carData
mdata$status <- Recode(mdata$fmarital,
recodes= "1 = 1; 5 = 0; 2 = 2; 3=2; 4=2")
mdata$evermarried<-Recode(mdata$fmarital,"1:4 = 1; 5= 0")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##here I use nformwife and numwife as a way to point out if the respondent has been married more than once
##I make 4 groups: those who are married and this in their first marriage, those who are married and this is a previous marriage than their first, those who are divorce/separated or widowed, and the never married
marriedsetnfw <- filter(mdata, mdata$status==1 & numwife==1)
marriedsetfw<-filter(mdata, status==1 & nformwife>0)
nmarriedset <- filter(mdata, status ==0)
marriedinpast<-filter(mdata, status==2 & nformwife>0)
marriedsetnfw$ageevent <-marriedsetnfw$hisagem
marriedsetfw$ageevent<-marriedsetfw$agemarrn
nmarriedset$ageevent <- nmarriedset$ager
marriedinpast$ageevent<-marriedinpast$agemarrn
#after making the age of event variable the same (Ageevent) for all the groups I bind them together
atrisk <- rbind(marriedsetnfw, nmarriedset, marriedsetfw, marriedinpast)
#filter out NAs
atrisk<- atrisk%>%
filter(ageevent!="NA")%>%
filter(ageevent!=98)
Next, I filter my risk set for NH white and NH black men.
#filter for non-Hispanic white and non-Hispanic black men only
library(dplyr)
atrisk2<-atrisk%>%
filter(hispanic==2)%>%
filter(race!=3)
Here I create a variable for age at interview, then I use this to create age at birth. Next I calculate age at bachelors degree by comparing age to year earned bachelors.
#convert date of interview from century months to year
atrisk2$yrint <-((atrisk2$cmintvw-1)/12) + 1900
#calculate birth year
atrisk2$yrbirth<-atrisk2$yrint-atrisk2$age_r
#select columns needed for analysis
atrisk3<-select(atrisk2, ageevent, evermarried, earnba_y, race, caseid, yrint, yrbirth, wgt2017_2019, havedeg, age_r)
#calculate age at bachelors
atrisk3$agebach<-atrisk3$earnba_y-atrisk3$yrbirth%>%
as.integer()
Next I make my risk set into a person-year file for discrete time analysis.
#create person-year file
library(survival)
pydata<-survSplit(atrisk3, cut=c(17:50),end="ageevent", event="evermarried")
#create a time varying bachelors variable
pydata$bacheventtv1<-ifelse(pydata$ageevent>pydata$agebach,1,0)
pydata1<-pydata%>%
na.omit(bacheventtv1)
head(pydata1[, c("ageevent", "evermarried", "caseid", "agebach", "bacheventtv1", "race")], n=40)
## ageevent evermarried caseid agebach bacheventtv1 race
## 1 17 0 80734 21 0 2
## 2 18 0 80734 21 0 2
## 3 19 0 80734 21 0 2
## 4 20 0 80734 21 0 2
## 5 21 0 80734 21 0 2
## 6 22 0 80734 21 1 2
## 7 23 0 80734 21 1 2
## 8 24 0 80734 21 1 2
## 9 25 0 80734 21 1 2
## 10 26 0 80734 21 1 2
## 11 27 1 80734 21 1 2
## 12 17 0 80739 23 0 2
## 13 18 0 80739 23 0 2
## 14 19 0 80739 23 0 2
## 15 20 0 80739 23 0 2
## 16 21 0 80739 23 0 2
## 17 22 0 80739 23 0 2
## 18 23 0 80739 23 0 2
## 19 24 0 80739 23 1 2
## 20 25 0 80739 23 1 2
## 21 26 0 80739 23 1 2
## 22 27 0 80739 23 1 2
## 23 28 0 80739 23 1 2
## 24 29 0 80739 23 1 2
## 25 30 0 80739 23 1 2
## 26 31 1 80739 23 1 2
## 77 17 0 80794 21 0 2
## 78 18 0 80794 21 0 2
## 79 19 0 80794 21 0 2
## 80 20 0 80794 21 0 2
## 81 21 0 80794 21 0 2
## 82 22 0 80794 21 1 2
## 83 23 0 80794 21 1 2
## 84 24 0 80794 21 1 2
## 85 25 0 80794 21 1 2
## 86 26 0 80794 21 1 2
## 87 27 0 80794 21 1 2
## 88 28 1 80794 21 1 2
## 94 17 0 80837 23 0 2
## 95 18 0 80837 23 0 2
Next I show some survival curves based on race, education, and both. The scale of 1 indicates complete survival and as you go down the y axis, the higher the risk of marriage.
#show the effects of race alone
library(ggsurvfit)
## Loading required package: ggplot2
pydata1$race <- car::Recode(pydata1$race, recodes="1='NH Black Men'; 2='NH White men'")
s2 <- survfit(Surv(ageevent, evermarried) ~ race, data = pydata1)
str(s2)
## List of 17
## $ n : int [1:2] 1896 9318
## $ time : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
## $ n.risk : num [1:66] 1896 1766 1636 1506 1377 ...
## $ n.event : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
## $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
## $ surv : num [1:66] 1 1 0.999 0.998 0.997 ...
## $ std.err : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
## $ cumhaz : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
## $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
## $ strata : Named int [1:2] 33 33
## ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:66] 1 1 0.998 0.996 0.995 ...
## $ upper : num [1:66] 1 1 1 1 1 ...
## $ call : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
## - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ race, data = pydata1) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probabilitay"
) +
add_confidence_interval() +
add_risktable()
#show the effects of edu alone
library(car)
pydata1$bachevent <- car::Recode(pydata1$bacheventtv1, recodes="1='BA/BS Degree'; 0='No College Degree'")
s3 <- survfit(Surv(ageevent, evermarried) ~ bacheventtv1, data = pydata1)
str(s2)
## List of 17
## $ n : int [1:2] 1896 9318
## $ time : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
## $ n.risk : num [1:66] 1896 1766 1636 1506 1377 ...
## $ n.event : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
## $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
## $ surv : num [1:66] 1 1 0.999 0.998 0.997 ...
## $ std.err : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
## $ cumhaz : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
## $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
## $ strata : Named int [1:2] 33 33
## ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:66] 1 1 0.998 0.996 0.995 ...
## $ upper : num [1:66] 1 1 1 1 1 ...
## $ call : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
## - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ bachevent, data = pydata1) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probabilitay"
) +
add_confidence_interval() +
add_risktable()
#show the effects of race and edu
s3 <- survfit(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1)
str(s2)
## List of 17
## $ n : int [1:2] 1896 9318
## $ time : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
## $ n.risk : num [1:66] 1896 1766 1636 1506 1377 ...
## $ n.event : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
## $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
## $ surv : num [1:66] 1 1 0.999 0.998 0.997 ...
## $ std.err : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
## $ cumhaz : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
## $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
## $ strata : Named int [1:2] 33 33
## ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:66] 1 1 0.998 0.996 0.995 ...
## $ upper : num [1:66] 1 1 1 1 1 ...
## $ call : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
## - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probabilitay"
) +
add_confidence_interval() +
add_risktable()
You can see here that NH black men with a bachelor’s degree have the
highest survival for getting married while NH white men without a
bachelor’s degree have the lowest survival.
Now I add weights to the data and create a visual for comparison.
#Create a graph based on your PY file showing how the probability of marrying at each age differs by whether or not respondent has a degree for non-Hispanic white and black men.
raceeducprobs<- pydata1 %>%
group_by(ageevent, race, bachevent) %>%
dplyr::summarize(p=weighted.mean(evermarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'race'. You can override using
## the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)
raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)
raceeducprobs$me <- 2*raceeducprobs$sep
ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=bachevent, color = bachevent, group = bachevent)) +
geom_line() + ylim(0,.15) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education",
subtitle="US-Born Men Ages 18 to 49",
caption="Source: 2017-2019 Men's files") +
xlab(label="Age at First Marriage") +
ylab(label="Ajusted First Marriage Rate") + facet_grid(~race)
This visual seems to have some issues, but the survival curves explain
that education and race do matter in determining age at first marriage.
The differences are more visible when comparing race alone. The patterns
for age at first marriage race by education status are not as clear.
Next, I run two models: one for NH whites and one for NH blacks using age as a categorical variable and the time varying variable of whether the respondent had a bachelor’s degree before getting married as a dichotomous variable.
#add weights
pydata1$wt<-pydata1$wgt2017_2019/mean(pydata1$wgt2017_2019)
#age group as a categorical method
pydata1 <- pydata1 %>%
mutate(agegrp = (case_when(ageevent %in% c(15:19) ~ '15-19',
ageevent %in% c(20:24) ~ '20-24',
ageevent %in% c(24:29) ~ '25-29',
ageevent %in% c(30:34) ~ '30-34',
ageevent %in% c(35:39) ~ '35-39',
ageevent %in% c(40:44) ~ '40-44',
ageevent %in% c(45:49) ~ '45-49'
)
)
)
#filter for whites
pydatawhite<-pydata1%>%
filter(race=='NH White men')
whitemodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatawhite)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodelcat)
##
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial",
## data = pydatawhite, weights = wt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.1843 0.7470 -9.618 < 2e-16 ***
## agegrp20-24 3.7237 0.7534 4.943 7.71e-07 ***
## agegrp25-29 4.5664 0.7583 6.022 1.72e-09 ***
## agegrp30-34 4.6872 0.7627 6.145 7.98e-10 ***
## agegrp35-39 4.2907 0.7797 5.503 3.74e-08 ***
## agegrp40-44 3.8940 0.8374 4.650 3.32e-06 ***
## agegrp45-49 3.3150 1.1444 2.897 0.00377 **
## bacheventtv1 0.5286 0.1286 4.111 3.94e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4132.8 on 9317 degrees of freedom
## Residual deviance: 3691.6 on 9310 degrees of freedom
## AIC: 3718.9
##
## Number of Fisher Scoring iterations: 9
#filter for blacks
pydatablack<-pydata1%>%
filter(race=='NH Black Men')
blackmodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatablack)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodelcat)
##
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial",
## data = pydatablack, weights = wt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.0828 0.7639 -6.654 2.85e-11 ***
## agegrp20-24 1.7324 0.8108 2.137 0.0326 *
## agegrp25-29 1.7825 0.8718 2.045 0.0409 *
## agegrp30-34 1.8386 0.9083 2.024 0.0430 *
## agegrp35-39 2.1710 0.9563 2.270 0.0232 *
## agegrp40-44 -13.0490 701.9704 -0.019 0.9852
## agegrp45-49 -13.0788 1152.3491 -0.011 0.9909
## bacheventtv1 0.7210 0.3945 1.828 0.0676 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 430.26 on 1895 degrees of freedom
## Residual deviance: 402.06 on 1888 degrees of freedom
## AIC: 420.75
##
## Number of Fisher Scoring iterations: 16
#The reference group for age is age 15-19.
#The white model shows significant effects for each age category as well as the time varying variable for if someone had a bachelors degree prior to the event. We can see that risk of marriage peaks for white men in age 30-34.
#The standard error for the age groups 40-44 and 45-49 in the black sample is large likely due to the fact that there is a small sample of people experiencing the event. Those two age groups and the time varying degree for whether someone had a bachelors degree prior to the event are not significant.
Lastly, I calculate the predicted probabilities for each age group.
#calculate predicted probabilities
#categorical white with white
whitedatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)
whitedatacat$phat <- predict(whitemodelcat, whitedatacat, type="response")
whitedatacat
## agegrp bacheventtv1 phat
## 1 15-19 0 0.0007578569
## 2 20-24 0 0.0304558475
## 3 25-29 0 0.0679948275
## 4 30-34 0 0.0760678256
## 5 35-39 0 0.0524750559
## 6 40-44 0 0.0359064375
## 7 45-49 0 0.0204466953
## 8 15-19 1 0.0012851241
## 9 20-24 1 0.0505987756
## 10 25-29 1 0.1101448046
## 11 30-34 1 0.1225641380
## 12 35-39 1 0.0858909471
## 13 40-44 1 0.0594333016
## 14 45-49 1 0.0342032910
#categorical black with black
blackdatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)
blackdatacat$phat <- predict(blackmodelcat, blackdatacat, type="response")
blackdatacat
## agegrp bacheventtv1 phat
## 1 15-19 0 6.164374e-03
## 2 20-24 0 3.388204e-02
## 3 25-29 0 3.556291e-02
## 4 30-34 0 3.753588e-02
## 5 35-39 0 5.157388e-02
## 6 40-44 0 1.334889e-08
## 7 45-49 0 1.295697e-08
## 8 15-19 1 1.259534e-02
## 9 20-24 1 6.727205e-02
## 10 25-29 1 7.048853e-02
## 11 30-34 1 7.424993e-02
## 12 35-39 1 1.005836e-01
## 13 40-44 1 2.745272e-08
## 14 45-49 1 2.664672e-08
Here I recalculate the predicted probabilities using weighted means. I calculate the phats for NH white and black men using their weighted means, then I swap the means using the white means for the black model and the black means for the white model.
#weighted predicted probabilities
newwhitedata <- pydatawhite %>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
newwhitedata$phat <- predict(whitemodelcat, newwhitedata, type="response")
as.data.frame(newwhitedata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.0000000 2119 0.0007578569
## 2 20-24 0.2250636 3329 0.0341723185
## 3 25-29 0.8045248 2160 0.1004170177
## 4 30-34 0.8922898 1028 0.1165710690
## 5 35-39 0.9376811 426 0.0833393913
## 6 40-44 0.9875819 198 0.0590673855
## 7 45-49 1.0000000 58 0.0342032910
newblackdata <- pydatablack %>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
newblackdata$phat <- predict(blackmodelcat, newblackdata, type="response", se.fit = FALSE)
as.data.frame(newblackdata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.0000000 390 6.164374e-03
## 2 20-24 0.1785535 622 3.835878e-02
## 3 25-29 0.7622716 454 6.005170e-02
## 4 30-34 0.8663114 237 6.789006e-02
## 5 35-39 0.8516483 119 9.131174e-02
## 6 40-44 0.9617991 54 2.670688e-08
## 7 45-49 1.0000000 20 2.664672e-08
#swap
#BLACK DATA WITH WHITE MODEL
whitemodel_blackdata<-pydatablack%>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
whitemodel_blackdata$phat <- predict(whitemodelcat, newblackdata, type="response")
as.data.frame(whitemodel_blackdata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.0000000 390 0.0007578569
## 2 20-24 0.1785535 622 0.0333700550
## 3 25-29 0.7622716 454 0.0984171823
## 4 30-34 0.8663114 237 0.1151642119
## 5 35-39 0.8516483 119 0.0799301347
## 6 40-44 0.9617991 54 0.0583143914
## 7 45-49 1.0000000 20 0.0342032910
#WHITE DATA WITH BLACK MODEL
blackmodel_whitedata <- pydatawhite %>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
blackmodel_whitedata$phat <- predict(blackmodelcat, newwhitedata, type="response")
as.data.frame(blackmodel_whitedata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.0000000 2119 6.164374e-03
## 2 20-24 0.2250636 3329 3.961514e-02
## 3 25-29 0.8045248 2160 6.179459e-02
## 4 30-34 0.8922898 1028 6.908503e-02
## 5 35-39 0.9376811 426 9.659095e-02
## 6 40-44 0.9875819 198 2.720801e-08
## 7 45-49 1.0000000 58 2.664672e-08
I calculate the probability of marrying before 50 and carry out a decomposition in excel to show how much of the difference in first marriage rate between NH white men and NH black men is due to differences in whether or not they have a bachelor’s degree.