#set up age at first marriage variable
library(car)
## Loading required package: carData
mdata$status <- Recode(mdata$fmarital,
recodes= "1 = 1; 5 = 0; 2 = 2; 3=2; 4=2")
mdata$evermarried<-Recode(mdata$fmarital,"1:4 = 1; 5= 0")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##here I use nformwife and numwife as a way to point out if the respondent has been married more than once
##I make 4 groups: those who are married and this in their first marriage, those who are married and this is a previous marriage than their first, those who are divorce/separated or widowed, and the never married
marriedsetnfw <- filter(mdata, mdata$status==1 & numwife==1)
marriedsetfw<-filter(mdata, status==1 & nformwife>0)
nmarriedset <- filter(mdata, status ==0)
marriedinpast<-filter(mdata, status==2 & nformwife>0)
marriedsetnfw$ageevent <-marriedsetnfw$hisagem
marriedsetfw$ageevent<-marriedsetfw$agemarrn
nmarriedset$ageevent <- nmarriedset$ager
marriedinpast$ageevent<-marriedinpast$agemarrn
#after making the age of event variable the same (Ageevent) for all the groups I bind them together
atrisk <- rbind(marriedsetnfw, nmarriedset, marriedsetfw, marriedinpast)
#filter out NAs
atrisk<- atrisk%>%
filter(ageevent!="NA")%>%
filter(ageevent!=98)
#filter for non-Hispanic white and non-Hispanic black men only
library(dplyr)
atrisk2<-atrisk%>%
filter(hispanic==2)%>%
filter(race!=3)
#convert date of interview from century months to year
atrisk2$yrint <-((atrisk2$cmintvw-1)/12) + 1900
#calculate birth year
atrisk2$yrbirth<-atrisk2$yrint-atrisk2$age_r
#select columns needed for analysis
atrisk3<-select(atrisk2, ageevent, evermarried, earnba_y, race, caseid, yrint, yrbirth, wgt2017_2019, havedeg, age_r)
#calculate age at bachelors
atrisk3$agebach<-atrisk3$earnba_y-atrisk3$yrbirth
#create person-year file
library(survival)
pydata<-survSplit(atrisk3, cut=c(17:50),end="ageevent", event="evermarried")
#create a time varying bachelors variable
pydata$bacheventtv1<-ifelse(pydata$ageevent>pydata$agebach,1,0)
pydata1<-pydata%>%
na.omit(bacheventtv1)
head(pydata1[, c("ageevent", "evermarried", "caseid", "agebach", "bacheventtv1")], n=40)
## ageevent evermarried caseid agebach bacheventtv1
## 1 17 0 80734 20.41667 0
## 2 18 0 80734 20.41667 0
## 3 19 0 80734 20.41667 0
## 4 20 0 80734 20.41667 0
## 5 21 0 80734 20.41667 1
## 6 22 0 80734 20.41667 1
## 7 23 0 80734 20.41667 1
## 8 24 0 80734 20.41667 1
## 9 25 0 80734 20.41667 1
## 10 26 0 80734 20.41667 1
## 11 27 1 80734 20.41667 1
## 12 17 0 80739 22.50000 0
## 13 18 0 80739 22.50000 0
## 14 19 0 80739 22.50000 0
## 15 20 0 80739 22.50000 0
## 16 21 0 80739 22.50000 0
## 17 22 0 80739 22.50000 0
## 18 23 0 80739 22.50000 1
## 19 24 0 80739 22.50000 1
## 20 25 0 80739 22.50000 1
## 21 26 0 80739 22.50000 1
## 22 27 0 80739 22.50000 1
## 23 28 0 80739 22.50000 1
## 24 29 0 80739 22.50000 1
## 25 30 0 80739 22.50000 1
## 26 31 1 80739 22.50000 1
## 77 17 0 80794 21.00000 0
## 78 18 0 80794 21.00000 0
## 79 19 0 80794 21.00000 0
## 80 20 0 80794 21.00000 0
## 81 21 0 80794 21.00000 0
## 82 22 0 80794 21.00000 1
## 83 23 0 80794 21.00000 1
## 84 24 0 80794 21.00000 1
## 85 25 0 80794 21.00000 1
## 86 26 0 80794 21.00000 1
## 87 27 0 80794 21.00000 1
## 88 28 1 80794 21.00000 1
## 94 17 0 80837 22.58333 0
## 95 18 0 80837 22.58333 0
#show the effects of race alone
library(ggsurvfit)
## Loading required package: ggplot2
pydata1$race <- car::Recode(pydata1$race, recodes="1='NH Black Men'; 2='NH White men'")
s2 <- survfit(Surv(ageevent, evermarried) ~ race, data = pydata1)
str(s2)
## List of 17
## $ n : int [1:2] 1896 9318
## $ time : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
## $ n.risk : num [1:66] 1896 1766 1636 1506 1377 ...
## $ n.event : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
## $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
## $ surv : num [1:66] 1 1 0.999 0.998 0.997 ...
## $ std.err : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
## $ cumhaz : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
## $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
## $ strata : Named int [1:2] 33 33
## ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:66] 1 1 0.998 0.996 0.995 ...
## $ upper : num [1:66] 1 1 1 1 1 ...
## $ call : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
## - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ race, data = pydata1) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probabilitay"
) +
add_confidence_interval() +
add_risktable()
#show the effects of edu alone
library(car)
pydata1$bachevent <- car::Recode(pydata1$bacheventtv1, recodes="1='BA/BS Degree'; 0='No College Degree'")
s3 <- survfit(Surv(ageevent, evermarried) ~ bacheventtv1, data = pydata1)
str(s2)
## List of 17
## $ n : int [1:2] 1896 9318
## $ time : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
## $ n.risk : num [1:66] 1896 1766 1636 1506 1377 ...
## $ n.event : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
## $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
## $ surv : num [1:66] 1 1 0.999 0.998 0.997 ...
## $ std.err : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
## $ cumhaz : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
## $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
## $ strata : Named int [1:2] 33 33
## ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:66] 1 1 0.998 0.996 0.995 ...
## $ upper : num [1:66] 1 1 1 1 1 ...
## $ call : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
## - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ bachevent, data = pydata1) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probabilitay"
) +
add_confidence_interval() +
add_risktable()
#show the effects of race and edu
s3 <- survfit(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1)
str(s2)
## List of 17
## $ n : int [1:2] 1896 9318
## $ time : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
## $ n.risk : num [1:66] 1896 1766 1636 1506 1377 ...
## $ n.event : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
## $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
## $ surv : num [1:66] 1 1 0.999 0.998 0.997 ...
## $ std.err : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
## $ cumhaz : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
## $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
## $ strata : Named int [1:2] 33 33
## ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
## $ type : chr "right"
## $ logse : logi TRUE
## $ conf.int : num 0.95
## $ conf.type: chr "log"
## $ lower : num [1:66] 1 1 0.998 0.996 0.995 ...
## $ upper : num [1:66] 1 1 1 1 1 ...
## $ call : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
## - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1) %>%
ggsurvfit() +
labs(
x = "Age",
y = "Cumulative survival probabilitay"
) +
add_confidence_interval() +
add_risktable()
#Create a graph based on your PY file showing how the probability of marrying at each age differs by whether or not respondent has a degree for non-Hispanic white and black men.
#add weights
pydata1$wt<-pydata1$wgt2017_2019/mean(pydata1$wgt2017_2019)
raceeducprobs<- pydata1 %>%
group_by(age_r, race, bachevent) %>%
dplyr::summarize(p=weighted.mean(evermarried,wt,na.rm = TRUE),n=n())%>%
na.omit(bacheventtv1)
## `summarise()` has grouped output by 'age_r', 'race'. You can override using the
## `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)
raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)
raceeducprobs$me <- 2*raceeducprobs$sep
ggplot(data =raceeducprobs, aes(x = age_r, y = p, ymin=p-me, ymax=p+me, fill=bachevent,color = bachevent, group = bachevent)) +
geom_line() + ylim(0,.15) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education",
subtitle="US-Born Men Ages 18 to 49",
caption="Source: 2017-2019 Men's files") +
xlab(label="Age at Survey") +
ylab(label="Ajusted First Marriage Rate") + facet_grid(~race)
#In this visual you can see that the first marriage rate is higher for
college educated white men at all ages, but the pattern is not as clear
for black men.
#age group as a categorical method
pydata1 <- pydata1 %>%
mutate(agegrp = (case_when(ageevent %in% c(15:19) ~ '15-19',
ageevent %in% c(20:24) ~ '20-24',
ageevent %in% c(24:29) ~ '25-29',
ageevent %in% c(30:34) ~ '30-34',
ageevent %in% c(35:39) ~ '35-39',
ageevent %in% c(40:44) ~ '40-44',
ageevent %in% c(45:49) ~ '45-49'
)
)
)
#filter for whites
pydatawhite<-pydata1%>%
filter(race=='NH White men')
whitemodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatawhite)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodelcat)
##
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial",
## data = pydatawhite, weights = wt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.1858 0.7470 -9.620 < 2e-16 ***
## agegrp20-24 3.4511 0.7565 4.562 5.06e-06 ***
## agegrp25-29 4.2225 0.7610 5.549 2.87e-08 ***
## agegrp30-34 4.3379 0.7650 5.670 1.43e-08 ***
## agegrp35-39 3.9378 0.7818 5.037 4.73e-07 ***
## agegrp40-44 3.5354 0.8392 4.213 2.52e-05 ***
## agegrp45-49 2.9556 1.1457 2.580 0.00989 **
## bacheventtv1 0.8896 0.1395 6.375 1.83e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4132.8 on 9317 degrees of freedom
## Residual deviance: 3664.0 on 9310 degrees of freedom
## AIC: 3692.1
##
## Number of Fisher Scoring iterations: 9
#filter for blacks
pydatablack<-pydata1%>%
filter(race=='NH Black Men')
blackmodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatablack)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodelcat)
##
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial",
## data = pydatablack, weights = wt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.0828 0.7639 -6.654 2.85e-11 ***
## agegrp20-24 1.8144 0.8123 2.234 0.02551 *
## agegrp25-29 2.1419 0.8622 2.484 0.01299 *
## agegrp30-34 2.2409 0.8969 2.499 0.01247 *
## agegrp35-39 2.5669 0.9456 2.715 0.00663 **
## agegrp40-44 -12.6246 702.4393 -0.018 0.98566
## agegrp45-49 -12.6333 1152.3491 -0.011 0.99125
## bacheventtv1 0.2755 0.3727 0.739 0.45982
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 430.26 on 1895 degrees of freedom
## Residual deviance: 404.96 on 1888 degrees of freedom
## AIC: 424.99
##
## Number of Fisher Scoring iterations: 16
#The reference group for age is age 15-19.
#The white model shows significant effects for each age category as well as the time varying variable for if someone had a bachelors degree prior to the event. We can see that risk of marriage peaks for white men in age 30-34.
#The standard error for the age groups 40-44 and 45-49 in the black sample is large likely due to the fact that there is a small sample of people experiencing the event. Those two age groups and the time varying degree for whether someone had a bachelors degree prior to the event are not significant.
#There is a large difference in AIC for the white sample (4311) and the black sample (497.78). This information is not useful for consideration since the samples are different, though.
#calculate predicted probabilities
#categorical white with white
whitedatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)
whitedatacat$phat <- predict(whitemodelcat, whitedatacat, type="response")
whitedatacat
## agegrp bacheventtv1 phat
## 1 15-19 0 0.0007566839
## 2 20-24 0 0.0233227668
## 3 25-29 0 0.0491126244
## 4 30-34 0 0.0547890898
## 5 35-39 0 0.0373974861
## 6 40-44 0 0.0253230993
## 7 45-49 0 0.0143403158
## 8 15-19 1 0.0018398986
## 9 20-24 1 0.0549340448
## 10 25-29 1 0.1116819528
## 11 30-34 1 0.1236498212
## 12 35-39 1 0.0863978589
## 13 40-44 1 0.0594804784
## 14 45-49 1 0.0342032910
#categorical black with black
blackdatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)
blackdatacat$phat <- predict(blackmodelcat, blackdatacat, type="response")
blackdatacat
## agegrp bacheventtv1 phat
## 1 15-19 0 6.164374e-03
## 2 20-24 0 3.667158e-02
## 3 25-29 0 5.017140e-02
## 4 30-34 0 5.510436e-02
## 5 35-39 0 7.475314e-02
## 6 40-44 0 2.040651e-08
## 7 45-49 0 2.023090e-08
## 8 15-19 1 8.103437e-03
## 9 20-24 1 4.774595e-02
## 10 25-29 1 6.504727e-02
## 11 30-34 1 7.133302e-02
## 12 35-39 1 9.617949e-02
## 13 40-44 1 2.687802e-08
## 14 45-49 1 2.664672e-08
#weighted predicted probabilities
newwhitedata <- pydatawhite %>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
newwhitedata$phat <- predict(whitemodelcat, newwhitedata, type="response")
as.data.frame(newwhitedata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.001082858 2119 0.0007574126
## 2 20-24 0.369061960 3329 0.0320958308
## 3 25-29 0.843750547 2160 0.0986180322
## 4 30-34 0.911505298 1028 0.1153687347
## 5 35-39 0.947156427 426 0.0827586431
## 6 40-44 0.990065530 198 0.0589879926
## 7 45-49 1.000000000 58 0.0342032910
newblackdata <- pydatablack %>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
newblackdata$phat <- predict(blackmodelcat, newblackdata, type="response", se.fit = FALSE)
as.data.frame(newblackdata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.0000000 390 6.164374e-03
## 2 20-24 0.2864599 622 3.956332e-02
## 3 25-29 0.8076380 454 6.189812e-02
## 4 30-34 0.8772952 237 6.912616e-02
## 5 35-39 0.8662148 119 9.302333e-02
## 6 40-44 1.0000000 54 2.687802e-08
## 7 45-49 1.0000000 20 2.664672e-08
#swap
#BLACK DATA WITH WHITE MODEL
whitemodel_blackdata<-pydatablack%>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
whitemodel_blackdata$phat <- predict(whitemodelcat, newblackdata, type="response")
as.data.frame(whitemodel_blackdata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.0000000 390 0.0007566839
## 2 20-24 0.2864599 622 0.0298898663
## 3 25-29 0.8076380 454 0.0957988774
## 4 30-34 0.8772952 237 0.1122989038
## 5 35-39 0.8662148 119 0.0774543393
## 6 40-44 1.0000000 54 0.0594804784
## 7 45-49 1.0000000 20 0.0342032910
#WHITE DATA WITH BLACK MODEL
blackmodel_whitedata <- pydatawhite %>%
group_by(agegrp) %>%
dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())
blackmodel_whitedata$phat <- predict(blackmodelcat, newwhitedata, type="response")
as.data.frame(blackmodel_whitedata)
## agegrp bacheventtv1 n phat
## 1 15-19 0.001082858 2119 6.166201e-03
## 2 20-24 0.369061960 3329 4.043701e-02
## 3 25-29 0.843750547 2160 6.247826e-02
## 4 30-34 0.911505298 1028 6.973500e-02
## 5 35-39 0.947156427 426 9.492157e-02
## 6 40-44 0.990065530 198 2.680457e-08
## 7 45-49 1.000000000 58 2.664672e-08