Assignment 3

The beginning steps are the same steps from assignment 2, in which I create an age at first marriage variable.

#set up age at first marriage variable
library(car)

## Loading required package: carData

mdata$status <- Recode(mdata$fmarital,
                      recodes= "1 = 1; 5 = 0; 2 = 2; 3=2; 4=2")

mdata$evermarried<-Recode(mdata$fmarital,"1:4 = 1; 5= 0")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

##here I use nformwife and numwife as a way to point out if the respondent has been married more than once

##I make 4 groups: those who are married and this in their first marriage, those who are married and this is a previous marriage than their first, those who are divorce/separated or widowed, and the never married

marriedsetnfw <- filter(mdata, mdata$status==1 & numwife==1)

marriedsetfw<-filter(mdata, status==1  & nformwife>0)

nmarriedset <- filter(mdata, status ==0)

marriedinpast<-filter(mdata, status==2 & nformwife>0)

marriedsetnfw$ageevent <-marriedsetnfw$hisagem

marriedsetfw$ageevent<-marriedsetfw$agemarrn

nmarriedset$ageevent <- nmarriedset$ager

marriedinpast$ageevent<-marriedinpast$agemarrn

#after making the age of event variable the same (Ageevent) for all the groups I bind them together

atrisk <- rbind(marriedsetnfw, nmarriedset, marriedsetfw, marriedinpast)

#filter out NAs

atrisk<- atrisk%>%
  filter(ageevent!="NA")%>%
  filter(ageevent!=98)

Next, I filter my risk set for NH white and NH black men.

#filter for non-Hispanic white and non-Hispanic black men only

library(dplyr)

atrisk2<-atrisk%>%
  filter(hispanic==2)%>%
  filter(race!=3)

Here I create a variable for age at interview, then I use this to create age at birth. Next I calculate age at bachelors degree by comparing age to year earned bachelors.

#convert date of interview from century months to year

atrisk2$yrint <-((atrisk2$cmintvw-1)/12) + 1900

#calculate birth year 

atrisk2$yrbirth<-atrisk2$yrint-atrisk2$age_r

#select columns needed for analysis

atrisk3<-select(atrisk2, ageevent, evermarried, earnba_y, race, caseid, yrint, yrbirth, wgt2017_2019, havedeg, age_r) 

#calculate age at bachelors

atrisk3$agebach<-atrisk3$earnba_y-atrisk3$yrbirth%>%
  as.integer()

Next I make my risk set into a person-year file for discrete time analysis.

#create person-year file

library(survival)


pydata<-survSplit(atrisk3, cut=c(17:50),end="ageevent", event="evermarried")



#create a time varying bachelors variable
pydata$bacheventtv1<-ifelse(pydata$ageevent>pydata$agebach,1,0)

pydata1<-pydata%>%
  na.omit(bacheventtv1)


head(pydata1[, c("ageevent", "evermarried", "caseid", "agebach", "bacheventtv1", "race")], n=40)

##    ageevent evermarried caseid agebach bacheventtv1 race
## 1        17           0  80734      21            0    2
## 2        18           0  80734      21            0    2
## 3        19           0  80734      21            0    2
## 4        20           0  80734      21            0    2
## 5        21           0  80734      21            0    2
## 6        22           0  80734      21            1    2
## 7        23           0  80734      21            1    2
## 8        24           0  80734      21            1    2
## 9        25           0  80734      21            1    2
## 10       26           0  80734      21            1    2
## 11       27           1  80734      21            1    2
## 12       17           0  80739      23            0    2
## 13       18           0  80739      23            0    2
## 14       19           0  80739      23            0    2
## 15       20           0  80739      23            0    2
## 16       21           0  80739      23            0    2
## 17       22           0  80739      23            0    2
## 18       23           0  80739      23            0    2
## 19       24           0  80739      23            1    2
## 20       25           0  80739      23            1    2
## 21       26           0  80739      23            1    2
## 22       27           0  80739      23            1    2
## 23       28           0  80739      23            1    2
## 24       29           0  80739      23            1    2
## 25       30           0  80739      23            1    2
## 26       31           1  80739      23            1    2
## 77       17           0  80794      21            0    2
## 78       18           0  80794      21            0    2
## 79       19           0  80794      21            0    2
## 80       20           0  80794      21            0    2
## 81       21           0  80794      21            0    2
## 82       22           0  80794      21            1    2
## 83       23           0  80794      21            1    2
## 84       24           0  80794      21            1    2
## 85       25           0  80794      21            1    2
## 86       26           0  80794      21            1    2
## 87       27           0  80794      21            1    2
## 88       28           1  80794      21            1    2
## 94       17           0  80837      23            0    2
## 95       18           0  80837      23            0    2

Next I show some survival curves based on race, education, and both. The scale of 1 indicates complete survival and as you go down the y axis, the higher the risk of marriage.

#show the effects of race alone

library(ggsurvfit)

## Loading required package: ggplot2

pydata1$race <- car::Recode(pydata1$race, recodes="1='NH Black Men'; 2='NH White men'")


s2 <- survfit(Surv(ageevent, evermarried) ~ race, data = pydata1)
str(s2)

## List of 17
##  $ n        : int [1:2] 1896 9318
##  $ time     : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
##  $ n.risk   : num [1:66] 1896 1766 1636 1506 1377 ...
##  $ n.event  : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
##  $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
##  $ surv     : num [1:66] 1 1 0.999 0.998 0.997 ...
##  $ std.err  : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
##  $ cumhaz   : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
##  $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
##  $ strata   : Named int [1:2] 33 33
##   ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:66] 1 1 0.998 0.996 0.995 ...
##  $ upper    : num [1:66] 1 1 1 1 1 ...
##  $ call     : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
##  - attr(*, "class")= chr "survfit"

survfit2(Surv(ageevent, evermarried) ~ race, data = pydata1) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probabilitay"
    ) + 
  add_confidence_interval() +
  add_risktable()

#show the effects of edu alone

library(car)

pydata1$bachevent <- car::Recode(pydata1$bacheventtv1, recodes="1='BA/BS Degree'; 0='No College Degree'")

s3 <- survfit(Surv(ageevent, evermarried) ~ bacheventtv1, data = pydata1)
str(s2)

## List of 17
##  $ n        : int [1:2] 1896 9318
##  $ time     : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
##  $ n.risk   : num [1:66] 1896 1766 1636 1506 1377 ...
##  $ n.event  : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
##  $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
##  $ surv     : num [1:66] 1 1 0.999 0.998 0.997 ...
##  $ std.err  : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
##  $ cumhaz   : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
##  $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
##  $ strata   : Named int [1:2] 33 33
##   ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:66] 1 1 0.998 0.996 0.995 ...
##  $ upper    : num [1:66] 1 1 1 1 1 ...
##  $ call     : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
##  - attr(*, "class")= chr "survfit"

survfit2(Surv(ageevent, evermarried) ~ bachevent, data = pydata1) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probabilitay"
    ) + 
  add_confidence_interval() +
  add_risktable()

#show the effects of race and edu


s3 <- survfit(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1)
str(s2)

## List of 17
##  $ n        : int [1:2] 1896 9318
##  $ time     : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
##  $ n.risk   : num [1:66] 1896 1766 1636 1506 1377 ...
##  $ n.event  : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
##  $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
##  $ surv     : num [1:66] 1 1 0.999 0.998 0.997 ...
##  $ std.err  : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
##  $ cumhaz   : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
##  $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
##  $ strata   : Named int [1:2] 33 33
##   ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:66] 1 1 0.998 0.996 0.995 ...
##  $ upper    : num [1:66] 1 1 1 1 1 ...
##  $ call     : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
##  - attr(*, "class")= chr "survfit"

survfit2(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probabilitay"
    ) + 
  add_confidence_interval() +
  add_risktable()

You can see here that NH black men with a bachelor’s degree have the highest survival for getting married while NH white men without a bachelor’s degree have the lowest survival.

Now I add weights to the data and create a visual for comparison.

#Create a graph based on your PY file showing how the probability of marrying at each age differs by whether or not respondent has a degree for non-Hispanic white and black men.


raceeducprobs<- pydata1 %>%
    group_by(ageevent, race, bachevent) %>%
    dplyr::summarize(p=weighted.mean(evermarried,wgt2017_2019,na.rm = TRUE),n=n())

## `summarise()` has grouped output by 'ageevent', 'race'. You can override using
## the `.groups` argument.

raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)

raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)

raceeducprobs$me <- 2*raceeducprobs$sep

ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=bachevent, color = bachevent, group = bachevent)) +
     geom_line() + ylim(0,.15) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education", 
       subtitle="US-Born Men Ages 18 to 49", 
       caption="Source: 2017-2019 Men's files") +
       xlab(label="Age at First Marriage") +
  ylab(label="Ajusted First Marriage Rate") + facet_grid(~race)

This visual seems to have some issues, but the survival curves explain that education and race do matter in determining age at first marriage. The differences are more visible when comparing race alone. The patterns for age at first marriage race by education status are not as clear.

Next, I run two models: one for NH whites and one for NH blacks using age as a categorical variable and the time varying variable of whether the respondent had a bachelor’s degree before getting married as a dichotomous variable.

#add weights
pydata1$wt<-pydata1$wgt2017_2019/mean(pydata1$wgt2017_2019)

#age group as a categorical method

pydata1 <- pydata1 %>% 
  mutate(agegrp = (case_when(ageevent %in% c(15:19) ~ '15-19',
                             ageevent %in% c(20:24) ~ '20-24',
                             ageevent %in% c(24:29) ~ '25-29',
                             ageevent %in% c(30:34) ~ '30-34',
                             ageevent %in% c(35:39) ~ '35-39',
                             ageevent %in% c(40:44) ~ '40-44',
                             ageevent %in% c(45:49) ~ '45-49'
                             )
                   )
         )

#filter for whites

pydatawhite<-pydata1%>%
  filter(race=='NH White men')

whitemodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatawhite)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(whitemodelcat)

## 
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial", 
##     data = pydatawhite, weights = wt)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -7.1843     0.7470  -9.618  < 2e-16 ***
## agegrp20-24    3.7237     0.7534   4.943 7.71e-07 ***
## agegrp25-29    4.5664     0.7583   6.022 1.72e-09 ***
## agegrp30-34    4.6872     0.7627   6.145 7.98e-10 ***
## agegrp35-39    4.2907     0.7797   5.503 3.74e-08 ***
## agegrp40-44    3.8940     0.8374   4.650 3.32e-06 ***
## agegrp45-49    3.3150     1.1444   2.897  0.00377 ** 
## bacheventtv1   0.5286     0.1286   4.111 3.94e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4132.8  on 9317  degrees of freedom
## Residual deviance: 3691.6  on 9310  degrees of freedom
## AIC: 3718.9
## 
## Number of Fisher Scoring iterations: 9

#filter for blacks
pydatablack<-pydata1%>%
  filter(race=='NH Black Men')

blackmodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatablack)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(blackmodelcat)

## 
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial", 
##     data = pydatablack, weights = wt)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.0828     0.7639  -6.654 2.85e-11 ***
## agegrp20-24     1.7324     0.8108   2.137   0.0326 *  
## agegrp25-29     1.7825     0.8718   2.045   0.0409 *  
## agegrp30-34     1.8386     0.9083   2.024   0.0430 *  
## agegrp35-39     2.1710     0.9563   2.270   0.0232 *  
## agegrp40-44   -13.0490   701.9704  -0.019   0.9852    
## agegrp45-49   -13.0788  1152.3491  -0.011   0.9909    
## bacheventtv1    0.7210     0.3945   1.828   0.0676 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 430.26  on 1895  degrees of freedom
## Residual deviance: 402.06  on 1888  degrees of freedom
## AIC: 420.75
## 
## Number of Fisher Scoring iterations: 16

#The reference group for age is age 15-19.

#The white model shows significant effects for each age category as well as the time varying variable for if someone had a bachelors degree prior to the event. We can see that risk of marriage peaks for white men in age 30-34.

#The standard error for the age groups 40-44 and 45-49 in the black sample is large likely due to the fact that there is a small sample of people experiencing the event. Those two age groups and the time varying degree for whether someone had a bachelors degree prior to the event are not significant.

Lastly, I calculate the predicted probabilities for each age group.

#calculate predicted probabilities 

#categorical white with white
whitedatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)
whitedatacat$phat <- predict(whitemodelcat, whitedatacat, type="response")

whitedatacat

##    agegrp bacheventtv1         phat
## 1   15-19            0 0.0007578569
## 2   20-24            0 0.0304558475
## 3   25-29            0 0.0679948275
## 4   30-34            0 0.0760678256
## 5   35-39            0 0.0524750559
## 6   40-44            0 0.0359064375
## 7   45-49            0 0.0204466953
## 8   15-19            1 0.0012851241
## 9   20-24            1 0.0505987756
## 10  25-29            1 0.1101448046
## 11  30-34            1 0.1225641380
## 12  35-39            1 0.0858909471
## 13  40-44            1 0.0594333016
## 14  45-49            1 0.0342032910

#categorical black with black
blackdatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)

blackdatacat$phat <- predict(blackmodelcat, blackdatacat, type="response")

blackdatacat

##    agegrp bacheventtv1         phat
## 1   15-19            0 6.164374e-03
## 2   20-24            0 3.388204e-02
## 3   25-29            0 3.556291e-02
## 4   30-34            0 3.753588e-02
## 5   35-39            0 5.157388e-02
## 6   40-44            0 1.334889e-08
## 7   45-49            0 1.295697e-08
## 8   15-19            1 1.259534e-02
## 9   20-24            1 6.727205e-02
## 10  25-29            1 7.048853e-02
## 11  30-34            1 7.424993e-02
## 12  35-39            1 1.005836e-01
## 13  40-44            1 2.745272e-08
## 14  45-49            1 2.664672e-08

Here I recalculate the predicted probabilities using weighted means. I calculate the phats for NH white and black men using their weighted means, then I swap the means using the white means for the black model and the black means for the white model.

#weighted predicted probabilities

newwhitedata <- pydatawhite %>%                           
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

newwhitedata$phat <- predict(whitemodelcat, newwhitedata, type="response")

as.data.frame(newwhitedata)

##   agegrp bacheventtv1    n         phat
## 1  15-19    0.0000000 2119 0.0007578569
## 2  20-24    0.2250636 3329 0.0341723185
## 3  25-29    0.8045248 2160 0.1004170177
## 4  30-34    0.8922898 1028 0.1165710690
## 5  35-39    0.9376811  426 0.0833393913
## 6  40-44    0.9875819  198 0.0590673855
## 7  45-49    1.0000000   58 0.0342032910

newblackdata <- pydatablack %>% 
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

newblackdata$phat <- predict(blackmodelcat, newblackdata, type="response", se.fit = FALSE)

as.data.frame(newblackdata)

##   agegrp bacheventtv1   n         phat
## 1  15-19    0.0000000 390 6.164374e-03
## 2  20-24    0.1785535 622 3.835878e-02
## 3  25-29    0.7622716 454 6.005170e-02
## 4  30-34    0.8663114 237 6.789006e-02
## 5  35-39    0.8516483 119 9.131174e-02
## 6  40-44    0.9617991  54 2.670688e-08
## 7  45-49    1.0000000  20 2.664672e-08

#swap

#BLACK DATA WITH WHITE MODEL

whitemodel_blackdata<-pydatablack%>%
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

whitemodel_blackdata$phat <- predict(whitemodelcat, newblackdata, type="response")

as.data.frame(whitemodel_blackdata)

##   agegrp bacheventtv1   n         phat
## 1  15-19    0.0000000 390 0.0007578569
## 2  20-24    0.1785535 622 0.0333700550
## 3  25-29    0.7622716 454 0.0984171823
## 4  30-34    0.8663114 237 0.1151642119
## 5  35-39    0.8516483 119 0.0799301347
## 6  40-44    0.9617991  54 0.0583143914
## 7  45-49    1.0000000  20 0.0342032910

#WHITE DATA WITH BLACK MODEL

blackmodel_whitedata <- pydatawhite %>%                           
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

blackmodel_whitedata$phat <- predict(blackmodelcat, newwhitedata, type="response")

as.data.frame(blackmodel_whitedata)

##   agegrp bacheventtv1    n         phat
## 1  15-19    0.0000000 2119 6.164374e-03
## 2  20-24    0.2250636 3329 3.961514e-02
## 3  25-29    0.8045248 2160 6.179459e-02
## 4  30-34    0.8922898 1028 6.908503e-02
## 5  35-39    0.9376811  426 9.659095e-02
## 6  40-44    0.9875819  198 2.720801e-08
## 7  45-49    1.0000000   58 2.664672e-08

I calculate the probability of marrying before 50 and carry out a decomposition in excel to show how much of the difference in first marriage rate between NH white men and NH black men is due to differences in whether or not they have a bachelor’s degree.

Assignment 3

Rebecca Luttinen

2023-10-05