#set up age at first marriage variable
library(car)
## Loading required package: carData
mdata$status <- Recode(mdata$fmarital,
                      recodes= "1 = 1; 5 = 0; 2 = 2; 3=2; 4=2")

mdata$evermarried<-Recode(mdata$fmarital,"1:4 = 1; 5= 0")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##here I use nformwife and numwife as a way to point out if the respondent has been married more than once

##I make 4 groups: those who are married and this in their first marriage, those who are married and this is a previous marriage than their first, those who are divorce/separated or widowed, and the never married

marriedsetnfw <- filter(mdata, mdata$status==1 & numwife==1)

marriedsetfw<-filter(mdata, status==1  & nformwife>0)

nmarriedset <- filter(mdata, status ==0)

marriedinpast<-filter(mdata, status==2 & nformwife>0)

marriedsetnfw$ageevent <-marriedsetnfw$hisagem

marriedsetfw$ageevent<-marriedsetfw$agemarrn

nmarriedset$ageevent <- nmarriedset$ager

marriedinpast$ageevent<-marriedinpast$agemarrn

#after making the age of event variable the same (Ageevent) for all the groups I bind them together

atrisk <- rbind(marriedsetnfw, nmarriedset, marriedsetfw, marriedinpast)

#filter out NAs

atrisk<- atrisk%>%
  filter(ageevent!="NA")%>%
  filter(ageevent!=98)
#filter for non-Hispanic white and non-Hispanic black men only

library(dplyr)

atrisk2<-atrisk%>%
  filter(hispanic==2)%>%
  filter(race!=3)
#convert date of interview from century months to year

atrisk2$yrint <-((atrisk2$cmintvw-1)/12) + 1900

#calculate birth year 

atrisk2$yrbirth<-atrisk2$yrint-atrisk2$age_r

#select columns needed for analysis

atrisk3<-select(atrisk2, ageevent, evermarried, earnba_y, race, caseid, yrint, yrbirth, wgt2017_2019, havedeg, age_r) 

#calculate age at bachelors

atrisk3$agebach<-atrisk3$earnba_y-atrisk3$yrbirth
#create person-year file

library(survival)


pydata<-survSplit(atrisk3, cut=c(17:50),end="ageevent", event="evermarried")



#create a time varying bachelors variable
pydata$bacheventtv1<-ifelse(pydata$ageevent>pydata$agebach,1,0)

pydata1<-pydata%>%
  na.omit(bacheventtv1)


head(pydata1[, c("ageevent", "evermarried", "caseid", "agebach", "bacheventtv1")], n=40)
##    ageevent evermarried caseid  agebach bacheventtv1
## 1        17           0  80734 20.41667            0
## 2        18           0  80734 20.41667            0
## 3        19           0  80734 20.41667            0
## 4        20           0  80734 20.41667            0
## 5        21           0  80734 20.41667            1
## 6        22           0  80734 20.41667            1
## 7        23           0  80734 20.41667            1
## 8        24           0  80734 20.41667            1
## 9        25           0  80734 20.41667            1
## 10       26           0  80734 20.41667            1
## 11       27           1  80734 20.41667            1
## 12       17           0  80739 22.50000            0
## 13       18           0  80739 22.50000            0
## 14       19           0  80739 22.50000            0
## 15       20           0  80739 22.50000            0
## 16       21           0  80739 22.50000            0
## 17       22           0  80739 22.50000            0
## 18       23           0  80739 22.50000            1
## 19       24           0  80739 22.50000            1
## 20       25           0  80739 22.50000            1
## 21       26           0  80739 22.50000            1
## 22       27           0  80739 22.50000            1
## 23       28           0  80739 22.50000            1
## 24       29           0  80739 22.50000            1
## 25       30           0  80739 22.50000            1
## 26       31           1  80739 22.50000            1
## 77       17           0  80794 21.00000            0
## 78       18           0  80794 21.00000            0
## 79       19           0  80794 21.00000            0
## 80       20           0  80794 21.00000            0
## 81       21           0  80794 21.00000            0
## 82       22           0  80794 21.00000            1
## 83       23           0  80794 21.00000            1
## 84       24           0  80794 21.00000            1
## 85       25           0  80794 21.00000            1
## 86       26           0  80794 21.00000            1
## 87       27           0  80794 21.00000            1
## 88       28           1  80794 21.00000            1
## 94       17           0  80837 22.58333            0
## 95       18           0  80837 22.58333            0
#show the effects of race alone

library(ggsurvfit)
## Loading required package: ggplot2
pydata1$race <- car::Recode(pydata1$race, recodes="1='NH Black Men'; 2='NH White men'")


s2 <- survfit(Surv(ageevent, evermarried) ~ race, data = pydata1)
str(s2)
## List of 17
##  $ n        : int [1:2] 1896 9318
##  $ time     : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
##  $ n.risk   : num [1:66] 1896 1766 1636 1506 1377 ...
##  $ n.event  : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
##  $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
##  $ surv     : num [1:66] 1 1 0.999 0.998 0.997 ...
##  $ std.err  : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
##  $ cumhaz   : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
##  $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
##  $ strata   : Named int [1:2] 33 33
##   ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:66] 1 1 0.998 0.996 0.995 ...
##  $ upper    : num [1:66] 1 1 1 1 1 ...
##  $ call     : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
##  - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ race, data = pydata1) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probabilitay"
    ) + 
  add_confidence_interval() +
  add_risktable()

#show the effects of edu alone

library(car)

pydata1$bachevent <- car::Recode(pydata1$bacheventtv1, recodes="1='BA/BS Degree'; 0='No College Degree'")

s3 <- survfit(Surv(ageevent, evermarried) ~ bacheventtv1, data = pydata1)
str(s2)
## List of 17
##  $ n        : int [1:2] 1896 9318
##  $ time     : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
##  $ n.risk   : num [1:66] 1896 1766 1636 1506 1377 ...
##  $ n.event  : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
##  $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
##  $ surv     : num [1:66] 1 1 0.999 0.998 0.997 ...
##  $ std.err  : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
##  $ cumhaz   : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
##  $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
##  $ strata   : Named int [1:2] 33 33
##   ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:66] 1 1 0.998 0.996 0.995 ...
##  $ upper    : num [1:66] 1 1 1 1 1 ...
##  $ call     : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
##  - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ bachevent, data = pydata1) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probabilitay"
    ) + 
  add_confidence_interval() +
  add_risktable()

#show the effects of race and edu


s3 <- survfit(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1)
str(s2)
## List of 17
##  $ n        : int [1:2] 1896 9318
##  $ time     : num [1:66] 17 18 19 20 21 22 23 24 25 26 ...
##  $ n.risk   : num [1:66] 1896 1766 1636 1506 1377 ...
##  $ n.event  : num [1:66] 0 0 1 2 1 1 3 5 4 3 ...
##  $ n.censor : num [1:66] 130 130 129 127 126 125 121 111 103 94 ...
##  $ surv     : num [1:66] 1 1 0.999 0.998 0.997 ...
##  $ std.err  : num [1:66] 0 0 0.000611 0.001121 0.001336 ...
##  $ cumhaz   : num [1:66] 0 0 0.000611 0.001939 0.002665 ...
##  $ std.chaz : num [1:66] 0 0 0.000611 0.00112 0.001335 ...
##  $ strata   : Named int [1:2] 33 33
##   ..- attr(*, "names")= chr [1:2] "race=NH Black Men" "race=NH White men"
##  $ type     : chr "right"
##  $ logse    : logi TRUE
##  $ conf.int : num 0.95
##  $ conf.type: chr "log"
##  $ lower    : num [1:66] 1 1 0.998 0.996 0.995 ...
##  $ upper    : num [1:66] 1 1 1 1 1 ...
##  $ call     : language survfit(formula = Surv(ageevent, evermarried) ~ race, data = pydata1)
##  - attr(*, "class")= chr "survfit"
survfit2(Surv(ageevent, evermarried) ~ race + bachevent, data = pydata1) %>% 
  ggsurvfit() +
  labs(
    x = "Age",
    y = "Cumulative survival probabilitay"
    ) + 
  add_confidence_interval() +
  add_risktable()

#Create a graph based on your PY file showing how the probability of marrying at each age differs by whether or not respondent has a degree for non-Hispanic white and black men.


#add weights
pydata1$wt<-pydata1$wgt2017_2019/mean(pydata1$wgt2017_2019)


raceeducprobs<- pydata1 %>%
    group_by(age_r, race, bachevent) %>%
    dplyr::summarize(p=weighted.mean(evermarried,wt,na.rm = TRUE),n=n())%>%
 na.omit(bacheventtv1)
## `summarise()` has grouped output by 'age_r', 'race'. You can override using the
## `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)

raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)

raceeducprobs$me <- 2*raceeducprobs$sep

ggplot(data =raceeducprobs, aes(x = age_r, y = p, ymin=p-me, ymax=p+me, fill=bachevent,color = bachevent, group = bachevent)) +
     geom_line() + ylim(0,.15) + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education", 
       subtitle="US-Born Men Ages 18 to 49", 
       caption="Source: 2017-2019 Men's files") +
       xlab(label="Age at Survey") +
  ylab(label="Ajusted First Marriage Rate") + facet_grid(~race) 

#In this visual you can see that the first marriage rate is higher for college educated white men at all ages, but the pattern is not as clear for black men.

#age group as a categorical method

pydata1 <- pydata1 %>% 
  mutate(agegrp = (case_when(ageevent %in% c(15:19) ~ '15-19',
                             ageevent %in% c(20:24) ~ '20-24',
                             ageevent %in% c(24:29) ~ '25-29',
                             ageevent %in% c(30:34) ~ '30-34',
                             ageevent %in% c(35:39) ~ '35-39',
                             ageevent %in% c(40:44) ~ '40-44',
                             ageevent %in% c(45:49) ~ '45-49'
                             )
                   )
         )

#filter for whites

pydatawhite<-pydata1%>%
  filter(race=='NH White men')

whitemodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatawhite)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodelcat)
## 
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial", 
##     data = pydatawhite, weights = wt)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -7.1858     0.7470  -9.620  < 2e-16 ***
## agegrp20-24    3.4511     0.7565   4.562 5.06e-06 ***
## agegrp25-29    4.2225     0.7610   5.549 2.87e-08 ***
## agegrp30-34    4.3379     0.7650   5.670 1.43e-08 ***
## agegrp35-39    3.9378     0.7818   5.037 4.73e-07 ***
## agegrp40-44    3.5354     0.8392   4.213 2.52e-05 ***
## agegrp45-49    2.9556     1.1457   2.580  0.00989 ** 
## bacheventtv1   0.8896     0.1395   6.375 1.83e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4132.8  on 9317  degrees of freedom
## Residual deviance: 3664.0  on 9310  degrees of freedom
## AIC: 3692.1
## 
## Number of Fisher Scoring iterations: 9
#filter for blacks
pydatablack<-pydata1%>%
  filter(race=='NH Black Men')

blackmodelcat <- glm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", weights=wt, data = pydatablack) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodelcat)
## 
## Call:
## glm(formula = evermarried ~ agegrp + bacheventtv1, family = "binomial", 
##     data = pydatablack, weights = wt)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.0828     0.7639  -6.654 2.85e-11 ***
## agegrp20-24     1.8144     0.8123   2.234  0.02551 *  
## agegrp25-29     2.1419     0.8622   2.484  0.01299 *  
## agegrp30-34     2.2409     0.8969   2.499  0.01247 *  
## agegrp35-39     2.5669     0.9456   2.715  0.00663 ** 
## agegrp40-44   -12.6246   702.4393  -0.018  0.98566    
## agegrp45-49   -12.6333  1152.3491  -0.011  0.99125    
## bacheventtv1    0.2755     0.3727   0.739  0.45982    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 430.26  on 1895  degrees of freedom
## Residual deviance: 404.96  on 1888  degrees of freedom
## AIC: 424.99
## 
## Number of Fisher Scoring iterations: 16

#The reference group for age is age 15-19.

#The white model shows significant effects for each age category as well as the time varying variable for if someone had a bachelors degree prior to the event. We can see that risk of marriage peaks for white men in age 30-34.

#The standard error for the age groups 40-44 and 45-49 in the black sample is large likely due to the fact that there is a small sample of people experiencing the event. Those two age groups and the time varying degree for whether someone had a bachelors degree prior to the event are not significant.

#There is a large difference in AIC for the white sample (4311) and the black sample (497.78). This information is not useful for consideration since the samples are different, though.

#calculate predicted probabilities 

#categorical white with white
whitedatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)
whitedatacat$phat <- predict(whitemodelcat, whitedatacat, type="response")

whitedatacat
##    agegrp bacheventtv1         phat
## 1   15-19            0 0.0007566839
## 2   20-24            0 0.0233227668
## 3   25-29            0 0.0491126244
## 4   30-34            0 0.0547890898
## 5   35-39            0 0.0373974861
## 6   40-44            0 0.0253230993
## 7   45-49            0 0.0143403158
## 8   15-19            1 0.0018398986
## 9   20-24            1 0.0549340448
## 10  25-29            1 0.1116819528
## 11  30-34            1 0.1236498212
## 12  35-39            1 0.0863978589
## 13  40-44            1 0.0594804784
## 14  45-49            1 0.0342032910
#categorical black with black
blackdatacat <- expand.grid(agegrp=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"), bacheventtv1=0:1)

blackdatacat$phat <- predict(blackmodelcat, blackdatacat, type="response")

blackdatacat
##    agegrp bacheventtv1         phat
## 1   15-19            0 6.164374e-03
## 2   20-24            0 3.667158e-02
## 3   25-29            0 5.017140e-02
## 4   30-34            0 5.510436e-02
## 5   35-39            0 7.475314e-02
## 6   40-44            0 2.040651e-08
## 7   45-49            0 2.023090e-08
## 8   15-19            1 8.103437e-03
## 9   20-24            1 4.774595e-02
## 10  25-29            1 6.504727e-02
## 11  30-34            1 7.133302e-02
## 12  35-39            1 9.617949e-02
## 13  40-44            1 2.687802e-08
## 14  45-49            1 2.664672e-08
#weighted predicted probabilities

newwhitedata <- pydatawhite %>%                           
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

newwhitedata$phat <- predict(whitemodelcat, newwhitedata, type="response")

as.data.frame(newwhitedata)
##   agegrp bacheventtv1    n         phat
## 1  15-19  0.001082858 2119 0.0007574126
## 2  20-24  0.369061960 3329 0.0320958308
## 3  25-29  0.843750547 2160 0.0986180322
## 4  30-34  0.911505298 1028 0.1153687347
## 5  35-39  0.947156427  426 0.0827586431
## 6  40-44  0.990065530  198 0.0589879926
## 7  45-49  1.000000000   58 0.0342032910
newblackdata <- pydatablack %>% 
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

newblackdata$phat <- predict(blackmodelcat, newblackdata, type="response", se.fit = FALSE)

as.data.frame(newblackdata)
##   agegrp bacheventtv1   n         phat
## 1  15-19    0.0000000 390 6.164374e-03
## 2  20-24    0.2864599 622 3.956332e-02
## 3  25-29    0.8076380 454 6.189812e-02
## 4  30-34    0.8772952 237 6.912616e-02
## 5  35-39    0.8662148 119 9.302333e-02
## 6  40-44    1.0000000  54 2.687802e-08
## 7  45-49    1.0000000  20 2.664672e-08
#swap

#BLACK DATA WITH WHITE MODEL

whitemodel_blackdata<-pydatablack%>%
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

whitemodel_blackdata$phat <- predict(whitemodelcat, newblackdata, type="response")

as.data.frame(whitemodel_blackdata)
##   agegrp bacheventtv1   n         phat
## 1  15-19    0.0000000 390 0.0007566839
## 2  20-24    0.2864599 622 0.0298898663
## 3  25-29    0.8076380 454 0.0957988774
## 4  30-34    0.8772952 237 0.1122989038
## 5  35-39    0.8662148 119 0.0774543393
## 6  40-44    1.0000000  54 0.0594804784
## 7  45-49    1.0000000  20 0.0342032910
#WHITE DATA WITH BLACK MODEL

blackmodel_whitedata <- pydatawhite %>%                           
  group_by(agegrp) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

blackmodel_whitedata$phat <- predict(blackmodelcat, newwhitedata, type="response")

as.data.frame(blackmodel_whitedata)
##   agegrp bacheventtv1    n         phat
## 1  15-19  0.001082858 2119 6.166201e-03
## 2  20-24  0.369061960 3329 4.043701e-02
## 3  25-29  0.843750547 2160 6.247826e-02
## 4  30-34  0.911505298 1028 6.973500e-02
## 5  35-39  0.947156427  426 9.492157e-02
## 6  40-44  0.990065530  198 2.680457e-08
## 7  45-49  1.000000000   58 2.664672e-08