The beginning steps are the same steps from assignment 2, in which I create an age at first marriage variable.

#set up age at first marriage variable
library(car)
## Loading required package: carData
mdata$status <- Recode(mdata$fmarital,
                      recodes= "1 = 1; 5 = 0; 2 = 2; 3=2; 4=2")

mdata$evermarried<-Recode(mdata$fmarital,"1:4 = 1; 5= 0")

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##here I use nformwife and numwife as a way to point out if the respondent has been married more than once

##I make 4 groups: those who are married and this in their first marriage, those who are married and this is a previous marriage than their first, those who are divorce/separated or widowed, and the never married

marriedsetnfw <- filter(mdata, mdata$status==1 & numwife==1)

marriedsetfw<-filter(mdata, status==1  & nformwife>0)

nmarriedset <- filter(mdata, status ==0)

marriedinpast<-filter(mdata, status==2 & nformwife>0)

marriedsetnfw$ageevent <-marriedsetnfw$hisagem

marriedsetfw$ageevent<-marriedsetfw$agemarrn

nmarriedset$ageevent <- nmarriedset$ager

marriedinpast$ageevent<-marriedinpast$agemarrn

#after making the age of event variable the same (Ageevent) for all the groups I bind them together

atrisk <- rbind(marriedsetnfw, nmarriedset, marriedsetfw, marriedinpast)

#filter out NAs

atrisk<- atrisk%>%
  filter(ageevent!="NA")%>%
  filter(ageevent!=98)

Next, I filter my risk set for NH white and NH black men.

#filter for non-Hispanic white and non-Hispanic black men only

library(dplyr)

atrisk2<-atrisk%>%
  filter(hispanic==2)%>%
  filter(race!=3)


#show table of age of event and whether ever married 
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
tabyl(atrisk2, ageevent, evermarried)
##  ageevent   0   1
##        15 110   0
##        16 102   2
##        17 128   7
##        18 127  27
##        19 119  42
##        20  76  59
##        21  85 100
##        22  78  87
##        23  80  94
##        24  92 103
##        25  77 114
##        26  75  88
##        27  73  94
##        28  94  72
##        29  61  81
##        30  72  60
##        31  65  46
##        32  64  40
##        33  69  43
##        34  41  33
##        35  43  22
##        36  42  22
##        37  37  14
##        38  36  12
##        39  33   5
##        40  32  11
##        41  27   6
##        42  33   4
##        43  21   2
##        44  20   1
##        45  22   4
##        46  22   1
##        47  28   0
##        48  21   0
##        49  27   0
#note if married ever married=1 and not married=0

#show this table by race 

tabyl(atrisk2, ageevent, evermarried, race)
## $`1`
##  ageevent  0  1
##        15 32  0
##        16 32  1
##        17 52  2
##        18 45  6
##        19 38 11
##        20 27 15
##        21 27 15
##        22 25  7
##        23 27 15
##        24 24 26
##        25 30 23
##        26 30  7
##        27 28 14
##        28 39 11
##        29 25 17
##        30 25 17
##        31 19  7
##        32 10  5
##        33 19  8
##        34 19  6
##        35 11  4
##        36 12  4
##        37 13  6
##        38  8  2
##        39 14  1
##        40 12  2
##        41  9  1
##        42 10  2
##        43  6  0
##        44  6  0
##        45  8  1
##        46  7  0
##        47 10  0
##        48  8  0
##        49 11  0
## 
## $`2`
##  ageevent  0  1
##        15 78  0
##        16 70  1
##        17 76  5
##        18 82 21
##        19 81 31
##        20 49 44
##        21 58 85
##        22 53 80
##        23 53 79
##        24 68 77
##        25 47 91
##        26 45 81
##        27 45 80
##        28 55 61
##        29 36 64
##        30 47 43
##        31 46 39
##        32 54 35
##        33 50 35
##        34 22 27
##        35 32 18
##        36 30 18
##        37 24  8
##        38 28 10
##        39 19  4
##        40 20  9
##        41 18  5
##        42 23  2
##        43 15  2
##        44 14  1
##        45 14  3
##        46 15  1
##        47 18  0
##        48 13  0
##        49 16  0
#note 1= black race and 2=white race

Here I create a variable for age at interview, then I use this to create age at birth. Next I calculate age at bachelors degree by comparing age to year earned bachelors.

#convert date of interview from century months to year

atrisk2$yrint <-((atrisk2$cmintvw-1)/12) + 1900

#calculate birth year 

atrisk2$yrbirth<-atrisk2$intvwyear-atrisk2$age_r

#select columns needed for analysis

atrisk3<-select(atrisk2, ageevent, evermarried, earnba_y, race, caseid, yrint, yrbirth, wgt2017_2019, havedeg, age_r, secu, sest) 

#calculate age at bachelors

atrisk3$agebach<-atrisk3$earnba_y-atrisk3$yrbirth%>%
  as.integer()

Next I make my risk set into a person-year file for discrete time analysis.

#create person-year file

library(survival)


pydata<-survSplit(atrisk3, cut=c(17:50),end="ageevent", event="evermarried")



#create a time varying bachelors variable
pydata$bacheventtv1<-ifelse(pydata$ageevent>pydata$agebach,1,0)

#replace NA's (individuals who did not get a bachelor's degree) with 0

pydata$bacheventtv1 <- pydata$bacheventtv1 %>% replace(is.na(.), 0)
 

head(pydata[, c("ageevent", "evermarried", "caseid", "agebach", "bacheventtv1", "race")], n=60)
##    ageevent evermarried caseid agebach bacheventtv1 race
## 1        17           0  80734      21            0    2
## 2        18           0  80734      21            0    2
## 3        19           0  80734      21            0    2
## 4        20           0  80734      21            0    2
## 5        21           0  80734      21            0    2
## 6        22           0  80734      21            1    2
## 7        23           0  80734      21            1    2
## 8        24           0  80734      21            1    2
## 9        25           0  80734      21            1    2
## 10       26           0  80734      21            1    2
## 11       27           1  80734      21            1    2
## 12       17           0  80739      23            0    2
## 13       18           0  80739      23            0    2
## 14       19           0  80739      23            0    2
## 15       20           0  80739      23            0    2
## 16       21           0  80739      23            0    2
## 17       22           0  80739      23            0    2
## 18       23           0  80739      23            0    2
## 19       24           0  80739      23            1    2
## 20       25           0  80739      23            1    2
## 21       26           0  80739      23            1    2
## 22       27           0  80739      23            1    2
## 23       28           0  80739      23            1    2
## 24       29           0  80739      23            1    2
## 25       30           0  80739      23            1    2
## 26       31           1  80739      23            1    2
## 27       17           0  80745      NA            0    2
## 28       18           0  80745      NA            0    2
## 29       19           0  80745      NA            0    2
## 30       20           0  80745      NA            0    2
## 31       21           0  80745      NA            0    2
## 32       22           0  80745      NA            0    2
## 33       23           1  80745      NA            0    2
## 34       17           0  80770      NA            0    2
## 35       18           0  80770      NA            0    2
## 36       19           0  80770      NA            0    2
## 37       20           0  80770      NA            0    2
## 38       21           0  80770      NA            0    2
## 39       22           0  80770      NA            0    2
## 40       23           0  80770      NA            0    2
## 41       24           0  80770      NA            0    2
## 42       25           0  80770      NA            0    2
## 43       26           0  80770      NA            0    2
## 44       27           0  80770      NA            0    2
## 45       28           0  80770      NA            0    2
## 46       29           0  80770      NA            0    2
## 47       30           0  80770      NA            0    2
## 48       31           0  80770      NA            0    2
## 49       32           0  80770      NA            0    2
## 50       33           0  80770      NA            0    2
## 51       34           0  80770      NA            0    2
## 52       35           0  80770      NA            0    2
## 53       36           1  80770      NA            0    2
## 54       17           0  80775      NA            0    2
## 55       18           0  80775      NA            0    2
## 56       19           0  80775      NA            0    2
## 57       20           0  80775      NA            0    2
## 58       21           0  80775      NA            0    2
## 59       22           0  80775      NA            0    2
## 60       23           0  80775      NA            0    2
#recodes for simplicity
  

pydata$bachevent <- car::Recode(pydata$bacheventtv, recodes="1='BA/BS Degree'; 0='No College Degree'")


pydata$race <- car::Recode(pydata$race, recodes="1='NH Black Men'; 2='NH White men'")
#employ survey design

library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
design<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=pydata,
                nest=T)

Now I add weights to the data and create a visual for comparison.

#Create a graph based on your PY file showing how the probability of marrying at each age differs by whether or not respondent has a degree for non-Hispanic white and black men.

library(ggplot2)

raceprobs <- svyby(~evermarried, by = ~ageevent + race+ bachevent, design = design, 
                   FUN = svymean, na.rm = TRUE)

raceprobs$lim <- 2*raceprobs$se
  
ggplot(data =raceprobs, aes(x = ageevent, y = evermarried, ymin=evermarried-lim, ymax=evermarried+lim, fill=bachevent ,color = bachevent, group = bachevent)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Race", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") + facet_grid(~race)

#visual for first marriage rates by race and age

raceprobs <- svyby(~evermarried, by = ~ageevent + race, design = design, 
                   FUN = svymean, na.rm = TRUE)

raceprobs$lim <- 2*raceprobs$se

ggplot(data =raceprobs, aes(x = ageevent, y = evermarried, ymin=evermarried-lim, ymax=evermarried+lim, fill=race,color = race, group = race)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Race", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Adjusted First Marriage Rate") 

#add weights
pydata$wt<-pydata$wgt2017_2019/mean(pydata$wgt2017_2019)

#age group as a categorical method

pydata <- pydata %>% 
  mutate(agegrp = (case_when(ageevent %in% c(15:19) ~ '15-19',
                             ageevent %in% c(20:24) ~ '20-24',
                             ageevent %in% c(24:29) ~ '25-29',
                             ageevent %in% c(30:34) ~ '30-34',
                             ageevent %in% c(35:39) ~ '35-39',
                             ageevent %in% c(40:44) ~ '40-44',
                             ageevent %in% c(45:49) ~ '45-49'
                             ) )
         )
library(survey)

#filter for whites

pydatawhite<-pydata%>%
  filter(race=='NH White men')

#employ survey design

library(survey)

designwhite<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=pydatawhite,
                nest=T)

whitemodelcat <- svyglm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", design=designwhite)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodelcat)
## 
## Call:
## svyglm(formula = evermarried ~ agegrp + bacheventtv1, design = designwhite, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydatawhite, nest = T)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.58366    0.18380 -24.938  < 2e-16 ***
## agegrp20-24   1.49366    0.19297   7.740 6.24e-10 ***
## agegrp25-29   1.97069    0.20205   9.754 7.12e-13 ***
## agegrp30-34   1.89582    0.20612   9.198 4.43e-12 ***
## agegrp35-39   1.57547    0.23656   6.660 2.68e-08 ***
## agegrp40-44   1.26562    0.38232   3.310   0.0018 ** 
## agegrp45-49   0.25017    0.82707   0.302   0.7636    
## bacheventtv1  0.53492    0.08995   5.947 3.24e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9999374)
## 
## Number of Fisher Scoring iterations: 7
#filter for blacks
pydatablack<-pydata%>%
  filter(race=='NH Black Men')

#employ survey design

library(survey)

designwblack<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=pydatablack,
                nest=T)


blackmodelcat <- svyglm(evermarried ~ agegrp+ bacheventtv1, family = "binomial", design=designwblack) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodelcat)
## 
## Call:
## svyglm(formula = evermarried ~ agegrp + bacheventtv1, design = designwblack, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydatablack, nest = T)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -4.9226     0.3093 -15.917  < 2e-16 ***
## agegrp20-24    1.4852     0.3572   4.158 0.000142 ***
## agegrp25-29    1.7220     0.3554   4.845 1.54e-05 ***
## agegrp30-34    1.9484     0.3929   4.959 1.05e-05 ***
## agegrp35-39    1.6788     0.5012   3.349 0.001647 ** 
## agegrp40-44    0.7085     0.5931   1.195 0.238540    
## agegrp45-49   -0.6999     1.1003  -0.636 0.527927    
## bacheventtv1   0.6258     0.2877   2.175 0.034932 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00177)
## 
## Number of Fisher Scoring iterations: 7
whitemodel <- svyglm(evermarried ~ factor(ageevent) + bacheventtv1, family = "binomial", design=designwhite)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodel)
## 
## Call:
## svyglm(formula = evermarried ~ factor(ageevent) + bacheventtv1, 
##     design = designwhite, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydatawhite, nest = T)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -15.64300    0.19395 -80.656  < 2e-16 ***
## factor(ageevent)16  10.92166    1.02500  10.655 1.88e-09 ***
## factor(ageevent)17   8.63370    0.56531  15.272 4.01e-12 ***
## factor(ageevent)18  11.42800    0.35387  32.294  < 2e-16 ***
## factor(ageevent)19  11.50596    0.30248  38.038  < 2e-16 ***
## factor(ageevent)20  12.02367    0.26334  45.659  < 2e-16 ***
## factor(ageevent)21  12.62631    0.26189  48.212  < 2e-16 ***
## factor(ageevent)22  12.54735    0.22750  55.153  < 2e-16 ***
## factor(ageevent)23  12.79541    0.23764  53.844  < 2e-16 ***
## factor(ageevent)24  12.71617    0.26189  48.556  < 2e-16 ***
## factor(ageevent)25  12.81299    0.24388  52.538  < 2e-16 ***
## factor(ageevent)26  13.08326    0.27405  47.740  < 2e-16 ***
## factor(ageevent)27  13.28442    0.22515  59.002  < 2e-16 ***
## factor(ageevent)28  13.03679    0.23938  54.461  < 2e-16 ***
## factor(ageevent)29  13.04545    0.23305  55.977  < 2e-16 ***
## factor(ageevent)30  12.97293    0.24721  52.477  < 2e-16 ***
## factor(ageevent)31  13.00493    0.30950  42.020  < 2e-16 ***
## factor(ageevent)32  13.02529    0.30928  42.115  < 2e-16 ***
## factor(ageevent)33  12.93682    0.23564  54.900  < 2e-16 ***
## factor(ageevent)34  12.91439    0.30934  41.748  < 2e-16 ***
## factor(ageevent)35  12.53168    0.40267  31.121  < 2e-16 ***
## factor(ageevent)36  13.17635    0.37589  35.054  < 2e-16 ***
## factor(ageevent)37  12.31615    0.44722  27.539  < 2e-16 ***
## factor(ageevent)38  12.67968    0.45761  27.709  < 2e-16 ***
## factor(ageevent)39  11.86523    0.62037  19.126 7.16e-14 ***
## factor(ageevent)40  13.10090    0.52609  24.902 5.73e-16 ***
## factor(ageevent)41  12.33916    0.52207  23.635 1.50e-15 ***
## factor(ageevent)42  11.27280    0.78906  14.286 1.29e-11 ***
## factor(ageevent)43  11.83152    0.82894  14.273 1.31e-11 ***
## factor(ageevent)44  11.09421    1.01228  10.960 1.18e-09 ***
## factor(ageevent)45  12.22257    0.78635  15.543 2.94e-12 ***
## factor(ageevent)46  10.77307    1.00789  10.689 1.78e-09 ***
## factor(ageevent)47  -0.04757    0.23116  -0.206    0.839    
## factor(ageevent)48  -0.02009    0.25124  -0.080    0.937    
## factor(ageevent)49   0.10060    0.24905   0.404    0.691    
## bacheventtv1         0.48406    0.09439   5.128 5.98e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9944435)
## 
## Number of Fisher Scoring iterations: 14
blackmodel <- svyglm(evermarried ~ factor(ageevent)+ bacheventtv1, family = "binomial", design=designwblack) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodel)
## 
## Call:
## svyglm(formula = evermarried ~ factor(ageevent) + bacheventtv1, 
##     design = designwblack, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydatablack, nest = T)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -17.58689    0.32186 -54.641  < 2e-16 ***
## factor(ageevent)16  14.18362    1.08741  13.044 2.78e-10 ***
## factor(ageevent)17  11.02850    0.79506  13.871 1.06e-10 ***
## factor(ageevent)18  12.35023    0.54553  22.639 3.91e-14 ***
## factor(ageevent)19  13.41308    0.52304  25.644 4.98e-15 ***
## factor(ageevent)20  14.02314    0.57234  24.502 1.06e-14 ***
## factor(ageevent)21  13.56901    0.40557  33.457  < 2e-16 ***
## factor(ageevent)22  13.87240    0.62272  22.277 5.10e-14 ***
## factor(ageevent)23  13.89547    0.46695  29.758 4.18e-16 ***
## factor(ageevent)24  14.95257    0.46269  32.317  < 2e-16 ***
## factor(ageevent)25  14.49797    0.36159  40.095  < 2e-16 ***
## factor(ageevent)26  13.89443    0.63818  21.772 7.44e-14 ***
## factor(ageevent)27  14.86185    0.45085  32.964  < 2e-16 ***
## factor(ageevent)28  13.57922    0.52624  25.804 4.49e-15 ***
## factor(ageevent)29  14.69234    0.42682  34.423  < 2e-16 ***
## factor(ageevent)30  15.19558    0.44793  33.924  < 2e-16 ***
## factor(ageevent)31  13.84265    0.74841  18.496 1.07e-12 ***
## factor(ageevent)32  14.49105    0.60298  24.032 1.46e-14 ***
## factor(ageevent)33  14.25222    0.51736  27.548 1.51e-15 ***
## factor(ageevent)34  14.73877    0.52875  27.875 1.24e-15 ***
## factor(ageevent)35  13.93540    0.62836  22.177 5.49e-14 ***
## factor(ageevent)36  14.67667    0.79554  18.449 1.12e-12 ***
## factor(ageevent)37  15.03104    0.68001  22.104 5.80e-14 ***
## factor(ageevent)38  13.87241    0.73702  18.822 8.05e-13 ***
## factor(ageevent)39  13.03937    1.05620  12.345 6.51e-10 ***
## factor(ageevent)40  13.43071    0.77989  17.221 3.40e-12 ***
## factor(ageevent)41  13.71458    1.09438  12.532 5.17e-10 ***
## factor(ageevent)42  14.17699    0.92140  15.386 2.07e-11 ***
## factor(ageevent)43  -0.13174    0.38807  -0.339    0.738    
## factor(ageevent)44  -0.13211    0.39274  -0.336    0.741    
## factor(ageevent)45  13.07587    1.00834  12.968 3.05e-10 ***
## factor(ageevent)46  -0.14548    0.42649  -0.341    0.737    
## factor(ageevent)47  -0.13258    0.44133  -0.300    0.768    
## factor(ageevent)48  -0.04527    0.52971  -0.085    0.933    
## factor(ageevent)49  -0.06686    0.50545  -0.132    0.896    
## bacheventtv1         0.55673    0.30222   1.842    0.083 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9728703)
## 
## Number of Fisher Scoring iterations: 16

Here I calculate the predicted probabilities using weighted means. I calculate the phats for NH white and black men using their weighted means, then I swap the means using the white means for the black model and the black means for the white model.

#weighted predicted probabilities

newwhitedata <- pydatawhite %>%                           
  group_by(ageevent) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

newwhitedata$phat <- predict(whitemodel, newwhitedata, type="response")

as.data.frame(newwhitedata)
##    ageevent bacheventtv1    n         phat
## 1        15  0.000000000   78 1.608163e-07
## 2        16  0.000000000   71 8.824615e-03
## 3        17  0.000000000 2325 9.026185e-04
## 4        18  0.000000000 2244 1.455723e-02
## 5        19  0.000000000 2141 1.571901e-02
## 6        20  0.002463456 2029 2.613138e-02
## 7        21  0.004149484 1936 4.676671e-02
## 8        22  0.048133357 1793 4.426214e-02
## 9        23  0.186357539 1660 5.967092e-02
## 10       24  0.275414376 1528 5.767535e-02
## 11       25  0.324431202 1383 6.458727e-02
## 12       26  0.345045659 1245 8.372925e-02
## 13       27  0.352063404 1119 1.008187e-01
## 14       28  0.342936438  994 8.015738e-02
## 15       29  0.343754279  878 8.082734e-02
## 16       30  0.358808404  778 7.611190e-02
## 17       31  0.345299320  688 7.792150e-02
## 18       32  0.333120468  603 7.896741e-02
## 19       33  0.321120336  514 7.237697e-02
## 20       34  0.316520014  429 7.073863e-02
## 21       35  0.313681261  380 4.929024e-02
## 22       36  0.335014456  330 9.075221e-02
## 23       37  0.287579928  282 3.963316e-02
## 24       38  0.274675930  250 5.570514e-02
## 25       39  0.256396389  212 2.524252e-02
## 26       40  0.280950682  189 8.270807e-02
## 27       41  0.267958041  160 4.015072e-02
## 28       42  0.264705208  137 1.417403e-02
## 29       43  0.280608899  112 2.470694e-02
## 30       44  0.294314448   95 1.205291e-02
## 31       45  0.287390482   80 3.621777e-02
## 32       46  0.274051554   63 8.686374e-03
## 33       47  0.206228594   47 1.694428e-07
## 34       48  0.102148101   29 1.656079e-07
## 35       49  0.034895322   16 1.808647e-07
newblackdata <- pydatablack %>% 
  group_by(ageevent) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

newblackdata$phat <- predict(blackmodel, newblackdata, type="response", se.fit = FALSE)

as.data.frame(newblackdata)
##    ageevent bacheventtv1   n         phat
## 1        15   0.00000000  32 2.302022e-08
## 2        16   0.00000000  33 3.219336e-02
## 3        17   0.00000000 889 1.416152e-03
## 4        18   0.00000000 835 5.289852e-03
## 5        19   0.00000000 784 1.516014e-02
## 6        20   0.00000000 735 2.755170e-02
## 7        21   0.00255432 693 1.769769e-02
## 8        22   0.02295590 651 2.408684e-02
## 9        23   0.06730008 619 2.523528e-02
## 10       24   0.11854169 577 7.120493e-02
## 11       25   0.14027244 527 4.693925e-02
## 12       26   0.15837934 474 2.648625e-02
## 13       27   0.17693694 437 6.745017e-02
## 14       28   0.19117968 395 1.981633e-02
## 15       29   0.19137694 345 5.797592e-02
## 16       30   0.17733780 303 9.173941e-02
## 17       31   0.18778020 261 2.558834e-02
## 18       32   0.18419645 235 4.772985e-02
## 19       33   0.20278867 220 3.835443e-02
## 20       34   0.16934375 193 5.987017e-02
## 21       35   0.16013393 168 2.758949e-02
## 22       36   0.15682518 153 5.609860e-02
## 23       37   0.12472773 137 7.681583e-02
## 24       38   0.13839131 118 2.564471e-02
## 25       39   0.14402949 108 1.134753e-02
## 26       40   0.12585182  93 1.652659e-02
## 27       41   0.16001604  79 2.224305e-02
## 28       42   0.16918474  69 3.503622e-02
## 29       43   0.16713494  57 2.214653e-08
## 30       44   0.17198014  51 2.219820e-08
## 31       45   0.17956506  45 1.199675e-02
## 32       46   0.14694798  36 2.160031e-08
## 33       47   0.12601381  29 2.162713e-08
## 34       48   0.08247742  19 2.303516e-08
## 35       49   0.15668918  11 2.349410e-08
#swap

#BLACK DATA WITH WHITE MODEL

whitemodel_blackdata<-pydatablack%>%
  group_by(ageevent) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

whitemodel_blackdata$phat <- predict(whitemodel, newblackdata, type="response")

as.data.frame(whitemodel_blackdata)
##    ageevent bacheventtv1   n         phat
## 1        15   0.00000000  32 1.608163e-07
## 2        16   0.00000000  33 8.824615e-03
## 3        17   0.00000000 889 9.026185e-04
## 4        18   0.00000000 835 1.455723e-02
## 5        19   0.00000000 784 1.571901e-02
## 6        20   0.00000000 735 2.610105e-02
## 7        21   0.00255432 693 4.673230e-02
## 8        22   0.02295590 651 4.374943e-02
## 9        23   0.06730008 619 5.651812e-02
## 10       24   0.11854169 577 5.368431e-02
## 11       25   0.14027244 527 5.940614e-02
## 12       26   0.15837934 474 7.705283e-02
## 13       27   0.17693694 437 9.338965e-02
## 14       28   0.19117968 395 7.490543e-02
## 15       29   0.19137694 345 7.551411e-02
## 16       30   0.17733780 303 7.016037e-02
## 17       31   0.18778020 261 7.261638e-02
## 18       32   0.18419645 235 7.388094e-02
## 19       33   0.20278867 220 6.862426e-02
## 20       34   0.16934375 193 6.619640e-02
## 21       35   0.16013393 168 4.592168e-02
## 22       36   0.15682518 153 8.388165e-02
## 23       37   0.12472773 137 3.673923e-02
## 24       38   0.13839131 118 5.233500e-02
## 25       39   0.14402949 108 2.393819e-02
## 26       40   0.12585182  93 7.718773e-02
## 27       41   0.16001604  79 3.818475e-02
## 28       42   0.16918474  69 1.354225e-02
## 29       43   0.16713494  57 2.341736e-02
## 30       44   0.17198014  51 1.136778e-02
## 31       45   0.17956506  45 3.443935e-02
## 32       46   0.14694798  36 8.172285e-03
## 33       47   0.12601381  29 1.629897e-07
## 34       48   0.08247742  19 1.640385e-07
## 35       49   0.15668918  11 1.918482e-07
#WHITE DATA WITH BLACK MODEL

blackmodel_whitedata <- pydatawhite %>%                           
  group_by(ageevent) %>% 
  dplyr::summarize(bacheventtv1=weighted.mean(bacheventtv1,wgt2017_2019,na.rm = TRUE),n=n())

blackmodel_whitedata$phat <- predict(blackmodel, newwhitedata, type="response")

as.data.frame(blackmodel_whitedata)
##    ageevent bacheventtv1    n         phat
## 1        15  0.000000000   78 2.302022e-08
## 2        16  0.000000000   71 3.219336e-02
## 3        17  0.000000000 2325 1.416152e-03
## 4        18  0.000000000 2244 5.289852e-03
## 5        19  0.000000000 2141 1.516014e-02
## 6        20  0.002463456 2029 2.758847e-02
## 7        21  0.004149484 1936 1.771314e-02
## 8        22  0.048133357 1793 2.441854e-02
## 9        23  0.186357539 1660 2.691807e-02
## 10       24  0.275414376 1528 7.720159e-02
## 11       25  0.324431202 1383 5.174487e-02
## 12       26  0.345045659 1245 2.930188e-02
## 13       27  0.352063404 1119 7.384757e-02
## 14       28  0.342936438  994 2.152571e-02
## 15       29  0.343754279  878 6.278653e-02
## 16       30  0.358808404  778 1.005119e-01
## 17       31  0.345299320  688 2.786829e-02
## 18       32  0.333120468  603 5.164274e-02
## 19       33  0.321120336  514 4.085953e-02
## 20       34  0.316520014  429 6.465182e-02
## 21       35  0.313681261  380 2.997788e-02
## 22       36  0.335014456  330 6.158881e-02
## 23       37  0.287579928  282 8.349707e-02
## 24       38  0.274675930  250 2.761036e-02
## 25       39  0.256396389  212 1.207124e-02
## 26       40  0.280950682  189 1.799023e-02
## 27       41  0.267958041  160 2.358822e-02
## 28       42  0.264705208  137 3.687927e-02
## 29       43  0.280608899  112 2.359075e-08
## 30       44  0.294314448   95 2.376273e-08
## 31       45  0.287390482   80 1.272952e-02
## 32       46  0.274051554   63 2.318417e-08
## 33       47  0.206228594   47 2.261484e-08
## 34       48  0.102148101   29 2.328881e-08
## 35       49  0.034895322   16 2.195386e-08

I calculate the probability of marrying before 50 (the sum of dx, highlighted in yellow) and carry out a decomposition in excel to show how much of the difference in first marriage rate between NH white men and NH black men is due to differences in whether or not they have a bachelor’s degree.