Load Libraries

# theme for nice plotting
theme_nice <- theme_classic()+
                theme(
                  axis.line.y.left = element_line(colour = "black"),
                  axis.line.y.right = element_line(colour = "black"),
                  axis.line.x.bottom = element_line(colour = "black"),
                  axis.line.x.top = element_line(colour = "black"),
                  axis.text.y = element_text(colour = "black", size = 12),
                  axis.text.x = element_text(color = "black", size = 12),
                  axis.ticks = element_line(color = "black")) +
                theme(
                  axis.ticks.length = unit(-0.25, "cm"), 
                  axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")), 
                  axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2426161 129.6    4148770 221.6         NA  4148770 221.6
## Vcells 4097332  31.3    8388608  64.0      16384  6229455  47.6
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
data<- read_csv("2017_2019_Male_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 5206 Columns: 3009
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1939): caseid, rscrninf, rscrage, rscrhisp, rscrrace, age_a, age_r, age...
## lgl (1070): sedbclc3, sedbclc4, sedwhlc3, sedwhlc4, sedconlc4, p2currwife, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Sub-select just variable you will be needing

# Create a new data frame 'data' with selected columns
data <- data[, c("caseid","age_r", "fmarital", "agemarr", "agemarrn4", "agemarrn2", "agemarrn3", "hisagem", "evrmarry", "agemarrn", "hisprace2", "fmarno", "nformwife", "hieduc", "wgt2017_2019", "earnba_y", "intvwyear")]

Determine ever married status

data <- data %>%
  mutate(evmarried = ifelse(fmarno == 1 & fmarital == 1, 1, 
                                     ifelse(fmarno == 0 & fmarital == 5, 0, 
                                            ifelse(fmarno %in% c(1:5), 1, NA))))
data %>%
  group_by(evmarried) %>%
  dplyr::summarize(n=n()) %>%
  dplyr::mutate(pct = n / sum(n))
## # A tibble: 2 × 3
##   evmarried     n   pct
##       <dbl> <int> <dbl>
## 1         0  3245 0.623
## 2         1  1961 0.377

Age at event

#ever married age event
data$ageevent<- pmin(data$hisagem,data$agemarrn, na.rm=T)



#never married age 
data$ageevent <- ifelse(data$fmarital == 5, data$age_r, data$ageevent)

# filter for NA values 
data <- data %>% 
  filter(ageevent !=98 & !is.na(ageevent))
tabyl(data$ageevent)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## ℹ The deprecated feature was likely used in the janitor package.
##   Please report the issue at <]8;;https://github.com/sfirke/janitor/issueshttps://github.com/sfirke/janitor/issues]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##  data$ageevent   n     percent
##             15 186 0.036399217
##             16 192 0.037573386
##             17 234 0.045792564
##             18 271 0.053033268
##             19 276 0.054011742
##             20 199 0.038943249
##             21 264 0.051663405
##             22 248 0.048532290
##             23 251 0.049119374
##             24 267 0.052250489
##             25 284 0.055577299
##             26 240 0.046966732
##             27 234 0.045792564
##             28 239 0.046771037
##             29 206 0.040313112
##             30 197 0.038551859
##             31 165 0.032289628
##             32 162 0.031702544
##             33 146 0.028571429
##             34 108 0.021135029
##             35  92 0.018003914
##             36  82 0.016046967
##             37  73 0.014285714
##             38  64 0.012524462
##             39  57 0.011154599
##             40  58 0.011350294
##             41  44 0.008610568
##             42  50 0.009784736
##             43  32 0.006262231
##             44  31 0.006066536
##             45  30 0.005870841
##             46  35 0.006849315
##             47  33 0.006457926
##             48  29 0.005675147
##             49  31 0.006066536

Filter data for NH White and NH Black

data <- data %>% 
  filter(hisprace2 == 2 | hisprace2 == 3)

Determine the age they got their bachelor’s degree

data <- data %>%
  mutate(
    bach_degree = ifelse(hieduc %in% c(5:11), 0,
                  ifelse(hieduc %in% c(12:15), 1, NA)),
    birth_year = intvwyear - age_r,
    age_bach_degree = earnba_y - birth_year)


data %>%
  group_by(evmarried, ageevent, age_bach_degree) %>%
  dplyr::summarize(n=n()) %>%
  mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent'. You can override
## using the `.groups` argument.
## # A tibble: 379 × 5
## # Groups:   evmarried, ageevent [66]
##    evmarried ageevent age_bach_degree     n    pct
##        <dbl>    <dbl>           <dbl> <int>  <dbl>
##  1         0       15              NA   106 1     
##  2         0       16              NA    97 1     
##  3         0       17              NA   122 1     
##  4         0       18              NA   115 1     
##  5         0       19              NA   115 1     
##  6         0       20              NA    70 1     
##  7         0       21              NA    81 1     
##  8         0       22              21     4 0.0519
##  9         0       22              22     3 0.0390
## 10         0       22              NA    70 0.909 
## # ℹ 369 more rows

Marriage and Having a BA/BS

likely events

  • When a person has never married, but does not have a BA/BS, then 1.
  • #When a person has never married, but has a BA/BS, then 2.
  • When a person has married, but does not have a BA/BS, then 3.
  • When a person has married, has a BA/BS, but got married prior to their BA/BS, then 4.
  • When a person has married, has a BA/BS, but got their BA/BS prior to marrying, then 5.
  • When a person has married, has a BA/BS, but did both at the same age, then 6.
data <- data %>%
  mutate(marriage_babs = case_when((evmarried == 0) & (is.na(age_bach_degree)) ~ 1,
                             (evmarried == 0) & (!is.na(age_bach_degree)) ~ 2,
                             (evmarried == 1) & (is.na(age_bach_degree)) ~ 3,
                             (evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent < age_bach_degree) ~ 4,
                             (evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent > age_bach_degree) ~ 5,
                             (evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent == age_bach_degree) ~ 6))

data %>%
  group_by(evmarried, ageevent, age_bach_degree, marriage_babs) %>%
  dplyr::summarize(n=n()) %>%
  mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent', 'age_bach_degree'.
## You can override using the `.groups` argument.
## # A tibble: 379 × 6
## # Groups:   evmarried, ageevent, age_bach_degree [379]
##    evmarried ageevent age_bach_degree marriage_babs     n   pct
##        <dbl>    <dbl>           <dbl>         <dbl> <int> <dbl>
##  1         0       15              NA             1   106     1
##  2         0       16              NA             1    97     1
##  3         0       17              NA             1   122     1
##  4         0       18              NA             1   115     1
##  5         0       19              NA             1   115     1
##  6         0       20              NA             1    70     1
##  7         0       21              NA             1    81     1
##  8         0       22              21             2     4     1
##  9         0       22              22             2     3     1
## 10         0       22              NA             1    70     1
## # ℹ 369 more rows
data %>% 
  group_by(marriage_babs) %>%
  dplyr::summarize(n=n()) %>%
  mutate(pct = n / sum(n))
## # A tibble: 6 × 3
##   marriage_babs     n    pct
##           <dbl> <int>  <dbl>
## 1             1  1669 0.512 
## 2             2   343 0.105 
## 3             3   783 0.240 
## 4             4    79 0.0243
## 5             5   346 0.106 
## 6             6    37 0.0114

conditional probabilities

library(ggsurvfit)
## 
## Attaching package: 'ggsurvfit'
## The following object is masked from 'package:psych':
## 
##     %+%
library(relsurv)
## Loading required package: survival
## Loading required package: date
## 
## Attaching package: 'relsurv'
## The following object is masked from 'package:lubridate':
## 
##     years
pydata<-survSplit(data, cut=c(15:49),end="ageevent",event="evmarried")


pydata$babs_event <- case_when(pydata$marriage_babs == 1 ~ 0,
                               pydata$marriage_babs == 3 ~ 0,
                               pydata$marriage_babs == 4 ~ 0,
                               pydata$marriage_babs == 2 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
                               pydata$marriage_babs == 2 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
                               pydata$marriage_babs == 5 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
                               pydata$marriage_babs == 5 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
                               pydata$marriage_babs == 6 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
                               pydata$marriage_babs == 6 & (pydata$ageevent < pydata$age_bach_degree) ~ 0)
head(pydata[, c("caseid", "ageevent", "age_bach_degree", "marriage_babs")], n=50)
##    caseid ageevent age_bach_degree marriage_babs
## 1   80717       15              22             2
## 2   80717       16              22             2
## 3   80717       17              22             2
## 4   80717       18              22             2
## 5   80717       19              22             2
## 6   80717       20              22             2
## 7   80717       21              22             2
## 8   80717       22              22             2
## 9   80717       23              22             2
## 10  80717       24              22             2
## 11  80717       25              22             2
## 12  80717       26              22             2
## 13  80717       27              22             2
## 14  80717       28              22             2
## 15  80717       29              22             2
## 16  80717       30              22             2
## 17  80717       31              22             2
## 18  80721       15              NA             1
## 19  80721       16              NA             1
## 20  80721       17              NA             1
## 21  80724       15              NA             3
## 22  80724       16              NA             3
## 23  80724       17              NA             3
## 24  80724       18              NA             3
## 25  80732       15              22             2
## 26  80732       16              22             2
## 27  80732       17              22             2
## 28  80732       18              22             2
## 29  80732       19              22             2
## 30  80732       20              22             2
## 31  80732       21              22             2
## 32  80732       22              22             2
## 33  80732       23              22             2
## 34  80732       24              22             2
## 35  80732       25              22             2
## 36  80732       26              22             2
## 37  80732       27              22             2
## 38  80732       28              22             2
## 39  80732       29              22             2
## 40  80732       30              22             2
## 41  80732       31              22             2
## 42  80732       32              22             2
## 43  80732       33              22             2
## 44  80732       34              22             2
## 45  80732       35              22             2
## 46  80732       36              22             2
## 47  80732       37              22             2
## 48  80732       38              22             2
## 49  80732       39              22             2
## 50  80734       15              21             5

Age and Race

raceprobs<- pydata %>%
    dplyr::group_by(ageevent, hisprace2) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())

raceprobs$num <- raceprobs$p*(1-raceprobs$p)

raceprobs$sep <- sqrt(raceprobs$num/raceprobs$n)

raceprobs$me <- 2*raceprobs$sep
  
raceprobs$hisprace2 <- car::Recode(raceprobs$hisprace2, recodes="2='White Men'; 3='Black Men'; else=NA",
                       as.factor=T)

ggplot(data =raceprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=hisprace2,color = hisprace2, group = hisprace2)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Race", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Adjusted First Marriage Rate") +theme_nice

#Event by Education

Results by Education

educprobs<- pydata %>%
    group_by(ageevent, babs_event) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)

educprobs$sep <- sqrt(educprobs$num/educprobs$n)

educprobs$me <- 2*educprobs$sep

educprobs$babs_event <- case_when(educprobs$babs_event == 1 ~ "BA/BS Degree",educprobs$babs_event == 0 ~ "No College Degree")

ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education (Time-Varying)", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Adjusted First Marriage Rate")+theme_nice

educprobs<- pydata %>%
    group_by(ageevent, bach_degree) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)

educprobs$sep <- sqrt(educprobs$num/educprobs$n)

educprobs$me <- 2*educprobs$sep

educprobs$babs <- case_when(educprobs$bach_degree == 1 ~ "BA/BS Degree",educprobs$bach_degree == 0 ~ "No College Degree")

ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs,color = babs, group = babs)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Adjusted First Marriage Rate")+theme_nice

Race and Education

raceeducprobs<- pydata %>%
    group_by(ageevent, hisprace2, babs_event) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)

raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)

raceeducprobs$me <- 2*raceeducprobs$sep
  
raceeducprobs$babs_event <- case_when(raceeducprobs$babs_event == 1 ~ "BA/BS Degree",raceeducprobs$babs_event == 0 ~ "No College Degree")

raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")

ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education (Time Varying)", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice

raceeducprobs<- pydata %>%
    group_by(ageevent, hisprace2, bach_degree) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)

raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)

raceeducprobs$me <- 2*raceeducprobs$sep
  
raceeducprobs$bach_degree <- case_when(raceeducprobs$bach_degree == 1 ~ "BA/BS Degree",raceeducprobs$bach_degree == 0 ~ "No College Degree")

raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")

ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=bach_degree,color = bach_degree, group = bach_degree)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice

pydata$age_cat <- car::Recode(pydata$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)

pydata$new_weight <- pydata$wgt2017_2019/mean(pydata$wgt2017_2019) #standize the weight 

pydata$black <- ifelse(pydata$hisprace2==3,1,0)

General Model

model_1 <- glm(evmarried~ age_cat + black, family = "binomial", weights=new_weight, data = pydata) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_1)
## 
## Call:
## glm(formula = evmarried ~ age_cat + black, family = "binomial", 
##     data = pydata, weights = new_weight)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0020  -0.2956  -0.1851  -0.0954   7.6805  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.09536    0.10624 -47.962  < 2e-16 ***
## age_cat20-24  2.06148    0.11493  17.937  < 2e-16 ***
## age_cat25-29  2.69533    0.11442  23.556  < 2e-16 ***
## age_cat30-34  2.64754    0.12348  21.440  < 2e-16 ***
## age_cat35-39  2.32274    0.15183  15.298  < 2e-16 ***
## age_cat40-44  2.10117    0.21108   9.954  < 2e-16 ***
## age_cat45-49  0.90428    0.55708   1.623    0.105    
## black        -0.47912    0.07779  -6.159 7.33e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13433  on 41591  degrees of freedom
## Residual deviance: 12321  on 41584  degrees of freedom
## AIC: 12429
## 
## Number of Fisher Scoring iterations: 7
model_2 <- glm(evmarried ~ age_cat + black + babs_event, family = "binomial", weights=new_weight, data = pydata) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_2)
## 
## Call:
## glm(formula = evmarried ~ age_cat + black + babs_event, family = "binomial", 
##     data = pydata, weights = new_weight)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1576  -0.2941  -0.1802  -0.0953   7.6885  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.10609    0.10631 -48.031  < 2e-16 ***
## age_cat20-24  1.96850    0.11561  17.027  < 2e-16 ***
## age_cat25-29  2.49191    0.11730  21.243  < 2e-16 ***
## age_cat30-34  2.44622    0.12614  19.392  < 2e-16 ***
## age_cat35-39  2.13813    0.15377  13.904  < 2e-16 ***
## age_cat40-44  1.92692    0.21247   9.069  < 2e-16 ***
## age_cat45-49  0.76649    0.55756   1.375    0.169    
## black        -0.39209    0.07855  -4.992 5.99e-07 ***
## babs_event    0.51762    0.05798   8.928  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13433  on 41591  degrees of freedom
## Residual deviance: 12244  on 41583  degrees of freedom
## AIC: 12345
## 
## Number of Fisher Scoring iterations: 7

White Model

pydata_whites <-filter(pydata, black == 0)

whitemodel <- glm(evmarried ~ age_cat + babs_event, family = "binomial", weights=new_weight, data = pydata_whites) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodel)
## 
## Call:
## glm(formula = evmarried ~ age_cat + babs_event, family = "binomial", 
##     data = pydata_whites, weights = new_weight)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1638  -0.3269  -0.2088  -0.1048   7.6853  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.10186    0.11292 -45.180   <2e-16 ***
## age_cat20-24  1.95864    0.12346  15.864   <2e-16 ***
## age_cat25-29  2.50161    0.12535  19.957   <2e-16 ***
## age_cat30-34  2.42261    0.13543  17.888   <2e-16 ***
## age_cat35-39  2.09886    0.16723  12.551   <2e-16 ***
## age_cat40-44  2.03891    0.22448   9.083   <2e-16 ***
## age_cat45-49  0.91816    0.58801   1.561    0.118    
## babs_event    0.51497    0.06087   8.461   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11566  on 30309  degrees of freedom
## Residual deviance: 10546  on 30302  degrees of freedom
## AIC: 10722
## 
## Number of Fisher Scoring iterations: 7

Black Model

pydata_blacks <-filter(pydata, black == 1)

blackmodel<- glm(evmarried ~ age_cat + babs_event, family = "binomial", weights=new_weight, data = pydata_blacks) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodel)
## 
## Call:
## glm(formula = evmarried ~ age_cat + babs_event, family = "binomial", 
##     data = pydata_blacks, weights = new_weight)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9297  -0.2063  -0.1325  -0.0673   6.3784  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.52834    0.30422 -18.172  < 2e-16 ***
## age_cat20-24  2.04001    0.32991   6.184 6.27e-10 ***
## age_cat25-29  2.42096    0.33390   7.250 4.15e-13 ***
## age_cat30-34  2.59231    0.34856   7.437 1.03e-13 ***
## age_cat35-39  2.34178    0.39916   5.867 4.44e-09 ***
## age_cat40-44  1.21153    0.67723   1.789   0.0736 .  
## age_cat45-49 -0.05683    1.77633  -0.032   0.9745    
## babs_event    0.54577    0.19258   2.834   0.0046 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1829.7  on 11281  degrees of freedom
## Residual deviance: 1693.2  on 11274  degrees of freedom
## AIC: 1631.3
## 
## Number of Fisher Scoring iterations: 8
whitedata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)

whitedata$phat <- predict(whitemodel, whitedata, type="response")

whitedata
##    age_cat babs_event        phat
## 1    15-19          0 0.006048585
## 2    20-24          0 0.041358991
## 3    25-29          0 0.069122309
## 4    30-34          0 0.064208888
## 5    35-39          0 0.047290468
## 6    40-44          0 0.044661688
## 7    45-49          0 0.015013139
## 8    15-19          1 0.010081805
## 9    20-24          1 0.067342106
## 10   25-29          1 0.110536144
## 11   30-34          1 0.103004646
## 12   35-39          1 0.076701787
## 13   40-44          1 0.072562615
## 14   45-49          1 0.024874367
blackdata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)

blackdata$phat <- predict(blackmodel, blackdata, type="response")

blackdata
##    age_cat babs_event        phat
## 1    15-19          0 0.003956844
## 2    20-24          0 0.029646024
## 3    25-29          0 0.042803900
## 4    30-34          0 0.050400876
## 5    35-39          0 0.039674527
## 6    40-44          0 0.013166675
## 7    45-49          0 0.003739046
## 8    15-19          1 0.006809714
## 9    20-24          1 0.050089266
## 10   25-29          1 0.071650548
## 11   30-34          1 0.083918463
## 12   35-39          1 0.066558858
## 13   40-44          1 0.022509734
## 14   45-49          1 0.006435899
whitedata$babs_event<- recode(whitedata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")

ggplot(whitedata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for White Men")

blackdata$babs_event<- recode(blackdata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")

ggplot(blackdata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for Black Men")

Swapping of models and data

newwhitedata <- pydata_whites %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

newwhitedata$phat <- predict(whitemodel, newwhitedata, type="response")

as.data.frame(newwhitedata)
##   age_cat   babs_event     n        phat
## 1   15-19 0.0004530251 11139 0.006049987
## 2   20-24 0.1590766209  8611 0.044731842
## 3   25-29 0.3684261830  5423 0.082373970
## 4   30-34 0.3642379080  2891 0.076443677
## 5   35-39 0.3322900790  1391 0.055625528
## 6   40-44 0.3039027132   646 0.051835760
## 7   45-49 0.2210554539   209 0.016792882
newblackdata <- pydata_blacks %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

newblackdata$phat <- predict(blackmodel, newblackdata, type="response")

as.data.frame(newblackdata)
##   age_cat babs_event    n        phat
## 1   15-19 0.00000000 4028 0.003956844
## 2   20-24 0.06555702 3021 0.030692792
## 3   25-29 0.16775968 2009 0.046716248
## 4   30-34 0.16704397 1114 0.054947438
## 5   35-39 0.13738011  639 0.042631917
## 6   40-44 0.15986562  336 0.014349850
## 7   45-49 0.14881902  135 0.004054127
whitemodel_blackdata <- pydata_blacks %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

whitemodel_blackdata$phat <- predict(whitemodel, whitemodel_blackdata, type="response")

as.data.frame(whitemodel_blackdata)
##   age_cat babs_event    n        phat
## 1   15-19 0.00000000 4028 0.006048585
## 2   20-24 0.06555702 3021 0.042718449
## 3   25-29 0.16775968 2009 0.074892341
## 4   30-34 0.16704397 1114 0.069575572
## 5   35-39 0.13738011  639 0.050581964
## 6   40-44 0.15986562  336 0.048308985
## 7   45-49 0.14881902  135 0.016189593
blackmodel_whitedata <- pydata_whites %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

blackmodel_whitedata$phat <- predict(blackmodel, blackmodel_whitedata, type="response")

as.data.frame(blackmodel_whitedata)
##   age_cat   babs_event     n        phat
## 1   15-19 0.0004530251 11139 0.003957819
## 2   20-24 0.1590766209  8611 0.032248201
## 3   25-29 0.3684261830  5423 0.051842886
## 4   30-34 0.3642379080  2891 0.060811308
## 5   35-39 0.3322900790  1391 0.047191128
## 6   40-44 0.3039027132   646 0.015505238
## 7   45-49 0.2210554539   209 0.004216464

Competing Risk

data$age_cat <- car::Recode(data$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)


fprobs<- data %>%
    group_by(age_cat) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())

ggplot(fprobs, aes(x = age_cat, y = p, group =1 )) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate by Age")

data <- data %>%
  mutate(nh_wh_marr = ifelse(evmarried == 1 & hisprace2 == 2,1,0),
         nh_bl_marr = ifelse(evmarried == 1 & hisprace2 == 3,1,0),
         marr_by_race = ifelse(evmarried == 1 & hisprace2 == 2,1,
                               ifelse(evmarried == 1 & hisprace2 == 3,2,0))) #1 is for NH whites marrying, 2 is for NH Blacks marrying, 0 is for no marriages whatsoever regardless of race)

nh_wh_probs<- data %>%
  group_by(age_cat) %>%
  dplyr::summarize(p=weighted.mean(nh_wh_marr,wgt2017_2019,na.rm = TRUE),n=n())

ggplot(nh_wh_probs, aes(x = factor(age_cat), y = p, group = 1)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH White) by Age")

nh_bl_probs<- data %>%
  group_by(age_cat) %>%
  dplyr::summarize(p=weighted.mean(nh_bl_marr,wgt2017_2019,na.rm = TRUE),n=n())

ggplot(nh_bl_probs, aes(x = age_cat, y = p, group =1 )) +
  geom_point() +
  geom_line() + 
  labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH Black) by Age")

library(nnet)
compriskmodel <-multinom(formula = marr_by_race ~ age_cat, data = data)
## # weights:  24 (14 variable)
## initial  value 3578.180224 
## iter  10 value 2532.030674
## iter  20 value 2486.812645
## final  value 2486.783619 
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ age_cat, data = data)
## 
## Coefficients:
##   (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1   -2.311634     2.209127     2.356211     1.780331     1.143977   0.39628878
## 2   -3.428630     1.696617     1.816311     1.427148     1.008216  -0.04545012
##   age_cat45-49
## 1    -1.011440
## 2    -1.280696
## 
## Std. Errors:
##   (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1   0.1413633    0.1593380    0.1598667    0.1706138    0.2083191    0.2834871
## 2   0.2394978    0.2727925    0.2729364    0.2928099    0.3541421    0.5614702
##   age_cat45-49
## 1    0.5281609
## 2    1.0325556
## 
## Residual Deviance: 4973.567 
## AIC: 5001.567
data$new_weight <- data$wgt2017_2019/mean(data$wgt2017_2019)

data$new_weight = data$wgt2017_2019/mean(data$wgt2017_2019) #creating new weight

anymodel <- glm(evmarried ~ age_cat, new_weight, family = "binomial", data = data) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(anymodel)
## 
## Call:
## glm(formula = evmarried ~ age_cat, family = "binomial", data = data, 
##     weights = new_weight)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3703  -0.8313  -0.4013   0.8676   4.4941  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.6079     0.1171 -13.731  < 2e-16 ***
## age_cat20-24   1.8421     0.1347  13.671  < 2e-16 ***
## age_cat25-29   2.1603     0.1363  15.846  < 2e-16 ***
## age_cat30-34   1.6993     0.1474  11.527  < 2e-16 ***
## age_cat35-39   1.0591     0.1783   5.940 2.85e-09 ***
## age_cat40-44   0.6287     0.2421   2.597  0.00941 ** 
## age_cat45-49  -1.5946     0.5728  -2.784  0.00537 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4505.8  on 3256  degrees of freedom
## Residual deviance: 4051.8  on 3250  degrees of freedom
## AIC: 3995.5
## 
## Number of Fisher Scoring iterations: 5
library(gbm)
## Loaded gbm 2.1.8.1
compriskmodel <-multinom(formula = marr_by_race ~ factor(evmarried), data = data)
## # weights:  9 (4 variable)
## initial  value 3578.180224 
## iter  10 value 591.840457
## final  value 577.578838 
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ factor(evmarried), data = data)
## 
## Coefficients:
##   (Intercept) factor(evmarried)1
## 1   -12.70518           24.57202
## 2   -11.03884           21.35572
## 
## Std. Errors:
##   (Intercept) factor(evmarried)1
## 1    12.79633           17.39152
## 2     5.56221           13.02539
## 
## Residual Deviance: 1155.158 
## AIC: 1163.158
anymodel <- glm(evmarried ~ factor(ageevent), weights=new_weight, family = "binomial", data = data)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(anymodel)
## 
## Call:
## glm(formula = evmarried ~ factor(ageevent), family = "binomial", 
##     data = data, weights = new_weight)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6818  -0.8337  -0.2389   0.8488   3.7547  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)        -16.63186  261.51882  -0.064    0.949
## factor(ageevent)16  11.76824  261.52258   0.045    0.964
## factor(ageevent)17  13.27880  261.51943   0.051    0.960
## factor(ageevent)18  15.71972  261.51888   0.060    0.952
## factor(ageevent)19  15.92562  261.51888   0.061    0.951
## factor(ageevent)20  16.45440  261.51886   0.063    0.950
## factor(ageevent)21  16.60451  261.51885   0.063    0.949
## factor(ageevent)22  16.82407  261.51886   0.064    0.949
## factor(ageevent)23  17.22629  261.51886   0.066    0.947
## factor(ageevent)24  17.25150  261.51886   0.066    0.947
## factor(ageevent)25  17.15481  261.51886   0.066    0.948
## factor(ageevent)26  17.49893  261.51887   0.067    0.947
## factor(ageevent)27  17.47596  261.51886   0.067    0.947
## factor(ageevent)28  16.65639  261.51886   0.064    0.949
## factor(ageevent)29  17.09184  261.51887   0.065    0.948
## factor(ageevent)30  17.05200  261.51888   0.065    0.948
## factor(ageevent)31  17.07489  261.51890   0.065    0.948
## factor(ageevent)32  16.52822  261.51888   0.063    0.950
## factor(ageevent)33  16.15155  261.51890   0.062    0.951
## factor(ageevent)34  16.75322  261.51895   0.064    0.949
## factor(ageevent)35  16.06973  261.51897   0.061    0.951
## factor(ageevent)36  16.42463  261.51892   0.063    0.950
## factor(ageevent)37  16.15067  261.51902   0.062    0.951
## factor(ageevent)38  15.94210  261.51902   0.061    0.951
## factor(ageevent)39  15.13919  261.51926   0.058    0.954
## factor(ageevent)40  16.13955  261.51905   0.062    0.951
## factor(ageevent)41  15.59706  261.51923   0.060    0.952
## factor(ageevent)42  14.64536  261.51968   0.056    0.955
## factor(ageevent)43  15.98953  261.51918   0.061    0.951
## factor(ageevent)44  14.32893  261.52106   0.055    0.956
## factor(ageevent)45  14.91363  261.51961   0.057    0.955
## factor(ageevent)46  13.34649  261.52292   0.051    0.959
## factor(ageevent)47   0.05916  588.53811   0.000    1.000
## factor(ageevent)48   0.03428  662.14810   0.000    1.000
## factor(ageevent)49   0.10625  647.05364   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4505.8  on 3256  degrees of freedom
## Residual deviance: 3869.1  on 3222  degrees of freedom
## AIC: 3865.4
## 
## Number of Fisher Scoring iterations: 15
as.data.frame(fprobs)
##   age_cat          p   n
## 1   15-19 0.16687517 628
## 2   20-24 0.55827326 811
## 3   25-29 0.63468717 788
## 4   30-34 0.52282340 510
## 5   35-39 0.36613989 252
## 6   40-44 0.27303553 152
## 7   45-49 0.03906987 116

To get those who are at risk based on race and babs event

#Total population hisprace2( Black)
#Total Population hisprace2(white)
#Group evmarried and hisprace2, 
#babs_event is they have bachelor's degree or not 



pydata %>%
  group_by(hisprace2, babs_event) %>% # 3 is black 
  summarise(Count = n())
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   hisprace2 [2]
##   hisprace2 babs_event Count
##       <dbl>      <dbl> <int>
## 1         2          0 25654
## 2         2          1  4656
## 3         3          0 10417
## 4         3          1   865
# To get those who experienced the event
pydata %>%
  group_by(hisprace2, babs_event) %>%
  summarise(Count = sum(evmarried == 1))
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   hisprace2 [2]
##   hisprace2 babs_event Count
##       <dbl>      <dbl> <int>
## 1         2          0   682
## 2         2          1   345
## 3         3          0   180
## 4         3          1    38