Load Libraries

# theme for nice plotting
theme_nice <- theme_classic()+
                theme(
                  axis.line.y.left = element_line(colour = "black"),
                  axis.line.y.right = element_line(colour = "black"),
                  axis.line.x.bottom = element_line(colour = "black"),
                  axis.line.x.top = element_line(colour = "black"),
                  axis.text.y = element_text(colour = "black", size = 12),
                  axis.text.x = element_text(color = "black", size = 12),
                  axis.ticks = element_line(color = "black")) +
                theme(
                  axis.ticks.length = unit(-0.25, "cm"), 
                  axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")), 
                  axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2426326 129.6    4148825 221.6         NA  4148825 221.6
## Vcells 4098232  31.3    8388608  64.0      16384  6257752  47.8
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
data<- read_csv("2017_2019_Male_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 5206 Columns: 3009
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1939): caseid, rscrninf, rscrage, rscrhisp, rscrrace, age_a, age_r, age...
## lgl (1070): sedbclc3, sedbclc4, sedwhlc3, sedwhlc4, sedconlc4, p2currwife, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Sub-select just variable you will be needing

# Create a new data frame 'data' with selected columns
data <- data[, c("caseid","age_r", "fmarital", "agemarr", "agemarrn4", "agemarrn2", "agemarrn3", "hisagem", "evrmarry", "agemarrn", "hisprace2", "fmarno", "nformwife", "hieduc", "wgt2017_2019", "earnba_y", "intvwyear", "sest", "secu")]

Determine ever married status

data <- data %>%
  mutate(evmarried = ifelse(fmarno == 1 & fmarital == 1, 1, 
                                     ifelse(fmarno == 0 & fmarital == 5, 0, 
                                            ifelse(fmarno %in% c(1:5), 1, NA))))
data %>%
  group_by(evmarried) %>%
  dplyr::summarize(n=n()) %>%
  dplyr::mutate(pct = n / sum(n))
## # A tibble: 2 × 3
##   evmarried     n   pct
##       <dbl> <int> <dbl>
## 1         0  3245 0.623
## 2         1  1961 0.377

Age at event

#ever married age event
data$ageevent<- pmin(data$hisagem,data$agemarrn, na.rm=T)



#never married age 
data$ageevent <- ifelse(data$fmarital == 5, data$age_r, data$ageevent)

# filter for NA values 
data <- data %>% 
  filter(ageevent !=98 & !is.na(ageevent))
tabyl(data$ageevent)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## ℹ The deprecated feature was likely used in the janitor package.
##   Please report the issue at <]8;;https://github.com/sfirke/janitor/issueshttps://github.com/sfirke/janitor/issues]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##  data$ageevent   n     percent
##             15 186 0.036399217
##             16 192 0.037573386
##             17 234 0.045792564
##             18 271 0.053033268
##             19 276 0.054011742
##             20 199 0.038943249
##             21 264 0.051663405
##             22 248 0.048532290
##             23 251 0.049119374
##             24 267 0.052250489
##             25 284 0.055577299
##             26 240 0.046966732
##             27 234 0.045792564
##             28 239 0.046771037
##             29 206 0.040313112
##             30 197 0.038551859
##             31 165 0.032289628
##             32 162 0.031702544
##             33 146 0.028571429
##             34 108 0.021135029
##             35  92 0.018003914
##             36  82 0.016046967
##             37  73 0.014285714
##             38  64 0.012524462
##             39  57 0.011154599
##             40  58 0.011350294
##             41  44 0.008610568
##             42  50 0.009784736
##             43  32 0.006262231
##             44  31 0.006066536
##             45  30 0.005870841
##             46  35 0.006849315
##             47  33 0.006457926
##             48  29 0.005675147
##             49  31 0.006066536

Filter data for NH White and NH Black

data <- data %>% 
  filter(hisprace2 == 2 | hisprace2 == 3)

Determine the age they got their bachelor’s degree

data <- data %>%
  mutate(
    bach_degree = ifelse(hieduc %in% c(5:11), 0,
                  ifelse(hieduc %in% c(12:15), 1, NA)),
    birth_year = intvwyear - age_r,
    age_bach_degree = earnba_y - birth_year)


data %>%
  group_by(evmarried, ageevent, age_bach_degree) %>%
  dplyr::summarize(n=n()) %>%
  mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent'. You can override
## using the `.groups` argument.
## # A tibble: 379 × 5
## # Groups:   evmarried, ageevent [66]
##    evmarried ageevent age_bach_degree     n    pct
##        <dbl>    <dbl>           <dbl> <int>  <dbl>
##  1         0       15              NA   106 1     
##  2         0       16              NA    97 1     
##  3         0       17              NA   122 1     
##  4         0       18              NA   115 1     
##  5         0       19              NA   115 1     
##  6         0       20              NA    70 1     
##  7         0       21              NA    81 1     
##  8         0       22              21     4 0.0519
##  9         0       22              22     3 0.0390
## 10         0       22              NA    70 0.909 
## # ℹ 369 more rows

Create survey design

library(survey)
## Loading required package: grid
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
des<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=data,
                nest=T)

Percent with a bachelor’s degree at each age for both race

library(survey)
library(dplyr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
bach_age_race <- svyby(~bach_degree, by = ~hisprace2 + age_r, design = des, FUN = svymean) %>%
  mutate(Race = ifelse(hisprace2 == 3, "% Black with bach", "& White with bach")) %>%
  select(age_r, Race, Percent_with_college_degree = bach_degree) %>%
  spread(Race, Percent_with_college_degree)

# Print the result
print(bach_age_race)
##    age_r & White with bach % Black with bach
## 1     15       0.000000000       0.000000000
## 2     16       0.000000000       0.000000000
## 3     17       0.000000000       0.000000000
## 4     18       0.000000000       0.000000000
## 5     19       0.000000000       0.000000000
## 6     20       0.000000000       0.000000000
## 7     21       0.008250833       0.000000000
## 8     22       0.114112456       0.007067989
## 9     23       0.301123729       0.153831478
## 10    24       0.394126772       0.271459435
## 11    25       0.360019297       0.256943867
## 12    26       0.458598748       0.101478745
## 13    27       0.385167571       0.006186328
## 14    28       0.379778158       0.143184877
## 15    29       0.328598010       0.209892625
## 16    30       0.349032816       0.112009745
## 17    31       0.469542786       0.148690892
## 18    32       0.407440425       0.092208286
## 19    33       0.341284328       0.330142638
## 20    34       0.387126825       0.068720672
## 21    35       0.366626178       0.307217765
## 22    36       0.479001848       0.073041272
## 23    37       0.426800075       0.200632472
## 24    38       0.489537256       0.460298863
## 25    39       0.283914810       0.171186043
## 26    40       0.599124196       0.256314847
## 27    41       0.556878984       0.098169769
## 28    42       0.413063878       0.231067351
## 29    43       0.297915871       0.557450719
## 30    44       0.406574667       0.213307189
## 31    45       0.453818304       0.357997355
## 32    46       0.448505331       0.506315460
## 33    47       0.313390039       0.270052526
## 34    48       0.375238404       0.310944449
## 35    49       0.307178148       0.157532206
## 36    50       1.000000000                NA

Marriage and Having a BA/BS

likely events

  • When a person has never married, but does not have a BA/BS, then 1.
  • #When a person has never married, but has a BA/BS, then 2.
  • When a person has married, but does not have a BA/BS, then 3.
  • When a person has married, has a BA/BS, but got married prior to their BA/BS, then 4.
  • When a person has married, has a BA/BS, but got their BA/BS prior to marrying, then 5.
  • When a person has married, has a BA/BS, but did both at the same age, then 6.
data <- data %>%
  mutate(marriage_babs = case_when((evmarried == 0) & (is.na(age_bach_degree)) ~ 1,
                             (evmarried == 0) & (!is.na(age_bach_degree)) ~ 2,
                             (evmarried == 1) & (is.na(age_bach_degree)) ~ 3,
                             (evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent < age_bach_degree) ~ 4,
                             (evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent > age_bach_degree) ~ 5,
                             (evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent == age_bach_degree) ~ 6))

data %>%
  group_by(evmarried, ageevent, age_bach_degree, marriage_babs) %>%
  dplyr::summarize(n=n()) %>%
  mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent', 'age_bach_degree'.
## You can override using the `.groups` argument.
## # A tibble: 379 × 6
## # Groups:   evmarried, ageevent, age_bach_degree [379]
##    evmarried ageevent age_bach_degree marriage_babs     n   pct
##        <dbl>    <dbl>           <dbl>         <dbl> <int> <dbl>
##  1         0       15              NA             1   106     1
##  2         0       16              NA             1    97     1
##  3         0       17              NA             1   122     1
##  4         0       18              NA             1   115     1
##  5         0       19              NA             1   115     1
##  6         0       20              NA             1    70     1
##  7         0       21              NA             1    81     1
##  8         0       22              21             2     4     1
##  9         0       22              22             2     3     1
## 10         0       22              NA             1    70     1
## # ℹ 369 more rows
data %>% 
  group_by(marriage_babs) %>%
  dplyr::summarize(n=n()) %>%
  mutate(pct = n / sum(n))
## # A tibble: 6 × 3
##   marriage_babs     n    pct
##           <dbl> <int>  <dbl>
## 1             1  1669 0.512 
## 2             2   343 0.105 
## 3             3   783 0.240 
## 4             4    79 0.0243
## 5             5   346 0.106 
## 6             6    37 0.0114

conditional probabilities

library(ggsurvfit)
## 
## Attaching package: 'ggsurvfit'
## The following object is masked from 'package:psych':
## 
##     %+%
library(relsurv)
## Loading required package: date
## 
## Attaching package: 'relsurv'
## The following object is masked from 'package:lubridate':
## 
##     years
pydata<-survSplit(data, cut=c(15:49),end="ageevent",event="evmarried")


pydata$babs_event <- case_when(pydata$marriage_babs == 1 ~ 0,
                               pydata$marriage_babs == 3 ~ 0,
                               pydata$marriage_babs == 4 ~ 0,
                               pydata$marriage_babs == 2 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
                               pydata$marriage_babs == 2 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
                               pydata$marriage_babs == 5 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
                               pydata$marriage_babs == 5 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
                               pydata$marriage_babs == 6 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
                               pydata$marriage_babs == 6 & (pydata$ageevent < pydata$age_bach_degree) ~ 0)
head(pydata[, c("caseid", "ageevent", "age_bach_degree", "marriage_babs")], n=50)
##    caseid ageevent age_bach_degree marriage_babs
## 1   80717       15              22             2
## 2   80717       16              22             2
## 3   80717       17              22             2
## 4   80717       18              22             2
## 5   80717       19              22             2
## 6   80717       20              22             2
## 7   80717       21              22             2
## 8   80717       22              22             2
## 9   80717       23              22             2
## 10  80717       24              22             2
## 11  80717       25              22             2
## 12  80717       26              22             2
## 13  80717       27              22             2
## 14  80717       28              22             2
## 15  80717       29              22             2
## 16  80717       30              22             2
## 17  80717       31              22             2
## 18  80721       15              NA             1
## 19  80721       16              NA             1
## 20  80721       17              NA             1
## 21  80724       15              NA             3
## 22  80724       16              NA             3
## 23  80724       17              NA             3
## 24  80724       18              NA             3
## 25  80732       15              22             2
## 26  80732       16              22             2
## 27  80732       17              22             2
## 28  80732       18              22             2
## 29  80732       19              22             2
## 30  80732       20              22             2
## 31  80732       21              22             2
## 32  80732       22              22             2
## 33  80732       23              22             2
## 34  80732       24              22             2
## 35  80732       25              22             2
## 36  80732       26              22             2
## 37  80732       27              22             2
## 38  80732       28              22             2
## 39  80732       29              22             2
## 40  80732       30              22             2
## 41  80732       31              22             2
## 42  80732       32              22             2
## 43  80732       33              22             2
## 44  80732       34              22             2
## 45  80732       35              22             2
## 46  80732       36              22             2
## 47  80732       37              22             2
## 48  80732       38              22             2
## 49  80732       39              22             2
## 50  80734       15              21             5

Age and Race

library(survey)
des2<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=pydata,
                nest=T)
# Use svyby to calculate weighted means and sample sizes by ageevent and hisprace2
raceprobs <- svyby(~evmarried, by = ~ageevent + hisprace2, design = des2, 
                   FUN = svymean, na.rm = TRUE)

raceprobs$me <- 2*raceprobs$se
  
raceprobs$hisprace2 <- car::Recode(raceprobs$hisprace2, recodes="2='White Men'; 3='Black Men'; else=NA",
                       as.factor=T)

ggplot(data =raceprobs, aes(x = ageevent, y = evmarried, ymin=evmarried-me, ymax=evmarried+me, fill=hisprace2,color = hisprace2, group = hisprace2)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Race", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Adjusted First Marriage Rate") +theme_nice

#Event by Education

Results by Education

educprobs<- pydata %>%
    group_by(ageevent, babs_event) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)

educprobs$sep <- sqrt(educprobs$num/educprobs$n)

educprobs$me <- 2*educprobs$sep

educprobs$babs_event <- case_when(educprobs$babs_event == 1 ~ "BA/BS Degree",educprobs$babs_event == 0 ~ "No College Degree")

ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education (Time-Varying)", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Adjusted First Marriage Rate")+theme_nice

educprobs<- pydata %>%
    group_by(ageevent, bach_degree) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)

educprobs$sep <- sqrt(educprobs$num/educprobs$n)

educprobs$me <- 2*educprobs$sep

educprobs$babs <- case_when(educprobs$bach_degree == 1 ~ "BA/BS Degree",educprobs$bach_degree == 0 ~ "No College Degree")

ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs,color = babs, group = babs)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Adjusted First Marriage Rate")+theme_nice

Race and Education

raceeducprobs<- pydata %>%
    group_by(ageevent, hisprace2, babs_event) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)

raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)

raceeducprobs$me <- 2*raceeducprobs$sep
  
raceeducprobs$babs_event <- case_when(raceeducprobs$babs_event == 1 ~ "BA/BS Degree",raceeducprobs$babs_event == 0 ~ "No College Degree")

raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")

ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education (Time Varying)", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice

raceeducprobs<- pydata %>%
    group_by(ageevent, hisprace2, bach_degree) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)

raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)

raceeducprobs$me <- 2*raceeducprobs$sep
  
raceeducprobs$bach_degree <- case_when(raceeducprobs$bach_degree == 1 ~ "BA/BS Degree",raceeducprobs$bach_degree == 0 ~ "No College Degree")

raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")

ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=bach_degree,color = bach_degree, group = bach_degree)) +
     geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education", 
       subtitle="Men Ages 18 to 49", 
       caption="Source: 2017-2019 NSFG Male Respondent File") +
       xlab(label="Age at First Marriage") +
  ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice

pydata$age_cat <- car::Recode(pydata$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)

pydata$new_weight <- pydata$wgt2017_2019/mean(pydata$wgt2017_2019) #standize the weight 

pydata$black <- ifelse(pydata$hisprace2==3,1,0)

General Model

library(survey)
des2<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=pydata,
                nest=T)
model_1 <- svyglm(evmarried~ age_cat + black, family = "binomial", design=des2) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_1)
## 
## Call:
## svyglm(formula = evmarried ~ age_cat + black, design = des2, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydata, nest = T)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -5.0954     0.1729 -29.471  < 2e-16 ***
## age_cat20-24   2.0615     0.1793  11.500 2.92e-15 ***
## age_cat25-29   2.6953     0.1954  13.795  < 2e-16 ***
## age_cat30-34   2.6475     0.2050  12.914  < 2e-16 ***
## age_cat35-39   2.3227     0.2333   9.958 3.67e-13 ***
## age_cat40-44   2.1012     0.3505   5.994 2.75e-07 ***
## age_cat45-49   0.9043     0.7467   1.211    0.232    
## black         -0.4791     0.1034  -4.634 2.87e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.001212)
## 
## Number of Fisher Scoring iterations: 7
model_2 <- svyglm(evmarried ~ age_cat + black + babs_event, family = "binomial", design=des2) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_2)
## 
## Call:
## svyglm(formula = evmarried ~ age_cat + black + babs_event, design = des2, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydata, nest = T)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.10609    0.17357 -29.418  < 2e-16 ***
## age_cat20-24  1.96850    0.17849  11.028 1.65e-14 ***
## age_cat25-29  2.49191    0.18926  13.167  < 2e-16 ***
## age_cat30-34  2.44622    0.20010  12.225 4.72e-16 ***
## age_cat35-39  2.13813    0.23082   9.263 4.38e-12 ***
## age_cat40-44  1.92692    0.34570   5.574 1.25e-06 ***
## age_cat45-49  0.76649    0.74717   1.026 0.310321    
## black        -0.39209    0.10706  -3.662 0.000643 ***
## babs_event    0.51762    0.08425   6.144 1.76e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9991185)
## 
## Number of Fisher Scoring iterations: 7

White Model

pydata_whites <-filter(pydata, black == 0)


des3<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=pydata_whites,
                nest=T)


whitemodel <- svyglm(evmarried ~ age_cat + babs_event, family = "binomial", design=des3) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodel)
## 
## Call:
## svyglm(formula = evmarried ~ age_cat + babs_event, design = des3, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydata_whites, nest = T)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.10186    0.18888 -27.011  < 2e-16 ***
## age_cat20-24  1.95864    0.19621   9.982 3.39e-13 ***
## age_cat25-29  2.50161    0.20871  11.986 6.76e-16 ***
## age_cat30-34  2.42261    0.21308  11.370 4.34e-15 ***
## age_cat35-39  2.09886    0.24357   8.617 3.10e-11 ***
## age_cat40-44  2.03891    0.36225   5.628 9.78e-07 ***
## age_cat45-49  0.91816    0.81532   1.126    0.266    
## babs_event    0.51497    0.09196   5.600 1.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000333)
## 
## Number of Fisher Scoring iterations: 7

Black Model

pydata_blacks <-filter(pydata, black == 1)

des4<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~wgt2017_2019, 
                data=pydata_blacks,
                nest=T)

blackmodel<- svyglm(evmarried ~ age_cat + babs_event, family = "binomial", design =des4) 
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodel)
## 
## Call:
## svyglm(formula = evmarried ~ age_cat + babs_event, design = des4, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019, 
##     data = pydata_blacks, nest = T)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.52834    0.31255 -17.688  < 2e-16 ***
## age_cat20-24  2.04001    0.36728   5.554 1.62e-06 ***
## age_cat25-29  2.42096    0.34772   6.962 1.46e-08 ***
## age_cat30-34  2.59231    0.40946   6.331 1.21e-07 ***
## age_cat35-39  2.34178    0.49077   4.772 2.13e-05 ***
## age_cat40-44  1.21153    0.60571   2.000   0.0518 .  
## age_cat45-49 -0.05683    1.10067  -0.052   0.9591    
## babs_event    0.54577    0.29188   1.870   0.0683 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000899)
## 
## Number of Fisher Scoring iterations: 8
whitedata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)

whitedata$phat <- predict(whitemodel, whitedata, type="response")

whitedata
##    age_cat babs_event        phat
## 1    15-19          0 0.006048585
## 2    20-24          0 0.041358991
## 3    25-29          0 0.069122309
## 4    30-34          0 0.064208888
## 5    35-39          0 0.047290468
## 6    40-44          0 0.044661688
## 7    45-49          0 0.015013139
## 8    15-19          1 0.010081806
## 9    20-24          1 0.067342106
## 10   25-29          1 0.110536144
## 11   30-34          1 0.103004646
## 12   35-39          1 0.076701787
## 13   40-44          1 0.072562615
## 14   45-49          1 0.024874367
blackdata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)

blackdata$phat <- predict(blackmodel, blackdata, type="response")

blackdata
##    age_cat babs_event        phat
## 1    15-19          0 0.003956844
## 2    20-24          0 0.029646024
## 3    25-29          0 0.042803900
## 4    30-34          0 0.050400876
## 5    35-39          0 0.039674527
## 6    40-44          0 0.013166675
## 7    45-49          0 0.003739046
## 8    15-19          1 0.006809714
## 9    20-24          1 0.050089266
## 10   25-29          1 0.071650548
## 11   30-34          1 0.083918463
## 12   35-39          1 0.066558858
## 13   40-44          1 0.022509734
## 14   45-49          1 0.006435899

Probability of marrying at each age

library(survey)
library(dplyr)
library(ggplot2)

# Assuming your survey design is named des2

prop_marrying <- svyby(~evrmarry, by = ~hisprace2 + age_r + bach_degree, design = des2, FUN = svymean) %>%
  mutate(me = 2 * se,
         babs_event = case_when(bach_degree == 1 ~ "BA/BS Degree", bach_degree == 0 ~ "No College Degree"),
         hisprace2 = case_when(hisprace2 == 2 ~ "White Men", hisprace2 == 3 ~ "Black Men"))

# Create the plot
ggplot(data = prop_marrying, aes(x = age_r, y = evrmarry, ymin = evrmarry - me, ymax = evrmarry + me, fill = babs_event, color = babs_event, group = babs_event)) +
  geom_line() +
  geom_ribbon(alpha = 0.3, aes(color = NULL)) +
  labs(
    title = "Probability of Marrying by Race and Education (Time Varying)",
    subtitle = "Men Ages 18 to 49",
    caption = "Source: 2017-2019 NSFG Male Respondent File"
  ) +
  xlab(label = "Age at First Marriage") +
  ylab(label = "Adjusted First Marriage Rate") +
  facet_grid(~hisprace2)

whitedata$babs_event<- recode(whitedata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")

ggplot(whitedata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for White Men")
## Don't know how to automatically pick scale for object of type <svystat>.
## Defaulting to continuous.

blackdata$babs_event<- recode(blackdata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")

ggplot(blackdata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for Black Men")
## Don't know how to automatically pick scale for object of type <svystat>.
## Defaulting to continuous.

Swapping of models and data

newwhitedata <- pydata_whites %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

newwhitedata$phat <- predict(whitemodel, newwhitedata, type="response")

as.data.frame(newwhitedata)
##   age_cat   babs_event     n        phat
## 1   15-19 0.0004530251 11139 0.006049988
## 2   20-24 0.1590766209  8611 0.044731842
## 3   25-29 0.3684261830  5423 0.082373970
## 4   30-34 0.3642379080  2891 0.076443677
## 5   35-39 0.3322900790  1391 0.055625528
## 6   40-44 0.3039027132   646 0.051835760
## 7   45-49 0.2210554539   209 0.016792882
newblackdata <- pydata_blacks %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

newblackdata$phat <- predict(blackmodel, newblackdata, type="response")

as.data.frame(newblackdata)
##   age_cat babs_event    n        phat
## 1   15-19 0.00000000 4028 0.003956844
## 2   20-24 0.06555702 3021 0.030692792
## 3   25-29 0.16775968 2009 0.046716248
## 4   30-34 0.16704397 1114 0.054947438
## 5   35-39 0.13738011  639 0.042631917
## 6   40-44 0.15986562  336 0.014349850
## 7   45-49 0.14881902  135 0.004054127
whitemodel_blackdata <- pydata_blacks %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

whitemodel_blackdata$phat <- predict(whitemodel, whitemodel_blackdata, type="response")

as.data.frame(whitemodel_blackdata)
##   age_cat babs_event    n        phat
## 1   15-19 0.00000000 4028 0.006048585
## 2   20-24 0.06555702 3021 0.042718449
## 3   25-29 0.16775968 2009 0.074892341
## 4   30-34 0.16704397 1114 0.069575572
## 5   35-39 0.13738011  639 0.050581964
## 6   40-44 0.15986562  336 0.048308985
## 7   45-49 0.14881902  135 0.016189593
blackmodel_whitedata <- pydata_whites %>%                           
  group_by(age_cat) %>% 
  dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())

blackmodel_whitedata$phat <- predict(blackmodel, blackmodel_whitedata, type="response")

as.data.frame(blackmodel_whitedata)
##   age_cat   babs_event     n        phat
## 1   15-19 0.0004530251 11139 0.003957819
## 2   20-24 0.1590766209  8611 0.032248201
## 3   25-29 0.3684261830  5423 0.051842886
## 4   30-34 0.3642379080  2891 0.060811308
## 5   35-39 0.3322900790  1391 0.047191128
## 6   40-44 0.3039027132   646 0.015505238
## 7   45-49 0.2210554539   209 0.004216464

Competing Risk

data$age_cat <- car::Recode(data$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)


fprobs<- data %>%
    group_by(age_cat) %>%
    dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())

ggplot(fprobs, aes(x = age_cat, y = p, group =1 )) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate by Age")

data <- data %>%
  mutate(nh_wh_marr = ifelse(evmarried == 1 & hisprace2 == 2,1,0),
         nh_bl_marr = ifelse(evmarried == 1 & hisprace2 == 3,1,0),
         marr_by_race = ifelse(evmarried == 1 & hisprace2 == 2,1,
                               ifelse(evmarried == 1 & hisprace2 == 3,2,0))) #1 is for NH whites marrying, 2 is for NH Blacks marrying, 0 is for no marriages whatsoever regardless of race)

nh_wh_probs<- data %>%
  group_by(age_cat) %>%
  dplyr::summarize(p=weighted.mean(nh_wh_marr,wgt2017_2019,na.rm = TRUE),n=n())

ggplot(nh_wh_probs, aes(x = factor(age_cat), y = p, group = 1)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH White) by Age")

nh_bl_probs<- data %>%
  group_by(age_cat) %>%
  dplyr::summarize(p=weighted.mean(nh_bl_marr,wgt2017_2019,na.rm = TRUE),n=n())

ggplot(nh_bl_probs, aes(x = age_cat, y = p, group =1 )) +
  geom_point() +
  geom_line() + 
  labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH Black) by Age")

library(nnet)
compriskmodel <-multinom(formula = marr_by_race ~ age_cat, data = data)
## # weights:  24 (14 variable)
## initial  value 3578.180224 
## iter  10 value 2532.030674
## iter  20 value 2486.812645
## final  value 2486.783619 
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ age_cat, data = data)
## 
## Coefficients:
##   (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1   -2.311634     2.209127     2.356211     1.780331     1.143977   0.39628878
## 2   -3.428630     1.696617     1.816311     1.427148     1.008216  -0.04545012
##   age_cat45-49
## 1    -1.011440
## 2    -1.280696
## 
## Std. Errors:
##   (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1   0.1413633    0.1593380    0.1598667    0.1706138    0.2083191    0.2834871
## 2   0.2394978    0.2727925    0.2729364    0.2928099    0.3541421    0.5614702
##   age_cat45-49
## 1    0.5281609
## 2    1.0325556
## 
## Residual Deviance: 4973.567 
## AIC: 5001.567
data$new_weight <- data$wgt2017_2019/mean(data$wgt2017_2019)

data$new_weight = data$wgt2017_2019/mean(data$wgt2017_2019) #creating new weight


des5<-svydesign(ids = ~secu,
                strata = ~sest,
                weights=~new_weight, 
                data=data,
                nest=T)

anymodel <- svyglm(evmarried ~ age_cat, design = des5) 

summary(anymodel)
## 
## Call:
## svyglm(formula = evmarried ~ age_cat, design = des5)
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~new_weight, 
##     data = data, nest = T)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.16688    0.02647   6.303 8.62e-08 ***
## age_cat20-24  0.39140    0.04738   8.260 8.92e-11 ***
## age_cat25-29  0.46781    0.03813  12.268  < 2e-16 ***
## age_cat30-34  0.35595    0.04737   7.515 1.20e-09 ***
## age_cat35-39  0.19926    0.05698   3.497  0.00102 ** 
## age_cat40-44  0.10616    0.07693   1.380  0.17398    
## age_cat45-49 -0.12781    0.03734  -3.423  0.00128 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2176265)
## 
## Number of Fisher Scoring iterations: 2
library(gbm)
## Loaded gbm 2.1.8.1
compriskmodel <-multinom(formula = marr_by_race ~ factor(evmarried), data = data)
## # weights:  9 (4 variable)
## initial  value 3578.180224 
## iter  10 value 591.840457
## final  value 577.578838 
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ factor(evmarried), data = data)
## 
## Coefficients:
##   (Intercept) factor(evmarried)1
## 1   -12.70518           24.57202
## 2   -11.03884           21.35572
## 
## Std. Errors:
##   (Intercept) factor(evmarried)1
## 1    12.79633           17.39152
## 2     5.56221           13.02539
## 
## Residual Deviance: 1155.158 
## AIC: 1163.158
anymodel <- svyglm(evmarried ~ factor(ageevent), weights=new_weight, design = des5)
summary(anymodel)
## 
## Call:
## svyglm(formula = evmarried ~ factor(ageevent), design = des5, 
##     weights = new_weight)
## 
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~new_weight, 
##     data = data, nest = T)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.145e-15  1.353e-15   0.846 0.407409    
## factor(ageevent)16  3.475e-03  3.563e-03   0.975 0.341016    
## factor(ageevent)17  1.208e-02  6.350e-03   1.902 0.071635 .  
## factor(ageevent)18  3.762e-01  1.089e-01   3.454 0.002510 ** 
## factor(ageevent)19  4.492e-01  1.269e-01   3.540 0.002055 ** 
## factor(ageevent)20  4.319e-01  1.103e-01   3.914 0.000859 ***
## factor(ageevent)21  3.980e-01  1.271e-01   3.130 0.005270 ** 
## factor(ageevent)22  5.402e-01  1.300e-01   4.155 0.000490 ***
## factor(ageevent)23  7.643e-01  6.390e-02  11.961 1.44e-10 ***
## factor(ageevent)24  7.437e-01  6.603e-02  11.264 4.13e-10 ***
## factor(ageevent)25  5.961e-01  9.722e-02   6.132 5.43e-06 ***
## factor(ageevent)26  7.963e-01  7.623e-02  10.446 1.51e-09 ***
## factor(ageevent)27  7.834e-01  5.894e-02  13.293 2.19e-11 ***
## factor(ageevent)28  5.408e-01  9.901e-02   5.463 2.39e-05 ***
## factor(ageevent)29  6.583e-01  9.477e-02   6.946 9.60e-07 ***
## factor(ageevent)30  6.409e-01  1.138e-01   5.631 1.64e-05 ***
## factor(ageevent)31  7.304e-01  9.434e-02   7.742 1.93e-07 ***
## factor(ageevent)32  4.712e-01  1.156e-01   4.074 0.000591 ***
## factor(ageevent)33  2.932e-01  9.756e-02   3.005 0.006989 ** 
## factor(ageevent)34  5.723e-01  1.553e-01   3.686 0.001466 ** 
## factor(ageevent)35  3.778e-01  2.041e-01   1.851 0.078950 .  
## factor(ageevent)36  4.855e-01  1.533e-01   3.167 0.004846 ** 
## factor(ageevent)37  5.103e-01  1.387e-01   3.679 0.001487 ** 
## factor(ageevent)38  3.918e-01  1.280e-01   3.061 0.006171 ** 
## factor(ageevent)39  1.229e-01  1.038e-01   1.184 0.250220    
## factor(ageevent)40  6.368e-01  2.014e-01   3.162 0.004905 ** 
## factor(ageevent)41  3.626e-01  1.495e-01   2.425 0.024887 *  
## factor(ageevent)42  1.208e-01  7.206e-02   1.677 0.109119    
## factor(ageevent)43  6.798e-01  2.121e-01   3.206 0.004440 ** 
## factor(ageevent)44  1.298e-01  1.164e-01   1.115 0.278089    
## factor(ageevent)45  1.540e-01  1.065e-01   1.445 0.163895    
## factor(ageevent)46  1.820e-02  1.921e-02   0.947 0.354719    
## factor(ageevent)47 -9.765e-16  1.312e-15  -0.744 0.465424    
## factor(ageevent)48 -9.378e-16  1.300e-15  -0.721 0.479114    
## factor(ageevent)49 -9.109e-16  1.300e-15  -0.700 0.491690    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.4258265)
## 
## Number of Fisher Scoring iterations: 2
as.data.frame(fprobs)
##   age_cat          p   n
## 1   15-19 0.16687517 628
## 2   20-24 0.55827326 811
## 3   25-29 0.63468717 788
## 4   30-34 0.52282340 510
## 5   35-39 0.36613989 252
## 6   40-44 0.27303553 152
## 7   45-49 0.03906987 116

To get those who are at risk based on race and babs event

#Total population hisprace2( Black)
#Total Population hisprace2(white)
#Group evmarried and hisprace2, 
#babs_event is they have bachelor's degree or not 



pydata %>%
  group_by(hisprace2, babs_event) %>% # 3 is black 
  summarise(Count = n())
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   hisprace2 [2]
##   hisprace2 babs_event Count
##       <dbl>      <dbl> <int>
## 1         2          0 25654
## 2         2          1  4656
## 3         3          0 10417
## 4         3          1   865
# To get those who experienced the event
pydata %>%
  group_by(hisprace2, babs_event) %>%
  summarise(Count = sum(evmarried == 1))
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups:   hisprace2 [2]
##   hisprace2 babs_event Count
##       <dbl>      <dbl> <int>
## 1         2          0   682
## 2         2          1   345
## 3         3          0   180
## 4         3          1    38