# theme for nice plotting
theme_nice <- theme_classic()+
theme(
axis.line.y.left = element_line(colour = "black"),
axis.line.y.right = element_line(colour = "black"),
axis.line.x.bottom = element_line(colour = "black"),
axis.line.x.top = element_line(colour = "black"),
axis.text.y = element_text(colour = "black", size = 12),
axis.text.x = element_text(color = "black", size = 12),
axis.ticks = element_line(color = "black")) +
theme(
axis.ticks.length = unit(-0.25, "cm"),
axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2426161 129.6 4148770 221.6 NA 4148770 221.6
## Vcells 4097332 31.3 8388608 64.0 16384 6229455 47.6
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
data<- read_csv("2017_2019_Male_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 5206 Columns: 3009
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1939): caseid, rscrninf, rscrage, rscrhisp, rscrrace, age_a, age_r, age...
## lgl (1070): sedbclc3, sedbclc4, sedwhlc3, sedwhlc4, sedconlc4, p2currwife, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Create a new data frame 'data' with selected columns
data <- data[, c("caseid","age_r", "fmarital", "agemarr", "agemarrn4", "agemarrn2", "agemarrn3", "hisagem", "evrmarry", "agemarrn", "hisprace2", "fmarno", "nformwife", "hieduc", "wgt2017_2019", "earnba_y", "intvwyear")]
data <- data %>%
mutate(evmarried = ifelse(fmarno == 1 & fmarital == 1, 1,
ifelse(fmarno == 0 & fmarital == 5, 0,
ifelse(fmarno %in% c(1:5), 1, NA))))
data %>%
group_by(evmarried) %>%
dplyr::summarize(n=n()) %>%
dplyr::mutate(pct = n / sum(n))
## # A tibble: 2 × 3
## evmarried n pct
## <dbl> <int> <dbl>
## 1 0 3245 0.623
## 2 1 1961 0.377
#ever married age event
data$ageevent<- pmin(data$hisagem,data$agemarrn, na.rm=T)
#never married age
data$ageevent <- ifelse(data$fmarital == 5, data$age_r, data$ageevent)
# filter for NA values
data <- data %>%
filter(ageevent !=98 & !is.na(ageevent))
tabyl(data$ageevent)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## ℹ The deprecated feature was likely used in the janitor package.
## Please report the issue at <]8;;https://github.com/sfirke/janitor/issueshttps://github.com/sfirke/janitor/issues]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## data$ageevent n percent
## 15 186 0.036399217
## 16 192 0.037573386
## 17 234 0.045792564
## 18 271 0.053033268
## 19 276 0.054011742
## 20 199 0.038943249
## 21 264 0.051663405
## 22 248 0.048532290
## 23 251 0.049119374
## 24 267 0.052250489
## 25 284 0.055577299
## 26 240 0.046966732
## 27 234 0.045792564
## 28 239 0.046771037
## 29 206 0.040313112
## 30 197 0.038551859
## 31 165 0.032289628
## 32 162 0.031702544
## 33 146 0.028571429
## 34 108 0.021135029
## 35 92 0.018003914
## 36 82 0.016046967
## 37 73 0.014285714
## 38 64 0.012524462
## 39 57 0.011154599
## 40 58 0.011350294
## 41 44 0.008610568
## 42 50 0.009784736
## 43 32 0.006262231
## 44 31 0.006066536
## 45 30 0.005870841
## 46 35 0.006849315
## 47 33 0.006457926
## 48 29 0.005675147
## 49 31 0.006066536
data <- data %>%
filter(hisprace2 == 2 | hisprace2 == 3)
data <- data %>%
mutate(
bach_degree = ifelse(hieduc %in% c(5:11), 0,
ifelse(hieduc %in% c(12:15), 1, NA)),
birth_year = intvwyear - age_r,
age_bach_degree = earnba_y - birth_year)
data %>%
group_by(evmarried, ageevent, age_bach_degree) %>%
dplyr::summarize(n=n()) %>%
mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent'. You can override
## using the `.groups` argument.
## # A tibble: 379 × 5
## # Groups: evmarried, ageevent [66]
## evmarried ageevent age_bach_degree n pct
## <dbl> <dbl> <dbl> <int> <dbl>
## 1 0 15 NA 106 1
## 2 0 16 NA 97 1
## 3 0 17 NA 122 1
## 4 0 18 NA 115 1
## 5 0 19 NA 115 1
## 6 0 20 NA 70 1
## 7 0 21 NA 81 1
## 8 0 22 21 4 0.0519
## 9 0 22 22 3 0.0390
## 10 0 22 NA 70 0.909
## # ℹ 369 more rows
likely events
data <- data %>%
mutate(marriage_babs = case_when((evmarried == 0) & (is.na(age_bach_degree)) ~ 1,
(evmarried == 0) & (!is.na(age_bach_degree)) ~ 2,
(evmarried == 1) & (is.na(age_bach_degree)) ~ 3,
(evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent < age_bach_degree) ~ 4,
(evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent > age_bach_degree) ~ 5,
(evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent == age_bach_degree) ~ 6))
data %>%
group_by(evmarried, ageevent, age_bach_degree, marriage_babs) %>%
dplyr::summarize(n=n()) %>%
mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent', 'age_bach_degree'.
## You can override using the `.groups` argument.
## # A tibble: 379 × 6
## # Groups: evmarried, ageevent, age_bach_degree [379]
## evmarried ageevent age_bach_degree marriage_babs n pct
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 0 15 NA 1 106 1
## 2 0 16 NA 1 97 1
## 3 0 17 NA 1 122 1
## 4 0 18 NA 1 115 1
## 5 0 19 NA 1 115 1
## 6 0 20 NA 1 70 1
## 7 0 21 NA 1 81 1
## 8 0 22 21 2 4 1
## 9 0 22 22 2 3 1
## 10 0 22 NA 1 70 1
## # ℹ 369 more rows
data %>%
group_by(marriage_babs) %>%
dplyr::summarize(n=n()) %>%
mutate(pct = n / sum(n))
## # A tibble: 6 × 3
## marriage_babs n pct
## <dbl> <int> <dbl>
## 1 1 1669 0.512
## 2 2 343 0.105
## 3 3 783 0.240
## 4 4 79 0.0243
## 5 5 346 0.106
## 6 6 37 0.0114
library(ggsurvfit)
##
## Attaching package: 'ggsurvfit'
## The following object is masked from 'package:psych':
##
## %+%
library(relsurv)
## Loading required package: survival
## Loading required package: date
##
## Attaching package: 'relsurv'
## The following object is masked from 'package:lubridate':
##
## years
pydata<-survSplit(data, cut=c(15:49),end="ageevent",event="evmarried")
pydata$babs_event <- case_when(pydata$marriage_babs == 1 ~ 0,
pydata$marriage_babs == 3 ~ 0,
pydata$marriage_babs == 4 ~ 0,
pydata$marriage_babs == 2 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
pydata$marriage_babs == 2 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
pydata$marriage_babs == 5 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
pydata$marriage_babs == 5 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
pydata$marriage_babs == 6 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
pydata$marriage_babs == 6 & (pydata$ageevent < pydata$age_bach_degree) ~ 0)
head(pydata[, c("caseid", "ageevent", "age_bach_degree", "marriage_babs")], n=50)
## caseid ageevent age_bach_degree marriage_babs
## 1 80717 15 22 2
## 2 80717 16 22 2
## 3 80717 17 22 2
## 4 80717 18 22 2
## 5 80717 19 22 2
## 6 80717 20 22 2
## 7 80717 21 22 2
## 8 80717 22 22 2
## 9 80717 23 22 2
## 10 80717 24 22 2
## 11 80717 25 22 2
## 12 80717 26 22 2
## 13 80717 27 22 2
## 14 80717 28 22 2
## 15 80717 29 22 2
## 16 80717 30 22 2
## 17 80717 31 22 2
## 18 80721 15 NA 1
## 19 80721 16 NA 1
## 20 80721 17 NA 1
## 21 80724 15 NA 3
## 22 80724 16 NA 3
## 23 80724 17 NA 3
## 24 80724 18 NA 3
## 25 80732 15 22 2
## 26 80732 16 22 2
## 27 80732 17 22 2
## 28 80732 18 22 2
## 29 80732 19 22 2
## 30 80732 20 22 2
## 31 80732 21 22 2
## 32 80732 22 22 2
## 33 80732 23 22 2
## 34 80732 24 22 2
## 35 80732 25 22 2
## 36 80732 26 22 2
## 37 80732 27 22 2
## 38 80732 28 22 2
## 39 80732 29 22 2
## 40 80732 30 22 2
## 41 80732 31 22 2
## 42 80732 32 22 2
## 43 80732 33 22 2
## 44 80732 34 22 2
## 45 80732 35 22 2
## 46 80732 36 22 2
## 47 80732 37 22 2
## 48 80732 38 22 2
## 49 80732 39 22 2
## 50 80734 15 21 5
raceprobs<- pydata %>%
dplyr::group_by(ageevent, hisprace2) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
raceprobs$num <- raceprobs$p*(1-raceprobs$p)
raceprobs$sep <- sqrt(raceprobs$num/raceprobs$n)
raceprobs$me <- 2*raceprobs$sep
raceprobs$hisprace2 <- car::Recode(raceprobs$hisprace2, recodes="2='White Men'; 3='Black Men'; else=NA",
as.factor=T)
ggplot(data =raceprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=hisprace2,color = hisprace2, group = hisprace2)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Race",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate") +theme_nice
#Event by Education
educprobs<- pydata %>%
group_by(ageevent, babs_event) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)
educprobs$sep <- sqrt(educprobs$num/educprobs$n)
educprobs$me <- 2*educprobs$sep
educprobs$babs_event <- case_when(educprobs$babs_event == 1 ~ "BA/BS Degree",educprobs$babs_event == 0 ~ "No College Degree")
ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education (Time-Varying)",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate")+theme_nice
educprobs<- pydata %>%
group_by(ageevent, bach_degree) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)
educprobs$sep <- sqrt(educprobs$num/educprobs$n)
educprobs$me <- 2*educprobs$sep
educprobs$babs <- case_when(educprobs$bach_degree == 1 ~ "BA/BS Degree",educprobs$bach_degree == 0 ~ "No College Degree")
ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs,color = babs, group = babs)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate")+theme_nice
raceeducprobs<- pydata %>%
group_by(ageevent, hisprace2, babs_event) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)
raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)
raceeducprobs$me <- 2*raceeducprobs$sep
raceeducprobs$babs_event <- case_when(raceeducprobs$babs_event == 1 ~ "BA/BS Degree",raceeducprobs$babs_event == 0 ~ "No College Degree")
raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")
ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education (Time Varying)",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice
raceeducprobs<- pydata %>%
group_by(ageevent, hisprace2, bach_degree) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)
raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)
raceeducprobs$me <- 2*raceeducprobs$sep
raceeducprobs$bach_degree <- case_when(raceeducprobs$bach_degree == 1 ~ "BA/BS Degree",raceeducprobs$bach_degree == 0 ~ "No College Degree")
raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")
ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=bach_degree,color = bach_degree, group = bach_degree)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice
pydata$age_cat <- car::Recode(pydata$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)
pydata$new_weight <- pydata$wgt2017_2019/mean(pydata$wgt2017_2019) #standize the weight
pydata$black <- ifelse(pydata$hisprace2==3,1,0)
model_1 <- glm(evmarried~ age_cat + black, family = "binomial", weights=new_weight, data = pydata)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_1)
##
## Call:
## glm(formula = evmarried ~ age_cat + black, family = "binomial",
## data = pydata, weights = new_weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0020 -0.2956 -0.1851 -0.0954 7.6805
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.09536 0.10624 -47.962 < 2e-16 ***
## age_cat20-24 2.06148 0.11493 17.937 < 2e-16 ***
## age_cat25-29 2.69533 0.11442 23.556 < 2e-16 ***
## age_cat30-34 2.64754 0.12348 21.440 < 2e-16 ***
## age_cat35-39 2.32274 0.15183 15.298 < 2e-16 ***
## age_cat40-44 2.10117 0.21108 9.954 < 2e-16 ***
## age_cat45-49 0.90428 0.55708 1.623 0.105
## black -0.47912 0.07779 -6.159 7.33e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13433 on 41591 degrees of freedom
## Residual deviance: 12321 on 41584 degrees of freedom
## AIC: 12429
##
## Number of Fisher Scoring iterations: 7
model_2 <- glm(evmarried ~ age_cat + black + babs_event, family = "binomial", weights=new_weight, data = pydata)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_2)
##
## Call:
## glm(formula = evmarried ~ age_cat + black + babs_event, family = "binomial",
## data = pydata, weights = new_weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1576 -0.2941 -0.1802 -0.0953 7.6885
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.10609 0.10631 -48.031 < 2e-16 ***
## age_cat20-24 1.96850 0.11561 17.027 < 2e-16 ***
## age_cat25-29 2.49191 0.11730 21.243 < 2e-16 ***
## age_cat30-34 2.44622 0.12614 19.392 < 2e-16 ***
## age_cat35-39 2.13813 0.15377 13.904 < 2e-16 ***
## age_cat40-44 1.92692 0.21247 9.069 < 2e-16 ***
## age_cat45-49 0.76649 0.55756 1.375 0.169
## black -0.39209 0.07855 -4.992 5.99e-07 ***
## babs_event 0.51762 0.05798 8.928 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13433 on 41591 degrees of freedom
## Residual deviance: 12244 on 41583 degrees of freedom
## AIC: 12345
##
## Number of Fisher Scoring iterations: 7
pydata_whites <-filter(pydata, black == 0)
whitemodel <- glm(evmarried ~ age_cat + babs_event, family = "binomial", weights=new_weight, data = pydata_whites)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodel)
##
## Call:
## glm(formula = evmarried ~ age_cat + babs_event, family = "binomial",
## data = pydata_whites, weights = new_weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1638 -0.3269 -0.2088 -0.1048 7.6853
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.10186 0.11292 -45.180 <2e-16 ***
## age_cat20-24 1.95864 0.12346 15.864 <2e-16 ***
## age_cat25-29 2.50161 0.12535 19.957 <2e-16 ***
## age_cat30-34 2.42261 0.13543 17.888 <2e-16 ***
## age_cat35-39 2.09886 0.16723 12.551 <2e-16 ***
## age_cat40-44 2.03891 0.22448 9.083 <2e-16 ***
## age_cat45-49 0.91816 0.58801 1.561 0.118
## babs_event 0.51497 0.06087 8.461 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11566 on 30309 degrees of freedom
## Residual deviance: 10546 on 30302 degrees of freedom
## AIC: 10722
##
## Number of Fisher Scoring iterations: 7
pydata_blacks <-filter(pydata, black == 1)
blackmodel<- glm(evmarried ~ age_cat + babs_event, family = "binomial", weights=new_weight, data = pydata_blacks)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodel)
##
## Call:
## glm(formula = evmarried ~ age_cat + babs_event, family = "binomial",
## data = pydata_blacks, weights = new_weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9297 -0.2063 -0.1325 -0.0673 6.3784
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.52834 0.30422 -18.172 < 2e-16 ***
## age_cat20-24 2.04001 0.32991 6.184 6.27e-10 ***
## age_cat25-29 2.42096 0.33390 7.250 4.15e-13 ***
## age_cat30-34 2.59231 0.34856 7.437 1.03e-13 ***
## age_cat35-39 2.34178 0.39916 5.867 4.44e-09 ***
## age_cat40-44 1.21153 0.67723 1.789 0.0736 .
## age_cat45-49 -0.05683 1.77633 -0.032 0.9745
## babs_event 0.54577 0.19258 2.834 0.0046 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1829.7 on 11281 degrees of freedom
## Residual deviance: 1693.2 on 11274 degrees of freedom
## AIC: 1631.3
##
## Number of Fisher Scoring iterations: 8
whitedata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)
whitedata$phat <- predict(whitemodel, whitedata, type="response")
whitedata
## age_cat babs_event phat
## 1 15-19 0 0.006048585
## 2 20-24 0 0.041358991
## 3 25-29 0 0.069122309
## 4 30-34 0 0.064208888
## 5 35-39 0 0.047290468
## 6 40-44 0 0.044661688
## 7 45-49 0 0.015013139
## 8 15-19 1 0.010081805
## 9 20-24 1 0.067342106
## 10 25-29 1 0.110536144
## 11 30-34 1 0.103004646
## 12 35-39 1 0.076701787
## 13 40-44 1 0.072562615
## 14 45-49 1 0.024874367
blackdata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)
blackdata$phat <- predict(blackmodel, blackdata, type="response")
blackdata
## age_cat babs_event phat
## 1 15-19 0 0.003956844
## 2 20-24 0 0.029646024
## 3 25-29 0 0.042803900
## 4 30-34 0 0.050400876
## 5 35-39 0 0.039674527
## 6 40-44 0 0.013166675
## 7 45-49 0 0.003739046
## 8 15-19 1 0.006809714
## 9 20-24 1 0.050089266
## 10 25-29 1 0.071650548
## 11 30-34 1 0.083918463
## 12 35-39 1 0.066558858
## 13 40-44 1 0.022509734
## 14 45-49 1 0.006435899
whitedata$babs_event<- recode(whitedata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")
ggplot(whitedata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for White Men")
blackdata$babs_event<- recode(blackdata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")
ggplot(blackdata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for Black Men")
newwhitedata <- pydata_whites %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
newwhitedata$phat <- predict(whitemodel, newwhitedata, type="response")
as.data.frame(newwhitedata)
## age_cat babs_event n phat
## 1 15-19 0.0004530251 11139 0.006049987
## 2 20-24 0.1590766209 8611 0.044731842
## 3 25-29 0.3684261830 5423 0.082373970
## 4 30-34 0.3642379080 2891 0.076443677
## 5 35-39 0.3322900790 1391 0.055625528
## 6 40-44 0.3039027132 646 0.051835760
## 7 45-49 0.2210554539 209 0.016792882
newblackdata <- pydata_blacks %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
newblackdata$phat <- predict(blackmodel, newblackdata, type="response")
as.data.frame(newblackdata)
## age_cat babs_event n phat
## 1 15-19 0.00000000 4028 0.003956844
## 2 20-24 0.06555702 3021 0.030692792
## 3 25-29 0.16775968 2009 0.046716248
## 4 30-34 0.16704397 1114 0.054947438
## 5 35-39 0.13738011 639 0.042631917
## 6 40-44 0.15986562 336 0.014349850
## 7 45-49 0.14881902 135 0.004054127
whitemodel_blackdata <- pydata_blacks %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
whitemodel_blackdata$phat <- predict(whitemodel, whitemodel_blackdata, type="response")
as.data.frame(whitemodel_blackdata)
## age_cat babs_event n phat
## 1 15-19 0.00000000 4028 0.006048585
## 2 20-24 0.06555702 3021 0.042718449
## 3 25-29 0.16775968 2009 0.074892341
## 4 30-34 0.16704397 1114 0.069575572
## 5 35-39 0.13738011 639 0.050581964
## 6 40-44 0.15986562 336 0.048308985
## 7 45-49 0.14881902 135 0.016189593
blackmodel_whitedata <- pydata_whites %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
blackmodel_whitedata$phat <- predict(blackmodel, blackmodel_whitedata, type="response")
as.data.frame(blackmodel_whitedata)
## age_cat babs_event n phat
## 1 15-19 0.0004530251 11139 0.003957819
## 2 20-24 0.1590766209 8611 0.032248201
## 3 25-29 0.3684261830 5423 0.051842886
## 4 30-34 0.3642379080 2891 0.060811308
## 5 35-39 0.3322900790 1391 0.047191128
## 6 40-44 0.3039027132 646 0.015505238
## 7 45-49 0.2210554539 209 0.004216464
data$age_cat <- car::Recode(data$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)
fprobs<- data %>%
group_by(age_cat) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(fprobs, aes(x = age_cat, y = p, group =1 )) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate by Age")
data <- data %>%
mutate(nh_wh_marr = ifelse(evmarried == 1 & hisprace2 == 2,1,0),
nh_bl_marr = ifelse(evmarried == 1 & hisprace2 == 3,1,0),
marr_by_race = ifelse(evmarried == 1 & hisprace2 == 2,1,
ifelse(evmarried == 1 & hisprace2 == 3,2,0))) #1 is for NH whites marrying, 2 is for NH Blacks marrying, 0 is for no marriages whatsoever regardless of race)
nh_wh_probs<- data %>%
group_by(age_cat) %>%
dplyr::summarize(p=weighted.mean(nh_wh_marr,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(nh_wh_probs, aes(x = factor(age_cat), y = p, group = 1)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH White) by Age")
nh_bl_probs<- data %>%
group_by(age_cat) %>%
dplyr::summarize(p=weighted.mean(nh_bl_marr,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(nh_bl_probs, aes(x = age_cat, y = p, group =1 )) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH Black) by Age")
library(nnet)
compriskmodel <-multinom(formula = marr_by_race ~ age_cat, data = data)
## # weights: 24 (14 variable)
## initial value 3578.180224
## iter 10 value 2532.030674
## iter 20 value 2486.812645
## final value 2486.783619
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ age_cat, data = data)
##
## Coefficients:
## (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1 -2.311634 2.209127 2.356211 1.780331 1.143977 0.39628878
## 2 -3.428630 1.696617 1.816311 1.427148 1.008216 -0.04545012
## age_cat45-49
## 1 -1.011440
## 2 -1.280696
##
## Std. Errors:
## (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1 0.1413633 0.1593380 0.1598667 0.1706138 0.2083191 0.2834871
## 2 0.2394978 0.2727925 0.2729364 0.2928099 0.3541421 0.5614702
## age_cat45-49
## 1 0.5281609
## 2 1.0325556
##
## Residual Deviance: 4973.567
## AIC: 5001.567
data$new_weight <- data$wgt2017_2019/mean(data$wgt2017_2019)
data$new_weight = data$wgt2017_2019/mean(data$wgt2017_2019) #creating new weight
anymodel <- glm(evmarried ~ age_cat, new_weight, family = "binomial", data = data)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(anymodel)
##
## Call:
## glm(formula = evmarried ~ age_cat, family = "binomial", data = data,
## weights = new_weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3703 -0.8313 -0.4013 0.8676 4.4941
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6079 0.1171 -13.731 < 2e-16 ***
## age_cat20-24 1.8421 0.1347 13.671 < 2e-16 ***
## age_cat25-29 2.1603 0.1363 15.846 < 2e-16 ***
## age_cat30-34 1.6993 0.1474 11.527 < 2e-16 ***
## age_cat35-39 1.0591 0.1783 5.940 2.85e-09 ***
## age_cat40-44 0.6287 0.2421 2.597 0.00941 **
## age_cat45-49 -1.5946 0.5728 -2.784 0.00537 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4505.8 on 3256 degrees of freedom
## Residual deviance: 4051.8 on 3250 degrees of freedom
## AIC: 3995.5
##
## Number of Fisher Scoring iterations: 5
library(gbm)
## Loaded gbm 2.1.8.1
compriskmodel <-multinom(formula = marr_by_race ~ factor(evmarried), data = data)
## # weights: 9 (4 variable)
## initial value 3578.180224
## iter 10 value 591.840457
## final value 577.578838
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ factor(evmarried), data = data)
##
## Coefficients:
## (Intercept) factor(evmarried)1
## 1 -12.70518 24.57202
## 2 -11.03884 21.35572
##
## Std. Errors:
## (Intercept) factor(evmarried)1
## 1 12.79633 17.39152
## 2 5.56221 13.02539
##
## Residual Deviance: 1155.158
## AIC: 1163.158
anymodel <- glm(evmarried ~ factor(ageevent), weights=new_weight, family = "binomial", data = data)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(anymodel)
##
## Call:
## glm(formula = evmarried ~ factor(ageevent), family = "binomial",
## data = data, weights = new_weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6818 -0.8337 -0.2389 0.8488 3.7547
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.63186 261.51882 -0.064 0.949
## factor(ageevent)16 11.76824 261.52258 0.045 0.964
## factor(ageevent)17 13.27880 261.51943 0.051 0.960
## factor(ageevent)18 15.71972 261.51888 0.060 0.952
## factor(ageevent)19 15.92562 261.51888 0.061 0.951
## factor(ageevent)20 16.45440 261.51886 0.063 0.950
## factor(ageevent)21 16.60451 261.51885 0.063 0.949
## factor(ageevent)22 16.82407 261.51886 0.064 0.949
## factor(ageevent)23 17.22629 261.51886 0.066 0.947
## factor(ageevent)24 17.25150 261.51886 0.066 0.947
## factor(ageevent)25 17.15481 261.51886 0.066 0.948
## factor(ageevent)26 17.49893 261.51887 0.067 0.947
## factor(ageevent)27 17.47596 261.51886 0.067 0.947
## factor(ageevent)28 16.65639 261.51886 0.064 0.949
## factor(ageevent)29 17.09184 261.51887 0.065 0.948
## factor(ageevent)30 17.05200 261.51888 0.065 0.948
## factor(ageevent)31 17.07489 261.51890 0.065 0.948
## factor(ageevent)32 16.52822 261.51888 0.063 0.950
## factor(ageevent)33 16.15155 261.51890 0.062 0.951
## factor(ageevent)34 16.75322 261.51895 0.064 0.949
## factor(ageevent)35 16.06973 261.51897 0.061 0.951
## factor(ageevent)36 16.42463 261.51892 0.063 0.950
## factor(ageevent)37 16.15067 261.51902 0.062 0.951
## factor(ageevent)38 15.94210 261.51902 0.061 0.951
## factor(ageevent)39 15.13919 261.51926 0.058 0.954
## factor(ageevent)40 16.13955 261.51905 0.062 0.951
## factor(ageevent)41 15.59706 261.51923 0.060 0.952
## factor(ageevent)42 14.64536 261.51968 0.056 0.955
## factor(ageevent)43 15.98953 261.51918 0.061 0.951
## factor(ageevent)44 14.32893 261.52106 0.055 0.956
## factor(ageevent)45 14.91363 261.51961 0.057 0.955
## factor(ageevent)46 13.34649 261.52292 0.051 0.959
## factor(ageevent)47 0.05916 588.53811 0.000 1.000
## factor(ageevent)48 0.03428 662.14810 0.000 1.000
## factor(ageevent)49 0.10625 647.05364 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4505.8 on 3256 degrees of freedom
## Residual deviance: 3869.1 on 3222 degrees of freedom
## AIC: 3865.4
##
## Number of Fisher Scoring iterations: 15
as.data.frame(fprobs)
## age_cat p n
## 1 15-19 0.16687517 628
## 2 20-24 0.55827326 811
## 3 25-29 0.63468717 788
## 4 30-34 0.52282340 510
## 5 35-39 0.36613989 252
## 6 40-44 0.27303553 152
## 7 45-49 0.03906987 116
#Total population hisprace2( Black)
#Total Population hisprace2(white)
#Group evmarried and hisprace2,
#babs_event is they have bachelor's degree or not
pydata %>%
group_by(hisprace2, babs_event) %>% # 3 is black
summarise(Count = n())
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: hisprace2 [2]
## hisprace2 babs_event Count
## <dbl> <dbl> <int>
## 1 2 0 25654
## 2 2 1 4656
## 3 3 0 10417
## 4 3 1 865
# To get those who experienced the event
pydata %>%
group_by(hisprace2, babs_event) %>%
summarise(Count = sum(evmarried == 1))
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: hisprace2 [2]
## hisprace2 babs_event Count
## <dbl> <dbl> <int>
## 1 2 0 682
## 2 2 1 345
## 3 3 0 180
## 4 3 1 38