library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(survival)
NSFGM1719 <- read_csv("C:/Users/jacob/Downloads/2017_2019_Male_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 5206 Columns: 3009
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1939): caseid, rscrninf, rscrage, rscrhisp, rscrrace, age_a, age_r, age...
## lgl (1070): sedbclc3, sedbclc4, sedwhlc3, sedwhlc4, sedconlc4, p2currwife, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- NSFGM1719 %>% select(caseid, age_r, hisagem,havedeg,fmarital, evrmarry,agemarr,agemarrn,agemarrn2,agemarrn3,agemarrn4, expschl, hisprace2,rscrrace, fmarno, earnba_y, intvwyear,sest,secu, wgt2017_2019)
data0 <- data %>% filter(hisprace2 %in% c(2,3))
tabyl(data0$evrmarry, data0$hisprace2, show_na = TRUE)
## data0$evrmarry n percent
## 0 2012 0.6049308
## 1 1314 0.3950692
Of the 3,326 NH white and Black men, 2012 never married and 1,314 have experienced a first marriage. Two rows with ageevent = 98 were excluded as NAs.
data0$ageevent <- pmin(data0$hisagem,data0$agemarrn, na.rm=T)
data0$ageevent <- ifelse(is.na(data0$ageevent), data0$age_r,data0$ageevent)
data0 <- data0 %>% filter((!ageevent %in%(c(98,NA))))
print("Age Event")
## [1] "Age Event"
tabyl(data0$ageevent)
## data0$ageevent n percent
## 15 106 0.031889290
## 16 98 0.029482551
## 17 129 0.038808664
## 18 142 0.042719615
## 19 153 0.046028881
## 20 126 0.037906137
## 21 174 0.052346570
## 22 162 0.048736462
## 23 165 0.049638989
## 24 186 0.055956679
## 25 181 0.054452467
## 26 154 0.046329723
## 27 165 0.049638989
## 28 153 0.046028881
## 29 141 0.042418773
## 30 132 0.039711191
## 31 106 0.031889290
## 32 107 0.032190132
## 33 108 0.032490975
## 34 70 0.021058965
## 35 66 0.019855596
## 36 64 0.019253911
## 37 50 0.015042118
## 38 47 0.014139591
## 39 39 0.011732852
## 40 44 0.013237064
## 41 37 0.011131167
## 42 38 0.011432010
## 43 24 0.007220217
## 44 22 0.006618532
## 45 28 0.008423586
## 46 23 0.006919374
## 47 31 0.009326113
## 48 27 0.008122744
## 49 26 0.007821901
hist(data0$ageevent)
print("Ever Married")
## [1] "Ever Married"
tabyl(data0$evrmarry)
## data0$evrmarry n percent
## 0 2012 0.6052948
## 1 1312 0.3947052
The histogram above shows that all values of age are valid (15-49 years).
tabyl(data0,ageevent, evrmarry)
## ageevent 0 1
## 15 106 0
## 16 97 1
## 17 122 7
## 18 115 27
## 19 115 38
## 20 70 56
## 21 81 93
## 22 77 85
## 23 77 88
## 24 85 101
## 25 69 112
## 26 67 87
## 27 71 94
## 28 84 69
## 29 60 81
## 30 71 61
## 31 59 47
## 32 62 45
## 33 65 43
## 34 39 31
## 35 41 25
## 36 39 25
## 37 35 15
## 38 34 13
## 39 31 8
## 40 32 12
## 41 27 10
## 42 31 7
## 43 20 4
## 44 19 3
## 45 22 6
## 46 19 4
## 47 26 5
## 48 20 7
## 49 24 2
2. Create a time varying variable for whether respondent has a
bachelor’s degree for your person-years file. (Hint: create a variable
for age of degree and add to your person-years file. Then determine
whether respondents obtained the degree prior to the age of risk. The PY
file will not include and respondents with missing data.)
#How many years before interview year did the respondent earn a bachelor's degree?
data0$ageba <- data0$age_r - (data0$intvwyear - data0$earnba_y)
hist(data0$ageba)
tabyl(data0$ageba)
## data0$ageba n percent valid_percent
## 19 3 0.0009025271 0.003645200
## 20 7 0.0021058965 0.008505468
## 21 100 0.0300842359 0.121506683
## 22 258 0.0776173285 0.313487242
## 23 172 0.0517448857 0.208991495
## 24 88 0.0264741276 0.106925881
## 25 33 0.0099277978 0.040097205
## 26 32 0.0096269555 0.038882139
## 27 16 0.0048134777 0.019441069
## 28 17 0.0051143201 0.020656136
## 29 16 0.0048134777 0.019441069
## 30 14 0.0042117930 0.017010936
## 31 15 0.0045126354 0.018226002
## 32 12 0.0036101083 0.014580802
## 33 11 0.0033092659 0.013365735
## 34 4 0.0012033694 0.004860267
## 35 7 0.0021058965 0.008505468
## 36 2 0.0006016847 0.002430134
## 37 4 0.0012033694 0.004860267
## 39 1 0.0003008424 0.001215067
## 40 5 0.0015042118 0.006075334
## 44 3 0.0009025271 0.003645200
## 45 2 0.0006016847 0.002430134
## 46 1 0.0003008424 0.001215067
## NA 2501 0.7524067389 NA
summary(data0$ageba)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 19.00 22.00 23.00 24.07 24.00 46.00 2501
The histogram shows that the median age to earn a bachelor’s degree was 23 years. This is valid and an expected result.
Marriage | BA/BS | Configuration | Pseudocode |
---|---|---|---|
Never Married | No BA/BS | 1 | evrmarry = 0; NA |
Never Married | BA/BS | 2 | evrmarry = 0; ageba |
Married | No BA/BS | 3 | evrmarry = 0; NA |
Married | Married, BA/BS | 4 | evrmarry = 1; ageevent < ageba |
Married | BA/BS, Married | 5 | evrmarry = 1; ageevent > ageba |
Married | Married = BA/BS | 6 | evrmarry = 1; ageevent == ageba |
#degmar assigns configurations
data0 <- data0 %>%
mutate(degmar = case_when(( evrmarry == 0) & (is.na(ageba)) ~ 1,
(evrmarry == 0) & (!is.na(ageba)) ~ 2,
(evrmarry == 1) & (is.na(ageba)) ~ 3,
(evrmarry == 1) & (!is.na(ageba)) & (ageevent < ageba) ~ 4,
(evrmarry == 1) & (!is.na(ageba)) & (ageevent > ageba) ~ 5,
(evrmarry == 1) & (!is.na(ageba)) & (ageevent == ageba) ~ 6))
pydata<-survSplit(data0, cut=c(15:49),end="ageevent",event="evrmarry")
pydata$babs <- is.na(pydata$ageba)
degmar | Description | Temporal Order | Event Value |
---|---|---|---|
1 | Unmarried, No BA/BS | N/A | 0 |
2 | Unmarried, BA/BS | ageevent(age_r) >= ageba | 1 |
2 | Unmarried, BA/BS | ageevent(age_r) < ageba | 0 |
3 | Married, No BA/BS | Married, N/A | 0 |
4 | Married, BA/BS | Married, BA/BS | 0 |
5 | Married, BA/BS | BA/BS, Married | 1 |
5 | Married, BA/BS | Married, BA/BS | 0 |
6 | Married, BA/BS | ageevent >= ageba | 1 |
6 | Married, BA/BS | ageevent < ageba | 0 |
pydata$tvba <- case_when(pydata$degmar == 1 ~ 0,
pydata$degmar == 3 ~ 0,
pydata$degmar == 4 ~ 0,
pydata$degmar == 2 & (pydata$ageevent >= pydata$ageba) ~ 1,
pydata$degmar == 2 & (pydata$ageevent < pydata$ageba) ~ 0,
pydata$degmar == 5 & (pydata$ageevent >= pydata$ageba) ~ 1,
pydata$degmar == 5 & (pydata$ageevent < pydata$ageba) ~ 0,
pydata$degmar == 6 & (pydata$ageevent >= pydata$ageba) ~ 1,
pydata$degmar == 6 & (pydata$ageevent < pydata$ageba) ~ 0)
raceprobs<- pydata %>%
dplyr::group_by(ageevent, hisprace2) %>%
dplyr::summarize(p=weighted.mean(evrmarry,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
raceprobs$num <- raceprobs$p*(1-raceprobs$p)
raceprobs$sep <- sqrt(raceprobs$num/raceprobs$n)
raceprobs$me <- 2*raceprobs$sep
raceprobs$hisprace2 <- car::Recode(raceprobs$hisprace2, recodes="2='White Men'; 3='Black Men'; else=NA",
as.factor=T)
ggplot(data =raceprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=hisprace2,color = hisprace2, group = hisprace2)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Race",
subtitle="Men Ages 15 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate")
educprobs<- pydata %>%
group_by(ageevent, tvba) %>%
dplyr::summarize(p=weighted.mean(evrmarry,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs<- pydata %>%
group_by(ageevent, tvba) %>%
dplyr::summarize(p=weighted.mean(evrmarry,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)
educprobs$sep <- sqrt(educprobs$num/educprobs$n)
educprobs$me <- 2*educprobs$sep
educprobs$babs <- case_when(educprobs$tvba == 1 ~ "BA/BS Degree",educprobs$tvba == 0 ~ "No College Degree")
ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=tvba,color = tvba, group = tvba)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education (Time-Varying Education )",
subtitle="Men Ages 15 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate")
educprobs$babs <- case_when(educprobs$babs == FALSE ~ "BA/BS Degree",educprobs$babs == TRUE ~ "No College Degree")
educprobs<- pydata %>%
group_by(ageevent, babs) %>%
dplyr::summarize(p=weighted.mean(evrmarry,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)
educprobs$sep <- sqrt(educprobs$num/educprobs$n)
educprobs$me <- 2*educprobs$sep
ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill= babs,color = babs, group = babs)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education",
subtitle="Men Ages 15 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate")
raceeducprobs<- pydata %>%
group_by(ageevent, hisprace2, tvba) %>%
dplyr::summarize(p=weighted.mean(evrmarry,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)
raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)
raceeducprobs$me <- 2*raceeducprobs$sep
raceeducprobs$tvba <- case_when(raceeducprobs$tvba == 1 ~ "BA/BS Degree",raceeducprobs$tvba == 0 ~ "No College Degree")
raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")
ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=tvba,color = tvba, group = tvba)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education (Time Varying)",
subtitle="Men Ages 15 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)
pydata$age_cat <-
car::Recode(pydata$ageevent, recodes =
"15:19 = '15-19';
20:24 = '20-24';
25:29 = '25-29';
30:34 = '30-34';
35:39 = '35-39';
40:44 = '40-44';
45:49 = '45-49';
else=NA", as.factor = T)
#weight to the PY file
pydata$new_weights <- pydata$wgt2017_2019/mean(pydata$wgt2017_2019)
m1 <- glm(evrmarry~ age_cat + factor(hisprace2), family = "binomial", weights=new_weights, data = pydata)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m1)
##
## Call:
## glm(formula = evrmarry ~ age_cat + factor(hisprace2), family = "binomial",
## data = pydata, weights = new_weights)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.11189 0.10574 -48.342 < 2e-16 ***
## age_cat20-24 2.05655 0.11439 17.978 < 2e-16 ***
## age_cat25-29 2.69092 0.11378 23.650 < 2e-16 ***
## age_cat30-34 2.64560 0.12236 21.622 < 2e-16 ***
## age_cat35-39 2.32895 0.14829 15.705 < 2e-16 ***
## age_cat40-44 2.32283 0.18734 12.399 < 2e-16 ***
## age_cat45-49 2.43728 0.26586 9.168 < 2e-16 ***
## factor(hisprace2)3 -0.47295 0.07538 -6.274 3.51e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13986 on 43234 degrees of freedom
## Residual deviance: 12857 on 43227 degrees of freedom
## AIC: 12873
##
## Number of Fisher Scoring iterations: 7
m2 <- glm(evrmarry~ age_cat + factor(hisprace2)+ tvba, family = "binomial", weights=new_weights, data = pydata)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m2)
##
## Call:
## glm(formula = evrmarry ~ age_cat + factor(hisprace2) + tvba,
## family = "binomial", data = pydata, weights = new_weights)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.12261 0.10581 -48.413 < 2e-16 ***
## age_cat20-24 1.96580 0.11504 17.088 < 2e-16 ***
## age_cat25-29 2.49324 0.11649 21.404 < 2e-16 ***
## age_cat30-34 2.45016 0.12486 19.623 < 2e-16 ***
## age_cat35-39 2.14940 0.15014 14.316 < 2e-16 ***
## age_cat40-44 2.15017 0.18882 11.387 < 2e-16 ***
## age_cat45-49 2.29927 0.26677 8.619 < 2e-16 ***
## factor(hisprace2)3 -0.38689 0.07613 -5.082 3.73e-07 ***
## tvba 0.50952 0.05680 8.971 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13986 on 43234 degrees of freedom
## Residual deviance: 12780 on 43226 degrees of freedom
## AIC: 12788
##
## Number of Fisher Scoring iterations: 7
subsets <- split.data.frame(pydata,f = pydata$hisprace2)
White Subset
whitepy <- subsets$`2`
blackpy <- subsets$`3`
M_white <- glm(evrmarry ~ age_cat + tvba, family = "binomial", weights =new_weights, data = whitepy)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(M_white)
##
## Call:
## glm(formula = evrmarry ~ age_cat + tvba, family = "binomial",
## data = whitepy, weights = new_weights)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.11690 0.11241 -45.519 <2e-16 ***
## age_cat20-24 1.95537 0.12286 15.915 <2e-16 ***
## age_cat25-29 2.50505 0.12447 20.125 <2e-16 ***
## age_cat30-34 2.42605 0.13409 18.092 <2e-16 ***
## age_cat35-39 2.11231 0.16332 12.933 <2e-16 ***
## age_cat40-44 2.11281 0.20883 10.117 <2e-16 ***
## age_cat45-49 2.39058 0.28684 8.334 <2e-16 ***
## tvba 0.50833 0.05969 8.517 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12000 on 31465 degrees of freedom
## Residual deviance: 10962 on 31458 degrees of freedom
## AIC: 11093
##
## Number of Fisher Scoring iterations: 7
whitedata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),tvba=0:1)
whitedata$phat <- predict(M_white, whitedata, type="response")
whitedata
## age_cat tvba phat
## 1 15-19 0 0.005958871
## 2 20-24 0 0.040639299
## 3 25-29 0 0.068379538
## 4 30-34 0 0.063515375
## 5 35-39 0 0.047219065
## 6 40-44 0 0.047241354
## 7 45-49 0 0.061438381
## 8 15-19 1 0.009867785
## 9 20-24 1 0.065792231
## 10 25-29 1 0.108755420
## 11 30-34 1 0.101331526
## 12 35-39 1 0.076121291
## 13 40-44 1 0.076156132
## 14 45-49 1 0.098147521
M_black <- glm(evrmarry ~ age_cat + tvba, family = "binomial", weights =new_weights, data = blackpy)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(M_black)
##
## Call:
## glm(formula = evrmarry ~ age_cat + tvba, family = "binomial",
## data = blackpy, weights = new_weights)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.5500 0.3029 -18.326 < 2e-16 ***
## age_cat20-24 2.0402 0.3282 6.215 5.12e-10 ***
## age_cat25-29 2.4051 0.3320 7.245 4.34e-13 ***
## age_cat30-34 2.5979 0.3447 7.536 4.83e-14 ***
## age_cat35-39 2.3407 0.3900 6.001 1.96e-09 ***
## age_cat40-44 2.3190 0.4564 5.081 3.76e-07 ***
## age_cat45-49 1.8589 0.7374 2.521 0.01171 *
## tvba 0.5215 0.1873 2.784 0.00537 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1949.8 on 11768 degrees of freedom
## Residual deviance: 1814.6 on 11761 degrees of freedom
## AIC: 1705.5
##
## Number of Fisher Scoring iterations: 8
blackdata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),tvba=0:1)
blackdata$phat <- predict(M_black, blackdata, type="response")
blackdata
## age_cat tvba phat
## 1 15-19 0 0.003872429
## 2 20-24 0 0.029034043
## 3 25-29 0 0.041293898
## 4 30-34 0 0.049638656
## 5 35-39 0 0.038818233
## 6 40-44 0 0.038016981
## 7 45-49 0 0.024337081
## 8 15-19 1 0.006506094
## 9 20-24 1 0.047956450
## 10 25-29 1 0.067649747
## 11 30-34 1 0.080871275
## 12 35-39 1 0.063698974
## 13 40-44 1 0.062417505
## 14 45-49 1 0.040325462
whitedata$tvba <- car::recode(whitedata$tvba, recodes = "0='BA/BS Degree'; 1='No College Degree'")
ggplot(whitedata, aes(x = age_cat, y = phat, group=tvba, fill=factor(tvba),color = factor(tvba))) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for White Men")
blackdata$tvba <- recode(blackdata$tvba, "0='BA/BS Degree'; 1='No College Degree'") #I cannot figure out why this label is wrong.
## Warning: Unreplaced values treated as NA as `.x` is not compatible.
## Please specify replacements exhaustively or supply `.default`.
ggplot(blackdata, aes(x = age_cat, y = phat, group=tvba, fill=tvba,color = tvba)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for Black Men")
Swapped Black and White Data Models
Black Data Black Means
blackdata2 <- blackpy %>%
group_by(age_cat) %>%
dplyr::summarize(tvba=weighted.mean(tvba,wgt2017_2019,na.rm = TRUE),n=n())
blackdata2$phat <- predict(M_black, blackdata2, type="response")
as.data.frame(blackdata2)
## age_cat tvba n phat
## 1 15-19 0.00000000 4128 0.003872429
## 2 20-24 0.06481221 3119 0.030002218
## 3 25-29 0.16458418 2099 0.044828861
## 4 30-34 0.16345329 1196 0.053817810
## 5 35-39 0.13327413 703 0.041496252
## 6 40-44 0.14942486 374 0.040971736
## 7 45-49 0.13842906 150 0.026111409
White Data, White Means
whitedata2 <- whitepy %>%
group_by(age_cat) %>%
dplyr::summarize(tvba=weighted.mean(tvba,wgt2017_2019,na.rm = TRUE),n=n())
whitedata2$phat <- predict(M_white, whitedata2, type="response")
as.data.frame(whitedata2)
## age_cat tvba n phat
## 1 15-19 0.0004463016 11374 0.005960215
## 2 20-24 0.1582734527 8846 0.043894647
## 3 25-29 0.3644028867 5647 0.081165681
## 4 30-34 0.3600549451 3081 0.075311592
## 5 35-39 0.3297265057 1524 0.055358466
## 6 40-44 0.3114523922 739 0.054900385
## 7 45-49 0.2373721010 255 0.068775721
Black Model, White Data
blackmodel_whitedata <- whitepy %>%
group_by(age_cat) %>%
summarize(tvba=weighted.mean(tvba,wgt2017_2019,na.rm = TRUE),n=n())
blackmodel_whitedata$phat <- predict(M_black, blackmodel_whitedata, type="response")
as.data.frame(blackmodel_whitedata)
## age_cat tvba n phat
## 1 15-19 0.0004463016 11374 0.003873326
## 2 20-24 0.1582734527 8846 0.031453629
## 3 25-29 0.3644028867 5647 0.049508637
## 4 30-34 0.3600549451 3081 0.059283992
## 5 35-39 0.3297265057 1524 0.045768096
## 6 40-44 0.3114523922 739 0.044423767
## 7 45-49 0.2373721010 255 0.027456141
White Model, Black Data
whitemodel_blackdata <- blackpy %>%
group_by(age_cat) %>%
summarize(tvba=weighted.mean(tvba,wgt2017_2019,na.rm = TRUE),n=n())
whitemodel_blackdata$phat <- predict(M_white, whitemodel_blackdata, type="response")
as.data.frame(whitemodel_blackdata)
## age_cat tvba n phat
## 1 15-19 0.00000000 4128 0.005958871
## 2 20-24 0.06481221 3119 0.041943419
## 3 25-29 0.16458418 2099 0.073905567
## 4 30-34 0.16345329 1196 0.068640529
## 5 35-39 0.13327413 703 0.050362215
## 6 40-44 0.14942486 374 0.050780186
## 7 45-49 0.13842906 150 0.065623505
data1 <- data0 %>%
mutate(age_cat = case_when(ageevent %in% c(15:19) ~ "15-19",
ageevent %in% c(20:24) ~ "20-24",
ageevent %in% c(25:29) ~ "25-29",
ageevent %in% c(30:34) ~ "30-34",
ageevent %in% c(35:39) ~ "35-39",
ageevent %in% c(40:44) ~ "40-44",
ageevent %in% c(45:49) ~ "45-49"))
fprobs<- data1 %>%
group_by(age_cat) %>%
summarize(p=weighted.mean(evrmarry,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(fprobs, aes(x = age_cat, y = p)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate by Age")
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
data1 <- data1 %>%
mutate(nhw = ifelse(evrmarry == 1 & hisprace2 == 2,1,0),
nhb = ifelse(evrmarry == 1 & hisprace2 == 3,1,0),
marrace = ifelse(evrmarry == 1 & hisprace2 == 2,1,
ifelse(evrmarry == 1 & hisprace2 == 3,2,0)))
nh_wh_probs<- data1 %>%
group_by(age_cat) %>%
summarize(p=weighted.mean(nhw,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(nh_wh_probs, aes(x = age_cat, y = p)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH White) by Age")
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
library(nnet)
competingrisk <-multinom(formula = marrace ~ age_cat, data = data1)
## # weights: 24 (14 variable)
## initial value 3651.787248
## iter 10 value 2661.052265
## iter 20 value 2626.131547
## final value 2626.131414
## converged
summary(competingrisk)
## Call:
## multinom(formula = marrace ~ age_cat, data = data1)
##
## Coefficients:
## (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1 -2.311649 2.209136 2.372443 1.830777 1.308346 0.7476821
## 2 -3.428672 1.725213 1.816385 1.522515 1.231482 0.7660265
## age_cat45-49
## 1 0.4925283
## 2 0.5108623
##
## Std. Errors:
## (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1 0.1413641 0.1593387 0.1597303 0.1697842 0.2017200 0.2545042
## 2 0.2395023 0.2720475 0.2729396 0.2889061 0.3360287 0.4197982
## age_cat45-49
## 1 0.2907688
## 2 0.4827463
##
## Residual Deviance: 5252.263
## AIC: 5280.263
data1$new_weight <- data1$wgt2017_2019/mean(data1$wgt2017_2019)
data1$new_weight = data1$wgt2017_2019/mean(data1$wgt2017_2019) #creating new weight
anymodel <- glm(evrmarry ~ age_cat, new_weight, family = "binomial", data = data1)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(anymodel)
##
## Call:
## glm(formula = evrmarry ~ age_cat, family = "binomial", data = data1,
## weights = new_weight)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6079 0.1168 -13.767 < 2e-16 ***
## age_cat20-24 1.8426 0.1344 13.711 < 2e-16 ***
## age_cat25-29 2.1742 0.1359 16.000 < 2e-16 ***
## age_cat30-34 1.7335 0.1466 11.824 < 2e-16 ***
## age_cat35-39 1.1342 0.1755 6.463 1.03e-10 ***
## age_cat40-44 0.9593 0.2217 4.326 1.52e-05 ***
## age_cat45-49 0.0585 0.2889 0.202 0.84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4603.3 on 3323 degrees of freedom
## Residual deviance: 4197.5 on 3317 degrees of freedom
## AIC: 4117.5
##
## Number of Fisher Scoring iterations: 4
compriskmodel <-multinom(formula = marrace ~ factor(ageevent), data = data1)
## # weights: 108 (70 variable)
## initial value 3651.787248
## iter 10 value 2605.635025
## iter 20 value 2565.049784
## iter 30 value 2559.155327
## iter 40 value 2555.642910
## iter 50 value 2554.767771
## iter 60 value 2554.693892
## iter 70 value 2554.688346
## final value 2554.687266
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marrace ~ factor(ageevent), data = data1)
##
## Coefficients:
## (Intercept) factor(ageevent)16 factor(ageevent)17 factor(ageevent)18
## 1 -17.07808 -3.199853 13.88353 15.37769
## 2 -15.47024 10.895499 11.35930 12.51719
## factor(ageevent)19 factor(ageevent)20 factor(ageevent)21 factor(ageevent)22
## 1 15.70045 16.59077 17.05307 17.09099
## 2 12.92255 13.78670 13.71485 13.07255
## factor(ageevent)23 factor(ageevent)24 factor(ageevent)25 factor(ageevent)26
## 1 17.07807 16.95291 17.33261 17.25537
## 2 13.52434 14.28568 14.37164 13.21126
## factor(ageevent)27 factor(ageevent)28 factor(ageevent)29 factor(ageevent)30
## 1 17.19742 16.72480 17.15812 16.59959
## 2 13.84662 13.34202 14.14849 14.04078
## factor(ageevent)31 factor(ageevent)32 factor(ageevent)33 factor(ageevent)34
## 1 16.68941 16.61450 16.43005 16.67259
## 2 13.33862 13.13489 13.49310 13.41602
## factor(ageevent)35 factor(ageevent)36 factor(ageevent)37 factor(ageevent)38
## 1 16.36020 16.45902 15.72003 15.94954
## 2 13.36596 13.19298 13.70687 12.63660
## factor(ageevent)39 factor(ageevent)40 factor(ageevent)41 factor(ageevent)42
## 1 15.25375 15.80951 15.86173 15.03078
## 2 13.13467 13.10296 12.86762 13.13465
## factor(ageevent)43 factor(ageevent)44 factor(ageevent)45 factor(ageevent)46
## 1 15.18118 15.23231 15.37334 15.52036
## 2 12.47448 -13.44373 13.07265 -13.36482
## factor(ageevent)47 factor(ageevent)48 factor(ageevent)49
## 1 14.91882 15.69196 14.59285
## 2 12.90563 13.16790 -14.15332
##
## Std. Errors:
## (Intercept) factor(ageevent)16 factor(ageevent)17 factor(ageevent)18
## 1 0.06290520 2.570847e-09 0.4471029 0.2386604
## 2 0.09426982 9.777926e-01 0.6966723 0.4162577
## factor(ageevent)19 factor(ageevent)20 factor(ageevent)21 factor(ageevent)22
## 1 0.2111761 0.1982181 0.1658016 0.1680676
## 2 0.3481442 0.3072368 0.2956765 0.3936561
## factor(ageevent)23 factor(ageevent)24 factor(ageevent)25 factor(ageevent)26
## 1 0.1685326 0.1660685 0.1678462 0.1725366
## 2 0.3260180 0.2365880 0.2514639 0.3960176
## factor(ageevent)27 factor(ageevent)28 factor(ageevent)29 factor(ageevent)30
## 1 0.1702286 0.1763908 0.1847241 0.1964777
## 2 0.2984206 0.3373370 0.2882797 0.2779168
## factor(ageevent)31 factor(ageevent)32 factor(ageevent)33 factor(ageevent)34
## 1 0.2084196 0.2080148 0.2147521 0.2535524
## 2 0.3983758 0.4245638 0.3570352 0.4695095
## factor(ageevent)35 factor(ageevent)36 factor(ageevent)37 factor(ageevent)38
## 1 0.2719772 0.2700150 0.3679955 0.3423481
## 2 0.4682692 0.5170071 0.4380586 0.7109109
## factor(ageevent)39 factor(ageevent)40 factor(ageevent)41 factor(ageevent)42
## 1 0.4717142 0.3714124 0.3955491 0.5191585
## 2 0.5930381 0.5922199 0.7157758 0.5930460
## factor(ageevent)43 factor(ageevent)44 factor(ageevent)45 factor(ageevent)46
## 1 0.6038875 6.059717e-01 0.5310699 5.373144e-01
## 2 0.9966525 2.589956e-13 0.7211936 2.678039e-13
## factor(ageevent)47 factor(ageevent)48 factor(ageevent)49
## 1 0.5948324 0.4891119 7.168684e-01
## 2 0.7166302 0.7241695 1.571826e-13
##
## Residual Deviance: 5109.375
## AIC: 5249.375
anymodel <- glm(evrmarry ~ factor(ageevent), weights=new_weight, family = "binomial", data = data1)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(anymodel)
##
## Call:
## glm(formula = evrmarry ~ factor(ageevent), family = "binomial",
## data = data1, weights = new_weight)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.63 261.05 -0.064 0.949
## factor(ageevent)16 11.77 261.05 0.045 0.964
## factor(ageevent)17 13.28 261.05 0.051 0.959
## factor(ageevent)18 15.72 261.05 0.060 0.952
## factor(ageevent)19 15.93 261.05 0.061 0.951
## factor(ageevent)20 16.46 261.05 0.063 0.950
## factor(ageevent)21 16.61 261.05 0.064 0.949
## factor(ageevent)22 16.83 261.05 0.064 0.949
## factor(ageevent)23 17.23 261.05 0.066 0.947
## factor(ageevent)24 17.25 261.05 0.066 0.947
## factor(ageevent)25 17.16 261.05 0.066 0.948
## factor(ageevent)26 17.51 261.05 0.067 0.947
## factor(ageevent)27 17.48 261.05 0.067 0.947
## factor(ageevent)28 16.67 261.05 0.064 0.949
## factor(ageevent)29 17.15 261.05 0.066 0.948
## factor(ageevent)30 17.08 261.05 0.065 0.948
## factor(ageevent)31 17.10 261.05 0.066 0.948
## factor(ageevent)32 16.58 261.05 0.064 0.949
## factor(ageevent)33 16.20 261.05 0.062 0.951
## factor(ageevent)34 16.78 261.05 0.064 0.949
## factor(ageevent)35 16.19 261.05 0.062 0.951
## factor(ageevent)36 16.48 261.05 0.063 0.950
## factor(ageevent)37 16.21 261.05 0.062 0.950
## factor(ageevent)38 16.00 261.05 0.061 0.951
## factor(ageevent)39 15.31 261.05 0.059 0.953
## factor(ageevent)40 16.26 261.05 0.062 0.950
## factor(ageevent)41 16.08 261.05 0.062 0.951
## factor(ageevent)42 15.71 261.05 0.060 0.952
## factor(ageevent)43 16.03 261.05 0.061 0.951
## factor(ageevent)44 15.24 261.05 0.058 0.953
## factor(ageevent)45 15.13 261.05 0.058 0.954
## factor(ageevent)46 14.72 261.05 0.056 0.955
## factor(ageevent)47 15.36 261.05 0.059 0.953
## factor(ageevent)48 15.30 261.05 0.059 0.953
## factor(ageevent)49 14.44 261.05 0.055 0.956
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4603.3 on 3323 degrees of freedom
## Residual deviance: 4027.1 on 3289 degrees of freedom
## AIC: 4000.1
##
## Number of Fisher Scoring iterations: 15
as.data.frame(fprobs)
## age_cat p n
## 1 15-19 0.1668752 628
## 2 20-24 0.5584040 813
## 3 25-29 0.6378989 794
## 4 30-34 0.5313386 523
## 5 35-39 0.3837450 266
## 6 40-44 0.3432949 165
## 7 45-49 0.1751671 135