# theme for nice plotting
theme_nice <- theme_classic()+
theme(
axis.line.y.left = element_line(colour = "black"),
axis.line.y.right = element_line(colour = "black"),
axis.line.x.bottom = element_line(colour = "black"),
axis.line.x.top = element_line(colour = "black"),
axis.text.y = element_text(colour = "black", size = 12),
axis.text.x = element_text(color = "black", size = 12),
axis.ticks = element_line(color = "black")) +
theme(
axis.ticks.length = unit(-0.25, "cm"),
axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2426326 129.6 4148825 221.6 NA 4148825 221.6
## Vcells 4098232 31.3 8388608 64.0 16384 6257752 47.8
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
data<- read_csv("2017_2019_Male_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 5206 Columns: 3009
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1939): caseid, rscrninf, rscrage, rscrhisp, rscrrace, age_a, age_r, age...
## lgl (1070): sedbclc3, sedbclc4, sedwhlc3, sedwhlc4, sedconlc4, p2currwife, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Create a new data frame 'data' with selected columns
data <- data[, c("caseid","age_r", "fmarital", "agemarr", "agemarrn4", "agemarrn2", "agemarrn3", "hisagem", "evrmarry", "agemarrn", "hisprace2", "fmarno", "nformwife", "hieduc", "wgt2017_2019", "earnba_y", "intvwyear", "sest", "secu")]
data <- data %>%
mutate(evmarried = ifelse(fmarno == 1 & fmarital == 1, 1,
ifelse(fmarno == 0 & fmarital == 5, 0,
ifelse(fmarno %in% c(1:5), 1, NA))))
data %>%
group_by(evmarried) %>%
dplyr::summarize(n=n()) %>%
dplyr::mutate(pct = n / sum(n))
## # A tibble: 2 × 3
## evmarried n pct
## <dbl> <int> <dbl>
## 1 0 3245 0.623
## 2 1 1961 0.377
#ever married age event
data$ageevent<- pmin(data$hisagem,data$agemarrn, na.rm=T)
#never married age
data$ageevent <- ifelse(data$fmarital == 5, data$age_r, data$ageevent)
# filter for NA values
data <- data %>%
filter(ageevent !=98 & !is.na(ageevent))
tabyl(data$ageevent)
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## ℹ The deprecated feature was likely used in the janitor package.
## Please report the issue at <]8;;https://github.com/sfirke/janitor/issueshttps://github.com/sfirke/janitor/issues]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## data$ageevent n percent
## 15 186 0.036399217
## 16 192 0.037573386
## 17 234 0.045792564
## 18 271 0.053033268
## 19 276 0.054011742
## 20 199 0.038943249
## 21 264 0.051663405
## 22 248 0.048532290
## 23 251 0.049119374
## 24 267 0.052250489
## 25 284 0.055577299
## 26 240 0.046966732
## 27 234 0.045792564
## 28 239 0.046771037
## 29 206 0.040313112
## 30 197 0.038551859
## 31 165 0.032289628
## 32 162 0.031702544
## 33 146 0.028571429
## 34 108 0.021135029
## 35 92 0.018003914
## 36 82 0.016046967
## 37 73 0.014285714
## 38 64 0.012524462
## 39 57 0.011154599
## 40 58 0.011350294
## 41 44 0.008610568
## 42 50 0.009784736
## 43 32 0.006262231
## 44 31 0.006066536
## 45 30 0.005870841
## 46 35 0.006849315
## 47 33 0.006457926
## 48 29 0.005675147
## 49 31 0.006066536
data <- data %>%
filter(hisprace2 == 2 | hisprace2 == 3)
data <- data %>%
mutate(
bach_degree = ifelse(hieduc %in% c(5:11), 0,
ifelse(hieduc %in% c(12:15), 1, NA)),
birth_year = intvwyear - age_r,
age_bach_degree = earnba_y - birth_year)
data %>%
group_by(evmarried, ageevent, age_bach_degree) %>%
dplyr::summarize(n=n()) %>%
mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent'. You can override
## using the `.groups` argument.
## # A tibble: 379 × 5
## # Groups: evmarried, ageevent [66]
## evmarried ageevent age_bach_degree n pct
## <dbl> <dbl> <dbl> <int> <dbl>
## 1 0 15 NA 106 1
## 2 0 16 NA 97 1
## 3 0 17 NA 122 1
## 4 0 18 NA 115 1
## 5 0 19 NA 115 1
## 6 0 20 NA 70 1
## 7 0 21 NA 81 1
## 8 0 22 21 4 0.0519
## 9 0 22 22 3 0.0390
## 10 0 22 NA 70 0.909
## # ℹ 369 more rows
library(survey)
## Loading required package: grid
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
des<-svydesign(ids = ~secu,
strata = ~sest,
weights=~wgt2017_2019,
data=data,
nest=T)
library(survey)
library(dplyr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
bach_age_race <- svyby(~bach_degree, by = ~hisprace2 + age_r, design = des, FUN = svymean) %>%
mutate(Race = ifelse(hisprace2 == 3, "% Black with bach", "& White with bach")) %>%
select(age_r, Race, Percent_with_college_degree = bach_degree) %>%
spread(Race, Percent_with_college_degree)
# Print the result
print(bach_age_race)
## age_r & White with bach % Black with bach
## 1 15 0.000000000 0.000000000
## 2 16 0.000000000 0.000000000
## 3 17 0.000000000 0.000000000
## 4 18 0.000000000 0.000000000
## 5 19 0.000000000 0.000000000
## 6 20 0.000000000 0.000000000
## 7 21 0.008250833 0.000000000
## 8 22 0.114112456 0.007067989
## 9 23 0.301123729 0.153831478
## 10 24 0.394126772 0.271459435
## 11 25 0.360019297 0.256943867
## 12 26 0.458598748 0.101478745
## 13 27 0.385167571 0.006186328
## 14 28 0.379778158 0.143184877
## 15 29 0.328598010 0.209892625
## 16 30 0.349032816 0.112009745
## 17 31 0.469542786 0.148690892
## 18 32 0.407440425 0.092208286
## 19 33 0.341284328 0.330142638
## 20 34 0.387126825 0.068720672
## 21 35 0.366626178 0.307217765
## 22 36 0.479001848 0.073041272
## 23 37 0.426800075 0.200632472
## 24 38 0.489537256 0.460298863
## 25 39 0.283914810 0.171186043
## 26 40 0.599124196 0.256314847
## 27 41 0.556878984 0.098169769
## 28 42 0.413063878 0.231067351
## 29 43 0.297915871 0.557450719
## 30 44 0.406574667 0.213307189
## 31 45 0.453818304 0.357997355
## 32 46 0.448505331 0.506315460
## 33 47 0.313390039 0.270052526
## 34 48 0.375238404 0.310944449
## 35 49 0.307178148 0.157532206
## 36 50 1.000000000 NA
likely events
data <- data %>%
mutate(marriage_babs = case_when((evmarried == 0) & (is.na(age_bach_degree)) ~ 1,
(evmarried == 0) & (!is.na(age_bach_degree)) ~ 2,
(evmarried == 1) & (is.na(age_bach_degree)) ~ 3,
(evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent < age_bach_degree) ~ 4,
(evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent > age_bach_degree) ~ 5,
(evmarried == 1) & (!is.na(age_bach_degree)) & (ageevent == age_bach_degree) ~ 6))
data %>%
group_by(evmarried, ageevent, age_bach_degree, marriage_babs) %>%
dplyr::summarize(n=n()) %>%
mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'evmarried', 'ageevent', 'age_bach_degree'.
## You can override using the `.groups` argument.
## # A tibble: 379 × 6
## # Groups: evmarried, ageevent, age_bach_degree [379]
## evmarried ageevent age_bach_degree marriage_babs n pct
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 0 15 NA 1 106 1
## 2 0 16 NA 1 97 1
## 3 0 17 NA 1 122 1
## 4 0 18 NA 1 115 1
## 5 0 19 NA 1 115 1
## 6 0 20 NA 1 70 1
## 7 0 21 NA 1 81 1
## 8 0 22 21 2 4 1
## 9 0 22 22 2 3 1
## 10 0 22 NA 1 70 1
## # ℹ 369 more rows
data %>%
group_by(marriage_babs) %>%
dplyr::summarize(n=n()) %>%
mutate(pct = n / sum(n))
## # A tibble: 6 × 3
## marriage_babs n pct
## <dbl> <int> <dbl>
## 1 1 1669 0.512
## 2 2 343 0.105
## 3 3 783 0.240
## 4 4 79 0.0243
## 5 5 346 0.106
## 6 6 37 0.0114
library(ggsurvfit)
##
## Attaching package: 'ggsurvfit'
## The following object is masked from 'package:psych':
##
## %+%
library(relsurv)
## Loading required package: date
##
## Attaching package: 'relsurv'
## The following object is masked from 'package:lubridate':
##
## years
pydata<-survSplit(data, cut=c(15:49),end="ageevent",event="evmarried")
pydata$babs_event <- case_when(pydata$marriage_babs == 1 ~ 0,
pydata$marriage_babs == 3 ~ 0,
pydata$marriage_babs == 4 ~ 0,
pydata$marriage_babs == 2 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
pydata$marriage_babs == 2 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
pydata$marriage_babs == 5 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
pydata$marriage_babs == 5 & (pydata$ageevent < pydata$age_bach_degree) ~ 0,
pydata$marriage_babs == 6 & (pydata$ageevent >= pydata$age_bach_degree) ~ 1,
pydata$marriage_babs == 6 & (pydata$ageevent < pydata$age_bach_degree) ~ 0)
head(pydata[, c("caseid", "ageevent", "age_bach_degree", "marriage_babs")], n=50)
## caseid ageevent age_bach_degree marriage_babs
## 1 80717 15 22 2
## 2 80717 16 22 2
## 3 80717 17 22 2
## 4 80717 18 22 2
## 5 80717 19 22 2
## 6 80717 20 22 2
## 7 80717 21 22 2
## 8 80717 22 22 2
## 9 80717 23 22 2
## 10 80717 24 22 2
## 11 80717 25 22 2
## 12 80717 26 22 2
## 13 80717 27 22 2
## 14 80717 28 22 2
## 15 80717 29 22 2
## 16 80717 30 22 2
## 17 80717 31 22 2
## 18 80721 15 NA 1
## 19 80721 16 NA 1
## 20 80721 17 NA 1
## 21 80724 15 NA 3
## 22 80724 16 NA 3
## 23 80724 17 NA 3
## 24 80724 18 NA 3
## 25 80732 15 22 2
## 26 80732 16 22 2
## 27 80732 17 22 2
## 28 80732 18 22 2
## 29 80732 19 22 2
## 30 80732 20 22 2
## 31 80732 21 22 2
## 32 80732 22 22 2
## 33 80732 23 22 2
## 34 80732 24 22 2
## 35 80732 25 22 2
## 36 80732 26 22 2
## 37 80732 27 22 2
## 38 80732 28 22 2
## 39 80732 29 22 2
## 40 80732 30 22 2
## 41 80732 31 22 2
## 42 80732 32 22 2
## 43 80732 33 22 2
## 44 80732 34 22 2
## 45 80732 35 22 2
## 46 80732 36 22 2
## 47 80732 37 22 2
## 48 80732 38 22 2
## 49 80732 39 22 2
## 50 80734 15 21 5
library(survey)
des2<-svydesign(ids = ~secu,
strata = ~sest,
weights=~wgt2017_2019,
data=pydata,
nest=T)
# Use svyby to calculate weighted means and sample sizes by ageevent and hisprace2
raceprobs <- svyby(~evmarried, by = ~ageevent + hisprace2, design = des2,
FUN = svymean, na.rm = TRUE)
raceprobs$me <- 2*raceprobs$se
raceprobs$hisprace2 <- car::Recode(raceprobs$hisprace2, recodes="2='White Men'; 3='Black Men'; else=NA",
as.factor=T)
ggplot(data =raceprobs, aes(x = ageevent, y = evmarried, ymin=evmarried-me, ymax=evmarried+me, fill=hisprace2,color = hisprace2, group = hisprace2)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Race",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate") +theme_nice
#Event by Education
educprobs<- pydata %>%
group_by(ageevent, babs_event) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)
educprobs$sep <- sqrt(educprobs$num/educprobs$n)
educprobs$me <- 2*educprobs$sep
educprobs$babs_event <- case_when(educprobs$babs_event == 1 ~ "BA/BS Degree",educprobs$babs_event == 0 ~ "No College Degree")
ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education (Time-Varying)",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate")+theme_nice
educprobs<- pydata %>%
group_by(ageevent, bach_degree) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent'. You can override using the
## `.groups` argument.
educprobs$num <- educprobs$p*(1-educprobs$p)
educprobs$sep <- sqrt(educprobs$num/educprobs$n)
educprobs$me <- 2*educprobs$sep
educprobs$babs <- case_when(educprobs$bach_degree == 1 ~ "BA/BS Degree",educprobs$bach_degree == 0 ~ "No College Degree")
ggplot(data =educprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs,color = babs, group = babs)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Age and Education",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Adjusted First Marriage Rate")+theme_nice
raceeducprobs<- pydata %>%
group_by(ageevent, hisprace2, babs_event) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)
raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)
raceeducprobs$me <- 2*raceeducprobs$sep
raceeducprobs$babs_event <- case_when(raceeducprobs$babs_event == 1 ~ "BA/BS Degree",raceeducprobs$babs_event == 0 ~ "No College Degree")
raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")
ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=babs_event,color = babs_event, group = babs_event)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education (Time Varying)",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice
raceeducprobs<- pydata %>%
group_by(ageevent, hisprace2, bach_degree) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
## `summarise()` has grouped output by 'ageevent', 'hisprace2'. You can override
## using the `.groups` argument.
raceeducprobs$num <- raceeducprobs$p*(1-raceeducprobs$p)
raceeducprobs$sep <- sqrt(raceeducprobs$num/raceeducprobs$n)
raceeducprobs$me <- 2*raceeducprobs$sep
raceeducprobs$bach_degree <- case_when(raceeducprobs$bach_degree == 1 ~ "BA/BS Degree",raceeducprobs$bach_degree == 0 ~ "No College Degree")
raceeducprobs$hisprace2 <- case_when(raceeducprobs$hisprace2 == 2 ~ "White Men",raceeducprobs$hisprace2 == 3 ~ "Black Men")
ggplot(data =raceeducprobs, aes(x = ageevent, y = p, ymin=p-me, ymax=p+me, fill=bach_degree,color = bach_degree, group = bach_degree)) +
geom_line() + geom_ribbon(alpha=0.3,aes(color=NULL)) +
labs(title="Adjusted First Marriage Rates by Race and Education",
subtitle="Men Ages 18 to 49",
caption="Source: 2017-2019 NSFG Male Respondent File") +
xlab(label="Age at First Marriage") +
ylab(label="Ajusted First Marriage Rate") + facet_grid(~hisprace2)+theme_nice
pydata$age_cat <- car::Recode(pydata$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)
pydata$new_weight <- pydata$wgt2017_2019/mean(pydata$wgt2017_2019) #standize the weight
pydata$black <- ifelse(pydata$hisprace2==3,1,0)
library(survey)
des2<-svydesign(ids = ~secu,
strata = ~sest,
weights=~wgt2017_2019,
data=pydata,
nest=T)
model_1 <- svyglm(evmarried~ age_cat + black, family = "binomial", design=des2)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_1)
##
## Call:
## svyglm(formula = evmarried ~ age_cat + black, design = des2,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019,
## data = pydata, nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.0954 0.1729 -29.471 < 2e-16 ***
## age_cat20-24 2.0615 0.1793 11.500 2.92e-15 ***
## age_cat25-29 2.6953 0.1954 13.795 < 2e-16 ***
## age_cat30-34 2.6475 0.2050 12.914 < 2e-16 ***
## age_cat35-39 2.3227 0.2333 9.958 3.67e-13 ***
## age_cat40-44 2.1012 0.3505 5.994 2.75e-07 ***
## age_cat45-49 0.9043 0.7467 1.211 0.232
## black -0.4791 0.1034 -4.634 2.87e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.001212)
##
## Number of Fisher Scoring iterations: 7
model_2 <- svyglm(evmarried ~ age_cat + black + babs_event, family = "binomial", design=des2)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(model_2)
##
## Call:
## svyglm(formula = evmarried ~ age_cat + black + babs_event, design = des2,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019,
## data = pydata, nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.10609 0.17357 -29.418 < 2e-16 ***
## age_cat20-24 1.96850 0.17849 11.028 1.65e-14 ***
## age_cat25-29 2.49191 0.18926 13.167 < 2e-16 ***
## age_cat30-34 2.44622 0.20010 12.225 4.72e-16 ***
## age_cat35-39 2.13813 0.23082 9.263 4.38e-12 ***
## age_cat40-44 1.92692 0.34570 5.574 1.25e-06 ***
## age_cat45-49 0.76649 0.74717 1.026 0.310321
## black -0.39209 0.10706 -3.662 0.000643 ***
## babs_event 0.51762 0.08425 6.144 1.76e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9991185)
##
## Number of Fisher Scoring iterations: 7
pydata_whites <-filter(pydata, black == 0)
des3<-svydesign(ids = ~secu,
strata = ~sest,
weights=~wgt2017_2019,
data=pydata_whites,
nest=T)
whitemodel <- svyglm(evmarried ~ age_cat + babs_event, family = "binomial", design=des3)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(whitemodel)
##
## Call:
## svyglm(formula = evmarried ~ age_cat + babs_event, design = des3,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019,
## data = pydata_whites, nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.10186 0.18888 -27.011 < 2e-16 ***
## age_cat20-24 1.95864 0.19621 9.982 3.39e-13 ***
## age_cat25-29 2.50161 0.20871 11.986 6.76e-16 ***
## age_cat30-34 2.42261 0.21308 11.370 4.34e-15 ***
## age_cat35-39 2.09886 0.24357 8.617 3.10e-11 ***
## age_cat40-44 2.03891 0.36225 5.628 9.78e-07 ***
## age_cat45-49 0.91816 0.81532 1.126 0.266
## babs_event 0.51497 0.09196 5.600 1.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000333)
##
## Number of Fisher Scoring iterations: 7
pydata_blacks <-filter(pydata, black == 1)
des4<-svydesign(ids = ~secu,
strata = ~sest,
weights=~wgt2017_2019,
data=pydata_blacks,
nest=T)
blackmodel<- svyglm(evmarried ~ age_cat + babs_event, family = "binomial", design =des4)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(blackmodel)
##
## Call:
## svyglm(formula = evmarried ~ age_cat + babs_event, design = des4,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~wgt2017_2019,
## data = pydata_blacks, nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.52834 0.31255 -17.688 < 2e-16 ***
## age_cat20-24 2.04001 0.36728 5.554 1.62e-06 ***
## age_cat25-29 2.42096 0.34772 6.962 1.46e-08 ***
## age_cat30-34 2.59231 0.40946 6.331 1.21e-07 ***
## age_cat35-39 2.34178 0.49077 4.772 2.13e-05 ***
## age_cat40-44 1.21153 0.60571 2.000 0.0518 .
## age_cat45-49 -0.05683 1.10067 -0.052 0.9591
## babs_event 0.54577 0.29188 1.870 0.0683 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000899)
##
## Number of Fisher Scoring iterations: 8
whitedata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)
whitedata$phat <- predict(whitemodel, whitedata, type="response")
whitedata
## age_cat babs_event phat
## 1 15-19 0 0.006048585
## 2 20-24 0 0.041358991
## 3 25-29 0 0.069122309
## 4 30-34 0 0.064208888
## 5 35-39 0 0.047290468
## 6 40-44 0 0.044661688
## 7 45-49 0 0.015013139
## 8 15-19 1 0.010081806
## 9 20-24 1 0.067342106
## 10 25-29 1 0.110536144
## 11 30-34 1 0.103004646
## 12 35-39 1 0.076701787
## 13 40-44 1 0.072562615
## 14 45-49 1 0.024874367
blackdata <- expand.grid(age_cat=c("15-19","20-24","25-29","30-34","35-39","40-44","45-49"),babs_event=0:1)
blackdata$phat <- predict(blackmodel, blackdata, type="response")
blackdata
## age_cat babs_event phat
## 1 15-19 0 0.003956844
## 2 20-24 0 0.029646024
## 3 25-29 0 0.042803900
## 4 30-34 0 0.050400876
## 5 35-39 0 0.039674527
## 6 40-44 0 0.013166675
## 7 45-49 0 0.003739046
## 8 15-19 1 0.006809714
## 9 20-24 1 0.050089266
## 10 25-29 1 0.071650548
## 11 30-34 1 0.083918463
## 12 35-39 1 0.066558858
## 13 40-44 1 0.022509734
## 14 45-49 1 0.006435899
library(survey)
library(dplyr)
library(ggplot2)
# Assuming your survey design is named des2
prop_marrying <- svyby(~evrmarry, by = ~hisprace2 + age_r + bach_degree, design = des2, FUN = svymean) %>%
mutate(me = 2 * se,
babs_event = case_when(bach_degree == 1 ~ "BA/BS Degree", bach_degree == 0 ~ "No College Degree"),
hisprace2 = case_when(hisprace2 == 2 ~ "White Men", hisprace2 == 3 ~ "Black Men"))
# Create the plot
ggplot(data = prop_marrying, aes(x = age_r, y = evrmarry, ymin = evrmarry - me, ymax = evrmarry + me, fill = babs_event, color = babs_event, group = babs_event)) +
geom_line() +
geom_ribbon(alpha = 0.3, aes(color = NULL)) +
labs(
title = "Probability of Marrying by Race and Education (Time Varying)",
subtitle = "Men Ages 18 to 49",
caption = "Source: 2017-2019 NSFG Male Respondent File"
) +
xlab(label = "Age at First Marriage") +
ylab(label = "Adjusted First Marriage Rate") +
facet_grid(~hisprace2)
whitedata$babs_event<- recode(whitedata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")
ggplot(whitedata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for White Men")
## Don't know how to automatically pick scale for object of type <svystat>.
## Defaulting to continuous.
blackdata$babs_event<- recode(blackdata$babs_event, "1='BA/BS Degree'; 0='No College Degree'")
ggplot(blackdata, aes(x = age_cat, y = phat, group=babs_event, fill=babs_event,color = babs_event)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Predicted First Marriage Probability", title = "Predicted First Marriage Probabilities by Education for Black Men")
## Don't know how to automatically pick scale for object of type <svystat>.
## Defaulting to continuous.
newwhitedata <- pydata_whites %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
newwhitedata$phat <- predict(whitemodel, newwhitedata, type="response")
as.data.frame(newwhitedata)
## age_cat babs_event n phat
## 1 15-19 0.0004530251 11139 0.006049988
## 2 20-24 0.1590766209 8611 0.044731842
## 3 25-29 0.3684261830 5423 0.082373970
## 4 30-34 0.3642379080 2891 0.076443677
## 5 35-39 0.3322900790 1391 0.055625528
## 6 40-44 0.3039027132 646 0.051835760
## 7 45-49 0.2210554539 209 0.016792882
newblackdata <- pydata_blacks %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
newblackdata$phat <- predict(blackmodel, newblackdata, type="response")
as.data.frame(newblackdata)
## age_cat babs_event n phat
## 1 15-19 0.00000000 4028 0.003956844
## 2 20-24 0.06555702 3021 0.030692792
## 3 25-29 0.16775968 2009 0.046716248
## 4 30-34 0.16704397 1114 0.054947438
## 5 35-39 0.13738011 639 0.042631917
## 6 40-44 0.15986562 336 0.014349850
## 7 45-49 0.14881902 135 0.004054127
whitemodel_blackdata <- pydata_blacks %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
whitemodel_blackdata$phat <- predict(whitemodel, whitemodel_blackdata, type="response")
as.data.frame(whitemodel_blackdata)
## age_cat babs_event n phat
## 1 15-19 0.00000000 4028 0.006048585
## 2 20-24 0.06555702 3021 0.042718449
## 3 25-29 0.16775968 2009 0.074892341
## 4 30-34 0.16704397 1114 0.069575572
## 5 35-39 0.13738011 639 0.050581964
## 6 40-44 0.15986562 336 0.048308985
## 7 45-49 0.14881902 135 0.016189593
blackmodel_whitedata <- pydata_whites %>%
group_by(age_cat) %>%
dplyr::summarize(babs_event=weighted.mean(babs_event,wgt2017_2019,na.rm = TRUE),n=n())
blackmodel_whitedata$phat <- predict(blackmodel, blackmodel_whitedata, type="response")
as.data.frame(blackmodel_whitedata)
## age_cat babs_event n phat
## 1 15-19 0.0004530251 11139 0.003957819
## 2 20-24 0.1590766209 8611 0.032248201
## 3 25-29 0.3684261830 5423 0.051842886
## 4 30-34 0.3642379080 2891 0.060811308
## 5 35-39 0.3322900790 1391 0.047191128
## 6 40-44 0.3039027132 646 0.015505238
## 7 45-49 0.2210554539 209 0.004216464
data$age_cat <- car::Recode(data$ageevent,recodes = "15:19 = '15-19'; 20:24 = '20-24'; 25:29 = '25-29'; 30:34 = '30-34'; 35:39 = '35-39'; 40:44 = '40-44'; 45:49 = '45-49'; else=NA", as.factor=T)
fprobs<- data %>%
group_by(age_cat) %>%
dplyr::summarize(p=weighted.mean(evmarried,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(fprobs, aes(x = age_cat, y = p, group =1 )) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate by Age")
data <- data %>%
mutate(nh_wh_marr = ifelse(evmarried == 1 & hisprace2 == 2,1,0),
nh_bl_marr = ifelse(evmarried == 1 & hisprace2 == 3,1,0),
marr_by_race = ifelse(evmarried == 1 & hisprace2 == 2,1,
ifelse(evmarried == 1 & hisprace2 == 3,2,0))) #1 is for NH whites marrying, 2 is for NH Blacks marrying, 0 is for no marriages whatsoever regardless of race)
nh_wh_probs<- data %>%
group_by(age_cat) %>%
dplyr::summarize(p=weighted.mean(nh_wh_marr,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(nh_wh_probs, aes(x = factor(age_cat), y = p, group = 1)) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH White) by Age")
nh_bl_probs<- data %>%
group_by(age_cat) %>%
dplyr::summarize(p=weighted.mean(nh_bl_marr,wgt2017_2019,na.rm = TRUE),n=n())
ggplot(nh_bl_probs, aes(x = age_cat, y = p, group =1 )) +
geom_point() +
geom_line() +
labs(x = "Age", y = "Adjusted First Marriage Rate", title = "Adjusted First Marriage Rate (NH Black) by Age")
library(nnet)
compriskmodel <-multinom(formula = marr_by_race ~ age_cat, data = data)
## # weights: 24 (14 variable)
## initial value 3578.180224
## iter 10 value 2532.030674
## iter 20 value 2486.812645
## final value 2486.783619
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ age_cat, data = data)
##
## Coefficients:
## (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1 -2.311634 2.209127 2.356211 1.780331 1.143977 0.39628878
## 2 -3.428630 1.696617 1.816311 1.427148 1.008216 -0.04545012
## age_cat45-49
## 1 -1.011440
## 2 -1.280696
##
## Std. Errors:
## (Intercept) age_cat20-24 age_cat25-29 age_cat30-34 age_cat35-39 age_cat40-44
## 1 0.1413633 0.1593380 0.1598667 0.1706138 0.2083191 0.2834871
## 2 0.2394978 0.2727925 0.2729364 0.2928099 0.3541421 0.5614702
## age_cat45-49
## 1 0.5281609
## 2 1.0325556
##
## Residual Deviance: 4973.567
## AIC: 5001.567
data$new_weight <- data$wgt2017_2019/mean(data$wgt2017_2019)
data$new_weight = data$wgt2017_2019/mean(data$wgt2017_2019) #creating new weight
des5<-svydesign(ids = ~secu,
strata = ~sest,
weights=~new_weight,
data=data,
nest=T)
anymodel <- svyglm(evmarried ~ age_cat, design = des5)
summary(anymodel)
##
## Call:
## svyglm(formula = evmarried ~ age_cat, design = des5)
##
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~new_weight,
## data = data, nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.16688 0.02647 6.303 8.62e-08 ***
## age_cat20-24 0.39140 0.04738 8.260 8.92e-11 ***
## age_cat25-29 0.46781 0.03813 12.268 < 2e-16 ***
## age_cat30-34 0.35595 0.04737 7.515 1.20e-09 ***
## age_cat35-39 0.19926 0.05698 3.497 0.00102 **
## age_cat40-44 0.10616 0.07693 1.380 0.17398
## age_cat45-49 -0.12781 0.03734 -3.423 0.00128 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2176265)
##
## Number of Fisher Scoring iterations: 2
library(gbm)
## Loaded gbm 2.1.8.1
compriskmodel <-multinom(formula = marr_by_race ~ factor(evmarried), data = data)
## # weights: 9 (4 variable)
## initial value 3578.180224
## iter 10 value 591.840457
## final value 577.578838
## converged
summary(compriskmodel)
## Call:
## multinom(formula = marr_by_race ~ factor(evmarried), data = data)
##
## Coefficients:
## (Intercept) factor(evmarried)1
## 1 -12.70518 24.57202
## 2 -11.03884 21.35572
##
## Std. Errors:
## (Intercept) factor(evmarried)1
## 1 12.79633 17.39152
## 2 5.56221 13.02539
##
## Residual Deviance: 1155.158
## AIC: 1163.158
anymodel <- svyglm(evmarried ~ factor(ageevent), weights=new_weight, design = des5)
summary(anymodel)
##
## Call:
## svyglm(formula = evmarried ~ factor(ageevent), design = des5,
## weights = new_weight)
##
## Survey design:
## svydesign(ids = ~secu, strata = ~sest, weights = ~new_weight,
## data = data, nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.145e-15 1.353e-15 0.846 0.407409
## factor(ageevent)16 3.475e-03 3.563e-03 0.975 0.341016
## factor(ageevent)17 1.208e-02 6.350e-03 1.902 0.071635 .
## factor(ageevent)18 3.762e-01 1.089e-01 3.454 0.002510 **
## factor(ageevent)19 4.492e-01 1.269e-01 3.540 0.002055 **
## factor(ageevent)20 4.319e-01 1.103e-01 3.914 0.000859 ***
## factor(ageevent)21 3.980e-01 1.271e-01 3.130 0.005270 **
## factor(ageevent)22 5.402e-01 1.300e-01 4.155 0.000490 ***
## factor(ageevent)23 7.643e-01 6.390e-02 11.961 1.44e-10 ***
## factor(ageevent)24 7.437e-01 6.603e-02 11.264 4.13e-10 ***
## factor(ageevent)25 5.961e-01 9.722e-02 6.132 5.43e-06 ***
## factor(ageevent)26 7.963e-01 7.623e-02 10.446 1.51e-09 ***
## factor(ageevent)27 7.834e-01 5.894e-02 13.293 2.19e-11 ***
## factor(ageevent)28 5.408e-01 9.901e-02 5.463 2.39e-05 ***
## factor(ageevent)29 6.583e-01 9.477e-02 6.946 9.60e-07 ***
## factor(ageevent)30 6.409e-01 1.138e-01 5.631 1.64e-05 ***
## factor(ageevent)31 7.304e-01 9.434e-02 7.742 1.93e-07 ***
## factor(ageevent)32 4.712e-01 1.156e-01 4.074 0.000591 ***
## factor(ageevent)33 2.932e-01 9.756e-02 3.005 0.006989 **
## factor(ageevent)34 5.723e-01 1.553e-01 3.686 0.001466 **
## factor(ageevent)35 3.778e-01 2.041e-01 1.851 0.078950 .
## factor(ageevent)36 4.855e-01 1.533e-01 3.167 0.004846 **
## factor(ageevent)37 5.103e-01 1.387e-01 3.679 0.001487 **
## factor(ageevent)38 3.918e-01 1.280e-01 3.061 0.006171 **
## factor(ageevent)39 1.229e-01 1.038e-01 1.184 0.250220
## factor(ageevent)40 6.368e-01 2.014e-01 3.162 0.004905 **
## factor(ageevent)41 3.626e-01 1.495e-01 2.425 0.024887 *
## factor(ageevent)42 1.208e-01 7.206e-02 1.677 0.109119
## factor(ageevent)43 6.798e-01 2.121e-01 3.206 0.004440 **
## factor(ageevent)44 1.298e-01 1.164e-01 1.115 0.278089
## factor(ageevent)45 1.540e-01 1.065e-01 1.445 0.163895
## factor(ageevent)46 1.820e-02 1.921e-02 0.947 0.354719
## factor(ageevent)47 -9.765e-16 1.312e-15 -0.744 0.465424
## factor(ageevent)48 -9.378e-16 1.300e-15 -0.721 0.479114
## factor(ageevent)49 -9.109e-16 1.300e-15 -0.700 0.491690
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.4258265)
##
## Number of Fisher Scoring iterations: 2
as.data.frame(fprobs)
## age_cat p n
## 1 15-19 0.16687517 628
## 2 20-24 0.55827326 811
## 3 25-29 0.63468717 788
## 4 30-34 0.52282340 510
## 5 35-39 0.36613989 252
## 6 40-44 0.27303553 152
## 7 45-49 0.03906987 116
#Total population hisprace2( Black)
#Total Population hisprace2(white)
#Group evmarried and hisprace2,
#babs_event is they have bachelor's degree or not
pydata %>%
group_by(hisprace2, babs_event) %>% # 3 is black
summarise(Count = n())
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: hisprace2 [2]
## hisprace2 babs_event Count
## <dbl> <dbl> <int>
## 1 2 0 25654
## 2 2 1 4656
## 3 3 0 10417
## 4 3 1 865
# To get those who experienced the event
pydata %>%
group_by(hisprace2, babs_event) %>%
summarise(Count = sum(evmarried == 1))
## `summarise()` has grouped output by 'hisprace2'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: hisprace2 [2]
## hisprace2 babs_event Count
## <dbl> <dbl> <int>
## 1 2 0 682
## 2 2 1 345
## 3 3 0 180
## 4 3 1 38