1 + 1[1] 2
Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.
When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:
1 + 1[1] 2
You can add options to executable code like this
[1] 4
The echo: false option disables the printing of code (only output is displayed).
##load libraries
library(tidyverse)── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.8 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.1
✔ readr 2.1.2 ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(haven)
library(survival)
library(car)Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
The following object is masked from 'package:purrr':
some
library(survey)Loading required package: grid
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Attaching package: 'survey'
The following object is masked from 'package:graphics':
dotchart
library(muhaz)
library(eha)
library(foreign)##load data
usa2 <- read_stata("C:/Users/okabe/OneDrive/Pictures/Stats 2/usa_00011.dta.gz")
usa2<-zap_labels(usa2)##recodes fro variables
##Citizen
usa2$mycitizen<-Recode (usa2$citizen, recodes =" 1:2='Citizen'; 3:4= 'Not citizen';else=NA" ,as.factor=T)
usa2$ mycitizen<-relevel(usa2$mycitizen, ref='Citizen')
#English speaking
usa2$english <-Recode (usa2$speakeng, recodes =" 1='No'; 2:5='Speaks English'; 6='Not well'; else=NA" ,as.factor=T)
usa2$ english<-relevel(usa2$english, ref='Speaks English')
##educ
usa2$education<-Recode(usa2$educd, recodes="000:002='noschool';010:026='0Prim'; 030:061='1somehs'; 062:063='2hsgrad'; 065:090='3somecol'; 100:113='4colgrad';114:116='5masteranddoc'; else= NA", as.factor=T)
##Employed
usa2$employed<-Recode(usa2$empstat, recodes ="1= 'Employed';2:3='Unemployed';else=NA")
##sex
usa2$male<-as.factor(ifelse(usa2$sex==1, "Male", "Female"))
usa2$male<-relevel(usa2$male, ref='Male')
#Age cut into intervals
usa2$agec<-cut(usa2$age, breaks = c(18, 30, 40, 50, 60, 70, 80, 100), include.lowest = T, na.rm=T)
#usa$agec<-cut(usa$age, breaks=c(0,24,39,59,79,99))
##race
usa2$raceg<-Recode(usa2$race, recodes="1='White'; 2='Black';4:6='Asian'; else='other'", as.factor=T)
usa2$raceg<-relevel(usa2$raceg, ref='White')
##born
usa2$born<-Recode(usa2$bpl, recodes=" 200='Mexico' ; 210='Central America'; 300='South America';400:499='Europe'; 548:599= 'Asia';547='Middle East'; 600='Africa'; else= NA", as.factor=T)
usa2$ born<-relevel(usa2$born, ref='Europe')
##Poverty
usa2$poor<- Recode(usa2$poverty, recodes = "000= 'NA'; 001= 'Below poverty threshold';501= 'Above poverty threshold'", as.factor=T)
##income grouping
#usa1$inc<-Recode(usa1$incwage, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor = T)
#usa1$inc<-Recode(usa1$incwage, recodes = "1:1='1_lt10k'; 1960:1970='2_25-50k';1980='3_75-80k';1990:2002='4_100kplus';else= NA", as.factor = T)
#usa1$wage<- car::recode(usa1$incwage, recodes = "1:10000= '1_lt10k';10000:25000='2_10-25k';25000:35000='3_25-35k';35000:50000='4_35-50k';50000:200000= '5_50kplus'", as.factor = T)
##Occupation
usa2$occupation<- Recode(usa2$ind1990, recodes = "010:032= 'Agriculture'; 040:050= 'Mining';060= 'Construction'; 100:392='Manufacturing';400:472 ='Transportaion/Communication/Publicutilities';500:691= 'Wholesale&RetaiTrade';700:712= 'Finance/Ins/RealEst.'; 721:760= 'Business/Repair'; 761:791='PersonalServices'; 800:810= 'Ent.&Rec';812:893= 'Prof.services';900:932= 'Pub. Admin'; 940:960= 'Military'; else= NA ", as.factor=T)
##year naturalized
usa2$yrnatur <- ifelse(usa2$yrnatur>2020, NA, usa2$yrnatur)
##birthyear
usa2$birthyr<- ifelse(usa2$birthyr>2000,NA, usa2$birthyr)DEM 7223 – Event History Analysis Homework 5 10 points Submit either a link to an Rpub that has your homework published Please answer the following questions, use appropriate figures and statistical output to answer the questions.
usa1<- usa2 %>%
sample_frac(.01)%>%
mutate(cit_age = ifelse( mycitizen == "Citizen",
yrnatur - (birthyr),
year - (birthyr)),
d.event = ifelse(mycitizen == "Citizen", 1, 0))%>%
filter(complete.cases(agec,education,poor,male,english,yrnatur,raceg))
library(survival)
age_fit <- survfit(Surv(cit_age, d.event)~ 1,
data = usa1)
library(ggsurvfit)
age_fit %>%
ggsurvfit() +
add_confidence_interval(type = "ribbon") +
add_quantile() Fit the discrete time hazard model to your outcome You must form a person-period data set
##fit discrete time hazard model
pp<-survSplit(Surv(cit_age, d.event)~. ,
data = usa1[usa1$cit_age>0,],
cut=seq(0,60,12),
episode="year_citizenship") options(survey.lonely.psu = "adjust")
des<-svydesign (ids=~1, strata=~strata,
data=pp,
weight=~ perwt, nest=T )Consider both the general model and other time specifications Include all main effects in the model Test for an interaction between at least two of the predictors
##general model
m1<-svyglm(d.event~factor(year_citizenship) + male+raceg,
design=des, family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m1)
Call:
svyglm(formula = d.event ~ factor(year_citizenship) + male +
raceg, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt,
nest = T)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.751660 0.058607 -46.951 <2e-16 ***
factor(year_citizenship)3 1.358012 0.061561 22.060 <2e-16 ***
factor(year_citizenship)4 2.189120 0.058840 37.204 <2e-16 ***
factor(year_citizenship)5 2.602049 0.060616 42.927 <2e-16 ***
factor(year_citizenship)6 2.863432 0.067334 42.526 <2e-16 ***
factor(year_citizenship)7 5.539669 0.054933 100.843 <2e-16 ***
maleFemale -0.047748 0.027560 -1.733 0.0832 .
racegAsian 0.040077 0.030728 1.304 0.1922
racegBlack 0.002012 0.050179 0.040 0.9680
racegother 0.001284 0.046922 0.027 0.9782
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9827029)
Number of Fisher Scoring iterations: 13
The general model: The general model shows a higher probability of attaining citizenship the longer an immigrant lives in the United States. It also shows that females have a lower chance of attaining citizenship compared to males. Additionally, compared to Asian and “Other” immigrants, Black immigrants have a lower chance of attaining citizenship in the United States.
##constant model
m0<-svyglm(d.event~ male+ raceg+education,
design=des, family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m0)
Call:
svyglm(formula = d.event ~ male + raceg + education, design = des,
family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt,
nest = T)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.279583 0.053414 -23.956 < 2e-16 ***
maleFemale -0.001351 0.027022 -0.050 0.96013
racegAsian -0.014840 0.030648 -0.484 0.62824
racegBlack -0.052512 0.051351 -1.023 0.30650
racegother 0.032943 0.044987 0.732 0.46400
education1somehs 0.194831 0.068736 2.834 0.00459 **
education2hsgrad 0.254967 0.058866 4.331 1.49e-05 ***
education3somecol 0.385540 0.056350 6.842 7.97e-12 ***
education4colgrad 0.317562 0.058447 5.433 5.58e-08 ***
education5masteranddoc 0.263792 0.060620 4.352 1.36e-05 ***
educationnoschool -0.026977 0.079823 -0.338 0.73540
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1.000045)
Number of Fisher Scoring iterations: 5
The constant model: The constant model shows a low chance of immigrants attaining their citizenship status if they are Asian, Black and female. When education is added, it shows an increase with the probability of attaining citizenship for immigrants that have higher levels of education being with the results being statistically significant.
##linear model
m2<-svyglm(d.event~year_citizenship+raceg+male+education, design=des,
family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m2)
Call:
svyglm(formula = d.event ~ year_citizenship + raceg + male +
education, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt,
nest = T)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.2913545 0.0755939 -56.769 < 2e-16 ***
year_citizenship 0.7157224 0.0110574 64.728 < 2e-16 ***
racegAsian -0.0009916 0.0322761 -0.031 0.97549
racegBlack -0.0230908 0.0513147 -0.450 0.65272
racegother 0.1244223 0.0474382 2.623 0.00872 **
maleFemale -0.0335661 0.0282776 -1.187 0.23523
education1somehs 0.4430659 0.0707061 6.266 3.75e-10 ***
education2hsgrad 0.6035728 0.0586124 10.298 < 2e-16 ***
education3somecol 0.8928638 0.0572673 15.591 < 2e-16 ***
education4colgrad 0.7461329 0.0588995 12.668 < 2e-16 ***
education5masteranddoc 0.6743015 0.0603434 11.174 < 2e-16 ***
educationnoschool -0.0851275 0.0762105 -1.117 0.26400
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9388104)
Number of Fisher Scoring iterations: 6
Linear model: When the linear and square term are added, the linear model shows statistical significance with year of citizenship. In that the probability of attaining citizenship increases based on the year of citizenship. Just like the first two models, a higher education increases the probability of attaining citizenship with education being statistically significant.
##quadratic model/poly 2
m3<-svyglm(d.event~year_citizenship+ I(year_citizenship^2)+raceg+education,
design=des, family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m3)
Call:
svyglm(formula = d.event ~ year_citizenship + I(year_citizenship^2) +
raceg + education, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt,
nest = T)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.731374 0.124461 -46.049 < 2e-16 ***
year_citizenship 1.478863 0.054263 27.254 < 2e-16 ***
I(year_citizenship^2) -0.090838 0.006062 -14.984 < 2e-16 ***
racegAsian -0.002904 0.031426 -0.092 0.926
racegBlack -0.027758 0.049749 -0.558 0.577
racegother 0.117927 0.046354 2.544 0.011 *
education1somehs 0.419145 0.067927 6.170 6.9e-10 ***
education2hsgrad 0.565610 0.056994 9.924 < 2e-16 ***
education3somecol 0.850906 0.055811 15.246 < 2e-16 ***
education4colgrad 0.701688 0.057376 12.230 < 2e-16 ***
education5masteranddoc 0.628586 0.058755 10.698 < 2e-16 ***
educationnoschool -0.070759 0.075456 -0.938 0.348
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9545697)
Number of Fisher Scoring iterations: 6
The quadratic model: When the square root term is applied to this model, this model shows when parameters are added, there is a lower probability of attaining citizenship based on the year of citizenship with higher levels of education increasing the probability of attaining citizenship.With the hazard function increasing over time
##spline model
library(splines)
m4<-svyglm(d.event~ns(year_citizenship, df=3)+male+raceg, design=des,
family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m4)
Call:
svyglm(formula = d.event ~ ns(year_citizenship, df = 3) + male +
raceg, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt,
nest = T)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.7914344 0.0598488 -46.641 <2e-16 ***
ns(year_citizenship, df = 3)1 1.8590049 0.0550277 33.783 <2e-16 ***
ns(year_citizenship, df = 3)2 4.5622655 0.1168268 39.052 <2e-16 ***
ns(year_citizenship, df = 3)3 2.8244562 0.0482147 58.581 <2e-16 ***
maleFemale -0.0470585 0.0270254 -1.741 0.0816 .
racegAsian 0.0394813 0.0300355 1.314 0.1887
racegBlack 0.0039847 0.0492522 0.081 0.9355
racegother 0.0006709 0.0463200 0.014 0.9884
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1.000903)
Number of Fisher Scoring iterations: 6
Spline model: The Spline model when parameters are added with three degrees of freedom are added to the year of citizenship shows an increase with citizenship attainment in year 2 and 3 with it being statically significant. With the hazard function increasing over time
Generate hazard plots for interesting cases highlighting the significant predictors in your analysis
newdat<-expand.grid(year_citizenship = unique(pp$year_citizenship),
raceg= unique(pp$raceg),
male = unique(pp$male),
education= unique(pp$education))
newdat<-newdat%>%
dplyr::filter(complete.cases(.))
newdat$h0<-predict(m0, newdata=newdat, type="response")
newdat$h1<-predict(m1, newdata=newdat, type="response")
newdat$h2<-predict(m2, newdata=newdat, type="response")
newdat$h3<-predict(m3, newdata=newdat, type="response")
newdat$h4<-predict(m4, newdata=newdat, type="response")
head(newdat) year_citizenship raceg male education h0 h1 h2
1 2 Asian Female 5masteranddoc 0.2997373 0.06137019 0.1029028
2 3 Asian Female 5masteranddoc 0.2997373 0.21829037 0.1991975
3 4 Asian Female 5masteranddoc 0.2997373 0.43187075 0.3651875
4 5 Asian Female 5masteranddoc 0.2997373 0.57448839 0.6052893
5 6 Asian Female 5masteranddoc 0.2997373 0.67034614 0.8506785
6 7 Asian Female 5masteranddoc 0.2997373 0.99999990 0.9795576
h3 h4
1 0.07795478 0.05905463
2 0.20238499 0.14034174
3 0.40866325 0.27006624
4 0.63861911 0.42583538
5 0.80685028 0.66455846
6 0.89085189 0.91623248
The table above shows variation in hazard times except for the constant model
library(magrittr)
Attaching package: 'magrittr'
The following object is masked from 'package:purrr':
set_names
The following object is masked from 'package:tidyr':
extract
library(data.table)
Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':
between, first, last
The following object is masked from 'package:purrr':
transpose
out<-melt(setDT(newdat),
id = c("year_citizenship", "raceg", "education","male"),
measure.vars = list(haz=c("h0", "h1","h2","h3", "h4")))
head(out, n=20) year_citizenship raceg education male variable value
1: 2 Asian 5masteranddoc Female h0 0.2997373
2: 3 Asian 5masteranddoc Female h0 0.2997373
3: 4 Asian 5masteranddoc Female h0 0.2997373
4: 5 Asian 5masteranddoc Female h0 0.2997373
5: 6 Asian 5masteranddoc Female h0 0.2997373
6: 7 Asian 5masteranddoc Female h0 0.2997373
7: 2 White 5masteranddoc Female h0 0.3034577
8: 3 White 5masteranddoc Female h0 0.3034577
9: 4 White 5masteranddoc Female h0 0.3034577
10: 5 White 5masteranddoc Female h0 0.3034577
11: 6 White 5masteranddoc Female h0 0.3034577
12: 7 White 5masteranddoc Female h0 0.3034577
13: 2 other 5masteranddoc Female h0 0.3118429
14: 3 other 5masteranddoc Female h0 0.3118429
15: 4 other 5masteranddoc Female h0 0.3118429
16: 5 other 5masteranddoc Female h0 0.3118429
17: 6 other 5masteranddoc Female h0 0.3118429
18: 7 other 5masteranddoc Female h0 0.3118429
19: 2 Black 5masteranddoc Female h0 0.2904518
20: 3 Black 5masteranddoc Female h0 0.2904518
The table above also shows variation in hazard times when looking at citizenship year, education, race and gender
library(dplyr)
library(ggplot2)
out%>%
dplyr::filter(education == "4colgrad")%>%
dplyr::mutate(mod= dplyr::case_when(.$variable =="h0" ~ "Constant",
.$variable =="h1" ~ "General",
.$variable =="h2" ~ "Linear",
.$variable =="h3" ~ "Quadratic",
.$variable =="h4" ~ "Spline"))%>%
ggplot(aes(x = year_citizenship*5, y=value ))+
geom_line(aes(group=raceg, color=raceg) )+
labs(title = "Hazard function for citizenship attainmentment",
subtitle = "Alternative model specifications")+
xlab("Age")+ylab("S(t)")+
facet_wrap(~mod, scales= "free_y")Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.
Hazard plot: Using the alternative time specifications, the plot shows when education is held constant for immigrants with a college degree, the constant, linear, quadratic, spline and general model shows the hazard function based on race an increase in the probability of attaining citizenship based on age and varying based of the race of an immigrant. I am not sure why my constant model is not a straight line but rather going up and down with the separate time intervals.
##model fit
AIC0<-AIC(m0)Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC1<-AIC(m1)Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC2<-AIC(m2)Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC3<-AIC(m3)Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC4<-AIC(m4)Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AICS<-data.frame(AIC = c(AIC0["AIC"],AIC1["AIC"],AIC2["AIC"],AIC3["AIC"],AIC4["AIC"]),
mod = factor(c("const", "general", "linear", "quadratic", "spline")) )
AICS$mod<-forcats::fct_relevel(AICS$mod,
c("general", "const" , "linear", "quadratic", "spline"))
AICS$deltaAIC<-AICS$AIC - AICS$AIC[AICS$mod=="general"]
knitr::kable(AICS[, c("mod", "AIC", "deltaAIC")],
caption = "Relative AIC for alternative time specifications")| mod | AIC | deltaAIC |
|---|---|---|
| const | 37922.82 | 7580.283847 |
| general | 30342.54 | 0.000000 |
| linear | 30347.80 | 5.260393 |
| quadratic | 30119.77 | -222.768259 |
| spline | 30544.33 | 201.798383 |
AIC: When using AIC to test which model fits the data best, the results show that the linear AIC fits the data best. Does it matter if the AIC number is a negative?