Homework 5

Author

Lydia Okabe

Quarto

Quarto enables you to weave together content and executable code into a finished document. To learn more about Quarto see https://quarto.org.

Running Code

When you click the Render button a document will be generated that includes both content and the output of embedded code. You can embed code like this:

1 + 1

[1] 2

You can add options to executable code like this

[1] 4

The echo: false option disables the printing of code (only output is displayed).

##load libraries
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.8     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.1
✔ readr   2.1.2     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(haven)
library(survival)
library(car)

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

library(survey)

Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart

library(muhaz)
library(eha)
library(foreign)

##load data
usa2 <- read_stata("C:/Users/okabe/OneDrive/Pictures/Stats 2/usa_00011.dta.gz")
usa2<-zap_labels(usa2)

##recodes fro variables
##Citizen
usa2$mycitizen<-Recode (usa2$citizen, recodes =" 1:2='Citizen'; 3:4= 'Not citizen';else=NA" ,as.factor=T)

usa2$ mycitizen<-relevel(usa2$mycitizen, ref='Citizen')

#English speaking
usa2$english <-Recode (usa2$speakeng, recodes =" 1='No'; 2:5='Speaks English'; 6='Not well'; else=NA" ,as.factor=T)
usa2$ english<-relevel(usa2$english, ref='Speaks English')


##educ
usa2$education<-Recode(usa2$educd, recodes="000:002='noschool';010:026='0Prim'; 030:061='1somehs'; 062:063='2hsgrad'; 065:090='3somecol'; 100:113='4colgrad';114:116='5masteranddoc'; else= NA", as.factor=T)

##Employed

usa2$employed<-Recode(usa2$empstat, recodes ="1= 'Employed';2:3='Unemployed';else=NA")

##sex
usa2$male<-as.factor(ifelse(usa2$sex==1, "Male", "Female"))
usa2$male<-relevel(usa2$male, ref='Male')

#Age cut into intervals 

usa2$agec<-cut(usa2$age, breaks = c(18, 30, 40, 50, 60, 70, 80, 100), include.lowest = T, na.rm=T)







#usa$agec<-cut(usa$age, breaks=c(0,24,39,59,79,99))



##race

usa2$raceg<-Recode(usa2$race, recodes="1='White'; 2='Black';4:6='Asian'; else='other'", as.factor=T)
usa2$raceg<-relevel(usa2$raceg, ref='White')


##born

usa2$born<-Recode(usa2$bpl, recodes=" 200='Mexico' ; 210='Central America'; 300='South America';400:499='Europe'; 548:599= 'Asia';547='Middle East'; 600='Africa'; else= NA", as.factor=T)
usa2$ born<-relevel(usa2$born, ref='Europe')



##Poverty
usa2$poor<- Recode(usa2$poverty, recodes = "000= 'NA'; 001= 'Below poverty threshold';501= 'Above poverty threshold'", as.factor=T)

##income grouping

#usa1$inc<-Recode(usa1$incwage, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor = T)

#usa1$inc<-Recode(usa1$incwage, recodes = "1:1='1_lt10k'; 1960:1970='2_25-50k';1980='3_75-80k';1990:2002='4_100kplus';else= NA", as.factor = T)

#usa1$wage<- car::recode(usa1$incwage, recodes = "1:10000= '1_lt10k';10000:25000='2_10-25k';25000:35000='3_25-35k';35000:50000='4_35-50k';50000:200000= '5_50kplus'", as.factor = T)


##Occupation

usa2$occupation<- Recode(usa2$ind1990, recodes = "010:032= 'Agriculture'; 040:050= 'Mining';060= 'Construction'; 100:392='Manufacturing';400:472 ='Transportaion/Communication/Publicutilities';500:691= 'Wholesale&RetaiTrade';700:712= 'Finance/Ins/RealEst.'; 721:760= 'Business/Repair'; 761:791='PersonalServices'; 800:810= 'Ent.&Rec';812:893= 'Prof.services';900:932= 'Pub. Admin'; 940:960= 'Military'; else= NA ", as.factor=T)

##year naturalized

usa2$yrnatur <- ifelse(usa2$yrnatur>2020, NA, usa2$yrnatur)

##birthyear
usa2$birthyr<- ifelse(usa2$birthyr>2000,NA, usa2$birthyr)

DEM 7223 – Event History Analysis Homework 5 10 points Submit either a link to an Rpub that has your homework published Please answer the following questions, use appropriate figures and statistical output to answer the questions.

usa1<- usa2 %>%
   sample_frac(.01)%>%
 
    
 
     mutate(cit_age = ifelse( mycitizen == "Citizen", 
                                     yrnatur - (birthyr), 
                                     year - (birthyr)), 
                 d.event = ifelse(mycitizen == "Citizen", 1, 0))%>%
  filter(complete.cases(agec,education,poor,male,english,yrnatur,raceg))

library(survival)

        age_fit <- survfit(Surv(cit_age, d.event)~ 1, 
                           data = usa1)

        library(ggsurvfit)

        age_fit %>%
          ggsurvfit() +
          add_confidence_interval(type = "ribbon") +
          add_quantile()

Fit the discrete time hazard model to your outcome You must form a person-period data set

##fit discrete time hazard model 
pp<-survSplit(Surv(cit_age, d.event)~. ,
data = usa1[usa1$cit_age>0,],
cut=seq(0,60,12),
episode="year_citizenship")

 options(survey.lonely.psu = "adjust")

des<-svydesign (ids=~1, strata=~strata,
               data=pp,
               weight=~ perwt, nest=T )

Consider both the general model and other time specifications Include all main effects in the model Test for an interaction between at least two of the predictors

##general model
m1<-svyglm(d.event~factor(year_citizenship) + male+raceg,
design=des, family=binomial (link="cloglog"))

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(m1)


Call:
svyglm(formula = d.event ~ factor(year_citizenship) + male + 
    raceg, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt, 
    nest = T)

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)               -2.751660   0.058607 -46.951   <2e-16 ***
factor(year_citizenship)3  1.358012   0.061561  22.060   <2e-16 ***
factor(year_citizenship)4  2.189120   0.058840  37.204   <2e-16 ***
factor(year_citizenship)5  2.602049   0.060616  42.927   <2e-16 ***
factor(year_citizenship)6  2.863432   0.067334  42.526   <2e-16 ***
factor(year_citizenship)7  5.539669   0.054933 100.843   <2e-16 ***
maleFemale                -0.047748   0.027560  -1.733   0.0832 .  
racegAsian                 0.040077   0.030728   1.304   0.1922    
racegBlack                 0.002012   0.050179   0.040   0.9680    
racegother                 0.001284   0.046922   0.027   0.9782    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9827029)

Number of Fisher Scoring iterations: 13

The general model: The general model shows a higher probability of attaining citizenship the longer an immigrant lives in the United States. It also shows that females have a lower chance of attaining citizenship compared to males. Additionally, compared to Asian and “Other” immigrants, Black immigrants have a lower chance of attaining citizenship in the United States.

##constant model
m0<-svyglm(d.event~ male+ raceg+education,
design=des, family=binomial (link="cloglog"))

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(m0)


Call:
svyglm(formula = d.event ~ male + raceg + education, design = des, 
    family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt, 
    nest = T)

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -1.279583   0.053414 -23.956  < 2e-16 ***
maleFemale             -0.001351   0.027022  -0.050  0.96013    
racegAsian             -0.014840   0.030648  -0.484  0.62824    
racegBlack             -0.052512   0.051351  -1.023  0.30650    
racegother              0.032943   0.044987   0.732  0.46400    
education1somehs        0.194831   0.068736   2.834  0.00459 ** 
education2hsgrad        0.254967   0.058866   4.331 1.49e-05 ***
education3somecol       0.385540   0.056350   6.842 7.97e-12 ***
education4colgrad       0.317562   0.058447   5.433 5.58e-08 ***
education5masteranddoc  0.263792   0.060620   4.352 1.36e-05 ***
educationnoschool      -0.026977   0.079823  -0.338  0.73540    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1.000045)

Number of Fisher Scoring iterations: 5

The constant model: The constant model shows a low chance of immigrants attaining their citizenship status if they are Asian, Black and female. When education is added, it shows an increase with the probability of attaining citizenship for immigrants that have higher levels of education being with the results being statistically significant.

##linear model
m2<-svyglm(d.event~year_citizenship+raceg+male+education, design=des,
family=binomial (link="cloglog"))

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(m2)


Call:
svyglm(formula = d.event ~ year_citizenship + raceg + male + 
    education, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt, 
    nest = T)

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -4.2913545  0.0755939 -56.769  < 2e-16 ***
year_citizenship        0.7157224  0.0110574  64.728  < 2e-16 ***
racegAsian             -0.0009916  0.0322761  -0.031  0.97549    
racegBlack             -0.0230908  0.0513147  -0.450  0.65272    
racegother              0.1244223  0.0474382   2.623  0.00872 ** 
maleFemale             -0.0335661  0.0282776  -1.187  0.23523    
education1somehs        0.4430659  0.0707061   6.266 3.75e-10 ***
education2hsgrad        0.6035728  0.0586124  10.298  < 2e-16 ***
education3somecol       0.8928638  0.0572673  15.591  < 2e-16 ***
education4colgrad       0.7461329  0.0588995  12.668  < 2e-16 ***
education5masteranddoc  0.6743015  0.0603434  11.174  < 2e-16 ***
educationnoschool      -0.0851275  0.0762105  -1.117  0.26400    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9388104)

Number of Fisher Scoring iterations: 6

Linear model: When the linear and square term are added, the linear model shows statistical significance with year of citizenship. In that the probability of attaining citizenship increases based on the year of citizenship. Just like the first two models, a higher education increases the probability of attaining citizenship with education being statistically significant.

##quadratic model/poly 2
m3<-svyglm(d.event~year_citizenship+ I(year_citizenship^2)+raceg+education,
design=des, family=binomial (link="cloglog"))

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(m3)


Call:
svyglm(formula = d.event ~ year_citizenship + I(year_citizenship^2) + 
    raceg + education, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt, 
    nest = T)

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            -5.731374   0.124461 -46.049  < 2e-16 ***
year_citizenship        1.478863   0.054263  27.254  < 2e-16 ***
I(year_citizenship^2)  -0.090838   0.006062 -14.984  < 2e-16 ***
racegAsian             -0.002904   0.031426  -0.092    0.926    
racegBlack             -0.027758   0.049749  -0.558    0.577    
racegother              0.117927   0.046354   2.544    0.011 *  
education1somehs        0.419145   0.067927   6.170  6.9e-10 ***
education2hsgrad        0.565610   0.056994   9.924  < 2e-16 ***
education3somecol       0.850906   0.055811  15.246  < 2e-16 ***
education4colgrad       0.701688   0.057376  12.230  < 2e-16 ***
education5masteranddoc  0.628586   0.058755  10.698  < 2e-16 ***
educationnoschool      -0.070759   0.075456  -0.938    0.348    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9545697)

Number of Fisher Scoring iterations: 6

The quadratic model: When the square root term is applied to this model, this model shows when parameters are added, there is a lower probability of attaining citizenship based on the year of citizenship with higher levels of education increasing the probability of attaining citizenship.With the hazard function increasing over time

##spline  model
library(splines)
m4<-svyglm(d.event~ns(year_citizenship, df=3)+male+raceg, design=des,
family=binomial (link="cloglog"))

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(m4)


Call:
svyglm(formula = d.event ~ ns(year_citizenship, df = 3) + male + 
    raceg, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~1, strata = ~strata, data = pp, weight = ~perwt, 
    nest = T)

Coefficients:
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                   -2.7914344  0.0598488 -46.641   <2e-16 ***
ns(year_citizenship, df = 3)1  1.8590049  0.0550277  33.783   <2e-16 ***
ns(year_citizenship, df = 3)2  4.5622655  0.1168268  39.052   <2e-16 ***
ns(year_citizenship, df = 3)3  2.8244562  0.0482147  58.581   <2e-16 ***
maleFemale                    -0.0470585  0.0270254  -1.741   0.0816 .  
racegAsian                     0.0394813  0.0300355   1.314   0.1887    
racegBlack                     0.0039847  0.0492522   0.081   0.9355    
racegother                     0.0006709  0.0463200   0.014   0.9884    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1.000903)

Number of Fisher Scoring iterations: 6

Spline model: The Spline model when parameters are added with three degrees of freedom are added to the year of citizenship shows an increase with citizenship attainment in year 2 and 3 with it being statically significant. With the hazard function increasing over time

Generate hazard plots for interesting cases highlighting the significant predictors in your analysis

newdat<-expand.grid(year_citizenship = unique(pp$year_citizenship),
raceg= unique(pp$raceg),
male = unique(pp$male),
education= unique(pp$education))

newdat<-newdat%>%
dplyr::filter(complete.cases(.))

newdat$h0<-predict(m0, newdata=newdat, type="response")
newdat$h1<-predict(m1, newdata=newdat, type="response")
newdat$h2<-predict(m2, newdata=newdat, type="response")
newdat$h3<-predict(m3, newdata=newdat, type="response")
newdat$h4<-predict(m4, newdata=newdat, type="response")
head(newdat)

  year_citizenship raceg   male     education        h0         h1        h2
1                2 Asian Female 5masteranddoc 0.2997373 0.06137019 0.1029028
2                3 Asian Female 5masteranddoc 0.2997373 0.21829037 0.1991975
3                4 Asian Female 5masteranddoc 0.2997373 0.43187075 0.3651875
4                5 Asian Female 5masteranddoc 0.2997373 0.57448839 0.6052893
5                6 Asian Female 5masteranddoc 0.2997373 0.67034614 0.8506785
6                7 Asian Female 5masteranddoc 0.2997373 0.99999990 0.9795576
          h3         h4
1 0.07795478 0.05905463
2 0.20238499 0.14034174
3 0.40866325 0.27006624
4 0.63861911 0.42583538
5 0.80685028 0.66455846
6 0.89085189 0.91623248

The table above shows variation in hazard times except for the constant model

library(magrittr)


Attaching package: 'magrittr'

The following object is masked from 'package:purrr':

    set_names

The following object is masked from 'package:tidyr':

    extract

library(data.table)


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

The following object is masked from 'package:purrr':

    transpose

out<-melt(setDT(newdat),
id = c("year_citizenship", "raceg", "education","male"),
measure.vars = list(haz=c("h0", "h1","h2","h3", "h4")))
head(out, n=20)

    year_citizenship raceg     education   male variable     value
 1:                2 Asian 5masteranddoc Female       h0 0.2997373
 2:                3 Asian 5masteranddoc Female       h0 0.2997373
 3:                4 Asian 5masteranddoc Female       h0 0.2997373
 4:                5 Asian 5masteranddoc Female       h0 0.2997373
 5:                6 Asian 5masteranddoc Female       h0 0.2997373
 6:                7 Asian 5masteranddoc Female       h0 0.2997373
 7:                2 White 5masteranddoc Female       h0 0.3034577
 8:                3 White 5masteranddoc Female       h0 0.3034577
 9:                4 White 5masteranddoc Female       h0 0.3034577
10:                5 White 5masteranddoc Female       h0 0.3034577
11:                6 White 5masteranddoc Female       h0 0.3034577
12:                7 White 5masteranddoc Female       h0 0.3034577
13:                2 other 5masteranddoc Female       h0 0.3118429
14:                3 other 5masteranddoc Female       h0 0.3118429
15:                4 other 5masteranddoc Female       h0 0.3118429
16:                5 other 5masteranddoc Female       h0 0.3118429
17:                6 other 5masteranddoc Female       h0 0.3118429
18:                7 other 5masteranddoc Female       h0 0.3118429
19:                2 Black 5masteranddoc Female       h0 0.2904518
20:                3 Black 5masteranddoc Female       h0 0.2904518

The table above also shows variation in hazard times when looking at citizenship year, education, race and gender

library(dplyr)

library(ggplot2)
out%>%
dplyr::filter(education == "4colgrad")%>%
dplyr::mutate(mod= dplyr::case_when(.$variable =="h0" ~ "Constant",

                                    .$variable =="h1" ~ "General",
.$variable =="h2" ~ "Linear",
.$variable =="h3" ~ "Quadratic",
.$variable =="h4" ~ "Spline"))%>%
ggplot(aes(x = year_citizenship*5, y=value ))+
geom_line(aes(group=raceg, color=raceg) )+
labs(title = "Hazard function for citizenship attainmentment",
subtitle = "Alternative model specifications")+
xlab("Age")+ylab("S(t)")+
  facet_wrap(~mod, scales= "free_y")

Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.

Hazard plot: Using the alternative time specifications, the plot shows when education is held constant for immigrants with a college degree, the constant, linear, quadratic, spline and general model shows the hazard function based on race an increase in the probability of attaining citizenship based on age and varying based of the race of an immigrant. I am not sure why my constant model is not a straight line but rather going up and down with the separate time intervals.

##model fit
AIC0<-AIC(m0)

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AIC1<-AIC(m1)

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AIC2<-AIC(m2)

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AIC3<-AIC(m3)

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AIC4<-AIC(m4)

Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AICS<-data.frame(AIC = c(AIC0["AIC"],AIC1["AIC"],AIC2["AIC"],AIC3["AIC"],AIC4["AIC"]),
mod = factor(c("const", "general", "linear", "quadratic", "spline")) )
AICS$mod<-forcats::fct_relevel(AICS$mod,
c("general", "const" , "linear", "quadratic", "spline"))

AICS$deltaAIC<-AICS$AIC - AICS$AIC[AICS$mod=="general"]
knitr::kable(AICS[, c("mod", "AIC", "deltaAIC")],
caption = "Relative AIC for alternative time specifications")

Relative AIC for alternative time specifications
mod	AIC	deltaAIC
const	37922.82	7580.283847
general	30342.54	0.000000
linear	30347.80	5.260393
quadratic	30119.77	-222.768259
spline	30544.33	201.798383

AIC: When using AIC to test which model fits the data best, the results show that the linear AIC fits the data best. Does it matter if the AIC number is a negative?