Homework 5 - Discreet Time Hazard Models, Alternative Time Parameterizations

In this assignment, I will be creating discreet time hazard models with the ELS:2002 data, but this time, focused on alternative time parameterizations.

The dependent variable, as usual, is community college entry after high school, the independent variables will be limited to race and sex for ease of interpretation:

Sex (Male/Female - as Factor)
Race (Race/Ethnicity as defined by the ELS:2002 dataset - also as factor)

Here I create my variables:

colnames(els)<-toupper(colnames(els))

#college entry variables

entry_variables<-els %>% select(STU_ID,SCH_ID,STRAT_ID,PSU,F2HS2P_P,F2PS1SEC,F3HS2PS1,F3PS1SEC,BYQXDATP,F1RDTLFT,BYSES1QU,F3QWT,BYP33,BYSEX,BYRACE,BYPARED,BYGPARED,F1RGPP2,BYMATHSE,BYSTEXP,BYPARASP)


entry_variables<-entry_variables %>% mutate(PARENT_EXPECTATION=ifelse(BYPARASP==1,"Will.Not.Graduate.HS",
                                                                  ifelse(BYPARASP==2,"HS.Grad.or.GED",
                                                                         ifelse(BYPARASP==3,"Expect.2.Yr.College",
                                                                                ifelse(BYPARASP==4,"4.Year.School.No.Deg",
                                                                                       ifelse(BYPARASP==5,"Graduate.4.Year",
                                                                                              ifelse(BYPARASP==6,"Masters.Degree",
                                                                                                     ifelse(BYPARASP==7,"PhD.Expected",
                                                                                                            ifelse(BYPARASP<0,NA,NA)))))))))


entry_variables<-entry_variables %>% mutate(STUDENT_EXPECTATION=ifelse(BYSTEXP==1,"Will.Not.Graduate.HS",
                                                                                        ifelse(BYSTEXP==2,"HS.Grad.or.GED",
                                                                                               ifelse(BYSTEXP==3,"Expect.2.Yr.College",
                                                                                                      ifelse(BYSTEXP==4,"4.Year.School.No.Deg",
                                                                                                             ifelse(BYSTEXP==5,"Graduate.4.Year",
                                                                                                                    ifelse(BYSTEXP==6,"Masters.Degree",
                                                                                                                           ifelse(BYSTEXP==7,"PhD.Expected",
                                                                                                                                         ifelse(BYPARED<0,NA,NA)))))))))

entry_variables<-entry_variables %>% mutate(MATH_SELF=ifelse(BYMATHSE<(-2),NA_real_,BYMATHSE))

entry_variables<-entry_variables %>% mutate(HS_GPA=ifelse(F1RGPP2<0,NA_real_,F1RGPP2))


entry_variables<-entry_variables %>% mutate(F2_months_to_college=ifelse(F2HS2P_P<0,NA_real_,F2HS2P_P))

entry_variables<-entry_variables %>% mutate(F3_months_to_college=ifelse(F3HS2PS1<0,NA_real_,F3HS2PS1))

entry_variables<-entry_variables %>% mutate(F2_first_college_2_year=ifelse(F2PS1SEC==4,1,
                                                                           ifelse(F2PS1SEC<0,NA_real_,
                                                                                  ifelse(F2PS1SEC<!0 & F2PS1SEC!=4,0,0))))

entry_variables<-entry_variables %>% mutate(F3_first_college_2_year=ifelse(F3PS1SEC==4,1,
                                                                           ifelse(F3PS1SEC<0,NA_real_,
                                                                                  ifelse(F3PS1SEC<!0 & F3PS1SEC!=4,0,0))))

entry_variables_complete<-entry_variables %>% filter(!is.na(F3_first_college_2_year))

entry_variables_complete<-entry_variables_complete %>% mutate(TV_ENTERTAIN_HRS=ifelse(BYP33<0,NA,
                                                                                      ifelse(BYP33>=0 & BYP33<=3,0,
                                                                                             ifelse(BYP33>=4 & BYP33<=10,1,NA))))

entry_variables_complete<-entry_variables_complete %>% mutate(sex=ifelse(BYSEX==1,"Male",
                                                                         ifelse(BYSEX==2,"Female",
                                                                                ifelse(BYSEX<0,NA,NA))))


entry_variables_complete<-entry_variables_complete %>% mutate(parents_highest_ed=ifelse(BYPARED==1,"Did.Not.Graduate.HS",
                                                                                        ifelse(BYPARED==2,"HS.Grad.or.GED",
                                                                                               ifelse(BYPARED==3,"2.Year.School.No.Deg",
                                                                                                      ifelse(BYPARED==4,"2.Year.School.Awarded.Deg",
                                                                                                             ifelse(BYPARED==5,"4.Year.School.No.Deg",
                                                                                                                    ifelse(BYPARED==6,"4.Year.School.Awarded.Deg",
                                                                                                                           ifelse(BYPARED==7,"Masters.Degree",
                                                                                                                                  ifelse(BYPARED==8,"PhD.or.other.doctorate",
                                                                                                                                         ifelse(BYPARED<0,NA,NA))))))))))

entry_variables_complete<-entry_variables_complete %>% mutate(PARENT_HS_OR_LOWER=ifelse(parents_highest_ed=="Did.Not.Graduate.HS" | parents_highest_ed=="HS.Grad.or.GED",1,0))

entry_variables_complete<-entry_variables_complete %>% mutate(F3_UPDATED_MONTHS_TO_COLLEGE=ifelse(is.na(F3_months_to_college),21,F3_months_to_college))

entry_variables_complete<-entry_variables_complete %>% mutate(BY_LOW_SES=ifelse(BYSES1QU==1,1,
                                                                                ifelse(BYSES1QU==2,1,
                                                                                       ifelse(BYSES1QU==3,0,
                                                                                              ifelse(BYSES1QU==4,0,
                                                                                                     ifelse(BYSES1QU<0,NA_real_,0))))))

entry_variables_complete<-entry_variables_complete %>% mutate(race=ifelse(BYRACE==1,"Am.Indian",
                                                                          ifelse(BYRACE==2,"Asian/Pacific",
                                                                                 ifelse(BYRACE==3,"Afr.American",
                                                                                        ifelse(BYRACE==4|BYRACE==5,"Hispanic",
                                                                                               ifelse(BYRACE==6,"2orMoreRace",
                                                                                                      ifelse(BYRACE==7,"White",NA)))))))

entry_variables_complete<-entry_variables_complete %>% mutate(grandparents_highest_ed=ifelse(BYGPARED==1,"Did.Not.Graduate.HS",
                                                                                             ifelse(BYGPARED==2,"HS.Grad.or.GED",
                                                                                                    ifelse(BYGPARED==3,"2.Year.School.No.Deg",
                                                                                                           ifelse(BYGPARED==4,"2.Year.School.Awarded.Deg",
                                                                                                                  ifelse(BYGPARED==5,"4.Year.School.No.Deg",
                                                                                                                         ifelse(BYGPARED==6,"4.Year.School.Awarded.Deg",
                                                                                                                                ifelse(BYGPARED==7,"Masters.Degree",
                                                                                                                                       ifelse(BYGPARED==8,"PhD.or.other.doctorate",
                                                                                                                                              ifelse(BYGPARED<0,NA,NA))))))))))


entry_variables_complete<-entry_variables_complete %>% filter(!is.na(parents_highest_ed) & !is.na(PARENT_EXPECTATION) &!is.na(STUDENT_EXPECTATION) & !is.na(race) & !is.na(sex) & !is.na(MATH_SELF) & !is.na(HS_GPA))

entry_var_clean<-entry_variables_complete %>% select(STU_ID,STRAT_ID,PSU,F3_first_college_2_year,F3_UPDATED_MONTHS_TO_COLLEGE,sex,parents_highest_ed,BY_LOW_SES,race,F3QWT,PARENT_EXPECTATION,STUDENT_EXPECTATION,MATH_SELF,HS_GPA)

Person-Period Data

Here, I create person-period data from the ELS:2002 dataset.

entry_var_wide<-dcast(entry_var_clean,STU_ID~F3_UPDATED_MONTHS_TO_COLLEGE,value.var="F3_first_college_2_year")
entry_var_wide[is.na(entry_var_wide)]<-0

entry_var_long<-reshape2::melt(entry_var_wide,id.vars="STU_ID",variable.name="time",value.name="cc_entry")
entry_var_long<-entry_var_long %>% arrange(STU_ID)
entry_var_long<-merge(entry_var_long,entry_var_clean,entry_var_clean[c(1:3,6:12)],by.x="STU_ID",by.y="STU_ID")

head(entry_var_long)

##   STU_ID time cc_entry STRAT_ID PSU F3_first_college_2_year
## 1 101101    0        0      101   1                       1
## 2 101101    1        0      101   1                       1
## 3 101101    2        0      101   1                       1
## 4 101101    3        0      101   1                       1
## 5 101101    4        0      101   1                       1
## 6 101101    5        0      101   1                       1
##   F3_UPDATED_MONTHS_TO_COLLEGE    sex   parents_highest_ed BY_LOW_SES     race
## 1                           18 Female 4.Year.School.No.Deg          1 Hispanic
## 2                           18 Female 4.Year.School.No.Deg          1 Hispanic
## 3                           18 Female 4.Year.School.No.Deg          1 Hispanic
## 4                           18 Female 4.Year.School.No.Deg          1 Hispanic
## 5                           18 Female 4.Year.School.No.Deg          1 Hispanic
## 6                           18 Female 4.Year.School.No.Deg          1 Hispanic
##      F3QWT PARENT_EXPECTATION STUDENT_EXPECTATION MATH_SELF HS_GPA
## 1 185.9702    Graduate.4.Year Expect.2.Yr.College    -1.118      2
## 2 185.9702    Graduate.4.Year Expect.2.Yr.College    -1.118      2
## 3 185.9702    Graduate.4.Year Expect.2.Yr.College    -1.118      2
## 4 185.9702    Graduate.4.Year Expect.2.Yr.College    -1.118      2
## 5 185.9702    Graduate.4.Year Expect.2.Yr.College    -1.118      2
## 6 185.9702    Graduate.4.Year Expect.2.Yr.College    -1.118      2

With the person-period data now complete, we can create a survey design:

des1<-svydesign(ids=~STU_ID,strata = ~STRAT_ID,weights = ~F3QWT,data = entry_var_long,nest=T)
options(survey.lonely.psu = "adjust")

Main Effects

Now that we have a survey design, we can compute the main effects of a parameterized model by sex and race:

fit1<-svyglm(cc_entry~as.factor(time)+sex+race,design=des1,family=binomial(link = cloglog))

This method turns the time (in months) into factors that become coefficients along with the other variables:

summary(fit1)

## 
## Call:
## svyglm(formula = cc_entry ~ as.factor(time) + sex + race, design = des1, 
##     family = binomial(link = cloglog))
## 
## Survey design:
## svydesign(ids = ~STU_ID, strata = ~STRAT_ID, weights = ~F3QWT, 
##     data = entry_var_long, nest = T)
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -4.18768    0.16618 -25.200  < 2e-16 ***
## as.factor(time)1  -0.38851    0.19974  -1.945  0.05181 .  
## as.factor(time)2   1.34235    0.14695   9.135  < 2e-16 ***
## as.factor(time)3   2.12098    0.13930  15.226  < 2e-16 ***
## as.factor(time)4  -0.06831    0.18723  -0.365  0.71524    
## as.factor(time)5  -1.88967    0.38127  -4.956 7.38e-07 ***
## as.factor(time)6  -2.75070    0.46465  -5.920 3.39e-09 ***
## as.factor(time)7  -0.52788    0.21306  -2.478  0.01326 *  
## as.factor(time)8  -0.19210    0.19019  -1.010  0.31252    
## as.factor(time)9  -1.75842    0.33713  -5.216 1.89e-07 ***
## as.factor(time)10 -2.12453    0.37597  -5.651 1.67e-08 ***
## as.factor(time)11 -2.38099    0.39500  -6.028 1.76e-09 ***
## as.factor(time)12 -2.15305    0.41002  -5.251 1.56e-07 ***
## as.factor(time)13 -1.98319    0.35539  -5.580 2.50e-08 ***
## as.factor(time)14 -0.87931    0.21598  -4.071 4.73e-05 ***
## as.factor(time)15 -0.29621    0.19164  -1.546  0.12225    
## as.factor(time)16 -1.70130    0.32299  -5.267 1.43e-07 ***
## as.factor(time)17 -3.21739    0.52956  -6.076 1.31e-09 ***
## as.factor(time)18 -2.95252    0.47818  -6.175 7.05e-10 ***
## as.factor(time)19 -1.42337    0.27962  -5.090 3.68e-07 ***
## as.factor(time)20 -1.59630    0.29546  -5.403 6.81e-08 ***
## as.factor(time)21  1.41343    0.14638   9.656  < 2e-16 ***
## sexMale            0.03259    0.04450   0.732  0.46408    
## raceAfr.American  -0.07672    0.12927  -0.594  0.55286    
## raceAm.Indian      0.33371    0.23397   1.426  0.15383    
## raceAsian/Pacific -0.29609    0.13305  -2.225  0.02609 *  
## raceHispanic       0.35949    0.11806   3.045  0.00234 ** 
## raceWhite         -0.09171    0.11180  -0.820  0.41208    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9901415)
## 
## Number of Fisher Scoring iterations: 9

Sex remains a non-significant variable, while as in previous studies, hispanic ethnicity is now both positive and significant for first entry into community college.

General model vs. Alternative Time Specifications

Now we can test to see if the data is more suited for a general model, or some other time specification (constant, linear, squared, cubed, quadratic and natural spline):

fit.constant<-svyglm(F3_first_college_2_year~race+sex,design=des1,family="binomial")

fit.0<-svyglm(F3_first_college_2_year~as.factor(F3_UPDATED_MONTHS_TO_COLLEGE)-1+race+sex,design=des1,family="binomial")

#linear
fit.lin<-svyglm(F3_first_college_2_year~F3_UPDATED_MONTHS_TO_COLLEGE+race+sex,design=des1,family="binomial")

#squared
fit.sq<-svyglm(formula=F3_first_college_2_year~F3_UPDATED_MONTHS_TO_COLLEGE+I(F3_UPDATED_MONTHS_TO_COLLEGE^2)+race+sex,design=des1,family="binomial")

fit.cubic<-svyglm(formula=F3_first_college_2_year~F3_UPDATED_MONTHS_TO_COLLEGE+I(F3_UPDATED_MONTHS_TO_COLLEGE^2)+I(F3_UPDATED_MONTHS_TO_COLLEGE^3)+race+sex,design=des1,family="binomial")

fit.quadratic<-svyglm(formula=F3_first_college_2_year~F3_UPDATED_MONTHS_TO_COLLEGE+I(F3_UPDATED_MONTHS_TO_COLLEGE^2)+I(F3_UPDATED_MONTHS_TO_COLLEGE^3)+I(F3_UPDATED_MONTHS_TO_COLLEGE^4)+race+sex,design=des1,family="binomial")

fit.splines<-svyglm(formula=F3_first_college_2_year~ns(F3_UPDATED_MONTHS_TO_COLLEGE,df=4)+race+sex,design=des1,family="binomial")

plot_alternative<-expand.grid(month=seq(1,21,1),race=unique(entry_var_long$race),sex=unique(entry_var_long$sex))
plot_alternative<-plot_alternative %>% mutate(F3_UPDATED_MONTHS_TO_COLLEGE=month)
plot_alternative$constant<-predict(fit.constant,newdata=plot_alternative,type="response")
plot_alternative$general<-predict(fit.0,newdata =plot_alternative,type="response")
plot_alternative$linear<-predict(fit.lin,newdata=plot_alternative,type="response")
plot_alternative$squared<-predict(fit.sq,newdata=plot_alternative,type="response")
plot_alternative$cubed<-predict(fit.cubic,newdata=plot_alternative,type="response")
plot_alternative$quadratic<-predict(fit.quadratic,newdata=plot_alternative,type="response")
plot_alternative$splines<-predict(fit.splines,newdata=plot_alternative,type="response")

plot_alternative$F3_UPDATED_MONTHS_TO_COLLEGE<-NULL

head(plot_alternative,n=20)

##    month     race    sex  constant   general    linear   squared     cubed
## 1      1 Hispanic Female 0.4821913 0.5225196 0.3996129 0.3714531 0.4444030
## 2      2 Hispanic Female 0.4821913 0.3516407 0.4129747 0.4013275 0.4126264
## 3      3 Hispanic Female 0.4821913 0.3856909 0.4264660 0.4301749 0.3988449
## 4      4 Hispanic Female 0.4821913 0.3412803 0.4400676 0.4577035 0.4003479
## 5      5 Hispanic Female 0.4821913 0.4861375 0.4537599 0.4836816 0.4149490
## 6      6 Hispanic Female 0.4821913 0.5034060 0.4675224 0.5079356 0.4407592
## 7      7 Hispanic Female 0.4821913 0.6494684 0.4813345 0.5303464 0.4758068
## 8      8 Hispanic Female 0.4821913 0.6769173 0.4951752 0.5508417 0.5177267
## 9      9 Hispanic Female 0.4821913 0.7500708 0.5090233 0.5693886 0.5636664
## 10    10 Hispanic Female 0.4821913 0.5518260 0.5228576 0.5859845 0.6104704
## 11    11 Hispanic Female 0.4821913 0.5794448 0.5366569 0.6006491 0.6550746
## 12    12 Hispanic Female 0.4821913 0.5645215 0.5504003 0.6134170 0.6949216
## 13    13 Hispanic Female 0.4821913 0.6909953 0.5640671 0.6243314 0.7281979
## 14    14 Hispanic Female 0.4821913 0.6945516 0.5776373 0.6334387 0.7538048
## 15    15 Hispanic Female 0.4821913 0.7161738 0.5910914 0.6407840 0.7711015
## 16    16 Hispanic Female 0.4821913 0.6755936 0.6044103 0.6464080 0.7795211
## 17    17 Hispanic Female 0.4821913 0.5942846 0.6175761 0.6503444 0.7781296
## 18    18 Hispanic Female 0.4821913 0.4388060 0.6305716 0.6526181 0.7651434
## 19    19 Hispanic Female 0.4821913 0.7887922 0.6433804 0.6532440 0.7374052
## 20    20 Hispanic Female 0.4821913 0.6740807 0.6559875 0.6522261 0.6899515
##    quadratic   splines
## 1  0.4645824 0.6587537
## 2  0.3911810 0.5993873
## 3  0.3717604 0.5396623
## 4  0.3900977 0.4842199
## 5  0.4352176 0.4371941
## 6  0.4972912 0.4017056
## 7  0.5655618 0.3797416
## 8  0.6297328 0.3720461
## 9  0.6825281 0.3794672
## 10 0.7206329 0.4035334
## 11 0.7436294 0.4463707
## 12 0.7523365 0.5079153
## 13 0.7477053 0.5784704
## 14 0.7305692 0.6451170
## 15 0.7021564 0.6971134
## 16 0.6651391 0.7276681
## 17 0.6247381 0.7340025
## 18 0.5891516 0.7200376
## 19 0.5689567 0.6894043
## 20 0.5760680 0.6453666

Now we can reshape the data into long format for use in plots:

library(data.table)
library(magrittr)

out<-melt(setDT(plot_alternative),id=c("month","race","sex"),measure.vars = list(haz=c("constant","general","linear","squared","cubed","quadratic","splines")))
head(out,n=20)

##     month     race    sex variable     value
##  1:     1 Hispanic Female constant 0.4821913
##  2:     2 Hispanic Female constant 0.4821913
##  3:     3 Hispanic Female constant 0.4821913
##  4:     4 Hispanic Female constant 0.4821913
##  5:     5 Hispanic Female constant 0.4821913
##  6:     6 Hispanic Female constant 0.4821913
##  7:     7 Hispanic Female constant 0.4821913
##  8:     8 Hispanic Female constant 0.4821913
##  9:     9 Hispanic Female constant 0.4821913
## 10:    10 Hispanic Female constant 0.4821913
## 11:    11 Hispanic Female constant 0.4821913
## 12:    12 Hispanic Female constant 0.4821913
## 13:    13 Hispanic Female constant 0.4821913
## 14:    14 Hispanic Female constant 0.4821913
## 15:    15 Hispanic Female constant 0.4821913
## 16:    16 Hispanic Female constant 0.4821913
## 17:    17 Hispanic Female constant 0.4821913
## 18:    18 Hispanic Female constant 0.4821913
## 19:    19 Hispanic Female constant 0.4821913
## 20:    20 Hispanic Female constant 0.4821913

Hazard and Survival Plots for race and sex using with different parameterizations

Here I've created hazard and survival plots for race and sex using the different parameterizations set out above:

out %>%
  ggplot(aes(x=month,y=value))+geom_line(aes(group=race,color=race))+facet_wrap(~variable)

## Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.

out %>% filter(race=="Hispanic") %>%
  ggplot(aes(x=month,y=value))+geom_line(aes(group=sex,color=sex))+facet_wrap(~variable)

## Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.

The efficiency of these models can be tested using the AIC method:

library(knitr)

AIC.constant<-AIC(fit.constant)
AIC.general<-AIC(fit.0)
AIC.linear<-AIC(fit.lin)
AIC.sqare<-AIC(fit.sq)
AIC.cubic<-AIC(fit.cubic)
AIC.quad<-AIC(fit.quadratic)
AIC.spline<-AIC(fit.splines)

AIC_df<-data.frame(AIC=c(AIC.constant["AIC"],AIC.general["AIC"],AIC.linear["AIC"],AIC.sqare["AIC"],AIC.cubic["AIC"],AIC.quad["AIC"],AIC.spline["AIC"]),mod=factor(c("constant","general","linear","squared","cubic","quad","spline")))

AIC_df$deltaAIC<-AIC_df$AIC-AIC_df$AIC[AIC_df$mod=="general"]

kable(AIC_df,align = 'c')

AIC	mod	deltaAIC
183535.5	constant	6640.1380
176895.4	general	0.0000
179294.3	linear	2398.9061
179115.1	squared	2219.6938
177751.8	cubic	856.4052
176736.6	quad	-158.7955
176441.1	spline	-454.2634

We see that the Spline model is actually somewhat more efficient than the general model.

Hazard and Survival Plots

out %>% filter(variable=="splines") %>% group_by(race,sex) %>%
  dplyr::mutate(bigH=cumsum(value)) %>% ggplot(aes(x=month,y=bigH)) + geom_line(aes(group=race,color=race)) + facet_wrap(~sex) + ggtitle("Hazard Plot by Race and Sex for CC Entry")

out %>% filter(variable=="splines") %>% group_by(race,sex) %>%
  dplyr::mutate(bigH=cumsum(value),surv=exp(-cumsum(value))) %>% ggplot(aes(x=month,y=surv)) + geom_line(aes(group=race,color=race))+facet_wrap(~sex)+ ggtitle("Survival Plot by Race and Sex for CC Entry")

Conclusions

Hispanic ethnicity continues to be a significant factor in community college enrollment out of high school
Sex remains a non-significant factor
The natural spline model appears to be the most efficient of parameterizations, per the AIC method

Naslund Homework 5

Karlerik Naslund

October 20, 2020