library(haven)
library(foreign)
library(readr)
library(dplyr)
library(ggplot2)
library(broom)
library(car)
library(MASS)
library(lmtest)
library(alr3)
library(zoo)
library(nortest)
library(plotrix)
library(scales)
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
library(VGAM)
library(stargazer)
library(sandwich)
library(pastecs)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)
library(reshape2)
library(data.table)
library(magrittr)
nlsy97mig<-read.csv("C:\\Users\\Jaire\\OneDrive\\Desktop\\Research\\Data\\nlsy97mig.csv")
In preparation for an upcoming research study, I conduct a secondary assessment of the risk of migration during the Great Recession among young adults, using a discrete time hazard model and longitudinal data from the National Longitudinal Survey of Youth 1997 (2007-2009).
Because the data cover fewer than 5 waves, I compare general, linear, and constant specifications of time.
The event of interest is migration, measured here as the survey respondent having moved from their previous state, city, or county since the date of the last interview.
Based on the discrete time model estimated in part 1, I use the predictors that were significant there: a four-year college degree or higher, being married, and Hispanic ethnicity.
Below I recode all of the part 1 predictors, but only the three listed above enter the models; the specifications are sketched after this paragraph.
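For reference, the discrete time hazard being modeled and the three time specifications can be written as follows (a standard formulation, not copied from part 1):

$$h(t_j \mid \mathbf{x}) = \Pr(T = t_j \mid T \ge t_j, \mathbf{x})$$

$$\operatorname{logit}\, h(t_j \mid \mathbf{x}) =
\begin{cases}
\alpha_j + \beta_1\,\mathrm{college4} + \beta_2\,\mathrm{married} + \beta_3\,\mathrm{hisp} & \text{(general)}\\
\alpha_0 + \alpha_1 t_j + \beta_1\,\mathrm{college4} + \beta_2\,\mathrm{married} + \beta_3\,\mathrm{hisp} & \text{(linear)}\\
\alpha + \beta_1\,\mathrm{college4} + \beta_2\,\mathrm{married} + \beta_3\,\mathrm{hisp} & \text{(constant)}
\end{cases}$$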
# Age at interview
nlsy97mig$age1<-ifelse(nlsy97mig$ragem07==-5, NA,nlsy97mig$ragem07/12)
nlsy97mig$age2<-ifelse(nlsy97mig$ragem08==-5, NA,nlsy97mig$ragem08/12)
nlsy97mig$age3<-ifelse(nlsy97mig$ragem09==-5, NA,nlsy97mig$ragem09/12)
# Migration since last interview
# -4 and -5 are missing-data codes
nlsy97mig$mig1<-ifelse(nlsy97mig$rmig07 %in% c(-4,-5), NA, nlsy97mig$rmig07)
nlsy97mig$mig2<-ifelse(nlsy97mig$rmig08 %in% c(-4,-5), NA, nlsy97mig$rmig08)
nlsy97mig$mig3<-ifelse(nlsy97mig$rmig09 %in% c(-4,-5), NA, nlsy97mig$rmig09)
# Sex
nlsy97mig$male<-recode(nlsy97mig$rsex, recodes = "1=1; 2=0")
# Hispanic
nlsy97mig$hisp<-recode(nlsy97mig$rrace, recodes = "1=0; 2=1; 3=0;4=0")
#Highest Level of Education 4-year degree or higher
nlsy97mig$college4<-recode(nlsy97mig$reduc07, recodes = "-5=NA;-2=NA;1:4=0;5:8=1")
#Income in the past year (missing codes set to NA)
nlsy97mig$income<-recode(nlsy97mig$rinc07, recodes = "-5=NA;-4=NA;-2=NA;-1=NA")
#Marital Status - married
nlsy97mig$married<-recode(nlsy97mig$rmar07, recodes = "-5=NA;-3=NA;0=0;1=1;2:4=0")
#Healthy - very good or excellent
nlsy97mig$healthy<-recode(nlsy97mig$rheal07,recodes = "-5=NA;-2=NA;1:2=1;3:5=0")
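A quick spot-check that the recodes map the raw codes as intended (a sketch; the cross-tabulations should contain no stray negative codes):

with(nlsy97mig, table(rsex, male, useNA = "ifany"))
with(nlsy97mig, table(rrace, hisp, useNA = "ifany"))
with(nlsy97mig, table(reduc07, college4, useNA = "ifany"))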
#subset censoring
nlsy97mig<-subset(nlsy97mig, is.na(mig1)==F&is.na(mig2)==F&
is.na(mig3)==F&is.na(age1)==F&
is.na(age2)==F&is.na(age3)==F&mig1!=1)
# Reshape
e.long<-reshape(data.frame(nlsy97mig), idvar = "rid",
varying = list(c("age1", "age2"),
c("age2", "age3")),
v.names = c("age_enter", "age_exit"),
times = 1:2, direction = "long")
e.long<-e.long[order(e.long$rid, e.long$time),]
e.long$migtran<-NA
e.long$migtran[e.long$mig1==0&e.long$mig2==1&e.long$time==1]<-1
e.long$migtran[e.long$mig2==0&e.long$mig3==1&e.long$time==2]<-1
e.long$migtran[e.long$mig1==0&e.long$mig2==0&e.long$time==1]<-0
e.long$migtran[e.long$mig2==0&e.long$mig3==0&e.long$time==2]<-0
#Remove person-periods where the respondent is no longer at risk (migrated in an earlier period) or the transition is undefined
failed1<-which(is.na(e.long$migtran))
e.long<-e.long[-failed1,]
e.long$age1r<-round(e.long$age_enter,0)
e.long$age2r<-round(e.long$age_exit,0)
e.long$time_start<-e.long$time-1
head(e.long, n=10)
## rid wht psu strat rsex rrace rbirthyr rmar07 reduc07 rinc07 rheal07
## 26.1 26 125910 1 35 1 1 1980 0 1 -4 3
## 27.1 27 114180 1 35 1 1 1981 0 1 3000 2
## 60.1 60 281400 2 35 1 4 1984 0 3 3000 2
## 68.1 68 102999 1 35 1 1 1981 0 3 -4 2
## 70.2 70 94241 1 35 1 1 1982 0 5 -2 3
## 87.1 87 238852 2 37 1 2 1984 0 4 -4 2
## 100.2 100 101111 1 37 2 1 1981 0 1 23000 3
## 104.2 104 336171 2 37 2 4 1980 0 5 44000 3
## 113.2 113 299852 1 37 1 4 1984 0 5 5000 1
## 150.1 150 120418 1 34 1 1 1984 0 3 13000 2
## rage07 rage08 rage09 rmig07 rmig08 rmig09 ragem07 ragem08 ragem09 rcit07
## 26.1 27 28 29 0 0 -4 328 341 348 1
## 27.1 26 27 28 0 0 -4 317 329 341 1
## 60.1 24 24 26 0 1 1 288 297 312 1
## 68.1 26 27 28 0 0 -4 323 335 344 2
## 70.2 25 26 27 -4 0 0 302 315 326 1
## 87.1 23 24 25 0 0 -4 285 296 306 1
## 100.2 26 27 28 -4 0 0 323 334 345 1
## 104.2 26 28 28 -4 0 0 324 336 346 1
## 113.2 23 24 25 -4 0 0 282 297 306 1
## 150.1 23 24 25 0 0 1 284 297 307 1
## rcit08 rcit09 ripr07 ripr08 ripr09 rhhs07 rhhs08 rhhs09 remp07 remp08
## 26.1 1 1 75 -3 18 1 1 1 5 5
## 27.1 1 1 25 76 16 8 4 4 200501 200501
## 60.1 1 1 1721 649 233 5 5 10 5 200701
## 68.1 2 2 142 160 256 1 1 1 5 200801
## 70.2 1 1 -3 -3 -3 3 1 1 200701 200701
## 87.1 1 1 6 -3 -3 3 3 3 5 200701
## 100.2 1 1 205 0 10 4 3 3 200602 200602
## 104.2 1 1 574 1116 -3 5 2 2 200203 200203
## 113.2 1 1 1721 0 300 6 3 3 200602 5
## 150.1 1 1 226 -3 241 2 5 1 4 200802
## remp09 mig1 mig2 mig3 male hisp college4 income married healthy time
## 26.1 2 0 0 -4 1 0 0 NA 0 0 1
## 27.1 200501 0 0 -4 1 0 0 3000 0 1 1
## 60.1 5 0 1 1 1 0 0 3000 0 1 1
## 68.1 200801 0 0 -4 1 0 0 NA 0 1 1
## 70.2 200701 -4 0 0 1 0 1 NA 0 0 2
## 87.1 200701 0 0 -4 1 1 0 NA 0 1 1
## 100.2 200802 -4 0 0 0 0 0 23000 0 0 2
## 104.2 200802 -4 0 0 0 0 1 44000 0 0 2
## 113.2 5 -4 0 0 1 0 1 5000 0 1 2
## 150.1 200901 0 0 1 1 0 0 13000 0 1 1
## age_enter age_exit migtran age1r age2r time_start
## 26.1 27.33333 28.41667 0 27 28 0
## 27.1 26.41667 27.41667 0 26 27 0
## 60.1 24.00000 24.75000 1 24 25 0
## 68.1 26.91667 27.91667 0 27 28 0
## 70.2 26.25000 27.16667 0 26 27 1
## 87.1 23.75000 24.66667 0 24 25 0
## 100.2 27.83333 28.75000 0 28 29 1
## 104.2 28.00000 28.83333 0 28 29 1
## 113.2 24.75000 25.50000 0 25 26 1
## 150.1 23.66667 24.75000 0 24 25 0
e.long%>%
group_by(time)%>%
summarise(prop_event= mean(migtran, na.rm=T))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## time prop_event
## <int> <dbl>
## 1 1 0.308
## 2 2 0.296
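As an aside, the two period-specific hazards above imply the cumulative probability of migrating at least once over the observation window (a quick calculation from the printed proportions):

h <- c(0.308, 0.296)   # period-specific hazards from the table above
1 - prod(1 - h)        # cumulative probability of migrating, roughly 0.51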
Here we can see that the overall risk of migration is roughly one percentage point higher in the first period (0.308) than in the second (0.296). Below is a simple descriptive analysis of the outcome by characteristics of the respondents:
e.long%>%
filter(complete.cases(college4, married, healthy, hisp, income,male))%>%
group_by(time, college4, married, healthy, hisp,income, male)%>%
summarise(prop_event= mean(migtran, na.rm=T))
## `summarise()` regrouping output by 'time', 'college4', 'married', 'healthy', 'hisp', 'income' (override with `.groups` argument)
## # A tibble: 704 x 8
## # Groups: time, college4, married, healthy, hisp, income [582]
## time college4 married healthy hisp income male prop_event
## <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 0 0 0 0 350 0 0
## 2 1 0 0 0 0 400 1 0
## 3 1 0 0 0 0 500 0 1
## 4 1 0 0 0 0 1000 0 0.5
## 5 1 0 0 0 0 1000 1 0
## 6 1 0 0 0 0 1200 0 0
## 7 1 0 0 0 0 2000 0 0
## 8 1 0 0 0 0 2000 1 1
## 9 1 0 0 0 0 2200 0 1
## 10 1 0 0 0 0 2500 0 0
## # ... with 694 more rows
Because income enters the grouping as a continuous variable, most cells contain only a handful of respondents, so these proportions vary widely across combinations of characteristics; a coarser version is sketched below.
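A minimal sketch of that coarser table, dichotomizing income at an assumed cutoff near the 2007 median household income (about $50,000; an illustrative value, not taken from the data):

med_inc_2007 <- 50000   # assumed, illustrative cutoff near the 2007 median household income
e.long %>%
  mutate(below_median = as.numeric(income < med_inc_2007)) %>%
  group_by(time, college4, married, healthy, hisp, below_median, male) %>%
  summarise(prop_event = mean(migtran, na.rm = TRUE), n = n(), .groups = "drop")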
options("survey.adjust.domain.lonely"=T)
options("survey.lonely.psu"='adjust')
options(survey.lonely.psu = "adjust")
e.long<-e.long%>%
filter(complete.cases(psu,college4, married, healthy, hisp, income,male))
des<-svydesign(nest = TRUE,ids =~psu,
strata =~strat, weights =~wht,
data = e.long)
svymean(~strat, design=des)
## mean SE
## strat 39.27 1.3021
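The design object can also be used to estimate the weighted period-specific hazard directly, before adding covariates; a small sketch:

svyby(~migtran, ~time, design = des, svymean, na.rm = TRUE)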
Here I estimate the general (m1), constant (m0), and linear (m2) time specifications, each with the three significant predictors.
m1<-svyglm(migtran~factor(time)+college4+married+hisp,
design=des, family=binomial (link="logit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
m0<-svyglm(migtran~ college4+married+hisp,
design=des, family=binomial (link="logit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
m2<-svyglm(migtran~time+college4+married+hisp, design=des,
family=binomial (link="logit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(m0)
##
## Call:
## svyglm(formula = migtran ~ college4 + married + hisp, design = des,
## family = binomial(link = "logit"))
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.7811 0.1123 -6.954 4.49e-10 ***
## college4 0.4874 0.1773 2.748 0.00717 **
## married 0.4435 0.1875 2.365 0.02006 *
## hisp -0.3406 0.1641 -2.075 0.04068 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000003)
##
## Number of Fisher Scoring iterations: 4
summary(m1)
##
## Call:
## svyglm(formula = migtran ~ factor(time) + college4 + married +
## hisp, design = des, family = binomial(link = "logit"))
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.80700 0.13279 -6.077 2.6e-08 ***
## factor(time)2 0.06353 0.13456 0.472 0.63791
## college4 0.49050 0.17931 2.735 0.00745 **
## married 0.44540 0.18809 2.368 0.01993 *
## hisp -0.34195 0.16385 -2.087 0.03960 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9998748)
##
## Number of Fisher Scoring iterations: 4
summary(m2)
##
## Call:
## svyglm(formula = migtran ~ time + college4 + married + hisp,
## design = des, family = binomial(link = "logit"))
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.87054 0.23515 -3.702 0.00036 ***
## time 0.06353 0.13456 0.472 0.63791
## college4 0.49050 0.17931 2.735 0.00745 **
## married 0.44540 0.18809 2.368 0.01993 *
## hisp -0.34195 0.16385 -2.087 0.03960 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9998748)
##
## Number of Fisher Scoring iterations: 4
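The logit coefficients are easier to read as odds ratios; a short sketch for the general-time model m1 (Wald-type intervals from confint):

round(exp(cbind(OR = coef(m1), confint(m1))), 3)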
#D. Check of Model Fit:
AIC0<-AIC(m0)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC1<-AIC(m1)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC2<-AIC(m2)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AICS<-data.frame(AIC = c(AIC0["AIC"],AIC1["AIC"],AIC2["AIC"]),
mod = factor(c("const", "general", "linear")))
AICS$mod<-forcats::fct_relevel(AICS$mod, c("general", "const" , "linear"))
AICS$deltaAIC<-AICS$AIC-AICS$AIC[AICS$mod=="general"]
knitr::kable(AICS[, c("mod", "AIC", "deltaAIC")], caption = "Relative AIC for alternative time specifications")
Table: Relative AIC for alternative time specifications

| mod     | AIC      | deltaAIC  |
|---------|----------|-----------|
| const   | 1227.242 | -1.662878 |
| general | 1228.904 | 0.000000  |
| linear  | 1228.904 | 0.000000  |
Here the model fit comparison shows that the constant specification actually has the lowest AIC, but its advantage over the general and linear specifications is small (less than 2 AIC points), so all three time specifications fit essentially equally well.
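How small these differences are can also be expressed as Akaike weights; a quick sketch using the AIC values from the table:

aics  <- c(const = 1227.242, general = 1228.904, linear = 1228.904)  # from the table above
delta <- aics - min(aics)
round(exp(-delta/2) / sum(exp(-delta/2)), 2)   # Akaike weights, roughly 0.53/0.23/0.23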
newdat<-expand.grid(time = unique(e.long$time),
hisp= unique(e.long$hisp),
college4 = unique(e.long$college4),
married = unique(e.long$married))
newdat<-newdat%>%
dplyr::filter(complete.cases(.))
newdat$h0<-predict(m0, newdata=newdat, type="response")
newdat$h1<-predict(m1, newdata=newdat, type="response")
newdat$h2<-predict(m2, newdata=newdat, type="response")
head(newdat)
## time hisp college4 married h0 h1 h2
## 1 1 0 0 0 0.3140737 0.3085296 0.3085296
## 2 2 0 0 0 0.3140737 0.3222458 0.3222458
## 3 1 1 0 0 0.2456923 0.2406807 0.2406807
## 4 2 1 0 0 0.2456923 0.2524819 0.2524819
## 5 1 0 1 0 0.4270855 0.4215274 0.4215274
## 6 2 0 1 0 0.4270855 0.4370917 0.4370917
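These fitted hazards can also be plotted by covariate profile; a minimal ggplot2 sketch using the general-time predictions (h1):

newdat$profile <- interaction(newdat$college4, newdat$married, newdat$hisp)
ggplot(newdat, aes(x = time, y = as.numeric(h1), group = profile, colour = profile)) +
  geom_line() +
  labs(x = "Period", y = "h(t)", colour = "college4.married.hisp",
       title = "Predicted migration hazard by covariate profile")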
#Basic logistic model with ONLY time in the model
fit.0<-svyglm(migtran~as.factor(time)-1,design=des , family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.l<-svyglm(migtran~time,design=des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.s<-svyglm(migtran~time+I(time^2),design=des , family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.c<-svyglm(migtran~time+I(time^2)+I(time^3 ),design=des , family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)
##
## Call:
## svyglm(formula = migtran ~ as.factor(time) - 1, design = des,
## family = "binomial")
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(time)1 -0.62020 0.09908 -6.259 1.05e-08 ***
## as.factor(time)2 -0.58545 0.11593 -5.050 2.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.001049)
##
## Number of Fisher Scoring iterations: 4
# Period-specific hazards: inverse logit of the time-dummy coefficients
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,2,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for Migration")
# Survival function: S(t) is the running product of (1 - h(j)) for j <= t
St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
  St[i]<-St[i-1]*(1-haz[i])
}
plot(y=St, x=time, type="l", main="Survival function for Great Recession Migration")
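Equivalently, the survival loop can be replaced by a vectorized product, which serves as a handy check:

St_alt <- cumprod(1 - haz)      # S(t) = product over j <= t of (1 - h(j))
all.equal(unname(St_alt), St)   # should be TRUE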
summary(fit.l)
##
## Call:
## svyglm(formula = migtran ~ time, design = des, family = "binomial")
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.65495 0.19952 -3.283 0.00143 **
## time 0.03475 0.12964 0.268 0.78925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.001049)
##
## Number of Fisher Scoring iterations: 4
summary(fit.s)
##
## Call:
## svyglm(formula = migtran ~ time + I(time^2), design = des, family = "binomial")
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.65495 0.19952 -3.283 0.00143 **
## time 0.03475 0.12964 0.268 0.78925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.001049)
##
## Number of Fisher Scoring iterations: 4
summary(fit.c)
##
## Call:
## svyglm(formula = migtran ~ time + I(time^2) + I(time^3), design = des,
## family = "binomial")
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.65495 0.19952 -3.283 0.00143 **
## time 0.03475 0.12964 0.268 0.78925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.001049)
##
## Number of Fisher Scoring iterations: 4
dat<-expand.grid(year=seq(1,2))
# supply the 'time' variable the fitted models expect rather than relying on a global object
dat$genmod<-predict(fit.0, newdata=data.frame(time=dat$year), type="response")
dat$lin<-predict(fit.l, newdata=data.frame(time=dat$year), type="response")
dat
## year genmod lin
## 1 1 0.3497363 0.3497363
## 2 2 0.3576791 0.3576791
plot(genmod~year, dat, type="l", ylab="h(t)", xlab="Time")
title(main="Hazard function from different time parameterizations")
lines(lin~year, dat, col=2, lwd=2)
legend("topleft", legend=c("General Model", "Linear", "Square", "Cubic"), col=1:2, lwd=1.5)
aic<-round(c(
fit.l$deviance+2*length(fit.l$coefficients),
fit.s$deviance+2*length(fit.s$coefficients),
fit.c$deviance+2*length(fit.c$coefficients),
fit.0$deviance+2*length(fit.0$coefficients)),2)
#compare all aics to the one from the general model (the 4th element of aic)
dif.aic<-round(aic-aic[4],2)
data.frame(model =c( "linear","square", "cubic", "general"), aic=aic, aic_dif=dif.aic)
##     model     aic aic_dif
## 1  linear 1242.57    0.00
## 2  square 1244.57    2.00
## 3   cubic 1246.57    4.00
## 4 general 1242.57    0.00
With only time in the model, the linear and general specifications again fit equally well (AIC 1242.57 for both): with just two periods, a linear term in time is equivalent to the single time dummy. The squared and cubic terms are aliased (note the "not defined because of singularities" messages above), so those models simply add parameters without changing the deviance and therefore have higher AICs.