library(haven)
library(foreign)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(readr)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(car)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(alr3)
##
## Attaching package: 'alr3'
## The following object is masked from 'package:MASS':
##
## forbes
library(zoo)
library(nortest)
library(plotrix)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
##
## rescale
## The following object is masked from 'package:readr':
##
## col_factor
library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following object is masked from 'package:survey':
##
## calibrate
## The following object is masked from 'package:lmtest':
##
## lrtest
## The following object is masked from 'package:car':
##
## logit
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(sandwich)
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(survey)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)
##
## Attaching package: 'eha'
## The following objects are masked from 'package:VGAM':
##
## dgompertz, dmakeham, pgompertz, pmakeham, qgompertz, qmakeham,
## rgompertz, rmakeham
nlsy97mig<-read.csv("C:\\Users\\Jaire\\OneDrive\\Desktop\\Research\\Data\\nlsy97mig.csv")
In preparation for an upcoming research study, I assess the risk of migration during the Great Recession among Generation Y and Millennial respondents using a discrete time hazard model and longitudinal data from the National Longitudinal Survey of Youth 1997.
The event of interest is migration, operationalized here as the respondent moving from their previous state, city, or county since the date of the last interview.
I use the follow predictors to estimate the risk of migration:
Interaction term - Male*Healthy
# Age at interview
nlsy97mig$age1<-ifelse(nlsy97mig$ragem07==-5, NA,nlsy97mig$ragem07/12)
nlsy97mig$age2<-ifelse(nlsy97mig$ragem08==-5, NA,nlsy97mig$ragem08/12)
nlsy97mig$age3<-ifelse(nlsy97mig$ragem09==-5, NA,nlsy97mig$ragem09/12)
# Migration since last interview
nlsy97mig$mig1<-ifelse(nlsy97mig$rmig07==-4, NA,nlsy97mig$rmig07)
nlsy97mig$mig2<-ifelse(nlsy97mig$rmig08==-4, NA,nlsy97mig$rmig08)
nlsy97mig$mig3<-ifelse(nlsy97mig$rmig09==-4, NA,nlsy97mig$rmig09)
nlsy97mig$mig1<-ifelse(nlsy97mig$rmig07==-5, NA,nlsy97mig$rmig07)
nlsy97mig$mig2<-ifelse(nlsy97mig$rmig08==-5, NA,nlsy97mig$rmig08)
nlsy97mig$mig3<-ifelse(nlsy97mig$rmig09==-5, NA,nlsy97mig$rmig09)
# Sex
nlsy97mig$male<-recode(nlsy97mig$rsex, recodes = "1=1; 2=0")
# Hispanic
nlsy97mig$hisp<-recode(nlsy97mig$rrace, recodes = "1=0; 2=1; 3=0;4=0")
#Highest Level of Education 4-year degree or higher
nlsy97mig$college4<-recode(nlsy97mig$reduc07, recodes = "-5=NA;-2=NA;1:4=0;5:8=1")
#Income past year below median household income 2007
nlsy97mig$income<-recode(nlsy97mig$rinc07, recodes = "-5=NA;-4=NA;-2=NA;-1=NA")
#Marital Status - married
nlsy97mig$married<-recode(nlsy97mig$rmar07, recodes = "-5=NA;-3=NA; 0=0; 2:4=0")
#Healthy - very good or excellent
nlsy97mig$healthy<-recode(nlsy97mig$rheal07,recodes = "-5=NA;-2=NA;1:2=1;3:5=0")
#subset censoring
nlsy97mig<-subset(nlsy97mig, is.na(mig1)==F&is.na(mig2)==F&
is.na(mig3)==F&is.na(age1)==F&
is.na(age2)==F&is.na(age3)==F&mig1!=1)
# Reshape
e.long<-reshape(data.frame(nlsy97mig), idvar = "rid",
varying = list(c("age1", "age2"),
c("age2", "age3")),
v.names = c("age_enter", "age_exit"),
times = 1:2, direction = "long")
e.long<-e.long[order(e.long$rid, e.long$time),]
e.long$migtran<-NA
e.long$migtran[e.long$mig1==0&e.long$mig2==1&e.long$time==1]<-1
e.long$migtran[e.long$mig2==0&e.long$mig3==1&e.long$time==2]<-1
e.long$migtran[e.long$mig1==0&e.long$mig2==0&e.long$time==1]<-0
e.long$migtran[e.long$mig2==0&e.long$mig3==0&e.long$time==2]<-0
#Censor failures from 2nd and 3rd period risk
failed1<-which(is.na(e.long$migtran)==T)
e.long<-e.long[-failed1,]
e.long$age1r<-round(e.long$age_enter,0)
e.long$age2r<-round(e.long$age_exit,0)
e.long$time_start<-e.long$time-1
head(e.long, n=10)
## rid wht psu strat rsex rrace rbirthyr rmar07 reduc07 rinc07 rheal07
## 26.1 26 125910 1 35 1 1 1980 0 1 -4 3
## 27.1 27 114180 1 35 1 1 1981 0 1 3000 2
## 60.1 60 281400 2 35 1 4 1984 0 3 3000 2
## 68.1 68 102999 1 35 1 1 1981 0 3 -4 2
## 70.2 70 94241 1 35 1 1 1982 0 5 -2 3
## 87.1 87 238852 2 37 1 2 1984 0 4 -4 2
## 100.2 100 101111 1 37 2 1 1981 0 1 23000 3
## 104.2 104 336171 2 37 2 4 1980 0 5 44000 3
## 113.2 113 299852 1 37 1 4 1984 0 5 5000 1
## 150.1 150 120418 1 34 1 1 1984 0 3 13000 2
## rage07 rage08 rage09 rmig07 rmig08 rmig09 ragem07 ragem08 ragem09 mig1
## 26.1 27 28 29 0 0 -4 328 341 348 0
## 27.1 26 27 28 0 0 -4 317 329 341 0
## 60.1 24 24 26 0 1 1 288 297 312 0
## 68.1 26 27 28 0 0 -4 323 335 344 0
## 70.2 25 26 27 -4 0 0 302 315 326 -4
## 87.1 23 24 25 0 0 -4 285 296 306 0
## 100.2 26 27 28 -4 0 0 323 334 345 -4
## 104.2 26 28 28 -4 0 0 324 336 346 -4
## 113.2 23 24 25 -4 0 0 282 297 306 -4
## 150.1 23 24 25 0 0 1 284 297 307 0
## mig2 mig3 male hisp college4 income married healthy time age_enter
## 26.1 0 -4 1 0 0 NA 0 0 1 27.33333
## 27.1 0 -4 1 0 0 3000 0 1 1 26.41667
## 60.1 1 1 1 0 0 3000 0 1 1 24.00000
## 68.1 0 -4 1 0 0 NA 0 1 1 26.91667
## 70.2 0 0 1 0 1 NA 0 0 2 26.25000
## 87.1 0 -4 1 1 0 NA 0 1 1 23.75000
## 100.2 0 0 0 0 0 23000 0 0 2 27.83333
## 104.2 0 0 0 0 1 44000 0 0 2 28.00000
## 113.2 0 0 1 0 1 5000 0 1 2 24.75000
## 150.1 0 1 1 0 0 13000 0 1 1 23.66667
## age_exit migtran age1r age2r time_start
## 26.1 28.41667 0 27 28 0
## 27.1 27.41667 0 26 27 0
## 60.1 24.75000 1 24 25 0
## 68.1 27.91667 0 27 28 0
## 70.2 27.16667 0 26 27 1
## 87.1 24.66667 0 24 25 0
## 100.2 28.75000 0 28 29 1
## 104.2 28.83333 0 28 29 1
## 113.2 25.50000 0 25 26 1
## 150.1 24.75000 0 24 25 0
e.long%>%
group_by(time)%>%
summarise(prop_event= mean(migtran, na.rm=T))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## time prop_event
## <int> <dbl>
## 1 1 0.308
## 2 2 0.296
Here we can see that the risk of migration is roughly 1 percent higher in the first wave than the second wave. Below is a simple descriptive analysis of the outcome by characteristics of the respondents:
e.long%>%
filter(complete.cases(college4, married, healthy, hisp, income,male))%>%
group_by(time, college4, married, healthy, hisp,income, male)%>%
summarise(prop_event= mean(migtran, na.rm=T))
## `summarise()` regrouping output by 'time', 'college4', 'married', 'healthy', 'hisp', 'income' (override with `.groups` argument)
## # A tibble: 704 x 8
## # Groups: time, college4, married, healthy, hisp, income [582]
## time college4 married healthy hisp income male prop_event
## <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 1 0 0 0 0 350 0 0
## 2 1 0 0 0 0 400 1 0
## 3 1 0 0 0 0 500 0 1
## 4 1 0 0 0 0 1000 0 0.5
## 5 1 0 0 0 0 1000 1 0
## 6 1 0 0 0 0 1200 0 0
## 7 1 0 0 0 0 2000 0 0
## 8 1 0 0 0 0 2000 1 1
## 9 1 0 0 0 0 2200 0 1
## 10 1 0 0 0 0 2500 0 0
## # ... with 694 more rows
These individuals vary widely in characteristics and migration events.
options("survey.adjust.domain.lonely"=T)
options("survey.lonely.psu"='adjust')
options(survey.lonely.psu = "adjust")
e.long<-e.long%>%
filter(complete.cases(psu,college4, married, healthy, hisp, income,male))
des<-svydesign(nest = TRUE,ids =~psu,
strata =~strat, weights =~wht,
data = e.long)
svymean(~strat, design=des)
## mean SE
## strat 39.27 1.3021
fit1<-svyglm(migtran~as.factor(time_start)+male*healthy+married+college4+hisp+income-1,
design=des, family=binomial(link="logit"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit1)
##
## Call:
## svyglm(formula = migtran ~ as.factor(time_start) + male * healthy +
## married + college4 + hisp + income - 1, design = des, family = binomial(link = "logit"))
##
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht,
## data = e.long)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(time_start)0 -8.573e-01 2.414e-01 -3.552 0.000611 ***
## as.factor(time_start)1 -7.922e-01 2.207e-01 -3.589 0.000540 ***
## male 1.066e-01 2.826e-01 0.377 0.706906
## healthy 1.797e-01 2.125e-01 0.846 0.399968
## married 4.588e-01 1.919e-01 2.391 0.018901 *
## college4 4.814e-01 1.845e-01 2.609 0.010645 *
## hisp -3.321e-01 1.648e-01 -2.016 0.046834 *
## income -1.460e-06 5.091e-06 -0.287 0.774934
## male:healthy -2.469e-01 3.348e-01 -0.737 0.462873
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9999683)
##
## Number of Fisher Scoring iterations: 4
sums<-data.frame(round(summary(fit1)$coef, 3))
sums$HR<-round(exp(sums[,1]), 3)
sums
## Estimate Std..Error t.value Pr...t.. HR
## as.factor(time_start)0 -0.857 0.241 -3.552 0.001 0.424
## as.factor(time_start)1 -0.792 0.221 -3.589 0.001 0.453
## male 0.107 0.283 0.377 0.707 1.113
## healthy 0.180 0.212 0.846 0.400 1.197
## married 0.459 0.192 2.391 0.019 1.582
## college4 0.481 0.185 2.609 0.011 1.618
## hisp -0.332 0.165 -2.016 0.047 0.717
## income 0.000 0.000 -0.287 0.775 1.000
## male:healthy -0.247 0.335 -0.737 0.463 0.781
predicted probabilities for respondents within the analysis:
dat3<-expand.grid(time_start=as.factor(c(0,1)),healthy=c(0,1),male=c(0,1),married=c(0,1),
college4=c(0,1),hisp=c(0,1),income=c(0<=112215))
#scrub of unrealistic values
dat3$fitted<-as.numeric(predict(fit1, dat3, type="response"))
head(dat3)
## time_start healthy male married college4 hisp income fitted
## 1 0 0 0 0 0 0 TRUE 0.2978954
## 2 1 0 0 0 0 0 TRUE 0.3116880
## 3 0 1 0 0 0 0 TRUE 0.3367809
## 4 1 1 0 0 0 0 TRUE 0.3514727
## 5 0 0 1 0 0 0 TRUE 0.3206561
## 6 1 0 1 0 0 0 TRUE 0.3349997
library(ggplot2)
dat3%>%
mutate(edu = ifelse(college4==1, "4yr<=", ">4yr"),
group = paste(edu, married, sep="-"),
period=ifelse(time_start==1, "Period 1", "Period 2"))%>%
ggplot()+
geom_bar(aes(y=fitted,x=married,fill=married, group=married),stat="identity", position="dodge")+
facet_grid(~edu+period)+
ggtitle(label="Hazard of Migrating by College Education, Marital Status, and Wave")
After estimating the risk of migration during the Great Recession among Generation Y and Millennial respondents using a discrete time hazard model(logit) and longitudinal data from the National Longitudinal Survey of Youth 1997, my results show a moderate effect of three predictors.
Respondents who were married at the beginning of the Great Recession (2007) were at a significantly greater risk of moving from their previous state, city, or county than those who were not married (p<0.05).
Respondents who held a 4-year degree or more at the beginning of the Great Recession (2007) were at a significantly greater risk of moving from their previous state, city, or county than those who held less than a 4-year or more (p<0.05).
Hispanic respondents were at a significantly lower risk of moving from their previous state, city, or county than non-Hispanic respondents (p<0.05).
Overall, the general risk of migrating during each wave was significantly low (p<0.001)