library(haven)
library(foreign) 
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(broom)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(readr) 
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(car) 
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(alr3)

## 
## Attaching package: 'alr3'

## The following object is masked from 'package:MASS':
## 
##     forbes

library(zoo)
library(nortest)
library(plotrix)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:readr':
## 
##     col_factor

library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(VGAM)

## Loading required package: stats4

## Loading required package: splines

## 
## Attaching package: 'VGAM'

## The following object is masked from 'package:survey':
## 
##     calibrate

## The following object is masked from 'package:lmtest':
## 
##     lrtest

## The following object is masked from 'package:car':
## 
##     logit

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(sandwich)
library(pastecs)

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(survey)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)

## 
## Attaching package: 'eha'

## The following objects are masked from 'package:VGAM':
## 
##     dgompertz, dmakeham, pgompertz, pmakeham, qgompertz, qmakeham,
##     rgompertz, rmakeham

nlsy97mig<-read.csv("C:\\Users\\Jaire\\OneDrive\\Desktop\\Research\\Data\\nlsy97mig.csv")

Overview of Analysis

In preparation for an upcoming research study, I assess the risk of migration during the Great Recession among Generation Y and Millennial respondents using a discrete time hazard model and longitudinal data from the National Longitudinal Survey of Youth 1997.

The event of interest is migration, operationalized here as the respondent moving from their previous state, city, or county since the date of the last interview.

I use the follow predictors to estimate the risk of migration:

Income (YINC-1700) - discrete values (2007)
Education (YSCH-3113) - college /no college (2007)
Sex (KEY!SEX) – male/sex
Race (KEY!RACE_ETHNICITY) - minority/non-minority
Marital Status (CV_MARSTAT) - coupled/ not-coupled (2007)
Health (YHEA-100) – health/unhealthy (2007)

Interaction term - Male*Healthy

Transformations:

# Age at interview

nlsy97mig$age1<-ifelse(nlsy97mig$ragem07==-5, NA,nlsy97mig$ragem07/12)

nlsy97mig$age2<-ifelse(nlsy97mig$ragem08==-5, NA,nlsy97mig$ragem08/12)

nlsy97mig$age3<-ifelse(nlsy97mig$ragem09==-5, NA,nlsy97mig$ragem09/12)

# Migration since last interview
nlsy97mig$mig1<-ifelse(nlsy97mig$rmig07==-4, NA,nlsy97mig$rmig07)

nlsy97mig$mig2<-ifelse(nlsy97mig$rmig08==-4, NA,nlsy97mig$rmig08)

nlsy97mig$mig3<-ifelse(nlsy97mig$rmig09==-4, NA,nlsy97mig$rmig09)

nlsy97mig$mig1<-ifelse(nlsy97mig$rmig07==-5, NA,nlsy97mig$rmig07)

nlsy97mig$mig2<-ifelse(nlsy97mig$rmig08==-5, NA,nlsy97mig$rmig08)

nlsy97mig$mig3<-ifelse(nlsy97mig$rmig09==-5, NA,nlsy97mig$rmig09)


# Sex
nlsy97mig$male<-recode(nlsy97mig$rsex, recodes = "1=1; 2=0")

# Hispanic
nlsy97mig$hisp<-recode(nlsy97mig$rrace, recodes = "1=0; 2=1; 3=0;4=0")

#Highest Level of Education 4-year degree or higher
nlsy97mig$college4<-recode(nlsy97mig$reduc07, recodes = "-5=NA;-2=NA;1:4=0;5:8=1")

#Income past year below median household income 2007

nlsy97mig$income<-recode(nlsy97mig$rinc07, recodes = "-5=NA;-4=NA;-2=NA;-1=NA")

#Marital Status - married

nlsy97mig$married<-recode(nlsy97mig$rmar07, recodes = "-5=NA;-3=NA; 0=0; 2:4=0")

#Healthy - very good or excellent
 nlsy97mig$healthy<-recode(nlsy97mig$rheal07,recodes = "-5=NA;-2=NA;1:2=1;3:5=0")

#subset censoring
nlsy97mig<-subset(nlsy97mig, is.na(mig1)==F&is.na(mig2)==F&
                   is.na(mig3)==F&is.na(age1)==F&
                   is.na(age2)==F&is.na(age3)==F&mig1!=1)

# Reshape
e.long<-reshape(data.frame(nlsy97mig), idvar = "rid",
                varying = list(c("age1", "age2"),
                  c("age2", "age3")),
                v.names = c("age_enter", "age_exit"),
                times = 1:2, direction = "long")
e.long<-e.long[order(e.long$rid, e.long$time),]

e.long$migtran<-NA

e.long$migtran[e.long$mig1==0&e.long$mig2==1&e.long$time==1]<-1
e.long$migtran[e.long$mig2==0&e.long$mig3==1&e.long$time==2]<-1

e.long$migtran[e.long$mig1==0&e.long$mig2==0&e.long$time==1]<-0
e.long$migtran[e.long$mig2==0&e.long$mig3==0&e.long$time==2]<-0

#Censor failures from 2nd and 3rd period risk
failed1<-which(is.na(e.long$migtran)==T)
e.long<-e.long[-failed1,]

e.long$age1r<-round(e.long$age_enter,0)
e.long$age2r<-round(e.long$age_exit,0)
e.long$time_start<-e.long$time-1

head(e.long, n=10)

##       rid    wht psu strat rsex rrace rbirthyr rmar07 reduc07 rinc07 rheal07
## 26.1   26 125910   1    35    1     1     1980      0       1     -4       3
## 27.1   27 114180   1    35    1     1     1981      0       1   3000       2
## 60.1   60 281400   2    35    1     4     1984      0       3   3000       2
## 68.1   68 102999   1    35    1     1     1981      0       3     -4       2
## 70.2   70  94241   1    35    1     1     1982      0       5     -2       3
## 87.1   87 238852   2    37    1     2     1984      0       4     -4       2
## 100.2 100 101111   1    37    2     1     1981      0       1  23000       3
## 104.2 104 336171   2    37    2     4     1980      0       5  44000       3
## 113.2 113 299852   1    37    1     4     1984      0       5   5000       1
## 150.1 150 120418   1    34    1     1     1984      0       3  13000       2
##       rage07 rage08 rage09 rmig07 rmig08 rmig09 ragem07 ragem08 ragem09 mig1
## 26.1      27     28     29      0      0     -4     328     341     348    0
## 27.1      26     27     28      0      0     -4     317     329     341    0
## 60.1      24     24     26      0      1      1     288     297     312    0
## 68.1      26     27     28      0      0     -4     323     335     344    0
## 70.2      25     26     27     -4      0      0     302     315     326   -4
## 87.1      23     24     25      0      0     -4     285     296     306    0
## 100.2     26     27     28     -4      0      0     323     334     345   -4
## 104.2     26     28     28     -4      0      0     324     336     346   -4
## 113.2     23     24     25     -4      0      0     282     297     306   -4
## 150.1     23     24     25      0      0      1     284     297     307    0
##       mig2 mig3 male hisp college4 income married healthy time age_enter
## 26.1     0   -4    1    0        0     NA       0       0    1  27.33333
## 27.1     0   -4    1    0        0   3000       0       1    1  26.41667
## 60.1     1    1    1    0        0   3000       0       1    1  24.00000
## 68.1     0   -4    1    0        0     NA       0       1    1  26.91667
## 70.2     0    0    1    0        1     NA       0       0    2  26.25000
## 87.1     0   -4    1    1        0     NA       0       1    1  23.75000
## 100.2    0    0    0    0        0  23000       0       0    2  27.83333
## 104.2    0    0    0    0        1  44000       0       0    2  28.00000
## 113.2    0    0    1    0        1   5000       0       1    2  24.75000
## 150.1    0    1    1    0        0  13000       0       1    1  23.66667
##       age_exit migtran age1r age2r time_start
## 26.1  28.41667       0    27    28          0
## 27.1  27.41667       0    26    27          0
## 60.1  24.75000       1    24    25          0
## 68.1  27.91667       0    27    28          0
## 70.2  27.16667       0    26    27          1
## 87.1  24.66667       0    24    25          0
## 100.2 28.75000       0    28    29          1
## 104.2 28.83333       0    28    29          1
## 113.2 25.50000       0    25    26          1
## 150.1 24.75000       0    24    25          0

Descriptive Analysis of Rates:

e.long%>%
  group_by(time)%>%
  summarise(prop_event= mean(migtran, na.rm=T))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##    time prop_event
##   <int>      <dbl>
## 1     1      0.308
## 2     2      0.296

Here we can see that the risk of migration is roughly 1 percent higher in the first wave than the second wave. Below is a simple descriptive analysis of the outcome by characteristics of the respondents:

e.long%>%
  filter(complete.cases(college4, married, healthy, hisp, income,male))%>%
  group_by(time, college4, married, healthy, hisp,income, male)%>%
  summarise(prop_event= mean(migtran, na.rm=T))

## `summarise()` regrouping output by 'time', 'college4', 'married', 'healthy', 'hisp', 'income' (override with `.groups` argument)

## # A tibble: 704 x 8
## # Groups:   time, college4, married, healthy, hisp, income [582]
##     time college4 married healthy  hisp income  male prop_event
##    <int>    <dbl>   <dbl>   <dbl> <dbl>  <int> <dbl>      <dbl>
##  1     1        0       0       0     0    350     0        0  
##  2     1        0       0       0     0    400     1        0  
##  3     1        0       0       0     0    500     0        1  
##  4     1        0       0       0     0   1000     0        0.5
##  5     1        0       0       0     0   1000     1        0  
##  6     1        0       0       0     0   1200     0        0  
##  7     1        0       0       0     0   2000     0        0  
##  8     1        0       0       0     0   2000     1        1  
##  9     1        0       0       0     0   2200     0        1  
## 10     1        0       0       0     0   2500     0        0  
## # ... with 694 more rows

These individuals vary widely in characteristics and migration events.

Full Survey Design:

options("survey.adjust.domain.lonely"=T)
options("survey.lonely.psu"='adjust')
options(survey.lonely.psu = "adjust")
e.long<-e.long%>%
  filter(complete.cases(psu,college4, married, healthy, hisp, income,male))

des<-svydesign(nest = TRUE,ids =~psu,
               strata =~strat, weights =~wht,
               data = e.long)
svymean(~strat, design=des)

##        mean     SE
## strat 39.27 1.3021

Basic Discrete Time Model (Logit Function)

fit1<-svyglm(migtran~as.factor(time_start)+male*healthy+married+college4+hisp+income-1,
              design=des, family=binomial(link="logit"))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit1)

## 
## Call:
## svyglm(formula = migtran ~ as.factor(time_start) + male * healthy + 
##     married + college4 + hisp + income - 1, design = des, family = binomial(link = "logit"))
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## as.factor(time_start)0 -8.573e-01  2.414e-01  -3.552 0.000611 ***
## as.factor(time_start)1 -7.922e-01  2.207e-01  -3.589 0.000540 ***
## male                    1.066e-01  2.826e-01   0.377 0.706906    
## healthy                 1.797e-01  2.125e-01   0.846 0.399968    
## married                 4.588e-01  1.919e-01   2.391 0.018901 *  
## college4                4.814e-01  1.845e-01   2.609 0.010645 *  
## hisp                   -3.321e-01  1.648e-01  -2.016 0.046834 *  
## income                 -1.460e-06  5.091e-06  -0.287 0.774934    
## male:healthy           -2.469e-01  3.348e-01  -0.737 0.462873    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9999683)
## 
## Number of Fisher Scoring iterations: 4

Hazard Ratios:

sums<-data.frame(round(summary(fit1)$coef, 3))
sums$HR<-round(exp(sums[,1]), 3)
sums

##                        Estimate Std..Error t.value Pr...t..    HR
## as.factor(time_start)0   -0.857      0.241  -3.552    0.001 0.424
## as.factor(time_start)1   -0.792      0.221  -3.589    0.001 0.453
## male                      0.107      0.283   0.377    0.707 1.113
## healthy                   0.180      0.212   0.846    0.400 1.197
## married                   0.459      0.192   2.391    0.019 1.582
## college4                  0.481      0.185   2.609    0.011 1.618
## hisp                     -0.332      0.165  -2.016    0.047 0.717
## income                    0.000      0.000  -0.287    0.775 1.000
## male:healthy             -0.247      0.335  -0.737    0.463 0.781

predicted probabilities for respondents within the analysis:

dat3<-expand.grid(time_start=as.factor(c(0,1)),healthy=c(0,1),male=c(0,1),married=c(0,1),
                  college4=c(0,1),hisp=c(0,1),income=c(0<=112215))

#scrub of unrealistic values
dat3$fitted<-as.numeric(predict(fit1, dat3, type="response"))

head(dat3)

##   time_start healthy male married college4 hisp income    fitted
## 1          0       0    0       0        0    0   TRUE 0.2978954
## 2          1       0    0       0        0    0   TRUE 0.3116880
## 3          0       1    0       0        0    0   TRUE 0.3367809
## 4          1       1    0       0        0    0   TRUE 0.3514727
## 5          0       0    1       0        0    0   TRUE 0.3206561
## 6          1       0    1       0        0    0   TRUE 0.3349997

Hazard Plots

library(ggplot2)
dat3%>%
  mutate(edu = ifelse(college4==1, "4yr<=", ">4yr"),
         group = paste(edu, married, sep="-"), 
         period=ifelse(time_start==1, "Period 1", "Period 2"))%>%
  ggplot()+
  geom_bar(aes(y=fitted,x=married,fill=married, group=married),stat="identity", position="dodge")+
  facet_grid(~edu+period)+
  ggtitle(label="Hazard of Migrating by College Education, Marital Status, and Wave")

Results:

After estimating the risk of migration during the Great Recession among Generation Y and Millennial respondents using a discrete time hazard model(logit) and longitudinal data from the National Longitudinal Survey of Youth 1997, my results show a moderate effect of three predictors.

Respondents who were married at the beginning of the Great Recession (2007) were at a significantly greater risk of moving from their previous state, city, or county than those who were not married (p<0.05).

Respondents who held a 4-year degree or more at the beginning of the Great Recession (2007) were at a significantly greater risk of moving from their previous state, city, or county than those who held less than a 4-year or more (p<0.05).

Hispanic respondents were at a significantly lower risk of moving from their previous state, city, or county than non-Hispanic respondents (p<0.05).

Overall, the general risk of migrating during each wave was significantly low (p<0.001)

Discrete Time Hazard Model Part 1

Jairemy Edwards

10/19/2020