library(haven)
library(foreign) 
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(broom)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(readr) 
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(car) 
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(alr3)

## 
## Attaching package: 'alr3'

## The following object is masked from 'package:MASS':
## 
##     forbes

library(zoo)
library(nortest)
library(plotrix)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:readr':
## 
##     col_factor

library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(VGAM)

## Loading required package: stats4

## Loading required package: splines

## 
## Attaching package: 'VGAM'

## The following object is masked from 'package:survey':
## 
##     calibrate

## The following object is masked from 'package:lmtest':
## 
##     lrtest

## The following object is masked from 'package:car':
## 
##     logit

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(sandwich)
library(pastecs)

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(survey)
library(muhaz)
library(ggpubr)
library(survminer)
library(eha)

## 
## Attaching package: 'eha'

## The following objects are masked from 'package:VGAM':
## 
##     dgompertz, dmakeham, pgompertz, pmakeham, qgompertz, qmakeham,
##     rgompertz, rmakeham

library(reshape2)
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

## The following objects are masked from 'package:pastecs':
## 
##     first, last

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:pastecs':
## 
##     extract

nlsy97mig<-read.csv("C:\\Users\\Jaire\\OneDrive\\Desktop\\Research\\Data\\nlsy97mig.csv")

Overview of Analysis Part 2:

In preparation for an upcoming research study, I conduct a secondary assessment for the risk of migration during the Great Recession among young adults using a discrete time hazard model and longitudinal data from the National Longitudinal Survey of Youth 1997 (2007-2009).

Additionally, I use general, linear, and constant (<5 waves) time specifications.

The event of interest is migration, measured here as the survey respondent moving from their previous state, city, or county since the date of the last interview.

I use the follow predictors which were significant to estimate the risk of migration based on the previous discrete time model estimated in part 1:

Education (YSCH-3113) - college 4-year or more /Less than 4-year (2007)
Race (KEY!RACE_ETHNICITY) - Hispanic/non-Hispanic
Marital Status (CV_MARSTAT) - Married/other or not married (2007)

A. Transformations:

Here I included the part 1 predictors in transformations, but I only intend to use the three listed above.

# Age at interview

nlsy97mig$age1<-ifelse(nlsy97mig$ragem07==-5, NA,nlsy97mig$ragem07/12)

nlsy97mig$age2<-ifelse(nlsy97mig$ragem08==-5, NA,nlsy97mig$ragem08/12)

nlsy97mig$age3<-ifelse(nlsy97mig$ragem09==-5, NA,nlsy97mig$ragem09/12)

# Migration since last interview
nlsy97mig$mig1<-ifelse(nlsy97mig$rmig07==-4, NA,nlsy97mig$rmig07)

nlsy97mig$mig2<-ifelse(nlsy97mig$rmig08==-4, NA,nlsy97mig$rmig08)

nlsy97mig$mig3<-ifelse(nlsy97mig$rmig09==-4, NA,nlsy97mig$rmig09)

nlsy97mig$mig1<-ifelse(nlsy97mig$rmig07==-5, NA,nlsy97mig$rmig07)

nlsy97mig$mig2<-ifelse(nlsy97mig$rmig08==-5, NA,nlsy97mig$rmig08)

nlsy97mig$mig3<-ifelse(nlsy97mig$rmig09==-5, NA,nlsy97mig$rmig09)


# Sex
nlsy97mig$male<-recode(nlsy97mig$rsex, recodes = "1=1; 2=0")

# Hispanic
nlsy97mig$hisp<-recode(nlsy97mig$rrace, recodes = "1=0; 2=1; 3=0;4=0")

#Highest Level of Education 4-year degree or higher
nlsy97mig$college4<-recode(nlsy97mig$reduc07, recodes = "-5=NA;-2=NA;1:4=0;5:8=1")

#Income past year below median household income 2007

nlsy97mig$income<-recode(nlsy97mig$rinc07, recodes = "-5=NA;-4=NA;-2=NA;-1=NA")

#Marital Status - married

nlsy97mig$married<-recode(nlsy97mig$rmar07, recodes = "-5=NA;-3=NA; 0=0; 2:4=0")

#Healthy - very good or excellent
 nlsy97mig$healthy<-recode(nlsy97mig$rheal07,recodes = "-5=NA;-2=NA;1:2=1;3:5=0")

#subset censoring
nlsy97mig<-subset(nlsy97mig, is.na(mig1)==F&is.na(mig2)==F&
                   is.na(mig3)==F&is.na(age1)==F&
                   is.na(age2)==F&is.na(age3)==F&mig1!=1)

# Reshape
e.long<-reshape(data.frame(nlsy97mig), idvar = "rid",
                varying = list(c("age1", "age2"),
                  c("age2", "age3")),
                v.names = c("age_enter", "age_exit"),
                times = 1:2, direction = "long")
e.long<-e.long[order(e.long$rid, e.long$time),]

e.long$migtran<-NA

e.long$migtran[e.long$mig1==0&e.long$mig2==1&e.long$time==1]<-1
e.long$migtran[e.long$mig2==0&e.long$mig3==1&e.long$time==2]<-1

e.long$migtran[e.long$mig1==0&e.long$mig2==0&e.long$time==1]<-0
e.long$migtran[e.long$mig2==0&e.long$mig3==0&e.long$time==2]<-0

#Censor failures from 2nd and 3rd period risk
failed1<-which(is.na(e.long$migtran)==T)
e.long<-e.long[-failed1,]

e.long$age1r<-round(e.long$age_enter,0)
e.long$age2r<-round(e.long$age_exit,0)
e.long$time_start<-e.long$time-1

head(e.long, n=10)

##       rid    wht psu strat rsex rrace rbirthyr rmar07 reduc07 rinc07 rheal07
## 26.1   26 125910   1    35    1     1     1980      0       1     -4       3
## 27.1   27 114180   1    35    1     1     1981      0       1   3000       2
## 60.1   60 281400   2    35    1     4     1984      0       3   3000       2
## 68.1   68 102999   1    35    1     1     1981      0       3     -4       2
## 70.2   70  94241   1    35    1     1     1982      0       5     -2       3
## 87.1   87 238852   2    37    1     2     1984      0       4     -4       2
## 100.2 100 101111   1    37    2     1     1981      0       1  23000       3
## 104.2 104 336171   2    37    2     4     1980      0       5  44000       3
## 113.2 113 299852   1    37    1     4     1984      0       5   5000       1
## 150.1 150 120418   1    34    1     1     1984      0       3  13000       2
##       rage07 rage08 rage09 rmig07 rmig08 rmig09 ragem07 ragem08 ragem09 rcit07
## 26.1      27     28     29      0      0     -4     328     341     348      1
## 27.1      26     27     28      0      0     -4     317     329     341      1
## 60.1      24     24     26      0      1      1     288     297     312      1
## 68.1      26     27     28      0      0     -4     323     335     344      2
## 70.2      25     26     27     -4      0      0     302     315     326      1
## 87.1      23     24     25      0      0     -4     285     296     306      1
## 100.2     26     27     28     -4      0      0     323     334     345      1
## 104.2     26     28     28     -4      0      0     324     336     346      1
## 113.2     23     24     25     -4      0      0     282     297     306      1
## 150.1     23     24     25      0      0      1     284     297     307      1
##       rcit08 rcit09 ripr07 ripr08 ripr09 rhhs07 rhhs08 rhhs09 remp07 remp08
## 26.1       1      1     75     -3     18      1      1      1      5      5
## 27.1       1      1     25     76     16      8      4      4 200501 200501
## 60.1       1      1   1721    649    233      5      5     10      5 200701
## 68.1       2      2    142    160    256      1      1      1      5 200801
## 70.2       1      1     -3     -3     -3      3      1      1 200701 200701
## 87.1       1      1      6     -3     -3      3      3      3      5 200701
## 100.2      1      1    205      0     10      4      3      3 200602 200602
## 104.2      1      1    574   1116     -3      5      2      2 200203 200203
## 113.2      1      1   1721      0    300      6      3      3 200602      5
## 150.1      1      1    226     -3    241      2      5      1      4 200802
##       remp09 mig1 mig2 mig3 male hisp college4 income married healthy time
## 26.1       2    0    0   -4    1    0        0     NA       0       0    1
## 27.1  200501    0    0   -4    1    0        0   3000       0       1    1
## 60.1       5    0    1    1    1    0        0   3000       0       1    1
## 68.1  200801    0    0   -4    1    0        0     NA       0       1    1
## 70.2  200701   -4    0    0    1    0        1     NA       0       0    2
## 87.1  200701    0    0   -4    1    1        0     NA       0       1    1
## 100.2 200802   -4    0    0    0    0        0  23000       0       0    2
## 104.2 200802   -4    0    0    0    0        1  44000       0       0    2
## 113.2      5   -4    0    0    1    0        1   5000       0       1    2
## 150.1 200901    0    0    1    1    0        0  13000       0       1    1
##       age_enter age_exit migtran age1r age2r time_start
## 26.1   27.33333 28.41667       0    27    28          0
## 27.1   26.41667 27.41667       0    26    27          0
## 60.1   24.00000 24.75000       1    24    25          0
## 68.1   26.91667 27.91667       0    27    28          0
## 70.2   26.25000 27.16667       0    26    27          1
## 87.1   23.75000 24.66667       0    24    25          0
## 100.2  27.83333 28.75000       0    28    29          1
## 104.2  28.00000 28.83333       0    28    29          1
## 113.2  24.75000 25.50000       0    25    26          1
## 150.1  23.66667 24.75000       0    24    25          0

Descriptive Analysis of Rates:

e.long%>%
  group_by(time)%>%
  summarise(prop_event= mean(migtran, na.rm=T))

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##    time prop_event
##   <int>      <dbl>
## 1     1      0.308
## 2     2      0.296

Here we can see that the overall risk of migration is roughly 1 percent higher in the first wave than the second wave. Below is a simple descriptive analysis of the outcome by characteristics of the respondents:

e.long%>%
  filter(complete.cases(college4, married, healthy, hisp, income,male))%>%
  group_by(time, college4, married, healthy, hisp,income, male)%>%
  summarise(prop_event= mean(migtran, na.rm=T))

## `summarise()` regrouping output by 'time', 'college4', 'married', 'healthy', 'hisp', 'income' (override with `.groups` argument)

## # A tibble: 704 x 8
## # Groups:   time, college4, married, healthy, hisp, income [582]
##     time college4 married healthy  hisp income  male prop_event
##    <int>    <dbl>   <dbl>   <dbl> <dbl>  <int> <dbl>      <dbl>
##  1     1        0       0       0     0    350     0        0  
##  2     1        0       0       0     0    400     1        0  
##  3     1        0       0       0     0    500     0        1  
##  4     1        0       0       0     0   1000     0        0.5
##  5     1        0       0       0     0   1000     1        0  
##  6     1        0       0       0     0   1200     0        0  
##  7     1        0       0       0     0   2000     0        0  
##  8     1        0       0       0     0   2000     1        1  
##  9     1        0       0       0     0   2200     0        1  
## 10     1        0       0       0     0   2500     0        0  
## # ... with 694 more rows

These individuals vary widely in characteristics and migration events.

Full Survey Design:

options("survey.adjust.domain.lonely"=T)
options("survey.lonely.psu"='adjust')
options(survey.lonely.psu = "adjust")
e.long<-e.long%>%
  filter(complete.cases(psu,college4, married, healthy, hisp, income,male))

des<-svydesign(nest = TRUE,ids =~psu,
               strata =~strat, weights =~wht,
               data = e.long)
svymean(~strat, design=des)

##        mean     SE
## strat 39.27 1.3021

B. General and Alternative Models:

Here I estimate the general, constant, and linear models.

General Model -

m1<-svyglm(migtran~factor(time)+college4+married+hisp,
           design=des, family=binomial (link="logit"))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Constant Model -

m0<-svyglm(migtran~ college4+married+hisp,
           design=des, family=binomial (link="logit"))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Linear Model -

m2<-svyglm(migtran~time+college4+married+hisp, design=des,
           family=binomial (link="logit"))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

C. Main Effects

summary(m0)

## 
## Call:
## svyglm(formula = migtran ~ college4 + married + hisp, design = des, 
##     family = binomial(link = "logit"))
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.7811     0.1123  -6.954 4.49e-10 ***
## college4      0.4874     0.1773   2.748  0.00717 ** 
## married       0.4435     0.1875   2.365  0.02006 *  
## hisp         -0.3406     0.1641  -2.075  0.04068 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000003)
## 
## Number of Fisher Scoring iterations: 4

summary(m1)

## 
## Call:
## svyglm(formula = migtran ~ factor(time) + college4 + married + 
##     hisp, design = des, family = binomial(link = "logit"))
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.80700    0.13279  -6.077  2.6e-08 ***
## factor(time)2  0.06353    0.13456   0.472  0.63791    
## college4       0.49050    0.17931   2.735  0.00745 ** 
## married        0.44540    0.18809   2.368  0.01993 *  
## hisp          -0.34195    0.16385  -2.087  0.03960 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9998748)
## 
## Number of Fisher Scoring iterations: 4

summary(m2)

## 
## Call:
## svyglm(formula = migtran ~ time + college4 + married + hisp, 
##     design = des, family = binomial(link = "logit"))
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.87054    0.23515  -3.702  0.00036 ***
## time         0.06353    0.13456   0.472  0.63791    
## college4     0.49050    0.17931   2.735  0.00745 ** 
## married      0.44540    0.18809   2.368  0.01993 *  
## hisp        -0.34195    0.16385  -2.087  0.03960 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9998748)
## 
## Number of Fisher Scoring iterations: 4

#D. Check of Model Fit:

AIC0<-AIC(m0)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AIC1<-AIC(m1)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AIC2<-AIC(m2)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

AICS<-data.frame(AIC = c(AIC0["AIC"],AIC1["AIC"],AIC2["AIC"]),
                 mod = factor(c("const", "general", "linear")))

AICS$mod<-forcats::fct_relevel(AICS$mod, c("general", "const" , "linear"))
AICS$deltaAIC<-AICS$AIC-AICS$AIC[AICS$mod=="general"]

knitr::kable(AICS[, c("mod", "AIC", "deltaAIC")], caption = "Relative AIC for alternative time specifications")

Relative AIC for alternative time specifications
mod	AIC	deltaAIC
const	1227.242	-1.662878
general	1228.904	0.000000
linear	1228.904	0.000000

Here the results of the model fit tests show that the general and linear time specifications provide a model with the best fit.

E. Plots:

newdat<-expand.grid(time = unique(e.long$time), 
                    hisp= unique(e.long$hisp), 
                    college4 = unique(e.long$college4), 
                    married = unique(e.long$married))
newdat<-newdat%>%
  dplyr::filter(complete.cases(.))

newdat$h0<-predict(m0, newdata=newdat, type="response")
newdat$h1<-predict(m1, newdata=newdat, type="response")
newdat$h2<-predict(m2, newdata=newdat, type="response")

head(newdat)

##   time hisp college4 married        h0        h1        h2
## 1    1    0        0       0 0.3140737 0.3085296 0.3085296
## 2    2    0        0       0 0.3140737 0.3222458 0.3222458
## 3    1    1        0       0 0.2456923 0.2406807 0.2406807
## 4    2    1        0       0 0.2456923 0.2524819 0.2524819
## 5    1    0        1       0 0.4270855 0.4215274 0.4215274
## 6    2    0        1       0 0.4270855 0.4370917 0.4370917

#Basic logistic model with ONLY time in the model

fit.0<-svyglm(migtran~as.factor(time)-1,design=des , family="binomial")

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.l<-svyglm(migtran~time,design=des, family="binomial")

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.s<-svyglm(migtran~time+I(time^2),design=des , family="binomial")

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.c<-svyglm(migtran~time+I(time^2)+I(time^3 ),design=des , family="binomial")

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.0)

## 
## Call:
## svyglm(formula = migtran ~ as.factor(time) - 1, design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(time)1 -0.62020    0.09908  -6.259 1.05e-08 ***
## as.factor(time)2 -0.58545    0.11593  -5.050 2.07e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.001049)
## 
## Number of Fisher Scoring iterations: 4

haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,2,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for Migration")

St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
  St[i]<-St[i-1]* (1-haz[i])
}


plot(y=St,x=time, type="l", main="Survival function for Great Recession Migration")

Linear, Square, and Cubic models with ONLY time in the model

summary(fit.l)

## 
## Call:
## svyglm(formula = migtran ~ time, design = des, family = "binomial")
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.65495    0.19952  -3.283  0.00143 **
## time         0.03475    0.12964   0.268  0.78925   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.001049)
## 
## Number of Fisher Scoring iterations: 4

summary(fit.s)

## 
## Call:
## svyglm(formula = migtran ~ time + I(time^2), design = des, family = "binomial")
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.65495    0.19952  -3.283  0.00143 **
## time         0.03475    0.12964   0.268  0.78925   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.001049)
## 
## Number of Fisher Scoring iterations: 4

summary(fit.c)

## 
## Call:
## svyglm(formula = migtran ~ time + I(time^2) + I(time^3), design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~psu, strata = ~strat, weights = ~wht, 
##     data = e.long)
## 
## Coefficients: (2 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -0.65495    0.19952  -3.283  0.00143 **
## time         0.03475    0.12964   0.268  0.78925   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.001049)
## 
## Number of Fisher Scoring iterations: 4

Harzards with Only time in the model

dat<-expand.grid(year=seq(1,2))
dat$genmod<-predict(fit.0, newdata=data.frame(year=as.factor(1:2 )), type="response")
dat$lin<-predict(fit.l, newdata=dat, type="response")
dat

##   year    genmod       lin
## 1    1 0.3497363 0.3497363
## 2    2 0.3576791 0.3576791

Hazard plot

plot(genmod~year, dat, type="l", ylab="h(t)", xlab="Time")
title(main="Hazard function from different time parameterizations")
lines(lin~year, dat, col=2, lwd=2)


legend("topleft", legend=c("General Model", "Linear", "Square", "Cubic"), col=1:2, lwd=1.5)

AIC using Time only models:

aic<-round(c(
  fit.l$deviance+2*length(fit.l$coefficients),
  fit.s$deviance+2*length(fit.s$coefficients),
  fit.c$deviance+2*length(fit.c$coefficients),
  fit.0$deviance+2*length(fit.0$coefficients)),2)

#compare all aics to the one from the general model
dif.aic<-round(aic-aic[6],2)
data.frame(model =c( "linear","square", "cubic", "general"), aic=aic, aic_dif=dif.aic)

##     model     aic aic_dif
## 1  linear 1242.57      NA
## 2  square 1244.57      NA
## 3   cubic 1246.57      NA
## 4 general 1242.57      NA

Here the general and linear models are again the better fitting models.

Discrete Time Hazard Model Part 2

Jairemy Edwards

10/26/2020