Dem 7223

Homework 1

DUE October 8th 40 points Submit your work to Rpubs.com and send me the link. The format of this assignment should be a formatted HTML page with your name at the top of the page. Refer to this RMD template for assistance.

Using a data set of your choosing ## A. Descriptive Analysis (15 points) ### 1) Define your event variable
My event variable will be the transition from good health to bad health over the two year period from which the survey is sampled. The groups of people will from three categories: 1) those who did not move 2) those who move willingly 3) those who moved unwillingly.

2) Define a duration or time variable

The duration of the time between survey waves is 2 years.

3) Define a censoring indicator

The censoring indicator will be those who do not change states from good health to poor health

4) Estimate the following functions of survival time for that variable and plot them:

Survival

datcomb <- rbind(dat05,dat07,dat09)


datcomb <- datcomb %>% 
  mutate(
    move.y = ifelse(move == 'not move', 'not move',
                    ifelse(move.why == 'not displaced', 'not displaced', 'displaced')),
    edu = ifelse(edu <=11, 'lths','hs+'),
    time_start = ifelse(year == 2005, 0,
                        ifelse(year == 2007, 1,2)),
    time = time_start +1,
    healthd = ifelse(health == 'good', 0,1)
  ) %>% 
    arrange(id)

#remove those in poor health at time 1
phealth <- subset(datcomb, heal1 == 1)
ph <- phealth$id
datcomb <- datcomb[!(datcomb$id %in% ph),]

#assigning values for transitions to poor health
datcomb$healtran <- NA
datcomb$healtran[datcomb$heal1==0&datcomb$heal2==1&datcomb$time==2] <- 1
datcomb$healtran[datcomb$heal2==0&datcomb$heal3==1&datcomb$time==3] <- 1
datcomb$healtran[datcomb$heal1==0&datcomb$heal2==0&datcomb$time==1] <- 0
datcomb$healtran[datcomb$heal1==0&datcomb$heal2==0&datcomb$time==2] <- 0
datcomb$healtran[datcomb$heal2==0&datcomb$heal3==0&datcomb$time==3] <- 0

datcomb <- datcomb[complete.cases(datcomb), ]
#failed1 <- which(is.na(datcomb$healtran)==T)
#datcomb <- datcomb[-failed1]
#finding censored cases, that is those who did not transition from good to poor health
datcomb$cens <- 0
datcomb$cens[datcomb$heal1==0&datcomb$heal2==0&datcomb$heal3==0] <- 1
table(datcomb$cens)
## 
##     0     1 
##  4524 34319
options(survey.lonely.psu = "adjust")
des = svydesign(ids = ~1,
                strata = NULL,
                weights = datcomb$wt,
                data = datcomb)

fit1 <- survfit(Surv(time, healtran)~move.y, data = datcomb)
fit1
## Call: survfit(formula = Surv(time, healtran) ~ move.y, data = datcomb)
## 
##                          n events median 0.95LCL 0.95UCL
## move.y=displaced      1874    387     NA      NA      NA
## move.y=not displaced 10658    933     NA      NA      NA
## move.y=not move      26311   3204     NA      NA      NA
summary(fit1)
## Call: survfit(formula = Surv(time, healtran) ~ move.y, data = datcomb)
## 
##                 move.y=displaced 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     2   1718     157    0.909 0.00695        0.895        0.922
##     3   1031     230    0.706 0.01296        0.681        0.732
## 
##                 move.y=not displaced 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     2   7058     470    0.933 0.00297        0.928        0.939
##     3   3575     463    0.813 0.00584        0.801        0.824
## 
##                 move.y=not move 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     2  18326    1472    0.920 0.00201        0.916        0.924
##     3   9641    1732    0.754 0.00395        0.747        0.762
#str(datcomb)
ggsurvplot(fit1, conf.int=T, ylim=c(.88,1), xlim=c(1,4),title = "Survival Function of the health of those people surveyed \n not moved vs move not displaced vs moved displaced")

#table(datcomb$move.y)
#checking on difference of groups, Chisq = 81.8, p=2e-16
fit2 <- survdiff(Surv(time, healtran)~move.y, data = datcomb)
fit2
## Call:
## survdiff(formula = Surv(time, healtran) ~ move.y, data = datcomb)
## 
##                          N Observed Expected (O-E)^2/E (O-E)^2/V
## move.y=displaced      1874      387      309     19.95      24.6
## move.y=not displaced 10658      933     1155     42.72      65.7
## move.y=not move      26311     3204     3060      6.75      23.9
## 
##  Chisq= 79.6  on 2 degrees of freedom, p= <2e-16
#table(datcomb$healtran,datcomb$time)

b. Hazard

haz1 <- kphaz.fit(time=datcomb$time, status=datcomb$healtran, strata = datcomb$move.y, method = "product-limit")
haz2 <- kphaz.fit(time=datcomb$time, status=datcomb$healtran, method = "product-limit")
#kphaz.plot(haz1)
#kphaz.plot(haz2)
table(datcomb$time)
## 
##     1     2     3 
## 11741 12855 14247
haz2
## $time
## [1] 2.5
## 
## $haz
## [1] 1.051665
## 
## $var
## [1] NaN
summary(haz2)
##      Length Class  Mode   
## time 1      -none- numeric
## haz  1      -none- numeric
## var  1      -none- numeric
#plot(y=haz1$haz[1:2], x=haz1$time[1:2], col=1, lty= 1, type = "s")
#lines(y=haz1$haz[3:4], x=haz1$time[3:4], col=2, lty=1, type = "s")

#haz2 <- survfit(Surv(time, healtran)~move.y, data = datcomb, fun = s)
#plot(fit1, lty = 1:3, fun = 'haz')
#ggsurvplot(fit1, conf.int=T, ylim=c(0,1), data = datcomb, fun = 'cumhaz' )

I am having issues getting the hazard functions to plot. Haven’t been able to figure it out.

c. Cumulative Hazard

#cumhaz <- survfit(Surv(time, healtran)~move.y, type = 'fleming', data = datcomb)
#plot(fit1, lty=1:3, fun = 'cumhaz', ylab = 'Cumulative Hazard')
ggsurvplot(fit1, conf.int=T, ylim=c(0,.5), fun = 'cumhaz', title = 'Cumulative Hazard Function')

#ggsurvplot(fit1, conf.int=T, ylim=c(0,1), data = datcomb, fun = 'cumhaz ' )

5) Carry out the following analysis

a. Kaplan-Meier survival analysis of the outcome

ggsurvplot(fit1, conf.int=T, ylim=c(.88,1), xlim=c(1,4),title = "Survival Function of the health of those people surveyed \n not moved vs move not displaced vs moved displaced")

By looking at the Survival probability of the three, there appears to be evidence that there are diffrerences between the the three groups , although there does not appear to be evidence that those that are displaced are uniquely affected. ### b. Define a grouping variable, this can be dichotomous or categorical.
The grouping variable for this research will be categorical, and focus on whether or not a person has moved, and if so, why. The three categories are: 1) Not moved 2) Moved voluntarily 3) Moved displaced
### c. Do you have a research hypothesis about the survival patterns for the levels of the categorical variable? State it.
The research hypothesis for the survival patterns for this analysis is as follows:
H1: Those that are displaced from their home will be more likely to transition from good health to poor health than those that moved voluntarily and those who did not move.

d. Comparison of Kaplan-Meier survival across grouping variable in your data. Interpret your results.

As seen in the Survival probability graph, for just three waves (2005, 2007 and 2009), it appears that we can conclude that those that are displaced are more at risk to be in poorer health than those that moved voluntarily, however there is not sufficient evidence to ascetain a difference betwee those that are displaced and those the did not move as their confidence interals (CI) are overlapping. The CI for those that are displaced is larger than the other two categories because the number in the sample is much lower.

It does appear as if a trend exists, and I hope by bringing in four more waves, we may see conclusive eveidince to separate the three groups.

e. Plot the hazard function for the analysis for each level of the group variable

having issues plotting this

B. Parametric models (15 points)

1) Carry out the following analysis:

Define your outcome as in part A. Also consider what covariates are hypothesized to affect the outcome variable. Define these and construct a parametric model for your outcome. Fit the parametric model of your choosing to the data.

dat.comb <- rbind(dat05,dat07,dat09)

#str(dat.comb$race)
dat.comb <- dat.comb %>% 
    mutate(
      race = fct_relevel(race, 'nhw','nhb','hisp','asn','nh-other'),
    move.y = ifelse(move == 'not move', 'not move',
                    ifelse(move.why == 'not displaced', 'not displaced', 'displaced')), 
    educ = ifelse(edu <=11, 'lths',
                           ifelse(edu>=12, 'hs+',NA)),
    time_start = ifelse(year==2005,0,
                        ifelse(year==2007,1,2)),
    time = time_start +1,
    healthd = ifelse(health == 'good', 0,1),
  )%>% 
    arrange(id)
dat.comb <- dat.comb %>% 
  mutate(
    educ  = fct_relevel(educ, 'hs+','lths'),
    move.y = fct_relevel(move.y, 'not move','not displaced','displaced')
  )
#table(dat.comb$edu)


#remove those in poor health at time 1
phealth <- subset(dat.comb, heal1 == 1)
ph <- phealth$id
dat.comb <- dat.comb[!(dat.comb$id %in% ph),]

#assigning values for transitions to poor health
dat.comb$healtran <- NA
dat.comb$healtran[dat.comb$heal1==0&dat.comb$heal2==1&dat.comb$time==2] <- 1
dat.comb$healtran[dat.comb$heal2==0&dat.comb$heal3==1&dat.comb$time==3] <- 1
dat.comb$healtran[dat.comb$heal1==0&dat.comb$heal2==0&dat.comb$time==1] <- 0
dat.comb$healtran[dat.comb$heal1==0&dat.comb$heal2==0&dat.comb$time==2] <- 0
dat.comb$healtran[dat.comb$heal2==0&dat.comb$heal3==0&dat.comb$time==3] <- 0
dat.comb <- dat.comb[complete.cases(dat.comb), ]
#dat.comb$move.y <- levels(dat.comb$move.y)
#str(dat.comb)
dat.comb$age2 <- dat.comb$age^2

fit <- survfit(Surv(time,healtran)~1,data = dat.comb)
plot(fit)

fita <- survreg(Surv(year, healtran)~move.y+age2+educ+sex+race, weights = (wt+.0001), dist = "weibull", data = dat.comb)#AFT
fita

Call: survreg(formula = Surv(year, healtran) ~ move.y + age2 + educ + sex + race, data = dat.comb, weights = (wt + 1e-04), dist = “weibull”)

Coefficients: (Intercept) move.ynot displaced move.ydisplaced 7.606190e+00 3.504755e-05 -4.414531e-05 age2 educlths sex -8.251546e-08 -2.557523e-04 -8.081186e-05 racenhb racehisp raceasn -2.108076e-04 -1.181436e-04 -2.690633e-04 racenh-other -1.204430e-04

Scale= 0.0003946002

Loglik(model)= -276329.6 Loglik(intercept only)= -292727.3 Chisq= 32795.4 on 9 degrees of freedom, p= <2e-16 n= 38843

summary(fita)

Call: survreg(formula = Surv(year, healtran) ~ move.y + age2 + educ + sex + race, data = dat.comb, weights = (wt + 1e-04), dist = “weibull”) Value Std. Error z p (Intercept) 7.61e+00 3.65e-06 2.08e+06 <2e-16 move.ynot displaced 3.50e-05 3.68e-06 9.52e+00 <2e-16 move.ydisplaced -4.41e-05 5.00e-06 -8.84e+00 <2e-16 age2 -8.25e-08 6.47e-10 -1.27e+02 <2e-16 educlths -2.56e-04 3.13e-06 -8.18e+01 <2e-16 sex -8.08e-05 2.97e-06 -2.72e+01 <2e-16 racenhb -2.11e-04 3.57e-06 -5.90e+01 <2e-16 racehisp -1.18e-04 4.13e-06 -2.86e+01 <2e-16 raceasn -2.69e-04 1.34e-05 -2.00e+01 <2e-16 racenh-other -1.20e-04 1.19e-05 -1.01e+01 <2e-16 Log(scale) -7.84e+00 2.63e-03 -2.98e+03 <2e-16

Scale= 0.000395

Weibull distribution Loglik(model)= -276329.6 Loglik(intercept only)= -292727.3 Chisq= 32795.4 on 9 degrees of freedom, p= 0 Number of Newton-Raphson Iterations: 9 n= 38843

#plot(fita)
fitb <- phreg(Surv(year, healtran) ~ move.y+age2+edu+race, dist = "weibull", data = dat.comb)
fitb

Call: phreg(formula = Surv(year, healtran) ~ move.y + age2 + edu + race, data = dat.comb, dist = “weibull”)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.059 0.942 0.039 0.124 displaced 0.048 0.119 1.126 0.054 0.028 age2 1895.106 0.000 1.000 0.000 0.000 edu 13.014 -0.124 0.883 0.006 0.000 race nhw 0.577 0 1 (reference) nhb 0.323 0.418 1.520 0.033 0.000 hisp 0.086 0.192 1.212 0.054 0.000 asn 0.006 0.470 1.600 0.156 0.003 nh-other 0.009 0.238 1.269 0.151 0.114

log(scale) 7.605 0.000 0.000 log(shape) 7.854 0.012 0.000

Events 4524 Total time at risk 77962913 Max. log. likelihood -13577 LR test statistic 1439.54 Degrees of freedom 8 Overall p-value 0

summary(fitb)

Call: phreg(formula = Surv(year, healtran) ~ move.y + age2 + edu + race, data = dat.comb, dist = “weibull”)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.059 0.942 0.039 0.124 displaced 0.048 0.119 1.126 0.054 0.028 age2 1895.106 0.000 1.000 0.000 0.000 edu 13.014 -0.124 0.883 0.006 0.000 race nhw 0.577 0 1 (reference) nhb 0.323 0.418 1.520 0.033 0.000 hisp 0.086 0.192 1.212 0.054 0.000 asn 0.006 0.470 1.600 0.156 0.003 nh-other 0.009 0.238 1.269 0.151 0.114

log(scale) 7.605 0.000 0.000 log(shape) 7.854 0.012 0.000

Events 4524 Total time at risk 77962913 Max. log. likelihood -13577 LR test statistic 1439.54 Degrees of freedom 8 Overall p-value 0

plot(fitb)#this one appears to work

plot(fitb, fn="haz")

#plot(fita, conf.int = T)
#hazard plot

#fit.haz.km <- kphaz.fit(dat.comb$time, dat.comb$healtran, strata=dat.comb$move.y, method = "product-limit")
#fit.haz
#summary(fit.haz)
#plot(fit.haz)
#fit.haz.sm <- muhaz(dat.comb$time, dat.comb$healtran)

#base PH models
fit.ex <- phreg(Surv(year, healtran)~  move.y , data= dat.comb, dist = "weibull", shape = 1)
fit.wei <- phreg(Surv(year, healtran)~  move.y , data= dat.comb, dist = "weibull")
fit.ln <- phreg(Surv(year, healtran)~  move.y , data= dat.comb, dist = "lognormal")
fit.ll <- phreg(Surv(year, healtran)~  move.y , data= dat.comb, dist = "loglogistic")

fail in [dsytrf]; info = 5

## Warning in phreg(Surv(year, healtran) ~ move.y, data = dat.comb, dist =
## "loglogistic"): Failed with error code 5
fit.ex

Call: phreg(formula = Surv(year, healtran) ~ move.y, data = dat.comb, dist = “weibull”, shape = 1)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.330 0.719 0.037 0.000 displaced 0.048 0.528 1.695 0.054 0.000

log(scale) 9.710 0.018 0.000

Shape is fixed at 1

Events 4524 Total time at risk 77962913 Max. log. likelihood -48556 LR test statistic 195.76 Degrees of freedom 2 Overall p-value 0

fit.wei

Call: phreg(formula = Surv(year, healtran) ~ move.y, data = dat.comb, dist = “weibull”)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.248 0.780 0.037 0.000 displaced 0.048 0.143 1.153 0.054 0.008

log(scale) 7.606 0.000 0.000 log(shape) 7.852 0.012 0.000

Events 4524 Total time at risk 77962913 Max. log. likelihood -14266 LR test statistic 60.35 Degrees of freedom 2 Overall p-value 7.84928e-14

fit.ln

Call: phreg(formula = Surv(year, healtran) ~ move.y, data = dat.comb, dist = “lognormal”)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p (Intercept) 84.246 15.031 0.000 move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.248 0.780 0.037 0.000 displaced 0.048 0.142 1.153 0.054 0.008

log(scale) 7.670 0.012 0.000 log(shape) 5.289 0.088 0.000

Events 4524 Total time at risk 77962913 Max. log. likelihood -14268 LR test statistic 61.18 Degrees of freedom 2 Overall p-value 5.18474e-14

fit.ll

[1] 1

plot(fit.ex)

plot(fit.wei)

plot(fit.ln)

plot(fit.ll)

#fitted PH models
fit.exf <- phreg(Surv(year, healtran)~  move.y+race+educ+age2
                , data= dat.comb, dist = "weibull", shape = 1)
fit.weif <- phreg(Surv(year, healtran)~  move.y+race+educ+age2 ,
                 data= dat.comb, dist = "weibull")
fit.lnf <- phreg(Surv(year, healtran)~  move.y+race+educ+age2 ,
                data= dat.comb, dist = "lognormal")
fit.llf <- phreg(Surv(year, healtran)~  move.y+race+educ+age2 ,
                data= dat.comb, dist = "loglogistic")
fit.exf

Call: phreg(formula = Surv(year, healtran) ~ move.y + race + educ + age2, data = dat.comb, dist = “weibull”, shape = 1)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.123 0.884 0.038 0.001 displaced 0.048 0.539 1.715 0.054 0.000 race nhw 0.577 0 1 (reference) nhb 0.323 0.545 1.724 0.033 0.000 hisp 0.086 0.485 1.624 0.052 0.000 asn 0.006 0.695 2.004 0.156 0.000 nh-other 0.009 0.226 1.254 0.151 0.133 educ hs+ 0.829 0 1 (reference) lths 0.171 0.571 1.770 0.034 0.000 age2 1895.106 0.000 1.000 0.000 0.000

log(scale) 10.632 0.034 0.000

Shape is fixed at 1

Events 4524 Total time at risk 77962913 Max. log. likelihood -47895 LR test statistic 1517.15 Degrees of freedom 8 Overall p-value 0

fit.weif

Call: phreg(formula = Surv(year, healtran) ~ move.y + race + educ + age2, data = dat.comb, dist = “weibull”)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.062 0.940 0.039 0.107 displaced 0.048 0.143 1.154 0.054 0.008 race nhw 0.577 0 1 (reference) nhb 0.323 0.478 1.613 0.033 0.000 hisp 0.086 0.376 1.457 0.051 0.000 asn 0.006 0.565 1.760 0.156 0.000 nh-other 0.009 0.261 1.298 0.151 0.083 educ hs+ 0.829 0 1 (reference) lths 0.171 0.638 1.892 0.034 0.000 age2 1895.106 0.000 1.000 0.000 0.000

log(scale) 7.606 0.000 0.000 log(shape) 7.853 0.012 0.000

Events 4524 Total time at risk 77962913 Max. log. likelihood -13650 LR test statistic 1291.80 Degrees of freedom 8 Overall p-value 0

fit.lnf

Call: phreg(formula = Surv(year, healtran) ~ move.y + race + educ + age2, data = dat.comb, dist = “lognormal”)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p (Intercept) 52.669 2.327 0.000 move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.062 0.940 0.038 0.106 displaced 0.048 0.143 1.154 0.054 0.008 race nhw 0.577 0 1 (reference) nhb 0.323 0.478 1.613 0.033 0.000 hisp 0.086 0.376 1.457 0.051 0.000 asn 0.006 0.565 1.760 0.156 0.000 nh-other 0.009 0.261 1.298 0.150 0.083 educ hs+ 0.829 0 1 (reference) lths 0.171 0.638 1.892 0.034 0.000 age2 1895.106 0.000 1.000 0.000 0.000

log(scale) 7.646 0.001 0.000 log(shape) 5.513 0.016 0.000

Events 4524 Total time at risk 77962913 Max. log. likelihood -13653 LR test statistic 1290.12 Degrees of freedom 8 Overall p-value 0

fit.llf

Call: phreg(formula = Surv(year, healtran) ~ move.y + race + educ + age2, data = dat.comb, dist = “loglogistic”)

Covariate W.mean Coef Exp(Coef) se(Coef) Wald p (Intercept) 103.089 134089.526 0.999 move.y not move 0.677 0 1 (reference) not displaced 0.274 -0.062 0.940 0.039 0.107 displaced 0.048 0.143 1.154 0.054 0.008 race nhw 0.577 0 1 (reference) nhb 0.323 0.478 1.613 0.033 0.000 hisp 0.086 0.376 1.457 0.051 0.000 asn 0.006 0.565 1.760 0.156 0.000 nh-other 0.009 0.261 1.298 0.151 0.083 educ hs+ 0.829 0 1 (reference) lths 0.171 0.638 1.892 0.034 0.000 age2 1895.106 0.000 1.000 0.000 0.000

log(scale) 7.646 52.094 0.883 log(shape) 7.853 0.012 0.000

Events 4524 Total time at risk 77962913 Max. log. likelihood -13650 LR test statistic 1291.80 Degrees of freedom 8 Overall p-value 0

plot(fit.exf)

plot(fit.weif)

plot(fit.lnf)

plot(fit.llf)

plot(fit.weif, fn='haz')

#summary(dat09)

#checking on model fit



#stargazer(fit.exf,fit.weif,fit.llf,fit.lnf, summary=F, style = "demography", type = "html",
       #   column.labels = c("Exponential","Weibull","LogLog","LogNorm"))
#stargazer(summary(fit.exf),summary(fit.weif),summary(fit.llf),summary(fit.lnf), summary=F, style = "demography", type = "html", column.labels = c("Exponential","Weibull","LogLog","LogNorm"))
        
#stargazer(fit.weif, summary=F, style = "demography", type = "html")

a. Did you choose an AFT or PH model and why?

I’m going to assume that the chance of moving to poorer health due to being displaced is reamins constant over time and therefore choose a proportional hazard model for this analysis.

b. Justify what parametric distribution you choose

I chose the weibull distribution. As can be seen from the below table:

stargazer(fit.exf,fit.weif,fit.llf,fit.lnf, summary=F, style = "demography", type = "html",
          column.labels = c("Exponential","Weibull","LogLog","LogNorm"), align = T) 
year
Exponential Weibull LogLog LogNorm
Model 1 Model 2 Model 3 Model 4
move.ynot displaced -0.123** -0.062 -0.062 -0.062
(0.038) (0.039) (0.039) (0.038)
move.ydisplaced 0.539*** 0.143** 0.143** 0.143**
(0.054) (0.054) (0.054) (0.054)
racenhb 0.545*** 0.478*** 0.478*** 0.478***
(0.033) (0.033) (0.033) (0.033)
racehisp 0.485*** 0.376*** 0.376*** 0.376***
(0.052) (0.051) (0.051) (0.051)
raceasn 0.695*** 0.565*** 0.565*** 0.565***
(0.156) (0.156) (0.156) (0.156)
racenh-other 0.226 0.261 0.261 0.261
(0.151) (0.151) (0.151) (0.150)
educlths 0.571*** 0.638*** 0.638*** 0.638***
(0.034) (0.034) (0.034) (0.034)
age2 0.0002*** 0.0002*** 0.0002*** 0.0002***
(0.00001) (0.00001) (0.00001) (0.00001)
log(scale) 10.632*** 7.606*** 7.646 7.646***
(0.034) (0.00002) (52.094) (0.001)
log(shape) 7.853*** 7.853*** 5.513***
(0.012) (0.012) (0.016)
Constant 103.089 52.669***
(134,089.500) (2.327)
N 38,843 38,843 38,843 38,843
Log Likelihood -47,895.200 -13,650.370 -13,650.370 -13,653.450
p < .05; p < .01; p < .001

The exponential model is the only model that has statistically significant differences. The weibull, log-normal and log-logistic all virtually identical coefficents and similar p-values. The Log likelihood, and thus AIC (same number of paramaters) is closer to zero for both the weibul and the log-logistic. When fitting the models, the log-logistic had some warnings while the weibull ran clean, so, all other things being equal, I went with the weibull.

c. Carry out model fit diagnostics for the model

emp <- coxreg(Surv(time, healtran)~move.y+race+educ+age2, data = dat.comb)
check.dist(sp=emp,pp=fit.exf, main="empirical vs exponential")

check.dist(sp=emp,pp=fit.weif, main="empirical vs Weibull")

check.dist(sp=emp,pp=fit.lnf, main="empirical vs Log Normal")

check.dist(sp=emp,pp=fit.llf, main="empirical vs Log Log")

This anlysis shows that the exponential is closest to the ideal, however, I dismiss the exponential as the appropriate model as its AIC is so much larger than the other models. This check does not provide evidence that I have a good fitting model

d. Include all main effects in the model

Thats what I’ve been working with.

e. Test for an interaction between at least two of the predictors

fit.weif.int <- phreg(Surv(year, healtran)~  move.y+race+educ+age2+move.y*race+move.y*educ ,
                 data= dat.comb, dist = "weibull")
#fit.weif.int
stargazer(fit.weif.int, summary=F, style = "demography", type = "html",
          column.labels = c("Weibull"), align = T)
year
Weibull
move.ynot displaced -0.035
(0.061)
move.ydisplaced 0.242**
(0.093)
racenhb 0.490***
(0.039)
racehisp 0.388***
(0.062)
raceasn 0.729***
(0.169)
racenh-other 0.442**
(0.160)
educlths 0.642***
(0.040)
age2 0.0002***
(0.00001)
move.ynot displaced:racenhb -0.022
(0.081)
move.ydisplaced:racenhb -0.117
(0.119)
move.ynot displaced:racehisp -0.069
(0.127)
move.ydisplaced:racehisp -0.009
(0.176)
move.ynot displaced:raceasn -0.830
(0.531)
move.ydisplaced:raceasn -0.784
(0.733)
move.ynot displaced:racenh-other -1.286
(0.727)
move.ydisplaced:racenh-other -0.820
(0.605)
move.ynot displaced:educlths 0.006
(0.085)
move.ydisplaced:educlths -0.072
(0.116)
log(scale) 7.606***
(0.00002)
log(shape) 7.853***
(0.012)
N 38,843
Log Likelihood -13,644.540
p < .05; p < .01; p < .001

After testing for interaactions between reasons for moving and race and education, it appears as if none of the effects is statistically significant.

f. Interpret your results and write them up

summary(fit.weif)
## Call:
## phreg(formula = Surv(year, healtran) ~ move.y + race + educ + 
##     age2, data = dat.comb, dist = "weibull")
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## move.y 
##         not move    0.677     0         1           (reference)
##    not displaced    0.274    -0.062     0.940     0.039     0.107 
##        displaced    0.048     0.143     1.154     0.054     0.008 
## race 
##              nhw    0.577     0         1           (reference)
##              nhb    0.323     0.478     1.613     0.033     0.000 
##             hisp    0.086     0.376     1.457     0.051     0.000 
##              asn    0.006     0.565     1.760     0.156     0.000 
##         nh-other    0.009     0.261     1.298     0.151     0.083 
## educ 
##              hs+    0.829     0         1           (reference)
##             lths    0.171     0.638     1.892     0.034     0.000 
## age2             1895.106     0.000     1.000     0.000     0.000 
## 
## log(scale)                    7.606               0.000     0.000 
## log(shape)                    7.853               0.012     0.000 
## 
## Events                    4524 
## Total time at risk        77962913 
## Max. log. likelihood      -13650 
## LR test statistic         1291.80 
## Degrees of freedom        8 
## Overall p-value           0

After examining the results of the wiebull proportional hazards model, we can see that there is no statistical difference between those who mved and were not displaced and thos that did not move, however there is a significant difference between those who were displaced and those who did not move (reference). Those that are displaced are 1.15 times more at risk of transitioning from good health to poor health. Eith respect to race, non-Hispanic Black (1.6 times), Hispanic (1.5 times) and asian (1.8 times) are all more likely to transition to poor health than non-Hispanic white. The results of this analysis gives evidence to support our hypoithesis that those who are displaced are more at risk of transitioning into poor health than either of the other two groups those who did not move and those who moved but were not displaced.

C. Write Up: (10 points, 1-3 pages on how you set up the analysis, and interpreted the results of your analysis)

dat.nmov <- subset(dat.comb, move.y == 'not move')
dat.mov <- subset(dat.comb, move.y == 'not displaced')
dat.dis <- subset(dat.comb, move.y == 'displaced')
race.nmov <- table(dat.nmov$race)
race.mov <- table(dat.mov$race)
race.dis <- table(dat.dis$race)
race.tot <- table(dat.comb$race)

edu.nmov <- table(dat.nmov$educ)
edu.mov <- table(dat.mov$educ)
edu.dis <- table(dat.dis$educ)
edu.tot <- table(dat.comb$educ)

nhw = c(race.nmov[1]/race.tot[1], race.mov[1]/race.tot[1], race.dis[1]/race.tot[1])
nhb = c(race.nmov[2]/race.tot[2], race.mov[2]/race.tot[2], race.dis[2]/race.tot[2])
hisp = c(race.nmov[3]/race.tot[3], race.mov[3]/race.tot[3], race.dis[3]/race.tot[3])
asn = c(race.nmov[4]/race.tot[4], race.mov[4]/race.tot[4], race.dis[4]/race.tot[4])
nh_other = c(race.nmov[5]/race.tot[5], race.mov[5]/race.tot[5], race.dis[5]/race.tot[5])

hs = c(edu.nmov[1]/edu.tot[1], edu.mov[1]/edu.tot[1], edu.dis[1]/edu.tot[1])
lths = c(edu.nmov[2]/edu.tot[2], edu.mov[2]/edu.tot[2], edu.dis[2]/edu.tot[2])

desc <- data.frame('nhw'=nhw,'nhb'=nhb,'hisp'=hisp,'asn'=asn,'nh_oth'=nh_other,
                   'hs_plus'=hs, 'lths'=lths)


#kable(desc, digits = 4,format = "html", row.names=c("not move", "not displaced", "displaced")

#stargazer(desc, summary=F,type = "html", align = T, row.labels = c("not move", "not dis", "dis"), column.labels = c("nhw","nhb","hisp","asn","nh_oth"))
#stargazer(desc, digits=3, summary = F, type = "text", rownames = T)

This should follow the format of a journal article’s methods, data and results sections

a. Descriptive write up consisting of

i. Tables of descriptive statistics

ii. Graphs of the necessary functions

iii. Results of statistical tests

iv. Interpretation of the results

Part C

There is a vast amount of qualitative literature that shows that people that are displaced from their homes often transition into poorer health than either of those who moved voluntarily, or those who did not move, but there is very little quantitaively to show this relationship exists. The purpose of this intial study will be to examine the transition of moving from good health to poor health amongst three groups of people: 1) those who did not move 2) those who moved voluntarily 3) those who were displaced. This study will utlize data from the Panel Study of Income Dynamics (PSID) from the University of Michigan with both family and individual data from 2005 (n=14,921), 2007 (n=15,338) and 2009 (n=16,681) waves.

stargazer(desc, digits=3, summary = F, type = "text", rownames = T)
## 
## ==============================================
##    nhw   nhb  hisp   asn  nh_oth hs_plus lths 
## ----------------------------------------------
## 1 0.711 0.626 0.643 0.683 0.709   0.677  0.680
## 2 0.252 0.307 0.302 0.267 0.238   0.278  0.257
## 3 0.037 0.066 0.055 0.050 0.053   0.045  0.062
## ----------------------------------------------
In the above table, the first row is those that did not move; the second is those who moved volunarily and the last row is those who were displaced.  The table shows the proportions by both race/ethnicity and education  of those people who participated in the study.  

The first step of our analysis is to see if there is some diffference in the self reported health of the three groups.

fit2
## Call:
## survdiff(formula = Surv(time, healtran) ~ move.y, data = datcomb)
## 
##                          N Observed Expected (O-E)^2/E (O-E)^2/V
## move.y=displaced      1874      387      309     19.95      24.6
## move.y=not displaced 10658      933     1155     42.72      65.7
## move.y=not move      26311     3204     3060      6.75      23.9
## 
##  Chisq= 79.6  on 2 degrees of freedom, p= <2e-16

From this test we can see that, at the .05 level of significance, that these is evidence of statistical significance that there is a difference between the transitions from good to poor health in these three groups.

Next, we examine the kaplan meier survor analysis to see if we can observe any differnces in the data:  
ggsurvplot(fit1, conf.int=T, ylim=c(.88,1), xlim=c(1,4),title = "Survival Function of the health of those people surveyed \n not moved vs move not displaced vs moved displaced")

From this graph, it appears that there is a difference between those who move and were not displaced and the other two groups: thoe that did not move and those that were displaced; but there is not sufficient evidence to state that those who were displaced is different than the other two groups.  With this in mind we shall further explore this relationship.  
Working with the Hypothesis of:  
H1: Those who are displaced from their homes are at greater risk of transitioning from good self reported health (SRH) to poor SRH than those who did not move or those that moved voluntarily.  
With the assumption that the risk of transition to poor health is a proportional hazard over time we will atempt to fit a parametric proportional hazards model to our data.  We will examine an exponential, a weibull, a log-normal and a log-logistic models:  
stargazer(fit.exf,fit.weif,fit.llf,fit.lnf, summary=F, style = "demography", type = "html",
         column.labels = c("Exponential","Weibull","LogLog","LogNorm"), align = T)
year
Exponential Weibull LogLog LogNorm
Model 1 Model 2 Model 3 Model 4
move.ynot displaced -0.123** -0.062 -0.062 -0.062
(0.038) (0.039) (0.039) (0.038)
move.ydisplaced 0.539*** 0.143** 0.143** 0.143**
(0.054) (0.054) (0.054) (0.054)
racenhb 0.545*** 0.478*** 0.478*** 0.478***
(0.033) (0.033) (0.033) (0.033)
racehisp 0.485*** 0.376*** 0.376*** 0.376***
(0.052) (0.051) (0.051) (0.051)
raceasn 0.695*** 0.565*** 0.565*** 0.565***
(0.156) (0.156) (0.156) (0.156)
racenh-other 0.226 0.261 0.261 0.261
(0.151) (0.151) (0.151) (0.150)
educlths 0.571*** 0.638*** 0.638*** 0.638***
(0.034) (0.034) (0.034) (0.034)
age2 0.0002*** 0.0002*** 0.0002*** 0.0002***
(0.00001) (0.00001) (0.00001) (0.00001)
log(scale) 10.632*** 7.606*** 7.646 7.646***
(0.034) (0.00002) (52.094) (0.001)
log(shape) 7.853*** 7.853*** 5.513***
(0.012) (0.012) (0.016)
Constant 103.089 52.669***
(134,089.500) (2.327)
N 38,843 38,843 38,843 38,843
Log Likelihood -47,895.200 -13,650.370 -13,650.370 -13,653.450
p < .05; p < .01; p < .001

From the above table we can see that the weibull and log-logistic models have the log likelihood closest to zero, so for fitting a model we will choose the weibull.

We will next examine the model fit diagnostics:

check.dist(sp=emp,pp=fit.weif, main="empirical vs Weibull")

The fit does not appear to be ideal, but after exaining the other models and not seeing one that appears better; we will press forward with this one.
We will next examine the covariates of the model:

fit.weif
## Call:
## phreg(formula = Surv(year, healtran) ~ move.y + race + educ + 
##     age2, data = dat.comb, dist = "weibull")
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## move.y 
##         not move    0.677     0         1           (reference)
##    not displaced    0.274    -0.062     0.940     0.039     0.107 
##        displaced    0.048     0.143     1.154     0.054     0.008 
## race 
##              nhw    0.577     0         1           (reference)
##              nhb    0.323     0.478     1.613     0.033     0.000 
##             hisp    0.086     0.376     1.457     0.051     0.000 
##              asn    0.006     0.565     1.760     0.156     0.000 
##         nh-other    0.009     0.261     1.298     0.151     0.083 
## educ 
##              hs+    0.829     0         1           (reference)
##             lths    0.171     0.638     1.892     0.034     0.000 
## age2             1895.106     0.000     1.000     0.000     0.000 
## 
## log(scale)                    7.606               0.000     0.000 
## log(shape)                    7.853               0.012     0.000 
## 
## Events                    4524 
## Total time at risk        77962913 
## Max. log. likelihood      -13650 
## LR test statistic         1291.80 
## Degrees of freedom        8 
## Overall p-value           0
From the results of this model we can see that, unlike the kaplan-meier survivorship plot, there is evidence to show a statistical difference between those that were displaced and the other two groups: those that did not move and those who moved voluntarily.  From this data we can see that those that are displaced have a 1.15 times more likely to trasntion from good health to poor health than someone who did not move.  We also see that, for the most part, race has an effect: non-Hispanic black (1.6 times), Hispanic (1.5 times), and asian (1.8 times) are all ore likely to transtion from good to poor health than the reference group, non-Hispanic white.  Education also appearss to have an effect as those with less than a high school education are 1.9 times more likely to transtion from good to poor health than those with at least a high school education.  

In follow on studies we will add more waves to the data in an attempt to further examine the the transition from good health to poor health on  these three groups of people.