DEM 7223 Homework 4 DEM 7223 - Event History Analysis - Cox Proportional Hazards Model Ricardo Lowe

library(car)
## Loading required package: carData
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(muhaz)
library(eha)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ stringr 1.4.0
## ✓ tidyr   1.1.1     ✓ forcats 0.5.0
## ✓ readr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
## Loading required package: ipumsr
usa_00122 <- read_ipums_ddi("usa_00122.xml")
usa_00122_ <- read_ipums_micro(usa_00122)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
ipums_view(usa_00122_)
#Variable Recodes/Creating Covariates 


usa_00122_$Grenada <- ifelse(usa_00122_$BPLD == 26094 | 
                              usa_00122_$BPLD >= 25000 & usa_00122_$BPLD <= 26054 |
                              usa_00122_$BPLD >= 26056 & usa_00122_$BPLD <= 26091, 0,
                       ifelse(usa_00122_$BPLD == 26055, 1,NA))


usa_00122_$Female <-   ifelse(usa_00122_$SEX == 1, 0, 1)
usa_00122_$HighEd <-   ifelse(usa_00122_$EDUC <= 6, 0, 1)
n2 <- usa_00122_%>%
  filter(YRIMMIG < 9000 & BIRTHYR < 9000 & RACE == 2)


n2$YRIMMIG3 <- as.numeric(ifelse(n2$YRIMMIG >= 2000,"",n2$YRIMMIG))



n2$secbi<-ifelse(is.na(n2$YRIMMIG3)=="",
                   "", 
                   (n2$YRIMMIG-n2$BIRTHYR))

n2$b2event<-ifelse(is.na(n2$YRIMMIG3)==T,0,1) 

n2$b2event<-as.numeric(n2$b2event)
n2$secbi<-as.numeric(n2$secbi)

fit<-survfit(Surv(secbi, b2event)~1, n2)
fit
## Call: survfit(formula = Surv(secbi, b2event) ~ 1, data = n2)
## 
##       n  events  median 0.95LCL 0.95UCL 
##  763654  429250      29      29      30
n2 %>% select(BIRTHYR,YRIMMIG,YRIMMIG3,b2event,secbi)
## # A tibble: 763,654 x 5
##      BIRTHYR                            YRIMMIG YRIMMIG3 b2event secbi
##    <dbl+lbl>                          <int+lbl>    <dbl>   <dbl> <dbl>
##  1      1973 1996                                   1996       1    23
##  2      1987 1988                                   1988       1     1
##  3      1988 2004                                     NA       0    16
##  4      1974 1988                                   1988       1    14
##  5      1991 1993                                   1993       1     2
##  6      1960 1982                                   1982       1    22
##  7      1983 2008                                     NA       0    25
##  8      1973 1980 [1980 (1980 PUMS: 1975-1980)]     1980       1     7
##  9      1977 2001                                     NA       0    24
## 10      1982 1992                                   1992       1    10
## # … with 763,644 more rows
n2
## # A tibble: 763,654 x 33
##     YEAR MULTYEAR       SAMPLE SERIAL CBSERIAL  HHWT CLUSTER STRATA      GQ
##    <int>    <dbl>    <int+lbl>  <dbl>    <dbl> <dbl>   <dbl>  <dbl> <int+l>
##  1  2014     2010 201403 [201…    251  2.01e12    22 2.01e12 130001 1 [Hou…
##  2  2014     2010 201403 [201…    715  2.01e12     5 2.01e12 260001 1 [Hou…
##  3  2014     2010 201403 [201…    957  2.01e12    11 2.01e12 190001 4 [Oth…
##  4  2014     2010 201403 [201…   1215  2.01e12     5 2.01e12 260001 1 [Hou…
##  5  2014     2010 201403 [201…   1233  2.01e12    16 2.01e12 180001 4 [Oth…
##  6  2014     2010 201403 [201…   1237  2.01e12    21 2.01e12  30001 1 [Hou…
##  7  2014     2010 201403 [201…   1237  2.01e12    21 2.01e12  30001 1 [Hou…
##  8  2014     2010 201403 [201…   1298  2.01e12    64 2.01e12 190001 1 [Hou…
##  9  2014     2010 201403 [201…   1328  2.01e12    61 2.01e12  20001 1 [Hou…
## 10  2014     2010 201403 [201…   1782  2.01e12    18 2.01e12 120001 1 [Hou…
## # … with 763,644 more rows, and 24 more variables: PERNUM <dbl>, PERWT <dbl>,
## #   SEX <int+lbl>, AGE <int+lbl>, MARST <int+lbl>, BIRTHYR <dbl+lbl>,
## #   RACE <int+lbl>, RACED <int+lbl>, HISPAN <int+lbl>, HISPAND <int+lbl>,
## #   BPL <int+lbl>, BPLD <int+lbl>, YRNATUR <int+lbl>, YRIMMIG <int+lbl>,
## #   SPEAKENG <int+lbl>, EDUC <int+lbl>, EDUCD <int+lbl>, LABFORCE <int+lbl>,
## #   Grenada <dbl>, Female <dbl>, HighEd <dbl>, YRIMMIG3 <dbl>, secbi <dbl>,
## #   b2event <dbl>
#using coxph in survival library

fit.cox1<-coxph(Surv(secbi,b2event)~Grenada+HighEd+Female, data=n2)
summary(fit.cox1)
## Call:
## coxph(formula = Surv(secbi, b2event) ~ Grenada + HighEd + Female, 
##     data = n2)
## 
##   n= 381234, number of events= 255624 
##    (382420 observations deleted due to missingness)
## 
##              coef exp(coef)  se(coef)      z Pr(>|z|)    
## Grenada  0.184762  1.202932  0.013560  13.62   <2e-16 ***
## HighEd   0.517671  1.678114  0.004012 129.03   <2e-16 ***
## Female  -0.086361  0.917263  0.003994 -21.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## Grenada    1.2029     0.8313    1.1714    1.2353
## HighEd     1.6781     0.5959    1.6650    1.6914
## Female     0.9173     1.0902    0.9101    0.9245
## 
## Concordance= 0.587  (se = 0.001 )
## Likelihood ratio test= 16885  on 3 df,   p=<2e-16
## Wald test            = 17247  on 3 df,   p=<2e-16
## Score (logrank) test = 17597  on 3 df,   p=<2e-16
plot(fit, conf.int=T, ylab="S(t)", xlab="Years")
title(main="Survival Function for Immigration Interval")

#use survey design

options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~CLUSTER, strata=~STRATA, data=n2, weight=~PERWT)

cox.s<-svycoxph(Surv(secbi,b2event)~Grenada+HighEd, design=des)
summary(cox.s)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (454452) clusters.
## svydesign(ids = ~CLUSTER, strata = ~STRATA, data = n2, weight = ~PERWT)
## Call:
## svycoxph(formula = Surv(secbi, b2event) ~ Grenada + HighEd, design = des)
## 
##   n= 381234, number of events= 255624 
##    (382420 observations deleted due to missingness)
## 
##             coef exp(coef) se(coef) robust se     z Pr(>|z|)    
## Grenada 0.207708  1.230854 0.014533  0.016249 12.78   <2e-16 ***
## HighEd  0.543778  1.722502 0.004187  0.005575 97.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## Grenada     1.231     0.8124     1.192     1.271
## HighEd      1.723     0.5806     1.704     1.741
## 
## Concordance= 0.584  (se = 0.001 )
## Likelihood ratio test= NA  on 2 df,   p=NA
## Wald test            = 9711  on 2 df,   p=<2e-16
## Score (logrank) test = NA  on 2 df,   p=NA
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).
dat<-expand.grid(Grenada=c(0,1), Female=c(0,1), HighEd=c(0,1), b2event=1) 
head(dat)
##   Grenada Female HighEd b2event
## 1       0      0      0       1
## 2       1      0      0       1
## 3       0      1      0       1
## 4       1      1      0       1
## 5       0      0      1       1
## 6       1      0      1       1
#plot some survival function estimates for various types of children
plot(survfit(fit.cox1, conf.int = F), xlim=c(0, 90), ylab="S(t)", xlab="Years")
title (main = "Survival Plots for for Immigration Interval")
lines(survfit(fit.cox1,
newdata=data.frame(Grenada=1, HighEd=1, Female=1), conf.int = F), col="red")
lines(survfit(fit.cox1,
newdata=data.frame(Grenada=0, HighEd=1, Female=1), conf.int = F), col="green")
legend("topright",
legend=c("Means of Covariates", "Grenada high edu men", "Carib high edu men"), lty=1, col=c(1,"red", "green"), cex=.8)

#Now we look at the cumulative hazard functions
sf1<-survfit(fit.cox1) 
sf2<-survfit(fit.cox1,
newdata=data.frame(Grenada=1, Female=1, HighEd=1)) 
sf3<-survfit(fit.cox1,
newdata=data.frame(Grenada=0, Female=1, HighEd=1))

H1<--log(sf1$surv) 
H2<--log(sf2$surv) 
H3<--log(sf3$surv)

times<-sf1$time
plot(H1~times, type="s", ylab="H(t)",ylim=c(0, 6), xlab="Years") 
lines(H2~times,col="red", type="s")
lines(H3~times,col="green", type="s")
legend("topleft",
legend=c("Means of Covariates", "Grenada high edu", "Carib high edu"), lty=1, col=c(1,"red", "green"), cex=.8)

#and the hazard function
hs1<-loess(diff(c(0,H1))~times, degree=1, span=.25) 
hs2<-loess(diff(c(0,H2))~times, degree=1, span=.25) 
hs3<-loess(diff(c(0,H3))~times, degree=1, span=.25)

plot(predict(hs1)~times,type="l", ylab="smoothed h(t)", xlab="Years", ylim=c(0, .10))
title(main="Smoothed hazard plots") 
lines(predict(hs2)~times, type="l", col="red") 
lines(predict(hs3)~times, type="l", col="green") 
legend("topright",
legend=c("Means of Covariates", "Grenada high edu men", "Carib high edu men"), lty=1, col=c(1,"red", "green"), cex=.8)

options(survey.lonely.psu = "adjust") 
des<-svydesign(ids=~CLUSTER, strata=~STRATA,
  data=n2, weight=~PERWT )
cox.s<-svycoxph(Surv(secbi,b2event)~Grenada+Female+HighEd, design=des)
summary(cox.s)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (454452) clusters.
## svydesign(ids = ~CLUSTER, strata = ~STRATA, data = n2, weight = ~PERWT)
## Call:
## svycoxph(formula = Surv(secbi, b2event) ~ Grenada + Female + 
##     HighEd, design = des)
## 
##   n= 381234, number of events= 255624 
##    (382420 observations deleted due to missingness)
## 
##              coef exp(coef)  se(coef) robust se      z Pr(>|z|)    
## Grenada  0.207474  1.230566  0.014533  0.016299  12.73   <2e-16 ***
## Female  -0.098739  0.905979  0.004159  0.004967 -19.88   <2e-16 ***
## HighEd   0.544049  1.722970  0.004186  0.005580  97.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## Grenada     1.231     0.8126    1.1919    1.2705
## Female      0.906     1.1038    0.8972    0.9148
## HighEd      1.723     0.5804    1.7042    1.7419
## 
## Concordance= 0.591  (se = 0.001 )
## Likelihood ratio test= NA  on 3 df,   p=NA
## Wald test            = 10026  on 3 df,   p=<2e-16
## Score (logrank) test = NA  on 3 df,   p=NA
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).
#Schoenfeld test
fit.test<-cox.zph(cox.s) 
fit.test
##          chisq df       p
## Grenada  128.1  1 < 2e-16
## Female    48.6  1 3.2e-12
## HighEd  4400.8  1 < 2e-16
## GLOBAL  4586.4  3 < 2e-16
library("survival")
library("survminer")
## Loading required package: ggpubr
plot(fit.test, df=2)

fita<-aareg(Surv(secbi,b2event)~Grenada+Female+HighEd,#+cluster(strata), 
            n2)#, weights = PERWT)
summary(fita)
## $table
##                 slope          coef     se(coef)         z             p
## Intercept  0.03961433  4.222517e-06 1.645523e-08 256.60641  0.000000e+00
## Grenada    0.00668357  9.127322e-07 7.654802e-08  11.92366  8.911074e-33
## Female    -0.00400410 -4.631156e-07 1.991214e-08 -23.25795 1.181978e-119
## HighEd     0.03472919  2.543306e-06 2.097935e-08 121.22901  0.000000e+00
## 
## $test
## [1] "aalen"
## 
## $test.statistic
## Intercept   Grenada    Female    HighEd 
## 74182.040   879.652 -5822.391 30512.222 
## 
## $test.var
##            b0                                        
## b0  83572.337 -1560.774740 -54293.65809 -25271.724954
##     -1560.775  5442.556685    -21.51278      1.761801
##    -54293.658   -21.512783  62670.02112   2188.377384
##    -25271.725     1.761801   2188.37738  63348.241439
## 
## $test.var2
## NULL
## 
## $chisq
##          [,1]
## [1,] 15593.03
## 
## $n
## [1] 381234     86     86
## 
## attr(,"class")
## [1] "summary.aareg"
#install.packages("ggfortify")
library(ggfortify)
autoplot(fita)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `mutate_()` is deprecated as of dplyr 0.7.0.
## Please use `mutate()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.