DEM 7223 Homework 4 DEM 7223 - Event History Analysis - Cox Proportional Hazards Model Ricardo Lowe
library(car)
## Loading required package: carData
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(muhaz)
library(eha)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ stringr 1.4.0
## ✓ tidyr 1.1.1 ✓ forcats 0.5.0
## ✓ readr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
## x tidyr::unpack() masks Matrix::unpack()
if (!require("ipumsr")) stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
## Loading required package: ipumsr
usa_00122 <- read_ipums_ddi("usa_00122.xml")
usa_00122_ <- read_ipums_micro(usa_00122)
## Use of data from IPUMS USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
ipums_view(usa_00122_)
#Variable Recodes/Creating Covariates
usa_00122_$Grenada <- ifelse(usa_00122_$BPLD == 26094 |
usa_00122_$BPLD >= 25000 & usa_00122_$BPLD <= 26054 |
usa_00122_$BPLD >= 26056 & usa_00122_$BPLD <= 26091, 0,
ifelse(usa_00122_$BPLD == 26055, 1,NA))
usa_00122_$Female <- ifelse(usa_00122_$SEX == 1, 0, 1)
usa_00122_$HighEd <- ifelse(usa_00122_$EDUC <= 6, 0, 1)
n2 <- usa_00122_%>%
filter(YRIMMIG < 9000 & BIRTHYR < 9000 & RACE == 2)
n2$YRIMMIG3 <- as.numeric(ifelse(n2$YRIMMIG >= 2000,"",n2$YRIMMIG))
n2$secbi<-ifelse(is.na(n2$YRIMMIG3)=="",
"",
(n2$YRIMMIG-n2$BIRTHYR))
n2$b2event<-ifelse(is.na(n2$YRIMMIG3)==T,0,1)
n2$b2event<-as.numeric(n2$b2event)
n2$secbi<-as.numeric(n2$secbi)
fit<-survfit(Surv(secbi, b2event)~1, n2)
fit
## Call: survfit(formula = Surv(secbi, b2event) ~ 1, data = n2)
##
## n events median 0.95LCL 0.95UCL
## 763654 429250 29 29 30
n2 %>% select(BIRTHYR,YRIMMIG,YRIMMIG3,b2event,secbi)
## # A tibble: 763,654 x 5
## BIRTHYR YRIMMIG YRIMMIG3 b2event secbi
## <dbl+lbl> <int+lbl> <dbl> <dbl> <dbl>
## 1 1973 1996 1996 1 23
## 2 1987 1988 1988 1 1
## 3 1988 2004 NA 0 16
## 4 1974 1988 1988 1 14
## 5 1991 1993 1993 1 2
## 6 1960 1982 1982 1 22
## 7 1983 2008 NA 0 25
## 8 1973 1980 [1980 (1980 PUMS: 1975-1980)] 1980 1 7
## 9 1977 2001 NA 0 24
## 10 1982 1992 1992 1 10
## # … with 763,644 more rows
n2
## # A tibble: 763,654 x 33
## YEAR MULTYEAR SAMPLE SERIAL CBSERIAL HHWT CLUSTER STRATA GQ
## <int> <dbl> <int+lbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int+l>
## 1 2014 2010 201403 [201… 251 2.01e12 22 2.01e12 130001 1 [Hou…
## 2 2014 2010 201403 [201… 715 2.01e12 5 2.01e12 260001 1 [Hou…
## 3 2014 2010 201403 [201… 957 2.01e12 11 2.01e12 190001 4 [Oth…
## 4 2014 2010 201403 [201… 1215 2.01e12 5 2.01e12 260001 1 [Hou…
## 5 2014 2010 201403 [201… 1233 2.01e12 16 2.01e12 180001 4 [Oth…
## 6 2014 2010 201403 [201… 1237 2.01e12 21 2.01e12 30001 1 [Hou…
## 7 2014 2010 201403 [201… 1237 2.01e12 21 2.01e12 30001 1 [Hou…
## 8 2014 2010 201403 [201… 1298 2.01e12 64 2.01e12 190001 1 [Hou…
## 9 2014 2010 201403 [201… 1328 2.01e12 61 2.01e12 20001 1 [Hou…
## 10 2014 2010 201403 [201… 1782 2.01e12 18 2.01e12 120001 1 [Hou…
## # … with 763,644 more rows, and 24 more variables: PERNUM <dbl>, PERWT <dbl>,
## # SEX <int+lbl>, AGE <int+lbl>, MARST <int+lbl>, BIRTHYR <dbl+lbl>,
## # RACE <int+lbl>, RACED <int+lbl>, HISPAN <int+lbl>, HISPAND <int+lbl>,
## # BPL <int+lbl>, BPLD <int+lbl>, YRNATUR <int+lbl>, YRIMMIG <int+lbl>,
## # SPEAKENG <int+lbl>, EDUC <int+lbl>, EDUCD <int+lbl>, LABFORCE <int+lbl>,
## # Grenada <dbl>, Female <dbl>, HighEd <dbl>, YRIMMIG3 <dbl>, secbi <dbl>,
## # b2event <dbl>
#using coxph in survival library
fit.cox1<-coxph(Surv(secbi,b2event)~Grenada+HighEd+Female, data=n2)
summary(fit.cox1)
## Call:
## coxph(formula = Surv(secbi, b2event) ~ Grenada + HighEd + Female,
## data = n2)
##
## n= 381234, number of events= 255624
## (382420 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## Grenada 0.184762 1.202932 0.013560 13.62 <2e-16 ***
## HighEd 0.517671 1.678114 0.004012 129.03 <2e-16 ***
## Female -0.086361 0.917263 0.003994 -21.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Grenada 1.2029 0.8313 1.1714 1.2353
## HighEd 1.6781 0.5959 1.6650 1.6914
## Female 0.9173 1.0902 0.9101 0.9245
##
## Concordance= 0.587 (se = 0.001 )
## Likelihood ratio test= 16885 on 3 df, p=<2e-16
## Wald test = 17247 on 3 df, p=<2e-16
## Score (logrank) test = 17597 on 3 df, p=<2e-16
plot(fit, conf.int=T, ylab="S(t)", xlab="Years")
title(main="Survival Function for Immigration Interval")
#use survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~CLUSTER, strata=~STRATA, data=n2, weight=~PERWT)
cox.s<-svycoxph(Surv(secbi,b2event)~Grenada+HighEd, design=des)
summary(cox.s)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (454452) clusters.
## svydesign(ids = ~CLUSTER, strata = ~STRATA, data = n2, weight = ~PERWT)
## Call:
## svycoxph(formula = Surv(secbi, b2event) ~ Grenada + HighEd, design = des)
##
## n= 381234, number of events= 255624
## (382420 observations deleted due to missingness)
##
## coef exp(coef) se(coef) robust se z Pr(>|z|)
## Grenada 0.207708 1.230854 0.014533 0.016249 12.78 <2e-16 ***
## HighEd 0.543778 1.722502 0.004187 0.005575 97.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Grenada 1.231 0.8124 1.192 1.271
## HighEd 1.723 0.5806 1.704 1.741
##
## Concordance= 0.584 (se = 0.001 )
## Likelihood ratio test= NA on 2 df, p=NA
## Wald test = 9711 on 2 df, p=<2e-16
## Score (logrank) test = NA on 2 df, p=NA
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).
dat<-expand.grid(Grenada=c(0,1), Female=c(0,1), HighEd=c(0,1), b2event=1)
head(dat)
## Grenada Female HighEd b2event
## 1 0 0 0 1
## 2 1 0 0 1
## 3 0 1 0 1
## 4 1 1 0 1
## 5 0 0 1 1
## 6 1 0 1 1
#plot some survival function estimates for various types of children
plot(survfit(fit.cox1, conf.int = F), xlim=c(0, 90), ylab="S(t)", xlab="Years")
title (main = "Survival Plots for for Immigration Interval")
lines(survfit(fit.cox1,
newdata=data.frame(Grenada=1, HighEd=1, Female=1), conf.int = F), col="red")
lines(survfit(fit.cox1,
newdata=data.frame(Grenada=0, HighEd=1, Female=1), conf.int = F), col="green")
legend("topright",
legend=c("Means of Covariates", "Grenada high edu men", "Carib high edu men"), lty=1, col=c(1,"red", "green"), cex=.8)
#Now we look at the cumulative hazard functions
sf1<-survfit(fit.cox1)
sf2<-survfit(fit.cox1,
newdata=data.frame(Grenada=1, Female=1, HighEd=1))
sf3<-survfit(fit.cox1,
newdata=data.frame(Grenada=0, Female=1, HighEd=1))
H1<--log(sf1$surv)
H2<--log(sf2$surv)
H3<--log(sf3$surv)
times<-sf1$time
plot(H1~times, type="s", ylab="H(t)",ylim=c(0, 6), xlab="Years")
lines(H2~times,col="red", type="s")
lines(H3~times,col="green", type="s")
legend("topleft",
legend=c("Means of Covariates", "Grenada high edu", "Carib high edu"), lty=1, col=c(1,"red", "green"), cex=.8)
#and the hazard function
hs1<-loess(diff(c(0,H1))~times, degree=1, span=.25)
hs2<-loess(diff(c(0,H2))~times, degree=1, span=.25)
hs3<-loess(diff(c(0,H3))~times, degree=1, span=.25)
plot(predict(hs1)~times,type="l", ylab="smoothed h(t)", xlab="Years", ylim=c(0, .10))
title(main="Smoothed hazard plots")
lines(predict(hs2)~times, type="l", col="red")
lines(predict(hs3)~times, type="l", col="green")
legend("topright",
legend=c("Means of Covariates", "Grenada high edu men", "Carib high edu men"), lty=1, col=c(1,"red", "green"), cex=.8)
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~CLUSTER, strata=~STRATA,
data=n2, weight=~PERWT )
cox.s<-svycoxph(Surv(secbi,b2event)~Grenada+Female+HighEd, design=des)
summary(cox.s)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (454452) clusters.
## svydesign(ids = ~CLUSTER, strata = ~STRATA, data = n2, weight = ~PERWT)
## Call:
## svycoxph(formula = Surv(secbi, b2event) ~ Grenada + Female +
## HighEd, design = des)
##
## n= 381234, number of events= 255624
## (382420 observations deleted due to missingness)
##
## coef exp(coef) se(coef) robust se z Pr(>|z|)
## Grenada 0.207474 1.230566 0.014533 0.016299 12.73 <2e-16 ***
## Female -0.098739 0.905979 0.004159 0.004967 -19.88 <2e-16 ***
## HighEd 0.544049 1.722970 0.004186 0.005580 97.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## Grenada 1.231 0.8126 1.1919 1.2705
## Female 0.906 1.1038 0.8972 0.9148
## HighEd 1.723 0.5804 1.7042 1.7419
##
## Concordance= 0.591 (se = 0.001 )
## Likelihood ratio test= NA on 3 df, p=NA
## Wald test = 10026 on 3 df, p=<2e-16
## Score (logrank) test = NA on 3 df, p=NA
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).
#Schoenfeld test
fit.test<-cox.zph(cox.s)
fit.test
## chisq df p
## Grenada 128.1 1 < 2e-16
## Female 48.6 1 3.2e-12
## HighEd 4400.8 1 < 2e-16
## GLOBAL 4586.4 3 < 2e-16
library("survival")
library("survminer")
## Loading required package: ggpubr
plot(fit.test, df=2)
fita<-aareg(Surv(secbi,b2event)~Grenada+Female+HighEd,#+cluster(strata),
n2)#, weights = PERWT)
summary(fita)
## $table
## slope coef se(coef) z p
## Intercept 0.03961433 4.222517e-06 1.645523e-08 256.60641 0.000000e+00
## Grenada 0.00668357 9.127322e-07 7.654802e-08 11.92366 8.911074e-33
## Female -0.00400410 -4.631156e-07 1.991214e-08 -23.25795 1.181978e-119
## HighEd 0.03472919 2.543306e-06 2.097935e-08 121.22901 0.000000e+00
##
## $test
## [1] "aalen"
##
## $test.statistic
## Intercept Grenada Female HighEd
## 74182.040 879.652 -5822.391 30512.222
##
## $test.var
## b0
## b0 83572.337 -1560.774740 -54293.65809 -25271.724954
## -1560.775 5442.556685 -21.51278 1.761801
## -54293.658 -21.512783 62670.02112 2188.377384
## -25271.725 1.761801 2188.37738 63348.241439
##
## $test.var2
## NULL
##
## $chisq
## [,1]
## [1,] 15593.03
##
## $n
## [1] 381234 86 86
##
## attr(,"class")
## [1] "summary.aareg"
#install.packages("ggfortify")
library(ggfortify)
autoplot(fita)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `mutate_()` is deprecated as of dplyr 0.7.0.
## Please use `mutate()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.