Homework 4 - Cox Regression Models

Author

Jules Gonzalez

Data

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(haven)
library(survival)
library(car)

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

library(survey)

Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart

library(muhaz)
library(eha)
library(ipumsr)

ddi <- read_ipums_ddi("nhis_00008.xml")
alc <- read_ipums_micro(ddi)

Use of data from IPUMS NHIS is subject to conditions including that users
should cite the data appropriately. Use command `ipums_conditions()` for more
details.

alc<-haven::zap_labels(alc)

nams<-names(alc)

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(alc)<-newnames

Recode

library(dplyr)

#rent or own 
alc$home<-Recode(alc$ownership, recodes="10:12='own'; 20='rent'; else=NA", as.factor=T)

#us citizen
alc$us<-Recode(alc$citizen, recodes="1='nous'; 2='yesus'; else=NA", as.factor=T)
summary(alc$us)

  nous  yesus   NA's 
 94603 890065   8119

#marital status
alc$marital<-car::Recode(as.numeric(alc$marst),recodes = "10:13='married'; 20='widowed'; 30='divorced'; 40='seperated';50='never married';else=NA")

 #sex                                    
alc$sex<-Recode(alc$sex, recodes="1='male'; 2='female'; else=NA", as.factor=T)

#smoker status
alc$smoke<-car::Recode(as.numeric(alc$smokestatus1),recodes = "10='never smoked'; 20='current smoker'; 30='former smoker';else=NA")

#asthma_ER
alc$aer<-car::Recode(as.numeric(alc$astheryr),recodes = "1='no er'; 2='yes er';else=NA")

#asthma attack
alc$aattack<-car::Recode(as.numeric(alc$asthatakyr),recodes = "1='no'; 2='yes';else=NA")

#poverty level
alc$povl<-car::Recode(as.numeric(alc$pooryn),recodes = "1='at or above povl'; 2='below povl';else=NA")

#education
alc$edu<-car::Recode(as.numeric(alc$educrec1),recodes = "01:13='lths'; 14='somecollege'; 15='bachelors';16='five+ yrs of college';else=NA")

#ever had lung cancer
alc$lung<-car::Recode(as.numeric(alc$cnlung),recodes = "0:1=0; 2=1;else=NA")

alc$aev<-car::Recode(as.numeric(alc$asthmaev),recodes = "0:1=0; 2=1;else=NA")

alc <- alc %>%
  filter(mortelig == 1, mortdody<9999, is.na(aev)==F, is.na(edu)==F)

Define your outcome as in HW 1.

Asthma based on education.

What covariates are hypothesized to affect the outcome variable?

Education level as a risk factor for having asthma.

Include all main effects in the model.

alc <- alc %>%
  mutate(death_time = ifelse( mortelig ==1, 
                             mortdody - year , 
                             2002:2012 - year ), 
         d5.event = ifelse(mortelig == 1 & death_time <= 5, 1, 0))

library(survival)

time_fit <- survfit(Surv(death_time, d5.event) ~ 1, 
                  conf.type = "log",
                   data = alc)

library(ggsurvfit)

time_fit %>%
  ggsurvfit()+
  xlim(-1, 6) +
  add_censor_mark() +
  add_confidence_interval()+
  add_quantile(y_value = .975)

Warning: Removed 11 row(s) containing missing values (geom_path).

Test for an interaction between at least two of the predictors

#contructing survey design and fitting basic cox model


options(survey.lonely.psu = "adjust")

des2<-svydesign(ids = ~psu,
                strata = ~strata,
                weights=~mortwtsa, 
                data=alc,
                nest=TRUE)

aa<-svycoxph(Surv(death_time, d5.event)~edu+aev,
                  design = des2)




summary(aa)

Stratified 1 - level Cluster Sampling design (with replacement)
With (1278) clusters.
svydesign(ids = ~psu, strata = ~strata, weights = ~mortwtsa, 
    data = alc, nest = TRUE)

Call:
svycoxph(formula = Surv(death_time, d5.event) ~ edu + aev, design = des2)

  n= 46046, number of events= 18641 

                            coef exp(coef) se(coef) robust se      z Pr(>|z|)
edufive+ yrs of college  0.01543   1.01555  0.02918   0.04454  0.347 0.728910
edulths                  0.10435   1.10998  0.01932   0.03114  3.351 0.000804
edusomecollege          -0.01249   0.98759  0.02153   0.03374 -0.370 0.711287
aev                      0.11274   1.11934  0.01646   0.01872  6.023 1.72e-09

edufive+ yrs of college    
edulths                 ***
edusomecollege             
aev                     ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                        exp(coef) exp(-coef) lower .95 upper .95
edufive+ yrs of college    1.0156     0.9847    0.9307     1.108
edulths                    1.1100     0.9009    1.0443     1.180
edusomecollege             0.9876     1.0126    0.9244     1.055
aev                        1.1193     0.8934    1.0790     1.161

Concordance= 0.518  (se = 0.003 )
Likelihood ratio test= NA  on 4 df,   p=NA
Wald test            = 70.06  on 4 df,   p=2e-14
Score (logrank) test = NA  on 4 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

#Fit the model
fitl1<-svycoxph(Surv(time = death_time, event = aev)~edu+aev, design=des2)

Warning in coxph(formula = Surv(time = death_time, event = aev) ~ edu + : a
variable appears on both the left and right sides of the formula

Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
Loglik converged before variable 4 ; coefficient may be infinite.

summary(fitl1)

Stratified 1 - level Cluster Sampling design (with replacement)
With (1278) clusters.
svydesign(ids = ~psu, strata = ~strata, weights = ~mortwtsa, 
    data = alc, nest = TRUE)

Call:
svycoxph(formula = Surv(time = death_time, event = aev) ~ edu + 
    aev, design = des2)

  n= 46046, number of events= 5727 

                              coef  exp(coef)   se(coef)  robust se      z
edufive+ yrs of college -1.957e-02  9.806e-01  5.572e-02  7.847e-02 -0.249
edulths                  1.013e-01  1.107e+00  3.669e-02  5.417e-02  1.871
edusomecollege           5.411e-02  1.056e+00  3.979e-02  5.599e-02  0.966
aev                      2.140e+01  1.970e+09  1.512e+02  7.625e+05  0.000
                        Pr(>|z|)  
edufive+ yrs of college   0.8031  
edulths                   0.0614 .
edusomecollege            0.3339  
aev                       1.0000  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                        exp(coef) exp(-coef) lower .95 upper .95
edufive+ yrs of college 9.806e-01  1.020e+00    0.8408     1.144
edulths                 1.107e+00  9.036e-01    0.9952     1.231
edusomecollege          1.056e+00  9.473e-01    0.9459     1.178
aev                     1.970e+09  5.076e-10    0.0000       Inf

Concordance= 0.949  (se = 0.001 )
Likelihood ratio test= NA  on 4 df,   p=NA
Wald test            = 6  on 4 df,   p=0.2
Score (logrank) test = NA  on 4 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

Individuals with a less than high school education are more likely to experience the hazard compared to individuals with five years or more of college.

#Shoenfeld residuals

schoenresid<-resid(fitl1, type="schoenfeld")

fit.sr<-lm(schoenresid~des2$variables$death_time[des2$variables$aev==1])

fit.sr%>%
  broom::tidy()%>%
  filter(term != "(Intercept)")%>%
  select(response, estimate, statistic, p.value)

# A tibble: 4 × 4
  response                  estimate statistic p.value
  <chr>                        <dbl>     <dbl>   <dbl>
1 edufive+ yrs of college -0.000279    -0.381    0.703
2 edulths                  0.000171     0.111    0.912
3 edusomecollege           0.0000523    0.0391   0.969
4 aev                      0.00124      1.54     0.124

Variables not correlated. No significant model here.

#Grambsch and Therneau
fit.test<-cox.zph(fitl1)
fit.test

          chisq df p
edu    1.20e-03  3 1
aev    2.14e-06  1 1
GLOBAL 1.21e-03  4 1

#plot for G&T 
plot(fit.test, df=2)

G&T Fit test and Plot indicate no significant results.

Produce plots of the survival function and the cumulative hazard function for different “risk profiles”, as done in the notes

##survivalship
plot(survfit(aa),xlim=c(6,9),
     main="survivorship cox regression ")

Evaluate the assumptions of the Cox model, including proportionality of hazards of covariates, and if you use a continuous variable, evaluate linearity of the effect using Martingale residuals

#fiting Cox model

cox.s<-svycoxph(Surv(death_time,d5.event)~aev+edu,
                design=des2)
summary(cox.s)

Stratified 1 - level Cluster Sampling design (with replacement)
With (1278) clusters.
svydesign(ids = ~psu, strata = ~strata, weights = ~mortwtsa, 
    data = alc, nest = TRUE)

Call:
svycoxph(formula = Surv(death_time, d5.event) ~ aev + edu, design = des2)

  n= 46046, number of events= 18641 

                            coef exp(coef) se(coef) robust se      z Pr(>|z|)
aev                      0.11274   1.11934  0.01646   0.01872  6.023 1.72e-09
edufive+ yrs of college  0.01543   1.01555  0.02918   0.04454  0.347 0.728910
edulths                  0.10435   1.10998  0.01932   0.03114  3.351 0.000804
edusomecollege          -0.01249   0.98759  0.02153   0.03374 -0.370 0.711287

aev                     ***
edufive+ yrs of college    
edulths                 ***
edusomecollege             
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                        exp(coef) exp(-coef) lower .95 upper .95
aev                        1.1193     0.8934    1.0790     1.161
edufive+ yrs of college    1.0156     0.9847    0.9307     1.108
edulths                    1.1100     0.9009    1.0443     1.180
edusomecollege             0.9876     1.0126    0.9244     1.055

Concordance= 0.518  (se = 0.003 )
Likelihood ratio test= NA  on 4 df,   p=NA
Wald test            = 70.06  on 4 df,   p=2e-14
Score (logrank) test = NA  on 4 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

#Schoenfeld test
fit.test<-cox.zph(cox.s)
fit.test

          chisq df    p
aev    8.14e-05  1 0.99
edu    7.34e-04  3 1.00
GLOBAL 8.08e-04  4 1.00

plot(fit.test, df=2)

Aalen’s additive regression model

#Aalen's additive regression model

fita<-aareg(Surv(death_time,d5.event)~edu+aev+(strata),
            alc, weights = mortwtsa)

summary(fita)

                            slope      coef se(coef)       z         p
Intercept               -1.35e-01 -3.54e-05 1.47e-06 -24.000 1.23e-127
edufive+ yrs of college  1.15e-03  4.71e-07 6.87e-07   0.686  4.93e-01
edulths                  1.27e-02  1.87e-06 4.47e-07   4.170  3.04e-05
edusomecollege          -9.36e-04  8.47e-08 4.88e-07   0.173  8.62e-01
aev                      7.70e-03  1.21e-06 4.29e-07   2.820  4.85e-03
strata                   3.89e-05  8.59e-09 2.58e-10  33.300 1.12e-242

Chisq=1165.72 on 5 df, p=<2e-16; test weights=aalen

library(ggfortify)
autoplot(fita)

Warning: `gather_()` was deprecated in tidyr 1.2.0.
Please use `gather()` instead.

Plots show less than high school education to increase in hazard (ever been told they had asthma). some to higher education seems to decrease the hazard.