Homework 4 - Cox Regression Models

Author

Jules Gonzalez

Data

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(haven)
library(survival)
library(car)
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some
library(survey)
Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart
library(muhaz)
library(eha)
library(ipumsr)

ddi <- read_ipums_ddi("nhis_00008.xml")
alc <- read_ipums_micro(ddi)
Use of data from IPUMS NHIS is subject to conditions including that users
should cite the data appropriately. Use command `ipums_conditions()` for more
details.
alc<-haven::zap_labels(alc)

nams<-names(alc)

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(alc)<-newnames

Recode

library(dplyr)

#rent or own 
alc$home<-Recode(alc$ownership, recodes="10:12='own'; 20='rent'; else=NA", as.factor=T)

#us citizen
alc$us<-Recode(alc$citizen, recodes="1='nous'; 2='yesus'; else=NA", as.factor=T)
summary(alc$us)
  nous  yesus   NA's 
 94603 890065   8119 
#marital status
alc$marital<-car::Recode(as.numeric(alc$marst),recodes = "10:13='married'; 20='widowed'; 30='divorced'; 40='seperated';50='never married';else=NA")

 #sex                                    
alc$sex<-Recode(alc$sex, recodes="1='male'; 2='female'; else=NA", as.factor=T)

#smoker status
alc$smoke<-car::Recode(as.numeric(alc$smokestatus1),recodes = "10='never smoked'; 20='current smoker'; 30='former smoker';else=NA")

#asthma_ER
alc$aer<-car::Recode(as.numeric(alc$astheryr),recodes = "1='no er'; 2='yes er';else=NA")

#asthma attack
alc$aattack<-car::Recode(as.numeric(alc$asthatakyr),recodes = "1='no'; 2='yes';else=NA")

#poverty level
alc$povl<-car::Recode(as.numeric(alc$pooryn),recodes = "1='at or above povl'; 2='below povl';else=NA")

#education
alc$edu<-car::Recode(as.numeric(alc$educrec1),recodes = "01:13='lths'; 14='somecollege'; 15='bachelors';16='five+ yrs of college';else=NA")

#ever had lung cancer
alc$lung<-car::Recode(as.numeric(alc$cnlung),recodes = "0:1=0; 2=1;else=NA")

alc$aev<-car::Recode(as.numeric(alc$asthmaev),recodes = "0:1=0; 2=1;else=NA")

alc <- alc %>%
  filter(mortelig == 1, mortdody<9999, is.na(aev)==F, is.na(edu)==F)

Define your outcome as in HW 1.

Asthma based on education.

What covariates are hypothesized to affect the outcome variable?

Education level as a risk factor for having asthma.

Include all main effects in the model.

  1. alc <- alc %>%
      mutate(death_time = ifelse( mortelig ==1, 
                                 mortdody - year , 
                                 2002:2012 - year ), 
             d5.event = ifelse(mortelig == 1 & death_time <= 5, 1, 0))
    
    library(survival)
    
    time_fit <- survfit(Surv(death_time, d5.event) ~ 1, 
                      conf.type = "log",
                       data = alc)
    
    library(ggsurvfit)
    
    time_fit %>%
      ggsurvfit()+
      xlim(-1, 6) +
      add_censor_mark() +
      add_confidence_interval()+
      add_quantile(y_value = .975)
    Warning: Removed 11 row(s) containing missing values (geom_path).

  2. Test for an interaction between at least two of the predictors

    #contructing survey design and fitting basic cox model
    
    
    options(survey.lonely.psu = "adjust")
    
    des2<-svydesign(ids = ~psu,
                    strata = ~strata,
                    weights=~mortwtsa, 
                    data=alc,
                    nest=TRUE)
    
    aa<-svycoxph(Surv(death_time, d5.event)~edu+aev,
                      design = des2)
    
    
    
    
    summary(aa)
    Stratified 1 - level Cluster Sampling design (with replacement)
    With (1278) clusters.
    svydesign(ids = ~psu, strata = ~strata, weights = ~mortwtsa, 
        data = alc, nest = TRUE)
    Call:
    svycoxph(formula = Surv(death_time, d5.event) ~ edu + aev, design = des2)
    
      n= 46046, number of events= 18641 
    
                                coef exp(coef) se(coef) robust se      z Pr(>|z|)
    edufive+ yrs of college  0.01543   1.01555  0.02918   0.04454  0.347 0.728910
    edulths                  0.10435   1.10998  0.01932   0.03114  3.351 0.000804
    edusomecollege          -0.01249   0.98759  0.02153   0.03374 -0.370 0.711287
    aev                      0.11274   1.11934  0.01646   0.01872  6.023 1.72e-09
    
    edufive+ yrs of college    
    edulths                 ***
    edusomecollege             
    aev                     ***
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
                            exp(coef) exp(-coef) lower .95 upper .95
    edufive+ yrs of college    1.0156     0.9847    0.9307     1.108
    edulths                    1.1100     0.9009    1.0443     1.180
    edusomecollege             0.9876     1.0126    0.9244     1.055
    aev                        1.1193     0.8934    1.0790     1.161
    
    Concordance= 0.518  (se = 0.003 )
    Likelihood ratio test= NA  on 4 df,   p=NA
    Wald test            = 70.06  on 4 df,   p=2e-14
    Score (logrank) test = NA  on 4 df,   p=NA
    
      (Note: the likelihood ratio and score tests assume independence of
         observations within a cluster, the Wald and robust score tests do not).
    #Fit the model
    fitl1<-svycoxph(Surv(time = death_time, event = aev)~edu+aev, design=des2)
    Warning in coxph(formula = Surv(time = death_time, event = aev) ~ edu + : a
    variable appears on both the left and right sides of the formula
    Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
    Loglik converged before variable 4 ; coefficient may be infinite.
    summary(fitl1) 
    Stratified 1 - level Cluster Sampling design (with replacement)
    With (1278) clusters.
    svydesign(ids = ~psu, strata = ~strata, weights = ~mortwtsa, 
        data = alc, nest = TRUE)
    Call:
    svycoxph(formula = Surv(time = death_time, event = aev) ~ edu + 
        aev, design = des2)
    
      n= 46046, number of events= 5727 
    
                                  coef  exp(coef)   se(coef)  robust se      z
    edufive+ yrs of college -1.957e-02  9.806e-01  5.572e-02  7.847e-02 -0.249
    edulths                  1.013e-01  1.107e+00  3.669e-02  5.417e-02  1.871
    edusomecollege           5.411e-02  1.056e+00  3.979e-02  5.599e-02  0.966
    aev                      2.140e+01  1.970e+09  1.512e+02  7.625e+05  0.000
                            Pr(>|z|)  
    edufive+ yrs of college   0.8031  
    edulths                   0.0614 .
    edusomecollege            0.3339  
    aev                       1.0000  
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
                            exp(coef) exp(-coef) lower .95 upper .95
    edufive+ yrs of college 9.806e-01  1.020e+00    0.8408     1.144
    edulths                 1.107e+00  9.036e-01    0.9952     1.231
    edusomecollege          1.056e+00  9.473e-01    0.9459     1.178
    aev                     1.970e+09  5.076e-10    0.0000       Inf
    
    Concordance= 0.949  (se = 0.001 )
    Likelihood ratio test= NA  on 4 df,   p=NA
    Wald test            = 6  on 4 df,   p=0.2
    Score (logrank) test = NA  on 4 df,   p=NA
    
      (Note: the likelihood ratio and score tests assume independence of
         observations within a cluster, the Wald and robust score tests do not).

    Individuals with a less than high school education are more likely to experience the hazard compared to individuals with five years or more of college.

    #Shoenfeld residuals
    
    schoenresid<-resid(fitl1, type="schoenfeld")
    
    fit.sr<-lm(schoenresid~des2$variables$death_time[des2$variables$aev==1])
    
    fit.sr%>%
      broom::tidy()%>%
      filter(term != "(Intercept)")%>%
      select(response, estimate, statistic, p.value)
    # A tibble: 4 × 4
      response                  estimate statistic p.value
      <chr>                        <dbl>     <dbl>   <dbl>
    1 edufive+ yrs of college -0.000279    -0.381    0.703
    2 edulths                  0.000171     0.111    0.912
    3 edusomecollege           0.0000523    0.0391   0.969
    4 aev                      0.00124      1.54     0.124

    Variables not correlated. No significant model here.

    #Grambsch and Therneau
    fit.test<-cox.zph(fitl1)
    fit.test
              chisq df p
    edu    1.20e-03  3 1
    aev    2.14e-06  1 1
    GLOBAL 1.21e-03  4 1
    #plot for G&T 
    plot(fit.test, df=2)

    G&T Fit test and Plot indicate no significant results.

    Produce plots of the survival function and the cumulative hazard function for different “risk profiles”, as done in the notes

    ##survivalship
    plot(survfit(aa),xlim=c(6,9),
         main="survivorship cox regression ")

    Evaluate the assumptions of the Cox model, including proportionality of hazards of covariates, and if you use a continuous variable, evaluate linearity of the effect using Martingale residuals

    #fiting Cox model
    
    cox.s<-svycoxph(Surv(death_time,d5.event)~aev+edu,
                    design=des2)
    summary(cox.s)
    Stratified 1 - level Cluster Sampling design (with replacement)
    With (1278) clusters.
    svydesign(ids = ~psu, strata = ~strata, weights = ~mortwtsa, 
        data = alc, nest = TRUE)
    Call:
    svycoxph(formula = Surv(death_time, d5.event) ~ aev + edu, design = des2)
    
      n= 46046, number of events= 18641 
    
                                coef exp(coef) se(coef) robust se      z Pr(>|z|)
    aev                      0.11274   1.11934  0.01646   0.01872  6.023 1.72e-09
    edufive+ yrs of college  0.01543   1.01555  0.02918   0.04454  0.347 0.728910
    edulths                  0.10435   1.10998  0.01932   0.03114  3.351 0.000804
    edusomecollege          -0.01249   0.98759  0.02153   0.03374 -0.370 0.711287
    
    aev                     ***
    edufive+ yrs of college    
    edulths                 ***
    edusomecollege             
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
                            exp(coef) exp(-coef) lower .95 upper .95
    aev                        1.1193     0.8934    1.0790     1.161
    edufive+ yrs of college    1.0156     0.9847    0.9307     1.108
    edulths                    1.1100     0.9009    1.0443     1.180
    edusomecollege             0.9876     1.0126    0.9244     1.055
    
    Concordance= 0.518  (se = 0.003 )
    Likelihood ratio test= NA  on 4 df,   p=NA
    Wald test            = 70.06  on 4 df,   p=2e-14
    Score (logrank) test = NA  on 4 df,   p=NA
    
      (Note: the likelihood ratio and score tests assume independence of
         observations within a cluster, the Wald and robust score tests do not).
    #Schoenfeld test
    fit.test<-cox.zph(cox.s)
    fit.test
              chisq df    p
    aev    8.14e-05  1 0.99
    edu    7.34e-04  3 1.00
    GLOBAL 8.08e-04  4 1.00
    plot(fit.test, df=2)

    Aalen’s additive regression model

    #Aalen's additive regression model
    
    fita<-aareg(Surv(death_time,d5.event)~edu+aev+(strata),
                alc, weights = mortwtsa)
    
    summary(fita)
                                slope      coef se(coef)       z         p
    Intercept               -1.35e-01 -3.54e-05 1.47e-06 -24.000 1.23e-127
    edufive+ yrs of college  1.15e-03  4.71e-07 6.87e-07   0.686  4.93e-01
    edulths                  1.27e-02  1.87e-06 4.47e-07   4.170  3.04e-05
    edusomecollege          -9.36e-04  8.47e-08 4.88e-07   0.173  8.62e-01
    aev                      7.70e-03  1.21e-06 4.29e-07   2.820  4.85e-03
    strata                   3.89e-05  8.59e-09 2.58e-10  33.300 1.12e-242
    
    Chisq=1165.72 on 5 df, p=<2e-16; test weights=aalen
    library(ggfortify)
    autoplot(fita)
    Warning: `gather_()` was deprecated in tidyr 1.2.0.
    Please use `gather()` instead.

    Plots show less than high school education to increase in hazard (ever been told they had asthma). some to higher education seems to decrease the hazard.