# Setup SAS run in R Markdown
require(SASmarkdown)
sasexe <- "C:/Program Files/SASHome/SASFoundation/9.4/sas.exe"
sasopts <- "-nosplash -ls 75"
options(digits = 4)

library(tidyverse) #%>%
library(foreign) #read.spss
library(survival) #survfit

Outline

Introduction to the data

The dataset was provided in Chaper 4, Machine Learning in Medicine-Cookbook 1 (Cleophas and Zwinderman (2014)). We can get the original data in SPSS SAV format Coxoutcomeprediction.sav) from the website; published year: 2014; ISBN number: 978-3-319-04180-3

Execute in SAS

Load data in SAS

proc import datafile = "&pathfile./Coxoutcomeprediction.sav" 
            out= Coxoutcomeprediction;
run;

proc format; 
    value tx  0 = 'No'
                    1 = 'Yes';
    value   ev  0 = 'Occured'
                    1 = 'Censored';
run;
                
data Coxoutcomeprediction;
    set Coxoutcomeprediction;
    format treatment tx. 
                 event ev. 
                 followupmonth 2.;
run;
proc contents data=Coxoutcomeprediction;
run;
##                           The CONTENTS Procedure
## 
## Data Set Name        WORK.COXOUTCOMEPREDICTION     Observations          60
## Member Type          DATA                          Variables             4 
## Engine               V9                            Indexes               0 
## Created              11/17/2020 00:39:11           Observation Length    32
## Last Modified        11/17/2020 00:39:11           Deleted Observations  0 
## Protection                                         Compressed            NO
## Data Set Type                                      Sorted                NO
## Label                                                                      
## Data Representation  WINDOWS_64                                            
## Encoding             wlatin1  Western (Windows)                            
## 
## 
##                      Engine/Host Dependent Information
## 
## Data Set Page Size          65536                                          
## Number of Data Set Pages    1                                              
## First Data Page             1                                              
## Max Obs per Page            2039                                           
## Obs in First Data Page      60                                             
## Number of Data Set Repairs  0                                              
## ExtendObsCounter            YES                                            
## Filename                    C:\Users\minhh\AppData\Local\Temp\SAS          
##                             Temporary                                      
##                             Files\_TD72980_HN-LAPTOP_\coxoutcomeprediction.
##                             sas7bdat                                       
## Release Created             9.0401M3                                       
## Host Created                X64_8PRO                                       
## 
## 
##                 Alphabetic List of Variables and Attributes
##  
##        #    Variable         Type    Len    Format    Label
## 
##        4    age              Num       8    F8.2      age          
##        2    event            Num       8    EV.       event        
##        1    followupmonth    Num       8    2.        followupmonth
##        3    treatment        Num       8    TX.       treatment
proc print data=Coxoutcomeprediction (obs=10); 
run;
##          Obs    followupmonth     event      treatment         age
## 
##            1          1          Censored       No           65.00
##            2          1          Censored       No           66.00
##            3          2          Censored       No           73.00
##            4          2          Censored       No           91.00
##            5          2          Censored       No           86.00
##            6          2          Censored       No           87.00
##            7          2          Censored       No           54.00
##            8          2          Censored       No           66.00
##            9          2          Censored       No           64.00
##           10          3          Occured        No           62.00

The Kaplan-Meier Method

ODS GRAPHICS ON;
proc lifetest data=Coxoutcomeprediction PLOTS=(S(TEST) LLS);
    time followupmonth*event(0);
    strata treatment;
run;
ODS GRAPHICS OFF;
##                           The LIFETEST Procedure
##  
##                          Stratum 1: treatment = No
## 
##                      Product-Limit Survival Estimates
##  
##                                             Survival
##                                             Standard    Number    Number
##   followupmonth     Survival    Failure      Error      Failed     Left
## 
##          0.0000       1.0000           0           0       0        30  
##          1.0000            .           .           .       1        29  
##          1.0000       0.9333      0.0667      0.0455       2        28  
##          2.0000            .           .           .       3        27  
##          2.0000            .           .           .       4        26  
##          2.0000            .           .           .       5        25  
##          2.0000            .           .           .       6        24  
##          2.0000            .           .           .       7        23  
##          2.0000            .           .           .       8        22  
##          2.0000       0.7000      0.3000      0.0837       9        21  
##          3.0000*           .           .           .       9        20  
##          4.0000       0.6650      0.3350      0.0865      10        19  
##          5.0000       0.6300      0.3700      0.0887      11        18  
##          6.0000            .           .           .      12        17  
##          6.0000       0.5600      0.4400      0.0917      13        16  
##          7.0000       0.5250      0.4750      0.0924      14        15  
##          9.0000            .           .           .      15        14  
##          9.0000       0.4550      0.5450      0.0924      16        13  
##         11.0000       0.4200      0.5800      0.0917      17        12  
##         12.0000       0.3850      0.6150      0.0905      18        11  
##         14.0000       0.3500      0.6500      0.0887      19        10  
##         16.0000       0.3150      0.6850      0.0865      20         9  
##         17.0000       0.2800      0.7200      0.0837      21         8  
##         18.0000       0.2450      0.7550      0.0802      22         7  
##         30.0000*           .           .           .      22         6  
##         30.0000*           .           .           .      22         5  
##         30.0000*           .           .           .      22         4  
##         30.0000*           .           .           .      22         3  
##         30.0000*           .           .           .      22         2  
##         30.0000*           .           .           .      22         1  
##         30.0000*      0.2450      0.7550           .      22         0  
## 
##         NOTE: The marked survival times are censored observations.
## 
## 
##             Summary Statistics for Time Variable followupmonth
## 
##                             Quartile Estimates
##  
##                        Point         95% Confidence Interval
##          Percent    Estimate    Transform      [Lower      Upper)
## 
##               75     18.0000    LOGLOG        11.0000       .    
##               50      9.0000    LOGLOG         4.0000     16.0000
##               25      2.0000    LOGLOG         2.0000      6.0000
## 
## 
##                                        Standard
##                                Mean       Error
## 
##                              9.6333      1.2732
## 
## NOTE: The mean survival time and its standard error were underestimated 
##       because the largest observation was censored and the estimation was 
##       restricted to the largest event time.
##  
##                                                                            
##  
##                           The LIFETEST Procedure
##  
##                         Stratum 2: treatment = Yes
## 
##                      Product-Limit Survival Estimates
##  
##                                             Survival
##                                             Standard    Number    Number
##   followupmonth     Survival    Failure      Error      Failed     Left
## 
##          0.0000       1.0000           0           0       0        30  
##         16.0000            .           .           .       1        29  
##         16.0000       0.9333      0.0667      0.0455       2        28  
##         17.0000       0.9000      0.1000      0.0548       3        27  
##         18.0000       0.8667      0.1333      0.0621       4        26  
##         19.0000            .           .           .       5        25  
##         19.0000       0.8000      0.2000      0.0730       6        24  
##         20.0000       0.7667      0.2333      0.0772       7        23  
##         21.0000       0.7333      0.2667      0.0807       8        22  
##         22.0000       0.7000      0.3000      0.0837       9        21  
##         22.0000*           .           .           .       9        20  
##         23.0000       0.6650      0.3350      0.0865      10        19  
##         24.0000       0.6300      0.3700      0.0887      11        18  
##         26.0000       0.5950      0.4050      0.0905      12        17  
##         27.0000       0.5600      0.4400      0.0917      13        16  
##         28.0000            .           .           .      14        15  
##         28.0000       0.4900      0.5100      0.0926      15        14  
##         28.0000*           .           .           .      15        13  
##         29.0000            .           .           .      16        12  
##         29.0000            .           .           .      17        11  
##         29.0000       0.3769      0.6231      0.0914      18        10  
##         30.0000*           .           .           .      18         9  
##         30.0000*           .           .           .      18         8  
##         30.0000*           .           .           .      18         7  
##         30.0000*           .           .           .      18         6  
##         30.0000*           .           .           .      18         5  
##         30.0000*           .           .           .      18         4  
##         30.0000*           .           .           .      18         3  
##         30.0000*           .           .           .      18         2  
##         30.0000*           .           .           .      18         1  
##         30.0000*      0.3769      0.6231           .      18         0  
## 
##         NOTE: The marked survival times are censored observations.
## 
## 
##             Summary Statistics for Time Variable followupmonth
## 
##                             Quartile Estimates
##  
##                        Point         95% Confidence Interval
##          Percent    Estimate    Transform      [Lower      Upper)
## 
##               75       .        LOGLOG        29.0000       .    
##               50     28.0000    LOGLOG        23.0000       .    
##               25     21.0000    LOGLOG        17.0000     26.0000
## 
## 
##                                        Standard
##                                Mean       Error
## 
##                             25.2700      0.8829
## 
## NOTE: The mean survival time and its standard error were underestimated 
##       because the largest observation was censored and the estimation was 
##       restricted to the largest event time.
## 
## 
##           Summary of the Number of Censored and Uncensored Values
##  
##                                                               Percent
##      Stratum    treatment       Total  Failed    Censored    Censored
## 
##            1    No                 30      22           8       26.67
##            2    Yes                30      18          12       40.00
##      ----------------------------------------------------------------
##        Total                       60      40          20       33.33
##  
##                                                                            
##  
##                           The LIFETEST Procedure
## 
##    Testing Homogeneity of Survival Curves for followupmonth over Strata
## 
## 
##                               Rank Statistics
##  
##                      treatment    Log-Rank    Wilcoxon
## 
##                      No             8.6349      520.00
##                      Yes           -8.6349     -520.00
## 
## 
##                Covariance Matrix for the Log-Rank Statistics
##  
##                    treatment            No           Yes
## 
##                    No              8.17046      -8.17046
##                    Yes            -8.17046       8.17046
## 
## 
##                Covariance Matrix for the Wilcoxon Statistics
##  
##                    treatment            No           Yes
## 
##                    No              15272.0      -15272.0
##                    Yes            -15272.0       15272.0
## 
## 
##                        Test of Equality over Strata
##  
##                                                    Pr >
##                 Test      Chi-Square      DF    Chi-Square
## 
##                 Log-Rank      9.1257       1      0.0025  
##                 Wilcoxon     17.7057       1      <.0001  
##                 -2Log(LR)     8.7203       1      0.0031

The proportional hazards model

proc phreg data=Coxoutcomeprediction;
  class treatment(ref=first);
  model followupmonth*event(0) = treatment age/TIES=EFRON;
run;
##                             The PHREG Procedure
## 
##                             Model Information
## 
##    Data Set                 WORK.COXOUTCOMEPREDICTION                  
##    Dependent Variable       followupmonth                 followupmonth
##    Censoring Variable       event                         event        
##    Censoring Value(s)       0                                          
##    Ties Handling            EFRON                                      
## 
## 
##                   Number of Observations Read          60
##                   Number of Observations Used          60
## 
## 
##                           Class Level Information
##  
##                                                Design
##                      Class         Value     Variables
## 
##                      treatment     No                0
##                                    Yes               1
## 
## 
##             Summary of the Number of Event and Censored Values
##  
##                                                     Percent
##                   Total       Event    Censored    Censored
## 
##                      60          40          20       33.33
## 
## 
##                             Convergence Status
## 
##               Convergence criterion (GCONV=1E-8) satisfied.          
## 
## 
##                            Model Fit Statistics
##  
##                                    Without           With
##                   Criterion     Covariates     Covariates
## 
##                   -2 LOG L         289.629        275.233
##                   AIC              289.629        279.233
##                   SBC              289.629        282.611
## 
## 
##                   Testing Global Null Hypothesis: BETA=0
##  
##           Test                 Chi-Square       DF     Pr > ChiSq
## 
##           Likelihood Ratio        14.3956        2         0.0007
##           Score                   14.7743        2         0.0006
##           Wald                    13.9329        2         0.0009
## 
## 
##                                Type 3 Tests
##  
##                                           Wald
##                Effect         DF    Chi-Square    Pr > ChiSq
## 
##                treatment       1        5.9694        0.0146
##                age             1        5.8150        0.0159
## 
## 
##                  Analysis of Maximum Likelihood Estimates
##  
##                         Parameter      Standard
## Parameter        DF      Estimate         Error    Chi-Square    Pr > ChiSq
## 
## treatment Yes     1      -0.80685       0.33024        5.9694        0.0146
## age               1       0.02888       0.01198        5.8150        0.0159
## 
##                  Analysis of Maximum Likelihood Estimates
##  
##                                    Hazard
##                 Parameter           Ratio    Label
## 
##                 treatment Yes       0.446    treatment Yes
##                 age                 1.029    age

Testing the Proportional Hazards Assumption

Graphs

proc phreg data=Coxoutcomeprediction noprint;
model followupmonth*event(0) = age; 
strata treatment;
baseline out=temp survival=survival;
run;

data temp ; set temp;
loglogs=log(-log(survival));
logtime=log(followupmonth);
run;

symbol l=1 i=join v=dot;
proc gplot data=temp;
plot loglogs*followupmonth=treatment /vaxis=axis1 haxis=axis2 legend=legend1;
axis1 order=(-5 to 0 by 1)
label=(f=swiss h=2 a=90 'Log-minus-log survival')
value=(f=swiss h=2);
axis2 order=(0 to 60 by 6)
label=(f=swiss h=2 'Months')
value=(f=swiss h=2);
legend1 label=(h=2 'Treatment =')
value=(f=swiss h=2);
run;

Interactions with Time as Time-Dependent Covariates

proc phreg data=Coxoutcomeprediction;
model followupmonth*event(0) = treatment age treatmentfollowupmonth;
treatmentfollowupmonth=treatment*followupmonth;
run;
##                             The PHREG Procedure
## 
##                             Model Information
## 
##    Data Set                 WORK.COXOUTCOMEPREDICTION                  
##    Dependent Variable       followupmonth                 followupmonth
##    Censoring Variable       event                         event        
##    Censoring Value(s)       0                                          
##    Ties Handling            BRESLOW                                    
## 
## 
##                   Number of Observations Read          60
##                   Number of Observations Used          60
## 
## 
##             Summary of the Number of Event and Censored Values
##  
##                                                     Percent
##                   Total       Event    Censored    Censored
## 
##                      60          40          20       33.33
## 
## 
##                             Convergence Status
## 
##               Convergence criterion (GCONV=1E-8) satisfied.          
## 
## 
##                            Model Fit Statistics
##  
##                                    Without           With
##                   Criterion     Covariates     Covariates
## 
##                   -2 LOG L         291.232        244.498
##                   AIC              291.232        250.498
##                   SBC              291.232        255.565
## 
## 
##                   Testing Global Null Hypothesis: BETA=0
##  
##           Test                 Chi-Square       DF     Pr > ChiSq
## 
##           Likelihood Ratio        46.7343        3         <.0001
##           Score                   32.1967        3         <.0001
##           Wald                     8.6364        3         0.0345
## 
## 
##                  Analysis of Maximum Likelihood Estimates
##  
##                             Parameter    Standard
##  Parameter             DF    Estimate       Error  Chi-Square  Pr > ChiSq
## 
##  treatment              1   -13.86685     7.18281      3.7271      0.0535
##  age                    1     0.02333     0.01150      4.1139      0.0425
##  treatmentfollowupmon   1     0.78493     0.42440      3.4207      0.0644
## 
##                  Analysis of Maximum Likelihood Estimates
##  
##                                         Hazard
##                 Parameter                Ratio    Label
## 
##                 treatment                0.000    treatment
##                 age                      1.024    age      
##                 treatmentfollowupmon     2.192

Execute in R

# load data and reshape
df<-read.spss("Data/Coxoutcomeprediction.sav",to.data.frame = T,use.value.labels = TRUE) %>%
  as_tibble() %>%
  mutate(., Event = factor(.$event, levels = c(1,0), labels = c("Occured","Censored")), event = (.$event == 1))
## re-encoding from CP1252
df$treatment <- relevel(factor(df$treatment, levels = c(0,1), labels = c("No","Yes")), ref = "No")


df
followupmonth event treatment age Event
1 TRUE No 65 Occured
1 TRUE No 66 Occured
2 TRUE No 73 Occured
2 TRUE No 91 Occured
2 TRUE No 86 Occured
2 TRUE No 87 Occured
2 TRUE No 54 Occured
2 TRUE No 66 Occured
2 TRUE No 64 Occured
3 FALSE No 62 Censored
4 TRUE No 57 Occured
5 TRUE No 85 Occured
6 TRUE No 85 Occured
6 TRUE No 76 Occured
7 TRUE No 76 Occured
9 TRUE No 76 Occured
9 TRUE No 65 Occured
11 TRUE No 84 Occured
12 TRUE No 34 Occured
14 TRUE No 45 Occured
16 TRUE No 56 Occured
17 TRUE No 67 Occured
18 TRUE No 86 Occured
30 FALSE No 75 Censored
30 FALSE No 65 Censored
30 FALSE No 54 Censored
30 FALSE No 46 Censored
30 FALSE No 54 Censored
30 FALSE No 75 Censored
30 FALSE No 56 Censored
30 FALSE Yes 56 Censored
30 FALSE Yes 53 Censored
30 FALSE Yes 34 Censored
30 FALSE Yes 35 Censored
30 FALSE Yes 37 Censored
30 FALSE Yes 65 Censored
30 FALSE Yes 45 Censored
30 FALSE Yes 66 Censored
30 FALSE Yes 55 Censored
30 FALSE Yes 88 Censored
29 TRUE Yes 67 Occured
29 TRUE Yes 56 Occured
29 TRUE Yes 54 Occured
28 FALSE Yes 57 Censored
28 TRUE Yes 57 Occured
28 TRUE Yes 76 Occured
27 TRUE Yes 67 Occured
26 TRUE Yes 66 Occured
24 TRUE Yes 56 Occured
23 TRUE Yes 66 Occured
22 TRUE Yes 84 Occured
22 FALSE Yes 56 Censored
21 TRUE Yes 46 Occured
20 TRUE Yes 45 Occured
19 TRUE Yes 76 Occured
19 TRUE Yes 65 Occured
18 TRUE Yes 45 Occured
17 TRUE Yes 76 Occured
16 TRUE Yes 56 Occured
16 TRUE Yes 45 Occured
Y <- Surv(df$followupmonth, df$event==1)
kmfit1 <- survfit(Y ~ 1)
summary(kmfit1)
## Call: survfit(formula = Y ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     60       2    0.967  0.0232        0.922        1.000
##     2     58       7    0.850  0.0461        0.764        0.945
##     4     50       1    0.833  0.0482        0.744        0.933
##     5     49       1    0.816  0.0501        0.723        0.920
##     6     48       2    0.782  0.0535        0.684        0.894
##     7     46       1    0.765  0.0550        0.665        0.881
##     9     45       2    0.731  0.0575        0.626        0.853
##    11     43       1    0.714  0.0587        0.608        0.839
##    12     42       1    0.697  0.0597        0.589        0.824
##    14     41       1    0.680  0.0606        0.571        0.810
##    16     40       3    0.629  0.0628        0.517        0.765
##    17     37       2    0.595  0.0638        0.482        0.734
##    18     35       2    0.561  0.0646        0.448        0.703
##    19     33       2    0.527  0.0650        0.414        0.671
##    20     31       1    0.510  0.0651        0.397        0.655
##    21     30       1    0.493  0.0651        0.381        0.639
##    22     29       1    0.476  0.0650        0.364        0.622
##    23     27       1    0.458  0.0650        0.347        0.605
##    24     26       1    0.441  0.0648        0.330        0.588
##    26     25       1    0.423  0.0646        0.314        0.571
##    27     24       1    0.405  0.0642        0.297        0.553
##    28     23       2    0.370  0.0633        0.265        0.518
##    29     20       3    0.315  0.0614        0.215        0.461
plot(kmfit1, main = "Plot of Survival Curve", xlab = "Length of Survival", ylab = "Proportion of Individuals who have Survived")

kmfit2 <-survfit(Y ~ df$treatment)
summary(kmfit2)
## Call: survfit(formula = Y ~ df$treatment)
## 
##                 df$treatment=No 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     30       2    0.933  0.0455        0.848        1.000
##     2     28       7    0.700  0.0837        0.554        0.885
##     4     20       1    0.665  0.0865        0.515        0.858
##     5     19       1    0.630  0.0887        0.478        0.830
##     6     18       2    0.560  0.0917        0.406        0.772
##     7     16       1    0.525  0.0924        0.372        0.741
##     9     15       2    0.455  0.0924        0.306        0.677
##    11     13       1    0.420  0.0917        0.274        0.644
##    12     12       1    0.385  0.0905        0.243        0.610
##    14     11       1    0.350  0.0887        0.213        0.575
##    16     10       1    0.315  0.0865        0.184        0.540
##    17      9       1    0.280  0.0837        0.156        0.503
##    18      8       1    0.245  0.0802        0.129        0.465
## 
##                 df$treatment=Yes 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    16     30       2    0.933  0.0455        0.848        1.000
##    17     28       1    0.900  0.0548        0.799        1.000
##    18     27       1    0.867  0.0621        0.753        0.997
##    19     26       2    0.800  0.0730        0.669        0.957
##    20     24       1    0.767  0.0772        0.629        0.934
##    21     23       1    0.733  0.0807        0.591        0.910
##    22     22       1    0.700  0.0837        0.554        0.885
##    23     20       1    0.665  0.0865        0.515        0.858
##    24     19       1    0.630  0.0887        0.478        0.830
##    26     18       1    0.595  0.0905        0.442        0.802
##    27     17       1    0.560  0.0917        0.406        0.772
##    28     16       2    0.490  0.0926        0.338        0.710
##    29     13       3    0.377  0.0914        0.234        0.606
plot(kmfit2, lty = c("solid", "dashed"), col=c("black","grey"),
     xlab="survival time in months",ylab="survival probabilities")
legend("topright", c("Non-Treatment","Treatment"), lty=c("solid","dashed"), col=c("black","grey"))

survdiff(Y ~ df$treatment)
## Call:
## survdiff(formula = Y ~ df$treatment)
## 
##                   N Observed Expected (O-E)^2/E (O-E)^2/V
## df$treatment=No  30       22     13.4      5.58      9.13
## df$treatment=Yes 30       18     26.6      2.80      9.13
## 
##  Chisq= 9.1  on 1 degrees of freedom, p= 0.003
plot(kmfit2,fun="cloglog",xlab="time in months using logarithmic
     scale",ylab="log-log survival", main="log-log curves by clinic")

coxph(Y ~ treatment + age, data = df, method="efron")
## Call:
## coxph(formula = Y ~ treatment + age, data = df, method = "efron")
## 
##               coef exp(coef) se(coef)  z    p
## treatmentYes -0.81      0.45     0.33 -2 0.01
## age           0.03      1.03     0.01  2 0.02
## 
## Likelihood ratio test=14  on 2 df, p=7e-04
## n= 60, number of events= 40
#coxph(Y ~ strata(treatment) + age, data = df, method="efron")
#summary(coxph(Y ~ strata(treatment) + age, data = df, method="efron"))

References

Cleophas, Ton J., and Aeilko H. Zwinderman. 2014. Machine Learning in Medicine - Cookbook. Book. 2014th ed. SpringerBriefs in Statistics. Cham: Springer International Publishing AG. doi:10.1007/978-3-319-04181-0.