data(package = "survival")
data(cancer)
head(lung) # View first few rows
## inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1 3 306 2 74 1 1 90 100 1175 NA
## 2 3 455 2 68 1 0 90 90 1225 15
## 3 3 1010 1 56 1 0 90 90 NA 15
## 4 5 210 2 57 1 1 90 60 1150 11
## 5 1 883 2 60 1 0 100 90 NA 0
## 6 12 1022 1 74 1 1 50 80 513 0
str(lung) # Check structure of data
## 'data.frame': 228 obs. of 10 variables:
## $ inst : num 3 3 3 5 1 12 7 11 1 7 ...
## $ time : num 306 455 1010 210 883 ...
## $ status : num 2 2 1 2 2 1 2 2 2 2 ...
## $ age : num 74 68 56 57 60 74 68 71 53 61 ...
## $ sex : num 1 1 1 1 1 1 2 2 1 1 ...
## $ ph.ecog : num 1 0 0 1 0 1 2 2 1 2 ...
## $ ph.karno : num 90 90 90 90 100 50 70 60 70 70 ...
## $ pat.karno: num 100 90 90 60 90 80 60 80 80 70 ...
## $ meal.cal : num 1175 1225 NA 1150 NA ...
## $ wt.loss : num NA 15 15 11 0 0 10 1 16 34 ...
lung$status <- ifelse(lung$status == 2, 1, 0) # 1 = death, 0 = censored
sum(is.na(lung)) # Count missing values
## [1] 67
lung <- na.omit(lung) # Remove rows with missing data
summary(lung)
## inst time status age
## Min. : 1.00 Min. : 5.0 Min. :0.0000 Min. :39.00
## 1st Qu.: 3.00 1st Qu.: 174.5 1st Qu.:0.0000 1st Qu.:57.00
## Median :11.00 Median : 268.0 Median :1.0000 Median :64.00
## Mean :10.71 Mean : 309.9 Mean :0.7186 Mean :62.57
## 3rd Qu.:15.00 3rd Qu.: 419.5 3rd Qu.:1.0000 3rd Qu.:70.00
## Max. :32.00 Max. :1022.0 Max. :1.0000 Max. :82.00
## sex ph.ecog ph.karno pat.karno
## Min. :1.000 Min. :0.0000 Min. : 50.00 Min. : 30.00
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 70.00 1st Qu.: 70.00
## Median :1.000 Median :1.0000 Median : 80.00 Median : 80.00
## Mean :1.383 Mean :0.9581 Mean : 82.04 Mean : 79.58
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.: 90.00 3rd Qu.: 90.00
## Max. :2.000 Max. :3.0000 Max. :100.00 Max. :100.00
## meal.cal wt.loss
## Min. : 96.0 Min. :-24.000
## 1st Qu.: 619.0 1st Qu.: 0.000
## Median : 975.0 Median : 7.000
## Mean : 929.1 Mean : 9.719
## 3rd Qu.:1162.5 3rd Qu.: 15.000
## Max. :2600.0 Max. : 68.000
##Kaplen-Meir Estimator
# Fit Kaplan-Meier model
km_fit <- survfit(Surv(time, status) ~sex, data = lung)
# View summary
summary(km_fit)
## Call: survfit(formula = Surv(time, status) ~ sex, data = lung)
##
## sex=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 11 103 1 0.9903 0.00966 0.9715 1.000
## 12 102 1 0.9806 0.01360 0.9543 1.000
## 13 101 1 0.9709 0.01657 0.9389 1.000
## 15 100 1 0.9612 0.01904 0.9246 0.999
## 26 99 1 0.9515 0.02118 0.9108 0.994
## 30 98 1 0.9417 0.02308 0.8976 0.988
## 31 97 1 0.9320 0.02480 0.8847 0.982
## 53 96 2 0.9126 0.02782 0.8597 0.969
## 54 94 1 0.9029 0.02917 0.8475 0.962
## 59 93 1 0.8932 0.03043 0.8355 0.955
## 60 92 1 0.8835 0.03161 0.8237 0.948
## 65 91 1 0.8738 0.03272 0.8119 0.940
## 88 90 1 0.8641 0.03377 0.8004 0.933
## 92 89 1 0.8544 0.03476 0.7889 0.925
## 93 88 1 0.8447 0.03569 0.7775 0.918
## 95 87 1 0.8350 0.03658 0.7663 0.910
## 110 86 1 0.8252 0.03742 0.7551 0.902
## 118 85 1 0.8155 0.03822 0.7440 0.894
## 135 84 1 0.8058 0.03898 0.7329 0.886
## 142 83 1 0.7961 0.03970 0.7220 0.878
## 147 82 1 0.7864 0.04038 0.7111 0.870
## 156 81 2 0.7670 0.04165 0.6895 0.853
## 163 79 3 0.7379 0.04333 0.6576 0.828
## 166 76 1 0.7282 0.04384 0.6471 0.819
## 170 75 1 0.7184 0.04432 0.6366 0.811
## 176 73 1 0.7086 0.04479 0.6260 0.802
## 179 72 1 0.6988 0.04523 0.6155 0.793
## 180 71 1 0.6889 0.04566 0.6050 0.784
## 181 70 2 0.6692 0.04642 0.5842 0.767
## 183 68 1 0.6594 0.04677 0.5738 0.758
## 197 64 1 0.6491 0.04716 0.5629 0.748
## 207 62 1 0.6386 0.04755 0.5519 0.739
## 210 61 1 0.6282 0.04791 0.5409 0.729
## 212 60 1 0.6177 0.04824 0.5300 0.720
## 218 59 1 0.6072 0.04855 0.5191 0.710
## 222 57 1 0.5966 0.04885 0.5081 0.700
## 223 55 1 0.5857 0.04915 0.4969 0.690
## 229 52 1 0.5745 0.04948 0.4852 0.680
## 230 51 1 0.5632 0.04977 0.4736 0.670
## 246 50 1 0.5519 0.05004 0.4621 0.659
## 267 48 1 0.5404 0.05030 0.4503 0.649
## 269 47 1 0.5289 0.05053 0.4386 0.638
## 270 46 1 0.5174 0.05072 0.4270 0.627
## 283 45 1 0.5059 0.05088 0.4154 0.616
## 284 44 1 0.4944 0.05101 0.4039 0.605
## 285 42 1 0.4827 0.05113 0.3922 0.594
## 286 41 1 0.4709 0.05122 0.3805 0.583
## 288 40 1 0.4591 0.05128 0.3689 0.571
## 291 39 1 0.4473 0.05129 0.3573 0.560
## 301 36 1 0.4349 0.05135 0.3451 0.548
## 303 34 1 0.4221 0.05141 0.3325 0.536
## 320 32 1 0.4089 0.05147 0.3195 0.523
## 337 31 1 0.3957 0.05147 0.3067 0.511
## 353 30 2 0.3694 0.05131 0.2813 0.485
## 363 28 1 0.3562 0.05114 0.2688 0.472
## 371 27 1 0.3430 0.05092 0.2564 0.459
## 390 26 1 0.3298 0.05064 0.2441 0.446
## 428 23 1 0.3154 0.05043 0.2306 0.432
## 429 22 1 0.3011 0.05014 0.2173 0.417
## 455 21 1 0.2868 0.04976 0.2041 0.403
## 457 20 1 0.2724 0.04929 0.1911 0.388
## 460 18 1 0.2573 0.04882 0.1774 0.373
## 477 17 1 0.2422 0.04824 0.1639 0.358
## 519 16 1 0.2270 0.04754 0.1506 0.342
## 524 15 1 0.2119 0.04672 0.1375 0.326
## 558 14 1 0.1968 0.04577 0.1247 0.310
## 567 13 1 0.1816 0.04468 0.1121 0.294
## 574 12 1 0.1665 0.04344 0.0998 0.278
## 583 11 1 0.1514 0.04205 0.0878 0.261
## 613 10 1 0.1362 0.04048 0.0761 0.244
## 643 9 1 0.1211 0.03870 0.0647 0.227
## 655 8 1 0.1059 0.03671 0.0537 0.209
## 689 7 1 0.0908 0.03444 0.0432 0.191
## 707 6 1 0.0757 0.03185 0.0332 0.173
## 791 5 1 0.0605 0.02886 0.0238 0.154
## 814 3 1 0.0404 0.02533 0.0118 0.138
##
## sex=2
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 5 64 1 0.984 0.0155 0.9545 1.000
## 60 63 1 0.969 0.0217 0.9270 1.000
## 61 62 1 0.953 0.0264 0.9027 1.000
## 62 61 1 0.938 0.0303 0.8800 0.999
## 79 60 1 0.922 0.0335 0.8584 0.990
## 81 59 1 0.906 0.0364 0.8376 0.981
## 95 58 1 0.891 0.0390 0.8174 0.970
## 107 56 1 0.875 0.0414 0.7972 0.960
## 145 55 1 0.859 0.0436 0.7774 0.949
## 153 54 1 0.843 0.0456 0.7581 0.937
## 167 53 1 0.827 0.0475 0.7390 0.925
## 199 50 1 0.810 0.0493 0.7194 0.913
## 201 49 1 0.794 0.0510 0.7000 0.900
## 226 45 1 0.776 0.0528 0.6794 0.887
## 239 43 1 0.758 0.0546 0.6584 0.873
## 245 40 1 0.739 0.0564 0.6366 0.859
## 268 37 1 0.719 0.0583 0.6136 0.843
## 285 34 1 0.698 0.0603 0.5894 0.827
## 293 32 1 0.676 0.0623 0.5647 0.810
## 305 30 1 0.654 0.0641 0.5394 0.792
## 310 29 1 0.631 0.0658 0.5146 0.774
## 345 27 1 0.608 0.0674 0.4892 0.755
## 348 26 1 0.584 0.0687 0.4642 0.736
## 351 25 1 0.561 0.0698 0.4397 0.716
## 361 24 1 0.538 0.0707 0.4155 0.696
## 363 23 1 0.514 0.0714 0.3918 0.675
## 426 19 1 0.487 0.0726 0.3639 0.653
## 433 18 1 0.460 0.0734 0.3366 0.629
## 444 17 1 0.433 0.0739 0.3100 0.605
## 450 16 1 0.406 0.0741 0.2839 0.581
## 473 15 1 0.379 0.0739 0.2585 0.556
## 520 13 1 0.350 0.0738 0.2314 0.529
## 550 11 1 0.318 0.0736 0.2020 0.501
## 641 8 1 0.278 0.0744 0.1648 0.470
## 687 7 1 0.239 0.0736 0.1303 0.437
## 705 6 1 0.199 0.0713 0.0984 0.401
## 731 5 1 0.159 0.0672 0.0695 0.364
## 765 3 1 0.106 0.0623 0.0335 0.335
# Plot KM survival curve
ggsurvplot(km_fit, data = lung,
conf.int = TRUE, # Show confidence intervals
pval = TRUE, # Show p-value
risk.table = TRUE,# Show risk table
xlab = "Time (days)",
ylab = "Survival Probability",
ggtheme = theme_minimal())
##Interpretation ### Kaplan-Meier Survival Analysis by Sex The
Kaplan-Meier survival analysis examined survival differences between
males and females.
The log-rank test resulted in a p-value of 0.014,
indicating a statistically significant difference in survival between
the two groups.
The survival curves suggest that female sex have a highersurvival rate
then male sex.
##Nelson-Aalen estimator
# Fit the Nelson-Aalen estimator
na_fit <- survfit(Surv(time, status) ~ sex, data = lung)
# Plot the cumulative hazard function
ggsurvplot(na_fit, fun = "cumhaz", data = lung, conf.int = TRUE,
title = "Nelson-Aalen Cumulative Hazard Estimate",
xlab = "Time (days)", ylab = "Cumulative Hazard",
legend.title = "Sex",
legend.labs = c("Male", "Female"))
##Interpretation ##Survival analysis by Nelson-Aalen on sex -male group
has a higher cumulative hazard than the female group, it suggests that
males experience the event ( death) at a higher rate. -A steeper curve
indicates a higher cumulative hazard, meaning male in that group
experience a higher risk of the event ( death) over time.
##Log-Rank Test for Survival Differences
survdiff(Surv(time, status) ~ sex, data =lung)
## Call:
## survdiff(formula = Surv(time, status) ~ sex, data = lung)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=1 103 82 68.7 2.57 6.05
## sex=2 64 38 51.3 3.44 6.05
##
## Chisq= 6 on 1 degrees of freedom, p= 0.01
##Interpretation ## Log-Rank Test for Survival Differences
The Log-Rank Test was conducted to compare survival distributions between groups (sex). The test yielded a Chi-Square statistic of 6.00, with 1 degree of freedom, and a p-value of 0.01. Since the p-value is less than 0.05, we conclude that there is a statistically significant difference in survival between the groups.
##Cox Proportional Hazards model
# Fit Cox Proportional Hazards model
cox_fit <- coxph(Surv(time, status) ~ ph.ecog, data = lung)
# View model summary
summary(cox_fit)
## Call:
## coxph(formula = Surv(time, status) ~ ph.ecog, data = lung)
##
## n= 167, number of events= 120
##
## coef exp(coef) se(coef) z Pr(>|z|)
## ph.ecog 0.4693 1.5988 0.1331 3.527 0.00042 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## ph.ecog 1.599 0.6255 1.232 2.075
##
## Concordance= 0.615 (se = 0.028 )
## Likelihood ratio test= 12.41 on 1 df, p=4e-04
## Wald test = 12.44 on 1 df, p=4e-04
## Score (logrank) test = 12.61 on 1 df, p=4e-04
##Interpretation 1. Model Summary The Cox Proportional Hazards model was fitted to assess the impact of ph.ecog on survival. The dataset contained 167 observations, with 120 events (deaths) recorded.
Variable Coefficient Hazard Ratio (HR) 95% CI (Lower) 95% CI (Upper) p-value ph.ecog 0.4693 1.5988 1.232 2.075 0.00042 Hazard Ratio (HR) = 1.599 → A one-unit increase in ph.ecog increases the hazard (risk of death) by 59.9%. p-value = 0.00042 → Since p < 0.05, ph.ecog is statistically significant, meaning it has a strong association with survival. 95% Confidence Interval (1.232 - 2.075) → Since the entire confidence interval is above 1, this confirms that higher ph.ecog values are associated with an increased risk of death.
Model Performance and Significance Tests Concordance = 0.615 → The model correctly predicts survival ranking 61.5% of the time, indicating moderate predictive performance. Likelihood Ratio Test, Wald Test, and Score Test All tests resulted in p < 0.05, confirming that the model is statistically significant and that ph.ecog is an important predictor of survival.
Final Conclusion Higher ph.ecog scores significantly increase the risk of death (HR = 1.599, p = 0.00042). The model provides a moderate fit to the data (Concordance = 0.615). Since ph.ecog is statistically significant, it should be considered an important factor in predicting survival outcomes.
# Check proportional hazards assumption
cox.zph(cox_fit)
## chisq df p
## ph.ecog 4.72 1 0.03
## GLOBAL 4.72 1 0.03
##Interpretation since the p value is less than 0.05 the proportional hazard assumption holds
# Fit the Cox Proportional Hazards Model
cox_model <- coxph(Surv(time, status) ~ ph.ecog, data = lung)
# Create survival curves stratified by ph.ecog
cox_surv_fit <- survfit(Surv(time, status) ~ ph.ecog, data = lung)
# Plot the survival curves
ggsurvplot(cox_surv_fit, data = lung,
ggtheme = theme_minimal(),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE)