library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(naniar)
library(readr)
library(ggplot2)
library(broom)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyr)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(haven)
library(survival)
library(survminer)
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
##
## The following object is masked from 'package:survival':
##
## myeloma
library(readxl)
sas_data <- read_sas("/Users/anna/Downloads/Exam1Exercise1.sas7bdat")
View(sas_data)
km_fit_grouped_1 <- survfit(Surv(time, fail) ~ treatmnt, data = sas_data)
summary(km_fit_grouped_1)
## Call: survfit(formula = Surv(time, fail) ~ treatmnt, data = sas_data)
##
## treatmnt=0
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 5 42 2 0.9524 0.0329 0.8901 1.000
## 8 37 1 0.9266 0.0408 0.8500 1.000
## 10 36 1 0.9009 0.0471 0.8131 0.998
## 11 35 2 0.8494 0.0568 0.7451 0.968
## 12 32 4 0.7432 0.0702 0.6176 0.894
## 13 26 2 0.6861 0.0756 0.5528 0.851
## 15 21 1 0.6534 0.0787 0.5160 0.827
## 17 19 1 0.6190 0.0817 0.4778 0.802
## 19 18 1 0.5846 0.0841 0.4409 0.775
## 20 17 1 0.5502 0.0859 0.4052 0.747
## 21 16 2 0.4815 0.0879 0.3367 0.689
## 27 13 1 0.4444 0.0886 0.3007 0.657
## 30 12 1 0.4074 0.0886 0.2660 0.624
## 32 9 1 0.3621 0.0896 0.2230 0.588
## 34 8 2 0.2716 0.0871 0.1449 0.509
## 38 4 1 0.2037 0.0879 0.0874 0.475
## 39 3 1 0.1358 0.0807 0.0424 0.435
## 84 2 1 0.0679 0.0627 0.0111 0.415
## 88 1 1 0.0000 NaN NA NA
##
## treatmnt=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 146 1 0.9932 0.00683 0.97986 1.000
## 2 142 2 0.9792 0.01191 0.95610 1.000
## 3 137 3 0.9577 0.01690 0.92517 0.991
## 4 133 2 0.9433 0.01947 0.90592 0.982
## 5 129 5 0.9068 0.02464 0.85972 0.956
## 6 124 2 0.8921 0.02633 0.84200 0.945
## 7 119 6 0.8472 0.03074 0.78899 0.910
## 8 112 6 0.8018 0.03423 0.73741 0.872
## 9 106 6 0.7564 0.03697 0.68729 0.832
## 10 100 2 0.7413 0.03774 0.67085 0.819
## 11 95 10 0.6632 0.04105 0.58746 0.749
## 12 83 2 0.6472 0.04159 0.57066 0.734
## 13 79 3 0.6227 0.04236 0.54494 0.711
## 14 75 3 0.5978 0.04304 0.51909 0.688
## 15 69 3 0.5718 0.04370 0.49222 0.664
## 16 65 1 0.5630 0.04391 0.48317 0.656
## 17 64 3 0.5366 0.04441 0.45623 0.631
## 18 60 2 0.5187 0.04470 0.43809 0.614
## 19 55 3 0.4904 0.04515 0.40945 0.587
## 20 51 2 0.4712 0.04538 0.39013 0.569
## 21 48 4 0.4319 0.04565 0.35110 0.531
## 22 43 4 0.3917 0.04561 0.31181 0.492
## 24 38 2 0.3711 0.04548 0.29188 0.472
## 25 35 2 0.3499 0.04528 0.27152 0.451
## 26 33 1 0.3393 0.04514 0.26143 0.440
## 27 32 1 0.3287 0.04495 0.25142 0.430
## 29 27 2 0.3044 0.04480 0.22808 0.406
## 30 25 1 0.2922 0.04463 0.21659 0.394
## 34 22 2 0.2656 0.04435 0.19149 0.368
## 35 19 1 0.2516 0.04416 0.17840 0.355
## 36 18 1 0.2377 0.04387 0.16552 0.341
## 38 17 1 0.2237 0.04346 0.15284 0.327
## 39 16 1 0.2097 0.04293 0.14039 0.313
## 40 15 2 0.1817 0.04151 0.11615 0.284
## 41 13 2 0.1538 0.03955 0.09289 0.255
## 55 8 1 0.1346 0.03900 0.07624 0.237
## 56 7 1 0.1153 0.03787 0.06060 0.220
## 59 6 1 0.0961 0.03611 0.04602 0.201
## 62 4 1 0.0721 0.03415 0.02848 0.182
## 63 3 1 0.0481 0.03006 0.01411 0.164
## 78 2 1 0.0240 0.02268 0.00378 0.153
ggsurvplot(km_fit_grouped_1, conf.int = TRUE, pval = TRUE, risk.table = TRUE)
Log-rank test
dif_1 <- survdiff(Surv(time, fail) ~ treatmnt, data = sas_data)
dif_1
## Call:
## survdiff(formula = Surv(time, fail) ~ treatmnt, data = sas_data)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## treatmnt=0 48 27 32.6 0.962 1.37
## treatmnt=1 146 100 94.4 0.332 1.37
##
## Chisq= 1.4 on 1 degrees of freedom, p= 0.2
The p-value is 0.2, indicating that there is no statistically significant difference at the α = 0.05 level in disease spread (local or metastatic) between treatment groups A and B.
cox_model <- coxph(Surv(time, fail) ~ treatmnt, data = sas_data)
summary(cox_model)
## Call:
## coxph(formula = Surv(time, fail) ~ treatmnt, data = sas_data)
##
## n= 194, number of events= 127
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treatmnt 0.2511 1.2855 0.2189 1.147 0.251
##
## exp(coef) exp(-coef) lower .95 upper .95
## treatmnt 1.285 0.7779 0.837 1.974
##
## Concordance= 0.537 (se = 0.02 )
## Likelihood ratio test= 1.38 on 1 df, p=0.2
## Wald test = 1.32 on 1 df, p=0.3
## Score (logrank) test = 1.32 on 1 df, p=0.3
In the univariate Cox model, when adjusted only for the treatment group, the effect of treatment was not statistically significant (p = 0.25). This indicates insufficient evidence to reject the null hypothesis that the coefficient differs from zero.
cox_model_2 <- coxph(Surv(time, fail) ~ treatmnt + age + perfstat, data = sas_data)
summary(cox_model_2)
## Call:
## coxph(formula = Surv(time, fail) ~ treatmnt + age + perfstat,
## data = sas_data)
##
## n= 194, number of events= 127
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treatmnt 0.394148 1.483120 0.225479 1.748 0.08046 .
## age -0.011073 0.988988 0.009331 -1.187 0.23537
## perfstat 0.612568 1.845164 0.190255 3.220 0.00128 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## treatmnt 1.483 0.6743 0.9533 2.307
## age 0.989 1.0111 0.9711 1.007
## perfstat 1.845 0.5420 1.2708 2.679
##
## Concordance= 0.619 (se = 0.027 )
## Likelihood ratio test= 12.1 on 3 df, p=0.007
## Wald test = 12.16 on 3 df, p=0.007
## Score (logrank) test = 12.39 on 3 df, p=0.006
After adjusting for treatment group, age, and performance status, we found that performance status was highly statistically significant, with non-ambulatory patients having an 84.5% higher hazard of the disease spread. Age was not a statistically significant predictor.
Additionally, treatment remained a non-significant predictor; however, its coefficient was higher than in the univariate model, suggesting a 48.3% higher hazard for the treatment B. This may suggest potential confounding or an interaction effect with age and performance status.
c.The likelihood ratio test (p = 0.007) indicates that the overall model is statistically significant.
cox.zph(cox_model_2)
## chisq df p
## treatmnt 0.353 1 0.552
## age 3.185 1 0.074
## perfstat 1.605 1 0.205
## GLOBAL 7.610 3 0.055
Schoenfeld residuals test for the proportional hazards assumption:
For individual covariates, there is no strong evidence to suggest a violation of the proportional hazards assumption for treatment, age, and performance status. The global test indicates a mild concern (p-value near 0.05), but the overall conclusion is that the model generally does not violate the proportional hazards assumption based on these test.
residuals_schoenfeld <- cox.zph(cox_model_2)
plot(residuals_schoenfeld)
The Schoenfeld residual plots for age, treatment, and performance status mostly show random scatter around zero, suggesting that there is no clear trend over time. This indicates that the proportional hazards assumption is likely not violated for these covariates.
res_martingale <- residuals(cox_model_2, type = "martingale")
plot(sas_data$age, res_martingale)
We observe a random scatter in the Martingale residuals plot, which
indicates that the relationship between age and the log hazard is
correctly specified as linear. There is no systematic pattern,
suggesting that no major violations of the linearity assumption are
present. This means that the covariates included in the model do not
require transformation or non-linear adjustments.
survfit_obj <- survfit(cox_model_2)
plot(survfit_obj, fun = "cloglog")
Log(-Log(Survival)) vs. Log(Time) plot shows that the overall trend of
the curves is approximately parallel, the proportional hazards (PH)
assumption holds.
dfbeta_values <- residuals(cox_model_2, type = "dfbeta")
matplot(dfbeta_values, type = "h", lty = 1, col = 1:ncol(dfbeta_values),
ylab = "DFBeta", main = "Influence of Each Observation")
The DFBeta plot shows a few spikes, indicating the presence of
influential points. However, overall, the plot appears normal, with most
of the observations having no outsized effect on the model’s
coefficients.
data_csv <- read_csv("/Users/anna/Downloads/Exam1Exercise2.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): hihypert, hidiabet, himi, lveejf, tx, age, egfr, gender, death, t2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(data_csv)
pct_complete(data_csv) #checking_missing_data
## [1] 100
cox_full_model <- coxph(Surv(t2death, death) ~ tx + hihypert + hidiabet + himi + lveejf + age + egfr + gender, data=data_csv)
cox_step_model <- step(cox_full_model, direction = "both")
## Start: AIC=1781.4
## Surv(t2death, death) ~ tx + hihypert + hidiabet + himi + lveejf +
## age + egfr + gender
##
## Df AIC
## - himi 1 1780.4
## - hihypert 1 1780.5
## - gender 1 1780.6
## - egfr 1 1780.6
## <none> 1781.4
## - tx 1 1783.4
## - hidiabet 1 1786.2
## - lveejf 1 1786.9
## - age 1 1814.6
##
## Step: AIC=1780.43
## Surv(t2death, death) ~ tx + hihypert + hidiabet + lveejf + age +
## egfr + gender
##
## Df AIC
## - hihypert 1 1779.4
## - gender 1 1779.5
## - egfr 1 1779.5
## <none> 1780.4
## + himi 1 1781.4
## - tx 1 1782.5
## - hidiabet 1 1784.8
## - lveejf 1 1787.5
## - age 1 1812.8
##
## Step: AIC=1779.44
## Surv(t2death, death) ~ tx + hidiabet + lveejf + age + egfr +
## gender
##
## Df AIC
## - gender 1 1778.4
## - egfr 1 1778.6
## <none> 1779.4
## + hihypert 1 1780.4
## + himi 1 1780.5
## - tx 1 1781.5
## - hidiabet 1 1784.0
## - lveejf 1 1786.2
## - age 1 1812.5
##
## Step: AIC=1778.36
## Surv(t2death, death) ~ tx + hidiabet + lveejf + age + egfr
##
## Df AIC
## - egfr 1 1777.2
## <none> 1778.4
## + gender 1 1779.4
## + hihypert 1 1779.5
## + himi 1 1779.5
## - tx 1 1780.2
## - hidiabet 1 1782.8
## - lveejf 1 1786.5
## - age 1 1810.9
##
## Step: AIC=1777.24
## Surv(t2death, death) ~ tx + hidiabet + lveejf + age
##
## Df AIC
## <none> 1777.2
## + hihypert 1 1778.3
## + egfr 1 1778.4
## + himi 1 1778.5
## + gender 1 1778.6
## - tx 1 1779.3
## - hidiabet 1 1781.7
## - lveejf 1 1785.5
## - age 1 1812.6
summary(cox_step_model)
## Call:
## coxph(formula = Surv(t2death, death) ~ tx + hidiabet + lveejf +
## age, data = data_csv)
##
## n= 1000, number of events= 141
##
## coef exp(coef) se(coef) z Pr(>|z|)
## tx -0.343635 0.709188 0.171913 -1.999 0.04562 *
## hidiabet -0.504948 0.603537 0.190884 -2.645 0.00816 **
## lveejf -0.030826 0.969645 0.009882 -3.119 0.00181 **
## age 0.070334 1.072866 0.012002 5.860 4.62e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## tx 0.7092 1.4101 0.5063 0.9933
## hidiabet 0.6035 1.6569 0.4152 0.8774
## lveejf 0.9696 1.0313 0.9510 0.9886
## age 1.0729 0.9321 1.0479 1.0984
##
## Concordance= 0.688 (se = 0.021 )
## Likelihood ratio test= 60.61 on 4 df, p=2e-12
## Wald test = 57.33 on 4 df, p=1e-11
## Score (logrank) test = 58.42 on 4 df, p=6e-12
Using stepwise regression in both directions, the final model identified treatment (p=0.0456), history of diabetes(p=0.0082), left ventricular ejection fraction (p= 0.0018), and age (p= 4.6e-09) as statistically significant predictors of survival. The analysis shows that the experimental treatment significantly reduces the risk of the event, with a hazard ratio of 0.709, indicating a 29% reduction in risk compared to the placebo. A history of diabetes increases the risk, with a hazard ratio of 0.604, suggesting a 60% higher risk for individuals with diabetes. However, the effect of Left Ventricular Ejection Fraction (lveejf) is minimal, with a hazard ratio of 0.970, indicating only a slight decrease in risk per unit increase in ejection fraction.
cox_full_interaction <- coxph(Surv(t2death, death) ~ tx * (hidiabet + lveejf + age), data = data_csv)
cox_step_interaction <- step(cox_full_interaction, direction = "both")
## Start: AIC=1783.11
## Surv(t2death, death) ~ tx * (hidiabet + lveejf + age)
##
## Df AIC
## - tx:hidiabet 1 1781.1
## - tx:lveejf 1 1781.1
## - tx:age 1 1781.2
## <none> 1783.1
##
## Step: AIC=1781.12
## Surv(t2death, death) ~ tx + hidiabet + lveejf + age + tx:lveejf +
## tx:age
##
## Df AIC
## - tx:lveejf 1 1779.1
## - tx:age 1 1779.2
## <none> 1781.1
## + tx:hidiabet 1 1783.1
## - hidiabet 1 1785.6
##
## Step: AIC=1779.14
## Surv(t2death, death) ~ tx + hidiabet + lveejf + age + tx:age
##
## Df AIC
## - tx:age 1 1777.2
## <none> 1779.1
## + tx:lveejf 1 1781.1
## + tx:hidiabet 1 1781.1
## - hidiabet 1 1783.6
## - lveejf 1 1787.4
##
## Step: AIC=1777.24
## Surv(t2death, death) ~ tx + hidiabet + lveejf + age
##
## Df AIC
## <none> 1777.2
## + tx:age 1 1779.1
## + tx:lveejf 1 1779.2
## + tx:hidiabet 1 1779.2
## - tx 1 1779.3
## - hidiabet 1 1781.7
## - lveejf 1 1785.5
## - age 1 1812.6
summary(cox_step_interaction)
## Call:
## coxph(formula = Surv(t2death, death) ~ tx + hidiabet + lveejf +
## age, data = data_csv)
##
## n= 1000, number of events= 141
##
## coef exp(coef) se(coef) z Pr(>|z|)
## tx -0.343635 0.709188 0.171913 -1.999 0.04562 *
## hidiabet -0.504948 0.603537 0.190884 -2.645 0.00816 **
## lveejf -0.030826 0.969645 0.009882 -3.119 0.00181 **
## age 0.070334 1.072866 0.012002 5.860 4.62e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## tx 0.7092 1.4101 0.5063 0.9933
## hidiabet 0.6035 1.6569 0.4152 0.8774
## lveejf 0.9696 1.0313 0.9510 0.9886
## age 1.0729 0.9321 1.0479 1.0984
##
## Concordance= 0.688 (se = 0.021 )
## Likelihood ratio test= 60.61 on 4 df, p=2e-12
## Wald test = 57.33 on 4 df, p=1e-11
## Score (logrank) test = 58.42 on 4 df, p=6e-12
The stepwise regression did not identify any significant interaction terms, indicating no statistical evidence of effect modification between treatment and age, history of diabetes, or left ventricular ejection fraction.
cox_model_3 <- coxph(Surv(t2death, death) ~ tx + hidiabet + lveejf + age, data=data_csv)
summary(cox_model_3)
## Call:
## coxph(formula = Surv(t2death, death) ~ tx + hidiabet + lveejf +
## age, data = data_csv)
##
## n= 1000, number of events= 141
##
## coef exp(coef) se(coef) z Pr(>|z|)
## tx -0.343635 0.709188 0.171913 -1.999 0.04562 *
## hidiabet -0.504948 0.603537 0.190884 -2.645 0.00816 **
## lveejf -0.030826 0.969645 0.009882 -3.119 0.00181 **
## age 0.070334 1.072866 0.012002 5.860 4.62e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## tx 0.7092 1.4101 0.5063 0.9933
## hidiabet 0.6035 1.6569 0.4152 0.8774
## lveejf 0.9696 1.0313 0.9510 0.9886
## age 1.0729 0.9321 1.0479 1.0984
##
## Concordance= 0.688 (se = 0.021 )
## Likelihood ratio test= 60.61 on 4 df, p=2e-12
## Wald test = 57.33 on 4 df, p=1e-11
## Score (logrank) test = 58.42 on 4 df, p=6e-12
Likelihood Ratio Test (p = 2e-12), Wald Test (p = 1e-11), and Score (Log-rank) Test (p = 6e-12) all have highly significant p-values (p < 0.001), suggesting that this model is statistically significant.
cox.zph(cox_model_3)
## chisq df p
## tx 0.998 1 0.32
## hidiabet 0.682 1 0.41
## lveejf 0.289 1 0.59
## age 0.678 1 0.41
## GLOBAL 2.699 4 0.61
residuals_schoenfeld_2 <- cox.zph(cox_model_3)
plot(residuals_schoenfeld_2)
The Schoenfeld residuals plots for individual covariates do not show any
major concerns regarding trends over time, mostly showing a random
scatter around 0. While some points with extreme values and potential
outliers are observed, they do not raise significant concerns regarding
the model’s assumptions. The global test and individual predictor tests
for Schoenfeld residuals had p-values greater than 0.05, indicating that
there are no violations of the proportional hazards assumption for the
model.
survfit_obj_2 <- survfit(cox_model_3)
plot(survfit_obj_2, fun = "cloglog")
Log(-Log(Survival)) vs. Log(Time) plot shows that the overall trend of
the curves is approximately parallel (they do not cross), the
proportional hazards (PH) assumption holds.
dfbeta_values2 <- residuals(cox_model_3, type = "dfbeta")
matplot(dfbeta_values2, type = "h", lty = 1, col = 1:ncol(dfbeta_values),
ylab = "DFBeta", main = "Influence of Each Observation")
Most DFBeta values are close to zero, indicating minimal impact from
most observations. However, some extreme values suggest influential
observations that significantly alter the regression coefficients. High
DFBeta values should be investigated further, as they may indicate
outliers or highly influential data points that could impact model
stability.