install.packages(“dplyr”) library(dplyr) library(stargazer) library(caret) library(MASS) library ( ISLR ) library(“psych”) library(“MASS”) library(“ggord”) library(“devtools”) library(“ggplot2”) library(“tidyverse”) library(“survival”)
#library(epitools)
## Load Data
Final_data <- read.csv(file = "C:/Users/Issac/OneDrive/Desktop/Meharry/MSDS 565 Predictive Modeling and Analytics/Predicitive Modeling/GUSTO_sample4p.csv")
summary(Final_data)
DAY30 SEX AGE A65 KILLIP
Min. :0.00000 Min. :0.0000 Min. :28.37 Min. :0.0000 Min. :1.000
1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:53.43 1st Qu.:0.0000 1st Qu.:1.000
Median :0.00000 Median :0.0000 Median :62.84 Median :0.0000 Median :1.000
Mean :0.06624 Mean :0.2586 Mean :61.82 Mean :0.4166 Mean :1.243
3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:70.67 3rd Qu.:1.0000 3rd Qu.:1.000
Max. :1.00000 Max. :1.0000 Max. :86.00 Max. :1.0000 Max. :4.000
SHO DIA HYP HRT ANT
Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.00000 Median :0.0000 Median :0.00000 Median :0.0000 Median :0.0000
Mean :0.02675 Mean :0.1083 Mean :0.05096 Mean :0.2662 Mean :0.3554
3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
PMI HEI WEI HTN SMK
Min. :0.0000 Min. :141.3 Min. : 40.00 Min. :0.0000 Min. :1.000
1st Qu.:0.0000 1st Qu.:163.0 1st Qu.: 66.00 1st Qu.:0.0000 1st Qu.:1.000
Median :0.0000 Median :170.0 Median : 75.00 Median :0.0000 Median :2.000
Mean :0.1783 Mean :169.2 Mean : 75.24 Mean :0.3758 Mean :1.943
3rd Qu.:0.0000 3rd Qu.:175.0 3rd Qu.: 84.00 3rd Qu.:1.0000 3rd Qu.:3.000
Max. :1.0000 Max. :197.0 Max. :128.00 Max. :1.0000 Max. :3.000
LIP PAN FAM STE ST4
Min. :0.0000 Min. :0.0000 Min. :0.000 Min. : 0.000 Min. :0.000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 3.000 1st Qu.:0.000
Median :0.0000 Median :0.0000 Median :0.000 Median : 4.000 Median :0.000
Mean :0.3885 Mean :0.3758 Mean :0.414 Mean : 4.266 Mean :0.414
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.: 6.000 3rd Qu.:1.000
Max. :1.0000 Max. :1.0000 Max. :1.000 Max. :10.000 Max. :1.000
TTR
Min. :0.0000
1st Qu.:0.0000
Median :1.0000
Mean :0.5032
3rd Qu.:1.0000
Max. :1.0000
##a) Mortality of patients with hypertension
model_hypertension <- glm(DAY30 ~ HYP, data = Final_data, family = binomial)
summary(model_hypertension)
Call:
glm(formula = DAY30 ~ HYP, family = binomial, data = Final_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.6981 0.1507 -17.9 <2e-16 ***
HYP 0.7522 0.5013 1.5 0.133
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 382.78 on 784 degrees of freedom
Residual deviance: 380.86 on 783 degrees of freedom
AIC: 384.86
Number of Fisher Scoring iterations: 5
# Calculate the unadjusted odds ratio
exp(coef(model_hypertension))
(Intercept) HYP
0.06733524 2.12158055
# Surprisingly it appears that hypertension is statistically insignificant pertaining to the 30 Day mortality rate.
#Exclude Hypertension
no_hyper <- subset(Final_data,select = -c(HYP))
no_hyper
##a) Mortality of patients without hypertension
wo_hypertension <- glm(DAY30 ~. , data = no_hyper, family = binomial)
summary(wo_hypertension)
Call:
glm(formula = DAY30 ~ ., family = binomial, data = no_hyper)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.01007 4.72859 -1.060 0.289361
SEX -0.71396 0.49227 -1.450 0.146963
AGE 0.11343 0.03232 3.510 0.000448 ***
A65 -0.70775 0.59934 -1.181 0.237646
KILLIP 0.51446 0.35980 1.430 0.152763
SHO 0.12724 0.82069 0.155 0.876785
DIA 0.74227 0.46837 1.585 0.113012
HRT 0.97605 0.33619 2.903 0.003693 **
ANT 0.47970 0.38312 1.252 0.210543
PMI 0.75349 0.40826 1.846 0.064949 .
HEI -0.02546 0.02665 -0.956 0.339287
WEI -0.02083 0.01658 -1.256 0.209068
HTN 0.06971 0.33346 0.209 0.834402
SMK 0.15481 0.24463 0.633 0.526844
LIP 0.08528 0.34831 0.245 0.806574
PAN -0.34268 0.38874 -0.882 0.378038
FAM -0.27687 0.34141 -0.811 0.417384
STE -0.29543 0.17546 -1.684 0.092224 .
ST4 0.74336 0.58951 1.261 0.207316
TTR 0.69204 0.33941 2.039 0.041453 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 382.78 on 784 degrees of freedom
Residual deviance: 293.12 on 765 degrees of freedom
AIC: 333.12
Number of Fisher Scoring iterations: 7
# Calculate the unadjusted odds ratio
exp(coef(wo_hypertension))
(Intercept) SEX AGE A65 KILLIP SHO DIA
0.00667047 0.48969966 1.12011778 0.49275065 1.67272791 1.13569409 2.10070433
HRT ANT PMI HEI WEI HTN SMK
2.65395049 1.61559074 2.12441171 0.97485758 0.97938354 1.07220153 1.16743286
LIP PAN FAM STE ST4 TTR
1.08902468 0.70986455 0.75815407 0.74421238 2.10298460 1.99779667
##b) Mortality of patients with a family history of MI compared to patients without family history
MI_FAM <- subset(Final_data,FAM==1)
NO_MI_FAM <- subset(Final_data,FAM==0)
mi_history <- glm(DAY30 ~., data = MI_FAM, family = binomial)
summary(mi_history)
Call:
glm(formula = DAY30 ~ ., family = binomial, data = MI_FAM)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.285545 7.956871 -0.287 0.7739
SEX -0.927046 0.872610 -1.062 0.2881
AGE 0.078983 0.055312 1.428 0.1533
A65 -0.781837 1.013008 -0.772 0.4402
KILLIP 0.690892 0.653219 1.058 0.2902
SHO -0.380607 1.580247 -0.241 0.8097
DIA 0.556760 0.784431 0.710 0.4779
HYP 0.699866 1.140773 0.614 0.5395
HRT 0.372433 0.617677 0.603 0.5465
ANT 0.622005 0.680552 0.914 0.3607
PMI 1.386438 0.640317 2.165 0.0304 *
HEI -0.045504 0.046088 -0.987 0.3235
WEI -0.007371 0.029400 -0.251 0.8020
HTN 0.985317 0.633269 1.556 0.1197
SMK 0.248901 0.454502 0.548 0.5839
LIP 0.207366 0.595804 0.348 0.7278
PAN -0.508513 0.655756 -0.775 0.4381
FAM NA NA NA NA
STE -0.104075 0.310706 -0.335 0.7377
ST4 0.495747 1.112214 0.446 0.6558
TTR 0.853317 0.631218 1.352 0.1764
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 139.15 on 324 degrees of freedom
Residual deviance: 104.40 on 305 degrees of freedom
AIC: 144.4
Number of Fisher Scoring iterations: 7
no_mi_history <- glm(DAY30 ~., data = NO_MI_FAM, family = binomial)
summary(no_mi_history)
Call:
glm(formula = DAY30 ~ ., family = binomial, data = NO_MI_FAM)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.984e+00 6.352e+00 -1.414 0.15724
SEX -7.658e-01 6.791e-01 -1.128 0.25948
AGE 1.225e-01 4.356e-02 2.813 0.00491 **
A65 -5.412e-01 7.948e-01 -0.681 0.49588
KILLIP 6.269e-01 4.567e-01 1.373 0.16982
SHO 1.243e-02 1.052e+00 0.012 0.99057
DIA 8.411e-01 6.363e-01 1.322 0.18624
HYP 1.140e+00 7.263e-01 1.570 0.11642
HRT 1.379e+00 4.435e-01 3.109 0.00188 **
ANT 5.425e-01 5.153e-01 1.053 0.29240
PMI 3.007e-01 5.765e-01 0.522 0.60198
HEI -1.211e-05 3.511e-02 0.000 0.99972
WEI -3.516e-02 2.241e-02 -1.569 0.11662
HTN -2.855e-01 4.559e-01 -0.626 0.53114
SMK 1.434e-01 3.131e-01 0.458 0.64703
LIP 1.509e-01 4.650e-01 0.324 0.74560
PAN -1.850e-01 5.100e-01 -0.363 0.71679
FAM NA NA NA NA
STE -3.638e-01 2.126e-01 -1.711 0.08712 .
ST4 7.489e-01 7.441e-01 1.006 0.31420
TTR 6.396e-01 4.339e-01 1.474 0.14053
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 242.55 on 459 degrees of freedom
Residual deviance: 175.46 on 440 degrees of freedom
AIC: 215.46
Number of Fisher Scoring iterations: 7
# Calculate the unadjusted odds ratio
exp(coef(mi_history))
(Intercept) SEX AGE A65 KILLIP SHO DIA
0.1017186 0.3957210 1.0821854 0.4575648 1.9954956 0.6834463 1.7450097
HYP HRT ANT PMI HEI WEI HTN
2.0134830 1.4512611 1.8626594 4.0005729 0.9555156 0.9926562 2.6786610
SMK LIP PAN FAM STE ST4 TTR
1.2826154 1.2304325 0.6013890 NA 0.9011579 1.6417245 2.3474199
# Calculate the unadjusted odds ratio
exp(coef(no_mi_history))
(Intercept) SEX AGE A65 KILLIP SHO
0.0001253599 0.4649744250 1.1303499522 0.5820311181 1.8717770142 1.0125123945
DIA HYP HRT ANT PMI HEI
2.3188933823 3.1274226230 3.9704679302 1.7203769887 1.3508117497 0.9999878856
WEI HTN SMK LIP PAN FAM
0.9654484385 0.7516349561 1.1541477093 1.1628367769 0.8310948009 NA
STE ST4 TTR
0.6950545548 2.1146501246 1.8956578473
# Table of unadjusted odds ratios
table(exp(coef(mi_history)),exp(coef(no_mi_history)))
0.000125359940671297 0.464974424974044 0.58203111811277
0.101718593276656 1 0 0
0.395721004571528 0 1 0
0.457564786811585 0 0 1
0.601388976165652 0 0 0
0.683446341544921 0 0 0
0.90115786366715 0 0 0
0.955515632560991 0 0 0
0.992656212837062 0 0 0
1.08218540628884 0 0 0
1.23043246747591 0 0 0
1.28261541677541 0 0 0
1.45126114168593 0 0 0
1.64172448870533 0 0 0
1.74500967180091 0 0 0
1.86265935590802 0 0 0
1.99549560317865 0 0 0
2.01348301047689 0 0 0
2.34741985411732 0 0 0
2.67866102529081 0 0 0
4.00057287954461 0 0 0
0.695054554761247 0.751634956103207 0.831094800883081
0.101718593276656 0 0 0
0.395721004571528 0 0 0
0.457564786811585 0 0 0
0.601388976165652 0 0 1
0.683446341544921 0 0 0
0.90115786366715 1 0 0
0.955515632560991 0 0 0
0.992656212837062 0 0 0
1.08218540628884 0 0 0
1.23043246747591 0 0 0
1.28261541677541 0 0 0
1.45126114168593 0 0 0
1.64172448870533 0 0 0
1.74500967180091 0 0 0
1.86265935590802 0 0 0
1.99549560317865 0 0 0
2.01348301047689 0 0 0
2.34741985411732 0 0 0
2.67866102529081 0 1 0
4.00057287954461 0 0 0
0.965448438517011 0.999987885593354 1.01251239452578
0.101718593276656 0 0 0
0.395721004571528 0 0 0
0.457564786811585 0 0 0
0.601388976165652 0 0 0
0.683446341544921 0 0 1
0.90115786366715 0 0 0
0.955515632560991 0 1 0
0.992656212837062 1 0 0
1.08218540628884 0 0 0
1.23043246747591 0 0 0
1.28261541677541 0 0 0
1.45126114168593 0 0 0
1.64172448870533 0 0 0
1.74500967180091 0 0 0
1.86265935590802 0 0 0
1.99549560317865 0 0 0
2.01348301047689 0 0 0
2.34741985411732 0 0 0
2.67866102529081 0 0 0
4.00057287954461 0 0 0
1.13034995219644 1.15414770931069 1.16283677692187 1.35081174974651
0.101718593276656 0 0 0 0
0.395721004571528 0 0 0 0
0.457564786811585 0 0 0 0
0.601388976165652 0 0 0 0
0.683446341544921 0 0 0 0
0.90115786366715 0 0 0 0
0.955515632560991 0 0 0 0
0.992656212837062 0 0 0 0
1.08218540628884 1 0 0 0
1.23043246747591 0 0 1 0
1.28261541677541 0 1 0 0
1.45126114168593 0 0 0 0
1.64172448870533 0 0 0 0
1.74500967180091 0 0 0 0
1.86265935590802 0 0 0 0
1.99549560317865 0 0 0 0
2.01348301047689 0 0 0 0
2.34741985411732 0 0 0 0
2.67866102529081 0 0 0 0
4.00057287954461 0 0 0 1
1.72037698866098 1.87177701418154 1.89565784727504 2.11465012460796
0.101718593276656 0 0 0 0
0.395721004571528 0 0 0 0
0.457564786811585 0 0 0 0
0.601388976165652 0 0 0 0
0.683446341544921 0 0 0 0
0.90115786366715 0 0 0 0
0.955515632560991 0 0 0 0
0.992656212837062 0 0 0 0
1.08218540628884 0 0 0 0
1.23043246747591 0 0 0 0
1.28261541677541 0 0 0 0
1.45126114168593 0 0 0 0
1.64172448870533 0 0 0 1
1.74500967180091 0 0 0 0
1.86265935590802 1 0 0 0
1.99549560317865 0 1 0 0
2.01348301047689 0 0 0 0
2.34741985411732 0 0 1 0
2.67866102529081 0 0 0 0
4.00057287954461 0 0 0 0
2.31889338231425 3.12742262298551 3.97046793020662
0.101718593276656 0 0 0
0.395721004571528 0 0 0
0.457564786811585 0 0 0
0.601388976165652 0 0 0
0.683446341544921 0 0 0
0.90115786366715 0 0 0
0.955515632560991 0 0 0
0.992656212837062 0 0 0
1.08218540628884 0 0 0
1.23043246747591 0 0 0
1.28261541677541 0 0 0
1.45126114168593 0 0 1
1.64172448870533 0 0 0
1.74500967180091 1 0 0
1.86265935590802 0 0 0
1.99549560317865 0 0 0
2.01348301047689 0 1 0
2.34741985411732 0 0 0
2.67866102529081 0 0 0
4.00057287954461 0 0 0
#d) Compare the adjusted odds ration in b) and c) with the ones in question 1, are they the same? Why? (10 points)
# The odds ratios are diffent as question one took into hyptension alone as other variables. The no mi history is obviously different at 3.1274226230 vs 2.12158055
## Question 2: Predict 30-day Mortality
## a) Report your in an equation
prediction1 <- predict(model_hypertension,Final_data)
prediction2 <- predict(wo_hypertension,Final_data)
summary(prediction1)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.698 -2.698 -2.698 -2.660 -2.698 -1.946
summary(prediction2)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-8.261 -4.554 -3.597 -3.553 -2.554 1.443
#chooseCRANmirror()
# Specify a CRAN mirror directly in the install.packages function
install.packages("ggplot2", repos = "https://cloud.r-project.org")
Error in install.packages : Updating loaded packages
# Add this line to your .Rprofile file
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Visual comparison of Mortality based on non-hypertension and hypertension
install.packages("ggplot2")
Error in install.packages : Updating loaded packages
library(ggplot2)
Warning: package ‘ggplot2’ was built under R version 4.3.3
ggplot(model_hypertension, aes(x = HYP )) + geom_point(aes(y = DAY30), color = "blue") + geom_line(aes(y = prediction1), color = "red") + ggtitle("Hypertension vs Non-Hypertension") + theme_minimal()
NA
NA
## c) Adjusted Odds Ratio for Family History of MI
# Calculate the adjusted odds ratio for family history
exp(coef(mi_history)["FAM"])
FAM
NA
## Question 3: Interaction Terms
model_interaction <- glm(DAY30 ~ SEX * AGE + DIA * HYP, data = Final_data, family = binomial)
summary(model_interaction)
Call:
glm(formula = DAY30 ~ SEX * AGE + DIA * HYP, family = binomial,
data = Final_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -9.59845 1.42973 -6.713 1.90e-11 ***
SEX -3.10633 3.38787 -0.917 0.3592
AGE 0.10316 0.02041 5.053 4.34e-07 ***
DIA 0.80311 0.45364 1.770 0.0767 .
HYP 0.35128 0.65526 0.536 0.5919
SEX:AGE 0.03967 0.04605 0.862 0.3889
DIA:HYP 0.56757 1.20835 0.470 0.6386
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 382.78 on 784 degrees of freedom
Residual deviance: 328.29 on 778 degrees of freedom
AIC: 342.29
Number of Fisher Scoring iterations: 6
num_rows <- nrow(Final_data)
train_size <- round(0.7 * num_rows)
train_indices <- sample(1:num_rows, size = train_size)
training_data <- Final_data[train_indices, ]
testing_data <- Final_data[-train_indices, ]
## Question 4: Predict ST Elevation
## a) Poisson Regression Model
model_poisson <- glm(STE ~ SEX + AGE + DIA + HYP + SMK + FAM, data = Final_data, family = poisson)
summary(model_poisson)
Call:
glm(formula = STE ~ SEX + AGE + DIA + HYP + SMK + FAM, family = poisson,
data = Final_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.4460504 0.1038218 13.928 <2e-16 ***
SEX 0.0179893 0.0416059 0.432 0.6655
AGE 0.0003943 0.0017347 0.227 0.8202
DIA 0.0620090 0.0547282 1.133 0.2572
HYP -0.1547837 0.0844770 -1.832 0.0669 .
SMK -0.0020605 0.0237522 -0.087 0.9309
FAM -0.0488123 0.0356768 -1.368 0.1713
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 645.02 on 784 degrees of freedom
Residual deviance: 638.21 on 778 degrees of freedom
AIC: 3172.2
Number of Fisher Scoring iterations: 4
prediction_poss <- predict(model_poisson,training_data)
summary(prediction_poss)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.254 1.420 1.463 1.449 1.471 1.550
## b) Negative Binomial Regression Model
library(MASS)
# Set a control of 50 iteration was chosen to bypass the error message "Warning: iteration limit reachedWarning: iteration limit reached"
model_negbinom <- glm.nb(STE ~ SEX + AGE + DIA + HYP + SMK + FAM, data = Final_data,control= glm.control(maxit = 50))
Warning: iteration limit reachedWarning: NaNs producedWarning: iteration limit reachedWarning: NaNs produced
summary(model_negbinom)
Call:
glm.nb(formula = STE ~ SEX + AGE + DIA + HYP + SMK + FAM, data = Final_data,
control = glm.control(maxit = 50), init.theta = 237724054.9,
link = log)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.4460504 0.1038224 13.928 <2e-16 ***
SEX 0.0179893 0.0416060 0.432 0.6655
AGE 0.0003943 0.0017347 0.227 0.8202
DIA 0.0620090 0.0547285 1.133 0.2572
HYP -0.1547837 0.0844781 -1.832 0.0669 .
SMK -0.0020605 0.0237524 -0.087 0.9309
FAM -0.0488123 0.0356770 -1.368 0.1713
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for Negative Binomial(237724055) family taken to be 1)
Null deviance: 645.02 on 784 degrees of freedom
Residual deviance: 638.21 on 778 degrees of freedom
AIC: 3174.2
Number of Fisher Scoring iterations: 1
Error in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, :
invalid 'nsmall' argument