mydata <- read.csv("C:/Users/MihaPristov/Desktop/faks/R-multiverse/Hw4/heart.csv", header=TRUE, sep=",", dec=".")
mydata <- mydata[c(-3,-4,-6,-7,-9,-10)]
Variables:
Age: age of the patient [years]
Sex: sex of the patient [M: Male, F: Female]
Cholesterol: serum cholesterol [mg/dl]
MaxHR: maximum heart rate achieved [beats per minute]
ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
HeartDisease: output class [1: heart disease, 0: Normal]
The data set was retrieved from Kaggle on 19 February 2023: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction
RESEARCH QUESTION: Which variables affect whether a person has heart disease, and how strongly?
# recode the outcome and the categorical predictors as factors
mydata$HeartDisease_F <- factor(mydata$HeartDisease,
levels = c(0,1),
labels = c("No", "Yes"))
mydata$Sex_F <- factor(mydata$Sex,
levels = c("F", "M"),
labels = c(1,2))
mydata$ST_SlopeF <- factor(mydata$ST_Slope,
levels = c("Up", "Flat","Down"),
labels = c(1,2,3))
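As a quick check, cross-tabulations can be used to confirm that the recoding preserved the original categories:
# each original category should map to exactly one factor level
table(mydata$Sex, mydata$Sex_F)
table(mydata$ST_Slope, mydata$ST_SlopeF)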
# summary of Age, Cholesterol, MaxHR and the recoded ST slope
summary(mydata[,c(-2,-6,-7,-8,-5)])
## Age Cholesterol MaxHR ST_SlopeF
## Min. :28.00 Min. : 0.0 Min. : 60.0 1:395
## 1st Qu.:47.00 1st Qu.:173.2 1st Qu.:120.0 2:460
## Median :54.00 Median :223.0 Median :138.0 3: 63
## Mean :53.51 Mean :198.8 Mean :136.8
## 3rd Qu.:60.00 3rd Qu.:267.0 3rd Qu.:156.0
## Max. :77.00 Max. :603.0 Max. :202.0
library(car)
## Loading required package: carData
fit <- glm(HeartDisease ~ 1,
family = binomial,
data = mydata)
summary(fit)
##
## Call:
## glm(formula = HeartDisease ~ 1, family = binomial, data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.270 -1.270 1.088 1.088 1.088
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21432 0.06639 3.228 0.00125 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262.1 on 917 degrees of freedom
## Residual deviance: 1262.1 on 917 degrees of freedom
## AIC: 1264.1
##
## Number of Fisher Scoring iterations: 3
This is our first regression model, the simplest possible one: the null model containing only an intercept.
exp(cbind(odds = fit$coefficients, confint.default(fit)))
## odds 2.5 % 97.5 %
## (Intercept) 1.239024 1.087852 1.411205
head(fitted(fit))
## 1 2 3 4 5 6
## 0.5533769 0.5533769 0.5533769 0.5533769 0.5533769 0.5533769
# count R squared of the null model: 508 of the 918 patients have heart disease,
# and the null model classifies every unit as having it
Pseudo_R2_fit <- 508/918
Pseudo_R2_fit
## [1] 0.5533769
After calculating the odds and the count (pseudo) R squared, we can see that the probability that a unit is correctly classified by the null model is 55.3%.
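The same count R squared can also be computed from the data instead of typing 508/918 by hand; since the fitted probability of the null model is 0.553 > 0.5, every unit is classified as having heart disease:
# classify every unit with the null model and compare with the observed outcome
NullClass <- ifelse(fitted(fit) >= 0.5, 1, 0)
mean(NullClass == mydata$HeartDisease)   # equals 508/918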
fit1 <- glm(HeartDisease ~ Cholesterol,
family = binomial,
data = mydata)
summary(fit1)
##
## Call:
## glm(formula = HeartDisease ~ Cholesterol, family = binomial,
## data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6904 -1.2030 0.7401 1.1297 1.9063
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.154892 0.156934 7.359 1.85e-13 ***
## Cholesterol -0.004634 0.000679 -6.826 8.75e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262.1 on 917 degrees of freedom
## Residual deviance: 1210.3 on 916 degrees of freedom
## AIC: 1214.3
##
## Number of Fisher Scoring iterations: 4
In the second model we add Cholesterol, a numeric variable; its coefficient is significant and it already improves the model (the residual deviance drops from 1262.1 to 1210.3).
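The Cholesterol coefficient can be turned into an odds ratio in the same way as for the null model:
# odds ratio and 95% confidence interval for Cholesterol in fit1
exp(cbind(OR = coef(fit1), confint.default(fit1)))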
anova(fit, fit1, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: HeartDisease ~ 1
## Model 2: HeartDisease ~ Cholesterol
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 917 1262.1
## 2 916 1210.3 1 51.822 6.078e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0: the simpler model fits the data as well as the more complex model
H1: the more complex model fits better
With a very small p-value (p < 0.001) we can reject the null hypothesis and conclude that the second model, fit1, is the better one. We now go on to add more variables to improve the model further.
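The likelihood ratio statistic that anova() reports can also be obtained directly from the two deviances, as a quick sanity check:
# the difference in deviances follows a chi-squared distribution with 1 degree of freedom
DevDiff <- deviance(fit) - deviance(fit1)   # 1262.1 - 1210.3 = 51.8
pchisq(DevDiff, df = 1, lower.tail = FALSE)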
fit2 <- glm(HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + ST_SlopeF,
family = binomial,
data = mydata)
summary(fit2)
##
## Call:
## glm(formula = HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F +
## ST_SlopeF, family = binomial, data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7471 -0.5949 0.2740 0.5482 2.5826
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5692085 1.0049989 -1.561 0.118429
## Age 0.0341267 0.0107790 3.166 0.001545 **
## Cholesterol -0.0041433 0.0009226 -4.491 7.09e-06 ***
## MaxHR -0.0144608 0.0042296 -3.419 0.000629 ***
## Sex_F2 1.5757589 0.2386976 6.601 4.07e-11 ***
## ST_SlopeF2 2.9349635 0.2067029 14.199 < 2e-16 ***
## ST_SlopeF3 2.1928506 0.3577111 6.130 8.78e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262.14 on 917 degrees of freedom
## Residual deviance: 746.38 on 911 degrees of freedom
## AIC: 760.38
##
## Number of Fisher Scoring iterations: 5
The fit2 model shows that all the odds ratios are significant, so we check the model for multicollinearity, outliers and units with high influence.
vif(fit2)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.119891 1 1.058249
## Cholesterol 1.094423 1 1.046147
## MaxHR 1.134595 1 1.065174
## Sex_F 1.094766 1 1.046311
## ST_SlopeF 1.168230 2 1.039638
mean(vif(fit2))
## [1] 1.124495
Looking at the VIF values, we see no problems with multicollinearity.
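A per-predictor check, using the common rule of thumb that the squared GVIF^(1/(2*Df)) should stay below about 5, confirms this:
# squared adjusted GVIF values; all are far below the usual cut-off of 5
vif(fit2)[, "GVIF^(1/(2*Df))"]^2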
mydata$StdResid <- rstandard(fit2)
mydata$CookDist <- cooks.distance(fit2)
hist(mydata$StdResid,
main = "Histogram of standardized residuals",
ylab = "Frequency",
xlab = "Standardized residuals")
From the histogram we can see that no standardized residuals fall outside the interval from -3 to 3, so there are no outliers.
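The same check can be done numerically:
# number of standardized residuals below -3 or above 3 (should be 0)
sum(abs(mydata$StdResid) > 3)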
hist(mydata$CookDist,
main = "Histogram of cooks distances",
ylab = "Frequency",
xlab = "Cooks distances")
head(mydata[order(-mydata$CookDist), c("Age","CookDist")])
## Age CookDist
## 497 58 0.02507343
## 435 63 0.01803886
## 557 75 0.01787412
## 308 53 0.01744217
## 323 38 0.01712333
## 315 53 0.01624915
# remove the unit with the largest Cook's distance (row 497)
mydata <- mydata[-497,]
We remove this unit with high influence and re-estimate both models.
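Note that mydata[-497, ] removes the unit by row position, which coincides with the row name here because no rows had been dropped before. A position-independent alternative (shown only as a sketch, to be used instead of, not in addition to, the line above) would be:
# remove the unit with the largest Cook's distance, wherever it is located
# mydata <- mydata[-which.max(mydata$CookDist), ]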
fit1 <- glm(HeartDisease ~ Cholesterol,
family = binomial,
data = mydata)
fit2 <- glm(HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + ST_SlopeF,
family = binomial,
data = mydata)
anova(fit1, fit2, test = "Chi")
## Analysis of Deviance Table
##
## Model 1: HeartDisease ~ Cholesterol
## Model 2: HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + ST_SlopeF
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 915 1209.67
## 2 910 743.15 5 466.51 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We again perform the chi-squared (likelihood ratio) test to compare the two models:
H0: the simpler model fits the data as well as the more complex model
H1: the more complex model fits better
We can reject the null hypothesis at p < 0.001 and conclude that the second model is better.
summary(fit2)
##
## Call:
## glm(formula = HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F +
## ST_SlopeF, family = binomial, data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7350 -0.5891 0.2740 0.5423 2.5862
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.4360627 1.0072115 -1.426 0.153932
## Age 0.0335926 0.0107913 3.113 0.001852 **
## Cholesterol -0.0039321 0.0009285 -4.235 2.29e-05 ***
## MaxHR -0.0155250 0.0042756 -3.631 0.000282 ***
## Sex_F2 1.5830055 0.2388527 6.628 3.41e-11 ***
## ST_SlopeF2 2.9203561 0.2062523 14.159 < 2e-16 ***
## ST_SlopeF3 2.2769510 0.3664226 6.214 5.17e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1260.52 on 916 degrees of freedom
## Residual deviance: 743.15 on 910 degrees of freedom
## AIC: 757.15
##
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = fit2$coefficients, confint.default(fit2)))
## OR 2.5 % 97.5 %
## (Intercept) 0.2378625 0.03303588 1.7126392
## Age 1.0341632 1.01251980 1.0562692
## Cholesterol 0.9960756 0.99426449 0.9978900
## MaxHR 0.9845949 0.97637856 0.9928804
## Sex_F2 4.8695694 3.04914987 7.7768253
## ST_SlopeF2 18.5478911 12.38033227 27.7879669
## ST_SlopeF3 9.7469172 4.75299940 19.9878826
If Age increases by 1 year, the odds of heart disease are multiplied by 1.03, holding the remaining explanatory variables constant (p < 0.01).
If Cholesterol increases by 1 mg/dl, the odds of heart disease are multiplied by 0.996, holding the remaining explanatory variables constant (p < 0.001).
If MaxHR increases by 1 beat per minute, the odds of heart disease are multiplied by 0.985, holding the remaining explanatory variables constant (p < 0.001).
These odds ratios are very close to 1, which would suggest there is almost no effect, but the p-values show them as significant. This is most likely due to the size of the sample. We have to take this into consideration and understand that if there is an effect, it is very small per one-unit change (the rescaled odds ratios below make this easier to read).
Given the values of all other variables, the odds of a male getting heart disease are 4.87 times the odds of a female getting heart disease (p < 0.001).
Given the values of all other variables, the odds of a person with a flat ST segment slope getting heart disease are 18.5 times the odds of a person with an upsloping ST segment (p < 0.001).
Given the values of all other variables, the odds of a person with a downsloping ST segment getting heart disease are 9.75 times the odds of a person with an upsloping ST segment (p < 0.001).
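Because the odds ratios for Age, Cholesterol and MaxHR refer to a one-unit change (one year, one mg/dl, one beat per minute), it helps to rescale them to larger increments; a small sketch:
# odds ratios for a 10-year increase in Age, a 50 mg/dl increase in Cholesterol
# and a 10 bpm increase in MaxHR (all other variables held constant)
exp(10 * coef(fit2)["Age"])          # about 1.40
exp(50 * coef(fit2)["Cholesterol"])  # about 0.82
exp(10 * coef(fit2)["MaxHR"])        # about 0.86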
# estimated probability of heart disease for each unit
mydata$EstimatedProbability <- fitted(fit2)
head(mydata)
## Age Sex Cholesterol MaxHR ST_Slope HeartDisease HeartDisease_F Sex_F
## 1 40 M 289 172 Up 0 No 2
## 2 49 F 180 156 Flat 1 Yes 1
## 3 37 M 283 98 Up 0 No 2
## 4 48 F 214 108 Flat 1 Yes 1
## 5 54 M 195 122 Up 0 No 2
## 6 39 M 339 170 Up 0 No 2
## ST_SlopeF StdResid CookDist EstimatedProbability
## 1 1 -0.4353314 5.941491e-05 0.08980403
## 2 2 1.1702562 1.981517e-03 0.50016368
## 3 1 -0.6944184 7.311626e-04 0.22367656
## 4 2 0.9604217 1.412107e-03 0.64073782
## 5 1 -0.8921112 3.959114e-04 0.33184191
## 6 1 -0.3932120 4.852667e-05 0.07480090
The estimated probabilities show how likely each person is to have heart disease; for example, the first person has a probability of about 9%, while the second has about 50%.
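The fitted model can also be used to estimate the probability for a new, hypothetical patient; the values below are made up purely for illustration (a 50-year-old man with cholesterol 200 mg/dl, MaxHR 150 and an upsloping ST segment):
# the factor levels must match those used when fitting the model ("2" = male, "1" = upsloping)
NewPatient <- data.frame(Age = 50, Cholesterol = 200, MaxHR = 150,
                         Sex_F = factor("2", levels = c("1", "2")),
                         ST_SlopeF = factor("1", levels = c("1", "2", "3")))
predict(fit2, newdata = NewPatient, type = "response")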
mydata$Classification <- ifelse(test = mydata$EstimatedProbability < 0.5,
yes = "No",
no = "Yes")
ClassificationTable <- table(mydata$HeartDisease_F, mydata$Classification)
ClassificationTable
##
## No Yes
## No 329 80
## Yes 77 431
We can see, for example, that 80 people who did not have heart disease were classified as having it (false positives), while 77 people with heart disease were classified as healthy (false negatives).
# proportion of correctly classified units (count R squared of fit2)
Pseudo_R2_fit2 <- (ClassificationTable[1,1] + ClassificationTable[2,2])/nrow(mydata)
Pseudo_R2_fit2
## [1] 0.8287895
We see that about 82.9 percent of all units were correctly classified, a clear improvement over the 55.3 percent of the null model.
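From the same classification table we can also read off sensitivity and specificity:
# sensitivity: share of heart disease cases that were classified as "Yes"
ClassificationTable["Yes", "Yes"] / sum(ClassificationTable["Yes", ])   # about 0.85
# specificity: share of healthy people that were classified as "No"
ClassificationTable["No", "No"] / sum(ClassificationTable["No", ])      # about 0.80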