mydata <- read.csv("C:/Users/MihaPristov/Desktop/faks/R-multiverse/Hw4/heart.csv", header=TRUE, sep=",", dec=".")
mydata <- mydata[c(-3,-4,-6,-7,-9,-10)]   # keep Age, Sex, Cholesterol, MaxHR, ST_Slope and HeartDisease

Variables:

Age: age of the patient [years]
Sex: sex of the patient [M: Male, F: Female]
Cholesterol: serum cholesterol [mm/dl]
MaxHR: maximum heart rate achieved [beats per minute]
ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
HeartDisease: output class [1: heart disease, 0: Normal]

The data set was retrieved from Kaggle on 19 February 2023 from: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction

RESEARCH QUESTION: Which variables affect whether a person has heart disease, and how strongly?

# Recode the outcome and the categorical predictors as factors
mydata$HeartDisease_F <- factor(mydata$HeartDisease,
                       levels = c(0,1),
                       labels = c("No", "Yes"))

# "1" = F (female) is the reference level
mydata$Sex_F <- factor(mydata$Sex,
                       levels = c("F", "M"),
                       labels = c(1,2))

# "1" = Up is the reference level
mydata$ST_SlopeF <- factor(mydata$ST_Slope,
                       levels = c("Up", "Flat","Down"),
                       labels = c(1,2,3))

summary(mydata[,c(-2,-6,-7,-8,-5)])   # summary of Age, Cholesterol, MaxHR and the recoded ST_SlopeF
##       Age         Cholesterol        MaxHR       ST_SlopeF
##  Min.   :28.00   Min.   :  0.0   Min.   : 60.0   1:395    
##  1st Qu.:47.00   1st Qu.:173.2   1st Qu.:120.0   2:460    
##  Median :54.00   Median :223.0   Median :138.0   3: 63    
##  Mean   :53.51   Mean   :198.8   Mean   :136.8            
##  3rd Qu.:60.00   3rd Qu.:267.0   3rd Qu.:156.0            
##  Max.   :77.00   Max.   :603.0   Max.   :202.0
library(car)
## Loading required package: carData
fit <- glm(HeartDisease ~ 1,          # null model: intercept only
               family = binomial,
               data = mydata)

summary(fit)
## 
## Call:
## glm(formula = HeartDisease ~ 1, family = binomial, data = mydata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.270  -1.270   1.088   1.088   1.088  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.21432    0.06639   3.228  0.00125 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1262.1  on 917  degrees of freedom
## Residual deviance: 1262.1  on 917  degrees of freedom
## AIC: 1264.1
## 
## Number of Fisher Scoring iterations: 3

We estimate our first regression model, the null model containing only the intercept.

exp(cbind(odds = fit$coefficients, confint.default(fit)))
##                 odds    2.5 %   97.5 %
## (Intercept) 1.239024 1.087852 1.411205
head(fitted(fit))
##         1         2         3         4         5         6 
## 0.5533769 0.5533769 0.5533769 0.5533769 0.5533769 0.5533769
Pseudo_R2_fit <- 508/918   # 508 of the 918 patients have heart disease
Pseudo_R2_fit
## [1] 0.5533769

After calculating the odds and the count R², we can see that the probability that a unit is correctly classified by the null model is 55.3%.
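
As a quick check (a sketch; ClassNull is just a helper introduced here), the same 55.3% can be obtained without hard-coding the counts: with only an intercept every unit gets the same fitted probability (0.553 > 0.5), so every unit is classified as having heart disease and the hit rate equals the share of actual cases.

ClassNull <- as.numeric(fitted(fit) >= 0.5)            # everyone is classified as 1
sum(ClassNull == mydata$HeartDisease) / nrow(mydata)   # 508/918 = 0.5533769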

fit1 <- glm(HeartDisease ~ Cholesterol,
               family = binomial,
               data = mydata)

summary(fit1)
## 
## Call:
## glm(formula = HeartDisease ~ Cholesterol, family = binomial, 
##     data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6904  -1.2030   0.7401   1.1297   1.9063  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.154892   0.156934   7.359 1.85e-13 ***
## Cholesterol -0.004634   0.000679  -6.826 8.75e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1262.1  on 917  degrees of freedom
## Residual deviance: 1210.3  on 916  degrees of freedom
## AIC: 1214.3
## 
## Number of Fisher Scoring iterations: 4

In the second model we add Cholesterol, a numeric variable; it is statistically significant and already improves the model.
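
The improvement also shows up in the information criteria reported above (the AIC drops from 1264.1 for the null model to 1214.3 for fit1); a quick side-by-side comparison:

AIC(fit, fit1)   # the lower AIC of fit1 indicates the better-fitting model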

anova(fit, fit1, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: HeartDisease ~ 1
## Model 2: HeartDisease ~ Cholesterol
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       917     1262.1                          
## 2       916     1210.3  1   51.822 6.078e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H0: The simpler model fits the data as well as the more complex one.

H1: The more complex model fits the data better.

Given the very small p-value (p < 0.001), we reject the null hypothesis and conclude that the second model, fit1, fits better. We now add more variables to improve the model further.
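
For reference, the chi-squared statistic in the table above is simply the drop in deviance between the two fits; a minimal sketch of the same test done by hand:

lr_stat <- fit$deviance - fit1$deviance            # 1262.1 - 1210.3, about 51.8
pchisq(lr_stat, df = fit$df.residual - fit1$df.residual, lower.tail = FALSE)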

fit2 <- glm(HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + ST_SlopeF,
               family = binomial,
               data = mydata)

summary(fit2)
## 
## Call:
## glm(formula = HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + 
##     ST_SlopeF, family = binomial, data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7471  -0.5949   0.2740   0.5482   2.5826  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.5692085  1.0049989  -1.561 0.118429    
## Age          0.0341267  0.0107790   3.166 0.001545 ** 
## Cholesterol -0.0041433  0.0009226  -4.491 7.09e-06 ***
## MaxHR       -0.0144608  0.0042296  -3.419 0.000629 ***
## Sex_F2       1.5757589  0.2386976   6.601 4.07e-11 ***
## ST_SlopeF2   2.9349635  0.2067029  14.199  < 2e-16 ***
## ST_SlopeF3   2.1928506  0.3577111   6.130 8.78e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1262.14  on 917  degrees of freedom
## Residual deviance:  746.38  on 911  degrees of freedom
## AIC: 760.38
## 
## Number of Fisher Scoring iterations: 5

In the fit2 model all the coefficients are significant, so we next check the model for multicollinearity, outliers and units with high influence.

vif(fit2)
##                 GVIF Df GVIF^(1/(2*Df))
## Age         1.119891  1        1.058249
## Cholesterol 1.094423  1        1.046147
## MaxHR       1.134595  1        1.065174
## Sex_F       1.094766  1        1.046311
## ST_SlopeF   1.168230  2        1.039638
mean(vif(fit2))
## [1] 1.124495

Looking at the VIF values, there appear to be no problems with multicollinearity.
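
As a rough check (a sketch, assuming the common rule of thumb that GVIF^(1/(2*Df)) squared should stay below about 5, the analogue of VIF < 5), we can flag any problematic predictors explicitly:

vif(fit2)[, "GVIF^(1/(2*Df))"]^2 > 5   # all FALSE, so no predictor is flagged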

mydata$StdResid <- rstandard(fit2)
mydata$CookDist <- cooks.distance(fit2)
hist(mydata$StdResid, 
     main = "Histogram of standardized residuals",
     ylab = "Frequency", 
     xlab = "Standardized residuals")

From the histogram we can see that there are no standardized residuals outside the interval [-3, 3], so there are no outliers.
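
The visual impression can be confirmed by counting the standardized residuals outside that range:

sum(abs(mydata$StdResid) > 3)   # 0, so no outliers by this criterion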

hist(mydata$CookDist, 
     main = "Histogram of Cook's distances",
     ylab = "Frequency", 
     xlab = "Cook's distances")

head(mydata[order(-mydata$CookDist), c("Age","CookDist")])
##     Age   CookDist
## 497  58 0.02507343
## 435  63 0.01803886
## 557  75 0.01787412
## 308  53 0.01744217
## 323  38 0.01712333
## 315  53 0.01624915
mydata <- mydata[-497,]   # remove the observation in row 497, which has the largest Cook's distance

We remove this high-influence unit and re-estimate both models.

fit1 <- glm(HeartDisease ~ Cholesterol,
               family = binomial,
               data = mydata)



fit2 <- glm(HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + ST_SlopeF,
               family = binomial,
               data = mydata)
anova(fit1, fit2, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: HeartDisease ~ Cholesterol
## Model 2: HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + ST_SlopeF
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       915    1209.67                          
## 2       910     743.15  5   466.51 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We again perform the chi-squared (likelihood-ratio) test to compare the two models:

H0: The simpler model fits the data as well as the more complex one.

H1: The more complex model fits the data better.

We reject the null hypothesis at p < 0.001 and conclude that the more complex model fits better.

summary(fit2)
## 
## Call:
## glm(formula = HeartDisease ~ Age + Cholesterol + MaxHR + Sex_F + 
##     ST_SlopeF, family = binomial, data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7350  -0.5891   0.2740   0.5423   2.5862  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.4360627  1.0072115  -1.426 0.153932    
## Age          0.0335926  0.0107913   3.113 0.001852 ** 
## Cholesterol -0.0039321  0.0009285  -4.235 2.29e-05 ***
## MaxHR       -0.0155250  0.0042756  -3.631 0.000282 ***
## Sex_F2       1.5830055  0.2388527   6.628 3.41e-11 ***
## ST_SlopeF2   2.9203561  0.2062523  14.159  < 2e-16 ***
## ST_SlopeF3   2.2769510  0.3664226   6.214 5.17e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1260.52  on 916  degrees of freedom
## Residual deviance:  743.15  on 910  degrees of freedom
## AIC: 757.15
## 
## Number of Fisher Scoring iterations: 5
exp(cbind(OR = fit2$coefficients, confint.default(fit2)))
##                     OR       2.5 %     97.5 %
## (Intercept)  0.2378625  0.03303588  1.7126392
## Age          1.0341632  1.01251980  1.0562692
## Cholesterol  0.9960756  0.99426449  0.9978900
## MaxHR        0.9845949  0.97637856  0.9928804
## Sex_F2       4.8695694  3.04914987  7.7768253
## ST_SlopeF2  18.5478911 12.38033227 27.7879669
## ST_SlopeF3   9.7469172  4.75299940 19.9878826

If Age increases by 1 year, the odds of heart disease are 1.03 times the initial odds, assuming the remaining explanatory variables do not change (p < 0.01).

If Cholesterol increases by 1 mm/dl, the odds of heart disease are 0.99 times the initial odds, assuming the remaining explanatory variables do not change (p < 0.001).

If MaxHR increases by 1 beat per minute, the odds of heart disease are 0.98 times the initial odds, assuming the remaining explanatory variables do not change (p < 0.001).

These odds ratios are very close to 1, which would suggest almost no effect, yet the p-values show them as significant. This is most likely due to the size of the sample. We have to take this into consideration and understand that whatever effect exists is very small, at least per single unit of the predictor.
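
To make these small per-unit effects more tangible, the odds ratios can be rescaled to a 10-unit change simply by multiplying the coefficients by 10 before exponentiating; a quick sketch:

exp(10 * coef(fit2)[c("Cholesterol", "MaxHR")])   # roughly 0.96 and 0.86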

Given the values of all other variables, the odds of a male having heart disease are 4.87 times the odds of a female having heart disease (p < 0.001); note that Sex_F2 corresponds to males, since females are the reference level.

Given the values of all other variables, the odds of a person with a flat ST-segment slope having heart disease are 18.5 times the odds of a person with an upsloping ST segment (p < 0.001).

Given the values of all other variables, the odds of a person with a downsloping ST segment having heart disease are 9.75 times the odds of a person with an upsloping ST segment (p < 0.001).
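
To translate these odds ratios to the probability scale, we can compare predicted probabilities for a few illustrative profiles; the specific values below (a 55-year-old with cholesterol of 200 and MaxHR of 140) are hypothetical and chosen only for illustration:

profiles <- data.frame(Age = 55, Cholesterol = 200, MaxHR = 140,
                       Sex_F = factor(c("1", "2", "2"), levels = c("1", "2")),
                       ST_SlopeF = factor(c("1", "1", "2"), levels = c("1", "2", "3")))
predict(fit2, newdata = profiles, type = "response")   # female/Up, male/Up, male/Flat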

mydata$EstimatedProbability <- fitted(fit2)
head(mydata)
##   Age Sex Cholesterol MaxHR ST_Slope HeartDisease HeartDisease_F Sex_F
## 1  40   M         289   172       Up            0             No     2
## 2  49   F         180   156     Flat            1            Yes     1
## 3  37   M         283    98       Up            0             No     2
## 4  48   F         214   108     Flat            1            Yes     1
## 5  54   M         195   122       Up            0             No     2
## 6  39   M         339   170       Up            0             No     2
##   ST_SlopeF   StdResid     CookDist EstimatedProbability
## 1         1 -0.4353314 5.941491e-05            0.08980403
## 2         2  1.1702562 1.981517e-03            0.50016368
## 3         1 -0.6944184 7.311626e-04            0.22367656
## 4         2  0.9604217 1.412107e-03            0.64073782
## 5         1 -0.8921112 3.959114e-04            0.33184191
## 6         1 -0.3932120 4.852667e-05            0.07480090

From the estimated probabilities we can see how likely each person is to have heart disease; for example, the first person has about a 9% chance while the second has about a 50% chance.
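
For illustration, the first patient's probability can be reproduced by hand from the coefficients: he is a 40-year-old male (Sex_F2 = 1) with cholesterol of 289, MaxHR of 172 and an upsloping ST segment (both ST_SlopeF dummies are 0), so the fitted probability is the inverse logit of his linear predictor:

b <- coef(fit2)
plogis(b["(Intercept)"] + b["Age"] * 40 + b["Cholesterol"] * 289 +
         b["MaxHR"] * 172 + b["Sex_F2"] * 1)   # about 0.090, as in the table above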

mydata$Classification <- ifelse(test = mydata$EstimatedProbability < 0.5, 
                                 yes = "No",
                                 no = "Yes")

ClassificationTable <- table(mydata$HeartDisease_F, mydata$Classification)

ClassificationTable
##      
##        No Yes
##   No  329  80
##   Yes  77 431

We can see, for example, that 80 people who did not have heart disease were classified as having it.
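
From the same table we can also see how well each group is classified separately; a quick sketch (rows are the actual status, columns the model's classification):

ClassificationTable["Yes", "Yes"] / sum(ClassificationTable["Yes", ])   # 431/508 of actual cases classified correctly
ClassificationTable["No", "No"] / sum(ClassificationTable["No", ])      # 329/409 of actual non-cases classified correctly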

Pseudo_R2_fit2 <- (ClassificationTable[1,1] + ClassificationTable[2,2])/nrow(mydata)
Pseudo_R2_fit2
## [1] 0.8287895

We see that about 82.9 percent of all units were correctly classified.