*** Predicting Heart Disease using data given on 11 different variables***
##Project Objective
Use logistic regression to build a model that will help pinpoint the most relevant risk factors of heart disease as well as predict the overall risk uaing the given dataset
install.packages('caret')
install.packages('gains')
install.packages('rpart')
install.packages('rpart.plot')
install.packages('pROC')
install.packages('boot')
install.packages('ggplot2')
install.packages('gains')
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(gains)
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(ggplot2)
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.4.3
###import data
library(readr)
heart <- read_csv("C:/Users/lenla/OneDrive/Desktop/heart.csv")
## Rows: 918 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope
## dbl (7): Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak, HeartDisease
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(heart)
View data structure
str(heart)
## spc_tbl_ [918 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:918] 40 49 37 48 54 39 45 54 37 48 ...
## $ Sex : chr [1:918] "M" "F" "M" "F" ...
## $ ChestPainType : chr [1:918] "ATA" "NAP" "ATA" "ASY" ...
## $ RestingBP : num [1:918] 140 160 130 138 150 120 130 110 140 120 ...
## $ Cholesterol : num [1:918] 289 180 283 214 195 339 237 208 207 284 ...
## $ FastingBS : num [1:918] 0 0 0 0 0 0 0 0 0 0 ...
## $ RestingECG : chr [1:918] "Normal" "Normal" "ST" "Normal" ...
## $ MaxHR : num [1:918] 172 156 98 108 122 170 170 142 130 120 ...
## $ ExerciseAngina: chr [1:918] "N" "N" "N" "Y" ...
## $ Oldpeak : num [1:918] 0 1 0 1.5 0 0 0 0 1.5 0 ...
## $ ST_Slope : chr [1:918] "Up" "Flat" "Up" "Flat" ...
## $ HeartDisease : num [1:918] 0 1 0 1 0 0 0 0 1 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Sex = col_character(),
## .. ChestPainType = col_character(),
## .. RestingBP = col_double(),
## .. Cholesterol = col_double(),
## .. FastingBS = col_double(),
## .. RestingECG = col_character(),
## .. MaxHR = col_double(),
## .. ExerciseAngina = col_character(),
## .. Oldpeak = col_double(),
## .. ST_Slope = col_character(),
## .. HeartDisease = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
interpretation: this data set reports seven categorical variables which include:Sex, ChestPainType, FastingBS,
RestingECG, ExerciseAngina, ST_Slope,
and the response variable Heart Disease
str(heart)
## spc_tbl_ [918 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:918] 40 49 37 48 54 39 45 54 37 48 ...
## $ Sex : chr [1:918] "M" "F" "M" "F" ...
## $ ChestPainType : chr [1:918] "ATA" "NAP" "ATA" "ASY" ...
## $ RestingBP : num [1:918] 140 160 130 138 150 120 130 110 140 120 ...
## $ Cholesterol : num [1:918] 289 180 283 214 195 339 237 208 207 284 ...
## $ FastingBS : num [1:918] 0 0 0 0 0 0 0 0 0 0 ...
## $ RestingECG : chr [1:918] "Normal" "Normal" "ST" "Normal" ...
## $ MaxHR : num [1:918] 172 156 98 108 122 170 170 142 130 120 ...
## $ ExerciseAngina: chr [1:918] "N" "N" "N" "Y" ...
## $ Oldpeak : num [1:918] 0 1 0 1.5 0 0 0 0 1.5 0 ...
## $ ST_Slope : chr [1:918] "Up" "Flat" "Up" "Flat" ...
## $ HeartDisease : num [1:918] 0 1 0 1 0 0 0 0 1 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Sex = col_character(),
## .. ChestPainType = col_character(),
## .. RestingBP = col_double(),
## .. Cholesterol = col_double(),
## .. FastingBS = col_double(),
## .. RestingECG = col_character(),
## .. MaxHR = col_double(),
## .. ExerciseAngina = col_character(),
## .. Oldpeak = col_double(),
## .. ST_Slope = col_character(),
## .. HeartDisease = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(heart)
## Age Sex ChestPainType RestingBP
## Min. :28.00 Length:918 Length:918 Min. : 0.0
## 1st Qu.:47.00 Class :character Class :character 1st Qu.:120.0
## Median :54.00 Mode :character Mode :character Median :130.0
## Mean :53.51 Mean :132.4
## 3rd Qu.:60.00 3rd Qu.:140.0
## Max. :77.00 Max. :200.0
## Cholesterol FastingBS RestingECG MaxHR
## Min. : 0.0 Min. :0.0000 Length:918 Min. : 60.0
## 1st Qu.:173.2 1st Qu.:0.0000 Class :character 1st Qu.:120.0
## Median :223.0 Median :0.0000 Mode :character Median :138.0
## Mean :198.8 Mean :0.2331 Mean :136.8
## 3rd Qu.:267.0 3rd Qu.:0.0000 3rd Qu.:156.0
## Max. :603.0 Max. :1.0000 Max. :202.0
## ExerciseAngina Oldpeak ST_Slope HeartDisease
## Length:918 Min. :-2.6000 Length:918 Min. :0.0000
## Class :character 1st Qu.: 0.0000 Class :character 1st Qu.:0.0000
## Mode :character Median : 0.6000 Mode :character Median :1.0000
## Mean : 0.8874 Mean :0.5534
## 3rd Qu.: 1.5000 3rd Qu.:1.0000
## Max. : 6.2000 Max. :1.0000
heart$HeartDisease <- as.factor(heart$HeartDisease) #target variable
heart$Sex <- as.factor(heart$Sex)
heart$ChestPainType <- as.factor(heart$ChestPainType)
heart$FastingBS <- as.factor(heart$FastingBS)
heart$RestingECG <- as.factor(heart$RestingECG)
heart$ExerciseAngina <- as.factor(heart$ExerciseAngina)
heart$ST_Slope <- as.factor(heart$ST_Slope)
str(heart)
## spc_tbl_ [918 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:918] 40 49 37 48 54 39 45 54 37 48 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 1 2 1 2 2 1 2 2 1 ...
## $ ChestPainType : Factor w/ 4 levels "ASY","ATA","NAP",..: 2 3 2 1 3 3 2 2 1 2 ...
## $ RestingBP : num [1:918] 140 160 130 138 150 120 130 110 140 120 ...
## $ Cholesterol : num [1:918] 289 180 283 214 195 339 237 208 207 284 ...
## $ FastingBS : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ RestingECG : Factor w/ 3 levels "LVH","Normal",..: 2 2 3 2 2 2 2 2 2 2 ...
## $ MaxHR : num [1:918] 172 156 98 108 122 170 170 142 130 120 ...
## $ ExerciseAngina: Factor w/ 2 levels "N","Y": 1 1 1 2 1 1 1 1 2 1 ...
## $ Oldpeak : num [1:918] 0 1 0 1.5 0 0 0 0 1.5 0 ...
## $ ST_Slope : Factor w/ 3 levels "Down","Flat",..: 3 2 3 2 3 3 3 3 2 3 ...
## $ HeartDisease : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 1 1 2 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Sex = col_character(),
## .. ChestPainType = col_character(),
## .. RestingBP = col_double(),
## .. Cholesterol = col_double(),
## .. FastingBS = col_double(),
## .. RestingECG = col_character(),
## .. MaxHR = col_double(),
## .. ExerciseAngina = col_character(),
## .. Oldpeak = col_double(),
## .. ST_Slope = col_character(),
## .. HeartDisease = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Interpretation: In order to fit our model we need to convert all the categorical values into factors
boxplot(heart$Cholesterol,heart$RestingBP,heart$FastingBS,heart$MaxHR,heart$Oldpeak)
When taking a look at the boxplots we can see that the variables RestingBP, and Cholesterol have some values that indicate 0. Since it is unusal for a persons RestingBP, and Cholesterol to be 0, we assume that the
data was never collected or unavailable.
To fix this we will be replacing each 0 that is reported in RestingBP & Cholesterol variables with their corrisponding median value for the respected catagory.
median(heart$Cholesterol) #median = 223
## [1] 223
heart$Cholesterol <- ifelse(heart$Cholesterol == 0, median(heart$Cholesterol), heart$Cholesterol)
median(heart$RestingBP) #median = 223
## [1] 130
heart$RestingBP <- ifelse(heart$RestingBP == 0, median(heart$RestingBP), heart$RestingBP)
summary(heart)
## Age Sex ChestPainType RestingBP Cholesterol
## Min. :28.00 F:193 ASY:496 Min. : 80.0 Min. : 85.0
## 1st Qu.:47.00 M:725 ATA:173 1st Qu.:120.0 1st Qu.:214.0
## Median :54.00 NAP:203 Median :130.0 Median :223.0
## Mean :53.51 TA : 46 Mean :132.5 Mean :240.6
## 3rd Qu.:60.00 3rd Qu.:140.0 3rd Qu.:267.0
## Max. :77.00 Max. :200.0 Max. :603.0
## FastingBS RestingECG MaxHR ExerciseAngina Oldpeak
## 0:704 LVH :188 Min. : 60.0 N:547 Min. :-2.6000
## 1:214 Normal:552 1st Qu.:120.0 Y:371 1st Qu.: 0.0000
## ST :178 Median :138.0 Median : 0.6000
## Mean :136.8 Mean : 0.8874
## 3rd Qu.:156.0 3rd Qu.: 1.5000
## Max. :202.0 Max. : 6.2000
## ST_Slope HeartDisease
## Down: 63 0:410
## Flat:460 1:508
## Up :395
##
##
##
Since the response variable is Binary and not contiuous we will be using a Logistic regression model
###Step 1: Splitting the data 70/30 into a training set and a testing set
subset <- sample(nrow(heart), nrow(heart) * 0.7)
heart_train = heart[subset, ]
heart_test = heart[-subset, ]
This will help us predict the accuracy of our model by allowing us to compare how well the model does on the out-of-sample data after getting results from the in-sample prediction.
###Step 2: Identifying signifigant variables (variables with a p-value < 0.05)
##original model all variables
heart_glm0 <- glm(HeartDisease~.,
family=binomial, data=heart_train)
summary(heart_glm0)
##
## Call:
## glm(formula = HeartDisease ~ ., family = binomial, data = heart_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.871356 1.825195 -1.573 0.115678
## Age 0.023475 0.016070 1.461 0.144057
## SexM 1.758071 0.342014 5.140 2.74e-07 ***
## ChestPainTypeATA -2.257208 0.413474 -5.459 4.78e-08 ***
## ChestPainTypeNAP -1.520443 0.313443 -4.851 1.23e-06 ***
## ChestPainTypeTA -1.740906 0.507689 -3.429 0.000606 ***
## RestingBP 0.004335 0.007504 0.578 0.563452
## Cholesterol 0.003059 0.002508 1.220 0.222582
## FastingBS1 0.889495 0.316334 2.812 0.004925 **
## RestingECGNormal 0.206850 0.312023 0.663 0.507374
## RestingECGST 0.111332 0.411032 0.271 0.786500
## MaxHR -0.004171 0.005794 -0.720 0.471606
## ExerciseAnginaY 0.790386 0.300265 2.632 0.008481 **
## Oldpeak 0.388556 0.132241 2.938 0.003301 **
## ST_SlopeFlat 0.757814 0.575357 1.317 0.187799
## ST_SlopeUp -1.600445 0.588923 -2.718 0.006576 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 880.01 on 641 degrees of freedom
## Residual deviance: 424.29 on 626 degrees of freedom
## AIC: 456.29
##
## Number of Fisher Scoring iterations: 5
###Look at the sig p values (less than 0.05)
summary(heart_glm0)
##
## Call:
## glm(formula = HeartDisease ~ ., family = binomial, data = heart_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.871356 1.825195 -1.573 0.115678
## Age 0.023475 0.016070 1.461 0.144057
## SexM 1.758071 0.342014 5.140 2.74e-07 ***
## ChestPainTypeATA -2.257208 0.413474 -5.459 4.78e-08 ***
## ChestPainTypeNAP -1.520443 0.313443 -4.851 1.23e-06 ***
## ChestPainTypeTA -1.740906 0.507689 -3.429 0.000606 ***
## RestingBP 0.004335 0.007504 0.578 0.563452
## Cholesterol 0.003059 0.002508 1.220 0.222582
## FastingBS1 0.889495 0.316334 2.812 0.004925 **
## RestingECGNormal 0.206850 0.312023 0.663 0.507374
## RestingECGST 0.111332 0.411032 0.271 0.786500
## MaxHR -0.004171 0.005794 -0.720 0.471606
## ExerciseAnginaY 0.790386 0.300265 2.632 0.008481 **
## Oldpeak 0.388556 0.132241 2.938 0.003301 **
## ST_SlopeFlat 0.757814 0.575357 1.317 0.187799
## ST_SlopeUp -1.600445 0.588923 -2.718 0.006576 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 880.01 on 641 degrees of freedom
## Residual deviance: 424.29 on 626 degrees of freedom
## AIC: 456.29
##
## Number of Fisher Scoring iterations: 5
#backward stepwise regression
heart_glmst = step(heart_glm0, direction = "backward")
## Start: AIC=456.29
## HeartDisease ~ Age + Sex + ChestPainType + RestingBP + Cholesterol +
## FastingBS + RestingECG + MaxHR + ExerciseAngina + Oldpeak +
## ST_Slope
##
## Df Deviance AIC
## - RestingECG 2 424.74 452.74
## - RestingBP 1 424.63 454.63
## - MaxHR 1 424.81 454.81
## - Cholesterol 1 425.84 455.84
## <none> 424.29 456.29
## - Age 1 426.45 456.45
## - ExerciseAngina 1 431.20 461.20
## - FastingBS 1 432.54 462.54
## - Oldpeak 1 433.20 463.20
## - Sex 1 453.68 483.68
## - ChestPainType 3 472.83 498.83
## - ST_Slope 2 504.37 532.37
##
## Step: AIC=452.74
## HeartDisease ~ Age + Sex + ChestPainType + RestingBP + Cholesterol +
## FastingBS + MaxHR + ExerciseAngina + Oldpeak + ST_Slope
##
## Df Deviance AIC
## - RestingBP 1 425.09 451.09
## - MaxHR 1 425.45 451.45
## - Cholesterol 1 426.15 452.15
## - Age 1 426.55 452.55
## <none> 424.74 452.74
## - ExerciseAngina 1 431.80 457.80
## - FastingBS 1 433.21 459.21
## - Oldpeak 1 433.53 459.53
## - Sex 1 454.18 480.18
## - ChestPainType 3 472.85 494.85
## - ST_Slope 2 505.36 529.36
##
## Step: AIC=451.09
## HeartDisease ~ Age + Sex + ChestPainType + Cholesterol + FastingBS +
## MaxHR + ExerciseAngina + Oldpeak + ST_Slope
##
## Df Deviance AIC
## - MaxHR 1 425.80 449.80
## - Cholesterol 1 426.69 450.69
## <none> 425.09 451.09
## - Age 1 427.31 451.31
## - ExerciseAngina 1 432.93 456.93
## - FastingBS 1 433.67 457.67
## - Oldpeak 1 434.21 458.21
## - Sex 1 454.43 478.43
## - ChestPainType 3 472.94 492.94
## - ST_Slope 2 505.70 527.70
##
## Step: AIC=449.8
## HeartDisease ~ Age + Sex + ChestPainType + Cholesterol + FastingBS +
## ExerciseAngina + Oldpeak + ST_Slope
##
## Df Deviance AIC
## - Cholesterol 1 427.26 449.26
## <none> 425.80 449.80
## - Age 1 429.49 451.49
## - FastingBS 1 434.29 456.29
## - Oldpeak 1 434.41 456.41
## - ExerciseAngina 1 434.82 456.82
## - Sex 1 455.89 477.89
## - ChestPainType 3 477.55 495.55
## - ST_Slope 2 513.58 533.58
##
## Step: AIC=449.26
## HeartDisease ~ Age + Sex + ChestPainType + FastingBS + ExerciseAngina +
## Oldpeak + ST_Slope
##
## Df Deviance AIC
## <none> 427.26 449.26
## - Age 1 430.62 450.62
## - FastingBS 1 435.80 455.80
## - Oldpeak 1 435.80 455.80
## - ExerciseAngina 1 436.95 456.95
## - Sex 1 456.13 476.13
## - ChestPainType 3 479.75 495.75
## - ST_Slope 2 517.33 535.33
##gets new model w sig variables
AIC(heart_glmst)
## [1] 449.2576
heart_glm1 <- glm(HeartDisease ~ Age + Sex + ChestPainType + FastingBS + MaxHR +
ExerciseAngina + Oldpeak + ST_Slope,
family = binomial,
data = heart_train)
summary(heart_glm1)
##
## Call:
## glm(formula = HeartDisease ~ Age + Sex + ChestPainType + FastingBS +
## MaxHR + ExerciseAngina + Oldpeak + ST_Slope, family = binomial,
## data = heart_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.378834 1.465526 -0.941 0.346784
## Age 0.021728 0.015189 1.430 0.152586
## SexM 1.691416 0.334776 5.052 4.36e-07 ***
## ChestPainTypeATA -2.192541 0.406110 -5.399 6.71e-08 ***
## ChestPainTypeNAP -1.524821 0.309447 -4.928 8.33e-07 ***
## ChestPainTypeTA -1.740553 0.501698 -3.469 0.000522 ***
## FastingBS1 0.904766 0.314853 2.874 0.004058 **
## MaxHR -0.004193 0.005548 -0.756 0.449788
## ExerciseAnginaY 0.862264 0.293500 2.938 0.003305 **
## Oldpeak 0.388281 0.131774 2.947 0.003213 **
## ST_SlopeFlat 0.833998 0.575136 1.450 0.147034
## ST_SlopeUp -1.550482 0.587840 -2.638 0.008350 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 880.01 on 641 degrees of freedom
## Residual deviance: 426.69 on 630 degrees of freedom
## AIC: 450.69
##
## Number of Fisher Scoring iterations: 5
Compare each models AIC lower = better fitting model
AIC(heart_glm0)
## [1] 456.2915
AIC(heart_glm1)
## [1] 450.6866
AIC of the new model proves to be a better fit with AIC being
###Step 4: Validation
Asymmetric cost - due to the cost of having FN being more critical than FP in
predicting Heart disease we put the cost of diagnosing a FN higher than diagnosing a paitent with a false positive
cost2 <- function(r, pi, pcut = 1/(5+1)) { # Default threshold
weight1 <- 5 # FN penalty
weight0 <- 1 # FP penalty
c1 <- (r == 1) & (pi < pcut) # False negatives
c0 <- (r == 0) & (pi > pcut) # False positives
mean(weight1 * c1 + weight0 * c0)
}
k- 10 fold validataion:
library(boot)
heart_K102 <- glm(HeartDisease ~ Age + Sex + ChestPainType + FastingBS + MaxHR +
ExerciseAngina + Oldpeak + ST_Slope,
family = binomial,
data = heart_train)
cv_result2 <- cv.glm(
data = heart_train,
glmfit = heart_K102,
cost = cost2,
K = 10
)
cv_result2$delta[2]
## [1] 0.2481245
AUC Out of sample
pred_glm0_test<- predict(heart_glm0,
newdata = heart_test, type="response")
pred1 <- prediction(pred_glm0_test, heart_test$HeartDisease)
perf1 <- performance(pred1, "tpr", "fpr")
plot(perf1, colorize=TRUE)
unlist(slot(performance(pred1, "auc"), "y.values"))
## [1] 0.9221115
table(heart_test$HeartDisease, (pred_glm0_test > 1/(5+1))*1,
dnn = c("Truth", "Predicted"))
## Predicted
## Truth 0 1
## 0 78 51
## 1 4 143
###interpretation of this model
ROC curve has a good spread. This is a sign of a good model and its ability to predict whether or not a patient is going to end up with some sort of heart disease and eventually heat failure. Our model is further supported when taking a look at the AUC. The AUC being closer to 1 means that the model fitted is good at predicting whether or not a patient will develop heart disease. we can see that this model was able to correctly predict 80 normal cases and 152 sick cases accurately. The overall accuracy of this model is 84.1% correctly being able to identify 232/276 cases. When predicting whether or not a paitent has heart disease the model was 97.1% accurate. Our model lacks when it comes to identifying healthy paitents with a 33.3% inaccuratecy.