*** Predicting Heart Disease using data given on 11 different variables***

##Project Objective

Use logistic regression to build a model that will help pinpoint the most relevant risk factors of heart disease as well as predict the overall risk uaing the given dataset

load packages

install.packages('caret') 
install.packages('gains') 
install.packages('rpart') 
install.packages('rpart.plot') 
install.packages('pROC') 
install.packages('boot') 
install.packages('ggplot2') 
install.packages('gains') 
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(gains)
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
library(ggplot2)
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.4.3

###import data

library(readr)
heart <- read_csv("C:/Users/lenla/OneDrive/Desktop/heart.csv")
## Rows: 918 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope
## dbl (7): Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak, HeartDisease
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(heart)
View data structure
str(heart)
## spc_tbl_ [918 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age           : num [1:918] 40 49 37 48 54 39 45 54 37 48 ...
##  $ Sex           : chr [1:918] "M" "F" "M" "F" ...
##  $ ChestPainType : chr [1:918] "ATA" "NAP" "ATA" "ASY" ...
##  $ RestingBP     : num [1:918] 140 160 130 138 150 120 130 110 140 120 ...
##  $ Cholesterol   : num [1:918] 289 180 283 214 195 339 237 208 207 284 ...
##  $ FastingBS     : num [1:918] 0 0 0 0 0 0 0 0 0 0 ...
##  $ RestingECG    : chr [1:918] "Normal" "Normal" "ST" "Normal" ...
##  $ MaxHR         : num [1:918] 172 156 98 108 122 170 170 142 130 120 ...
##  $ ExerciseAngina: chr [1:918] "N" "N" "N" "Y" ...
##  $ Oldpeak       : num [1:918] 0 1 0 1.5 0 0 0 0 1.5 0 ...
##  $ ST_Slope      : chr [1:918] "Up" "Flat" "Up" "Flat" ...
##  $ HeartDisease  : num [1:918] 0 1 0 1 0 0 0 0 1 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Sex = col_character(),
##   ..   ChestPainType = col_character(),
##   ..   RestingBP = col_double(),
##   ..   Cholesterol = col_double(),
##   ..   FastingBS = col_double(),
##   ..   RestingECG = col_character(),
##   ..   MaxHR = col_double(),
##   ..   ExerciseAngina = col_character(),
##   ..   Oldpeak = col_double(),
##   ..   ST_Slope = col_character(),
##   ..   HeartDisease = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
interpretation: this data set reports seven categorical variables which include:Sex, ChestPainType, FastingBS,
RestingECG, ExerciseAngina, ST_Slope,
and the response variable Heart Disease

Data conversion//Cleaning data

str(heart)
## spc_tbl_ [918 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age           : num [1:918] 40 49 37 48 54 39 45 54 37 48 ...
##  $ Sex           : chr [1:918] "M" "F" "M" "F" ...
##  $ ChestPainType : chr [1:918] "ATA" "NAP" "ATA" "ASY" ...
##  $ RestingBP     : num [1:918] 140 160 130 138 150 120 130 110 140 120 ...
##  $ Cholesterol   : num [1:918] 289 180 283 214 195 339 237 208 207 284 ...
##  $ FastingBS     : num [1:918] 0 0 0 0 0 0 0 0 0 0 ...
##  $ RestingECG    : chr [1:918] "Normal" "Normal" "ST" "Normal" ...
##  $ MaxHR         : num [1:918] 172 156 98 108 122 170 170 142 130 120 ...
##  $ ExerciseAngina: chr [1:918] "N" "N" "N" "Y" ...
##  $ Oldpeak       : num [1:918] 0 1 0 1.5 0 0 0 0 1.5 0 ...
##  $ ST_Slope      : chr [1:918] "Up" "Flat" "Up" "Flat" ...
##  $ HeartDisease  : num [1:918] 0 1 0 1 0 0 0 0 1 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Sex = col_character(),
##   ..   ChestPainType = col_character(),
##   ..   RestingBP = col_double(),
##   ..   Cholesterol = col_double(),
##   ..   FastingBS = col_double(),
##   ..   RestingECG = col_character(),
##   ..   MaxHR = col_double(),
##   ..   ExerciseAngina = col_character(),
##   ..   Oldpeak = col_double(),
##   ..   ST_Slope = col_character(),
##   ..   HeartDisease = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(heart)
##       Age            Sex            ChestPainType        RestingBP    
##  Min.   :28.00   Length:918         Length:918         Min.   :  0.0  
##  1st Qu.:47.00   Class :character   Class :character   1st Qu.:120.0  
##  Median :54.00   Mode  :character   Mode  :character   Median :130.0  
##  Mean   :53.51                                         Mean   :132.4  
##  3rd Qu.:60.00                                         3rd Qu.:140.0  
##  Max.   :77.00                                         Max.   :200.0  
##   Cholesterol      FastingBS       RestingECG            MaxHR      
##  Min.   :  0.0   Min.   :0.0000   Length:918         Min.   : 60.0  
##  1st Qu.:173.2   1st Qu.:0.0000   Class :character   1st Qu.:120.0  
##  Median :223.0   Median :0.0000   Mode  :character   Median :138.0  
##  Mean   :198.8   Mean   :0.2331                      Mean   :136.8  
##  3rd Qu.:267.0   3rd Qu.:0.0000                      3rd Qu.:156.0  
##  Max.   :603.0   Max.   :1.0000                      Max.   :202.0  
##  ExerciseAngina        Oldpeak          ST_Slope          HeartDisease   
##  Length:918         Min.   :-2.6000   Length:918         Min.   :0.0000  
##  Class :character   1st Qu.: 0.0000   Class :character   1st Qu.:0.0000  
##  Mode  :character   Median : 0.6000   Mode  :character   Median :1.0000  
##                     Mean   : 0.8874                      Mean   :0.5534  
##                     3rd Qu.: 1.5000                      3rd Qu.:1.0000  
##                     Max.   : 6.2000                      Max.   :1.0000
heart$HeartDisease <- as.factor(heart$HeartDisease) #target variable
heart$Sex <- as.factor(heart$Sex)
heart$ChestPainType <- as.factor(heart$ChestPainType)
heart$FastingBS <-  as.factor(heart$FastingBS)
heart$RestingECG <-  as.factor(heart$RestingECG)
heart$ExerciseAngina <-  as.factor(heart$ExerciseAngina)
heart$ST_Slope <-  as.factor(heart$ST_Slope)
str(heart)
## spc_tbl_ [918 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age           : num [1:918] 40 49 37 48 54 39 45 54 37 48 ...
##  $ Sex           : Factor w/ 2 levels "F","M": 2 1 2 1 2 2 1 2 2 1 ...
##  $ ChestPainType : Factor w/ 4 levels "ASY","ATA","NAP",..: 2 3 2 1 3 3 2 2 1 2 ...
##  $ RestingBP     : num [1:918] 140 160 130 138 150 120 130 110 140 120 ...
##  $ Cholesterol   : num [1:918] 289 180 283 214 195 339 237 208 207 284 ...
##  $ FastingBS     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ RestingECG    : Factor w/ 3 levels "LVH","Normal",..: 2 2 3 2 2 2 2 2 2 2 ...
##  $ MaxHR         : num [1:918] 172 156 98 108 122 170 170 142 130 120 ...
##  $ ExerciseAngina: Factor w/ 2 levels "N","Y": 1 1 1 2 1 1 1 1 2 1 ...
##  $ Oldpeak       : num [1:918] 0 1 0 1.5 0 0 0 0 1.5 0 ...
##  $ ST_Slope      : Factor w/ 3 levels "Down","Flat",..: 3 2 3 2 3 3 3 3 2 3 ...
##  $ HeartDisease  : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 1 1 2 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Sex = col_character(),
##   ..   ChestPainType = col_character(),
##   ..   RestingBP = col_double(),
##   ..   Cholesterol = col_double(),
##   ..   FastingBS = col_double(),
##   ..   RestingECG = col_character(),
##   ..   MaxHR = col_double(),
##   ..   ExerciseAngina = col_character(),
##   ..   Oldpeak = col_double(),
##   ..   ST_Slope = col_character(),
##   ..   HeartDisease = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
Interpretation: In order to fit our model we need to convert all the categorical values into factors

Checking for outliers / Incomplete data

boxplot(heart$Cholesterol,heart$RestingBP,heart$FastingBS,heart$MaxHR,heart$Oldpeak)

When taking a look at the boxplots we can see that the variables RestingBP, and Cholesterol have some values that indicate 0. Since it is unusal for a persons RestingBP, and Cholesterol to be 0, we assume that the
data was never collected or unavailable.
To fix this we will be replacing each 0 that is reported in RestingBP & Cholesterol variables with their corrisponding median value for the respected catagory.

Replacing”0” values in each category

median(heart$Cholesterol) #median = 223
## [1] 223
heart$Cholesterol <- ifelse(heart$Cholesterol == 0, median(heart$Cholesterol), heart$Cholesterol)

median(heart$RestingBP) #median = 223
## [1] 130
heart$RestingBP <- ifelse(heart$RestingBP == 0, median(heart$RestingBP), heart$RestingBP)
summary(heart)
##       Age        Sex     ChestPainType   RestingBP      Cholesterol   
##  Min.   :28.00   F:193   ASY:496       Min.   : 80.0   Min.   : 85.0  
##  1st Qu.:47.00   M:725   ATA:173       1st Qu.:120.0   1st Qu.:214.0  
##  Median :54.00           NAP:203       Median :130.0   Median :223.0  
##  Mean   :53.51           TA : 46       Mean   :132.5   Mean   :240.6  
##  3rd Qu.:60.00                         3rd Qu.:140.0   3rd Qu.:267.0  
##  Max.   :77.00                         Max.   :200.0   Max.   :603.0  
##  FastingBS  RestingECG      MaxHR       ExerciseAngina    Oldpeak       
##  0:704     LVH   :188   Min.   : 60.0   N:547          Min.   :-2.6000  
##  1:214     Normal:552   1st Qu.:120.0   Y:371          1st Qu.: 0.0000  
##            ST    :178   Median :138.0                  Median : 0.6000  
##                         Mean   :136.8                  Mean   : 0.8874  
##                         3rd Qu.:156.0                  3rd Qu.: 1.5000  
##                         Max.   :202.0                  Max.   : 6.2000  
##  ST_Slope   HeartDisease
##  Down: 63   0:410       
##  Flat:460   1:508       
##  Up  :395               
##                         
##                         
## 

MODEL 1: Logestic Regression

Since the response variable is Binary and not contiuous we will be using a Logistic regression model

###Step 1: Splitting the data 70/30 into a training set and a testing set

subset <- sample(nrow(heart), nrow(heart) * 0.7)
heart_train = heart[subset, ]
heart_test = heart[-subset, ]
This will help us predict the accuracy of our model by allowing us to compare how well the model does on the out-of-sample data after getting results from the in-sample prediction.

###Step 2: Identifying signifigant variables (variables with a p-value < 0.05)

##original model all variables
heart_glm0 <- glm(HeartDisease~.,
                   family=binomial, data=heart_train)
summary(heart_glm0)
## 
## Call:
## glm(formula = HeartDisease ~ ., family = binomial, data = heart_train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -2.871356   1.825195  -1.573 0.115678    
## Age               0.023475   0.016070   1.461 0.144057    
## SexM              1.758071   0.342014   5.140 2.74e-07 ***
## ChestPainTypeATA -2.257208   0.413474  -5.459 4.78e-08 ***
## ChestPainTypeNAP -1.520443   0.313443  -4.851 1.23e-06 ***
## ChestPainTypeTA  -1.740906   0.507689  -3.429 0.000606 ***
## RestingBP         0.004335   0.007504   0.578 0.563452    
## Cholesterol       0.003059   0.002508   1.220 0.222582    
## FastingBS1        0.889495   0.316334   2.812 0.004925 ** 
## RestingECGNormal  0.206850   0.312023   0.663 0.507374    
## RestingECGST      0.111332   0.411032   0.271 0.786500    
## MaxHR            -0.004171   0.005794  -0.720 0.471606    
## ExerciseAnginaY   0.790386   0.300265   2.632 0.008481 ** 
## Oldpeak           0.388556   0.132241   2.938 0.003301 ** 
## ST_SlopeFlat      0.757814   0.575357   1.317 0.187799    
## ST_SlopeUp       -1.600445   0.588923  -2.718 0.006576 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 880.01  on 641  degrees of freedom
## Residual deviance: 424.29  on 626  degrees of freedom
## AIC: 456.29
## 
## Number of Fisher Scoring iterations: 5

###Look at the sig p values (less than 0.05)

summary(heart_glm0)
## 
## Call:
## glm(formula = HeartDisease ~ ., family = binomial, data = heart_train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -2.871356   1.825195  -1.573 0.115678    
## Age               0.023475   0.016070   1.461 0.144057    
## SexM              1.758071   0.342014   5.140 2.74e-07 ***
## ChestPainTypeATA -2.257208   0.413474  -5.459 4.78e-08 ***
## ChestPainTypeNAP -1.520443   0.313443  -4.851 1.23e-06 ***
## ChestPainTypeTA  -1.740906   0.507689  -3.429 0.000606 ***
## RestingBP         0.004335   0.007504   0.578 0.563452    
## Cholesterol       0.003059   0.002508   1.220 0.222582    
## FastingBS1        0.889495   0.316334   2.812 0.004925 ** 
## RestingECGNormal  0.206850   0.312023   0.663 0.507374    
## RestingECGST      0.111332   0.411032   0.271 0.786500    
## MaxHR            -0.004171   0.005794  -0.720 0.471606    
## ExerciseAnginaY   0.790386   0.300265   2.632 0.008481 ** 
## Oldpeak           0.388556   0.132241   2.938 0.003301 ** 
## ST_SlopeFlat      0.757814   0.575357   1.317 0.187799    
## ST_SlopeUp       -1.600445   0.588923  -2.718 0.006576 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 880.01  on 641  degrees of freedom
## Residual deviance: 424.29  on 626  degrees of freedom
## AIC: 456.29
## 
## Number of Fisher Scoring iterations: 5
#backward stepwise regression 
heart_glmst = step(heart_glm0, direction = "backward")
## Start:  AIC=456.29
## HeartDisease ~ Age + Sex + ChestPainType + RestingBP + Cholesterol + 
##     FastingBS + RestingECG + MaxHR + ExerciseAngina + Oldpeak + 
##     ST_Slope
## 
##                  Df Deviance    AIC
## - RestingECG      2   424.74 452.74
## - RestingBP       1   424.63 454.63
## - MaxHR           1   424.81 454.81
## - Cholesterol     1   425.84 455.84
## <none>                424.29 456.29
## - Age             1   426.45 456.45
## - ExerciseAngina  1   431.20 461.20
## - FastingBS       1   432.54 462.54
## - Oldpeak         1   433.20 463.20
## - Sex             1   453.68 483.68
## - ChestPainType   3   472.83 498.83
## - ST_Slope        2   504.37 532.37
## 
## Step:  AIC=452.74
## HeartDisease ~ Age + Sex + ChestPainType + RestingBP + Cholesterol + 
##     FastingBS + MaxHR + ExerciseAngina + Oldpeak + ST_Slope
## 
##                  Df Deviance    AIC
## - RestingBP       1   425.09 451.09
## - MaxHR           1   425.45 451.45
## - Cholesterol     1   426.15 452.15
## - Age             1   426.55 452.55
## <none>                424.74 452.74
## - ExerciseAngina  1   431.80 457.80
## - FastingBS       1   433.21 459.21
## - Oldpeak         1   433.53 459.53
## - Sex             1   454.18 480.18
## - ChestPainType   3   472.85 494.85
## - ST_Slope        2   505.36 529.36
## 
## Step:  AIC=451.09
## HeartDisease ~ Age + Sex + ChestPainType + Cholesterol + FastingBS + 
##     MaxHR + ExerciseAngina + Oldpeak + ST_Slope
## 
##                  Df Deviance    AIC
## - MaxHR           1   425.80 449.80
## - Cholesterol     1   426.69 450.69
## <none>                425.09 451.09
## - Age             1   427.31 451.31
## - ExerciseAngina  1   432.93 456.93
## - FastingBS       1   433.67 457.67
## - Oldpeak         1   434.21 458.21
## - Sex             1   454.43 478.43
## - ChestPainType   3   472.94 492.94
## - ST_Slope        2   505.70 527.70
## 
## Step:  AIC=449.8
## HeartDisease ~ Age + Sex + ChestPainType + Cholesterol + FastingBS + 
##     ExerciseAngina + Oldpeak + ST_Slope
## 
##                  Df Deviance    AIC
## - Cholesterol     1   427.26 449.26
## <none>                425.80 449.80
## - Age             1   429.49 451.49
## - FastingBS       1   434.29 456.29
## - Oldpeak         1   434.41 456.41
## - ExerciseAngina  1   434.82 456.82
## - Sex             1   455.89 477.89
## - ChestPainType   3   477.55 495.55
## - ST_Slope        2   513.58 533.58
## 
## Step:  AIC=449.26
## HeartDisease ~ Age + Sex + ChestPainType + FastingBS + ExerciseAngina + 
##     Oldpeak + ST_Slope
## 
##                  Df Deviance    AIC
## <none>                427.26 449.26
## - Age             1   430.62 450.62
## - FastingBS       1   435.80 455.80
## - Oldpeak         1   435.80 455.80
## - ExerciseAngina  1   436.95 456.95
## - Sex             1   456.13 476.13
## - ChestPainType   3   479.75 495.75
## - ST_Slope        2   517.33 535.33
##gets new model w sig variables 
AIC(heart_glmst)
## [1] 449.2576

step 3 fit new model with only significant variables

heart_glm1 <- glm(HeartDisease ~ Age + Sex + ChestPainType + FastingBS + MaxHR + 
                    ExerciseAngina + Oldpeak + ST_Slope,
                  family = binomial, 
                  data = heart_train)

summary(heart_glm1)
## 
## Call:
## glm(formula = HeartDisease ~ Age + Sex + ChestPainType + FastingBS + 
##     MaxHR + ExerciseAngina + Oldpeak + ST_Slope, family = binomial, 
##     data = heart_train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.378834   1.465526  -0.941 0.346784    
## Age               0.021728   0.015189   1.430 0.152586    
## SexM              1.691416   0.334776   5.052 4.36e-07 ***
## ChestPainTypeATA -2.192541   0.406110  -5.399 6.71e-08 ***
## ChestPainTypeNAP -1.524821   0.309447  -4.928 8.33e-07 ***
## ChestPainTypeTA  -1.740553   0.501698  -3.469 0.000522 ***
## FastingBS1        0.904766   0.314853   2.874 0.004058 ** 
## MaxHR            -0.004193   0.005548  -0.756 0.449788    
## ExerciseAnginaY   0.862264   0.293500   2.938 0.003305 ** 
## Oldpeak           0.388281   0.131774   2.947 0.003213 ** 
## ST_SlopeFlat      0.833998   0.575136   1.450 0.147034    
## ST_SlopeUp       -1.550482   0.587840  -2.638 0.008350 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 880.01  on 641  degrees of freedom
## Residual deviance: 426.69  on 630  degrees of freedom
## AIC: 450.69
## 
## Number of Fisher Scoring iterations: 5
Compare each models AIC lower = better fitting model
AIC(heart_glm0)
## [1] 456.2915
AIC(heart_glm1)
## [1] 450.6866
AIC of the new model proves to be a better fit with AIC being 
###Step 4: Validation 
Asymmetric cost - due to the cost of having FN being more critical than FP in 
predicting Heart disease we put the cost of diagnosing a FN higher than diagnosing a paitent with a false positive 
cost2 <- function(r, pi, pcut = 1/(5+1)) {  # Default threshold
  weight1 <- 5  # FN penalty
  weight0 <- 1  # FP penalty
  c1 <- (r == 1) & (pi < pcut)  # False negatives
  c0 <- (r == 0) & (pi > pcut)  # False positives
  mean(weight1 * c1 + weight0 * c0)
}
k- 10 fold validataion:
library(boot)
heart_K102 <- glm(HeartDisease ~ Age + Sex + ChestPainType + FastingBS + MaxHR + 
                    ExerciseAngina + Oldpeak + ST_Slope,
                  family = binomial, 
                  data = heart_train)
cv_result2 <- cv.glm(
  data = heart_train,
  glmfit = heart_K102,
  cost = cost2,
  K = 10
)
cv_result2$delta[2]
## [1] 0.2481245
AUC Out of sample
pred_glm0_test<- predict(heart_glm0,
                         newdata = heart_test, type="response")
  pred1 <- prediction(pred_glm0_test, heart_test$HeartDisease)
  perf1 <- performance(pred1, "tpr", "fpr")
  plot(perf1, colorize=TRUE)

  unlist(slot(performance(pred1, "auc"), "y.values"))
## [1] 0.9221115
table(heart_test$HeartDisease, (pred_glm0_test > 1/(5+1))*1,
      dnn = c("Truth", "Predicted"))
##      Predicted
## Truth   0   1
##     0  78  51
##     1   4 143

###interpretation of this model

ROC curve has a good spread. This is a sign of a good model and its ability to predict whether or not a patient is going to end up with some sort of heart disease and eventually heat failure. Our model is further supported when taking a look at the AUC. The AUC being closer to 1 means that the model fitted is good at predicting whether or not a patient will develop heart disease. we can see that this model was able to correctly predict 80 normal cases and 152 sick cases accurately. The overall accuracy of this model is 84.1% correctly being able to identify 232/276 cases. When predicting whether or not a paitent has heart disease the model was 97.1% accurate. Our model lacks when it comes to identifying healthy paitents with a 33.3% inaccuratecy.