This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
### Required Packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(readxl)
## Warning: package 'readxl' was built under R version 3.5.2
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(LogicReg)
## Warning: package 'LogicReg' was built under R version 3.5.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.5.2
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
##Step 1. Scale or normalize your data. Make sure to apply imputation if needed
### Import the data in to R
Heartattack <- read.csv("/Users/anirudhgurnani/Desktop/heart.csv", stringsAsFactors = FALSE)
glimpse(Heartattack)
## Observations: 1,025
## Variables: 14
## $ age <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51,…
## $ sex <int> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0…
## $ cp <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1…
## $ trestbps <int> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, …
## $ chol <int> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, …
## $ fbs <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ restecg <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1…
## $ thalach <int> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, …
## $ exang <int> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0…
## $ oldpeak <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, …
## $ slope <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2…
## $ ca <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0…
## $ thal <int> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2…
## $ target <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1…
summary(Heartattack)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.0000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :1.0000 Median :130.0
## Mean :54.43 Mean :0.6956 Mean :0.9424 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.0000 Max. :200.0
## chol fbs restecg thalach
## Min. :126 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132.0
## Median :240 Median :0.0000 Median :1.0000 Median :152.0
## Mean :246 Mean :0.1493 Mean :0.5298 Mean :149.1
## 3rd Qu.:275 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.000 Median :0.0000
## Mean :0.3366 Mean :1.072 Mean :1.385 Mean :0.7541
## 3rd Qu.:1.0000 3rd Qu.:1.800 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.324 Mean :0.5132
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
library(caret)
preproc1 <- preProcess(Heartattack[,c(1:14)], method=c("center", "scale"))
norm1 <- predict(preproc1, Heartattack[,c(1:14)])
summary(norm1)
## age sex cp trestbps
## Min. :-2.8035 Min. :-1.5110 Min. :-0.9153 Min. :-2.14719
## 1st Qu.:-0.7092 1st Qu.:-1.5110 1st Qu.:-0.9153 1st Qu.:-0.66289
## Median : 0.1726 Median : 0.6612 Median : 0.0559 Median :-0.09201
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.7237 3rd Qu.: 0.6612 3rd Qu.: 1.0271 3rd Qu.: 0.47887
## Max. : 2.4873 Max. : 0.6612 Max. : 1.9983 Max. : 3.90417
## chol fbs restecg thalach
## Min. :-2.3259 Min. :-0.4187 Min. :-1.0036 Min. :-3.3954
## 1st Qu.:-0.6784 1st Qu.:-0.4187 1st Qu.:-1.0036 1st Qu.:-0.7439
## Median :-0.1163 Median :-0.4187 Median : 0.8908 Median : 0.1254
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5621 3rd Qu.:-0.4187 3rd Qu.: 0.8908 3rd Qu.: 0.7340
## Max. : 6.1637 Max. : 2.3862 Max. : 2.7852 Max. : 2.2988
## exang oldpeak slope ca
## Min. :-0.7119 Min. :-0.9119 Min. :-2.2426 Min. :-0.7316
## 1st Qu.:-0.7119 1st Qu.:-0.9119 1st Qu.:-0.6238 1st Qu.:-0.7316
## Median :-0.7119 Median :-0.2311 Median :-0.6238 Median :-0.7316
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.4032 3rd Qu.: 0.6200 3rd Qu.: 0.9949 3rd Qu.: 0.2385
## Max. : 1.4032 Max. : 4.3645 Max. : 0.9949 Max. : 3.1489
## thal target
## Min. :-3.7442 Min. :-1.0262
## 1st Qu.:-0.5219 1st Qu.:-1.0262
## Median :-0.5219 Median : 0.9735
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.0893 3rd Qu.: 0.9735
## Max. : 1.0893 Max. : 0.9735
Heartattack_scaled <- as.data.frame(scale(Heartattack[,c(1:14)]))
summary(Heartattack_scaled)
## age sex cp trestbps
## Min. :-2.8035 Min. :-1.5110 Min. :-0.9153 Min. :-2.14719
## 1st Qu.:-0.7092 1st Qu.:-1.5110 1st Qu.:-0.9153 1st Qu.:-0.66289
## Median : 0.1726 Median : 0.6612 Median : 0.0559 Median :-0.09201
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.7237 3rd Qu.: 0.6612 3rd Qu.: 1.0271 3rd Qu.: 0.47887
## Max. : 2.4873 Max. : 0.6612 Max. : 1.9983 Max. : 3.90417
## chol fbs restecg thalach
## Min. :-2.3259 Min. :-0.4187 Min. :-1.0036 Min. :-3.3954
## 1st Qu.:-0.6784 1st Qu.:-0.4187 1st Qu.:-1.0036 1st Qu.:-0.7439
## Median :-0.1163 Median :-0.4187 Median : 0.8908 Median : 0.1254
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5621 3rd Qu.:-0.4187 3rd Qu.: 0.8908 3rd Qu.: 0.7340
## Max. : 6.1637 Max. : 2.3862 Max. : 2.7852 Max. : 2.2988
## exang oldpeak slope ca
## Min. :-0.7119 Min. :-0.9119 Min. :-2.2426 Min. :-0.7316
## 1st Qu.:-0.7119 1st Qu.:-0.9119 1st Qu.:-0.6238 1st Qu.:-0.7316
## Median :-0.7119 Median :-0.2311 Median :-0.6238 Median :-0.7316
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.4032 3rd Qu.: 0.6200 3rd Qu.: 0.9949 3rd Qu.: 0.2385
## Max. : 1.4032 Max. : 4.3645 Max. : 0.9949 Max. : 3.1489
## thal target
## Min. :-3.7442 Min. :-1.0262
## 1st Qu.:-0.5219 1st Qu.:-1.0262
## Median :-0.5219 Median : 0.9735
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.0893 3rd Qu.: 0.9735
## Max. : 1.0893 Max. : 0.9735
Heartattack_norm <- as.data.frame(apply(Heartattack[, 1:14], 2, function(x) (x - min(x))/(max(x)-min(x))))
str(Heartattack_norm)
## 'data.frame': 1025 obs. of 14 variables:
## $ age : num 0.479 0.5 0.854 0.667 0.688 ...
## $ sex : num 1 1 1 1 0 0 1 1 1 1 ...
## $ cp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ trestbps: num 0.292 0.434 0.481 0.509 0.415 ...
## $ chol : num 0.196 0.176 0.11 0.176 0.384 ...
## $ fbs : num 0 1 0 0 1 0 0 0 0 0 ...
## $ restecg : num 0.5 0 0.5 0.5 0.5 0 1 0 0 0 ...
## $ thalach : num 0.74 0.641 0.412 0.687 0.267 ...
## $ exang : num 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 0.161 0.5 0.419 0 0.306 ...
## $ slope : num 1 0 0 1 0.5 0.5 0 0.5 1 0.5 ...
## $ ca : num 0.5 0 0 0.25 0.75 0 0.75 0.25 0 0.5 ...
## $ thal : num 1 1 1 1 0.667 ...
## $ target : num 0 0 0 0 0 1 0 0 0 0 ...
dim(Heartattack)
## [1] 1025 14
Heartattack$target[Heartattack$target > 0] <- 1
#Step 2. Build a multiple linear regression model or logistic regression
Heartattacktest1 <- dplyr::select(Heartattack_norm, age, sex, cp, trestbps, chol,fbs, restecg, thalach, exang, oldpeak, slope, ca, thal, target)
heartattack_index = createDataPartition(Heartattack$target, p =0.70, list = FALSE)
dim(heartattack_index)
## [1] 718 1
heartattack_Vald= Heartattack[-heartattack_index,]
dim(heartattack_Vald)
## [1] 307 14
Heartattacktest = Heartattacktest1[heartattack_index,]
dim(Heartattacktest )
## [1] 718 14
Heartattacktest$target= as.factor(Heartattacktest$target)
control <- trainControl(method="cv", number=10)
metric <- "Accuracy"
AUC = list()
Accuracy = list()
#Logistic Regression (LR)
set.seed(10)
fit.lr <- train(target~., Heartattacktest, method="glm", metric=metric, trControl=control, family = 'binomial')
set.seed(10)
logRegModel <- train(target~., data=Heartattacktest, method = 'glm', family = 'binomial')
logRegPrediction <- predict(logRegModel, Heartattacktest)
logRegPredictionprob <- predict(logRegModel, Heartattacktest, type='prob')[2]
logRegConfMat <- confusionMatrix(logRegPrediction, Heartattacktest[,"target"])
#ROC Curve
library(pROC)
AUC$logReg <- roc(as.numeric(Heartattacktest$target),as.numeric(as.matrix((logRegPredictionprob))))$auc
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
Accuracy$logReg <- logRegConfMat$overall['Accuracy']
##Step 3. Print summary and interpret table
logRegConfMat
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 283 33
## 1 65 337
##
## Accuracy : 0.8635
## 95% CI : (0.8362, 0.8878)
## No Information Rate : 0.5153
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.726
##
## Mcnemar's Test P-Value : 0.001739
##
## Sensitivity : 0.8132
## Specificity : 0.9108
## Pos Pred Value : 0.8956
## Neg Pred Value : 0.8383
## Prevalence : 0.4847
## Detection Rate : 0.3942
## Detection Prevalence : 0.4401
## Balanced Accuracy : 0.8620
##
## 'Positive' Class : 0
##
##Step 4. Perform another model and evaluate which model performs better
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(10)
RFModel <- randomForest(target ~ .,
data=Heartattacktest,
importance=TRUE,
ntree=2000)
#varImpPlot(RFModel)
RFPrediction <- predict(RFModel,Heartattacktest)
RFPredictionprob = predict(RFModel,Heartattacktest,type="prob")[, 2]
RFConfMat <- confusionMatrix(RFPrediction, Heartattacktest[,"target"])
AUC$RF <- roc(as.numeric(Heartattacktest$target),as.numeric(as.matrix((RFPredictionprob))))$auc
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
Accuracy$RF <- RFConfMat$overall['Accuracy']
RFConfMat
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 348 0
## 1 0 370
##
## Accuracy : 1
## 95% CI : (0.9949, 1)
## No Information Rate : 0.5153
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4847
## Detection Rate : 0.4847
## Detection Prevalence : 0.4847
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##