Week 5 | Regression Assignment

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

### Required Packages
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.5.2

library(readxl)

## Warning: package 'readxl' was built under R version 3.5.2

library(caret)

## Warning: package 'caret' was built under R version 3.5.2

## Loading required package: lattice

## Loading required package: lattice
library(pROC)

## Warning: package 'pROC' was built under R version 3.5.2

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(LogicReg)

## Warning: package 'LogicReg' was built under R version 3.5.2

## Loading required package: survival

## Warning: package 'survival' was built under R version 3.5.2

## 
## Attaching package: 'survival'

## The following object is masked from 'package:caret':
## 
##     cluster

## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster

##Step 1. Scale or normalize your data. Make sure to apply imputation if needed
### Import the data in to R

Heartattack <- read.csv("/Users/anirudhgurnani/Desktop/heart.csv", stringsAsFactors = FALSE)
glimpse(Heartattack)

## Observations: 1,025
## Variables: 14
## $ age      <int> 52, 53, 70, 61, 62, 58, 58, 55, 46, 54, 71, 43, 34, 51,…
## $ sex      <int> 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0…
## $ cp       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 1…
## $ trestbps <int> 125, 140, 145, 148, 138, 100, 114, 160, 120, 122, 112, …
## $ chol     <int> 212, 203, 174, 203, 294, 248, 318, 289, 249, 286, 149, …
## $ fbs      <int> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ restecg  <int> 1, 0, 1, 1, 1, 0, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1…
## $ thalach  <int> 168, 155, 125, 161, 106, 122, 140, 145, 144, 116, 125, …
## $ exang    <int> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0…
## $ oldpeak  <dbl> 1.0, 3.1, 2.6, 0.0, 1.9, 1.0, 4.4, 0.8, 0.8, 3.2, 1.6, …
## $ slope    <int> 2, 0, 0, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2…
## $ ca       <int> 2, 0, 0, 1, 3, 0, 3, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0…
## $ thal     <int> 3, 3, 3, 3, 2, 2, 1, 3, 3, 2, 2, 3, 2, 3, 0, 2, 2, 3, 2…
## $ target   <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1…

summary(Heartattack)

##       age             sex               cp            trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :1.0000   Median :130.0  
##  Mean   :54.43   Mean   :0.6956   Mean   :0.9424   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.0000   Max.   :200.0  
##       chol          fbs            restecg          thalach     
##  Min.   :126   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132.0  
##  Median :240   Median :0.0000   Median :1.0000   Median :152.0  
##  Mean   :246   Mean   :0.1493   Mean   :0.5298   Mean   :149.1  
##  3rd Qu.:275   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.000   Median :0.0000  
##  Mean   :0.3366   Mean   :1.072   Mean   :1.385   Mean   :0.7541  
##  3rd Qu.:1.0000   3rd Qu.:1.800   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.324   Mean   :0.5132  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000

library(caret)

preproc1 <- preProcess(Heartattack[,c(1:14)], method=c("center", "scale"))

norm1 <- predict(preproc1, Heartattack[,c(1:14)])

summary(norm1)

##       age               sex                cp             trestbps       
##  Min.   :-2.8035   Min.   :-1.5110   Min.   :-0.9153   Min.   :-2.14719  
##  1st Qu.:-0.7092   1st Qu.:-1.5110   1st Qu.:-0.9153   1st Qu.:-0.66289  
##  Median : 0.1726   Median : 0.6612   Median : 0.0559   Median :-0.09201  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.7237   3rd Qu.: 0.6612   3rd Qu.: 1.0271   3rd Qu.: 0.47887  
##  Max.   : 2.4873   Max.   : 0.6612   Max.   : 1.9983   Max.   : 3.90417  
##       chol              fbs             restecg           thalach       
##  Min.   :-2.3259   Min.   :-0.4187   Min.   :-1.0036   Min.   :-3.3954  
##  1st Qu.:-0.6784   1st Qu.:-0.4187   1st Qu.:-1.0036   1st Qu.:-0.7439  
##  Median :-0.1163   Median :-0.4187   Median : 0.8908   Median : 0.1254  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5621   3rd Qu.:-0.4187   3rd Qu.: 0.8908   3rd Qu.: 0.7340  
##  Max.   : 6.1637   Max.   : 2.3862   Max.   : 2.7852   Max.   : 2.2988  
##      exang            oldpeak            slope               ca         
##  Min.   :-0.7119   Min.   :-0.9119   Min.   :-2.2426   Min.   :-0.7316  
##  1st Qu.:-0.7119   1st Qu.:-0.9119   1st Qu.:-0.6238   1st Qu.:-0.7316  
##  Median :-0.7119   Median :-0.2311   Median :-0.6238   Median :-0.7316  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.4032   3rd Qu.: 0.6200   3rd Qu.: 0.9949   3rd Qu.: 0.2385  
##  Max.   : 1.4032   Max.   : 4.3645   Max.   : 0.9949   Max.   : 3.1489  
##       thal             target       
##  Min.   :-3.7442   Min.   :-1.0262  
##  1st Qu.:-0.5219   1st Qu.:-1.0262  
##  Median :-0.5219   Median : 0.9735  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.0893   3rd Qu.: 0.9735  
##  Max.   : 1.0893   Max.   : 0.9735

Heartattack_scaled <- as.data.frame(scale(Heartattack[,c(1:14)]))
summary(Heartattack_scaled)

##       age               sex                cp             trestbps       
##  Min.   :-2.8035   Min.   :-1.5110   Min.   :-0.9153   Min.   :-2.14719  
##  1st Qu.:-0.7092   1st Qu.:-1.5110   1st Qu.:-0.9153   1st Qu.:-0.66289  
##  Median : 0.1726   Median : 0.6612   Median : 0.0559   Median :-0.09201  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.7237   3rd Qu.: 0.6612   3rd Qu.: 1.0271   3rd Qu.: 0.47887  
##  Max.   : 2.4873   Max.   : 0.6612   Max.   : 1.9983   Max.   : 3.90417  
##       chol              fbs             restecg           thalach       
##  Min.   :-2.3259   Min.   :-0.4187   Min.   :-1.0036   Min.   :-3.3954  
##  1st Qu.:-0.6784   1st Qu.:-0.4187   1st Qu.:-1.0036   1st Qu.:-0.7439  
##  Median :-0.1163   Median :-0.4187   Median : 0.8908   Median : 0.1254  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5621   3rd Qu.:-0.4187   3rd Qu.: 0.8908   3rd Qu.: 0.7340  
##  Max.   : 6.1637   Max.   : 2.3862   Max.   : 2.7852   Max.   : 2.2988  
##      exang            oldpeak            slope               ca         
##  Min.   :-0.7119   Min.   :-0.9119   Min.   :-2.2426   Min.   :-0.7316  
##  1st Qu.:-0.7119   1st Qu.:-0.9119   1st Qu.:-0.6238   1st Qu.:-0.7316  
##  Median :-0.7119   Median :-0.2311   Median :-0.6238   Median :-0.7316  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.4032   3rd Qu.: 0.6200   3rd Qu.: 0.9949   3rd Qu.: 0.2385  
##  Max.   : 1.4032   Max.   : 4.3645   Max.   : 0.9949   Max.   : 3.1489  
##       thal             target       
##  Min.   :-3.7442   Min.   :-1.0262  
##  1st Qu.:-0.5219   1st Qu.:-1.0262  
##  Median :-0.5219   Median : 0.9735  
##  Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 1.0893   3rd Qu.: 0.9735  
##  Max.   : 1.0893   Max.   : 0.9735

Heartattack_norm <- as.data.frame(apply(Heartattack[, 1:14], 2, function(x) (x - min(x))/(max(x)-min(x))))


str(Heartattack_norm)

## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : num  0.479 0.5 0.854 0.667 0.688 ...
##  $ sex     : num  1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: num  0.292 0.434 0.481 0.509 0.415 ...
##  $ chol    : num  0.196 0.176 0.11 0.176 0.384 ...
##  $ fbs     : num  0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : num  0.5 0 0.5 0.5 0.5 0 1 0 0 0 ...
##  $ thalach : num  0.74 0.641 0.412 0.687 0.267 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  0.161 0.5 0.419 0 0.306 ...
##  $ slope   : num  1 0 0 1 0.5 0.5 0 0.5 1 0.5 ...
##  $ ca      : num  0.5 0 0 0.25 0.75 0 0.75 0.25 0 0.5 ...
##  $ thal    : num  1 1 1 1 0.667 ...
##  $ target  : num  0 0 0 0 0 1 0 0 0 0 ...

dim(Heartattack)

## [1] 1025   14

Heartattack$target[Heartattack$target > 0] <- 1


#Step 2. Build a multiple linear regression model or logistic regression
Heartattacktest1 <- dplyr::select(Heartattack_norm, age, sex, cp, trestbps, chol,fbs, restecg, thalach, exang, oldpeak, slope, ca, thal, target)


heartattack_index = createDataPartition(Heartattack$target, p =0.70, list = FALSE)
dim(heartattack_index)

## [1] 718   1

heartattack_Vald= Heartattack[-heartattack_index,]
dim(heartattack_Vald)

## [1] 307  14

Heartattacktest = Heartattacktest1[heartattack_index,]
dim(Heartattacktest )

## [1] 718  14

Heartattacktest$target= as.factor(Heartattacktest$target)


control <- trainControl(method="cv", number=10)
metric <- "Accuracy"

AUC = list()
Accuracy = list()

#Logistic Regression (LR)

set.seed(10)
fit.lr <- train(target~., Heartattacktest, method="glm", metric=metric, trControl=control, family = 'binomial')

set.seed(10)
logRegModel <- train(target~., data=Heartattacktest, method = 'glm', family = 'binomial')
logRegPrediction <- predict(logRegModel, Heartattacktest)
logRegPredictionprob <- predict(logRegModel, Heartattacktest, type='prob')[2]
logRegConfMat <- confusionMatrix(logRegPrediction, Heartattacktest[,"target"])

#ROC Curve
library(pROC)
AUC$logReg <- roc(as.numeric(Heartattacktest$target),as.numeric(as.matrix((logRegPredictionprob))))$auc

## Setting levels: control = 1, case = 2

## Setting direction: controls < cases

Accuracy$logReg <- logRegConfMat$overall['Accuracy']  
##Step 3. Print summary and interpret table
logRegConfMat

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 283  33
##          1  65 337
##                                           
##                Accuracy : 0.8635          
##                  95% CI : (0.8362, 0.8878)
##     No Information Rate : 0.5153          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.726           
##                                           
##  Mcnemar's Test P-Value : 0.001739        
##                                           
##             Sensitivity : 0.8132          
##             Specificity : 0.9108          
##          Pos Pred Value : 0.8956          
##          Neg Pred Value : 0.8383          
##              Prevalence : 0.4847          
##          Detection Rate : 0.3942          
##    Detection Prevalence : 0.4401          
##       Balanced Accuracy : 0.8620          
##                                           
##        'Positive' Class : 0               
##

##Step 4. Perform another model and evaluate which model performs better
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

set.seed(10)
RFModel <- randomForest(target ~ .,
                        data=Heartattacktest,
                        importance=TRUE,
                        ntree=2000)
#varImpPlot(RFModel)
RFPrediction <- predict(RFModel,Heartattacktest)
RFPredictionprob = predict(RFModel,Heartattacktest,type="prob")[, 2]

RFConfMat <- confusionMatrix(RFPrediction, Heartattacktest[,"target"])

AUC$RF <- roc(as.numeric(Heartattacktest$target),as.numeric(as.matrix((RFPredictionprob))))$auc

## Setting levels: control = 1, case = 2

## Setting direction: controls < cases

Accuracy$RF <- RFConfMat$overall['Accuracy']  
RFConfMat

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 348   0
##          1   0 370
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9949, 1)
##     No Information Rate : 0.5153     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4847     
##          Detection Rate : 0.4847     
##    Detection Prevalence : 0.4847     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

Week 5 | Regression Assignment

Anirudh Gurnani

7/13/2020

R Markdown

The Accuracy for RForest Model is 1 where as in Logistic Regression Model Accuracy is 0.85. Therefore RForest model performs better.