library(data.table) # Importing data
library(car) # model diagnostics
library(MASS) # step function for model selection
library(rpart) # desicion tree
library(rattle) # creating decision tree plot
library(rpart.plot) # descision tree
library(RColorBrewer) # colloring decision tree
library(formatR) # colloring tree
library(binomTools) # standardazed pearson residuals
library(ROCR) # ploting ROC curve
library(e1071) # Performing Support Vector Machine

1 Objective

In this project I use classification techniques such as: logistic regression, decision tree, support vector machines to predict the presence of heart disease. Comparison in terms of accuracy, true positive rate and false positive rate will be conducted. Also, models are evaluated in terms of ROC curves and AUC measures.

Since, dealing with disease investigation we want to reduce cases when we conclude that a person is not sick while he/she is sick, the best model is selected using true positive rate, which is a fraction of TP/actual yes.

The data was collected from four sources:

1. Cleveland Clinic Foundation (cleveland.data)

2. Hungarian Institute of Cardiology, Budapest (hungarian.data)

3. V.A. Medical Center, Long Beach, CA (long-beach-va.data)

4. University Hospital, Zurich, Switzerland (switzerland.data)

All four data sets can be found here: http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/.

The following is the description of the variables:

  • age: age in years
  • sex: sex
    • 1 = male
    • 0 = female)
  • cp: chest pain type
    • 1 = typical angina
    • 2 = atypical angina
    • 3 = non-anginal pain
    • 4 = asymptomatic
  • trestbps: resting blood pressure (in mm Hg on admission to the hospital)
  • chol: serum cholesterol in mg/dl
  • fbs: (fasting blood sugar > 120 mg/dl)
    • 1 = true
    • 0 = false
  • restecg: resting electrocardiograph results
    • 0 = normal
    • 1 = having ST-T wave abnormality (T wave inversions and/or ST
      elevation or depression of > 0.05 mV)
    • 2 = showing probable or definite left ventricular hypertrophy
      by Estes’ criteria
  • thalach: maximum heart rate achieved
  • exang: exercise induced angina:
    • 1 = yes
    • 0 = no
  • oldpeak: ST depression induced by exercise relative to rest
  • slope: the slope of the peak exercise ST segment
    • 1 = upsloping
    • 2 = flat
    • 3 = downsloping
  • ca: number of major vessels (0-3) colored by flourosopy
  • thal:
    • 3 = normal
    • 6 = fixed defect
    • 7 = reversible defect
  • num: diagnosis of heart disease (angiographic disease status) (the predicted attribute)
    • 0 = < 50% diameter narrowing
    • 1 = > 50% diameter narrowing

All R code with some explanation is provided below for reproducible research.

2 Importing data

# Importing data
url_hungarian   = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data'
url_cleveland   = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
url_switzerland = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data'
url_va          = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.va.data'

mydat_hungarian   = fread(url_hungarian,
                        na.strings = "?",
                        col.names = c("age","sex", "cp", "trestbps", "chol", 
                                      "fbs", "restecg", "thalach", "exang", 
                                      "oldpeak", "slope", "ca", "thal", "num"))

mydat_cleveland   = fread(url_cleveland,
                        na.strings = "?",
                        col.names = c("age","sex", "cp", "trestbps", "chol", 
                                      "fbs", "restecg", "thalach", "exang", 
                                      "oldpeak", "slope", "ca", "thal", "num"))

mydat_switzerland = fread(url_switzerland,
                        na.strings = "?",
                        col.names = c("age","sex", "cp", "trestbps", "chol", 
                                      "fbs", "restecg", "thalach", "exang", 
                                      "oldpeak", "slope", "ca", "thal", "num"))

mydat_va          = fread(url_va,
                        na.strings = "?",
                        col.names = c("age","sex", "cp", "trestbps", "chol", 
                                      "fbs", "restecg", "thalach", "exang", 
                                      "oldpeak", "slope", "ca", "thal", "num"))
# Adding source column
mydat_hungarian$source   = "hungarian"
mydat_cleveland$source   = "cleveland"
mydat_switzerland$source = "switzerland"
mydat_va$source          = "va"

# Combining data sets
mydat = rbind(mydat_cleveland,
              mydat_hungarian,
              mydat_switzerland,
              mydat_va)

# Converting categorical variables to factor type
mydat$num     = factor(mydat$num > 0,
                       levels = c(FALSE, TRUE),
                       labels = c("not sick", "sick"))
mydat$sex     = factor(mydat$sex,
                       levels = c(0, 1),
                       labels = c("female", "male"))
mydat$cp      = factor(mydat$cp,
                       levels = c(1, 2, 3, 4),
                       labels = c("typical angina", "atypical angina", 
                                  "non-anginal pain", "asymptomatic"))
mydat$exang   = factor(mydat$exang,
                       levels = c(0, 1),
                       labels = c("no", "yes"))
mydat$slope   = factor(mydat$slope,
                       levels = c(1, 2, 3),
                       labels = c("upsloping", "flat", "downsloping"))
mydat$thal    = factor(mydat$thal,
                       levels = c(3, 6, 7),
                       labels = c("normal","fixed defect","reversable defect"))
mydat$fbs     = factor(mydat$fbs,
                       levels = c(0, 1),
                       labels = c("false", "true"))
mydat$restecg = as.factor(mydat$restecg)

# Summary of the data
attach(mydat)
summary(mydat)
##       age            sex                     cp         trestbps    
##  Min.   :28.00   female:194   typical angina  : 46   Min.   :  0.0  
##  1st Qu.:47.00   male  :726   atypical angina :174   1st Qu.:120.0  
##  Median :54.00                non-anginal pain:204   Median :130.0  
##  Mean   :53.51                asymptomatic    :496   Mean   :132.1  
##  3rd Qu.:60.00                                       3rd Qu.:140.0  
##  Max.   :77.00                                       Max.   :200.0  
##                                                      NA's   :59     
##       chol          fbs      restecg       thalach       exang    
##  Min.   :  0.0   false:692   0   :551   Min.   : 60.0   no  :528  
##  1st Qu.:175.0   true :138   1   :179   1st Qu.:120.0   yes :337  
##  Median :223.0   NA's : 90   2   :188   Median :140.0   NA's: 55  
##  Mean   :199.1               NA's:  2   Mean   :137.5             
##  3rd Qu.:268.0                          3rd Qu.:157.0             
##  Max.   :603.0                          Max.   :202.0             
##  NA's   :30                             NA's   :55                
##     oldpeak                slope           ca        
##  Min.   :-2.6000   upsloping  :203   Min.   :0.0000  
##  1st Qu.: 0.0000   flat       :345   1st Qu.:0.0000  
##  Median : 0.5000   downsloping: 63   Median :0.0000  
##  Mean   : 0.8788   NA's       :309   Mean   :0.6764  
##  3rd Qu.: 1.5000                     3rd Qu.:1.0000  
##  Max.   : 6.2000                     Max.   :3.0000  
##  NA's   :62                          NA's   :611     
##                 thal           num         source         
##  normal           :196   not sick:411   Length:920        
##  fixed defect     : 46   sick    :509   Class :character  
##  reversable defect:192                  Mode  :character  
##  NA's             :486                                    
##                                                           
##                                                           
## 

We can notice that some of the variables have a lot of missing values and if we simple delete observations that have at least one missing values we will lose a lot of observation which can affect our model. Thus, I am going to consider two data sets below.

  1. The first one d1, that has all the variables mentioned above (except source) without observations that has at least one missing value.
  2. The second one d2, one without variables ca, thal, slope and source, ones that have most of the missing values.
# Data set with all variables exept 'source' and observations that has 
# at least one missing values
d_1 = mydat[complete.cases(mydat[, -15]), -15]

# Data set without variables ca, thal, slope and source and without
# observations that have at least one missing value
d_2 = mydat[complete.cases(mydat[, -c(11, 12, 13, 15)]), -c(11, 12, 13, 15)]

# Summary of d_1 and d_2
summary(d_1)
##       age            sex                     cp         trestbps    
##  Min.   :29.00   female: 96   typical angina  : 23   Min.   : 94.0  
##  1st Qu.:48.00   male  :203   atypical angina : 49   1st Qu.:120.0  
##  Median :56.00                non-anginal pain: 83   Median :130.0  
##  Mean   :54.52                asymptomatic    :144   Mean   :131.7  
##  3rd Qu.:61.00                                       3rd Qu.:140.0  
##  Max.   :77.00                                       Max.   :200.0  
##       chol          fbs      restecg    thalach      exang    
##  Min.   :100.0   false:256   0:149   Min.   : 71.0   no :200  
##  1st Qu.:211.0   true : 43   1:  4   1st Qu.:132.5   yes: 99  
##  Median :242.0               2:146   Median :152.0            
##  Mean   :246.8                       Mean   :149.3            
##  3rd Qu.:275.5                       3rd Qu.:165.5            
##  Max.   :564.0                       Max.   :202.0            
##     oldpeak              slope           ca        
##  Min.   :0.000   upsloping  :139   Min.   :0.0000  
##  1st Qu.:0.000   flat       :139   1st Qu.:0.0000  
##  Median :0.800   downsloping: 21   Median :0.0000  
##  Mean   :1.059                     Mean   :0.6722  
##  3rd Qu.:1.600                     3rd Qu.:1.0000  
##  Max.   :6.200                     Max.   :3.0000  
##                 thal           num     
##  normal           :164   not sick:160  
##  fixed defect     : 18   sick    :139  
##  reversable defect:117                 
##                                        
##                                        
## 
summary(d_2)
##       age           sex                     cp         trestbps    
##  Min.   :28.0   female:174   typical angina  : 37   Min.   :  0.0  
##  1st Qu.:46.0   male  :566   atypical angina :150   1st Qu.:120.0  
##  Median :54.0                non-anginal pain:161   Median :130.0  
##  Mean   :53.1                asymptomatic    :392   Mean   :132.8  
##  3rd Qu.:60.0                                       3rd Qu.:140.0  
##  Max.   :77.0                                       Max.   :200.0  
##       chol          fbs      restecg    thalach      exang    
##  Min.   :  0.0   false:629   0:445   Min.   : 60.0   no :444  
##  1st Qu.:197.0   true :111   1:120   1st Qu.:120.0   yes:296  
##  Median :231.0               2:175   Median :140.0            
##  Mean   :220.1                       Mean   :138.7            
##  3rd Qu.:271.0                       3rd Qu.:159.2            
##  Max.   :603.0                       Max.   :202.0            
##     oldpeak              num     
##  Min.   :-1.0000   not sick:357  
##  1st Qu.: 0.0000   sick    :383  
##  Median : 0.5000                 
##  Mean   : 0.8943                 
##  3rd Qu.: 1.5000                 
##  Max.   : 6.2000

In order to avoid any preordered structure of the data, before splitting into training and testing sets, I shuffle data.

# Set random seed for reproducible research
set.seed(97459)

# Shuffle the data
n_1 = nrow(d_1)
n_2 = nrow(d_2)
d_1 = d_1[sample(n_1), ]
d_2 = d_2[sample(n_2), ]

3 Creating training/testing sets

I split data into training and testing sets (75% vs 25%). I am using training set to create models and testing set for assessing predictive power of my models.

# Split the data in train and test
split    = 0.75
d1_train = d_1[1:round(split * n_1), ]
d1_test  = d_1[(round(split * n_1) + 1):n_1, ]

d2_train = d_2[1:round(split * n_2), ]
d2_test  = d_2[(round(split * n_2) + 1):n_2, ]

# Check class bias
table(d_1$num, dnn="Overal d_1")
## Overal d_1
## not sick     sick 
##      160      139
table(d1_train$num, dnn="training d_1")
## training d_1
## not sick     sick 
##      117      107
table(d1_test$num, dnn="testing d_1")
## testing d_1
## not sick     sick 
##       43       32
table(d_2$num, dnn="Overal d_2")
## Overal d_2
## not sick     sick 
##      357      383
table(d2_train$num, dnn="training d_2")
## training d_2
## not sick     sick 
##      264      291
table(d2_test$num, dnn="testing d_2")
## testing d_2
## not sick     sick 
##       93       92

We can see that for both d_1 and d_2, training and testing sets are well balanced.

4 Logistic regression

4.1 Variable Selection

To select best model, we use stepwise procedures (backward, forward, both).

# Creating full(including all varibales) and null(intercept only) models for d_1
d1_logit_full = glm(num~., data = d1_train, family = binomial)
d1_logit_null = glm(num~1, data = d1_train, family = binomial)

# Creating full(including all varibales) and null(intercept only) models for d_2
d2_logit_full = glm(num~., data = d2_train, family = binomial)
d2_logit_null = glm(num~1, data = d2_train, family = binomial)
# Forward selection procedure
forward_d1 = step(d1_logit_null, scope = list(lower = d1_logit_null, upper = d1_logit_full), direction = "forward")
forward_d2 = step(d2_logit_null, scope = list(lower = d2_logit_null, upper = d2_logit_full), direction = "forward")

# Backward selection procedure
backward_d1 = step(d1_logit_full, scope = list(lower = d1_logit_null, upper = d1_logit_full), direction = "backward")
backward_d2 = step(d2_logit_full, scope = list(lower = d2_logit_null, upper = d2_logit_full), direction = "backward")

# Both 
both_d1 = step(d1_logit_full, scope = list(lower = d1_logit_null, upper = d1_logit_full), direction = "both")
both_d2 = step(d2_logit_full, scope = list(lower = d2_logit_null, upper = d2_logit_full), direction = "both")

Below are the models returned by each procedure using AIC criterion for both data sets.

Forward for data set d_1

forward_d1$formula
## num ~ thal + exang + ca + cp + oldpeak + sex

Backward for data set d_1

backward_d1$formula
## num ~ sex + cp + exang + oldpeak + ca + thal

Both for data set d_1

both_d1$formula
## num ~ sex + cp + exang + oldpeak + ca + thal

Forward for data set d_2

forward_d2$formula
## num ~ cp + oldpeak + sex + thalach + fbs + exang + age + chol

Backward for data set d_2

backward_d2$formula
## num ~ age + sex + cp + chol + fbs + thalach + exang + oldpeak

Both for data set d_2

both_d2$formula
## num ~ age + sex + cp + chol + fbs + thalach + exang + oldpeak

We can notice that for both data sets d1 and d2, models returned by all three procedures are similar.

log_d1 = glm(both_d1$formula, data=d1_train, family = binomial)
log_d2 = glm(both_d2$formula, data=d2_train, family = binomial)

summary(log_d1)
## 
## Call:
## glm(formula = both_d1$formula, family = binomial, data = d1_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4224  -0.4991  -0.1861   0.4032   2.4807  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -3.13197    0.80415  -3.895 9.83e-05 ***
## sexmale                1.01733    0.50383   2.019  0.04347 *  
## cpatypical angina     -0.07704    0.82230  -0.094  0.92536    
## cpnon-anginal pain    -0.91516    0.71307  -1.283  0.19935    
## cpasymptomatic         0.97346    0.70574   1.379  0.16779    
## exangyes               1.52514    0.50148   3.041  0.00236 ** 
## oldpeak                0.61890    0.22052   2.807  0.00501 ** 
## ca                     0.95478    0.24338   3.923 8.74e-05 ***
## thalfixed defect       1.33200    0.98519   1.352  0.17637    
## thalreversable defect  1.09442    0.46571   2.350  0.01877 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 310.08  on 223  degrees of freedom
## Residual deviance: 159.48  on 214  degrees of freedom
## AIC: 179.48
## 
## Number of Fisher Scoring iterations: 6
summary(log_d2)
## 
## Call:
## glm(formula = both_d2$formula, family = binomial, data = d2_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5017  -0.5696   0.1628   0.5591   2.6430  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -2.359753   1.455254  -1.622  0.10490    
## age                 0.027506   0.014674   1.874  0.06087 .  
## sexmale             1.403713   0.316186   4.440 9.02e-06 ***
## cpatypical angina  -0.079274   0.570184  -0.139  0.88942    
## cpnon-anginal pain  0.016324   0.538130   0.030  0.97580    
## cpasymptomatic      1.810697   0.528339   3.427  0.00061 ***
## chol               -0.001962   0.001365  -1.437  0.15074    
## fbstrue             0.958730   0.347534   2.759  0.00580 ** 
## thalach            -0.012228   0.005472  -2.234  0.02545 *  
## exangyes            0.730213   0.274659   2.659  0.00785 ** 
## oldpeak             0.826590   0.148694   5.559 2.71e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 768.08  on 554  degrees of freedom
## Residual deviance: 445.08  on 544  degrees of freedom
## AIC: 467.08
## 
## Number of Fisher Scoring iterations: 5

Let’s check whether variables cp is valuable in log_d1 model.

# Anova test using Chisquare distribution (deviance residuals follow Chisquare distribution)
m_f = glm(num~sex + exang + oldpeak + ca + thal, data = d1_train, family = binomial)
anova(log_d1, m_f, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: num ~ sex + cp + exang + oldpeak + ca + thal
## Model 2: num ~ sex + exang + oldpeak + ca + thal
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       214     159.48                        
## 2       217     173.84 -3  -14.354 0.002461 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Anova test shows that we should leave cp in the model. Now, lets check whether we should leave chol in log_d2 model.

# Anova test using Chisquare distribution (deviance residuals follow Chisquare distribution)
m_f = glm(num~ age + sex + cp + exang + oldpeak + fbs + thalach, data = d2_train, family = binomial)
anova(log_d2, m_f, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: num ~ age + sex + cp + chol + fbs + thalach + exang + oldpeak
## Model 2: num ~ age + sex + cp + exang + oldpeak + fbs + thalach
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       544     445.08                     
## 2       545     447.20 -1  -2.1237    0.145
# Update the log_d2 model by excluding `chol` variable
log_d2 = glm(num~ age + sex + cp + exang + oldpeak + fbs + thalach, data = d2_train, family = binomial)

Based on the above test, we can exclude the variable chol from the model.

4.2 Model diagnostics

Once we selected models for two data sets, let check some of the regression assumptions.

4.2.1 Normality of deviance|Pearson residuals

# QQ plot for deviance residuals
par(mfrow = c (1,2))
qqPlot(residuals(log_d1, "pearson"), main = "Pearson Residuals for log_d1")
qqPlot(residuals(log_d1, "deviance"), main = "Deviance Residuals for log_d1")

qqPlot(residuals(log_d2, "pearson"), main = "Pearson Residuals for log_d2")
qqPlot(residuals(log_d2, "deviance"), main = "Deviance Residuals for log_d2")

We can notice that normality of Pearson and deviance residuals violated for log_d1. For log_d2 we can notice that deviance residuals are more or less resembling normal distribution. Although, an assumption for normality is essential for multiple linear regression case, in logistic regression it is not as crucial and most of the times is violated. I leave this analysis for future research.

par(mrflow = c(1,2))
# Evaluate homoscedasticity
scatter.smooth(log_d1$fitted.values,
               residuals(log_d1,"pearson"),
               span = 2,
               lpars = list(col = "green", lwd = 3, lty = 3))

scatter.smooth(log_d2$fitted.values,
               residuals(log_d2,"pearson"),
               span = 2,
               lpars = list(col = "green", lwd = 3, lty = 3))

We can see that for both models, lowess line goes vertically through the origin. It tells us that an assumptions about constant variance is supported.

# Specification test
residualPlots(log_d1, tests = FALSE)

residualPlots(log_d2, tests = FALSE)

By looking at the graphs Pearson residuals vs numeric covariates, we can see that lowess lines goes vertically through the origin for both of the models. It means the assumption of linear specification of log odds ratio vs covariates is supported.

4.3 Checking outliers, leverage/influential point

# Detecting outliers using Bonferroni Outlier Test
o1 = names(Residuals(log_d1, type = "standard.deviance")[abs(Residuals(log_d1, type = "standard.deviance")) > 2] )
o2 = names(Residuals(log_d2, type = "standard.deviance")[abs(Residuals(log_d2, type = "standard.deviance")) > 2] )

par(mrflow = c(1,2))
# Ploting fitted vs standard pearson residuals ( Checking outliers)
plot(log_d1$fitted.values, Residuals(log_d1, type = "standard.deviance") )
points(log_d1$fitted.values[o1], Residuals(log_d1, type = "standard.deviance")[o1], col="red", pch=16)

plot(log_d2$fitted.values, Residuals(log_d2, type = "standard.deviance") )
points(log_d2$fitted.values[o2], Residuals(log_d2, type = "standard.deviance")[o2], col="red", pch=16)

By looking at standard deviance residual vs fitted values plot, we can see that there might be some outliers if we set up a 2 sigma rule. Since I am not a specialist in the area of heart disease, the decision of how to deal with these observation I leave for further research.

# Calulating different values for checking for influential observations
inflm.SR_d1 = influence.measures(log_d1)
inflm.SR_d2 = influence.measures(log_d2)

# What observations are influential
id1 = as.numeric(names(which(apply(inflm.SR_d1$is.inf, 1, any))))
id2 = as.numeric(names(which(apply(inflm.SR_d2$is.inf, 1, any))))

summary(inflm.SR_d1)
## Potentially influential observations of
##   glm(formula = both_d1$formula, family = binomial, data = d1_train) :
## 
##     dfb.1_ dfb.sxml dfb.cpta dfb.cp-p dfb.cpsy dfb.exng dfb.oldp dfb.ca
## 3    0.05  -0.03    -0.08    -0.06    -0.22     0.24     0.04     0.18 
## 17  -0.20   0.00     0.22     0.45     0.36     0.07    -0.61     0.13 
## 37   0.09  -0.10     0.00     0.01    -0.05    -0.24     0.11    -0.12 
## 39  -0.33   0.07     0.27     0.29     0.32     0.04     0.15     0.11 
## 44   0.03  -0.09     0.05    -0.14     0.00    -0.02     0.18    -0.40 
## 53   0.06   0.07    -0.05     0.14     0.00    -0.05    -0.15    -0.12 
## 62  -0.03   0.00     0.01     0.08    -0.04     0.18    -0.07     0.04 
## 80   0.17   0.01    -0.07    -0.17    -0.04     0.03    -0.08    -0.46 
## 103  0.13  -0.24     0.19    -0.01    -0.04    -0.07    -0.05     0.07 
## 118  0.06   0.07    -0.05     0.14     0.00    -0.05    -0.15    -0.12 
## 127 -0.09  -0.03     0.09     0.16     0.06    -0.09     0.15     0.01 
## 137  0.11  -0.05    -0.02    -0.01    -0.12     0.06     0.10    -0.39 
## 147 -0.15   0.16     0.11     0.20     0.11    -0.11     0.52    -0.19 
## 159  0.44   0.07    -0.41    -0.39    -0.42     0.01    -0.18    -0.12 
## 161  0.21  -0.22    -0.07     0.01     0.07    -0.18    -0.12    -0.12 
## 213  0.21  -0.22    -0.07     0.01     0.07    -0.18    -0.12    -0.12 
##     dfb.thlfd dfb.thlrd dffit   cov.r   cook.d hat    
## 3   -0.89      0.01     -1.02_*  1.18_*  0.06   0.24_*
## 17   0.02     -0.13     -0.86_*  0.92    0.07   0.12  
## 37   0.03     -0.15     -0.40    0.76_*  0.03   0.02  
## 39  -0.55     -0.01     -0.70_*  1.33_*  0.02   0.26_*
## 44   0.09      0.11     -0.55    1.11    0.02   0.14_*
## 53  -0.06     -0.11      0.34    0.71_*  0.03   0.01  
## 62   0.39      0.01      0.48    1.26_*  0.01   0.20_*
## 80   0.01     -0.23     -0.61    0.83_*  0.05   0.06  
## 103  0.03      0.04      0.47    0.79_*  0.03   0.03  
## 118 -0.06     -0.11      0.34    0.71_*  0.03   0.01  
## 127  0.58      0.01      0.67_*  1.27_*  0.02   0.23_*
## 137  0.01     -0.17     -0.50    0.73_*  0.06   0.03  
## 147 -0.10     -0.25      0.70_*  1.00    0.03   0.12  
## 159 -0.10     -0.18      0.56    0.82_*  0.04   0.05  
## 161 -0.01     -0.05      0.41    0.80_*  0.02   0.03  
## 213 -0.01     -0.05      0.41    0.80_*  0.02   0.03
summary(inflm.SR_d2)
## Potentially influential observations of
##   glm(formula = num ~ age + sex + cp + exang + oldpeak + fbs +      thalach, family = binomial, data = d2_train) :
## 
##     dfb.1_ dfb.age dfb.sxml dfb.cpta dfb.cp-p dfb.cpsy dfb.exng dfb.oldp
## 7   -0.12   0.03    0.08     0.11     0.12     0.13     0.02     0.02   
## 15   0.14  -0.15   -0.07    -0.07    -0.05    -0.12     0.14    -0.19   
## 32  -0.04   0.05   -0.12     0.07     0.00     0.00    -0.02    -0.04   
## 37   0.03  -0.03   -0.06    -0.02    -0.01    -0.04    -0.07    -0.06   
## 68  -0.02   0.17   -0.17     0.00     0.08    -0.01    -0.07    -0.09   
## 74   0.02   0.06   -0.05    -0.04    -0.02    -0.06    -0.05    -0.18   
## 78  -0.04  -0.08   -0.07     0.31     0.34     0.36    -0.17     0.01   
## 89   0.05  -0.08   -0.18    -0.01     0.07     0.00    -0.01    -0.07   
## 100  0.26  -0.14    0.02    -0.34    -0.34    -0.36    -0.02    -0.01   
## 135  0.13  -0.08   -0.07    -0.04    -0.02    -0.07    -0.07    -0.11   
## 144 -0.16   0.10    0.05     0.10     0.01     0.02     0.00    -0.04   
## 150 -0.11   0.10    0.05     0.11     0.01     0.01    -0.02    -0.04   
## 165 -0.07   0.02    0.09     0.01    -0.03     0.01     0.03     0.03   
## 172  0.14  -0.13   -0.06     0.00    -0.08     0.01    -0.18     0.12   
## 178 -0.22   0.10   -0.03     0.34     0.35     0.39    -0.18     0.05   
## 206  0.09  -0.02    0.01    -0.12    -0.12    -0.12    -0.02     0.02   
## 213  0.07   0.07    0.04    -0.33    -0.34    -0.36     0.01    -0.14   
## 214  0.03  -0.14    0.04     0.15     0.04     0.06    -0.09     0.28   
## 218  0.00   0.04   -0.16     0.11     0.01     0.02    -0.06     0.05   
## 222 -0.03   0.08    0.04    -0.20    -0.22    -0.21    -0.01     0.02   
## 240  0.04   0.12   -0.05    -0.06    -0.14    -0.08     0.09    -0.31   
## 249 -0.22   0.14    0.23    -0.01    -0.01    -0.06     0.13    -0.08   
## 259  0.00  -0.04    0.03     0.04     0.07     0.04    -0.07     0.17   
## 292 -0.05   0.01    0.07     0.10     0.11     0.11     0.00     0.02   
## 323  0.05  -0.04   -0.17     0.00     0.08     0.00    -0.05     0.01   
## 330  0.09  -0.10    0.03    -0.22    -0.24    -0.22    -0.01     0.08   
## 332  0.13  -0.16    0.01     0.07    -0.02    -0.03    -0.01    -0.03   
## 339  0.05   0.07    0.05    -0.32    -0.33    -0.34    -0.01    -0.07   
## 347 -0.01   0.10    0.03     0.01     0.06     0.01    -0.05    -0.04   
## 364  0.10   0.01    0.02    -0.04     0.04    -0.06     0.10    -0.10   
## 371  0.21  -0.12    0.01    -0.34    -0.33    -0.36     0.03    -0.15   
## 382 -0.15  -0.04    0.15     0.21     0.24     0.24     0.08    -0.10   
## 390  0.10  -0.05    0.03    -0.17    -0.18    -0.18    -0.05     0.13   
## 395  0.11   0.12    0.04    -0.35    -0.35    -0.38    -0.01    -0.17   
## 396 -0.03   0.00    0.10    -0.05     0.01     0.02    -0.08     0.05   
## 399 -0.07  -0.09   -0.03     0.27     0.29     0.29     0.00     0.12   
## 424  0.03   0.04    0.04    -0.16    -0.18    -0.17    -0.05     0.09   
## 434 -0.09   0.14   -0.04     0.01     0.01    -0.02    -0.10    -0.07   
## 438  0.02  -0.09   -0.06     0.01     0.02    -0.01    -0.10     0.08   
## 456 -0.13   0.03   -0.04     0.26     0.27     0.28     0.05    -0.07   
## 466 -0.01  -0.06   -0.03     0.16     0.17     0.17     0.01     0.05   
## 489 -0.04  -0.02   -0.04     0.22     0.23     0.23     0.03    -0.05   
## 491 -0.02  -0.05    0.07     0.11     0.12     0.12     0.03    -0.01   
## 507 -0.05   0.01    0.04    -0.02     0.08    -0.01    -0.01    -0.08   
## 508 -0.03   0.03   -0.04     0.03     0.02     0.04    -0.05     0.10   
## 526  0.06  -0.06   -0.06    -0.02    -0.01    -0.04    -0.07    -0.06   
## 528  0.07  -0.11   -0.06    -0.04    -0.03    -0.09     0.13    -0.12   
## 535 -0.11   0.04   -0.02    -0.02    -0.01    -0.05     0.10    -0.07   
## 539 -0.04  -0.05   -0.06     0.26     0.29     0.27     0.06    -0.13   
## 545 -0.05   0.00   -0.04     0.00     0.01    -0.01    -0.07    -0.01   
## 551 -0.10   0.11   -0.02     0.23     0.24     0.26    -0.15     0.03   
##     dfb.fbst dfb.thlc dffit   cov.r   cook.d hat    
## 7    0.02     0.08    -0.17    1.07_*  0.00   0.05  
## 15   0.04    -0.04    -0.29    0.92_*  0.01   0.01  
## 32  -0.03     0.08     0.21    0.87_*  0.02   0.00  
## 37   0.02     0.01    -0.17    0.91_*  0.01   0.00  
## 68  -0.08    -0.04     0.34    0.93_*  0.02   0.02  
## 74   0.00    -0.03    -0.24    0.89_*  0.02   0.01  
## 78   0.08    -0.04    -0.44_*  1.02    0.02   0.06_*
## 89   0.18     0.05     0.32    0.91_*  0.02   0.01  
## 100 -0.05    -0.11     0.42_*  1.03    0.01   0.06_*
## 135  0.03    -0.10    -0.22    0.91_*  0.01   0.01  
## 144 -0.05     0.20     0.28    0.92_*  0.02   0.01  
## 150 -0.06     0.10     0.26    0.94_*  0.01   0.01  
## 165 -0.09     0.08    -0.17    1.06_*  0.00   0.05  
## 172 -0.18    -0.10    -0.38    1.03    0.01   0.06_*
## 178  0.05     0.09    -0.43_*  1.04    0.01   0.07_*
## 206  0.08    -0.08     0.19    1.07_*  0.00   0.05_*
## 213 -0.09     0.06     0.42_*  0.98    0.02   0.04  
## 214  0.00     0.01     0.38    1.06_*  0.01   0.07_*
## 218 -0.04     0.02     0.27    0.92_*  0.01   0.01  
## 222  0.14     0.11     0.34    1.07_*  0.01   0.08_*
## 240  0.01    -0.11    -0.42_*  1.06_*  0.01   0.08_*
## 249 -0.27     0.22    -0.44_*  1.02    0.02   0.06_*
## 259 -0.01    -0.01     0.20    1.06_*  0.00   0.05  
## 292 -0.07    -0.01    -0.17    1.06_*  0.00   0.05  
## 323 -0.03     0.01     0.25    0.91_*  0.01   0.01  
## 330  0.18     0.06     0.37    1.08_*  0.01   0.08_*
## 332 -0.01    -0.06     0.27    0.93_*  0.01   0.01  
## 339 -0.09     0.08     0.41_*  1.00    0.01   0.05  
## 347  0.11    -0.09     0.26    1.06_*  0.00   0.06_*
## 364 -0.05    -0.17     0.28    1.06_*  0.00   0.06_*
## 371 -0.05    -0.01     0.41_*  0.95    0.02   0.03  
## 382  0.06     0.11    -0.38    1.08_*  0.01   0.09_*
## 390 -0.03    -0.02     0.28    1.07_*  0.00   0.07_*
## 395 -0.11    -0.06     0.46_*  1.02    0.02   0.06_*
## 396 -0.11     0.03    -0.21    1.06_*  0.00   0.06_*
## 399 -0.15     0.03    -0.40    1.07_*  0.01   0.08_*
## 424 -0.04     0.00     0.26    1.06_*  0.00   0.06_*
## 434  0.00     0.05    -0.23    0.93_*  0.01   0.01  
## 438  0.05     0.07    -0.23    0.94_*  0.01   0.01  
## 456  0.06     0.05    -0.35    1.04    0.01   0.06_*
## 466  0.05    -0.03    -0.21    1.06_*  0.00   0.05  
## 489  0.05    -0.05    -0.30    1.05    0.01   0.06_*
## 491  0.03    -0.01    -0.18    1.06_*  0.00   0.05  
## 507 -0.05     0.11     0.24    0.93_*  0.01   0.01  
## 508 -0.01     0.02     0.12    1.06_*  0.00   0.04  
## 526  0.03    -0.02    -0.18    0.91_*  0.01   0.01  
## 528  0.04     0.04    -0.24    0.94_*  0.01   0.01  
## 535 -0.17     0.20    -0.30    0.90_*  0.02   0.01  
## 539 -0.21    -0.03    -0.46_*  1.00    0.02   0.05  
## 545  0.02     0.11    -0.20    0.91_*  0.01   0.01  
## 551  0.02    -0.07    -0.32    1.08_*  0.01   0.08_*
# from what data set they come from?
table(mydat$source[id1])
## 
## cleveland 
##        16
table(mydat$source[id2])
## 
## cleveland hungarian 
##        24        27

Based on the above analysis, using hat values and cooks distance, we have some leverage points in both models. We should properly investigate them before we decide whether to exclude them or use other technique of dealing with influential observations. Since I am not a specialist in heart disease, I will leave this analysis for a specialist in the field.

4.4 Model Performance

Now lets see how our models perform on our testing data. For that purposes I created a created a function that returns best combination of threshold and accuracy along with True positive and False Positive rates.

# Perform classification of testing data
log_pred_d1 = predict(log_d1,             # specifing the model given by model selection procedure
                      d1_test,            # predicting on the testing data   
                      type="response")    # predicting probability scores

log_pred_d2 = predict(log_d2,             # specifing the model given by model selection procedure
                      d2_test,            # predicting on the testing data   
                      type="response")    # predicting probability scores

# Function that finds the treshold that maximizes the accuracy
optim = function(precision,               # How many models you want to compare
                 pred,
                 dataset) {               # Predicted values 
  accuracy = matrix(0, nrow = precision, ncol = 4)
  k = 1/precision
  for (i in 1:precision) {
    f = factor(pred > i*k,
               levels = c(FALSE, TRUE),
               labels = c("not sick", "sick"))
    t = table(dataset$num, f)
    accuracy[i, 1] = i*k
    accuracy[i, 2] = sum(diag(t)/sum(t))
    accuracy[i, 3] = t[1, 2]/sum(t[1,])
    accuracy[i, 4] = t[2, 2]/sum(t[2,])
  }
  output = list(highest_accuracy = max(accuracy[ ,2]),                    # Highest accuracy
                optimal_treshold = accuracy[which.max(accuracy[ ,2]), 1], # Optimal threshold
                matrix = accuracy,                                        # All possible models    
                False.positive = accuracy[which.max(accuracy[ ,2]), 3],
                True.positive = accuracy[which.max(accuracy[ ,2]), 4])                                        
  return(output)
}

# Function that returns accuracy, false positive and true positive rates
ac = function(conf) {
  accuracy = round(sum(diag(conf))/sum(conf), 2)
  False.positive = round(conf[1, 2]/sum(conf[1,]), 2)
  True.positive = round(conf[2, 2]/sum(conf[2,]), 2)
  output = list(accuracy = accuracy, 
                False.positive = False.positive,
                True.positive = True.positive)
  return(output)
}

opt_d1 = optim(1000, log_pred_d1, d1_test)
opt_d2 = optim(1000, log_pred_d2, d2_test)

# Ploting Accuracy vs Treshold
plot(x= opt_d1$matrix[,1],
     y = opt_d1$matrix[,2],
     col=ifelse(opt_d1$matrix[, 1] == opt_d1$optimal_treshold, "red", "black"),
     ylab = "Accuracy",
     xlab = "Treshold",
     type = 'l',
     main = "Best treshold for log_d1")
abline(v = opt_d1$optimal_treshold, lty = 2)

plot(x= opt_d2$matrix[,1],
     y = opt_d2$matrix[,2],
     col = ifelse(opt_d1$matrix[, 1] == opt_d2$optimal_treshold, "red", "black"),
     ylab = "Accuracy",
     xlab = "Treshold",
     type = "l",
     main = "Best treshold for log_d2")
abline(v = opt_d2$optimal_treshold, lty = 2)

# The best combination of treshhold and accuracy 
data.frame("Optimal Treshold" = opt_d1$optimal_treshold, 
           "Highest Accuracy" = opt_d1$highest_accuracy,
           "False.positive" = opt_d1$False.positive,
           "True.positive" = opt_d1$True.positive)
##   Optimal.Treshold Highest.Accuracy False.positive True.positive
## 1            0.633        0.8266667      0.1627907        0.8125
data.frame("Optimal Treshold" = opt_d2$optimal_treshold, 
           "Highest Accuracy" = opt_d2$highest_accuracy,
          "False.positive" = opt_d2$False.positive,
           "True.positive" = opt_d2$True.positive)
##   Optimal.Treshold Highest.Accuracy False.positive True.positive
## 1            0.411         0.772973      0.2903226     0.8369565
f1 = factor(log_pred_d1> opt_d1$optimal_treshold,
               levels = c(FALSE, TRUE),
               labels = c("not sick", "sick"))
conf1 = table(d1_test$num, f1, dnn=c("Actual", "Predicted"))


f2 = factor(log_pred_d2> opt_d2$optimal_treshold,
               levels = c(FALSE, TRUE),
               labels = c("not sick", "sick"))
conf2 = table(d2_test$num, f2, dnn=c("Actual", "Predicted"))

summ_d1 = rbind(log_d1 = ac(conf1))
summ_d1
##        accuracy False.positive True.positive
## log_d1 0.83     0.16           0.81
summ_d2 = rbind(log_d2 = ac(conf2))
summ_d2
##        accuracy False.positive True.positive
## log_d2 0.77     0.29           0.84

So, based on the above analysis logistic model created for d1 has accuracy of 83% with TP rate of 81%. Although, the model for d2 has lower accuracy which is 77%, it has higher TP rate.

4.5 Assesing ROC curve

One of the ways to asses a model is to create a ROC curve that shows a different combinations of TP and FP rates for different thresholds. Model with higher AUC (area under the curve) considered to be better. This procedure is especially useful when dealing with high-bias data, that is, where one class is much more common than the other. But, in our case we have well balanced data, so decision based on simple accuracy metric is also a good one.

# Function that helps to construct ROC and AUC
perform = function (predicted, testdata) {
  result = list()
  pred = prediction(predicted, testdata)
  perf1 = performance(pred, "tpr", "fpr")
  perf2 = performance(pred, "auc")
  result = c(result, roc = perf1, auc = perf2@y.values[[1]])
 }

# ROC curves for log_d1 and log_d2
plot(perform(log_pred_d1, d1_test$num)$roc, col = "red")
plot(perform(log_pred_d2, d2_test$num)$roc, add = TRUE, col = "blue")
legend("right", 
       legend = c("log_d1", "log_d2"),
       col = c("red", "blue"), 
       lty = 1, 
       cex = 0.8)

# AUC for log_d1 and log_d2
AUC_d1 = rbind(log_d1 = perform(log_pred_d1, d1_test$num)$auc)
AUC_d1
##             [,1]
## log_d1 0.8968023
AUC_d2 = rbind(log_d2 = perform(log_pred_d2, d2_test$num)$auc)
AUC_d2
##             [,1]
## log_d2 0.8481767

As we can see both models have pretty high AUC, meaning they both are good in predicting heart disease.

5 Decision tree

5.1 Creating initial tree using Gini and Entropy

For the initial model, we chose very low complexity parameter (cp). Below are trees constructed using both Gini Index and Entropy measure.

# Build a tree model using GINI index
tree_g_d1 = rpart(num ~ .,
               data = d1_train,
               method = "class",
               control = rpart.control(cp = 0.00001),
               parms = list(split="gini"))
tree_g_d2 = rpart(num ~ .,
               data = d2_train,
               method = "class",
               control = rpart.control(cp = 0.00001),
               parms = list(split="gini"))

# Build a tree model using Entropy index
tree_e_d1 = rpart(num ~ .,
               data = d1_train,
               method = "class",
               control = rpart.control(cp=0.00001),
               parms = list(split="information"))
tree_e_d2 = rpart(num ~ .,
               data = d2_train,
               method = "class",
               control = rpart.control(cp=0.00001),
               parms = list(split="information"))

# Ploting tree
fancyRpartPlot(tree_g_d1,
               sub = "Tree based on GINI Index for d_1")

fancyRpartPlot(tree_e_d1, 
               sub = "Tree based on Entropy for d_1")

# Ploting tree
fancyRpartPlot(tree_g_d2,
               sub = "Tree based on GINI Index for d_2")

fancyRpartPlot(tree_e_d2, 
               sub = "Tree based on Entropy for d_2")

5.2 Assesing tree

# Using our model to predict the test set
pred_g_d1 = predict(tree_g_d1,
               d1_test,
               type = "class")

pred_e_d1 = predict(tree_e_d1,
               d1_test,
               type = "class")

pred_g_d2 = predict(tree_g_d2,
               d2_test,
               type = "class")

pred_e_d2 = predict(tree_e_d2,
               d2_test,
               type = "class")

# Construct the confusion matrix for d1 (gini vs entropy)
conf_g_d1 = table(d1_test$num,
                  pred_g_d1, dnn=c("Actual:", "Predicted:"))
conf_e_d1 = table(d1_test$num,
                  pred_e_d1, dnn=c("Actual:", "Predicted:"))

# Construct the confusion matrix for d2 (gini vs entropy)
conf_g_d2 = table(d2_test$num,
                  pred_g_d2, dnn=c("Actual:", "Predicted:"))
conf_e_d2 = table(d2_test$num,
                  pred_e_d2, dnn=c("Actual:", "Predicted:"))

summ_d1 = rbind(summ_d1, dtree_gini_d1 = ac(conf_g_d1),
                dtree_entropy_d1 = ac(conf_e_d1))
summ_d1
##                  accuracy False.positive True.positive
## log_d1           0.83     0.16           0.81         
## dtree_gini_d1    0.76     0.26           0.78         
## dtree_entropy_d1 0.83     0.23           0.91
summ_d2 = rbind(summ_d2, dtree_gini_d2 = ac(conf_g_d2),
                dtree_entropy_d2 = ac(conf_e_d2))
summ_d2
##                  accuracy False.positive True.positive
## log_d2           0.77     0.29           0.84         
## dtree_gini_d2    0.76     0.22           0.73         
## dtree_entropy_d2 0.76     0.26           0.78

Based on the above tables. we can notice for d1, the model that uses entropy measure gives higher accuracy as well as false and true positive rates. If we compare with logistic model, model based on entropy gives us higher true positive rate, but lower false positive rate.

For d2 logistic model performs better than decision tree models in terms of accuracy and true positive rate.

5.3 Possible pruning

To avoid over-fitting lets try to prune our trees based on the value of complexity parameter that minimizes cross-validated error.

# Cp table for d1
printcp(tree_g_d1)
## 
## Classification tree:
## rpart(formula = num ~ ., data = d1_train, method = "class", parms = list(split = "gini"), 
##     control = rpart.control(cp = 1e-05))
## 
## Variables actually used in tree construction:
## [1] ca       cp       oldpeak  thal     trestbps
## 
## Root node error: 107/224 = 0.47768
## 
## n= 224 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.5046729      0   1.00000 1.00000 0.069868
## 2 0.0607477      1   0.49533 0.61682 0.063767
## 3 0.0233645      3   0.37383 0.54206 0.061272
## 4 0.0186916      5   0.32710 0.55140 0.061611
## 5 0.0093458      6   0.30841 0.53271 0.060924
## 6 0.0000100      8   0.28972 0.57944 0.062581
printcp(tree_e_d1)
## 
## Classification tree:
## rpart(formula = num ~ ., data = d1_train, method = "class", parms = list(split = "information"), 
##     control = rpart.control(cp = 1e-05))
## 
## Variables actually used in tree construction:
## [1] age   ca    cp    exang thal 
## 
## Root node error: 107/224 = 0.47768
## 
## n= 224 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.5046729      0   1.00000 1.00000 0.069868
## 2 0.0607477      1   0.49533 0.57009 0.062266
## 3 0.0280374      3   0.37383 0.53271 0.060924
## 4 0.0093458      5   0.31776 0.45794 0.057824
## 5 0.0000100      6   0.30841 0.45794 0.057824
# Cross-valiedation error plot for d1
par(mfrow=c(1,2))
plotcp(tree_e_d1, sub = "Entropy_d1")
plotcp(tree_g_d1, sub = "Gini_d1")

Based on plots above, complexity parameter that minimizes cross-validated error for d1 is 0.0093458 for model with Gini index, and 0.0093458 for model with Entropy.

# Pruning tree for d1
pruned_g_d1 = prune(tree_g_d1, cp = tree_g_d1$cptable[which.min(tree_g_d1$cptable[,"xerror"]),"CP"])
pruned_e_d1 = prune(tree_e_d1, cp = tree_e_d1$cptable[which.min(tree_e_d1$cptable[,"xerror"]),"CP"])

par(mfrow = c(1,2))
# Ploting trees for Gini on d1
fancyRpartPlot(tree_g_d1, 
               sub = "Gini on d_1")
fancyRpartPlot(pruned_g_d1, 
               sub = "Pruned GINI on d_1")

# Ploting trees for Entropy on d1
fancyRpartPlot(tree_e_d1, 
               sub = "Entropy on d_1")

fancyRpartPlot(pruned_e_d1, 
               sub = "Pruned Entropy on d_1")

# Cp table for d2
printcp(tree_g_d2)
## 
## Classification tree:
## rpart(formula = num ~ ., data = d2_train, method = "class", parms = list(split = "gini"), 
##     control = rpart.control(cp = 1e-05))
## 
## Variables actually used in tree construction:
##  [1] age      chol     cp       exang    fbs      oldpeak  restecg 
##  [8] sex      thalach  trestbps
## 
## Root node error: 264/555 = 0.47568
## 
## n= 555 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.5227273      0   1.00000 1.00000 0.044565
## 2 0.0454545      1   0.47727 0.47727 0.037382
## 3 0.0113636      2   0.43182 0.48485 0.037590
## 4 0.0056818     13   0.30303 0.51136 0.038286
## 5 0.0037879     15   0.29167 0.51136 0.038286
## 6 0.0000100     16   0.28788 0.51515 0.038382
printcp(tree_e_d2)
## 
## Classification tree:
## rpart(formula = num ~ ., data = d2_train, method = "class", parms = list(split = "information"), 
##     control = rpart.control(cp = 1e-05))
## 
## Variables actually used in tree construction:
##  [1] age      chol     cp       exang    fbs      oldpeak  restecg 
##  [8] sex      thalach  trestbps
## 
## Root node error: 264/555 = 0.47568
## 
## n= 555 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.5227273      0   1.00000 1.00000 0.044565
## 2 0.0454545      1   0.47727 0.47727 0.037382
## 3 0.0170455      2   0.43182 0.44697 0.036512
## 4 0.0113636      4   0.39773 0.48485 0.037590
## 5 0.0075758      8   0.35227 0.46591 0.037063
## 6 0.0037879     14   0.30682 0.46970 0.037171
## 7 0.0018939     15   0.30303 0.46212 0.036955
## 8 0.0000100     17   0.29924 0.45833 0.036846
# Cross-valiedation error plot for d2
par(mfrow=c(1,2))
plotcp(tree_e_d2, sub = "Entropy_d1")
plotcp(tree_g_d2, sub = "Gini_d1")

Based on plots above, complexity parameter that minimizes cross-validated error for d2 is 0.0454545 for model with Gini index, and 0.0170455 for model with Entropy.

# Pruning tree for d2
pruned_g_d2 = prune(tree_g_d2, cp = tree_g_d2$cptable[which.min(tree_g_d2$cptable[,"xerror"]),"CP"])
pruned_e_d2 = prune(tree_e_d2, cp = tree_e_d2$cptable[which.min(tree_e_d2$cptable[,"xerror"]),"CP"])

par(mfrow = c(1,2))
# Ploting trees for Gini on d2
fancyRpartPlot(tree_g_d2, 
               sub = "Gini on d_2")

fancyRpartPlot(pruned_g_d2, 
               sub = "Pruned Gini on d_2")

# Ploting trees for Entropy on d2
fancyRpartPlot(tree_e_d2, 
               sub = "Entropy on d_2")

fancyRpartPlot(pruned_e_d2, 
               sub = "Pruned Entropy on d_2")

5.4 Assesing pruned trees

After we pruned trees, lets compare new trees with initial ones.

# Using our model to predict the test set
pred_pruned_g_d1 = predict(pruned_g_d1,
                           d1_test,
                           type = "class")

pred_pruned_e_d1 = predict(pruned_e_d1,
                            d1_test, 
                            type = "class")

pred_pruned_g_d2 = predict(pruned_g_d2,
                           d2_test,
                           type = "class")

pred_pruned_e_d2 = predict(pruned_e_d2,
                           d2_test,
                           type = "class")

# Construct the confusion matrix for d1 (gini vs entropy)
conf_pruned_g_d1 = table(d1_test$num,
                         pred_pruned_g_d1, dnn=c("Actual:", "Predicted:"))
conf_pruned_e_d1 = table(d1_test$num,
                         pred_pruned_e_d1, dnn=c("Actual:", "Predicted:"))

# Construct the confusion matrix for d2 (gini vs entropy)
conf_pruned_g_d2 = table(d2_test$num,
                         pred_pruned_g_d2, dnn=c("Actual:", "Predicted:"))
conf_pruned_e_d2 = table(d2_test$num,
                         pred_pruned_e_d2, dnn=c("Actual:", "Predicted:"))

summ_d1 = rbind(summ_d1, 
                pruned_gini_d1 = ac(conf_pruned_g_d1),
                pruned_entropy_d1 = ac(conf_pruned_e_d1))
summ_d1
##                   accuracy False.positive True.positive
## log_d1            0.83     0.16           0.81         
## dtree_gini_d1     0.76     0.26           0.78         
## dtree_entropy_d1  0.83     0.23           0.91         
## pruned_gini_d1    0.72     0.37           0.84         
## pruned_entropy_d1 0.84     0.14           0.81

It seems that pruning for Entropy based model, reduces false positive rate which is good, however it also reduces true positive rate which is bad. Since we care more about true positive rate, initial model is better.

summ_d2 = rbind(summ_d2, 
                pruned_gini_d2 = ac(conf_pruned_g_d2),
                pruned_entropy_d2 = ac(conf_pruned_e_d2))
summ_d2
##                   accuracy False.positive True.positive
## log_d2            0.77     0.29           0.84         
## dtree_gini_d2     0.76     0.22           0.73         
## dtree_entropy_d2  0.76     0.26           0.78         
## pruned_gini_d2    0.71     0.29           0.72         
## pruned_entropy_d2 0.72     0.31           0.75

For d2, pruning makes our model worse in terms of accuracy false positive and true positive rates. However, pruned trees for both Gini and Entropy has low number of splits which is good when we want to interpret results.

5.5 Assesing ROC curve

# Using our model to predict the test set
pred_g_d1 = predict(tree_g_d1,
               d1_test,
               type = "prob")[, 2]

pred_e_d1 = predict(tree_e_d1,
               d1_test,
               type = "prob")[, 2]

pred_g_d2 = predict(tree_g_d2,
               d2_test,
               type = "prob")[, 2]

pred_e_d2 = predict(tree_e_d2,
               d2_test,
               type = "prob")[, 2]

pred_pruned_g_d1 = predict(pruned_g_d1,
               d1_test,
               type = "prob")[, 2]

pred_pruned_e_d1 = predict(pruned_e_d1,
               d1_test,
               type = "prob")[, 2]

pred_pruned_g_d2 = predict(pruned_g_d2,
               d2_test,
               type = "prob")[, 2]

pred_pruned_e_d2 = predict(pruned_e_d2,
               d2_test,
               type = "prob")[, 2]

# ROC curves trees on data d1
plot(perform(pred_g_d1, d1_test$num)$roc, col = "red")
plot(perform(pred_e_d1, d1_test$num)$roc, add = TRUE, col = "blue")
plot(perform(pred_pruned_g_d1, d1_test$num)$roc, add = TRUE, col = "green")
plot(perform(pred_pruned_e_d1, d1_test$num)$roc, add = TRUE, col = "orange")
legend("right",
      legend=c("Gini_d1", "Entropy_d1", "Gini_pruned_d1", "Entropy_pruned_d1"),
      col=c("red", "blue", "green", "orange"), 
      lty=1, 
      cex=0.8)

# AUC values for trees on data d1
AUC_d1 = rbind(AUC_d1,
               GINI_d1 = perform(pred_g_d1, d1_test$num)$auc,
               Entropy_d1 = perform(pred_e_d1, d1_test$num)$auc,
               Pruned_Gini_d1 = perform(pred_pruned_g_d1, d1_test$num)$auc,
               Pruned_Entropy_d1 = perform(pred_pruned_e_d1, d1_test$num)$auc)
AUC_d1
##                        [,1]
## log_d1            0.8968023
## GINI_d1           0.8724564
## Entropy_d1        0.8710029
## Pruned_Gini_d1    0.8350291
## Pruned_Entropy_d1 0.8713663

Based on AUC metric, logistic model is the best one followed by model based on GINI index. However, pruned Entropy model is very close to the model base on GINI, and since it has lower number of splits, I would choose pruned Entropy compared to Gini.

# ROC curves trees on data d2
plot(perform(pred_g_d2, d2_test$num)$roc, col = "red")
plot(perform(pred_e_d2, d2_test$num)$roc, add = TRUE, col = "blue")
plot(perform(pred_pruned_g_d2, d2_test$num)$roc, add = TRUE, col = "green")
plot(perform(pred_pruned_e_d2, d2_test$num)$roc, add = TRUE, col = "orange")
legend("right", 
       legend=c("Gini_d2", "Entropy_d2", "Gini_pruned_d2", "Entropy_pruned_d2"),
       col=c("red", "blue", "green", "orange"), 
       lty=1:2, 
       cex=0.8)

# AUC values for trees on data d1
AUC_d2 = rbind(AUC_d2, GINI_d2 = perform(pred_g_d2, d2_test$num)$auc,
               Entropy_d2 = perform(pred_e_d2, d2_test$num)$auc,
               Pruned_Gini_d2 = perform(pred_pruned_g_d2, d2_test$num)$auc,
               Pruned_Entropy_d2 = perform(pred_pruned_e_d2, d2_test$num)$auc)
AUC_d2
##                        [,1]
## log_d2            0.8481767
## GINI_d2           0.8252688
## Entropy_d2        0.8172043
## Pruned_Gini_d2    0.7135344
## Pruned_Entropy_d2 0.7161057

For d2, logistic model is the best one in terms of AUC metric, followed by the model based on GINI.

6 Support Vector Machines

6.1 Finding the best model based on cost parameter

# Tuning svm fom d1 
tune_d1 = tune(svm, #ten-fold cross-validation on a set of models of interest.
                num ~ .,
                data = d1_train,
                kernel = "linear",
                ranges = list(cost=seq(0.01, 5, 0.1)))
# Assigning the best model based on cost component
bestmod_d1 = tune_d1$best.model 
summary(bestmod_d1) 
## 
## Call:
## best.tune(method = svm, train.x = num ~ ., data = d1_train, ranges = list(cost = seq(0.01, 
##     5, 0.1)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
##       gamma:  0.05263158 
## 
## Number of Support Vectors:  156
## 
##  ( 77 79 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  not sick sick
# Predicting on testing set
pred_svm_d1 = predict(bestmod_d1, d1_test)

svm.perf1 = table(d1_test$num,
                 pred_svm_d1,
                 dnn=c("Actual", "Predicted"))

# Tuning svm for d2 
tune_d2 = tune(svm, #ten-fold cross-validation on a set of models of interest.
                num ~ .,
                data = d2_train,
                kernel = "linear",
                ranges = list(cost=seq(0.01, 5, 0.1)))
# Assigning the best model based on cost component
bestmod_d2 = tune_d2$best.model 
summary(bestmod_d2) 
## 
## Call:
## best.tune(method = svm, train.x = num ~ ., data = d2_train, ranges = list(cost = seq(0.01, 
##     5, 0.1)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.41 
##       gamma:  0.07142857 
## 
## Number of Support Vectors:  248
## 
##  ( 125 123 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  not sick sick
# Predicting on testing set
pred_svm_d2 = predict(bestmod_d2, d2_test)

svm.perf2 = table(d2_test$num,
                 pred_svm_d2,
                 dnn=c("Actual", "Predicted"))

6.2 Assesing model

summ_d1 = rbind(summ_d1,
                svm_d1 = ac(svm.perf1))
summ_d1
##                   accuracy False.positive True.positive
## log_d1            0.83     0.16           0.81         
## dtree_gini_d1     0.76     0.26           0.78         
## dtree_entropy_d1  0.83     0.23           0.91         
## pruned_gini_d1    0.72     0.37           0.84         
## pruned_entropy_d1 0.84     0.14           0.81         
## svm_d1            0.85     0.14           0.84

Based on above table, we can see that for d1, svm performs pretty well in terms of accuracy, false positive rate and true positive rate. Since decision is based on the true positive rate, still, the best model for d1 is decision tree based on Entropy.

summ_d2 = rbind(summ_d2, 
                svm_d2 = ac(svm.perf2))
summ_d2
##                   accuracy False.positive True.positive
## log_d2            0.77     0.29           0.84         
## dtree_gini_d2     0.76     0.22           0.73         
## dtree_entropy_d2  0.76     0.26           0.78         
## pruned_gini_d2    0.71     0.29           0.72         
## pruned_entropy_d2 0.72     0.31           0.75         
## svm_d2            0.77     0.24           0.77

For d2, still the best model is logistic one.

svm_d1 <- svm(d1_train$num ~ ., 
              data = d1_train, 
              cost = bestmod_d1$cost, 
              kernel = "linear",
              probability = TRUE)
summary(svm_d1)
## 
## Call:
## svm(formula = d1_train$num ~ ., data = d1_train, cost = bestmod_d1$cost, 
##     kernel = "linear", probability = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.01 
##       gamma:  0.05263158 
## 
## Number of Support Vectors:  156
## 
##  ( 77 79 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  not sick sick
svm_d2 <- svm(d2_train$num ~ ., 
              data = d2_train, 
              cost = bestmod_d2$cost, 
              kernel = "linear",
              probability = TRUE)

pred_svm_d1 = predict(svm_d1,
                    d1_test,
                    probability=TRUE)

pred_svm_d2 = predict(svm_d2,
                    d2_test,
                    probability=TRUE)

f1 = attributes(pred_svm_d1)$probabilities[,1]
f2 = attributes(pred_svm_d2)$probabilities[,2]

plot(perform(f1, d1_test$num)$roc, col = "red")
plot(perform(f2, d2_test$num)$roc, add = TRUE, col = "blue")
legend("right", 
       legend = c("svm_d1", "svm_d2"),
       col = c("red", "blue", "green", "orange"), 
       lty = 1, 
       cex = 0.8)

# AUC values for trees on data d1
AUC_d1 = rbind(AUC_d1, svm_d1 = perform(f1, d1_test$num)$auc)
AUC_d1
##                        [,1]
## log_d1            0.8968023
## GINI_d1           0.8724564
## Entropy_d1        0.8710029
## Pruned_Gini_d1    0.8350291
## Pruned_Entropy_d1 0.8713663
## svm_d1            0.9382267

We can notice, that svm for d1 performs much better then other techniques in terms of AUC metric.

AUC_d2 = rbind(AUC_d2, svm_d2 = perform(f2, d2_test$num)$auc)
AUC_d2
##                        [,1]
## log_d2            0.8481767
## GINI_d2           0.8252688
## Entropy_d2        0.8172043
## Pruned_Gini_d2    0.7135344
## Pruned_Entropy_d2 0.7161057
## svm_d2            0.8412810

For d2, svm model is very close to logistic regression.

7 References

  1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D.
  2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D.
  3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D.
  4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D.