library(data.table) # Importing data
library(car) # model diagnostics
library(MASS) # step function for model selection
library(rpart) # desicion tree
library(rattle) # creating decision tree plot
library(rpart.plot) # descision tree
library(RColorBrewer) # colloring decision tree
library(formatR) # colloring tree
library(binomTools) # standardazed pearson residuals
library(ROCR) # ploting ROC curve
library(e1071) # Performing Support Vector MachineIn this project I use classification techniques such as: logistic regression, decision tree, support vector machines to predict the presence of heart disease. Comparison in terms of accuracy, true positive rate and false positive rate will be conducted. Also, models are evaluated in terms of ROC curves and AUC measures.
Since, dealing with disease investigation we want to reduce cases when we conclude that a person is not sick while he/she is sick, the best model is selected using true positive rate, which is a fraction of TP/actual yes.
The data was collected from four sources:
1. Cleveland Clinic Foundation (cleveland.data)
2. Hungarian Institute of Cardiology, Budapest (hungarian.data)
3. V.A. Medical Center, Long Beach, CA (long-beach-va.data)
4. University Hospital, Zurich, Switzerland (switzerland.data)
All four data sets can be found here: http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/.
The following is the description of the variables:
age: age in yearssex: sex
cp: chest pain type
trestbps: resting blood pressure (in mm Hg on admission to the hospital)chol: serum cholesterol in mg/dlfbs: (fasting blood sugar > 120 mg/dl)
restecg: resting electrocardiograph results
thalach: maximum heart rate achievedexang: exercise induced angina:
oldpeak: ST depression induced by exercise relative to restslope: the slope of the peak exercise ST segment
ca: number of major vessels (0-3) colored by flourosopythal:
num: diagnosis of heart disease (angiographic disease status) (the predicted attribute)
All R code with some explanation is provided below for reproducible research.
# Importing data
url_hungarian = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data'
url_cleveland = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
url_switzerland = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data'
url_va = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.va.data'
mydat_hungarian = fread(url_hungarian,
na.strings = "?",
col.names = c("age","sex", "cp", "trestbps", "chol",
"fbs", "restecg", "thalach", "exang",
"oldpeak", "slope", "ca", "thal", "num"))
mydat_cleveland = fread(url_cleveland,
na.strings = "?",
col.names = c("age","sex", "cp", "trestbps", "chol",
"fbs", "restecg", "thalach", "exang",
"oldpeak", "slope", "ca", "thal", "num"))
mydat_switzerland = fread(url_switzerland,
na.strings = "?",
col.names = c("age","sex", "cp", "trestbps", "chol",
"fbs", "restecg", "thalach", "exang",
"oldpeak", "slope", "ca", "thal", "num"))
mydat_va = fread(url_va,
na.strings = "?",
col.names = c("age","sex", "cp", "trestbps", "chol",
"fbs", "restecg", "thalach", "exang",
"oldpeak", "slope", "ca", "thal", "num"))
# Adding source column
mydat_hungarian$source = "hungarian"
mydat_cleveland$source = "cleveland"
mydat_switzerland$source = "switzerland"
mydat_va$source = "va"
# Combining data sets
mydat = rbind(mydat_cleveland,
mydat_hungarian,
mydat_switzerland,
mydat_va)
# Converting categorical variables to factor type
mydat$num = factor(mydat$num > 0,
levels = c(FALSE, TRUE),
labels = c("not sick", "sick"))
mydat$sex = factor(mydat$sex,
levels = c(0, 1),
labels = c("female", "male"))
mydat$cp = factor(mydat$cp,
levels = c(1, 2, 3, 4),
labels = c("typical angina", "atypical angina",
"non-anginal pain", "asymptomatic"))
mydat$exang = factor(mydat$exang,
levels = c(0, 1),
labels = c("no", "yes"))
mydat$slope = factor(mydat$slope,
levels = c(1, 2, 3),
labels = c("upsloping", "flat", "downsloping"))
mydat$thal = factor(mydat$thal,
levels = c(3, 6, 7),
labels = c("normal","fixed defect","reversable defect"))
mydat$fbs = factor(mydat$fbs,
levels = c(0, 1),
labels = c("false", "true"))
mydat$restecg = as.factor(mydat$restecg)
# Summary of the data
attach(mydat)
summary(mydat)## age sex cp trestbps
## Min. :28.00 female:194 typical angina : 46 Min. : 0.0
## 1st Qu.:47.00 male :726 atypical angina :174 1st Qu.:120.0
## Median :54.00 non-anginal pain:204 Median :130.0
## Mean :53.51 asymptomatic :496 Mean :132.1
## 3rd Qu.:60.00 3rd Qu.:140.0
## Max. :77.00 Max. :200.0
## NA's :59
## chol fbs restecg thalach exang
## Min. : 0.0 false:692 0 :551 Min. : 60.0 no :528
## 1st Qu.:175.0 true :138 1 :179 1st Qu.:120.0 yes :337
## Median :223.0 NA's : 90 2 :188 Median :140.0 NA's: 55
## Mean :199.1 NA's: 2 Mean :137.5
## 3rd Qu.:268.0 3rd Qu.:157.0
## Max. :603.0 Max. :202.0
## NA's :30 NA's :55
## oldpeak slope ca
## Min. :-2.6000 upsloping :203 Min. :0.0000
## 1st Qu.: 0.0000 flat :345 1st Qu.:0.0000
## Median : 0.5000 downsloping: 63 Median :0.0000
## Mean : 0.8788 NA's :309 Mean :0.6764
## 3rd Qu.: 1.5000 3rd Qu.:1.0000
## Max. : 6.2000 Max. :3.0000
## NA's :62 NA's :611
## thal num source
## normal :196 not sick:411 Length:920
## fixed defect : 46 sick :509 Class :character
## reversable defect:192 Mode :character
## NA's :486
##
##
##
We can notice that some of the variables have a lot of missing values and if we simple delete observations that have at least one missing values we will lose a lot of observation which can affect our model. Thus, I am going to consider two data sets below.
d1, that has all the variables mentioned above (except source) without observations that has at least one missing value.d2, one without variables ca, thal, slope and source, ones that have most of the missing values.# Data set with all variables exept 'source' and observations that has
# at least one missing values
d_1 = mydat[complete.cases(mydat[, -15]), -15]
# Data set without variables ca, thal, slope and source and without
# observations that have at least one missing value
d_2 = mydat[complete.cases(mydat[, -c(11, 12, 13, 15)]), -c(11, 12, 13, 15)]
# Summary of d_1 and d_2
summary(d_1)## age sex cp trestbps
## Min. :29.00 female: 96 typical angina : 23 Min. : 94.0
## 1st Qu.:48.00 male :203 atypical angina : 49 1st Qu.:120.0
## Median :56.00 non-anginal pain: 83 Median :130.0
## Mean :54.52 asymptomatic :144 Mean :131.7
## 3rd Qu.:61.00 3rd Qu.:140.0
## Max. :77.00 Max. :200.0
## chol fbs restecg thalach exang
## Min. :100.0 false:256 0:149 Min. : 71.0 no :200
## 1st Qu.:211.0 true : 43 1: 4 1st Qu.:132.5 yes: 99
## Median :242.0 2:146 Median :152.0
## Mean :246.8 Mean :149.3
## 3rd Qu.:275.5 3rd Qu.:165.5
## Max. :564.0 Max. :202.0
## oldpeak slope ca
## Min. :0.000 upsloping :139 Min. :0.0000
## 1st Qu.:0.000 flat :139 1st Qu.:0.0000
## Median :0.800 downsloping: 21 Median :0.0000
## Mean :1.059 Mean :0.6722
## 3rd Qu.:1.600 3rd Qu.:1.0000
## Max. :6.200 Max. :3.0000
## thal num
## normal :164 not sick:160
## fixed defect : 18 sick :139
## reversable defect:117
##
##
##
summary(d_2)## age sex cp trestbps
## Min. :28.0 female:174 typical angina : 37 Min. : 0.0
## 1st Qu.:46.0 male :566 atypical angina :150 1st Qu.:120.0
## Median :54.0 non-anginal pain:161 Median :130.0
## Mean :53.1 asymptomatic :392 Mean :132.8
## 3rd Qu.:60.0 3rd Qu.:140.0
## Max. :77.0 Max. :200.0
## chol fbs restecg thalach exang
## Min. : 0.0 false:629 0:445 Min. : 60.0 no :444
## 1st Qu.:197.0 true :111 1:120 1st Qu.:120.0 yes:296
## Median :231.0 2:175 Median :140.0
## Mean :220.1 Mean :138.7
## 3rd Qu.:271.0 3rd Qu.:159.2
## Max. :603.0 Max. :202.0
## oldpeak num
## Min. :-1.0000 not sick:357
## 1st Qu.: 0.0000 sick :383
## Median : 0.5000
## Mean : 0.8943
## 3rd Qu.: 1.5000
## Max. : 6.2000
In order to avoid any preordered structure of the data, before splitting into training and testing sets, I shuffle data.
# Set random seed for reproducible research
set.seed(97459)
# Shuffle the data
n_1 = nrow(d_1)
n_2 = nrow(d_2)
d_1 = d_1[sample(n_1), ]
d_2 = d_2[sample(n_2), ]I split data into training and testing sets (75% vs 25%). I am using training set to create models and testing set for assessing predictive power of my models.
# Split the data in train and test
split = 0.75
d1_train = d_1[1:round(split * n_1), ]
d1_test = d_1[(round(split * n_1) + 1):n_1, ]
d2_train = d_2[1:round(split * n_2), ]
d2_test = d_2[(round(split * n_2) + 1):n_2, ]
# Check class bias
table(d_1$num, dnn="Overal d_1")## Overal d_1
## not sick sick
## 160 139
table(d1_train$num, dnn="training d_1")## training d_1
## not sick sick
## 117 107
table(d1_test$num, dnn="testing d_1")## testing d_1
## not sick sick
## 43 32
table(d_2$num, dnn="Overal d_2")## Overal d_2
## not sick sick
## 357 383
table(d2_train$num, dnn="training d_2")## training d_2
## not sick sick
## 264 291
table(d2_test$num, dnn="testing d_2")## testing d_2
## not sick sick
## 93 92
We can see that for both d_1 and d_2, training and testing sets are well balanced.
To select best model, we use stepwise procedures (backward, forward, both).
# Creating full(including all varibales) and null(intercept only) models for d_1
d1_logit_full = glm(num~., data = d1_train, family = binomial)
d1_logit_null = glm(num~1, data = d1_train, family = binomial)
# Creating full(including all varibales) and null(intercept only) models for d_2
d2_logit_full = glm(num~., data = d2_train, family = binomial)
d2_logit_null = glm(num~1, data = d2_train, family = binomial)# Forward selection procedure
forward_d1 = step(d1_logit_null, scope = list(lower = d1_logit_null, upper = d1_logit_full), direction = "forward")
forward_d2 = step(d2_logit_null, scope = list(lower = d2_logit_null, upper = d2_logit_full), direction = "forward")
# Backward selection procedure
backward_d1 = step(d1_logit_full, scope = list(lower = d1_logit_null, upper = d1_logit_full), direction = "backward")
backward_d2 = step(d2_logit_full, scope = list(lower = d2_logit_null, upper = d2_logit_full), direction = "backward")
# Both
both_d1 = step(d1_logit_full, scope = list(lower = d1_logit_null, upper = d1_logit_full), direction = "both")
both_d2 = step(d2_logit_full, scope = list(lower = d2_logit_null, upper = d2_logit_full), direction = "both")Below are the models returned by each procedure using AIC criterion for both data sets.
Forward for data set d_1
forward_d1$formula## num ~ thal + exang + ca + cp + oldpeak + sex
Backward for data set d_1
backward_d1$formula## num ~ sex + cp + exang + oldpeak + ca + thal
Both for data set d_1
both_d1$formula## num ~ sex + cp + exang + oldpeak + ca + thal
Forward for data set d_2
forward_d2$formula## num ~ cp + oldpeak + sex + thalach + fbs + exang + age + chol
Backward for data set d_2
backward_d2$formula## num ~ age + sex + cp + chol + fbs + thalach + exang + oldpeak
Both for data set d_2
both_d2$formula## num ~ age + sex + cp + chol + fbs + thalach + exang + oldpeak
We can notice that for both data sets d1 and d2, models returned by all three procedures are similar.
log_d1 = glm(both_d1$formula, data=d1_train, family = binomial)
log_d2 = glm(both_d2$formula, data=d2_train, family = binomial)
summary(log_d1)##
## Call:
## glm(formula = both_d1$formula, family = binomial, data = d1_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4224 -0.4991 -0.1861 0.4032 2.4807
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.13197 0.80415 -3.895 9.83e-05 ***
## sexmale 1.01733 0.50383 2.019 0.04347 *
## cpatypical angina -0.07704 0.82230 -0.094 0.92536
## cpnon-anginal pain -0.91516 0.71307 -1.283 0.19935
## cpasymptomatic 0.97346 0.70574 1.379 0.16779
## exangyes 1.52514 0.50148 3.041 0.00236 **
## oldpeak 0.61890 0.22052 2.807 0.00501 **
## ca 0.95478 0.24338 3.923 8.74e-05 ***
## thalfixed defect 1.33200 0.98519 1.352 0.17637
## thalreversable defect 1.09442 0.46571 2.350 0.01877 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 310.08 on 223 degrees of freedom
## Residual deviance: 159.48 on 214 degrees of freedom
## AIC: 179.48
##
## Number of Fisher Scoring iterations: 6
summary(log_d2)##
## Call:
## glm(formula = both_d2$formula, family = binomial, data = d2_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5017 -0.5696 0.1628 0.5591 2.6430
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.359753 1.455254 -1.622 0.10490
## age 0.027506 0.014674 1.874 0.06087 .
## sexmale 1.403713 0.316186 4.440 9.02e-06 ***
## cpatypical angina -0.079274 0.570184 -0.139 0.88942
## cpnon-anginal pain 0.016324 0.538130 0.030 0.97580
## cpasymptomatic 1.810697 0.528339 3.427 0.00061 ***
## chol -0.001962 0.001365 -1.437 0.15074
## fbstrue 0.958730 0.347534 2.759 0.00580 **
## thalach -0.012228 0.005472 -2.234 0.02545 *
## exangyes 0.730213 0.274659 2.659 0.00785 **
## oldpeak 0.826590 0.148694 5.559 2.71e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 768.08 on 554 degrees of freedom
## Residual deviance: 445.08 on 544 degrees of freedom
## AIC: 467.08
##
## Number of Fisher Scoring iterations: 5
Let’s check whether variables cp is valuable in log_d1 model.
# Anova test using Chisquare distribution (deviance residuals follow Chisquare distribution)
m_f = glm(num~sex + exang + oldpeak + ca + thal, data = d1_train, family = binomial)
anova(log_d1, m_f, test = "Chisq")## Analysis of Deviance Table
##
## Model 1: num ~ sex + cp + exang + oldpeak + ca + thal
## Model 2: num ~ sex + exang + oldpeak + ca + thal
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 214 159.48
## 2 217 173.84 -3 -14.354 0.002461 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova test shows that we should leave cp in the model. Now, lets check whether we should leave chol in log_d2 model.
# Anova test using Chisquare distribution (deviance residuals follow Chisquare distribution)
m_f = glm(num~ age + sex + cp + exang + oldpeak + fbs + thalach, data = d2_train, family = binomial)
anova(log_d2, m_f, test = "Chisq")## Analysis of Deviance Table
##
## Model 1: num ~ age + sex + cp + chol + fbs + thalach + exang + oldpeak
## Model 2: num ~ age + sex + cp + exang + oldpeak + fbs + thalach
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 544 445.08
## 2 545 447.20 -1 -2.1237 0.145
# Update the log_d2 model by excluding `chol` variable
log_d2 = glm(num~ age + sex + cp + exang + oldpeak + fbs + thalach, data = d2_train, family = binomial)Based on the above test, we can exclude the variable chol from the model.
Once we selected models for two data sets, let check some of the regression assumptions.
# QQ plot for deviance residuals
par(mfrow = c (1,2))
qqPlot(residuals(log_d1, "pearson"), main = "Pearson Residuals for log_d1")
qqPlot(residuals(log_d1, "deviance"), main = "Deviance Residuals for log_d1")qqPlot(residuals(log_d2, "pearson"), main = "Pearson Residuals for log_d2")
qqPlot(residuals(log_d2, "deviance"), main = "Deviance Residuals for log_d2")We can notice that normality of Pearson and deviance residuals violated for log_d1. For log_d2 we can notice that deviance residuals are more or less resembling normal distribution. Although, an assumption for normality is essential for multiple linear regression case, in logistic regression it is not as crucial and most of the times is violated. I leave this analysis for future research.
par(mrflow = c(1,2))
# Evaluate homoscedasticity
scatter.smooth(log_d1$fitted.values,
residuals(log_d1,"pearson"),
span = 2,
lpars = list(col = "green", lwd = 3, lty = 3))scatter.smooth(log_d2$fitted.values,
residuals(log_d2,"pearson"),
span = 2,
lpars = list(col = "green", lwd = 3, lty = 3))We can see that for both models, lowess line goes vertically through the origin. It tells us that an assumptions about constant variance is supported.
# Specification test
residualPlots(log_d1, tests = FALSE)residualPlots(log_d2, tests = FALSE)By looking at the graphs Pearson residuals vs numeric covariates, we can see that lowess lines goes vertically through the origin for both of the models. It means the assumption of linear specification of log odds ratio vs covariates is supported.
# Detecting outliers using Bonferroni Outlier Test
o1 = names(Residuals(log_d1, type = "standard.deviance")[abs(Residuals(log_d1, type = "standard.deviance")) > 2] )
o2 = names(Residuals(log_d2, type = "standard.deviance")[abs(Residuals(log_d2, type = "standard.deviance")) > 2] )
par(mrflow = c(1,2))
# Ploting fitted vs standard pearson residuals ( Checking outliers)
plot(log_d1$fitted.values, Residuals(log_d1, type = "standard.deviance") )
points(log_d1$fitted.values[o1], Residuals(log_d1, type = "standard.deviance")[o1], col="red", pch=16)plot(log_d2$fitted.values, Residuals(log_d2, type = "standard.deviance") )
points(log_d2$fitted.values[o2], Residuals(log_d2, type = "standard.deviance")[o2], col="red", pch=16)By looking at standard deviance residual vs fitted values plot, we can see that there might be some outliers if we set up a 2 sigma rule. Since I am not a specialist in the area of heart disease, the decision of how to deal with these observation I leave for further research.
# Calulating different values for checking for influential observations
inflm.SR_d1 = influence.measures(log_d1)
inflm.SR_d2 = influence.measures(log_d2)
# What observations are influential
id1 = as.numeric(names(which(apply(inflm.SR_d1$is.inf, 1, any))))
id2 = as.numeric(names(which(apply(inflm.SR_d2$is.inf, 1, any))))
summary(inflm.SR_d1)## Potentially influential observations of
## glm(formula = both_d1$formula, family = binomial, data = d1_train) :
##
## dfb.1_ dfb.sxml dfb.cpta dfb.cp-p dfb.cpsy dfb.exng dfb.oldp dfb.ca
## 3 0.05 -0.03 -0.08 -0.06 -0.22 0.24 0.04 0.18
## 17 -0.20 0.00 0.22 0.45 0.36 0.07 -0.61 0.13
## 37 0.09 -0.10 0.00 0.01 -0.05 -0.24 0.11 -0.12
## 39 -0.33 0.07 0.27 0.29 0.32 0.04 0.15 0.11
## 44 0.03 -0.09 0.05 -0.14 0.00 -0.02 0.18 -0.40
## 53 0.06 0.07 -0.05 0.14 0.00 -0.05 -0.15 -0.12
## 62 -0.03 0.00 0.01 0.08 -0.04 0.18 -0.07 0.04
## 80 0.17 0.01 -0.07 -0.17 -0.04 0.03 -0.08 -0.46
## 103 0.13 -0.24 0.19 -0.01 -0.04 -0.07 -0.05 0.07
## 118 0.06 0.07 -0.05 0.14 0.00 -0.05 -0.15 -0.12
## 127 -0.09 -0.03 0.09 0.16 0.06 -0.09 0.15 0.01
## 137 0.11 -0.05 -0.02 -0.01 -0.12 0.06 0.10 -0.39
## 147 -0.15 0.16 0.11 0.20 0.11 -0.11 0.52 -0.19
## 159 0.44 0.07 -0.41 -0.39 -0.42 0.01 -0.18 -0.12
## 161 0.21 -0.22 -0.07 0.01 0.07 -0.18 -0.12 -0.12
## 213 0.21 -0.22 -0.07 0.01 0.07 -0.18 -0.12 -0.12
## dfb.thlfd dfb.thlrd dffit cov.r cook.d hat
## 3 -0.89 0.01 -1.02_* 1.18_* 0.06 0.24_*
## 17 0.02 -0.13 -0.86_* 0.92 0.07 0.12
## 37 0.03 -0.15 -0.40 0.76_* 0.03 0.02
## 39 -0.55 -0.01 -0.70_* 1.33_* 0.02 0.26_*
## 44 0.09 0.11 -0.55 1.11 0.02 0.14_*
## 53 -0.06 -0.11 0.34 0.71_* 0.03 0.01
## 62 0.39 0.01 0.48 1.26_* 0.01 0.20_*
## 80 0.01 -0.23 -0.61 0.83_* 0.05 0.06
## 103 0.03 0.04 0.47 0.79_* 0.03 0.03
## 118 -0.06 -0.11 0.34 0.71_* 0.03 0.01
## 127 0.58 0.01 0.67_* 1.27_* 0.02 0.23_*
## 137 0.01 -0.17 -0.50 0.73_* 0.06 0.03
## 147 -0.10 -0.25 0.70_* 1.00 0.03 0.12
## 159 -0.10 -0.18 0.56 0.82_* 0.04 0.05
## 161 -0.01 -0.05 0.41 0.80_* 0.02 0.03
## 213 -0.01 -0.05 0.41 0.80_* 0.02 0.03
summary(inflm.SR_d2)## Potentially influential observations of
## glm(formula = num ~ age + sex + cp + exang + oldpeak + fbs + thalach, family = binomial, data = d2_train) :
##
## dfb.1_ dfb.age dfb.sxml dfb.cpta dfb.cp-p dfb.cpsy dfb.exng dfb.oldp
## 7 -0.12 0.03 0.08 0.11 0.12 0.13 0.02 0.02
## 15 0.14 -0.15 -0.07 -0.07 -0.05 -0.12 0.14 -0.19
## 32 -0.04 0.05 -0.12 0.07 0.00 0.00 -0.02 -0.04
## 37 0.03 -0.03 -0.06 -0.02 -0.01 -0.04 -0.07 -0.06
## 68 -0.02 0.17 -0.17 0.00 0.08 -0.01 -0.07 -0.09
## 74 0.02 0.06 -0.05 -0.04 -0.02 -0.06 -0.05 -0.18
## 78 -0.04 -0.08 -0.07 0.31 0.34 0.36 -0.17 0.01
## 89 0.05 -0.08 -0.18 -0.01 0.07 0.00 -0.01 -0.07
## 100 0.26 -0.14 0.02 -0.34 -0.34 -0.36 -0.02 -0.01
## 135 0.13 -0.08 -0.07 -0.04 -0.02 -0.07 -0.07 -0.11
## 144 -0.16 0.10 0.05 0.10 0.01 0.02 0.00 -0.04
## 150 -0.11 0.10 0.05 0.11 0.01 0.01 -0.02 -0.04
## 165 -0.07 0.02 0.09 0.01 -0.03 0.01 0.03 0.03
## 172 0.14 -0.13 -0.06 0.00 -0.08 0.01 -0.18 0.12
## 178 -0.22 0.10 -0.03 0.34 0.35 0.39 -0.18 0.05
## 206 0.09 -0.02 0.01 -0.12 -0.12 -0.12 -0.02 0.02
## 213 0.07 0.07 0.04 -0.33 -0.34 -0.36 0.01 -0.14
## 214 0.03 -0.14 0.04 0.15 0.04 0.06 -0.09 0.28
## 218 0.00 0.04 -0.16 0.11 0.01 0.02 -0.06 0.05
## 222 -0.03 0.08 0.04 -0.20 -0.22 -0.21 -0.01 0.02
## 240 0.04 0.12 -0.05 -0.06 -0.14 -0.08 0.09 -0.31
## 249 -0.22 0.14 0.23 -0.01 -0.01 -0.06 0.13 -0.08
## 259 0.00 -0.04 0.03 0.04 0.07 0.04 -0.07 0.17
## 292 -0.05 0.01 0.07 0.10 0.11 0.11 0.00 0.02
## 323 0.05 -0.04 -0.17 0.00 0.08 0.00 -0.05 0.01
## 330 0.09 -0.10 0.03 -0.22 -0.24 -0.22 -0.01 0.08
## 332 0.13 -0.16 0.01 0.07 -0.02 -0.03 -0.01 -0.03
## 339 0.05 0.07 0.05 -0.32 -0.33 -0.34 -0.01 -0.07
## 347 -0.01 0.10 0.03 0.01 0.06 0.01 -0.05 -0.04
## 364 0.10 0.01 0.02 -0.04 0.04 -0.06 0.10 -0.10
## 371 0.21 -0.12 0.01 -0.34 -0.33 -0.36 0.03 -0.15
## 382 -0.15 -0.04 0.15 0.21 0.24 0.24 0.08 -0.10
## 390 0.10 -0.05 0.03 -0.17 -0.18 -0.18 -0.05 0.13
## 395 0.11 0.12 0.04 -0.35 -0.35 -0.38 -0.01 -0.17
## 396 -0.03 0.00 0.10 -0.05 0.01 0.02 -0.08 0.05
## 399 -0.07 -0.09 -0.03 0.27 0.29 0.29 0.00 0.12
## 424 0.03 0.04 0.04 -0.16 -0.18 -0.17 -0.05 0.09
## 434 -0.09 0.14 -0.04 0.01 0.01 -0.02 -0.10 -0.07
## 438 0.02 -0.09 -0.06 0.01 0.02 -0.01 -0.10 0.08
## 456 -0.13 0.03 -0.04 0.26 0.27 0.28 0.05 -0.07
## 466 -0.01 -0.06 -0.03 0.16 0.17 0.17 0.01 0.05
## 489 -0.04 -0.02 -0.04 0.22 0.23 0.23 0.03 -0.05
## 491 -0.02 -0.05 0.07 0.11 0.12 0.12 0.03 -0.01
## 507 -0.05 0.01 0.04 -0.02 0.08 -0.01 -0.01 -0.08
## 508 -0.03 0.03 -0.04 0.03 0.02 0.04 -0.05 0.10
## 526 0.06 -0.06 -0.06 -0.02 -0.01 -0.04 -0.07 -0.06
## 528 0.07 -0.11 -0.06 -0.04 -0.03 -0.09 0.13 -0.12
## 535 -0.11 0.04 -0.02 -0.02 -0.01 -0.05 0.10 -0.07
## 539 -0.04 -0.05 -0.06 0.26 0.29 0.27 0.06 -0.13
## 545 -0.05 0.00 -0.04 0.00 0.01 -0.01 -0.07 -0.01
## 551 -0.10 0.11 -0.02 0.23 0.24 0.26 -0.15 0.03
## dfb.fbst dfb.thlc dffit cov.r cook.d hat
## 7 0.02 0.08 -0.17 1.07_* 0.00 0.05
## 15 0.04 -0.04 -0.29 0.92_* 0.01 0.01
## 32 -0.03 0.08 0.21 0.87_* 0.02 0.00
## 37 0.02 0.01 -0.17 0.91_* 0.01 0.00
## 68 -0.08 -0.04 0.34 0.93_* 0.02 0.02
## 74 0.00 -0.03 -0.24 0.89_* 0.02 0.01
## 78 0.08 -0.04 -0.44_* 1.02 0.02 0.06_*
## 89 0.18 0.05 0.32 0.91_* 0.02 0.01
## 100 -0.05 -0.11 0.42_* 1.03 0.01 0.06_*
## 135 0.03 -0.10 -0.22 0.91_* 0.01 0.01
## 144 -0.05 0.20 0.28 0.92_* 0.02 0.01
## 150 -0.06 0.10 0.26 0.94_* 0.01 0.01
## 165 -0.09 0.08 -0.17 1.06_* 0.00 0.05
## 172 -0.18 -0.10 -0.38 1.03 0.01 0.06_*
## 178 0.05 0.09 -0.43_* 1.04 0.01 0.07_*
## 206 0.08 -0.08 0.19 1.07_* 0.00 0.05_*
## 213 -0.09 0.06 0.42_* 0.98 0.02 0.04
## 214 0.00 0.01 0.38 1.06_* 0.01 0.07_*
## 218 -0.04 0.02 0.27 0.92_* 0.01 0.01
## 222 0.14 0.11 0.34 1.07_* 0.01 0.08_*
## 240 0.01 -0.11 -0.42_* 1.06_* 0.01 0.08_*
## 249 -0.27 0.22 -0.44_* 1.02 0.02 0.06_*
## 259 -0.01 -0.01 0.20 1.06_* 0.00 0.05
## 292 -0.07 -0.01 -0.17 1.06_* 0.00 0.05
## 323 -0.03 0.01 0.25 0.91_* 0.01 0.01
## 330 0.18 0.06 0.37 1.08_* 0.01 0.08_*
## 332 -0.01 -0.06 0.27 0.93_* 0.01 0.01
## 339 -0.09 0.08 0.41_* 1.00 0.01 0.05
## 347 0.11 -0.09 0.26 1.06_* 0.00 0.06_*
## 364 -0.05 -0.17 0.28 1.06_* 0.00 0.06_*
## 371 -0.05 -0.01 0.41_* 0.95 0.02 0.03
## 382 0.06 0.11 -0.38 1.08_* 0.01 0.09_*
## 390 -0.03 -0.02 0.28 1.07_* 0.00 0.07_*
## 395 -0.11 -0.06 0.46_* 1.02 0.02 0.06_*
## 396 -0.11 0.03 -0.21 1.06_* 0.00 0.06_*
## 399 -0.15 0.03 -0.40 1.07_* 0.01 0.08_*
## 424 -0.04 0.00 0.26 1.06_* 0.00 0.06_*
## 434 0.00 0.05 -0.23 0.93_* 0.01 0.01
## 438 0.05 0.07 -0.23 0.94_* 0.01 0.01
## 456 0.06 0.05 -0.35 1.04 0.01 0.06_*
## 466 0.05 -0.03 -0.21 1.06_* 0.00 0.05
## 489 0.05 -0.05 -0.30 1.05 0.01 0.06_*
## 491 0.03 -0.01 -0.18 1.06_* 0.00 0.05
## 507 -0.05 0.11 0.24 0.93_* 0.01 0.01
## 508 -0.01 0.02 0.12 1.06_* 0.00 0.04
## 526 0.03 -0.02 -0.18 0.91_* 0.01 0.01
## 528 0.04 0.04 -0.24 0.94_* 0.01 0.01
## 535 -0.17 0.20 -0.30 0.90_* 0.02 0.01
## 539 -0.21 -0.03 -0.46_* 1.00 0.02 0.05
## 545 0.02 0.11 -0.20 0.91_* 0.01 0.01
## 551 0.02 -0.07 -0.32 1.08_* 0.01 0.08_*
# from what data set they come from?
table(mydat$source[id1])##
## cleveland
## 16
table(mydat$source[id2])##
## cleveland hungarian
## 24 27
Based on the above analysis, using hat values and cooks distance, we have some leverage points in both models. We should properly investigate them before we decide whether to exclude them or use other technique of dealing with influential observations. Since I am not a specialist in heart disease, I will leave this analysis for a specialist in the field.
Now lets see how our models perform on our testing data. For that purposes I created a created a function that returns best combination of threshold and accuracy along with True positive and False Positive rates.
# Perform classification of testing data
log_pred_d1 = predict(log_d1, # specifing the model given by model selection procedure
d1_test, # predicting on the testing data
type="response") # predicting probability scores
log_pred_d2 = predict(log_d2, # specifing the model given by model selection procedure
d2_test, # predicting on the testing data
type="response") # predicting probability scores
# Function that finds the treshold that maximizes the accuracy
optim = function(precision, # How many models you want to compare
pred,
dataset) { # Predicted values
accuracy = matrix(0, nrow = precision, ncol = 4)
k = 1/precision
for (i in 1:precision) {
f = factor(pred > i*k,
levels = c(FALSE, TRUE),
labels = c("not sick", "sick"))
t = table(dataset$num, f)
accuracy[i, 1] = i*k
accuracy[i, 2] = sum(diag(t)/sum(t))
accuracy[i, 3] = t[1, 2]/sum(t[1,])
accuracy[i, 4] = t[2, 2]/sum(t[2,])
}
output = list(highest_accuracy = max(accuracy[ ,2]), # Highest accuracy
optimal_treshold = accuracy[which.max(accuracy[ ,2]), 1], # Optimal threshold
matrix = accuracy, # All possible models
False.positive = accuracy[which.max(accuracy[ ,2]), 3],
True.positive = accuracy[which.max(accuracy[ ,2]), 4])
return(output)
}
# Function that returns accuracy, false positive and true positive rates
ac = function(conf) {
accuracy = round(sum(diag(conf))/sum(conf), 2)
False.positive = round(conf[1, 2]/sum(conf[1,]), 2)
True.positive = round(conf[2, 2]/sum(conf[2,]), 2)
output = list(accuracy = accuracy,
False.positive = False.positive,
True.positive = True.positive)
return(output)
}
opt_d1 = optim(1000, log_pred_d1, d1_test)
opt_d2 = optim(1000, log_pred_d2, d2_test)
# Ploting Accuracy vs Treshold
plot(x= opt_d1$matrix[,1],
y = opt_d1$matrix[,2],
col=ifelse(opt_d1$matrix[, 1] == opt_d1$optimal_treshold, "red", "black"),
ylab = "Accuracy",
xlab = "Treshold",
type = 'l',
main = "Best treshold for log_d1")
abline(v = opt_d1$optimal_treshold, lty = 2)plot(x= opt_d2$matrix[,1],
y = opt_d2$matrix[,2],
col = ifelse(opt_d1$matrix[, 1] == opt_d2$optimal_treshold, "red", "black"),
ylab = "Accuracy",
xlab = "Treshold",
type = "l",
main = "Best treshold for log_d2")
abline(v = opt_d2$optimal_treshold, lty = 2)# The best combination of treshhold and accuracy
data.frame("Optimal Treshold" = opt_d1$optimal_treshold,
"Highest Accuracy" = opt_d1$highest_accuracy,
"False.positive" = opt_d1$False.positive,
"True.positive" = opt_d1$True.positive)## Optimal.Treshold Highest.Accuracy False.positive True.positive
## 1 0.633 0.8266667 0.1627907 0.8125
data.frame("Optimal Treshold" = opt_d2$optimal_treshold,
"Highest Accuracy" = opt_d2$highest_accuracy,
"False.positive" = opt_d2$False.positive,
"True.positive" = opt_d2$True.positive)## Optimal.Treshold Highest.Accuracy False.positive True.positive
## 1 0.411 0.772973 0.2903226 0.8369565
f1 = factor(log_pred_d1> opt_d1$optimal_treshold,
levels = c(FALSE, TRUE),
labels = c("not sick", "sick"))
conf1 = table(d1_test$num, f1, dnn=c("Actual", "Predicted"))
f2 = factor(log_pred_d2> opt_d2$optimal_treshold,
levels = c(FALSE, TRUE),
labels = c("not sick", "sick"))
conf2 = table(d2_test$num, f2, dnn=c("Actual", "Predicted"))
summ_d1 = rbind(log_d1 = ac(conf1))
summ_d1## accuracy False.positive True.positive
## log_d1 0.83 0.16 0.81
summ_d2 = rbind(log_d2 = ac(conf2))
summ_d2## accuracy False.positive True.positive
## log_d2 0.77 0.29 0.84
So, based on the above analysis logistic model created for d1 has accuracy of 83% with TP rate of 81%. Although, the model for d2 has lower accuracy which is 77%, it has higher TP rate.
One of the ways to asses a model is to create a ROC curve that shows a different combinations of TP and FP rates for different thresholds. Model with higher AUC (area under the curve) considered to be better. This procedure is especially useful when dealing with high-bias data, that is, where one class is much more common than the other. But, in our case we have well balanced data, so decision based on simple accuracy metric is also a good one.
# Function that helps to construct ROC and AUC
perform = function (predicted, testdata) {
result = list()
pred = prediction(predicted, testdata)
perf1 = performance(pred, "tpr", "fpr")
perf2 = performance(pred, "auc")
result = c(result, roc = perf1, auc = perf2@y.values[[1]])
}
# ROC curves for log_d1 and log_d2
plot(perform(log_pred_d1, d1_test$num)$roc, col = "red")
plot(perform(log_pred_d2, d2_test$num)$roc, add = TRUE, col = "blue")
legend("right",
legend = c("log_d1", "log_d2"),
col = c("red", "blue"),
lty = 1,
cex = 0.8)# AUC for log_d1 and log_d2
AUC_d1 = rbind(log_d1 = perform(log_pred_d1, d1_test$num)$auc)
AUC_d1## [,1]
## log_d1 0.8968023
AUC_d2 = rbind(log_d2 = perform(log_pred_d2, d2_test$num)$auc)
AUC_d2## [,1]
## log_d2 0.8481767
As we can see both models have pretty high AUC, meaning they both are good in predicting heart disease.
For the initial model, we chose very low complexity parameter (cp). Below are trees constructed using both Gini Index and Entropy measure.
# Build a tree model using GINI index
tree_g_d1 = rpart(num ~ .,
data = d1_train,
method = "class",
control = rpart.control(cp = 0.00001),
parms = list(split="gini"))
tree_g_d2 = rpart(num ~ .,
data = d2_train,
method = "class",
control = rpart.control(cp = 0.00001),
parms = list(split="gini"))
# Build a tree model using Entropy index
tree_e_d1 = rpart(num ~ .,
data = d1_train,
method = "class",
control = rpart.control(cp=0.00001),
parms = list(split="information"))
tree_e_d2 = rpart(num ~ .,
data = d2_train,
method = "class",
control = rpart.control(cp=0.00001),
parms = list(split="information"))
# Ploting tree
fancyRpartPlot(tree_g_d1,
sub = "Tree based on GINI Index for d_1")fancyRpartPlot(tree_e_d1,
sub = "Tree based on Entropy for d_1")# Ploting tree
fancyRpartPlot(tree_g_d2,
sub = "Tree based on GINI Index for d_2")fancyRpartPlot(tree_e_d2,
sub = "Tree based on Entropy for d_2")# Using our model to predict the test set
pred_g_d1 = predict(tree_g_d1,
d1_test,
type = "class")
pred_e_d1 = predict(tree_e_d1,
d1_test,
type = "class")
pred_g_d2 = predict(tree_g_d2,
d2_test,
type = "class")
pred_e_d2 = predict(tree_e_d2,
d2_test,
type = "class")
# Construct the confusion matrix for d1 (gini vs entropy)
conf_g_d1 = table(d1_test$num,
pred_g_d1, dnn=c("Actual:", "Predicted:"))
conf_e_d1 = table(d1_test$num,
pred_e_d1, dnn=c("Actual:", "Predicted:"))
# Construct the confusion matrix for d2 (gini vs entropy)
conf_g_d2 = table(d2_test$num,
pred_g_d2, dnn=c("Actual:", "Predicted:"))
conf_e_d2 = table(d2_test$num,
pred_e_d2, dnn=c("Actual:", "Predicted:"))
summ_d1 = rbind(summ_d1, dtree_gini_d1 = ac(conf_g_d1),
dtree_entropy_d1 = ac(conf_e_d1))
summ_d1## accuracy False.positive True.positive
## log_d1 0.83 0.16 0.81
## dtree_gini_d1 0.76 0.26 0.78
## dtree_entropy_d1 0.83 0.23 0.91
summ_d2 = rbind(summ_d2, dtree_gini_d2 = ac(conf_g_d2),
dtree_entropy_d2 = ac(conf_e_d2))
summ_d2## accuracy False.positive True.positive
## log_d2 0.77 0.29 0.84
## dtree_gini_d2 0.76 0.22 0.73
## dtree_entropy_d2 0.76 0.26 0.78
Based on the above tables. we can notice for d1, the model that uses entropy measure gives higher accuracy as well as false and true positive rates. If we compare with logistic model, model based on entropy gives us higher true positive rate, but lower false positive rate.
For d2 logistic model performs better than decision tree models in terms of accuracy and true positive rate.
To avoid over-fitting lets try to prune our trees based on the value of complexity parameter that minimizes cross-validated error.
# Cp table for d1
printcp(tree_g_d1)##
## Classification tree:
## rpart(formula = num ~ ., data = d1_train, method = "class", parms = list(split = "gini"),
## control = rpart.control(cp = 1e-05))
##
## Variables actually used in tree construction:
## [1] ca cp oldpeak thal trestbps
##
## Root node error: 107/224 = 0.47768
##
## n= 224
##
## CP nsplit rel error xerror xstd
## 1 0.5046729 0 1.00000 1.00000 0.069868
## 2 0.0607477 1 0.49533 0.61682 0.063767
## 3 0.0233645 3 0.37383 0.54206 0.061272
## 4 0.0186916 5 0.32710 0.55140 0.061611
## 5 0.0093458 6 0.30841 0.53271 0.060924
## 6 0.0000100 8 0.28972 0.57944 0.062581
printcp(tree_e_d1)##
## Classification tree:
## rpart(formula = num ~ ., data = d1_train, method = "class", parms = list(split = "information"),
## control = rpart.control(cp = 1e-05))
##
## Variables actually used in tree construction:
## [1] age ca cp exang thal
##
## Root node error: 107/224 = 0.47768
##
## n= 224
##
## CP nsplit rel error xerror xstd
## 1 0.5046729 0 1.00000 1.00000 0.069868
## 2 0.0607477 1 0.49533 0.57009 0.062266
## 3 0.0280374 3 0.37383 0.53271 0.060924
## 4 0.0093458 5 0.31776 0.45794 0.057824
## 5 0.0000100 6 0.30841 0.45794 0.057824
# Cross-valiedation error plot for d1
par(mfrow=c(1,2))
plotcp(tree_e_d1, sub = "Entropy_d1")
plotcp(tree_g_d1, sub = "Gini_d1")Based on plots above, complexity parameter that minimizes cross-validated error for d1 is 0.0093458 for model with Gini index, and 0.0093458 for model with Entropy.
# Pruning tree for d1
pruned_g_d1 = prune(tree_g_d1, cp = tree_g_d1$cptable[which.min(tree_g_d1$cptable[,"xerror"]),"CP"])
pruned_e_d1 = prune(tree_e_d1, cp = tree_e_d1$cptable[which.min(tree_e_d1$cptable[,"xerror"]),"CP"])
par(mfrow = c(1,2))
# Ploting trees for Gini on d1
fancyRpartPlot(tree_g_d1,
sub = "Gini on d_1")
fancyRpartPlot(pruned_g_d1,
sub = "Pruned GINI on d_1")# Ploting trees for Entropy on d1
fancyRpartPlot(tree_e_d1,
sub = "Entropy on d_1")
fancyRpartPlot(pruned_e_d1,
sub = "Pruned Entropy on d_1")# Cp table for d2
printcp(tree_g_d2)##
## Classification tree:
## rpart(formula = num ~ ., data = d2_train, method = "class", parms = list(split = "gini"),
## control = rpart.control(cp = 1e-05))
##
## Variables actually used in tree construction:
## [1] age chol cp exang fbs oldpeak restecg
## [8] sex thalach trestbps
##
## Root node error: 264/555 = 0.47568
##
## n= 555
##
## CP nsplit rel error xerror xstd
## 1 0.5227273 0 1.00000 1.00000 0.044565
## 2 0.0454545 1 0.47727 0.47727 0.037382
## 3 0.0113636 2 0.43182 0.48485 0.037590
## 4 0.0056818 13 0.30303 0.51136 0.038286
## 5 0.0037879 15 0.29167 0.51136 0.038286
## 6 0.0000100 16 0.28788 0.51515 0.038382
printcp(tree_e_d2)##
## Classification tree:
## rpart(formula = num ~ ., data = d2_train, method = "class", parms = list(split = "information"),
## control = rpart.control(cp = 1e-05))
##
## Variables actually used in tree construction:
## [1] age chol cp exang fbs oldpeak restecg
## [8] sex thalach trestbps
##
## Root node error: 264/555 = 0.47568
##
## n= 555
##
## CP nsplit rel error xerror xstd
## 1 0.5227273 0 1.00000 1.00000 0.044565
## 2 0.0454545 1 0.47727 0.47727 0.037382
## 3 0.0170455 2 0.43182 0.44697 0.036512
## 4 0.0113636 4 0.39773 0.48485 0.037590
## 5 0.0075758 8 0.35227 0.46591 0.037063
## 6 0.0037879 14 0.30682 0.46970 0.037171
## 7 0.0018939 15 0.30303 0.46212 0.036955
## 8 0.0000100 17 0.29924 0.45833 0.036846
# Cross-valiedation error plot for d2
par(mfrow=c(1,2))
plotcp(tree_e_d2, sub = "Entropy_d1")
plotcp(tree_g_d2, sub = "Gini_d1")Based on plots above, complexity parameter that minimizes cross-validated error for d2 is 0.0454545 for model with Gini index, and 0.0170455 for model with Entropy.
# Pruning tree for d2
pruned_g_d2 = prune(tree_g_d2, cp = tree_g_d2$cptable[which.min(tree_g_d2$cptable[,"xerror"]),"CP"])
pruned_e_d2 = prune(tree_e_d2, cp = tree_e_d2$cptable[which.min(tree_e_d2$cptable[,"xerror"]),"CP"])
par(mfrow = c(1,2))
# Ploting trees for Gini on d2
fancyRpartPlot(tree_g_d2,
sub = "Gini on d_2")
fancyRpartPlot(pruned_g_d2,
sub = "Pruned Gini on d_2")# Ploting trees for Entropy on d2
fancyRpartPlot(tree_e_d2,
sub = "Entropy on d_2")
fancyRpartPlot(pruned_e_d2,
sub = "Pruned Entropy on d_2")After we pruned trees, lets compare new trees with initial ones.
# Using our model to predict the test set
pred_pruned_g_d1 = predict(pruned_g_d1,
d1_test,
type = "class")
pred_pruned_e_d1 = predict(pruned_e_d1,
d1_test,
type = "class")
pred_pruned_g_d2 = predict(pruned_g_d2,
d2_test,
type = "class")
pred_pruned_e_d2 = predict(pruned_e_d2,
d2_test,
type = "class")
# Construct the confusion matrix for d1 (gini vs entropy)
conf_pruned_g_d1 = table(d1_test$num,
pred_pruned_g_d1, dnn=c("Actual:", "Predicted:"))
conf_pruned_e_d1 = table(d1_test$num,
pred_pruned_e_d1, dnn=c("Actual:", "Predicted:"))
# Construct the confusion matrix for d2 (gini vs entropy)
conf_pruned_g_d2 = table(d2_test$num,
pred_pruned_g_d2, dnn=c("Actual:", "Predicted:"))
conf_pruned_e_d2 = table(d2_test$num,
pred_pruned_e_d2, dnn=c("Actual:", "Predicted:"))
summ_d1 = rbind(summ_d1,
pruned_gini_d1 = ac(conf_pruned_g_d1),
pruned_entropy_d1 = ac(conf_pruned_e_d1))
summ_d1## accuracy False.positive True.positive
## log_d1 0.83 0.16 0.81
## dtree_gini_d1 0.76 0.26 0.78
## dtree_entropy_d1 0.83 0.23 0.91
## pruned_gini_d1 0.72 0.37 0.84
## pruned_entropy_d1 0.84 0.14 0.81
It seems that pruning for Entropy based model, reduces false positive rate which is good, however it also reduces true positive rate which is bad. Since we care more about true positive rate, initial model is better.
summ_d2 = rbind(summ_d2,
pruned_gini_d2 = ac(conf_pruned_g_d2),
pruned_entropy_d2 = ac(conf_pruned_e_d2))
summ_d2## accuracy False.positive True.positive
## log_d2 0.77 0.29 0.84
## dtree_gini_d2 0.76 0.22 0.73
## dtree_entropy_d2 0.76 0.26 0.78
## pruned_gini_d2 0.71 0.29 0.72
## pruned_entropy_d2 0.72 0.31 0.75
For d2, pruning makes our model worse in terms of accuracy false positive and true positive rates. However, pruned trees for both Gini and Entropy has low number of splits which is good when we want to interpret results.
# Using our model to predict the test set
pred_g_d1 = predict(tree_g_d1,
d1_test,
type = "prob")[, 2]
pred_e_d1 = predict(tree_e_d1,
d1_test,
type = "prob")[, 2]
pred_g_d2 = predict(tree_g_d2,
d2_test,
type = "prob")[, 2]
pred_e_d2 = predict(tree_e_d2,
d2_test,
type = "prob")[, 2]
pred_pruned_g_d1 = predict(pruned_g_d1,
d1_test,
type = "prob")[, 2]
pred_pruned_e_d1 = predict(pruned_e_d1,
d1_test,
type = "prob")[, 2]
pred_pruned_g_d2 = predict(pruned_g_d2,
d2_test,
type = "prob")[, 2]
pred_pruned_e_d2 = predict(pruned_e_d2,
d2_test,
type = "prob")[, 2]
# ROC curves trees on data d1
plot(perform(pred_g_d1, d1_test$num)$roc, col = "red")
plot(perform(pred_e_d1, d1_test$num)$roc, add = TRUE, col = "blue")
plot(perform(pred_pruned_g_d1, d1_test$num)$roc, add = TRUE, col = "green")
plot(perform(pred_pruned_e_d1, d1_test$num)$roc, add = TRUE, col = "orange")
legend("right",
legend=c("Gini_d1", "Entropy_d1", "Gini_pruned_d1", "Entropy_pruned_d1"),
col=c("red", "blue", "green", "orange"),
lty=1,
cex=0.8)# AUC values for trees on data d1
AUC_d1 = rbind(AUC_d1,
GINI_d1 = perform(pred_g_d1, d1_test$num)$auc,
Entropy_d1 = perform(pred_e_d1, d1_test$num)$auc,
Pruned_Gini_d1 = perform(pred_pruned_g_d1, d1_test$num)$auc,
Pruned_Entropy_d1 = perform(pred_pruned_e_d1, d1_test$num)$auc)
AUC_d1## [,1]
## log_d1 0.8968023
## GINI_d1 0.8724564
## Entropy_d1 0.8710029
## Pruned_Gini_d1 0.8350291
## Pruned_Entropy_d1 0.8713663
Based on AUC metric, logistic model is the best one followed by model based on GINI index. However, pruned Entropy model is very close to the model base on GINI, and since it has lower number of splits, I would choose pruned Entropy compared to Gini.
# ROC curves trees on data d2
plot(perform(pred_g_d2, d2_test$num)$roc, col = "red")
plot(perform(pred_e_d2, d2_test$num)$roc, add = TRUE, col = "blue")
plot(perform(pred_pruned_g_d2, d2_test$num)$roc, add = TRUE, col = "green")
plot(perform(pred_pruned_e_d2, d2_test$num)$roc, add = TRUE, col = "orange")
legend("right",
legend=c("Gini_d2", "Entropy_d2", "Gini_pruned_d2", "Entropy_pruned_d2"),
col=c("red", "blue", "green", "orange"),
lty=1:2,
cex=0.8)# AUC values for trees on data d1
AUC_d2 = rbind(AUC_d2, GINI_d2 = perform(pred_g_d2, d2_test$num)$auc,
Entropy_d2 = perform(pred_e_d2, d2_test$num)$auc,
Pruned_Gini_d2 = perform(pred_pruned_g_d2, d2_test$num)$auc,
Pruned_Entropy_d2 = perform(pred_pruned_e_d2, d2_test$num)$auc)
AUC_d2## [,1]
## log_d2 0.8481767
## GINI_d2 0.8252688
## Entropy_d2 0.8172043
## Pruned_Gini_d2 0.7135344
## Pruned_Entropy_d2 0.7161057
For d2, logistic model is the best one in terms of AUC metric, followed by the model based on GINI.
# Tuning svm fom d1
tune_d1 = tune(svm, #ten-fold cross-validation on a set of models of interest.
num ~ .,
data = d1_train,
kernel = "linear",
ranges = list(cost=seq(0.01, 5, 0.1)))
# Assigning the best model based on cost component
bestmod_d1 = tune_d1$best.model
summary(bestmod_d1) ##
## Call:
## best.tune(method = svm, train.x = num ~ ., data = d1_train, ranges = list(cost = seq(0.01,
## 5, 0.1)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
## gamma: 0.05263158
##
## Number of Support Vectors: 156
##
## ( 77 79 )
##
##
## Number of Classes: 2
##
## Levels:
## not sick sick
# Predicting on testing set
pred_svm_d1 = predict(bestmod_d1, d1_test)
svm.perf1 = table(d1_test$num,
pred_svm_d1,
dnn=c("Actual", "Predicted"))
# Tuning svm for d2
tune_d2 = tune(svm, #ten-fold cross-validation on a set of models of interest.
num ~ .,
data = d2_train,
kernel = "linear",
ranges = list(cost=seq(0.01, 5, 0.1)))
# Assigning the best model based on cost component
bestmod_d2 = tune_d2$best.model
summary(bestmod_d2) ##
## Call:
## best.tune(method = svm, train.x = num ~ ., data = d2_train, ranges = list(cost = seq(0.01,
## 5, 0.1)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.41
## gamma: 0.07142857
##
## Number of Support Vectors: 248
##
## ( 125 123 )
##
##
## Number of Classes: 2
##
## Levels:
## not sick sick
# Predicting on testing set
pred_svm_d2 = predict(bestmod_d2, d2_test)
svm.perf2 = table(d2_test$num,
pred_svm_d2,
dnn=c("Actual", "Predicted"))summ_d1 = rbind(summ_d1,
svm_d1 = ac(svm.perf1))
summ_d1## accuracy False.positive True.positive
## log_d1 0.83 0.16 0.81
## dtree_gini_d1 0.76 0.26 0.78
## dtree_entropy_d1 0.83 0.23 0.91
## pruned_gini_d1 0.72 0.37 0.84
## pruned_entropy_d1 0.84 0.14 0.81
## svm_d1 0.85 0.14 0.84
Based on above table, we can see that for d1, svm performs pretty well in terms of accuracy, false positive rate and true positive rate. Since decision is based on the true positive rate, still, the best model for d1 is decision tree based on Entropy.
summ_d2 = rbind(summ_d2,
svm_d2 = ac(svm.perf2))
summ_d2## accuracy False.positive True.positive
## log_d2 0.77 0.29 0.84
## dtree_gini_d2 0.76 0.22 0.73
## dtree_entropy_d2 0.76 0.26 0.78
## pruned_gini_d2 0.71 0.29 0.72
## pruned_entropy_d2 0.72 0.31 0.75
## svm_d2 0.77 0.24 0.77
For d2, still the best model is logistic one.
svm_d1 <- svm(d1_train$num ~ .,
data = d1_train,
cost = bestmod_d1$cost,
kernel = "linear",
probability = TRUE)
summary(svm_d1)##
## Call:
## svm(formula = d1_train$num ~ ., data = d1_train, cost = bestmod_d1$cost,
## kernel = "linear", probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
## gamma: 0.05263158
##
## Number of Support Vectors: 156
##
## ( 77 79 )
##
##
## Number of Classes: 2
##
## Levels:
## not sick sick
svm_d2 <- svm(d2_train$num ~ .,
data = d2_train,
cost = bestmod_d2$cost,
kernel = "linear",
probability = TRUE)
pred_svm_d1 = predict(svm_d1,
d1_test,
probability=TRUE)
pred_svm_d2 = predict(svm_d2,
d2_test,
probability=TRUE)
f1 = attributes(pred_svm_d1)$probabilities[,1]
f2 = attributes(pred_svm_d2)$probabilities[,2]
plot(perform(f1, d1_test$num)$roc, col = "red")
plot(perform(f2, d2_test$num)$roc, add = TRUE, col = "blue")
legend("right",
legend = c("svm_d1", "svm_d2"),
col = c("red", "blue", "green", "orange"),
lty = 1,
cex = 0.8)# AUC values for trees on data d1
AUC_d1 = rbind(AUC_d1, svm_d1 = perform(f1, d1_test$num)$auc)
AUC_d1## [,1]
## log_d1 0.8968023
## GINI_d1 0.8724564
## Entropy_d1 0.8710029
## Pruned_Gini_d1 0.8350291
## Pruned_Entropy_d1 0.8713663
## svm_d1 0.9382267
We can notice, that svm for d1 performs much better then other techniques in terms of AUC metric.
AUC_d2 = rbind(AUC_d2, svm_d2 = perform(f2, d2_test$num)$auc)
AUC_d2## [,1]
## log_d2 0.8481767
## GINI_d2 0.8252688
## Entropy_d2 0.8172043
## Pruned_Gini_d2 0.7135344
## Pruned_Entropy_d2 0.7161057
## svm_d2 0.8412810
For d2, svm model is very close to logistic regression.