library(e1071)
This project compares a logistic regression model with a support vector machine (SVM). The data are read from a CSV file and divided into training and test sets. We will generate 50 random training/test splits.
Reading the data
# Load the raw heart-disease table from disk.
heart <- read.csv(
  file = "/Users/neha/Documents/DS-630-ML/Project/Heart.csv"
)
# Keep only the rows that have no missing values.
hdata <- na.omit(object = heart)
We begin with the logistic regression
# Training-set size: 80% of the complete cases, rounded down.
hrt_smp_size1 <- floor(nrow(hdata) * 0.80)
Creating list to store the results for 50 runs
# Empty per-run containers, filled by the resampling loop:
#   listModel_1 - confusion matrix of each run
#   listModel_2 - model summary of each run
#   listModel_3 - accuracy of each run
listModel_1 <- listModel_2 <- listModel_3 <- list()
We run the procedure 50 times on random training and test sets and choose the best model based on its performance on the test set in each run.
# Repeat a random train/test split 50 times. In every run a logistic
# regression is fitted on the training rows and scored on the held-out rows;
# the confusion matrix, model summary and accuracy of run i are stored in
# listModel_1[[i]], listModel_2[[i]] and listModel_3[[i]].
for (i in seq_len(50)) {
  # Sample from the NA-free data (hdata): hrt_smp_size1 was derived from
  # nrow(hdata), and sampling the raw table let glm() silently drop incomplete
  # training rows and produced NA predictions for incomplete test rows.
  train_ind <- sample(seq_len(nrow(hdata)), size = hrt_smp_size1)
  train_set <- hdata[train_ind, ]
  test_set <- hdata[-train_ind, ]

  # logistic model
  log_model <- glm(
    AHD ~ Age + as.factor(Sex) + ChestPain + RestBP + Chol + as.factor(Fbs) +
      (RestECG) + MaxHR + as.factor(ExAng) + Oldpeak +
      as.factor(Slope) + Ca + Thal,
    data = train_set, family = "binomial"
  )

  # Predicted probabilities on the held-out rows (computed once and reused).
  test_set$predicted <- predict(log_model, newdata = test_set,
                                type = "response")

  # Classify with a 0.5 cut-off. Both label vectors are factors with the full
  # No/Yes level set so the table is always 2 x 2, even when a run predicts
  # only one class; otherwise diag() would not pick the correct cells.
  AHD_pred <- factor(
    ifelse(test_set$predicted > 0.5, "Yes", "No"),
    levels = c("No", "Yes")
  )
  confusionmatrix <- table(
    factor(test_set$AHD, levels = c("No", "Yes")),
    AHD_pred
  )

  # Share of correctly classified test rows.
  accuracy <- sum(diag(confusionmatrix)) / sum(confusionmatrix)

  listModel_1[[i]] <- confusionmatrix
  listModel_2[[i]] <- summary(log_model)
  listModel_3[[i]] <- accuracy
}
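listModel_1 contains the confusion matrix of each run, listModel_2 contains the fitted logistic model summary of each run, and listModel_3 contains the accuracy of each run. Indexing them with the position of the highest accuracy gives the confusion matrix, model and accuracy of the best run. Let's get the numbers.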
# Position of the run with the highest accuracy.
a <- which.max(unlist(listModel_3))
a
## [1] 20
# Highest accuracy reached over all runs.
max(unlist(listModel_3))
## [1] 0.9076923
# Model summary stored for that run (printed as a one-element list).
listModel_2[a]
## [[1]]
##
## Call:
## glm(formula = AHD ~ Age + as.factor(Sex) + ChestPain + RestBP +
## Chol + as.factor(Fbs) + (RestECG) + MaxHR + as.factor(ExAng) +
## Oldpeak + as.factor(Slope) + Ca + Thal, family = "binomial",
## data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5856 -0.4961 -0.1692 0.3822 2.3561
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.599377 3.132756 -1.468 0.142062
## Age 0.001580 0.028123 0.056 0.955185
## as.factor(Sex)1 1.166142 0.580811 2.008 0.044667 *
## ChestPainnonanginal -1.884731 0.559754 -3.367 0.000760 ***
## ChestPainnontypical -1.071522 0.614575 -1.744 0.081244 .
## ChestPaintypical -1.985856 0.725869 -2.736 0.006222 **
## RestBP 0.027003 0.012577 2.147 0.031796 *
## Chol 0.006264 0.005255 1.192 0.233255
## as.factor(Fbs)1 -0.593988 0.709824 -0.837 0.402699
## RestECG 0.346194 0.217079 1.595 0.110760
## MaxHR -0.016976 0.011691 -1.452 0.146465
## as.factor(ExAng)1 0.431164 0.496505 0.868 0.385177
## Oldpeak 0.571578 0.267468 2.137 0.032598 *
## as.factor(Slope)2 0.890087 0.531708 1.674 0.094128 .
## as.factor(Slope)3 -0.915582 1.242054 -0.737 0.461030
## Ca 1.130532 0.303339 3.727 0.000194 ***
## Thalnormal -0.691007 0.912611 -0.757 0.448945
## Thalreversable 0.737270 0.892750 0.826 0.408894
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 319.89 on 231 degrees of freedom
## Residual deviance: 158.28 on 214 degrees of freedom
## (5 observations deleted due to missingness)
## AIC: 194.28
##
## Number of Fisher Scoring iterations: 6
# Confusion matrix stored for run `a` (the run selected by which.max above).
listModel_1[[a]]
## AHD_pred
## No Yes
## No 31 3
## Yes 3 28
Plotting the confusion Matrix
# Four-fold display of the confusion matrix of run `a`.
# The stored table has the actual classes in rows and the predictions in
# columns. Rebuilding it with matrix(..., byrow = TRUE) amounted to a plain
# transpose that also threw away the No/Yes dimnames, so the quadrants were
# labelled A/B. t() keeps the same cell layout and the class labels.
ctable <- as.table(t(unclass(listModel_1[[a]])))
names(dimnames(ctable)) <- c("Predicted", "Actual")
fourfoldplot(ctable, color = c("#CC6666", "#99CC49"),
             conf.level = 0, margin = 1,
             main = "Confusion Matrix Logistic Regression")
Let's look at the SVM model now. Again we start by splitting the data in an 80:20 ratio.
# Discard rows with missing values.
hdata_1 <- na.omit(object = heart)
# Training-set size: 80% of the remaining rows, rounded down.
hrt_smp_size <- floor(nrow(hdata_1) * 0.80)
# Running totals of the four confusion-table cells, summed over all
# iterations so that averages can be taken afterwards.
x1 <- x2 <- x3 <- x4 <- 0
Let's run 50 times on random training and test sets.
# Number of Iterations
# NOTE: this variable shares its name with base::factor(); the name is kept
# because the averaging code after the loop divides by it.
factor <- 50
for (i in seq_len(factor)) { # Iterating factor times
  # Draw a fresh split in every iteration and index with THIS iteration's
  # sample. Indexing with `train_ind` (left over from the logistic loop) made
  # all iterations reuse one identical split. Rows come from the NA-free
  # hdata_1, the table hrt_smp_size was computed from, so no further
  # na.omit() is needed on the two parts.
  train_ind_1 <- sample(seq_len(nrow(hdata_1)), size = hrt_smp_size)
  heart_train_1 <- hdata_1[train_ind_1, ]
  heart_test_1 <- hdata_1[-train_ind_1, ]

  # Cross-validate a linear-kernel SVM over a grid of cost values and keep
  # the best model found by tune().
  tune.out <- tune(svm, AHD ~ ., data = heart_train_1, kernel = "linear",
                   ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))

  # Predict the held-out rows with the best model.
  trainpred <- predict(tune.out$best.model, heart_test_1)

  # Confusion table: rows = prediction, columns = truth. The truth labels are
  # given the same level set as the predictions so the table is always 2 x 2
  # and x[1]..x[4] address the same cells in every iteration.
  x <- table(
    predict = trainpred,
    truth = base::factor(heart_test_1$AHD, levels = levels(trainpred))
  )

  # Accumulate the cells in column-major order:
  # x[1] = (pred 1, truth 1), x[2] = (pred 2, truth 1),
  # x[3] = (pred 1, truth 2), x[4] = (pred 2, truth 2).
  x1 <- x1 + x[1]
  x2 <- x2 + x[2]
  x3 <- x3 + x[3]
  x4 <- x4 + x[4]
}
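In the averaged table below, rows are the predicted classes and columns are the true classes. x11 and x22 are the true negatives and true positives; x12 and x21 are the false negatives and false positives.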
# Average each accumulated cell over the iterations and arrange the four
# means column-wise in a 2 x 2 matrix labelled no/yes on both margins.
svm.out <- matrix(
  c(x1, x2, x3, x4) / factor,
  nrow = 2, ncol = 2,
  dimnames = list(c("no", "yes"), c("no", "yes"))
)
svm.out
## no yes
## no 31.38 4.7
## yes 3.62 25.3
Confusion Matrix
# Four-fold display of the averaged SVM confusion matrix.
# svm.out holds predictions in rows and true classes in columns. Rebuilding
# it with matrix(..., byrow = TRUE) amounted to a transpose that dropped the
# no/yes dimnames, so the quadrants were labelled A/B. t() keeps the same
# cell layout and the class labels.
ctable1 <- as.table(t(svm.out))
names(dimnames(ctable1)) <- c("Actual", "Predicted")
fourfoldplot(ctable1, color = c("#CC6666", "#99CC49"),
             conf.level = 0, margin = 1, main = "Confusion Matrix SVM")
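We repeated this procedure with kernel = 'radial'. Of the two SVM kernels, 'linear' shows better performance. In most cases logistic regression performed better than the SVM model, as seen in the confusion matrices, but the difference in performance is small. Note that the logistic regression matrix shown above is that of the best of the 50 runs, whereas the SVM matrix is the average over 50 runs, so the two are not directly comparable.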