library(e1071)

This project compares a logistic regression model with a support vector machine (SVM). The data is in CSV format and is divided into training and testing sets. We will generate 50 random training/testing splits.

Reading the data

# Load the heart-disease data set.
# stringsAsFactors = TRUE is passed explicitly: from R 4.0 on, read.csv()
# returns text columns (AHD, ChestPain, Thal) as character by default, while
# the classification models below expect them to be factors.
heart <- read.csv("/Users/neha/Documents/DS-630-ML/Project/Heart.csv",
                  stringsAsFactors = TRUE)
# Complete cases only; rows containing any NA are dropped.
hdata <- na.omit(heart)

We begin with the logistic regression

# Training-set size: 80% of the complete cases, rounded down.
hrt_smp_size1 <- floor(nrow(hdata) * 0.80)

Creating lists to store the results of the 50 runs

# Per-run result containers, filled inside the resampling loop:
#   listModel_1 - confusion matrices
#   listModel_2 - logistic model summaries
#   listModel_3 - test-set accuracies
listModel_1 <- listModel_2 <- listModel_3 <- list()

We run the procedure 50 times on random training and test datasets, and then choose the best model based on its test performance in each run

for (i in 1:50) {
  # Draw the 80/20 split from the NA-free data. The sample size was computed
  # from `hdata`, so the rows must come from `hdata` too; sampling from the
  # raw `heart` frame let NA rows slip into both sets (glm silently dropped
  # them from training and they produced NA predictions in the test set).
  train_ind <- sample(seq_len(nrow(hdata)), size = hrt_smp_size1)
  train_set <- hdata[train_ind, ]
  test_set <- hdata[-train_ind, ]

  # Logistic model on the training rows.
  log_model <- glm(AHD ~ Age + as.factor(Sex) + ChestPain + RestBP + Chol +
                     as.factor(Fbs) + (RestECG) + MaxHR + as.factor(ExAng) +
                     Oldpeak + as.factor(Slope) + Ca + Thal,
                   data = train_set, family = "binomial")

  # Predicted probabilities on the held-out rows (computed once and kept on
  # the test frame), then thresholded at 0.5.
  # Assumes AHD has the levels "No"/"Yes", so the modelled event is "Yes".
  test_set$predicted <- predict(log_model, newdata = test_set,
                                type = "response")
  AHD_pred <- ifelse(test_set$predicted > 0.5, "Yes", "No")

  # Fix the levels on both margins so the table is always 2x2, even when one
  # class is never predicted; otherwise diag() would pick the wrong cells.
  AHD_pred <- factor(AHD_pred, levels = c("No", "Yes"))
  confusionmatrix <- table(factor(test_set$AHD, levels = c("No", "Yes")),
                           AHD_pred)
  accuracy <- sum(diag(confusionmatrix)) / sum(confusionmatrix)

  # Store this run's results: rows of the table are actual, columns predicted.
  listModel_1[[i]] <- confusionmatrix
  listModel_2[[i]] <- summary(log_model)
  listModel_3[[i]] <- accuracy
}

listModel_1 contains the confusion matrix of each run, listModel_2 contains the logistic model summary of each run, and listModel_3 contains the accuracy of each run. Let's get the numbers for the best run.

# Index of the run with the highest test accuracy.
a <- which.max(unlist(listModel_3))
print(a)
## [1] 20
# Accuracy reached in that run.
max(unlist(listModel_3))
## [1] 0.9076923
# Model summary of that run (single-bracket indexing keeps the list wrapper).
listModel_2[a]
## [[1]]
## 
## Call:
## glm(formula = AHD ~ Age + as.factor(Sex) + ChestPain + RestBP + 
##     Chol + as.factor(Fbs) + (RestECG) + MaxHR + as.factor(ExAng) + 
##     Oldpeak + as.factor(Slope) + Ca + Thal, family = "binomial", 
##     data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5856  -0.4961  -0.1692   0.3822   2.3561  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -4.599377   3.132756  -1.468 0.142062    
## Age                  0.001580   0.028123   0.056 0.955185    
## as.factor(Sex)1      1.166142   0.580811   2.008 0.044667 *  
## ChestPainnonanginal -1.884731   0.559754  -3.367 0.000760 ***
## ChestPainnontypical -1.071522   0.614575  -1.744 0.081244 .  
## ChestPaintypical    -1.985856   0.725869  -2.736 0.006222 ** 
## RestBP               0.027003   0.012577   2.147 0.031796 *  
## Chol                 0.006264   0.005255   1.192 0.233255    
## as.factor(Fbs)1     -0.593988   0.709824  -0.837 0.402699    
## RestECG              0.346194   0.217079   1.595 0.110760    
## MaxHR               -0.016976   0.011691  -1.452 0.146465    
## as.factor(ExAng)1    0.431164   0.496505   0.868 0.385177    
## Oldpeak              0.571578   0.267468   2.137 0.032598 *  
## as.factor(Slope)2    0.890087   0.531708   1.674 0.094128 .  
## as.factor(Slope)3   -0.915582   1.242054  -0.737 0.461030    
## Ca                   1.130532   0.303339   3.727 0.000194 ***
## Thalnormal          -0.691007   0.912611  -0.757 0.448945    
## Thalreversable       0.737270   0.892750   0.826 0.408894    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 319.89  on 231  degrees of freedom
## Residual deviance: 158.28  on 214  degrees of freedom
##   (5 observations deleted due to missingness)
## AIC: 194.28
## 
## Number of Fisher Scoring iterations: 6
# Confusion matrix of the best run (rows: actual, columns: predicted).
print(listModel_1[[a]])
##      AHD_pred
##       No Yes
##   No  31   3
##   Yes  3  28

Plotting the confusion Matrix

# Fourfold plot of the best run's confusion matrix.
# The stored table has the actual class in rows and the prediction in
# columns. Transposing it gives the same cell layout that was plotted before
# (prediction in rows, actual in columns) but keeps the No/Yes labels, which
# the previous matrix(unlist(...)) conversion replaced with "A"/"B".
ctable <- as.table(t(listModel_1[[a]]))
names(dimnames(ctable)) <- c("Predicted", "Actual")
fourfoldplot(ctable, color = c("#CC6666", "#99CC49"),
             conf.level = 0, margin = 1,
             main = "Confusion Matrix Logistic Regression")

Let's look at the SVM model now. Again, we start by splitting the data in an 80:20 ratio.

# Drop every row that contains a missing value.
hdata_1 <- na.omit(heart)
# Training-set size: 80% of the remaining rows, rounded down.
hrt_smp_size <- floor(nrow(hdata_1) * 0.80)

# Running totals of the four confusion-table cells, summed over all
# iterations so that averages can be computed afterwards.
x1 <- x2 <- x3 <- x4 <- 0

Let's run the SVM 50 times on random training and test data sets

# Number of random train/test splits to average over.
# NOTE: this name shadows base::factor(); it is kept because the averaging
# code after the loop divides the accumulated counts by `factor`.
factor <- 50

for (i in seq_len(factor)) {
  # Draw a fresh 80/20 split of the complete cases for this iteration.
  # The split must use the indices drawn here (train_ind_1); reusing the
  # leftover `train_ind` from the logistic loop would evaluate the very same
  # split in every iteration.
  train_ind_1 <- sample(seq_len(nrow(hdata_1)), size = hrt_smp_size)
  heart_train_1 <- hdata_1[train_ind_1, ]
  heart_test_1 <- hdata_1[-train_ind_1, ]

  # Cross-validate a linear-kernel SVM over a grid of cost values and keep
  # the best model. (A linear kernel has no gamma to tune.)
  # NOTE(review): `AHD ~ .` uses every other column as a predictor, including
  # any row-id column the CSV may carry - confirm this is intended.
  tune.out <- tune(svm, AHD ~ ., data = heart_train_1, kernel = "linear",
                   ranges = list(cost = c(0.01, 0.1, 1, 5, 10)))

  # Predict the held-out rows with the best cross-validated model.
  trainpred <- predict(tune.out$best.model, heart_test_1)

  # Confusion table for this split: rows = prediction, columns = truth.
  x <- table(predict = trainpred, truth = heart_test_1$AHD)

  # Accumulate the four cells in column-major order.
  x1 <- x1 + x[1]
  x2 <- x2 + x[2]
  x3 <- x3 + x[3]
  x4 <- x4 + x[4]
}

Rows are the predictions and columns are the true classes: x11 and x22 are the true negatives and true positives, while x12 and x21 are the false negatives and false positives, respectively.

# Average confusion matrix over all iterations
# (rows: predicted class, columns: true class).
svm.out <- matrix(c(x1, x2, x3, x4) / factor, nrow = 2, ncol = 2)
dimnames(svm.out) <- list(c("no", "yes"), c("no", "yes"))
print(svm.out)
##        no  yes
## no  31.38  4.7
## yes  3.62 25.3

Confusion Matrix

# Fourfold plot of the averaged SVM confusion matrix.
# svm.out already has predictions in rows and true classes in columns, so it
# is plotted as-is. Re-wrapping it with matrix(..., byrow = TRUE) transposed
# it (opposite orientation to the logistic-regression plot) and replaced the
# no/yes labels with "A"/"B".
ctable1 <- as.table(svm.out)
names(dimnames(ctable1)) <- c("Predicted", "Actual")
fourfoldplot(ctable1, color = c("#CC6666", "#99CC49"),
             conf.level = 0, margin = 1, main = "Confusion Matrix SVM")

Conclusion

We repeated this procedure with kernel = ‘radial’. Of these two kernels, ‘linear’ shows better performance. In most cases, logistic regression performed better than the SVM model, as seen in the confusion matrices, but the difference in performance is small.