library(ggplot2)
library(dplyr)
library(e1071)
library(caret)
heart <- read.csv("heart_tidy_svm.csv")
colnames(heart) <- c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exaang", "oldpeak", "slope", "ca", "thal", "class")
colSums(is.na(heart))
## age sex cp trestbps chol fbs restecg thalach
## 0 0 0 0 0 0 0 0
## exaang oldpeak slope ca thal class
## 0 0 0 0 0 0
str(heart)
## 'data.frame': 299 obs. of 14 variables:
## $ age : int 67 67 37 41 56 62 57 63 53 57 ...
## $ sex : int 1 1 1 0 1 0 0 1 1 1 ...
## $ cp : int 4 4 3 2 2 4 4 4 4 4 ...
## $ trestbps: int 160 120 130 130 120 140 120 130 140 140 ...
## $ chol : int 286 229 250 204 236 268 354 254 203 192 ...
## $ fbs : int 0 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 2 2 0 2 0 2 0 2 2 0 ...
## $ thalach : int 108 129 187 172 178 160 163 147 155 148 ...
## $ exaang : int 1 1 0 0 0 0 1 0 1 0 ...
## $ oldpeak : num 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 0.4 ...
## $ slope : int 2 2 3 1 1 3 1 2 3 2 ...
## $ ca : int 3 2 0 0 0 2 0 1 0 0 ...
## $ thal : int 3 7 3 3 3 3 3 7 7 6 ...
## $ class : int 1 1 0 0 0 1 0 1 1 0 ...
head(heart)
## age sex cp trestbps chol fbs restecg thalach exaang oldpeak slope ca
## 1 67 1 4 160 286 0 2 108 1 1.5 2 3
## 2 67 1 4 120 229 0 2 129 1 2.6 2 2
## 3 37 1 3 130 250 0 0 187 0 3.5 3 0
## 4 41 0 2 130 204 0 2 172 0 1.4 1 0
## 5 56 1 2 120 236 0 0 178 0 0.8 1 0
## 6 62 0 4 140 268 0 2 160 0 3.6 3 2
## thal class
## 1 3 1
## 2 7 1
## 3 3 0
## 4 3 0
## 5 3 0
## 6 3 1
heart$class <- as.factor(heart$class)
ggplot(data = heart, aes(x = class, fill = class)) +
geom_bar()
set.seed(123)
index <- sample(1:nrow(heart), floor(0.7 * nrow(heart)))
train <- heart[index,]
test <- heart[-index,]
accuracy <- c()
j <- 1
cost <- c(0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5)
for (i in cost)
{
svm.linear <- svm(class ~ ., data = train, kernel = "linear", cost = i)
test$pred <- predict(svm.linear, newdata = test)
accuracy[j] = mean(test$pred == test$class)
j <- j + 1
}
ggplot(data = data.frame(accuracy,cost), aes(x = cost, y = accuracy)) +
geom_point()
From the plot we see that our model achieves its best accuracy when cost = 0.05 or cost = 0.01.
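Rather than reading the optimum off the plot, the best value can also be pulled straight out of the accuracy vector built in the loop above (a minimal sketch reusing the cost and accuracy vectors already in the workspace):
# cost value(s) achieving the highest test-set accuracy on this split
best <- which(accuracy == max(accuracy))
data.frame(cost = cost[best], accuracy = accuracy[best])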
svm.linear <- svm(class ~ ., data = train, kernel = "linear", cost = .01)
test$pred <- predict(svm.linear, newdata = test)
confusionMatrix(test$pred, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 48 9
## 1 4 29
##
## Accuracy : 0.8556
## 95% CI : (0.7657, 0.9208)
## No Information Rate : 0.5778
## P-Value [Acc > NIR] : 1.296e-08
##
## Kappa : 0.6986
## Mcnemar's Test P-Value : 0.2673
##
## Sensitivity : 0.9231
## Specificity : 0.7632
## Pos Pred Value : 0.8421
## Neg Pred Value : 0.8788
## Prevalence : 0.5778
## Detection Rate : 0.5333
## Detection Prevalence : 0.6333
## Balanced Accuracy : 0.8431
##
## 'Positive' Class : 0
##
accuracy <- c()
degree <- c(1,2,3,4,5,6)
cst <- 0.05  # fixed cost for the degree search; named cst so base R's c() is not masked
j <- 1
for (i in degree)
{
svm.poly <- svm(class ~., data = train, kernel = "polynomial", degree = i, cost = cst)
test$pred <- predict(svm.poly, newdata = test)
accuracy[j] <- mean(test$pred == test$class)
j <- j + 1
}
ggplot(data = data.frame(accuracy,degree), aes(x = degree, y = accuracy)) +
geom_point()
For cost = 0.05 we get the highest accuracy when degree = 1, followed by degree = 3. Since degree = 1 is just a linear model, let's build models varying the cost while holding the degree constant at 3.
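As an alternative to this two-step search against the test set, degree and cost could be tuned jointly by cross-validation on the training data with e1071's tune.svm(); a sketch (the grid values mirror the ones used above and are purely illustrative):
# 10-fold cross-validation over degree and cost on the training set
set.seed(123)
poly.tune <- tune.svm(class ~ ., data = train, kernel = "polynomial",
                      degree = 1:6, cost = c(0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5))
poly.tune$best.parameters  # cross-validated choice of degree and cost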
accuracy <- c()
cost <- c(0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5)
j <- 1
for (i in cost)
{
svm.poly <- svm(class ~., data = train, kernel = "polynomial", degree = 3, cost = i)
test$pred <- predict(svm.poly, newdata = test)
accuracy[j] <- mean(test$pred == test$class)
j <- j + 1
}
ggplot(data = data.frame(accuracy,cost), aes(x = cost, y = accuracy)) +
geom_point()
With the polynomial kernel of degree 3, we get our best fit when cost = 0.5.
svm.poly <- svm(class ~., data = train, kernel = "polynomial", degree = 3, cost = 0.5)
test$pred <- predict(svm.poly, newdata = test)
confusionMatrix(test$pred, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 49 12
## 1 3 26
##
## Accuracy : 0.8333
## 95% CI : (0.74, 0.9036)
## No Information Rate : 0.5778
## P-Value [Acc > NIR] : 2.034e-07
##
## Kappa : 0.6472
## Mcnemar's Test P-Value : 0.03887
##
## Sensitivity : 0.9423
## Specificity : 0.6842
## Pos Pred Value : 0.8033
## Neg Pred Value : 0.8966
## Prevalence : 0.5778
## Detection Rate : 0.5444
## Detection Prevalence : 0.6778
## Balanced Accuracy : 0.8133
##
## 'Positive' Class : 0
##
We see that the accuracy of the polynomial SVM is almost equal to that of the linear SVM.
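Beyond the headline accuracies, it can be instructive to check how often the two fitted models actually agree case by case on the test set (a sketch reusing svm.linear and svm.poly fitted above):
# Cross-tabulate the two kernels' predictions on the same test observations
pred.linear <- predict(svm.linear, newdata = test)
pred.poly <- predict(svm.poly, newdata = test)
table(linear = pred.linear, polynomial = pred.poly)
mean(pred.linear == pred.poly)  # share of test cases where the two models agree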
accuracy <- c(0)
cost <- c(0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5)
gamma <- c(10^-6, 10^-5, 10^-4, 10^-3, 10^-2, 10^-1, 10^0, 10^1, 10^2, 10^3)
j <- 1
for (i in cost)
{
for (k in gamma)
{
svm.radial <- svm(class ~., data = train, kernel = "radial", gamma = k, cost = i)
test$pred <- predict(svm.radial, newdata = test)
accuracy[j] <- mean(test$pred == test$class)
if(accuracy[j] >= max(accuracy))
{
des_gamma <- k
des_cost <- i
}
j <- j + 1
}
}
des_gamma
## [1] 0.001
des_cost
## [1] 2
We get our highest accuracy at gamma = 0.001 and cost = 2. It is observed that a higher penalty is desirable in the radial model as compared to the linear model.
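To look at the whole cost/gamma surface rather than just the best pair, the accuracies collected in the nested loop can be laid back out on the grid and drawn as a tile plot (a sketch assuming the loop order above, with gamma varying fastest inside each cost):
# Rebuild the cost x gamma grid in the same order the loop filled the accuracy vector
grid <- expand.grid(gamma = gamma, cost = cost)
grid$accuracy <- accuracy
ggplot(data = grid, aes(x = factor(gamma), y = factor(cost), fill = accuracy)) +
geom_tile() +
labs(x = "gamma", y = "cost", fill = "test accuracy")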
svm.radial <- svm(class ~., data = train, kernel = "radial", gamma = .001, cost = 2)
test$pred <- predict(svm.radial, newdata = test)
confusionMatrix(test$pred, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 48 8
## 1 4 30
##
## Accuracy : 0.8667
## 95% CI : (0.7787, 0.9292)
## No Information Rate : 0.5778
## P-Value [Acc > NIR] : 2.884e-09
##
## Kappa : 0.7228
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.9231
## Specificity : 0.7895
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.8824
## Prevalence : 0.5778
## Detection Rate : 0.5333
## Detection Prevalence : 0.6222
## Balanced Accuracy : 0.8563
##
## 'Positive' Class : 0
##
accuracy <- c(0)
cost <- c(0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5)
gamma <- c(10^-6, 10^-5, 10^-4, 10^-3, 10^-2, 10^-1, 10^0, 10^1, 10^2, 10^3)
j <- 1
for (i in cost)
{
for (k in gamma)
{
svm.sigmoid <- svm(class ~., data = train, kernel = "sigmoid", gamma = k, cost = i)
test$pred <- predict(svm.sigmoid, newdata = test)
accuracy[j] <- mean(test$pred == test$class)
if(accuracy[j] >= max(accuracy))
{
des_gamma <- k
des_cost <- i
}
j <- j + 1
}
}
des_gamma
## [1] 0.001
des_cost
## [1] 5
We get our highest accuracy at gamma = 0.001 and cost = 5. It is observed that a higher penalty is desirable in the sigmoid model as compared to the linear or even the radial model.
svm.sigmoid <- svm(class ~., data = train, kernel = "sigmoid", gamma = .001, cost = 5)
test$pred <- predict(svm.sigmoid, newdata = test)
confusionMatrix(test$pred, test$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 8
## 1 5 30
##
## Accuracy : 0.8556
## 95% CI : (0.7657, 0.9208)
## No Information Rate : 0.5778
## P-Value [Acc > NIR] : 1.296e-08
##
## Kappa : 0.7008
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.9038
## Specificity : 0.7895
## Pos Pred Value : 0.8545
## Neg Pred Value : 0.8571
## Prevalence : 0.5778
## Detection Rate : 0.5222
## Detection Prevalence : 0.6111
## Balanced Accuracy : 0.8467
##
## 'Positive' Class : 0
##
Overall, we get our highest accuracy with the radial model (0.8667), using cost = 2 and gamma = 0.001.
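To close, the test-set accuracies reported in the confusion matrices above can be gathered into one small comparison (the numbers are copied from the outputs above; a sketch):
# Test-set accuracy of the final model fitted for each kernel
results <- data.frame(kernel = c("linear", "polynomial", "radial", "sigmoid"),
                      accuracy = c(0.8556, 0.8333, 0.8667, 0.8556))
ggplot(data = results, aes(x = reorder(kernel, accuracy), y = accuracy)) +
geom_col() +
coord_flip() +
labs(x = "kernel", y = "test accuracy")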