############################
# SVM : 0.71 #
# Logistic : 0.75 #
############################
#• PCA
#1. Load File
#2. Remove NA
#3. Remove unwanted columns
#4. Check correlation
#5. Converted categoricalcolumns into dummy variables
#6. Run princomp.
#7. Check variance and Components selected for 98% variance
#8. There are 3 PCA components selected for 98% variance.
#9. Plot graphs for variance to make sure with Variance Curve
#10. Created dataset with Predicted variable AHD and Rest with 3 PCA components.
#• SVM
#1. Dataset from PCA divided into test and train randomly for 50 iterations.
#2. Checked Linear and Radial kernel which one will be better.
#3. Linear come out best option.
#4. Run SVM and tuned it with tune function which will choose best parameters for SVM
#5. Ran 50 iterations.
#6. Predicted output for all 50 iterations and averaged to find accuracy
#7. Plot Confusion matrix
#8. SVM accuracy
# Same procedure repeated for Logistic.
# Readi file
library(e1071)
library(ggplot2)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(graphics)
library(stats)
library(utils)
setwd("~/Desktop/rcode/ML/Proj_Heart_pca")
Heart <- read.csv("~/Desktop/rcode/ML/Proj_Heart_pca/Heart.csv")
#View(Heart)
dim(Heart)
## [1] 303 15
Heart <- na.omit(Heart)
#View(Heart)
dim(Heart)
## [1] 297 15
Heart_data <- Heart
# Remove First Column , Last Column
Heart_data <- Heart_data[,-1]
Heart_data <- Heart_data[,-14]
#print Col names
colnames(Heart_data)
## [1] "Age" "Sex" "ChestPain" "RestBP" "Chol"
## [6] "Fbs" "RestECG" "MaxHR" "ExAng" "Oldpeak"
## [11] "Slope" "Ca" "Thal"
# plot to check Correltaion
#plot(Heart_data)
# Remove Categorical Values to find Correlation
cor_heart <- Heart_data[,-3]
cor_heart <- cor_heart[,-12]
# Check Correlation
colnames(cor_heart)
## [1] "Age" "Sex" "RestBP" "Chol" "Fbs" "RestECG" "MaxHR"
## [8] "ExAng" "Oldpeak" "Slope" "Ca"
cor(cor_heart)
## Age Sex RestBP Chol Fbs
## Age 1.00000000 -0.09239948 0.29047626 2.026435e-01 0.1320619890
## Sex -0.09239948 1.00000000 -0.06634020 -1.980891e-01 0.0388502996
## RestBP 0.29047626 -0.06634020 1.00000000 1.315357e-01 0.1808595428
## Chol 0.20264355 -0.19808906 0.13153571 1.000000e+00 0.0127082808
## Fbs 0.13206199 0.03885030 0.18085954 1.270828e-02 1.0000000000
## RestECG 0.14991651 0.03389683 0.14924228 1.650460e-01 0.0688311070
## MaxHR -0.39456288 -0.06049601 -0.04910766 -7.456799e-05 -0.0078423590
## ExAng 0.09648880 0.14358125 0.06669107 5.933893e-02 -0.0008930821
## Oldpeak 0.19712262 0.10656724 0.19124314 3.859579e-02 0.0083106671
## Slope 0.15940474 0.03334496 0.12117205 -9.215240e-03 0.0478190123
## Ca 0.36221034 0.09192480 0.09795376 1.159446e-01 0.1520858900
## RestECG MaxHR ExAng Oldpeak Slope
## Age 0.14991651 -3.945629e-01 0.0964888046 0.197122616 0.15940474
## Sex 0.03389683 -6.049601e-02 0.1435812504 0.106567243 0.03334496
## RestBP 0.14924228 -4.910766e-02 0.0666910687 0.191243136 0.12117205
## Chol 0.16504603 -7.456799e-05 0.0593389323 0.038595794 -0.00921524
## Fbs 0.06883111 -7.842359e-03 -0.0008930821 0.008310667 0.04781901
## RestECG 1.00000000 -7.228965e-02 0.0818739197 0.113726420 0.13514058
## MaxHR -0.07228965 1.000000e+00 -0.3843675321 -0.347639972 -0.38930674
## ExAng 0.08187392 -3.843675e-01 1.0000000000 0.289309666 0.25057152
## Oldpeak 0.11372642 -3.476400e-01 0.2893096659 1.000000000 0.57903735
## Slope 0.13514058 -3.893067e-01 0.2505715154 0.579037353 1.00000000
## Ca 0.12902063 -2.687270e-01 0.1482322256 0.294452277 0.10976112
## Ca
## Age 0.36221034
## Sex 0.09192480
## RestBP 0.09795376
## Chol 0.11594459
## Fbs 0.15208589
## RestECG 0.12902063
## MaxHR -0.26872698
## ExAng 0.14823223
## Oldpeak 0.29445228
## Slope 0.10976112
## Ca 1.00000000
#plot(cor_heart)
#sink("cor_heart.txt")
#cor(cor_heart)
#sink
# Transform Categorical variable to dummy variables
new_my_Heart_data <- dummy.data.frame(Heart_data, names = c("ChestPain","Thal"))
dim(new_my_Heart_data)
## [1] 297 18
new_my_Heart_data <- na.omit(new_my_Heart_data)
dim(new_my_Heart_data)
## [1] 297 18
#View New data with Dummy variables.
#View(new_my_Heart_data)
# Find PCA components with PRINCOMP
pca_heart <- princomp(new_my_Heart_data)
summary(pca_heart)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 52.0044246 23.2921488 17.63224572 7.61501572
## Proportion of Variance 0.7469583 0.1498425 0.08586794 0.01601612
## Cumulative Proportion 0.7469583 0.8968008 0.98266874 0.99868486
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 1.1937424661 0.960879673 0.8420419543 0.6671064363
## Proportion of Variance 0.0003935837 0.000255008 0.0001958318 0.0001229153
## Cumulative Proportion 0.9990784460 0.999333454 0.9995292858 0.9996522011
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 5.416043e-01 4.634986e-01 4.430064e-01 3.943334e-01
## Proportion of Variance 8.101772e-05 5.933522e-05 5.420455e-05 4.294799e-05
## Cumulative Proportion 9.997332e-01 9.997926e-01 9.998468e-01 9.998897e-01
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 3.640134e-01 3.305688e-01 2.899636e-01 2.710558e-01
## Proportion of Variance 3.659742e-05 3.018142e-05 2.322216e-05 2.029239e-05
## Cumulative Proportion 9.999263e-01 9.999565e-01 9.999797e-01 1.000000e+00
## Comp.17 Comp.18
## Standard deviation 1.181527e-07 0
## Proportion of Variance 3.855699e-18 0
## Cumulative Proportion 1.000000e+00 1
# Comp.1 Comp.2 Comp.3 Comp.4
# Standard deviation 52.0044246 23.2921488 17.63224572 7.61501572
# Proportion of Variance 0.7469583 0.1498425 0.08586794 0.01601612
# Cumulative Proportion 0.7469583 0.8968008 0.98266874 0.99868486
# 98% of variation explained by 3 pca components.
#sink("pca_heart_scores.csv")
#sink()
#Loadings
pca_heart$loadings[,1:3]
## Comp.1 Comp.2 Comp.3
## Age 3.713622e-02 -0.1817400284 -0.1264280109
## Sex -1.790110e-03 -0.0010540334 0.0013347199
## ChestPainasymptomatic 6.371152e-04 -0.0080133871 0.0011777288
## ChestPainnonanginal -2.146093e-04 0.0031207357 0.0006064159
## ChestPainnontypical -1.449953e-04 0.0042139926 0.0008623148
## ChestPaintypical -2.775106e-04 0.0006786587 -0.0026464594
## RestBP 5.130524e-02 -0.1152723579 -0.9803671562
## Chol 9.979803e-01 0.0146269739 0.0548725777
## Fbs 1.129427e-04 -0.0004894884 -0.0036627092
## RestECG 3.210979e-03 -0.0037557734 -0.0066181299
## MaxHR -1.915410e-03 0.9760429586 -0.1404129590
## ExAng 5.523395e-04 -0.0076584039 0.0001974733
## Oldpeak 9.715700e-04 -0.0182173507 -0.0087440171
## Slope -6.735881e-05 -0.0105145350 -0.0021279103
## Ca 2.159995e-03 -0.0116384880 -0.0028021636
## Thalfixed -4.466737e-04 -0.0017264610 -0.0008786941
## Thalnormal -2.141342e-05 0.0063590561 0.0027700637
## Thalreversable 4.680872e-04 -0.0046325951 -0.0018913696
#Plot PCA
biplot(pca_heart)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

plot(pca_heart)

#Dimention PCA loadings
dim(pca_heart$loadings)
## [1] 18 18
#PCA data for LOGISTIC
pca_heart_data <- (pca_heart$scores[,1:3])
var1 <- (pca_heart$sdev)^2
var_total <- var1/sum(var1)
var_total
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## 7.469583e-01 1.498425e-01 8.586794e-02 1.601612e-02 3.935837e-04
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## 2.550080e-04 1.958318e-04 1.229153e-04 8.101772e-05 5.933522e-05
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## 5.420455e-05 4.294799e-05 3.659742e-05 3.018142e-05 2.322216e-05
## Comp.16 Comp.17 Comp.18
## 2.029239e-05 3.855699e-18 0.000000e+00
# Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
#7.467626e-01 1.503456e-01 8.558414e-02 1.598513e-02 3.954660e-04
#Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
#2.563285e-04 1.952784e-04 1.231547e-04 8.151282e-05 5.948448e-05
#Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
#5.427019e-05 4.382952e-05 3.648255e-05 3.066586e-05 2.315721e-05
#Comp.16 Comp.17 Comp.18 Comp.19
#2.050005e-05 2.358192e-06 1.097665e-17 2.855090e-21
# 98% Varince explained by first 3 components
# Variance Plot
plot(var_total, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")

#Cumulative variance
plot(cumsum(var_total), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")

# Final PCA data for Logistic and SVM
PCA.heart.data <- data.frame(AHD = Heart$AHD,pca_heart_data)
head(PCA.heart.data)
## AHD Comp.1 Comp.2 Comp.3
## 1 No -13.32402 -2.921788 -14.984466
## 2 Yes 40.57505 -45.614011 -21.378044
## 3 Yes -18.40286 -21.359344 11.748755
## 4 No 1.83077 39.891454 -1.240968
## 5 No -43.89267 23.904930 -2.156321
## 6 No -11.93360 28.673262 6.684193
# split the data
hrt_smp_size <-floor(0.80* nrow(PCA.heart.data))
train_ind <-sample(seq_len(nrow(PCA.heart.data)), size = hrt_smp_size)
train_set <- PCA.heart.data[train_ind,]
test_set <-PCA.heart.data[-train_ind,]
#Number of Iterations
factor <- 50
# Declaring variables to store values for each
# iteration to find average values.
x1 <- 0
x2 <- 0
x3 <- 0
x4 <- 0
head(PCA.heart.data)
## AHD Comp.1 Comp.2 Comp.3
## 1 No -13.32402 -2.921788 -14.984466
## 2 Yes 40.57505 -45.614011 -21.378044
## 3 Yes -18.40286 -21.359344 11.748755
## 4 No 1.83077 39.891454 -1.240968
## 5 No -43.89267 23.904930 -2.156321
## 6 No -11.93360 28.673262 6.684193
for(i in 1:factor){ # Iterating factor times
train_ind_1 <-sample(seq_len(nrow(PCA.heart.data)), size = hrt_smp_size)
head( train_ind_1)
heart_train_1 <- PCA.heart.data[train_ind,]
heart_test_1 <- PCA.heart.data[-train_ind,]
heart_test_1 <- na.omit(heart_test_1)
heart_train_1 <- na.omit(heart_train_1)
#View(heart_test)
#View(heart_train)
# Model with train data , linear kernal , cost=1 and gamma=-0
heartsvm <- svm(AHD~ . , data=heart_train_1 ,kernel ="linear", cost=1 ,scale=FALSE)
#summary(heartsvm)
# Cross validating model and finding best model in terms of cost and gamma
tune.out=tune(svm, AHD~., data=heart_train_1 ,kernel="linear",ranges=list(cost=c( 0.01, 0.1, 1,5)))
#Checked between Radial and Linear . linear performed better. With tune funtion got maximum Accuracy
# Predicting with test model
trainpred=predict(tune.out$best.model ,heart_test_1)
# Creating table for cross validation
x = table(predict=trainpred, truth=heart_test_1$AHD)
x1 = x1 + x[1]
x2= x2 + x[2]
x3 = x3 + x[3]
x4= x4 + x[4]
}
svm.out <- matrix(c(x1/factor,x2/factor,x3/factor,x4/factor) , nrow=2,ncol=2)
colnames(svm.out) <- c("no", "yes")
rownames(svm.out) <- c("no", "yes")
svm.out
## no yes
## no 25.9 10
## yes 9.1 15
accuracy<- sum(x1/factor,x4/factor)/sum(x1/factor,x2/factor,x3/factor,x4/factor)
# Accuracy
accuracy
## [1] 0.6816667
ctable1 <- as.table(matrix(svm.out, nrow = 2, byrow = TRUE))
fourfoldplot(ctable1, color = c("#CC6666", "#99CC49"),
conf.level = 0, margin = 1, main = "Confusion Matrix SVM")

hrt_smp_size <-floor(0.80* nrow(PCA.heart.data))
# Creating list to store the values for 50 runs
listModel_1 <- list()
listModel_2 <- list()
listModel_3 <- list()
for(i in 1:50){
train_ind <-sample(seq_len(nrow(PCA.heart.data)), size = hrt_smp_size)
train_set <- PCA.heart.data[train_ind,]
test_set <-PCA.heart.data[-train_ind,]
# logistic model
log_model <-glm(AHD ~ Comp.1+ Comp.2+ Comp.3 ,data=train_set, family="binomial")
test_set$predicted = predict(log_model, newdata=test_set, type="response")
AHD_pred <- predict(log_model, newdata=test_set, type="response")
AHD_pred <- ifelse(AHD_pred>0.5,"Yes", "No")
# accuracy <- table(AHD_pred,test_set$AHD)
confusionmatrix <- table(test_set$AHD, AHD_pred)
accuracy<- sum(diag(confusionmatrix))/sum(confusionmatrix)
listModel_1[[i]] <- confusionmatrix
listModel_2[[i]] <- log_model
listModel_3[[i]] <- accuracy
}
which.max( listModel_3[] )
## [1] 7
max(unlist(listModel_3))
## [1] 0.7833333
listModel_2[18]
## [[1]]
##
## Call: glm(formula = AHD ~ Comp.1 + Comp.2 + Comp.3, family = "binomial",
## data = train_set)
##
## Coefficients:
## (Intercept) Comp.1 Comp.2 Comp.3
## -0.104297 0.003949 -0.042844 -0.015465
##
## Degrees of Freedom: 236 Total (i.e. Null); 233 Residual
## Null Deviance: 328.3
## Residual Deviance: 278.2 AIC: 286.2
# End
a <-which.max( listModel_3[] )
max(unlist(listModel_3))
## [1] 0.7833333
listModel_2[a]
## [[1]]
##
## Call: glm(formula = AHD ~ Comp.1 + Comp.2 + Comp.3, family = "binomial",
## data = train_set)
##
## Coefficients:
## (Intercept) Comp.1 Comp.2 Comp.3
## -0.189347 0.003107 -0.041258 -0.004997
##
## Degrees of Freedom: 236 Total (i.e. Null); 233 Residual
## Null Deviance: 326.3
## Residual Deviance: 284.6 AIC: 292.6
listModel_1[[a]]
## AHD_pred
## No Yes
## No 26 4
## Yes 9 21
ctable <- as.table(matrix(unlist(listModel_1[[a]]), nrow = 2, byrow = TRUE))
fourfoldplot(ctable, color = c("#CC6666", "#99CC49"),
conf.level = 0, margin = 1, main = "Confusion Matrix Logistic Regression")

accuracy
## [1] 0.6333333