############################
# SVM       :     0.71     #
# Logistic  :     0.75     #
############################
#•  PCA
#1. Load the file.
#2. Remove rows with NA values.
#3. Remove unwanted columns.
#4. Check correlations.
#5. Convert categorical columns into dummy variables.
#6. Run princomp.
#7. Check the variance and select the components covering 98% of it.
#8. The first 3 principal components explain 98% of the variance.
#9. Plot the variance curves to confirm the choice.
#10. Create a dataset with the response AHD and the 3 PCA components.
#•  SVM
#1. Split the PCA dataset randomly into train and test sets for 50 iterations.
#2. Compare linear and radial kernels to see which performs better.
#3. The linear kernel came out the better option.
#4. Run SVM and tune it with the tune() function, which picks the best parameters.
#5. Run 50 iterations.
#6. Predict the output for all 50 iterations and average to find the accuracy.
#7. Plot the confusion matrix.
#8. Report the SVM accuracy.
# The same procedure is repeated for logistic regression.

# Read the file
library(e1071)
library(ggplot2)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(graphics)
library(stats)
library(utils)
setwd("~/Desktop/rcode/ML/Proj_Heart_pca")
Heart <- read.csv("~/Desktop/rcode/ML/Proj_Heart_pca/Heart.csv")
#View(Heart)
dim(Heart)
## [1] 303  15
Heart <- na.omit(Heart)
#View(Heart)
dim(Heart)
## [1] 297  15
Heart_data <- Heart
# Remove the first column (row index) and the last column (the response, AHD)
Heart_data <- Heart_data[,-1]
Heart_data <- Heart_data[,-14]
# Print column names
colnames(Heart_data)
##  [1] "Age"       "Sex"       "ChestPain" "RestBP"    "Chol"     
##  [6] "Fbs"       "RestECG"   "MaxHR"     "ExAng"     "Oldpeak"  
## [11] "Slope"     "Ca"        "Thal"
# Plot to check correlation
#plot(Heart_data)
# Remove the categorical columns (ChestPain, Thal) before computing correlations
cor_heart <- Heart_data[,-3]
cor_heart <- cor_heart[,-12]
# Check Correlation
colnames(cor_heart)
##  [1] "Age"     "Sex"     "RestBP"  "Chol"    "Fbs"     "RestECG" "MaxHR"  
##  [8] "ExAng"   "Oldpeak" "Slope"   "Ca"
cor(cor_heart)
##                 Age         Sex      RestBP          Chol           Fbs
## Age      1.00000000 -0.09239948  0.29047626  2.026435e-01  0.1320619890
## Sex     -0.09239948  1.00000000 -0.06634020 -1.980891e-01  0.0388502996
## RestBP   0.29047626 -0.06634020  1.00000000  1.315357e-01  0.1808595428
## Chol     0.20264355 -0.19808906  0.13153571  1.000000e+00  0.0127082808
## Fbs      0.13206199  0.03885030  0.18085954  1.270828e-02  1.0000000000
## RestECG  0.14991651  0.03389683  0.14924228  1.650460e-01  0.0688311070
## MaxHR   -0.39456288 -0.06049601 -0.04910766 -7.456799e-05 -0.0078423590
## ExAng    0.09648880  0.14358125  0.06669107  5.933893e-02 -0.0008930821
## Oldpeak  0.19712262  0.10656724  0.19124314  3.859579e-02  0.0083106671
## Slope    0.15940474  0.03334496  0.12117205 -9.215240e-03  0.0478190123
## Ca       0.36221034  0.09192480  0.09795376  1.159446e-01  0.1520858900
##             RestECG         MaxHR         ExAng      Oldpeak       Slope
## Age      0.14991651 -3.945629e-01  0.0964888046  0.197122616  0.15940474
## Sex      0.03389683 -6.049601e-02  0.1435812504  0.106567243  0.03334496
## RestBP   0.14924228 -4.910766e-02  0.0666910687  0.191243136  0.12117205
## Chol     0.16504603 -7.456799e-05  0.0593389323  0.038595794 -0.00921524
## Fbs      0.06883111 -7.842359e-03 -0.0008930821  0.008310667  0.04781901
## RestECG  1.00000000 -7.228965e-02  0.0818739197  0.113726420  0.13514058
## MaxHR   -0.07228965  1.000000e+00 -0.3843675321 -0.347639972 -0.38930674
## ExAng    0.08187392 -3.843675e-01  1.0000000000  0.289309666  0.25057152
## Oldpeak  0.11372642 -3.476400e-01  0.2893096659  1.000000000  0.57903735
## Slope    0.13514058 -3.893067e-01  0.2505715154  0.579037353  1.00000000
## Ca       0.12902063 -2.687270e-01  0.1482322256  0.294452277  0.10976112
##                  Ca
## Age      0.36221034
## Sex      0.09192480
## RestBP   0.09795376
## Chol     0.11594459
## Fbs      0.15208589
## RestECG  0.12902063
## MaxHR   -0.26872698
## ExAng    0.14823223
## Oldpeak  0.29445228
## Slope    0.10976112
## Ca       1.00000000
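# (Sketch, not in the original analysis) The strongest correlations can be
# flagged programmatically instead of scanning the matrix by eye; the 0.5
# cutoff is an arbitrary illustration:
cm <- cor(cor_heart)
cm[upper.tri(cm, diag = TRUE)] <- NA          # keep each pair once
high <- which(abs(cm) > 0.5, arr.ind = TRUE)  # flags Oldpeak/Slope (r = 0.58)
data.frame(var1 = rownames(cm)[high[, 1]],
           var2 = colnames(cm)[high[, 2]],
           r    = cm[high])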
#plot(cor_heart)
#sink("cor_heart.txt")
#cor(cor_heart)
#sink
# Transform categorical variables into dummy variables
new_my_Heart_data <- dummy.data.frame(Heart_data, names = c("ChestPain","Thal"))
dim(new_my_Heart_data)
## [1] 297  18
new_my_Heart_data <- na.omit(new_my_Heart_data)
dim(new_my_Heart_data)
## [1] 297  18
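# (Sketch) The 'dummies' package has since been archived on CRAN; an
# equivalent encoding with base R's model.matrix is sketched below. The
# names cats/dummy_cols/dummy_alt are illustrative, not from the original:
cats <- c("ChestPain", "Thal")
dummy_cols <- model.matrix(~ . - 1, data = Heart_data[, cats],
                           contrasts.arg = lapply(Heart_data[, cats],
                                                  contrasts, contrasts = FALSE))
dummy_alt <- cbind(Heart_data[, setdiff(names(Heart_data), cats)], dummy_cols)
dim(dummy_alt)  # should also be 297 x 18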
#View New data with Dummy variables.
#View(new_my_Heart_data)
# Find PCA components with PRINCOMP
pca_heart <- princomp(new_my_Heart_data)
summary(pca_heart)
## Importance of components:
##                            Comp.1     Comp.2      Comp.3     Comp.4
## Standard deviation     52.0044246 23.2921488 17.63224572 7.61501572
## Proportion of Variance  0.7469583  0.1498425  0.08586794 0.01601612
## Cumulative Proportion   0.7469583  0.8968008  0.98266874 0.99868486
##                              Comp.5      Comp.6       Comp.7       Comp.8
## Standard deviation     1.1937424661 0.960879673 0.8420419543 0.6671064363
## Proportion of Variance 0.0003935837 0.000255008 0.0001958318 0.0001229153
## Cumulative Proportion  0.9990784460 0.999333454 0.9995292858 0.9996522011
##                              Comp.9      Comp.10      Comp.11      Comp.12
## Standard deviation     5.416043e-01 4.634986e-01 4.430064e-01 3.943334e-01
## Proportion of Variance 8.101772e-05 5.933522e-05 5.420455e-05 4.294799e-05
## Cumulative Proportion  9.997332e-01 9.997926e-01 9.998468e-01 9.998897e-01
##                             Comp.13      Comp.14      Comp.15      Comp.16
## Standard deviation     3.640134e-01 3.305688e-01 2.899636e-01 2.710558e-01
## Proportion of Variance 3.659742e-05 3.018142e-05 2.322216e-05 2.029239e-05
## Cumulative Proportion  9.999263e-01 9.999565e-01 9.999797e-01 1.000000e+00
##                             Comp.17 Comp.18
## Standard deviation     1.181527e-07       0
## Proportion of Variance 3.855699e-18       0
## Cumulative Proportion  1.000000e+00       1
# 98% of the variance is explained by the first 3 principal components.
#sink("pca_heart_scores.csv")

#sink()
#Loadings
pca_heart$loadings[,1:3]
##                              Comp.1        Comp.2        Comp.3
## Age                    3.713622e-02 -0.1817400284 -0.1264280109
## Sex                   -1.790110e-03 -0.0010540334  0.0013347199
## ChestPainasymptomatic  6.371152e-04 -0.0080133871  0.0011777288
## ChestPainnonanginal   -2.146093e-04  0.0031207357  0.0006064159
## ChestPainnontypical   -1.449953e-04  0.0042139926  0.0008623148
## ChestPaintypical      -2.775106e-04  0.0006786587 -0.0026464594
## RestBP                 5.130524e-02 -0.1152723579 -0.9803671562
## Chol                   9.979803e-01  0.0146269739  0.0548725777
## Fbs                    1.129427e-04 -0.0004894884 -0.0036627092
## RestECG                3.210979e-03 -0.0037557734 -0.0066181299
## MaxHR                 -1.915410e-03  0.9760429586 -0.1404129590
## ExAng                  5.523395e-04 -0.0076584039  0.0001974733
## Oldpeak                9.715700e-04 -0.0182173507 -0.0087440171
## Slope                 -6.735881e-05 -0.0105145350 -0.0021279103
## Ca                     2.159995e-03 -0.0116384880 -0.0028021636
## Thalfixed             -4.466737e-04 -0.0017264610 -0.0008786941
## Thalnormal            -2.141342e-05  0.0063590561  0.0027700637
## Thalreversable         4.680872e-04 -0.0046325951 -0.0018913696
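# (Note added here) princomp() above uses the raw covariance matrix, so
# high-variance columns (Chol, MaxHR, RestBP) dominate the first components,
# as the loadings show. Re-running on the correlation matrix would weight
# all variables equally; a hedged sketch:
# pca_scaled <- princomp(new_my_Heart_data, cor = TRUE)
# summary(pca_scaled)  # would likely need more components to reach 98%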
#Plot PCA
biplot(pca_heart)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

plot(pca_heart)

# Dimensions of the PCA loadings matrix
dim(pca_heart$loadings)
## [1] 18 18
# Keep the first 3 PCA score columns for the logistic and SVM models
pca_heart_data <- (pca_heart$scores[,1:3])
var1 <- (pca_heart$sdev)^2
var_total <- var1/sum(var1)
var_total
##       Comp.1       Comp.2       Comp.3       Comp.4       Comp.5 
## 7.469583e-01 1.498425e-01 8.586794e-02 1.601612e-02 3.935837e-04 
##       Comp.6       Comp.7       Comp.8       Comp.9      Comp.10 
## 2.550080e-04 1.958318e-04 1.229153e-04 8.101772e-05 5.933522e-05 
##      Comp.11      Comp.12      Comp.13      Comp.14      Comp.15 
## 5.420455e-05 4.294799e-05 3.659742e-05 3.018142e-05 2.322216e-05 
##      Comp.16      Comp.17      Comp.18 
## 2.029239e-05 3.855699e-18 0.000000e+00
# 98% of the variance is explained by the first 3 components
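# (Sketch) A small helper, not part of the original script, that picks the
# component count for a target cumulative-variance threshold instead of
# reading it off summary():
n_components <- function(pca, threshold = 0.98) {
  v <- pca$sdev^2
  which(cumsum(v / sum(v)) >= threshold)[1]
}
n_components(pca_heart)  # expected to return 3 for this data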

# Variance Plot
plot(var_total, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     type = "b")

#Cumulative variance 
plot(cumsum(var_total), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     type = "b")

# Final PCA data for Logistic and SVM 
PCA.heart.data <- data.frame(AHD = Heart$AHD,pca_heart_data)
head(PCA.heart.data)
##   AHD    Comp.1     Comp.2     Comp.3
## 1  No -13.32402  -2.921788 -14.984466
## 2 Yes  40.57505 -45.614011 -21.378044
## 3 Yes -18.40286 -21.359344  11.748755
## 4  No   1.83077  39.891454  -1.240968
## 5  No -43.89267  23.904930  -2.156321
## 6  No -11.93360  28.673262   6.684193
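# (Note) sample() below is unseeded, so the exact splits and accuracies shown
# are not reproducible run-to-run; a seed (arbitrary value) would fix that:
# set.seed(1)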
# Split the data into train (80%) and test (20%) sets
hrt_smp_size <- floor(0.80 * nrow(PCA.heart.data))
train_ind <- sample(seq_len(nrow(PCA.heart.data)), size = hrt_smp_size)
train_set <- PCA.heart.data[train_ind,]
test_set  <- PCA.heart.data[-train_ind,]
# Number of iterations
factor <- 50
# Accumulators for the four confusion-matrix cells,
# summed across iterations to compute averages
x1 <- 0
x2 <- 0
x3 <- 0
x4 <- 0
for(i in 1:factor){ # Iterate 'factor' times
  # Resample the train/test split each iteration
  train_ind_1 <- sample(seq_len(nrow(PCA.heart.data)), size = hrt_smp_size)
  heart_train_1 <- PCA.heart.data[train_ind_1,]
  heart_test_1  <- PCA.heart.data[-train_ind_1,]
  heart_test_1  <- na.omit(heart_test_1)
  heart_train_1 <- na.omit(heart_train_1)
  #View(heart_test_1)
  #View(heart_train_1)
  # Fit an SVM on the training data with a linear kernel and cost = 1
  heartsvm <- svm(AHD ~ ., data = heart_train_1, kernel = "linear", cost = 1, scale = FALSE)
  #summary(heartsvm)
  # Cross-validate with tune(), which picks the best cost from the grid.
  # Radial and linear kernels were compared and linear performed better
  # (see the radial-kernel sketch after the confusion-matrix plot below).
  tune.out <- tune(svm, AHD ~ ., data = heart_train_1, kernel = "linear",
                   ranges = list(cost = c(0.01, 0.1, 1, 5)))
  # Predict on the test set with the best model
  testpred <- predict(tune.out$best.model, heart_test_1)
  # Accumulate the confusion-matrix cells
  x <- table(predict = testpred, truth = heart_test_1$AHD)
  x1 <- x1 + x[1]
  x2 <- x2 + x[2]
  x3 <- x3 + x[3]
  x4 <- x4 + x[4]
}
# Average confusion matrix over the 50 iterations
svm.out <- matrix(c(x1/factor, x2/factor, x3/factor, x4/factor), nrow = 2, ncol = 2)
colnames(svm.out) <- c("no", "yes")  # truth
rownames(svm.out) <- c("no", "yes")  # predicted
svm.out
##       no yes
## no  25.9  10
## yes  9.1  15
# Average SVM accuracy over the 50 iterations
accuracy <- (x1 + x4) / (x1 + x2 + x3 + x4)
accuracy
## [1] 0.6816667
ctable1 <- as.table(matrix(svm.out, nrow = 2, byrow = TRUE))
fourfoldplot(ctable1, color = c("#CC6666", "#99CC49"),
             conf.level = 0, margin = 1, main = "Confusion Matrix SVM")
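
# (Sketch) The radial-kernel check mentioned in the outline is not shown in
# the code above; assuming the same data and tuning approach (the gamma grid
# is illustrative), it would look like:
tune.rad <- tune(svm, AHD ~ ., data = heart_train_1, kernel = "radial",
                 ranges = list(cost = c(0.01, 0.1, 1, 5), gamma = c(0.5, 1, 2)))
summary(tune.rad)  # compare best performance against the linear tune.out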

hrt_smp_size <- floor(0.80 * nrow(PCA.heart.data))
# Lists to store the confusion matrix, fitted model, and accuracy
# for each of the 50 logistic runs
listModel_1 <- list()
listModel_2 <- list()
listModel_3 <- list()
for(i in 1:50){
  # Resample the train/test split each iteration
  train_ind <- sample(seq_len(nrow(PCA.heart.data)), size = hrt_smp_size)
  train_set <- PCA.heart.data[train_ind,]
  test_set  <- PCA.heart.data[-train_ind,]
  # Logistic model on the 3 principal components
  log_model <- glm(AHD ~ Comp.1 + Comp.2 + Comp.3, data = train_set, family = "binomial")
  # Predicted probabilities on the test set, thresholded at 0.5
  AHD_pred <- predict(log_model, newdata = test_set, type = "response")
  AHD_pred <- ifelse(AHD_pred > 0.5, "Yes", "No")
  confusionmatrix <- table(test_set$AHD, AHD_pred)
  accuracy <- sum(diag(confusionmatrix)) / sum(confusionmatrix)
  listModel_1[[i]] <- confusionmatrix
  listModel_2[[i]] <- log_model
  listModel_3[[i]] <- accuracy
}
# Retrieve the best of the 50 logistic runs by accuracy
a <- which.max(unlist(listModel_3))
a
## [1] 7
max(unlist(listModel_3))
## [1] 0.7833333
listModel_2[a]
## [[1]]
## 
## Call:  glm(formula = AHD ~ Comp.1 + Comp.2 + Comp.3, family = "binomial", 
##     data = train_set)
## 
## Coefficients:
## (Intercept)       Comp.1       Comp.2       Comp.3  
##   -0.189347     0.003107    -0.041258    -0.004997  
## 
## Degrees of Freedom: 236 Total (i.e. Null);  233 Residual
## Null Deviance:       326.3 
## Residual Deviance: 284.6     AIC: 292.6
listModel_1[[a]]
##      AHD_pred
##       No Yes
##   No  26   4
##   Yes  9  21
ctable <- as.table(matrix(unlist(listModel_1[[a]]), nrow = 2, byrow = TRUE))
fourfoldplot(ctable, color = c("#CC6666", "#99CC49"),
             conf.level = 0, margin = 1, main = "Confusion Matrix Logistic Regression")

# Note: 'accuracy' still holds the value from the last (50th) iteration only
accuracy
## [1] 0.6333333
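# (Sketch, added for comparison) The SVM figure above is an average over 50
# runs; the analogous averaged logistic accuracy would be:
mean(unlist(listModel_3))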