# Readi file 
setwd("~/Desktop/rcode/ML/Proj_Heart_pca")
Heart <- read.csv("~/Desktop/rcode/ML/Proj_Heart_pca/Heart.csv")
#View(Heart)
dim(Heart)
## [1] 303  15
Heart <- na.omit(Heart)
#View(Heart)
dim(Heart)
## [1] 297  15
Heart_data <- Heart
# Remove First Column : its an index , Last Column -Response Variable
Heart_data <- Heart_data[,-1]
Heart_data <- Heart_data[,-14]
#print Col names 
colnames(Heart_data)
##  [1] "Age"       "Sex"       "ChestPain" "RestBP"    "Chol"     
##  [6] "Fbs"       "RestECG"   "MaxHR"     "ExAng"     "Oldpeak"  
## [11] "Slope"     "Ca"        "Thal"
# plot to check Correltaion
#plot(Heart_data)
# Remove Categorical Values to find Correlation
cor_heart <- Heart_data[,-3]
cor_heart <- cor_heart[,-12]
# Check Correlation
colnames(cor_heart)
##  [1] "Age"     "Sex"     "RestBP"  "Chol"    "Fbs"     "RestECG" "MaxHR"  
##  [8] "ExAng"   "Oldpeak" "Slope"   "Ca"
cor(cor_heart)
##                 Age         Sex      RestBP          Chol           Fbs
## Age      1.00000000 -0.09239948  0.29047626  2.026435e-01  0.1320619890
## Sex     -0.09239948  1.00000000 -0.06634020 -1.980891e-01  0.0388502996
## RestBP   0.29047626 -0.06634020  1.00000000  1.315357e-01  0.1808595428
## Chol     0.20264355 -0.19808906  0.13153571  1.000000e+00  0.0127082808
## Fbs      0.13206199  0.03885030  0.18085954  1.270828e-02  1.0000000000
## RestECG  0.14991651  0.03389683  0.14924228  1.650460e-01  0.0688311070
## MaxHR   -0.39456288 -0.06049601 -0.04910766 -7.456799e-05 -0.0078423590
## ExAng    0.09648880  0.14358125  0.06669107  5.933893e-02 -0.0008930821
## Oldpeak  0.19712262  0.10656724  0.19124314  3.859579e-02  0.0083106671
## Slope    0.15940474  0.03334496  0.12117205 -9.215240e-03  0.0478190123
## Ca       0.36221034  0.09192480  0.09795376  1.159446e-01  0.1520858900
##             RestECG         MaxHR         ExAng      Oldpeak       Slope
## Age      0.14991651 -3.945629e-01  0.0964888046  0.197122616  0.15940474
## Sex      0.03389683 -6.049601e-02  0.1435812504  0.106567243  0.03334496
## RestBP   0.14924228 -4.910766e-02  0.0666910687  0.191243136  0.12117205
## Chol     0.16504603 -7.456799e-05  0.0593389323  0.038595794 -0.00921524
## Fbs      0.06883111 -7.842359e-03 -0.0008930821  0.008310667  0.04781901
## RestECG  1.00000000 -7.228965e-02  0.0818739197  0.113726420  0.13514058
## MaxHR   -0.07228965  1.000000e+00 -0.3843675321 -0.347639972 -0.38930674
## ExAng    0.08187392 -3.843675e-01  1.0000000000  0.289309666  0.25057152
## Oldpeak  0.11372642 -3.476400e-01  0.2893096659  1.000000000  0.57903735
## Slope    0.13514058 -3.893067e-01  0.2505715154  0.579037353  1.00000000
## Ca       0.12902063 -2.687270e-01  0.1482322256  0.294452277  0.10976112
##                  Ca
## Age      0.36221034
## Sex      0.09192480
## RestBP   0.09795376
## Chol     0.11594459
## Fbs      0.15208589
## RestECG  0.12902063
## MaxHR   -0.26872698
## ExAng    0.14823223
## Oldpeak  0.29445228
## Slope    0.10976112
## Ca       1.00000000
plot(cor_heart)

#sink("cor_heart.txt")
#cor(cor_heart)
#sink
#There isn o trace of Correlation
# Transform Categorical variable to  dummy variables
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
new_my_Heart_data <- dummy.data.frame(Heart_data, names = c("ChestPain","Thal"))
dim(new_my_Heart_data)
## [1] 297  18
new_my_Heart_data <- na.omit(new_my_Heart_data)
dim(new_my_Heart_data)
## [1] 297  18
#View New data with Dummy variables.
#View(new_my_Heart_data)

# Find PCA components with PRINCOMP
pca_heart <- princomp(new_my_Heart_data)
summary(pca_heart)
## Importance of components:
##                            Comp.1     Comp.2      Comp.3     Comp.4
## Standard deviation     52.0044246 23.2921488 17.63224572 7.61501572
## Proportion of Variance  0.7469583  0.1498425  0.08586794 0.01601612
## Cumulative Proportion   0.7469583  0.8968008  0.98266874 0.99868486
##                              Comp.5      Comp.6       Comp.7       Comp.8
## Standard deviation     1.1937424661 0.960879673 0.8420419543 0.6671064363
## Proportion of Variance 0.0003935837 0.000255008 0.0001958318 0.0001229153
## Cumulative Proportion  0.9990784460 0.999333454 0.9995292858 0.9996522011
##                              Comp.9      Comp.10      Comp.11      Comp.12
## Standard deviation     5.416043e-01 4.634986e-01 4.430064e-01 3.943334e-01
## Proportion of Variance 8.101772e-05 5.933522e-05 5.420455e-05 4.294799e-05
## Cumulative Proportion  9.997332e-01 9.997926e-01 9.998468e-01 9.998897e-01
##                             Comp.13      Comp.14      Comp.15      Comp.16
## Standard deviation     3.640134e-01 3.305688e-01 2.899636e-01 2.710558e-01
## Proportion of Variance 3.659742e-05 3.018142e-05 2.322216e-05 2.029239e-05
## Cumulative Proportion  9.999263e-01 9.999565e-01 9.999797e-01 1.000000e+00
##                             Comp.17 Comp.18
## Standard deviation     1.181527e-07       0
## Proportion of Variance 3.855699e-18       0
## Cumulative Proportion  1.000000e+00       1
# Comp.1     Comp.2      Comp.3     Comp.4
# Standard deviation     52.0044246 23.2921488 17.63224572 7.61501572
# Proportion of Variance  0.7469583  0.1498425  0.08586794 0.01601612
# Cumulative Proportion   0.7469583  0.8968008  0.98266874 0.99868486
# 98% of variation explained by 3 pca components. 


#sink("pca_heart_scores.csv")

#sink()
#Loadings
pca_heart$loadings[,1:3]
##                              Comp.1        Comp.2        Comp.3
## Age                    3.713622e-02 -0.1817400284 -0.1264280109
## Sex                   -1.790110e-03 -0.0010540334  0.0013347199
## ChestPainasymptomatic  6.371152e-04 -0.0080133871  0.0011777288
## ChestPainnonanginal   -2.146093e-04  0.0031207357  0.0006064159
## ChestPainnontypical   -1.449953e-04  0.0042139926  0.0008623148
## ChestPaintypical      -2.775106e-04  0.0006786587 -0.0026464594
## RestBP                 5.130524e-02 -0.1152723579 -0.9803671562
## Chol                   9.979803e-01  0.0146269739  0.0548725777
## Fbs                    1.129427e-04 -0.0004894884 -0.0036627092
## RestECG                3.210979e-03 -0.0037557734 -0.0066181299
## MaxHR                 -1.915410e-03  0.9760429586 -0.1404129590
## ExAng                  5.523395e-04 -0.0076584039  0.0001974733
## Oldpeak                9.715700e-04 -0.0182173507 -0.0087440171
## Slope                 -6.735881e-05 -0.0105145350 -0.0021279103
## Ca                     2.159995e-03 -0.0116384880 -0.0028021636
## Thalfixed             -4.466737e-04 -0.0017264610 -0.0008786941
## Thalnormal            -2.141342e-05  0.0063590561  0.0027700637
## Thalreversable         4.680872e-04 -0.0046325951 -0.0018913696
#Plot PCA
biplot(pca_heart)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped

plot(pca_heart)

#Dimention PCA loadings
dim(pca_heart$loadings)
## [1] 18 18
#PCA data for LOGISTIC 
pca_heart_data <- (pca_heart$scores[,1:3])



var1 <- (pca_heart$sdev)^2
var_total <- var1/sum(var1)
var_total
##       Comp.1       Comp.2       Comp.3       Comp.4       Comp.5 
## 7.469583e-01 1.498425e-01 8.586794e-02 1.601612e-02 3.935837e-04 
##       Comp.6       Comp.7       Comp.8       Comp.9      Comp.10 
## 2.550080e-04 1.958318e-04 1.229153e-04 8.101772e-05 5.933522e-05 
##      Comp.11      Comp.12      Comp.13      Comp.14      Comp.15 
## 5.420455e-05 4.294799e-05 3.659742e-05 3.018142e-05 2.322216e-05 
##      Comp.16      Comp.17      Comp.18 
## 2.029239e-05 3.855699e-18 0.000000e+00
#      Comp.1       Comp.2       Comp.3       Comp.4       Comp.5 
#7.467626e-01 1.503456e-01 8.558414e-02 1.598513e-02 3.954660e-04 
#Comp.6       Comp.7       Comp.8       Comp.9      Comp.10 
#2.563285e-04 1.952784e-04 1.231547e-04 8.151282e-05 5.948448e-05 
#Comp.11      Comp.12      Comp.13      Comp.14      Comp.15 
#5.427019e-05 4.382952e-05 3.648255e-05 3.066586e-05 2.315721e-05 
#Comp.16      Comp.17      Comp.18      Comp.19 
#2.050005e-05 2.358192e-06 1.097665e-17 2.855090e-21
# 98% Varince explained by first 3 components

# Variance Plot
plot(var_total, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     type = "b")

#Cumulative variance 
plot(cumsum(var_total), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     type = "b")

# Final PCA data for Logistic and SVM 
PCA.heart.data <- data.frame(AHD = Heart$AHD,pca_heart_data)
head(PCA.heart.data)
##   AHD    Comp.1     Comp.2     Comp.3
## 1  No -13.32402  -2.921788 -14.984466
## 2 Yes  40.57505 -45.614011 -21.378044
## 3 Yes -18.40286 -21.359344  11.748755
## 4  No   1.83077  39.891454  -1.240968
## 5  No -43.89267  23.904930  -2.156321
## 6  No -11.93360  28.673262   6.684193