Load Data

data <- read.csv('heart-attack-risk-prediction-dataset.csv')
head(data)
##          Age Cholesterol Heart.rate Diabetes Family.History Smoking Obesity
## 1 0.59550562  0.31428571 0.04766269        0              0       1       0
## 2 0.59550562  0.09642857 0.04766269        1              1       1       1
## 3 0.59550562  0.18928571 0.04766269        0              0       1       0
## 4 0.07865169  0.96071429 0.07149404        1              1       1       1
## 5 0.07865169  0.79285714 0.07149404        1              0       1       1
## 6 0.07865169  0.77142857 0.07149404        1              1       1       0
##   Alcohol.Consumption Exercise.Hours.Per.Week Diet Previous.Heart.Problems
## 1                   0              0.20832621    0                       0
## 2                   1              0.75241972    1                       0
## 3                   1              0.20099847    2                       1
## 4                   1              0.09055687    2                       1
## 5                   0              0.60103027    2                       1
## 6                   1              0.65886386    2                       1
##   Medication.Use Stress.Level Sedentary.Hours.Per.Day     Income       BMI
## 1              0            9               0.5512344 0.86226830 0.6023629
## 2              0            3               0.7107863 0.82542195 0.2744693
## 3              1            9               0.3204783 0.00504123 0.8903316
## 4              0            1               0.4135835 0.94931616 0.4179445
## 5              0            4               0.3150260 0.30019079 0.1694947
## 6              0            2               0.9106280 0.14605991 0.3941747
##   Triglycerides Physical.Activity.Days.Per.Week Sleep.Hours.Per.Day
## 1    0.33246753                               0           0.3333333
## 2    0.27662338                               2           0.6666667
## 3    0.05714286                               4           1.0000000
## 4    0.26623377                               1           0.5000000
## 5    0.76883117                               1           0.1666667
## 6    0.48571429                               2           0.6666667
##   Heart.Attack.Risk..Binary. Blood.sugar      CK.MB   Troponin
## 1                          0   0.2270176 0.04822879 0.03651237
## 2                          0   0.2270176 0.04822879 0.03651237
## 3                          0   0.2270176 0.04822879 0.03651237
## 4                          0   0.2270176 0.04822879 0.03651237
## 5                          0   0.2270176 0.04822879 0.03651237
## 6                          0   0.2270176 0.04822879 0.03651237
##   Heart.Attack.Risk..Text. Gender Systolic.blood.pressure
## 1                        0   Male               0.6000000
## 2                        0   Male               0.5741935
## 3                        0   Male               0.1870968
## 4                        0   Male               0.6451613
## 5                        0   Male               0.2516129
## 6                        0   Male               0.7354839
##   Diastolic.blood.pressure
## 1                0.5348837
## 2                0.5697674
## 3                0.6744186
## 4                0.5930233
## 5                0.3837209
## 6                0.3837209

Mengambil kolom 1 hingga kolom 10 saja

Age Cholesterol Heart rate Diabetes Family History Smoking Obesity Alcohol Consumption Exercise Hours Per Week Diet

new <- data[,1:10]

Cek Missing Value

sum(is.na(new))
## [1] 1370
p <- ncol(new)

##Handling Missing Value Mengisi Missing Value dengan Mean

data_clean <- new
for(i in 1:ncol(data_clean)) {
  data_clean[is.na(data_clean[, i]), i] <- mean(data_clean[, i], na.rm = TRUE)  
}

Cek KMO

library(psych)
r <- cor(data_clean)
KMO(r)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA =  0.5
## MSA for each item = 
##                     Age             Cholesterol              Heart.rate 
##                    0.50                    0.43                    0.51 
##                Diabetes          Family.History                 Smoking 
##                    0.46                    0.50                    0.50 
##                 Obesity     Alcohol.Consumption Exercise.Hours.Per.Week 
##                    0.48                    0.46                    0.51 
##                    Diet 
##                    0.45

##Bartlett test

bartlett.test(data_clean)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  data_clean
## Bartlett's K-squared = 74881, df = 9, p-value < 2.2e-16

#Principal Component Analysis ##Manual ##Menghitung Eigen Value dan Eigen Vector

scale_data = scale(data_clean)
r = cov(scale_data)
pc <- eigen(r)
pc$values
##  [1] 1.3946952 1.0424853 1.0367978 1.0316120 1.0075230 0.9898937 0.9752606
##  [8] 0.9636293 0.9545709 0.6035322

##Menghitung proporsi varians dan kumulatif

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
sumvar <- sum(pc$values)
propvar <- (pc$values / sumvar) * 100
cumvar <- data.frame(cbind(value = pc$values, propvar))
cumvar <- cumvar %>% mutate(cum = cumsum(propvar))
rownames(cumvar) <- paste0("PC", seq_len(nrow(cumvar)))
print(cumvar)
##          value   propvar       cum
## PC1  1.3946952 13.946952  13.94695
## PC2  1.0424853 10.424853  24.37181
## PC3  1.0367978 10.367978  34.73978
## PC4  1.0316120 10.316120  45.05590
## PC5  1.0075230 10.075230  55.13113
## PC6  0.9898937  9.898937  65.03007
## PC7  0.9752606  9.752606  74.78268
## PC8  0.9636293  9.636293  84.41897
## PC9  0.9545709  9.545709  93.96468
## PC10 0.6035322  6.035322 100.00000

##Hasil PCA

pc$vectors
##               [,1]         [,2]         [,3]        [,4]        [,5]
##  [1,]  0.705873381 -0.006851996  0.040371734  0.02345586 -0.01178801
##  [2,]  0.013382877 -0.445799411 -0.133127052 -0.47892927  0.32997770
##  [3,] -0.030309386 -0.187525794  0.678841993  0.05567382  0.01715517
##  [4,] -0.027480516  0.034437290  0.116086258  0.59521458  0.41750794
##  [5,]  0.023819619  0.421836358  0.100277613 -0.25878227 -0.59544023
##  [6,]  0.706333034 -0.009206266 -0.008643456  0.03090712  0.03229136
##  [7,] -0.006607839 -0.332224911 -0.030526206  0.50346383 -0.43226721
##  [8,]  0.013334746  0.503177576  0.118287081 -0.12186105  0.40902921
##  [9,]  0.013870120 -0.467648155  0.001865789 -0.22109187 -0.07749587
## [10,] -0.001497156 -0.067657638  0.693748778 -0.16602870 -0.02472516
##               [,6]        [,7]        [,8]        [,9]        [,10]
##  [1,] -0.031570715 -0.02221525  0.04234887 -0.00959659 -0.704273427
##  [2,] -0.058908331  0.17785849 -0.59453528  0.23171702 -0.053231874
##  [3,]  0.008971093  0.38961723  0.27152813  0.52346389  0.008429667
##  [4,]  0.290997331 -0.42050076 -0.34615472  0.27128986 -0.032679959
##  [5,]  0.294840315 -0.13704056 -0.38646670  0.36707069 -0.010269155
##  [6,]  0.011954936  0.02263161 -0.02785256  0.03051326  0.704680256
##  [7,]  0.131811903  0.46645619 -0.36926320 -0.27676978 -0.020192744
##  [8,]  0.448995510  0.49748317 -0.08945932 -0.30385535 -0.032713171
##  [9,]  0.755997840 -0.23042587  0.28892718 -0.13449387  0.005077312
## [10,] -0.182195936 -0.31904717 -0.27058138 -0.52527360  0.042928685
scores <- as.matrix(scale_data) %*% pc$vectors
head(scores)
##            [,1]       [,2]       [,3]        [,4]       [,5]       [,6]
## [1,]  0.6664374  0.1167354 -1.1982696 -0.16967510 -0.1634984 -1.9253742
## [2,]  0.6849606  0.7843595  0.3399694  1.09563333 -0.9415894  1.7696590
## [3,]  0.6843668  1.2108794  0.7028743 -0.58826610  0.4836130 -1.4087135
## [4,] -0.9156311  0.2746476  1.2888754 -0.03615188  0.2553591 -0.2966758
## [5,] -0.9746252 -2.1985287  0.9223091  0.62775794  0.2830905 -0.4351966
## [6,] -0.8834553  0.3122695  1.4431182 -1.18007684  0.7581914  0.9834924
##             [,7]         [,8]        [,9]       [,10]
## [1,]  0.08411995  1.753150021  0.48346776 -0.11453364
## [2,] -0.05700067 -0.007424249 -0.42364814 -0.21381336
## [3,]  0.30735605  1.198550466 -1.45387710 -0.06015226
## [4,]  1.07368248 -2.631826740  0.51005629  1.24488539
## [5,] -0.19636881 -0.793617671  0.01659896  1.37398286
## [6,] -0.45081999 -0.910530463  0.64905841  1.33140653

#Factor Analysis Sesuai dengan Kaiser Rule hanya mengambil yang PC >1, sehingga hanya PC 1- PC 5

varcov = cov(scale_data)
pc = eigen(varcov)
pc$values
##  [1] 1.3946952 1.0424853 1.0367978 1.0316120 1.0075230 0.9898937 0.9752606
##  [8] 0.9636293 0.9545709 0.6035322
pc$vectors
##               [,1]         [,2]         [,3]        [,4]        [,5]
##  [1,]  0.705873381 -0.006851996  0.040371734  0.02345586 -0.01178801
##  [2,]  0.013382877 -0.445799411 -0.133127052 -0.47892927  0.32997770
##  [3,] -0.030309386 -0.187525794  0.678841993  0.05567382  0.01715517
##  [4,] -0.027480516  0.034437290  0.116086258  0.59521458  0.41750794
##  [5,]  0.023819619  0.421836358  0.100277613 -0.25878227 -0.59544023
##  [6,]  0.706333034 -0.009206266 -0.008643456  0.03090712  0.03229136
##  [7,] -0.006607839 -0.332224911 -0.030526206  0.50346383 -0.43226721
##  [8,]  0.013334746  0.503177576  0.118287081 -0.12186105  0.40902921
##  [9,]  0.013870120 -0.467648155  0.001865789 -0.22109187 -0.07749587
## [10,] -0.001497156 -0.067657638  0.693748778 -0.16602870 -0.02472516
##               [,6]        [,7]        [,8]        [,9]        [,10]
##  [1,] -0.031570715 -0.02221525  0.04234887 -0.00959659 -0.704273427
##  [2,] -0.058908331  0.17785849 -0.59453528  0.23171702 -0.053231874
##  [3,]  0.008971093  0.38961723  0.27152813  0.52346389  0.008429667
##  [4,]  0.290997331 -0.42050076 -0.34615472  0.27128986 -0.032679959
##  [5,]  0.294840315 -0.13704056 -0.38646670  0.36707069 -0.010269155
##  [6,]  0.011954936  0.02263161 -0.02785256  0.03051326  0.704680256
##  [7,]  0.131811903  0.46645619 -0.36926320 -0.27676978 -0.020192744
##  [8,]  0.448995510  0.49748317 -0.08945932 -0.30385535 -0.032713171
##  [9,]  0.755997840 -0.23042587  0.28892718 -0.13449387  0.005077312
## [10,] -0.182195936 -0.31904717 -0.27058138 -0.52527360  0.042928685
sp = sum(pc$values[1:5])

L1 = sqrt(pc$values[1])*pc$vectors[,1]
L2 = sqrt(pc$values[2])*pc$vectors[,2]
L3 = sqrt(pc$values[3])*pc$vectors[,3]
L4 = sqrt(pc$values[4])*pc$vectors[,4]
L5 = sqrt(pc$values[5])*pc$vectors[,5]

L = cbind(L1,L2,L3,L4,L5)
L
##                 L1           L2           L3          L4          L5
##  [1,]  0.833616803 -0.006996037  0.041107819  0.02382372 -0.01183227
##  [2,]  0.015804805 -0.455170876 -0.135554316 -0.48644034  0.33121658
##  [3,] -0.035794541 -0.191467906  0.691219105  0.05654696  0.01721958
##  [4,] -0.032453724  0.035161220  0.118202823  0.60454936  0.41907545
##  [5,]  0.028130307  0.430704078  0.102105943 -0.26284076 -0.59767578
##  [6,]  0.834159640 -0.009399798 -0.008801050  0.03139184  0.03241260
##  [7,] -0.007803674 -0.339208846 -0.031082781  0.51135967 -0.43389013
##  [8,]  0.015747963  0.513755228  0.120443772 -0.12377220  0.41056489
##  [9,]  0.016380225 -0.477478918  0.001899807 -0.22455927 -0.07778683
## [10,] -0.001768099 -0.069079917  0.706397681 -0.16863253 -0.02481799

##Perform factor analysis

library(psych)
fa <- fa(r = scale_data, 
         covar = TRUE,
         nfactors = 3, 
         rotate = "varimax") 
summary(fa)
## 
## Factor analysis with Call: fa(r = scale_data, nfactors = 3, rotate = "varimax", covar = TRUE)
## 
## Test of the hypothesis that 3 factors are sufficient.
## The degrees of freedom for the model is 18  and the objective function was  0 
## The number of observations was  9651  with Chi Square =  21.21  with prob <  0.27 
## 
## The root mean square of the residuals (RMSA) is  0.01 
## The df corrected root mean square of the residuals is  0.01 
## 
## Tucker Lewis Index of factoring reliability =  0.995
## RMSEA index =  0.004  and the 10 % confidence intervals are  0 0.01
## BIC =  -143.93
load <- fa$loadings

plot(load[,c(1,3)],type="n") # set up plot
text(load[,c(1,3)],labels=names(data),cex=.7)

fa.diagram(load)