data <- read.csv('heart-attack-risk-prediction-dataset.csv')
head(data)
## Age Cholesterol Heart.rate Diabetes Family.History Smoking Obesity
## 1 0.59550562 0.31428571 0.04766269 0 0 1 0
## 2 0.59550562 0.09642857 0.04766269 1 1 1 1
## 3 0.59550562 0.18928571 0.04766269 0 0 1 0
## 4 0.07865169 0.96071429 0.07149404 1 1 1 1
## 5 0.07865169 0.79285714 0.07149404 1 0 1 1
## 6 0.07865169 0.77142857 0.07149404 1 1 1 0
## Alcohol.Consumption Exercise.Hours.Per.Week Diet Previous.Heart.Problems
## 1 0 0.20832621 0 0
## 2 1 0.75241972 1 0
## 3 1 0.20099847 2 1
## 4 1 0.09055687 2 1
## 5 0 0.60103027 2 1
## 6 1 0.65886386 2 1
## Medication.Use Stress.Level Sedentary.Hours.Per.Day Income BMI
## 1 0 9 0.5512344 0.86226830 0.6023629
## 2 0 3 0.7107863 0.82542195 0.2744693
## 3 1 9 0.3204783 0.00504123 0.8903316
## 4 0 1 0.4135835 0.94931616 0.4179445
## 5 0 4 0.3150260 0.30019079 0.1694947
## 6 0 2 0.9106280 0.14605991 0.3941747
## Triglycerides Physical.Activity.Days.Per.Week Sleep.Hours.Per.Day
## 1 0.33246753 0 0.3333333
## 2 0.27662338 2 0.6666667
## 3 0.05714286 4 1.0000000
## 4 0.26623377 1 0.5000000
## 5 0.76883117 1 0.1666667
## 6 0.48571429 2 0.6666667
## Heart.Attack.Risk..Binary. Blood.sugar CK.MB Troponin
## 1 0 0.2270176 0.04822879 0.03651237
## 2 0 0.2270176 0.04822879 0.03651237
## 3 0 0.2270176 0.04822879 0.03651237
## 4 0 0.2270176 0.04822879 0.03651237
## 5 0 0.2270176 0.04822879 0.03651237
## 6 0 0.2270176 0.04822879 0.03651237
## Heart.Attack.Risk..Text. Gender Systolic.blood.pressure
## 1 0 Male 0.6000000
## 2 0 Male 0.5741935
## 3 0 Male 0.1870968
## 4 0 Male 0.6451613
## 5 0 Male 0.2516129
## 6 0 Male 0.7354839
## Diastolic.blood.pressure
## 1 0.5348837
## 2 0.5697674
## 3 0.6744186
## 4 0.5930233
## 5 0.3837209
## 6 0.3837209
Age Cholesterol Heart rate Diabetes Family History Smoking Obesity Alcohol Consumption Exercise Hours Per Week Diet
new <- data[,1:10]
sum(is.na(new))
## [1] 1370
p <- ncol(new)
##Handling Missing Value Mengisi Missing Value dengan Mean
data_clean <- new
for(i in 1:ncol(data_clean)) {
data_clean[is.na(data_clean[, i]), i] <- mean(data_clean[, i], na.rm = TRUE)
}
library(psych)
r <- cor(data_clean)
KMO(r)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA = 0.5
## MSA for each item =
## Age Cholesterol Heart.rate
## 0.50 0.43 0.51
## Diabetes Family.History Smoking
## 0.46 0.50 0.50
## Obesity Alcohol.Consumption Exercise.Hours.Per.Week
## 0.48 0.46 0.51
## Diet
## 0.45
##Bartlett test
bartlett.test(data_clean)
##
## Bartlett test of homogeneity of variances
##
## data: data_clean
## Bartlett's K-squared = 74881, df = 9, p-value < 2.2e-16
#Principal Component Analysis ##Manual ##Menghitung Eigen Value dan Eigen Vector
scale_data = scale(data_clean)
r = cov(scale_data)
pc <- eigen(r)
pc$values
## [1] 1.3946952 1.0424853 1.0367978 1.0316120 1.0075230 0.9898937 0.9752606
## [8] 0.9636293 0.9545709 0.6035322
##Menghitung proporsi varians dan kumulatif
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sumvar <- sum(pc$values)
propvar <- (pc$values / sumvar) * 100
cumvar <- data.frame(cbind(value = pc$values, propvar))
cumvar <- cumvar %>% mutate(cum = cumsum(propvar))
rownames(cumvar) <- paste0("PC", seq_len(nrow(cumvar)))
print(cumvar)
## value propvar cum
## PC1 1.3946952 13.946952 13.94695
## PC2 1.0424853 10.424853 24.37181
## PC3 1.0367978 10.367978 34.73978
## PC4 1.0316120 10.316120 45.05590
## PC5 1.0075230 10.075230 55.13113
## PC6 0.9898937 9.898937 65.03007
## PC7 0.9752606 9.752606 74.78268
## PC8 0.9636293 9.636293 84.41897
## PC9 0.9545709 9.545709 93.96468
## PC10 0.6035322 6.035322 100.00000
##Hasil PCA
pc$vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.705873381 -0.006851996 0.040371734 0.02345586 -0.01178801
## [2,] 0.013382877 -0.445799411 -0.133127052 -0.47892927 0.32997770
## [3,] -0.030309386 -0.187525794 0.678841993 0.05567382 0.01715517
## [4,] -0.027480516 0.034437290 0.116086258 0.59521458 0.41750794
## [5,] 0.023819619 0.421836358 0.100277613 -0.25878227 -0.59544023
## [6,] 0.706333034 -0.009206266 -0.008643456 0.03090712 0.03229136
## [7,] -0.006607839 -0.332224911 -0.030526206 0.50346383 -0.43226721
## [8,] 0.013334746 0.503177576 0.118287081 -0.12186105 0.40902921
## [9,] 0.013870120 -0.467648155 0.001865789 -0.22109187 -0.07749587
## [10,] -0.001497156 -0.067657638 0.693748778 -0.16602870 -0.02472516
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.031570715 -0.02221525 0.04234887 -0.00959659 -0.704273427
## [2,] -0.058908331 0.17785849 -0.59453528 0.23171702 -0.053231874
## [3,] 0.008971093 0.38961723 0.27152813 0.52346389 0.008429667
## [4,] 0.290997331 -0.42050076 -0.34615472 0.27128986 -0.032679959
## [5,] 0.294840315 -0.13704056 -0.38646670 0.36707069 -0.010269155
## [6,] 0.011954936 0.02263161 -0.02785256 0.03051326 0.704680256
## [7,] 0.131811903 0.46645619 -0.36926320 -0.27676978 -0.020192744
## [8,] 0.448995510 0.49748317 -0.08945932 -0.30385535 -0.032713171
## [9,] 0.755997840 -0.23042587 0.28892718 -0.13449387 0.005077312
## [10,] -0.182195936 -0.31904717 -0.27058138 -0.52527360 0.042928685
scores <- as.matrix(scale_data) %*% pc$vectors
head(scores)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.6664374 0.1167354 -1.1982696 -0.16967510 -0.1634984 -1.9253742
## [2,] 0.6849606 0.7843595 0.3399694 1.09563333 -0.9415894 1.7696590
## [3,] 0.6843668 1.2108794 0.7028743 -0.58826610 0.4836130 -1.4087135
## [4,] -0.9156311 0.2746476 1.2888754 -0.03615188 0.2553591 -0.2966758
## [5,] -0.9746252 -2.1985287 0.9223091 0.62775794 0.2830905 -0.4351966
## [6,] -0.8834553 0.3122695 1.4431182 -1.18007684 0.7581914 0.9834924
## [,7] [,8] [,9] [,10]
## [1,] 0.08411995 1.753150021 0.48346776 -0.11453364
## [2,] -0.05700067 -0.007424249 -0.42364814 -0.21381336
## [3,] 0.30735605 1.198550466 -1.45387710 -0.06015226
## [4,] 1.07368248 -2.631826740 0.51005629 1.24488539
## [5,] -0.19636881 -0.793617671 0.01659896 1.37398286
## [6,] -0.45081999 -0.910530463 0.64905841 1.33140653
#Factor Analysis Sesuai dengan Kaiser Rule hanya mengambil yang PC >1, sehingga hanya PC 1- PC 5
varcov = cov(scale_data)
pc = eigen(varcov)
pc$values
## [1] 1.3946952 1.0424853 1.0367978 1.0316120 1.0075230 0.9898937 0.9752606
## [8] 0.9636293 0.9545709 0.6035322
pc$vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.705873381 -0.006851996 0.040371734 0.02345586 -0.01178801
## [2,] 0.013382877 -0.445799411 -0.133127052 -0.47892927 0.32997770
## [3,] -0.030309386 -0.187525794 0.678841993 0.05567382 0.01715517
## [4,] -0.027480516 0.034437290 0.116086258 0.59521458 0.41750794
## [5,] 0.023819619 0.421836358 0.100277613 -0.25878227 -0.59544023
## [6,] 0.706333034 -0.009206266 -0.008643456 0.03090712 0.03229136
## [7,] -0.006607839 -0.332224911 -0.030526206 0.50346383 -0.43226721
## [8,] 0.013334746 0.503177576 0.118287081 -0.12186105 0.40902921
## [9,] 0.013870120 -0.467648155 0.001865789 -0.22109187 -0.07749587
## [10,] -0.001497156 -0.067657638 0.693748778 -0.16602870 -0.02472516
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.031570715 -0.02221525 0.04234887 -0.00959659 -0.704273427
## [2,] -0.058908331 0.17785849 -0.59453528 0.23171702 -0.053231874
## [3,] 0.008971093 0.38961723 0.27152813 0.52346389 0.008429667
## [4,] 0.290997331 -0.42050076 -0.34615472 0.27128986 -0.032679959
## [5,] 0.294840315 -0.13704056 -0.38646670 0.36707069 -0.010269155
## [6,] 0.011954936 0.02263161 -0.02785256 0.03051326 0.704680256
## [7,] 0.131811903 0.46645619 -0.36926320 -0.27676978 -0.020192744
## [8,] 0.448995510 0.49748317 -0.08945932 -0.30385535 -0.032713171
## [9,] 0.755997840 -0.23042587 0.28892718 -0.13449387 0.005077312
## [10,] -0.182195936 -0.31904717 -0.27058138 -0.52527360 0.042928685
sp = sum(pc$values[1:5])
L1 = sqrt(pc$values[1])*pc$vectors[,1]
L2 = sqrt(pc$values[2])*pc$vectors[,2]
L3 = sqrt(pc$values[3])*pc$vectors[,3]
L4 = sqrt(pc$values[4])*pc$vectors[,4]
L5 = sqrt(pc$values[5])*pc$vectors[,5]
L = cbind(L1,L2,L3,L4,L5)
L
## L1 L2 L3 L4 L5
## [1,] 0.833616803 -0.006996037 0.041107819 0.02382372 -0.01183227
## [2,] 0.015804805 -0.455170876 -0.135554316 -0.48644034 0.33121658
## [3,] -0.035794541 -0.191467906 0.691219105 0.05654696 0.01721958
## [4,] -0.032453724 0.035161220 0.118202823 0.60454936 0.41907545
## [5,] 0.028130307 0.430704078 0.102105943 -0.26284076 -0.59767578
## [6,] 0.834159640 -0.009399798 -0.008801050 0.03139184 0.03241260
## [7,] -0.007803674 -0.339208846 -0.031082781 0.51135967 -0.43389013
## [8,] 0.015747963 0.513755228 0.120443772 -0.12377220 0.41056489
## [9,] 0.016380225 -0.477478918 0.001899807 -0.22455927 -0.07778683
## [10,] -0.001768099 -0.069079917 0.706397681 -0.16863253 -0.02481799
##Perform factor analysis
library(psych)
fa <- fa(r = scale_data,
covar = TRUE,
nfactors = 3,
rotate = "varimax")
summary(fa)
##
## Factor analysis with Call: fa(r = scale_data, nfactors = 3, rotate = "varimax", covar = TRUE)
##
## Test of the hypothesis that 3 factors are sufficient.
## The degrees of freedom for the model is 18 and the objective function was 0
## The number of observations was 9651 with Chi Square = 21.21 with prob < 0.27
##
## The root mean square of the residuals (RMSA) is 0.01
## The df corrected root mean square of the residuals is 0.01
##
## Tucker Lewis Index of factoring reliability = 0.995
## RMSEA index = 0.004 and the 10 % confidence intervals are 0 0.01
## BIC = -143.93
load <- fa$loadings
plot(load[,c(1,3)],type="n") # set up plot
text(load[,c(1,3)],labels=names(data),cex=.7)
fa.diagram(load)