########################################################
## LOAD LIBRARY
########################################################
library(psych)
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.95 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
########################################################
## LOAD DATA
########################################################
# Load the patient-level data set from a CSV file.
# If the first trailing command-line argument names an existing file
# (e.g. `Rscript analysis.R path/to/data.csv`) it is used, so the script can
# run unattended; otherwise the interactive file picker is shown as before.
cli_args <- commandArgs(trailingOnly = TRUE)
data_path <- if (length(cli_args) >= 1L && file.exists(cli_args[[1L]])) {
  cli_args[[1L]]
} else {
  file.choose()
}
data <- read.csv(data_path)
# Show column names and types to confirm the file was parsed as expected
str(data)
## 'data.frame': 6000 obs. of 19 variables:
## $ Patient_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 77 54 25 23 70 57 22 39 79 60 ...
## $ gender : chr "Female" "Male" "Male" "Female" ...
## $ bmi : num 33.8 19.2 33.7 32.8 33.7 33 31.9 28.8 41.7 35.8 ...
## $ blood_pressure : int 154 123 141 140 165 145 140 123 177 145 ...
## $ fasting_glucose_level : int 93 94 150 145 90 88 120 88 83 146 ...
## $ insulin_level : num 12.1 4.6 10.8 11.6 18.3 4.8 30.5 3.6 22 15.6 ...
## $ HbA1c_level : num 5.2 5.4 6.9 6.8 5.6 5.5 6.3 5.3 5.3 7.5 ...
## $ cholesterol_level : int 242 212 247 195 217 217 197 204 257 217 ...
## $ triglycerides_level : int 194 76 221 193 170 234 178 109 144 243 ...
## $ physical_activity_level : chr "Low" "High" "Low" "Low" ...
## $ daily_calorie_intake : int 2169 1881 2811 2826 2610 2263 2247 2218 2553 2650 ...
## $ sugar_intake_grams_per_day: num 78.4 16.5 147.9 98.3 65.8 ...
## $ sleep_hours : num 8.1 6.6 6.7 4.4 9.1 5.4 4 7.9 7.7 7.7 ...
## $ stress_level : int 4 3 10 9 5 5 10 5 5 6 ...
## $ family_history_diabetes : chr "No" "No" "Yes" "Yes" ...
## $ waist_circumference_cm : num 101.1 60 114.7 96.6 107.4 ...
## $ diabetes_risk_score : num 52.3 3.7 87.3 76.1 47.7 40.2 56.9 5.7 81.2 94.3 ...
## $ diabetes_risk_category : chr "Prediabetes" "Low Risk" "High Risk" "High Risk" ...
# Preview the first six rows of the raw data
head(data, n = 6L)
## Patient_ID age gender bmi blood_pressure fasting_glucose_level insulin_level
## 1 1 77 Female 33.8 154 93 12.1
## 2 2 54 Male 19.2 123 94 4.6
## 3 3 25 Male 33.7 141 150 10.8
## 4 4 23 Female 32.8 140 145 11.6
## 5 5 70 Male 33.7 165 90 18.3
## 6 6 57 Female 33.0 145 88 4.8
## HbA1c_level cholesterol_level triglycerides_level physical_activity_level
## 1 5.2 242 194 Low
## 2 5.4 212 76 High
## 3 6.9 247 221 Low
## 4 6.8 195 193 Low
## 5 5.6 217 170 Moderate
## 6 5.5 217 234 Low
## daily_calorie_intake sugar_intake_grams_per_day sleep_hours stress_level
## 1 2169 78.4 8.1 4
## 2 1881 16.5 6.6 3
## 3 2811 147.9 6.7 10
## 4 2826 98.3 4.4 9
## 5 2610 65.8 9.1 5
## 6 2263 86.7 5.4 5
## family_history_diabetes waist_circumference_cm diabetes_risk_score
## 1 No 101.1 52.3
## 2 No 60.0 3.7
## 3 Yes 114.7 87.3
## 4 Yes 96.6 76.1
## 5 Yes 107.4 47.7
## 6 No 105.2 40.2
## diabetes_risk_category
## 1 Prediabetes
## 2 Low Risk
## 3 High Risk
## 4 High Risk
## 5 Prediabetes
## 6 Prediabetes
########################################################
## DROP NON-INDICATOR VARIABLES
########################################################
# Remove the patient identifier, the character-typed descriptor columns and
# the risk category label.
# NOTE(review): diabetes_risk_score stays in the indicator set although its
# categorical counterpart is dropped here - confirm that this is intended.
data_clean <- data %>%
  select(
    -c(
      Patient_ID,
      gender,
      physical_activity_level,
      family_history_diabetes,
      diabetes_risk_category
    )
  )
########################################################
## DROP QUALITATIVE VARIABLES
########################################################
# Keep only the numeric columns
data_numeric <- select(data_clean, where(is.numeric))
# Check which variables are used
names(data_numeric)
## [1] "age" "bmi"
## [3] "blood_pressure" "fasting_glucose_level"
## [5] "insulin_level" "HbA1c_level"
## [7] "cholesterol_level" "triglycerides_level"
## [9] "daily_calorie_intake" "sugar_intake_grams_per_day"
## [11] "sleep_hours" "stress_level"
## [13] "waist_circumference_cm" "diabetes_risk_score"
########################################################
## CHECK & HANDLE MISSING VALUES
########################################################
# Total number of NA cells
data_numeric %>%
  is.na() %>%
  sum()
## [1] 0
# NA count per column
data_numeric %>%
  is.na() %>%
  colSums()
## age bmi
## 0 0
## blood_pressure fasting_glucose_level
## 0 0
## insulin_level HbA1c_level
## 0 0
## cholesterol_level triglycerides_level
## 0 0
## daily_calorie_intake sugar_intake_grams_per_day
## 0 0
## sleep_hours stress_level
## 0 0
## waist_circumference_cm diabetes_risk_score
## 0 0
# Replace NA values in every column with that column's median
data_numeric <- mutate(
  data_numeric,
  across(
    everything(),
    function(col) ifelse(is.na(col), median(col, na.rm = TRUE), col)
  )
)
########################################################
## DATA STANDARDISATION
########################################################
# Centre each column on its mean and divide by its standard deviation
data_scaled <- scale(data_numeric, center = TRUE, scale = TRUE)
# Correlation matrix
cor_matrix <- cor(data_scaled, method = "pearson")
########################################################
## DESCRIPTIVE STATISTICS
########################################################
# Quartiles, mean and range of every (unscaled) numeric column
data_numeric %>% summary()
## age bmi blood_pressure fasting_glucose_level
## Min. :20.00 Min. :16.00 Min. : 91.0 Min. : 60.0
## 1st Qu.:36.00 1st Qu.:26.90 1st Qu.:132.0 1st Qu.: 87.0
## Median :53.00 Median :32.75 Median :144.0 Median : 96.0
## Mean :52.23 Mean :32.94 Mean :144.5 Mean :106.1
## 3rd Qu.:68.00 3rd Qu.:38.50 3rd Qu.:157.0 3rd Qu.:114.0
## Max. :84.00 Max. :50.00 Max. :200.0 Max. :281.0
## insulin_level HbA1c_level cholesterol_level triglycerides_level
## Min. : 2.00 Min. : 4.100 Min. :139.0 Min. : 50.0
## 1st Qu.: 7.10 1st Qu.: 5.200 1st Qu.:200.0 1st Qu.:137.0
## Median :13.80 Median : 5.500 Median :217.0 Median :173.0
## Mean :15.08 Mean : 5.785 Mean :217.5 Mean :177.4
## 3rd Qu.:21.00 3rd Qu.: 6.100 3rd Qu.:234.0 3rd Qu.:213.0
## Max. :55.90 Max. :11.000 Max. :309.0 Max. :383.0
## daily_calorie_intake sugar_intake_grams_per_day sleep_hours
## Min. :1200 Min. : 0.00 Min. : 4.000
## 1st Qu.:2055 1st Qu.: 35.10 1st Qu.: 6.100
## Median :2385 Median : 58.30 Median : 7.100
## Mean :2480 Mean : 66.56 Mean : 6.999
## 3rd Qu.:2848 3rd Qu.: 89.30 3rd Qu.: 8.000
## Max. :5249 Max. :255.00 Max. :10.000
## stress_level waist_circumference_cm diabetes_risk_score
## Min. : 1.000 Min. : 60.0 Min. : 0.00
## 1st Qu.: 3.000 1st Qu.: 84.7 1st Qu.: 12.90
## Median : 5.000 Median :104.6 Median : 43.90
## Mean : 5.087 Mean :105.0 Mean : 48.69
## 3rd Qu.: 7.000 3rd Qu.:124.2 3rd Qu.: 88.80
## Max. :10.000 Max. :150.0 Max. :100.00
# Extended descriptives (sd, trimmed mean, mad, skew, kurtosis, se)
psych::describe(x = data_numeric)
## vars n mean sd median trimmed mad
## age 1 6000 52.23 18.76 53.00 52.27 23.72
## bmi 2 6000 32.94 7.51 32.75 32.78 8.52
## blood_pressure 3 6000 144.48 17.87 144.00 144.39 19.27
## fasting_glucose_level 4 6000 106.11 31.21 96.00 100.24 17.79
## insulin_level 5 6000 15.08 9.25 13.80 14.18 10.08
## HbA1c_level 6 6000 5.78 0.96 5.50 5.62 0.59
## cholesterol_level 7 6000 217.49 24.89 217.00 217.29 25.20
## triglycerides_level 8 6000 177.35 51.90 173.00 175.09 56.34
## daily_calorie_intake 9 6000 2479.55 548.41 2385.00 2440.79 572.28
## sugar_intake_grams_per_day 10 6000 66.56 39.58 58.30 62.29 38.10
## sleep_hours 11 6000 7.00 1.33 7.10 7.06 1.33
## stress_level 12 6000 5.09 2.34 5.00 5.03 2.97
## waist_circumference_cm 13 6000 105.05 25.08 104.60 104.72 29.36
## diabetes_risk_score 14 6000 48.69 37.24 43.90 48.36 52.48
## min max range skew kurtosis se
## age 20.0 84.0 64.0 -0.02 -1.21 0.24
## bmi 16.0 50.0 34.0 0.15 -0.71 0.10
## blood_pressure 91.0 200.0 109.0 0.06 -0.35 0.23
## fasting_glucose_level 60.0 281.0 221.0 2.00 4.38 0.40
## insulin_level 2.0 55.9 53.9 0.76 -0.01 0.12
## HbA1c_level 4.1 11.0 6.9 1.86 4.00 0.01
## cholesterol_level 139.0 309.0 170.0 0.09 -0.15 0.32
## triglycerides_level 50.0 383.0 333.0 0.37 -0.30 0.67
## daily_calorie_intake 1200.0 5249.0 4049.0 0.67 0.27 7.08
## sugar_intake_grams_per_day 0.0 255.0 255.0 0.93 0.52 0.51
## sleep_hours 4.0 10.0 6.0 -0.35 -0.33 0.02
## stress_level 1.0 10.0 9.0 0.17 -0.69 0.03
## waist_circumference_cm 60.0 150.0 90.0 0.09 -0.96 0.32
## diabetes_risk_score 0.0 100.0 100.0 0.14 -1.49 0.48
########################################################
## CORRELATION HEATMAP
########################################################
# Open a fresh graphics device with the requested size. dev.new() picks the
# platform's default device and forwards width/height to it, whereas
# windows() only exists on Windows builds of R and errors elsewhere.
dev.new(width = 12, height = 10)
# Upper-triangle heatmap, variables ordered by hierarchical clustering,
# with the correlation coefficient printed in every cell.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.8,
  number.cex = 0.6,
  col = colorRampPalette(c("red", "white", "blue"))(200),
  mar = c(0, 0, 2, 0),
  title = "Diabetes Risk"
)
# Correlation matrix rounded to two decimals
round(cor_matrix, 2)
## age bmi blood_pressure fasting_glucose_level
## age 1.00 0.25 0.64 -0.01
## bmi 0.25 1.00 0.72 0.47
## blood_pressure 0.64 0.72 1.00 0.33
## fasting_glucose_level -0.01 0.47 0.33 1.00
## insulin_level -0.02 0.53 0.35 0.41
## HbA1c_level -0.02 0.47 0.33 0.98
## cholesterol_level 0.53 0.70 0.68 0.29
## triglycerides_level 0.10 0.77 0.54 0.58
## daily_calorie_intake 0.00 0.89 0.53 0.44
## sugar_intake_grams_per_day -0.01 0.56 0.39 0.61
## sleep_hours 0.01 -0.30 -0.23 -0.36
## stress_level -0.03 0.35 0.42 0.40
## waist_circumference_cm 0.22 0.96 0.71 0.49
## diabetes_risk_score 0.26 0.88 0.69 0.68
## insulin_level HbA1c_level cholesterol_level
## age -0.02 -0.02 0.53
## bmi 0.53 0.47 0.70
## blood_pressure 0.35 0.33 0.68
## fasting_glucose_level 0.41 0.98 0.29
## insulin_level 1.00 0.41 0.33
## HbA1c_level 0.41 1.00 0.28
## cholesterol_level 0.33 0.28 1.00
## triglycerides_level 0.53 0.57 0.51
## daily_calorie_intake 0.47 0.43 0.54
## sugar_intake_grams_per_day 0.51 0.60 0.34
## sleep_hours -0.33 -0.35 -0.18
## stress_level 0.36 0.39 0.22
## waist_circumference_cm 0.53 0.48 0.67
## diabetes_risk_score 0.59 0.66 0.64
## triglycerides_level daily_calorie_intake
## age 0.10 0.00
## bmi 0.77 0.89
## blood_pressure 0.54 0.53
## fasting_glucose_level 0.58 0.44
## insulin_level 0.53 0.47
## HbA1c_level 0.57 0.43
## cholesterol_level 0.51 0.54
## triglycerides_level 1.00 0.69
## daily_calorie_intake 0.69 1.00
## sugar_intake_grams_per_day 0.85 0.51
## sleep_hours -0.37 -0.28
## stress_level 0.41 0.33
## waist_circumference_cm 0.76 0.85
## diabetes_risk_score 0.78 0.75
## sugar_intake_grams_per_day sleep_hours stress_level
## age -0.01 0.01 -0.03
## bmi 0.56 -0.30 0.35
## blood_pressure 0.39 -0.23 0.42
## fasting_glucose_level 0.61 -0.36 0.40
## insulin_level 0.51 -0.33 0.36
## HbA1c_level 0.60 -0.35 0.39
## cholesterol_level 0.34 -0.18 0.22
## triglycerides_level 0.85 -0.37 0.41
## daily_calorie_intake 0.51 -0.28 0.33
## sugar_intake_grams_per_day 1.00 -0.39 0.42
## sleep_hours -0.39 1.00 -0.32
## stress_level 0.42 -0.32 1.00
## waist_circumference_cm 0.57 -0.32 0.45
## diabetes_risk_score 0.65 -0.37 0.46
## waist_circumference_cm diabetes_risk_score
## age 0.22 0.26
## bmi 0.96 0.88
## blood_pressure 0.71 0.69
## fasting_glucose_level 0.49 0.68
## insulin_level 0.53 0.59
## HbA1c_level 0.48 0.66
## cholesterol_level 0.67 0.64
## triglycerides_level 0.76 0.78
## daily_calorie_intake 0.85 0.75
## sugar_intake_grams_per_day 0.57 0.65
## sleep_hours -0.32 -0.37
## stress_level 0.45 0.46
## waist_circumference_cm 1.00 0.88
## diabetes_risk_score 0.88 1.00
########################################################
## ASSUMPTION TESTS (KMO & BARTLETT)
########################################################
# Kaiser-Meyer-Olkin factor adequacy, computed from the correlation matrix
kmo_result <- KMO(r = cor_matrix)
print(kmo_result)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor_matrix)
## Overall MSA = 0.87
## MSA for each item =
## age bmi
## 0.53 0.83
## blood_pressure fasting_glucose_level
## 0.85 0.78
## insulin_level HbA1c_level
## 0.96 0.79
## cholesterol_level triglycerides_level
## 0.95 0.89
## daily_calorie_intake sugar_intake_grams_per_day
## 0.91 0.86
## sleep_hours stress_level
## 0.98 0.77
## waist_circumference_cm diabetes_risk_score
## 0.91 0.96
# MSA for each variable (unrounded)
kmo_result[["MSAi"]]
## age bmi
## 0.5294325 0.8348306
## blood_pressure fasting_glucose_level
## 0.8541390 0.7840502
## insulin_level HbA1c_level
## 0.9636213 0.7884565
## cholesterol_level triglycerides_level
## 0.9524065 0.8897510
## daily_calorie_intake sugar_intake_grams_per_day
## 0.9059523 0.8574375
## sleep_hours stress_level
## 0.9771369 0.7667277
## waist_circumference_cm diabetes_risk_score
## 0.9061635 0.9575505
# Bartlett's test on the correlation matrix; n is the number of observations
cortest.bartlett(R = cor_matrix, n = NROW(data_scaled))
## $chisq
## [1] 98647.54
##
## $p.value
## [1] 0
##
## $df
## [1] 91
########################################################
## PCA
########################################################
# Principal component analysis on the standardized matrix.
# center/scale. are requested again here even though data_scaled comes from
# scale(), so the PCA is based on standardized variables either way.
pca_result <- prcomp(x = data_scaled, center = TRUE, scale. = TRUE)
# Standard deviation and (cumulative) proportion of variance per component
print(summary(pca_result))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.7359 1.3970 1.0206 0.92815 0.83880 0.77342 0.74449
## Proportion of Variance 0.5347 0.1394 0.0744 0.06153 0.05026 0.04273 0.03959
## Cumulative Proportion 0.5347 0.6741 0.7485 0.80999 0.86025 0.90297 0.94256
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.56025 0.38617 0.34543 0.31864 0.26941 0.15962 0.14892
## Proportion of Variance 0.02242 0.01065 0.00852 0.00725 0.00518 0.00182 0.00158
## Cumulative Proportion 0.96498 0.97564 0.98416 0.99141 0.99660 0.99842 1.00000
# Eigenvalues are the squared component standard deviations
eigen_values <- pca_result[["sdev"]]^2
print(eigen_values)
## [1] 7.48531246 1.95149057 1.04160443 0.86146359 0.70358395 0.59817990
## [7] 0.55426931 0.31387544 0.14912476 0.11932499 0.10153319 0.07258415
## [13] 0.02547715 0.02217611
# Scree plot of the PCA result with value labels on the bars
pca_result %>% fviz_eig(addlabels = TRUE)
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.

# Rotation matrix returned by prcomp(): one column per principal component
loadings_pca <- pca_result[["rotation"]]
# First three components, rounded to two decimals
round(loadings_pca[, seq_len(3)], digits = 2)
## PC1 PC2 PC3
## age -0.09 0.56 -0.49
## bmi -0.33 0.16 0.26
## blood_pressure -0.27 0.37 -0.20
## fasting_glucose_level -0.26 -0.33 -0.40
## insulin_level -0.23 -0.14 0.20
## HbA1c_level -0.26 -0.33 -0.41
## cholesterol_level -0.25 0.38 -0.05
## triglycerides_level -0.32 -0.08 0.11
## daily_calorie_intake -0.30 0.04 0.43
## sugar_intake_grams_per_day -0.28 -0.24 -0.02
## sleep_hours 0.16 0.21 0.14
## stress_level -0.20 -0.16 -0.11
## waist_circumference_cm -0.33 0.13 0.23
## diabetes_risk_score -0.35 0.03 -0.01
########################################################
## FA
########################################################
# Parallel analysis compares the observed eigenvalues with those obtained
# from random and resampled data, so its outcome depends on the random
# number stream. Fix the seed so the suggested number of factors is
# reproducible from run to run.
set.seed(123)
fa.parallel(data_scaled, fa = "fa", n.iter = 100)

## Parallel analysis suggests that the number of factors = 5 and the number of components = NA
# Maximum-likelihood factor analysis with five factors and varimax rotation
fa_result <- fa(
  r = data_scaled,
  nfactors = 5,
  rotate = "varimax",
  fm = "ml"
)
# Print the solution, hiding loadings below 0.4
print(fa_result, cut = 0.4)
## Factor Analysis using method = ml
## Call: fa(r = data_scaled, nfactors = 5, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML3 ML2 ML4 ML5 h2 u2 com
## age 0.99 1.00 0.0050 1.0
## bmi 0.91 1.00 0.0049 1.4
## blood_pressure 0.51 0.66 0.81 0.1858 2.5
## fasting_glucose_level 0.93 1.00 0.0050 1.3
## insulin_level 0.43 0.38 0.6241 3.0
## HbA1c_level 0.91 0.96 0.0402 1.3
## cholesterol_level 0.56 0.53 0.63 0.3660 2.2
## triglycerides_level 0.57 0.64 0.85 0.1457 2.6
## daily_calorie_intake 0.88 0.85 0.1499 1.2
## sugar_intake_grams_per_day 0.87 0.99 0.0077 1.6
## sleep_hours 0.20 0.7988 3.9
## stress_level 0.87 0.86 0.1413 1.3
## waist_circumference_cm 0.86 0.94 0.0572 1.6
## diabetes_risk_score 0.70 0.43 0.89 0.1107 2.7
##
## ML1 ML3 ML2 ML4 ML5
## SS loadings 4.19 2.37 1.91 1.73 1.15
## Proportion Var 0.30 0.17 0.14 0.12 0.08
## Cumulative Var 0.30 0.47 0.61 0.73 0.81
## Proportion Explained 0.37 0.21 0.17 0.15 0.10
## Cumulative Proportion 0.37 0.58 0.75 0.90 1.00
##
## Mean item complexity = 2
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 91 with the objective function = 16.46 with Chi Square = 98647.54
## df of the model are 31 and the objective function was 0.12
##
## The root mean square of the residuals (RMSR) is 0.01
## The df corrected root mean square of the residuals is 0.02
##
## The harmonic n.obs is 6000 with the empirical chi square 60.69 with prob < 0.0011
## The total n.obs was 6000 with Likelihood Chi Square = 705.05 with prob < 2.1e-128
##
## Tucker Lewis Index of factoring reliability = 0.98
## RMSEA index = 0.06 and the 90 % confidence intervals are 0.056 0.064
## BIC = 435.37
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML3 ML2 ML4 ML5
## Correlation of (regression) scores with factors 1.00 1.00 1.00 0.99 0.92
## Multiple R square of scores with factors 0.99 0.99 0.99 0.98 0.84
## Minimum correlation of possible factor scores 0.98 0.98 0.99 0.96 0.69
# Communality of every variable under the fitted factor model
fa_result[["communality"]]
## age bmi
## 0.9950013 0.9950602
## blood_pressure fasting_glucose_level
## 0.8141633 0.9950038
## insulin_level HbA1c_level
## 0.3758905 0.9597864
## cholesterol_level triglycerides_level
## 0.6339524 0.8543336
## daily_calorie_intake sugar_intake_grams_per_day
## 0.8500642 0.9922554
## sleep_hours stress_level
## 0.2012363 0.8587106
## waist_circumference_cm diabetes_risk_score
## 0.9427882 0.8892518
# SS loadings and proportion of variance accounted for by each factor
fa_result[["Vaccounted"]]
## ML1 ML3 ML2 ML4 ML5
## SS loadings 4.1881882 2.3723525 1.9138739 1.7289318 1.15415153
## Proportion Var 0.2991563 0.1694538 0.1367053 0.1234951 0.08243939
## Cumulative Var 0.2991563 0.4686101 0.6053153 0.7288105 0.81124985
## Proportion Explained 0.3687598 0.2088799 0.1685119 0.1522282 0.10162023
## Cumulative Proportion 0.3687598 0.5776396 0.7461515 0.8983798 1.00000000