########################################################
## LOAD LIBRARY
########################################################
library(psych)
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.95 loaded
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
########################################################
## LOAD DATA
########################################################
# Resolve the input CSV. A command-line argument that names an existing file
# (e.g. `Rscript script.R patients.csv`) is used when one is supplied, so the
# script can also run unattended; otherwise fall back to the interactive file
# picker exactly as before.
cli_args <- commandArgs(trailingOnly = TRUE)
csv_candidates <- cli_args[file.exists(cli_args) & !dir.exists(cli_args)]
data_path <- if (length(csv_candidates) > 0L) {
  csv_candidates[[1L]]
} else {
  file.choose()
}
data <- read.csv(data_path)

# Inspect the column types and the first few values of every variable.
str(data)
## 'data.frame':    6000 obs. of  19 variables:
##  $ Patient_ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age                       : int  77 54 25 23 70 57 22 39 79 60 ...
##  $ gender                    : chr  "Female" "Male" "Male" "Female" ...
##  $ bmi                       : num  33.8 19.2 33.7 32.8 33.7 33 31.9 28.8 41.7 35.8 ...
##  $ blood_pressure            : int  154 123 141 140 165 145 140 123 177 145 ...
##  $ fasting_glucose_level     : int  93 94 150 145 90 88 120 88 83 146 ...
##  $ insulin_level             : num  12.1 4.6 10.8 11.6 18.3 4.8 30.5 3.6 22 15.6 ...
##  $ HbA1c_level               : num  5.2 5.4 6.9 6.8 5.6 5.5 6.3 5.3 5.3 7.5 ...
##  $ cholesterol_level         : int  242 212 247 195 217 217 197 204 257 217 ...
##  $ triglycerides_level       : int  194 76 221 193 170 234 178 109 144 243 ...
##  $ physical_activity_level   : chr  "Low" "High" "Low" "Low" ...
##  $ daily_calorie_intake      : int  2169 1881 2811 2826 2610 2263 2247 2218 2553 2650 ...
##  $ sugar_intake_grams_per_day: num  78.4 16.5 147.9 98.3 65.8 ...
##  $ sleep_hours               : num  8.1 6.6 6.7 4.4 9.1 5.4 4 7.9 7.7 7.7 ...
##  $ stress_level              : int  4 3 10 9 5 5 10 5 5 6 ...
##  $ family_history_diabetes   : chr  "No" "No" "Yes" "Yes" ...
##  $ waist_circumference_cm    : num  101.1 60 114.7 96.6 107.4 ...
##  $ diabetes_risk_score       : num  52.3 3.7 87.3 76.1 47.7 40.2 56.9 5.7 81.2 94.3 ...
##  $ diabetes_risk_category    : chr  "Prediabetes" "Low Risk" "High Risk" "High Risk" ...
# Preview the first six rows (the default for head()).
head(data, n = 6L)
##   Patient_ID age gender  bmi blood_pressure fasting_glucose_level insulin_level
## 1          1  77 Female 33.8            154                    93          12.1
## 2          2  54   Male 19.2            123                    94           4.6
## 3          3  25   Male 33.7            141                   150          10.8
## 4          4  23 Female 32.8            140                   145          11.6
## 5          5  70   Male 33.7            165                    90          18.3
## 6          6  57 Female 33.0            145                    88           4.8
##   HbA1c_level cholesterol_level triglycerides_level physical_activity_level
## 1         5.2               242                 194                     Low
## 2         5.4               212                  76                    High
## 3         6.9               247                 221                     Low
## 4         6.8               195                 193                     Low
## 5         5.6               217                 170                Moderate
## 6         5.5               217                 234                     Low
##   daily_calorie_intake sugar_intake_grams_per_day sleep_hours stress_level
## 1                 2169                       78.4         8.1            4
## 2                 1881                       16.5         6.6            3
## 3                 2811                      147.9         6.7           10
## 4                 2826                       98.3         4.4            9
## 5                 2610                       65.8         9.1            5
## 6                 2263                       86.7         5.4            5
##   family_history_diabetes waist_circumference_cm diabetes_risk_score
## 1                      No                  101.1                52.3
## 2                      No                   60.0                 3.7
## 3                     Yes                  114.7                87.3
## 4                     Yes                   96.6                76.1
## 5                     Yes                  107.4                47.7
## 6                      No                  105.2                40.2
##   diabetes_risk_category
## 1            Prediabetes
## 2               Low Risk
## 3              High Risk
## 4              High Risk
## 5            Prediabetes
## 6            Prediabetes
########################################################
## HAPUS VARIABEL NON-INDIKATOR
########################################################
# Drop the patient identifier and the four character columns; they are not
# used as indicators in the analysis below.
dropped_columns <- c(
  "Patient_ID",
  "gender",
  "physical_activity_level",
  "family_history_diabetes",
  "diabetes_risk_category"
)
data_clean <- select(data, -all_of(dropped_columns))
########################################################
## HAPUS VARIABEL KUALITATIF
########################################################
# Keep only the numeric variables
data_numeric <- select(data_clean, where(is.numeric))

# Check which variables are used
names(data_numeric)
##  [1] "age"                        "bmi"                       
##  [3] "blood_pressure"             "fasting_glucose_level"     
##  [5] "insulin_level"              "HbA1c_level"               
##  [7] "cholesterol_level"          "triglycerides_level"       
##  [9] "daily_calorie_intake"       "sugar_intake_grams_per_day"
## [11] "sleep_hours"                "stress_level"              
## [13] "waist_circumference_cm"     "diabetes_risk_score"
########################################################
## CEK & TANGANI MISSING VALUE
########################################################
# Total number of missing cells across the whole numeric table
data_numeric %>%
  is.na() %>%
  sum()
## [1] 0
# Number of missing values in each column
data_numeric %>%
  is.na() %>%
  colSums()
##                        age                        bmi 
##                          0                          0 
##             blood_pressure      fasting_glucose_level 
##                          0                          0 
##              insulin_level                HbA1c_level 
##                          0                          0 
##          cholesterol_level        triglycerides_level 
##                          0                          0 
##       daily_calorie_intake sugar_intake_grams_per_day 
##                          0                          0 
##                sleep_hours               stress_level 
##                          0                          0 
##     waist_circumference_cm        diabetes_risk_score 
##                          0                          0
# Fill missing values with the column median.
# The NA check above reports zero missing values for this dataset, so this
# step only matters if the input file changes.
impute_median <- function(column) {
  ifelse(is.na(column), median(column, na.rm = TRUE), column)
}
data_numeric <- mutate(data_numeric, across(everything(), impute_median))
########################################################
## STANDARISASI DATA
########################################################
# Standardise every variable to mean 0 and standard deviation 1.
data_scaled <- scale(data_numeric, center = TRUE, scale = TRUE)

# Pearson correlation matrix of the standardised variables
cor_matrix <- cor(data_scaled, method = "pearson")
########################################################
## STATISTIK DESKRIPTIF
########################################################
# Min, quartiles, mean and max of each (unscaled) numeric variable
data_numeric %>%
  summary()
##       age             bmi        blood_pressure  fasting_glucose_level
##  Min.   :20.00   Min.   :16.00   Min.   : 91.0   Min.   : 60.0        
##  1st Qu.:36.00   1st Qu.:26.90   1st Qu.:132.0   1st Qu.: 87.0        
##  Median :53.00   Median :32.75   Median :144.0   Median : 96.0        
##  Mean   :52.23   Mean   :32.94   Mean   :144.5   Mean   :106.1        
##  3rd Qu.:68.00   3rd Qu.:38.50   3rd Qu.:157.0   3rd Qu.:114.0        
##  Max.   :84.00   Max.   :50.00   Max.   :200.0   Max.   :281.0        
##  insulin_level    HbA1c_level     cholesterol_level triglycerides_level
##  Min.   : 2.00   Min.   : 4.100   Min.   :139.0     Min.   : 50.0      
##  1st Qu.: 7.10   1st Qu.: 5.200   1st Qu.:200.0     1st Qu.:137.0      
##  Median :13.80   Median : 5.500   Median :217.0     Median :173.0      
##  Mean   :15.08   Mean   : 5.785   Mean   :217.5     Mean   :177.4      
##  3rd Qu.:21.00   3rd Qu.: 6.100   3rd Qu.:234.0     3rd Qu.:213.0      
##  Max.   :55.90   Max.   :11.000   Max.   :309.0     Max.   :383.0      
##  daily_calorie_intake sugar_intake_grams_per_day  sleep_hours    
##  Min.   :1200         Min.   :  0.00             Min.   : 4.000  
##  1st Qu.:2055         1st Qu.: 35.10             1st Qu.: 6.100  
##  Median :2385         Median : 58.30             Median : 7.100  
##  Mean   :2480         Mean   : 66.56             Mean   : 6.999  
##  3rd Qu.:2848         3rd Qu.: 89.30             3rd Qu.: 8.000  
##  Max.   :5249         Max.   :255.00             Max.   :10.000  
##   stress_level    waist_circumference_cm diabetes_risk_score
##  Min.   : 1.000   Min.   : 60.0          Min.   :  0.00     
##  1st Qu.: 3.000   1st Qu.: 84.7          1st Qu.: 12.90     
##  Median : 5.000   Median :104.6          Median : 43.90     
##  Mean   : 5.087   Mean   :105.0          Mean   : 48.69     
##  3rd Qu.: 7.000   3rd Qu.:124.2          3rd Qu.: 88.80     
##  Max.   :10.000   Max.   :150.0          Max.   :100.00
# Extended descriptive statistics (sd, trimmed mean, mad, skew, kurtosis, se)
data_numeric %>%
  psych::describe()
##                            vars    n    mean     sd  median trimmed    mad
## age                           1 6000   52.23  18.76   53.00   52.27  23.72
## bmi                           2 6000   32.94   7.51   32.75   32.78   8.52
## blood_pressure                3 6000  144.48  17.87  144.00  144.39  19.27
## fasting_glucose_level         4 6000  106.11  31.21   96.00  100.24  17.79
## insulin_level                 5 6000   15.08   9.25   13.80   14.18  10.08
## HbA1c_level                   6 6000    5.78   0.96    5.50    5.62   0.59
## cholesterol_level             7 6000  217.49  24.89  217.00  217.29  25.20
## triglycerides_level           8 6000  177.35  51.90  173.00  175.09  56.34
## daily_calorie_intake          9 6000 2479.55 548.41 2385.00 2440.79 572.28
## sugar_intake_grams_per_day   10 6000   66.56  39.58   58.30   62.29  38.10
## sleep_hours                  11 6000    7.00   1.33    7.10    7.06   1.33
## stress_level                 12 6000    5.09   2.34    5.00    5.03   2.97
## waist_circumference_cm       13 6000  105.05  25.08  104.60  104.72  29.36
## diabetes_risk_score          14 6000   48.69  37.24   43.90   48.36  52.48
##                               min    max  range  skew kurtosis   se
## age                          20.0   84.0   64.0 -0.02    -1.21 0.24
## bmi                          16.0   50.0   34.0  0.15    -0.71 0.10
## blood_pressure               91.0  200.0  109.0  0.06    -0.35 0.23
## fasting_glucose_level        60.0  281.0  221.0  2.00     4.38 0.40
## insulin_level                 2.0   55.9   53.9  0.76    -0.01 0.12
## HbA1c_level                   4.1   11.0    6.9  1.86     4.00 0.01
## cholesterol_level           139.0  309.0  170.0  0.09    -0.15 0.32
## triglycerides_level          50.0  383.0  333.0  0.37    -0.30 0.67
## daily_calorie_intake       1200.0 5249.0 4049.0  0.67     0.27 7.08
## sugar_intake_grams_per_day    0.0  255.0  255.0  0.93     0.52 0.51
## sleep_hours                   4.0   10.0    6.0 -0.35    -0.33 0.02
## stress_level                  1.0   10.0    9.0  0.17    -0.69 0.03
## waist_circumference_cm       60.0  150.0   90.0  0.09    -0.96 0.32
## diabetes_risk_score           0.0  100.0  100.0  0.14    -1.49 0.48
########################################################
## HEATMAP KORELASI
########################################################
# Open a dedicated 12 x 10 inch graphics device for the heatmap.
# dev.new() replaces windows(), which exists only on Windows builds of R and
# makes the script fail elsewhere. noRStudioGD = TRUE stops RStudio from
# substituting its own device, which does not accept width/height.
dev.new(width = 12, height = 10, noRStudioGD = TRUE)

# Correlation heatmap: upper triangle only, variables ordered by hierarchical
# clustering, coefficients printed in black on a red-white-blue colour ramp.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.8,
  number.cex = 0.6,
  col = colorRampPalette(c("red", "white", "blue"))(200),
  mar = c(0, 0, 2, 0),
  title = "Diabetes Risk"
)

# Print the correlation matrix rounded to two decimals
print(round(cor_matrix, digits = 2))
##                              age   bmi blood_pressure fasting_glucose_level
## age                         1.00  0.25           0.64                 -0.01
## bmi                         0.25  1.00           0.72                  0.47
## blood_pressure              0.64  0.72           1.00                  0.33
## fasting_glucose_level      -0.01  0.47           0.33                  1.00
## insulin_level              -0.02  0.53           0.35                  0.41
## HbA1c_level                -0.02  0.47           0.33                  0.98
## cholesterol_level           0.53  0.70           0.68                  0.29
## triglycerides_level         0.10  0.77           0.54                  0.58
## daily_calorie_intake        0.00  0.89           0.53                  0.44
## sugar_intake_grams_per_day -0.01  0.56           0.39                  0.61
## sleep_hours                 0.01 -0.30          -0.23                 -0.36
## stress_level               -0.03  0.35           0.42                  0.40
## waist_circumference_cm      0.22  0.96           0.71                  0.49
## diabetes_risk_score         0.26  0.88           0.69                  0.68
##                            insulin_level HbA1c_level cholesterol_level
## age                                -0.02       -0.02              0.53
## bmi                                 0.53        0.47              0.70
## blood_pressure                      0.35        0.33              0.68
## fasting_glucose_level               0.41        0.98              0.29
## insulin_level                       1.00        0.41              0.33
## HbA1c_level                         0.41        1.00              0.28
## cholesterol_level                   0.33        0.28              1.00
## triglycerides_level                 0.53        0.57              0.51
## daily_calorie_intake                0.47        0.43              0.54
## sugar_intake_grams_per_day          0.51        0.60              0.34
## sleep_hours                        -0.33       -0.35             -0.18
## stress_level                        0.36        0.39              0.22
## waist_circumference_cm              0.53        0.48              0.67
## diabetes_risk_score                 0.59        0.66              0.64
##                            triglycerides_level daily_calorie_intake
## age                                       0.10                 0.00
## bmi                                       0.77                 0.89
## blood_pressure                            0.54                 0.53
## fasting_glucose_level                     0.58                 0.44
## insulin_level                             0.53                 0.47
## HbA1c_level                               0.57                 0.43
## cholesterol_level                         0.51                 0.54
## triglycerides_level                       1.00                 0.69
## daily_calorie_intake                      0.69                 1.00
## sugar_intake_grams_per_day                0.85                 0.51
## sleep_hours                              -0.37                -0.28
## stress_level                              0.41                 0.33
## waist_circumference_cm                    0.76                 0.85
## diabetes_risk_score                       0.78                 0.75
##                            sugar_intake_grams_per_day sleep_hours stress_level
## age                                             -0.01        0.01        -0.03
## bmi                                              0.56       -0.30         0.35
## blood_pressure                                   0.39       -0.23         0.42
## fasting_glucose_level                            0.61       -0.36         0.40
## insulin_level                                    0.51       -0.33         0.36
## HbA1c_level                                      0.60       -0.35         0.39
## cholesterol_level                                0.34       -0.18         0.22
## triglycerides_level                              0.85       -0.37         0.41
## daily_calorie_intake                             0.51       -0.28         0.33
## sugar_intake_grams_per_day                       1.00       -0.39         0.42
## sleep_hours                                     -0.39        1.00        -0.32
## stress_level                                     0.42       -0.32         1.00
## waist_circumference_cm                           0.57       -0.32         0.45
## diabetes_risk_score                              0.65       -0.37         0.46
##                            waist_circumference_cm diabetes_risk_score
## age                                          0.22                0.26
## bmi                                          0.96                0.88
## blood_pressure                               0.71                0.69
## fasting_glucose_level                        0.49                0.68
## insulin_level                                0.53                0.59
## HbA1c_level                                  0.48                0.66
## cholesterol_level                            0.67                0.64
## triglycerides_level                          0.76                0.78
## daily_calorie_intake                         0.85                0.75
## sugar_intake_grams_per_day                   0.57                0.65
## sleep_hours                                 -0.32               -0.37
## stress_level                                 0.45                0.46
## waist_circumference_cm                       1.00                0.88
## diabetes_risk_score                          0.88                1.00
########################################################
## UJI ASUMSI (KMO & BARTLETT)
########################################################
# Kaiser-Meyer-Olkin measure of sampling adequacy, computed from the
# correlation matrix
kmo_result <- KMO(r = cor_matrix)
print(kmo_result)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor_matrix)
## Overall MSA =  0.87
## MSA for each item = 
##                        age                        bmi 
##                       0.53                       0.83 
##             blood_pressure      fasting_glucose_level 
##                       0.85                       0.78 
##              insulin_level                HbA1c_level 
##                       0.96                       0.79 
##          cholesterol_level        triglycerides_level 
##                       0.95                       0.89 
##       daily_calorie_intake sugar_intake_grams_per_day 
##                       0.91                       0.86 
##                sleep_hours               stress_level 
##                       0.98                       0.77 
##     waist_circumference_cm        diabetes_risk_score 
##                       0.91                       0.96
# MSA for each individual variable (unrounded)
kmo_result[["MSAi"]]
##                        age                        bmi 
##                  0.5294325                  0.8348306 
##             blood_pressure      fasting_glucose_level 
##                  0.8541390                  0.7840502 
##              insulin_level                HbA1c_level 
##                  0.9636213                  0.7884565 
##          cholesterol_level        triglycerides_level 
##                  0.9524065                  0.8897510 
##       daily_calorie_intake sugar_intake_grams_per_day 
##                  0.9059523                  0.8574375 
##                sleep_hours               stress_level 
##                  0.9771369                  0.7667277 
##     waist_circumference_cm        diabetes_risk_score 
##                  0.9061635                  0.9575505
# Bartlett's test on the correlation matrix, using the number of rows in the
# standardised data as the sample size
n_obs <- nrow(data_scaled)
cortest.bartlett(R = cor_matrix, n = n_obs)
## $chisq
## [1] 98647.54
## 
## $p.value
## [1] 0
## 
## $df
## [1] 91
########################################################
## PCA
########################################################
# Principal component analysis. data_scaled is already standardised by
# scale(), so centring and scaling again here leaves the values unchanged.
pca_result <- prcomp(
  x = data_scaled,
  center = TRUE,
  scale. = TRUE
)

# Standard deviation and (cumulative) proportion of variance per component
print(summary(pca_result))
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.7359 1.3970 1.0206 0.92815 0.83880 0.77342 0.74449
## Proportion of Variance 0.5347 0.1394 0.0744 0.06153 0.05026 0.04273 0.03959
## Cumulative Proportion  0.5347 0.6741 0.7485 0.80999 0.86025 0.90297 0.94256
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.56025 0.38617 0.34543 0.31864 0.26941 0.15962 0.14892
## Proportion of Variance 0.02242 0.01065 0.00852 0.00725 0.00518 0.00182 0.00158
## Cumulative Proportion  0.96498 0.97564 0.98416 0.99141 0.99660 0.99842 1.00000
# Eigenvalues are the squared standard deviations of the components
component_sd <- pca_result[["sdev"]]
eigen_values <- component_sd^2
print(eigen_values)
##  [1] 7.48531246 1.95149057 1.04160443 0.86146359 0.70358395 0.59817990
##  [7] 0.55426931 0.31387544 0.14912476 0.11932499 0.10153319 0.07258415
## [13] 0.02547715 0.02217611
# Scree plot with the value of each bar printed as a label
factoextra::fviz_eig(pca_result, addlabels = TRUE)
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.

# Rotation matrix from prcomp(); show the first three components to two
# decimals
loadings_pca <- pca_result[["rotation"]]
first_three <- loadings_pca[, seq_len(3)]
round(first_three, digits = 2)
##                              PC1   PC2   PC3
## age                        -0.09  0.56 -0.49
## bmi                        -0.33  0.16  0.26
## blood_pressure             -0.27  0.37 -0.20
## fasting_glucose_level      -0.26 -0.33 -0.40
## insulin_level              -0.23 -0.14  0.20
## HbA1c_level                -0.26 -0.33 -0.41
## cholesterol_level          -0.25  0.38 -0.05
## triglycerides_level        -0.32 -0.08  0.11
## daily_calorie_intake       -0.30  0.04  0.43
## sugar_intake_grams_per_day -0.28 -0.24 -0.02
## sleep_hours                 0.16  0.21  0.14
## stress_level               -0.20 -0.16 -0.11
## waist_circumference_cm     -0.33  0.13  0.23
## diabetes_risk_score        -0.35  0.03 -0.01
########################################################
## FA
########################################################
# Parallel analysis compares the observed eigenvalues with those obtained from
# random data, so its result depends on the RNG state. Fix the seed so the
# suggested number of factors is reproducible from run to run.
set.seed(123)
fa.parallel(data_scaled, fa = "fa", n.iter = 100)

## Parallel analysis suggests that the number of factors =  5  and the number of components =  NA
# Maximum-likelihood factor analysis with five factors and a varimax rotation
fa_result <- fa(r = data_scaled, nfactors = 5, rotate = "varimax", fm = "ml")

# Print the solution, hiding loadings whose absolute value is below 0.4
print(
  fa_result,
  cut = 0.4
)
## Factor Analysis using method =  ml
## Call: fa(r = data_scaled, nfactors = 5, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                              ML1   ML3   ML2   ML4   ML5   h2     u2 com
## age                                     0.99             1.00 0.0050 1.0
## bmi                         0.91                         1.00 0.0049 1.4
## blood_pressure              0.51        0.66             0.81 0.1858 2.5
## fasting_glucose_level             0.93                   1.00 0.0050 1.3
## insulin_level               0.43                         0.38 0.6241 3.0
## HbA1c_level                       0.91                   0.96 0.0402 1.3
## cholesterol_level           0.56        0.53             0.63 0.3660 2.2
## triglycerides_level         0.57              0.64       0.85 0.1457 2.6
## daily_calorie_intake        0.88                         0.85 0.1499 1.2
## sugar_intake_grams_per_day                    0.87       0.99 0.0077 1.6
## sleep_hours                                              0.20 0.7988 3.9
## stress_level                                        0.87 0.86 0.1413 1.3
## waist_circumference_cm      0.86                         0.94 0.0572 1.6
## diabetes_risk_score         0.70  0.43                   0.89 0.1107 2.7
## 
##                        ML1  ML3  ML2  ML4  ML5
## SS loadings           4.19 2.37 1.91 1.73 1.15
## Proportion Var        0.30 0.17 0.14 0.12 0.08
## Cumulative Var        0.30 0.47 0.61 0.73 0.81
## Proportion Explained  0.37 0.21 0.17 0.15 0.10
## Cumulative Proportion 0.37 0.58 0.75 0.90 1.00
## 
## Mean item complexity =  2
## Test of the hypothesis that 5 factors are sufficient.
## 
## df null model =  91  with the objective function =  16.46 with Chi Square =  98647.54
## df of  the model are 31  and the objective function was  0.12 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.02 
## 
## The harmonic n.obs is  6000 with the empirical chi square  60.69  with prob <  0.0011 
## The total n.obs was  6000  with Likelihood Chi Square =  705.05  with prob <  2.1e-128 
## 
## Tucker Lewis Index of factoring reliability =  0.98
## RMSEA index =  0.06  and the 90 % confidence intervals are  0.056 0.064
## BIC =  435.37
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML3  ML2  ML4  ML5
## Correlation of (regression) scores with factors   1.00 1.00 1.00 0.99 0.92
## Multiple R square of scores with factors          0.99 0.99 0.99 0.98 0.84
## Minimum correlation of possible factor scores     0.98 0.98 0.99 0.96 0.69
# Communality of each variable under the five-factor solution
print(fa_result$communality)
##                        age                        bmi 
##                  0.9950013                  0.9950602 
##             blood_pressure      fasting_glucose_level 
##                  0.8141633                  0.9950038 
##              insulin_level                HbA1c_level 
##                  0.3758905                  0.9597864 
##          cholesterol_level        triglycerides_level 
##                  0.6339524                  0.8543336 
##       daily_calorie_intake sugar_intake_grams_per_day 
##                  0.8500642                  0.9922554 
##                sleep_hours               stress_level 
##                  0.2012363                  0.8587106 
##     waist_circumference_cm        diabetes_risk_score 
##                  0.9427882                  0.8892518
# Variance accounted for by each factor (SS loadings and proportions)
print(fa_result$Vaccounted)
##                             ML1       ML3       ML2       ML4        ML5
## SS loadings           4.1881882 2.3723525 1.9138739 1.7289318 1.15415153
## Proportion Var        0.2991563 0.1694538 0.1367053 0.1234951 0.08243939
## Cumulative Var        0.2991563 0.4686101 0.6053153 0.7288105 0.81124985
## Proportion Explained  0.3687598 0.2088799 0.1685119 0.1522282 0.10162023
## Cumulative Proportion 0.3687598 0.5776396 0.7461515 0.8983798 1.00000000