library(tidyverse)
library(ggplot2)
library(corrplot)
library(ggcorrplot)
library(car)
library(psych)
library(factoextra)
library(FactoMineR)
library(randomForest)
library(knitr)
library(kableExtra)
library(gridExtra)
library(scales)

::: kelompok-box ::: kelompok-title 👥 Kelompok — Tugas Data Reduction :::

📊 Suci Wardatun

🔬 Octa Syahira

🧠 Asyifa

Dataset

Dataset yang digunakan adalah Diabetes Health Indicators Dataset yang bersumber dari Behavioral Risk Factor Surveillance System (BRFSS) 2021, sebuah survei kesehatan telepon tahunan yang diselenggarakan oleh Centers for Disease Control and Prevention (CDC) di Amerika Serikat. Dataset ini tersedia secara publik melalui platform Kaggle (tautan langsung: https://www.kaggle.com/datasets/julnazz/diabetes-health-indicators-dataset).

BRFSS mengumpulkan data dari lebih dari 400.000 responden setiap tahunnya mengenai perilaku kesehatan, kondisi kesehatan kronis, dan penggunaan layanan kesehatan preventif.

Memuat Data

# Location of the BRFSS CSV. The original machine-specific path stays the
# default; set the DIABETES_CSV environment variable to run the report on
# another machine without editing this file.
data_path <- Sys.getenv(
  "DIABETES_CSV",
  unset = "C:/Users/Asyifa/OneDrive/SEMESTER 4/Eksplorasi dan Visualisasi/Project EVD/diabetes_012.csv"
)

# Fail early with a readable message instead of read.csv()'s generic error.
if (!file.exists(data_path)) {
  stop("File data tidak ditemukan: ", data_path, call. = FALSE)
}

# Read the raw survey extract; every later chunk works on `df`.
df <- read.csv(data_path)

# Report the number of rows (observations).
cat("Jumlah Observasi :", nrow(df), "\n")

## Jumlah Observasi : 236378

# Report the number of columns (variables).
cat("Jumlah Variabel  :", ncol(df), "\n")

## Jumlah Variabel  : 22

Struktur Dataset

# Show each column's storage type and first values to check how the CSV was parsed.
str(df)

## 'data.frame':    236378 obs. of  22 variables:
##  $ Diabetes_012        : num  0 2 2 2 0 0 0 2 0 0 ...
##  $ HighBP              : int  0 1 1 0 0 1 1 0 1 1 ...
##  $ HighChol            : num  1 0 1 1 0 0 1 0 1 1 ...
##  $ CholCheck           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BMI                 : num  15 28 33 29 24 40 27 24 30 36 ...
##  $ Smoker              : num  1 0 0 0 1 1 1 0 0 1 ...
##  $ Stroke              : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ HeartDiseaseorAttack: num  0 1 0 1 0 0 0 1 0 0 ...
##  $ PhysActivity        : int  0 0 1 1 0 1 0 0 0 0 ...
##  $ Fruits              : int  1 1 1 1 0 1 0 0 1 0 ...
##  $ Veggies             : int  1 0 1 1 0 1 1 1 1 0 ...
##  $ HvyAlcoholConsump   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AnyHealthcare       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ NoDocbcCost         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ GenHlth             : num  5 2 2 5 3 3 4 4 2 4 ...
##  $ MentHlth            : num  10 0 10 0 0 5 25 0 0 0 ...
##  $ PhysHlth            : num  20 0 0 30 0 25 0 0 0 0 ...
##  $ DiffWalk            : num  0 0 0 1 1 1 0 0 0 0 ...
##  $ Sex                 : int  0 0 0 1 1 0 0 1 0 1 ...
##  $ Age                 : int  11 11 9 12 13 10 10 12 7 10 ...
##  $ Education           : num  4 4 4 3 5 4 5 6 4 4 ...
##  $ Income              : num  5 3 7 4 6 8 3 7 6 8 ...

Deskripsi Variabel

# Data dictionary rendered as an HTML table: one row per column of df, with
# its measurement type and a short description. The three hand-written
# vectors must stay in the same order as names(df).
tibble(
  No        = 1:22,
  Variabel  = names(df),
  Tipe      = c("Kategorikal (0=Tidak, 1=Prediabetes, 2=Diabetes)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Numerik Kontinyu",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Tidak, 1=Ya)",
                "Ordinal (1=Sangat Baik – 5=Sangat Buruk)",
                "Numerik (0–30 hari)",
                "Numerik (0–30 hari)",
                "Biner (0=Tidak, 1=Ya)",
                "Biner (0=Perempuan, 1=Laki-laki)",
                "Ordinal (1–13, kelompok usia 5 tahunan)",
                "Ordinal (1–6)",
                # Income runs from 1 to 11 in this data (see summary(df)),
                # not 1 to 8.
                "Ordinal (1–11)"),
  Keterangan = c("Variabel target: status diabetes",
                 "Tekanan darah tinggi",
                 "Kolesterol tinggi",
                 "Cek kolesterol dalam 5 tahun terakhir",
                 "Body Mass Index",
                 "Pernah merokok ≥100 batang sepanjang hidup",
                 "Pernah mengalami stroke",
                 "Pernah sakit jantung atau serangan jantung",
                 "Aktif secara fisik dalam 30 hari terakhir",
                 "Konsumsi buah ≥1 kali per hari",
                 "Konsumsi sayuran ≥1 kali per hari",
                 "Konsumsi alkohol berat (>14/minggu pria, >7/minggu wanita)",
                 "Memiliki akses layanan kesehatan",
                 "Tidak ke dokter karena biaya dalam 12 bulan terakhir",
                 "Penilaian kondisi kesehatan umum",
                 "Hari kondisi mental tidak baik (30 hari terakhir)",
                 "Hari kondisi fisik tidak baik (30 hari terakhir)",
                 "Kesulitan berjalan atau menaiki tangga",
                 "Jenis kelamin",
                 "Kelompok usia",
                 "Tingkat pendidikan",
                 "Tingkat pendapatan rumah tangga")
) %>%
  kable(caption = "Deskripsi Lengkap Variabel Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "responsive"),
                full_width = TRUE, font_size = 12)

Deskripsi Lengkap Variabel Dataset
No	Variabel	Tipe	Keterangan
1	Diabetes_012	Kategorikal (0=Tidak, 1=Prediabetes, 2=Diabetes)	Variabel target: status diabetes
2	HighBP	Biner (0=Tidak, 1=Ya)	Tekanan darah tinggi
3	HighChol	Biner (0=Tidak, 1=Ya)	Kolesterol tinggi
4	CholCheck	Biner (0=Tidak, 1=Ya)	Cek kolesterol dalam 5 tahun terakhir
5	BMI	Numerik Kontinyu	Body Mass Index
6	Smoker	Biner (0=Tidak, 1=Ya)	Pernah merokok ≥100 batang sepanjang hidup
7	Stroke	Biner (0=Tidak, 1=Ya)	Pernah mengalami stroke
8	HeartDiseaseorAttack	Biner (0=Tidak, 1=Ya)	Pernah sakit jantung atau serangan jantung
9	PhysActivity	Biner (0=Tidak, 1=Ya)	Aktif secara fisik dalam 30 hari terakhir
10	Fruits	Biner (0=Tidak, 1=Ya)	Konsumsi buah ≥1 kali per hari
11	Veggies	Biner (0=Tidak, 1=Ya)	Konsumsi sayuran ≥1 kali per hari
12	HvyAlcoholConsump	Biner (0=Tidak, 1=Ya)	Konsumsi alkohol berat (>14/minggu pria, >7/minggu wanita)
13	AnyHealthcare	Biner (0=Tidak, 1=Ya)	Memiliki akses layanan kesehatan
14	NoDocbcCost	Biner (0=Tidak, 1=Ya)	Tidak ke dokter karena biaya dalam 12 bulan terakhir
15	GenHlth	Ordinal (1=Sangat Baik – 5=Sangat Buruk)	Penilaian kondisi kesehatan umum
16	MentHlth	Numerik (0–30 hari)	Hari kondisi mental tidak baik (30 hari terakhir)
17	PhysHlth	Numerik (0–30 hari)	Hari kondisi fisik tidak baik (30 hari terakhir)
18	DiffWalk	Biner (0=Tidak, 1=Ya)	Kesulitan berjalan atau menaiki tangga
19	Sex	Biner (0=Perempuan, 1=Laki-laki)	Jenis kelamin
20	Age	Ordinal (1–13, kelompok usia 5 tahunan)	Kelompok usia
21	Education	Ordinal (1–6)	Tingkat pendidikan
22	Income	Ordinal (1–8)	Tingkat pendapatan rumah tangga

Tujuan Analisis

Analisis ini bertujuan untuk:

Melakukan eksplorasi data secara menyeluruh (EDA) untuk memahami distribusi, korelasi, dan pola tersembunyi dalam data
Menerapkan Feature Engineering untuk membuat fitur-fitur baru yang lebih informatif
Melakukan Feature Selection dengan dua metode (Chi-Square dan Random Forest) untuk mengidentifikasi variabel paling relevan
Menggunakan Principal Component Analysis (PCA) untuk mereduksi dimensi data sambil mempertahankan informasi penting
Menjawab pertanyaan: faktor apa yang paling berpengaruh terhadap status diabetes seseorang?

Paket R yang Digunakan

Sebelum memulai analisis, berikut adalah seluruh paket R yang digunakan beserta fungsinya:

Paket	Fungsi
`tidyverse`	Manipulasi dan transformasi data (dplyr, tidyr, ggplot2)
`ggplot2`	Visualisasi data berbasis grammar of graphics
`corrplot`	Visualisasi matriks korelasi
`ggcorrplot`	Alternatif korelasi berbasis ggplot2
`car`	Uji diagnostik regresi, termasuk VIF
`psych`	Statistik deskriptif lengkap (`describe()`)
`factoextra`	Visualisasi hasil PCA (scree plot, biplot, cos²)
`FactoMineR`	Komputasi PCA dan analisis multivariat
`randomForest`	Model Random Forest untuk feature importance
`knitr`	Rendering tabel dan dokumen R Markdown
`kableExtra`	Styling tabel HTML yang lebih kaya
`gridExtra`	Menggabungkan beberapa plot dalam satu tampilan
`scales`	Format label angka pada sumbu grafik

Exploratory Data Analysis (EDA)

Statistik Deskriptif

# Minimum, quartiles, median, mean and maximum for every column of df.
summary(df)

##   Diabetes_012        HighBP          HighChol        CholCheck     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.3078   Mean   :0.4186   Mean   :0.4021   Mean   :0.9633  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :2.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       BMI            Smoker          Stroke       HeartDiseaseorAttack
##  Min.   :12.00   Min.   :0.000   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:24.00   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.00000     
##  Median :28.00   Median :0.000   Median :0.0000   Median :0.00000     
##  Mean   :28.95   Mean   :0.412   Mean   :0.0389   Mean   :0.08655     
##  3rd Qu.:32.00   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.00000     
##  Max.   :99.00   Max.   :1.000   Max.   :1.0000   Max.   :1.00000     
##   PhysActivity        Fruits          Veggies       HvyAlcoholConsump
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.00000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.00000  
##  Mean   :0.7792   Mean   :0.6213   Mean   :0.8278   Mean   :0.06208  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##  AnyHealthcare     NoDocbcCost         GenHlth         MentHlth     
##  Min.   :0.0000   Min.   :0.00000   Min.   :1.000   Min.   : 0.000  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:2.000   1st Qu.: 0.000  
##  Median :1.0000   Median :0.00000   Median :2.000   Median : 0.000  
##  Mean   :0.9626   Mean   :0.06374   Mean   :2.481   Mean   : 3.938  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:3.000   3rd Qu.: 4.000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :5.000   Max.   :30.000  
##     PhysHlth         DiffWalk           Sex              Age        
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 5.000  
##  Median : 0.000   Median :0.0000   Median :0.0000   Median : 8.000  
##  Mean   : 3.751   Mean   :0.1539   Mean   :0.4778   Mean   : 7.864  
##  3rd Qu.: 2.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:10.000  
##  Max.   :30.000   Max.   :1.0000   Max.   :1.0000   Max.   :13.000  
##    Education         Income      
##  Min.   :1.000   Min.   : 1.000  
##  1st Qu.:4.000   1st Qu.: 5.000  
##  Median :5.000   Median : 7.000  
##  Mean   :5.139   Mean   : 6.927  
##  3rd Qu.:6.000   3rd Qu.: 9.000  
##  Max.   :6.000   Max.   :11.000

# Extended descriptive statistics from psych::describe(), rounded to two
# decimals and shown in a scrollable HTML table.
round(describe(df), 2) %>%
  kable(caption = "Statistik Deskriptif Lengkap (psych::describe)") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width        = TRUE,
    font_size         = 11
  ) %>%
  scroll_box(width = "100%", height = "400px")

Statistik Deskriptif Lengkap (psych::describe)
	vars	n	mean	sd	median	trimmed	mad	min	max	range	skew	kurtosis	se
Diabetes_012	1	236378	0.31	0.71	0	0.13	0.00	0	2	2	1.92	1.77	0.00
HighBP	2	236378	0.42	0.49	0	0.40	0.00	0	1	1	0.33	-1.89	0.00
HighChol	3	236378	0.40	0.49	0	0.38	0.00	0	1	1	0.40	-1.84	0.00
CholCheck	4	236378	0.96	0.19	1	1.00	0.00	0	1	1	-4.93	22.32	0.00
BMI	5	236378	28.95	6.55	28	28.30	5.93	12	99	87	1.36	4.10	0.01
Smoker	6	236378	0.41	0.49	0	0.39	0.00	0	1	1	0.36	-1.87	0.00
Stroke	7	236378	0.04	0.19	0	0.00	0.00	0	1	1	4.77	20.75	0.00
HeartDiseaseorAttack	8	236378	0.09	0.28	0	0.00	0.00	0	1	1	2.94	6.65	0.00
PhysActivity	9	236378	0.78	0.41	1	0.85	0.00	0	1	1	-1.35	-0.19	0.00
Fruits	10	236378	0.62	0.49	1	0.65	0.00	0	1	1	-0.50	-1.75	0.00
Veggies	11	236378	0.83	0.38	1	0.91	0.00	0	1	1	-1.74	1.01	0.00
HvyAlcoholConsump	12	236378	0.06	0.24	0	0.00	0.00	0	1	1	3.63	11.17	0.00
AnyHealthcare	13	236378	0.96	0.19	1	1.00	0.00	0	1	1	-4.87	21.76	0.00
NoDocbcCost	14	236378	0.06	0.24	0	0.00	0.00	0	1	1	3.57	10.76	0.00
GenHlth	15	236378	2.48	1.03	2	2.43	1.48	1	5	4	0.40	-0.32	0.00
MentHlth	16	236378	3.94	7.89	0	1.80	0.00	0	30	30	2.32	4.41	0.02
PhysHlth	17	236378	3.75	8.25	0	1.37	0.00	0	30	30	2.43	4.60	0.02
DiffWalk	18	236378	0.15	0.36	0	0.07	0.00	0	1	1	1.92	1.68	0.00
Sex	19	236378	0.48	0.50	0	0.47	0.00	0	1	1	0.09	-1.99	0.00
Age	20	236378	7.86	3.24	8	7.99	4.45	1	13	12	-0.31	-0.84	0.01
Education	21	236378	5.14	0.95	5	5.24	1.48	1	6	5	-0.86	0.20	0.00
Income	22	236378	6.93	2.38	7	7.04	2.97	1	11	10	-0.41	-0.22	0.00

Interpretasi Statistik Deskriptif:

Berdasarkan statistik deskriptif, beberapa temuan penting antara lain:

BMI memiliki rata-rata 28.95 dengan rentang yang lebar (12–99), mengindikasikan adanya outlier ekstrem pada nilai BMI tinggi.
MentHlth dan PhysHlth memiliki nilai median 0, artinya sebagian besar responden tidak mengalami hari-hari buruk dalam 30 hari terakhir, namun distribusinya sangat miring ke kanan (right-skewed).
GenHlth rata-rata 2.48 (skala 1–5), menunjukkan responden secara umum menilai kesehatannya antara “Baik” dan “Cukup”.

Cek Missing Value

# Count NA cells across the whole data frame.
total_missing <- sum(is.na(df))
cat("Total Missing Value:", total_missing, "\n\n")

## Total Missing Value: 0

# Count NA cells separately for each column (named numeric vector).
missing_per_col <- colSums(is.na(df))
cat("Missing Value per Kolom:\n")

## Missing Value per Kolom:

# Print the per-column counts so columns with gaps can be spotted.
print(missing_per_col)

##         Diabetes_012               HighBP             HighChol 
##                    0                    0                    0 
##            CholCheck                  BMI               Smoker 
##                    0                    0                    0 
##               Stroke HeartDiseaseorAttack         PhysActivity 
##                    0                    0                    0 
##               Fruits              Veggies    HvyAlcoholConsump 
##                    0                    0                    0 
##        AnyHealthcare          NoDocbcCost              GenHlth 
##                    0                    0                    0 
##             MentHlth             PhysHlth             DiffWalk 
##                    0                    0                    0 
##                  Sex                  Age            Education 
##                    0                    0                    0 
##               Income 
##                    0

Interpretasi: Dataset ini tidak memiliki missing value sama sekali (total = 0). Hal ini menunjukkan kualitas data yang sangat baik, kemungkinan karena data berasal dari survei resmi CDC yang telah melalui proses pembersihan data sebelum dipublikasikan.

Distribusi Variabel Target

# Frequency of each target class. The levels are pinned to 0/1/2 so the
# counts always line up with the hard-coded Kelas/Label vectors below, and a
# class with no observations yields a 0 row instead of a length-mismatch
# error in data.frame().
freq_target <- table(factor(df$Diabetes_012, levels = c(0, 1, 2)))

# Class code, readable label, count, and share (in percent) of each class.
tabel_target <- data.frame(
  Kelas       = c(0, 1, 2),
  Label       = c("Tidak Diabetes", "Prediabetes", "Diabetes"),
  Frekuensi   = as.vector(freq_target),
  Persentase  = round(as.vector(prop.table(freq_target) * 100), 2)
)

# Render the distribution as a compact HTML table.
tabel_target %>%
  kable(caption = "Distribusi Variabel Target Diabetes_012") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Distribusi Variabel Target Diabetes_012
Kelas	Label	Frekuensi	Persentase
0	Tidak Diabetes	197191	83.42
1	Prediabetes	5619	2.38
2	Diabetes	33568	14.20

# Bar chart of the target distribution; each bar is annotated with its share
# and its absolute count.
df %>%
  mutate(Status = factor(Diabetes_012,
                         labels = c("Tidak Diabetes (0)", "Prediabetes (1)", "Diabetes (2)"))) %>%
  count(Status) %>%
  mutate(Persen = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = Status, y = n, fill = Status)) +
  geom_col(width = 0.6) +
  # trim = TRUE: format() pads a vector to a common width by default, which
  # put leading blanks inside the parentheses of the smaller counts.
  geom_text(aes(label = paste0(Persen, "%\n(",
                               format(n, big.mark = ",", trim = TRUE), ")")),
            vjust = -0.3, size = 4, fontface = "bold") +
  scale_fill_manual(values = c("#2ecc71", "#f39c12", "#e74c3c")) +
  # Extra headroom above the bars so the two-line labels are not clipped.
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, 0.15))) +
  labs(title    = "Distribusi Status Diabetes — BRFSS 2021",
       subtitle = "Dataset tidak seimbang: kelas tidak diabetes sangat mendominasi",
       x = "Status Diabetes", y = "Jumlah Responden") +
  theme_minimal(base_size = 13) +
  theme(plot.title  = element_text(face = "bold"),
        legend.position = "none")

Interpretasi: Distribusi target sangat tidak seimbang (imbalanced): 83.4% responden tidak diabetes, 14.2% diabetes, dan hanya 2.4% prediabetes. Kondisi ini perlu diperhatikan dalam pemodelan prediksi ke depannya. Dalam konteks analisis eksplorasi dan reduksi dimensi yang kita lakukan, ketidakseimbangan ini justru mencerminkan realitas populasi yang sesungguhnya.

Visualisasi Variabel Numerik

# Stack the numeric / ordinal columns into long format: one row per
# (variable name, value) pair, ready for faceting.
df_long <- pivot_longer(
  select(df, BMI, GenHlth, MentHlth, PhysHlth, Age, Education, Income),
  cols      = everything(),
  names_to  = "Variabel",
  values_to = "Nilai"
)

# One histogram panel per variable; free scales because the variables have
# very different ranges.
ggplot(data = df_long, mapping = aes(x = Nilai)) +
  geom_histogram(bins = 30, fill = "#3498db", color = "white", alpha = 0.8) +
  facet_wrap(vars(Variabel), scales = "free", ncol = 3) +
  labs(
    title    = "Distribusi Variabel Numerik / Ordinal",
    subtitle = "Sebagian besar variabel memiliki distribusi tidak normal",
    x        = "Nilai",
    y        = "Frekuensi"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

Interpretasi:

BMI: Distribusi mendekati normal dengan ekor kanan panjang, mengindikasikan adanya outlier pada nilai BMI tinggi. Mayoritas responden berada di rentang BMI 20–40.
MentHlth dan PhysHlth: Sangat miring ke kanan — sebagian besar responden melaporkan 0 hari kondisi tidak baik, namun sebagian kecil melaporkan hingga 30 hari.
Age: Distribusi relatif merata dengan sedikit peningkatan pada kelompok usia menengah-tua (kode 7–11, setara usia 50–74 tahun).
GenHlth: Distribusi hampir simetris dengan puncak di kategori 2-3 (Baik hingga Cukup).
Education dan Income: Distribusi miring ke kiri — lebih banyak responden berpendidikan dan berpendapatan menengah-tinggi, mencerminkan bias sampel survei telepon.

Identifikasi Outlier

# Layers shared by the three boxplots: minimal theme and no tick labels on
# the (meaningless) x axis.
lapisan_boxplot <- list(
  theme_minimal(base_size = 12),
  theme(axis.text.x = element_blank())
)

# Boxplot of BMI.
p_bmi <- ggplot(df, aes(y = BMI)) +
  geom_boxplot(fill = "#3498db", alpha = 0.7) +
  labs(title = "BMI", x = "") +
  lapisan_boxplot

# Boxplot of days with poor physical health.
p_phys <- ggplot(df, aes(y = PhysHlth)) +
  geom_boxplot(fill = "#e74c3c", alpha = 0.7) +
  labs(title = "PhysHlth", x = "") +
  lapisan_boxplot

# Boxplot of days with poor mental health.
p_ment <- ggplot(df, aes(y = MentHlth)) +
  geom_boxplot(fill = "#f39c12", alpha = 0.7) +
  labs(title = "MentHlth", x = "") +
  lapisan_boxplot

# Place the three plots side by side under a common heading.
grid.arrange(
  p_bmi, p_phys, p_ment,
  ncol = 3,
  top  = "Boxplot Identifikasi Outlier — Variabel Numerik Utama"
)

# Summarise the outliers of a numeric vector using the 1.5 * IQR rule.
#
# Args:
#   x:     numeric vector to inspect.
#   nama:  label stored in the `Variabel` column of the result.
#   na.rm: if TRUE, drop NA values before computing; the default FALSE keeps
#          the previous behaviour (quantile() stops on NA input).
#
# Returns:
#   A one-row data.frame with the quartiles, the IQR, the lower / upper
#   fences, the number of values outside the fences, and that number as a
#   percentage of the values examined.
hitung_outlier <- function(x, nama, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }

  # names = FALSE: otherwise the "25%" name of the first quantile becomes the
  # row name of the result and shows up as a stray column in rendered tables.
  kuartil <- quantile(x, c(0.25, 0.75), names = FALSE)
  Q1      <- kuartil[1]
  Q3      <- kuartil[2]
  IQR_val <- Q3 - Q1

  batas_bawah <- Q1 - 1.5 * IQR_val
  batas_atas  <- Q3 + 1.5 * IQR_val
  n_out       <- sum(x < batas_bawah | x > batas_atas)

  data.frame(Variabel = nama,
             Q1       = round(Q1, 2),
             Q3       = round(Q3, 2),
             IQR      = round(IQR_val, 2),
             Batas_Bawah = round(batas_bawah, 2),
             Batas_Atas  = round(batas_atas, 2),
             N_Outlier   = n_out,
             Persen_Outlier = round(n_out / length(x) * 100, 2))
}

# Stack the per-variable outlier summaries into one report table.
# row.names = FALSE keeps row names inherited from the inputs from being
# rendered as an extra, meaningless first column.
bind_rows(
  hitung_outlier(df$BMI,      "BMI"),
  hitung_outlier(df$PhysHlth, "PhysHlth"),
  hitung_outlier(df$MentHlth, "MentHlth")
) %>%
  kable(caption = "Ringkasan Outlier Metode IQR", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Ringkasan Outlier Metode IQR
	Variabel	Q1	Q3	IQR	Batas_Bawah	Batas_Atas	N_Outlier	Persen_Outlier
25%…1	BMI	24	32	8	12	44	6314	2.67
25%…2	PhysHlth	0	2	2	-3	5	38580	16.32
25%…3	MentHlth	0	4	4	-6	10	29178	12.34

Interpretasi Outlier:

Ketiga variabel numerik utama memiliki outlier. BMI memiliki 2.67% outlier — nilai BMI di atas batas atas 44, terutama yang sangat ekstrem, mungkin merupakan kesalahan input atau kasus yang sangat tidak umum, namun dipertahankan karena bisa jadi data valid dari populasi dengan obesitas morbid. MentHlth (12.34%) dan PhysHlth (16.32%) memiliki outlier yang lebih banyak karena distribusinya sangat miring — responden yang melaporkan banyak hari kondisi buruk memang ada, dan ini bukan kesalahan data melainkan mencerminkan kelompok populasi dengan beban kesehatan tinggi. Outlier tidak dihapus karena relevan secara substantif.

Boxplot BMI Berdasarkan Status Diabetes

# BMI distribution per diabetes status, with dashed reference lines at the
# overweight (25) and obese (30) cut-offs.
# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` for
# line layers; `size` is kept so behaviour matches the installed version.
ggplot(
  data = mutate(
    df,
    Status = factor(Diabetes_012,
                    labels = c("Tidak Diabetes", "Prediabetes", "Diabetes"))
  ),
  mapping = aes(x = Status, y = BMI, fill = Status)
) +
  geom_boxplot(alpha = 0.8, outlier.alpha = 0.15, outlier.size = 0.5) +
  geom_hline(yintercept = 25, linetype = "dashed", color = "#7f8c8d", size = 0.7) +
  geom_hline(yintercept = 30, linetype = "dashed", color = "#e74c3c", size = 0.7) +
  # Labels sit to the right of the third box, just above their lines.
  annotate("text", x = 3.5, y = 25.5, label = "Overweight (25)",
           size = 3.2, color = "#7f8c8d") +
  annotate("text", x = 3.5, y = 30.5, label = "Obese (30)",
           size = 3.2, color = "#e74c3c") +
  scale_fill_manual(values = c("#2ecc71", "#f39c12", "#e74c3c")) +
  labs(
    title    = "Distribusi BMI Berdasarkan Status Diabetes",
    subtitle = "Median BMI meningkat dari tidak diabetes → prediabetes → diabetes",
    x        = "Status Diabetes",
    y        = "BMI"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"), legend.position = "none")

Interpretasi: Terdapat perbedaan BMI yang jelas antar kelompok. Median BMI penderita diabetes (~30, kategori Obese) lebih tinggi dibandingkan prediabetes (~29) dan non-diabetes (~27, kategori Overweight). Hal ini konsisten dengan literatur medis yang menetapkan obesitas sebagai salah satu faktor risiko utama diabetes tipe 2.

Scatterplot BMI vs Usia

# Fixed seed so the same 10,000-row subsample is drawn on every run.
set.seed(42)
df_sample_plot <- sample_n(df, 10000)

# BMI against age group, coloured by diabetes status, with one linear
# trend line per status.
ggplot(
  data = mutate(
    df_sample_plot,
    Status = factor(Diabetes_012,
                    labels = c("Tidak Diabetes", "Prediabetes", "Diabetes"))
  ),
  mapping = aes(x = Age, y = BMI, color = Status)
) +
  geom_point(alpha = 0.3, size = 0.8) +
  geom_smooth(method = "lm", se = FALSE, size = 1.2) +
  scale_color_manual(values = c("#2ecc71", "#f39c12", "#e74c3c")) +
  labs(
    title    = "Hubungan BMI dan Usia berdasarkan Status Diabetes",
    subtitle = "Sampel 10.000 observasi | Garis = regresi linear per grup",
    x        = "Kelompok Usia (1–13)",
    y        = "BMI",
    color    = "Status"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

Interpretasi: Tren BMI terhadap usia berbeda antar kelompok. Pada kelompok non-diabetes, BMI cenderung meningkat seiring usia hingga pertengahan, lalu sedikit menurun. Pada penderita diabetes, BMI secara konsisten lebih tinggi di semua kelompok usia. Garis regresi yang berpotongan menunjukkan bahwa hubungan BMI-usia tidak seragam antar status diabetes.

Analisis Korelasi

# Pairwise correlation matrix of every column in df (cor() defaults).
correlation_matrix <- cor(df)

# Blue-white-red ramp with 200 steps.
palet_korelasi <- colorRampPalette(c("#2980b9", "white", "#c0392b"))(200)

# Upper-triangle heat map with the coefficient printed in each cell; the top
# margin is enlarged so the title is not clipped.
corrplot(
  correlation_matrix,
  title       = "Matriks Korelasi Semua Variabel",
  method      = "color",
  type        = "upper",
  col         = palet_korelasi,
  addCoef.col = "black",
  number.cex  = 0.5,
  tl.col      = "black",
  tl.cex      = 0.75,
  mar         = c(0, 0, 2, 0)
)

# Correlation of every column with the target, strongest positive first.
cor_target <- sort(cor(df)[, "Diabetes_012"], decreasing = TRUE)

# Drop the target's self-correlation by name rather than by position, and
# keep the unrounded values for the highlight threshold below.
cor_prediktor <- cor_target[names(cor_target) != "Diabetes_012"]

# row.names = NULL: the named vector would otherwise become row names and be
# rendered as a duplicate of the Variabel column.
data.frame(
  Variabel  = names(cor_prediktor),
  Korelasi  = round(cor_prediktor, 4),
  row.names = NULL
) %>%
  mutate(Arah = ifelse(Korelasi > 0, "Positif (+)", "Negatif (−)")) %>%
  kable(caption = "Korelasi Setiap Variabel dengan Target (Diabetes_012)",
        row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  # Highlight rows with |r| > 0.15, reusing the vector computed above instead
  # of running cor(df) again.
  row_spec(which(abs(cor_prediktor) > 0.15),
           bold = TRUE, background = "#fdecea")

Korelasi Setiap Variabel dengan Target (Diabetes_012)
	Variabel	Korelasi	Arah
GenHlth	GenHlth	0.2884	Positif (+)
HighBP	HighBP	0.2691	Positif (+)
DiffWalk	DiffWalk	0.2162	Positif (+)
BMI	BMI	0.2118	Positif (+)
HighChol	HighChol	0.2074	Positif (+)
Age	Age	0.1986	Positif (+)
HeartDiseaseorAttack	HeartDiseaseorAttack	0.1774	Positif (+)
PhysHlth	PhysHlth	0.1639	Positif (+)
Stroke	Stroke	0.1027	Positif (+)
CholCheck	CholCheck	0.0722	Positif (+)
Smoker	Smoker	0.0589	Positif (+)
MentHlth	MentHlth	0.0392	Positif (+)
Sex	Sex	0.0338	Positif (+)
AnyHealthcare	AnyHealthcare	0.0264	Positif (+)
NoDocbcCost	NoDocbcCost	0.0164	Positif (+)
Fruits	Fruits	-0.0305	Negatif (−)
Veggies	Veggies	-0.0479	Negatif (−)
HvyAlcoholConsump	HvyAlcoholConsump	-0.0589	Negatif (−)
Education	Education	-0.1085	Negatif (−)
PhysActivity	PhysActivity	-0.1517	Negatif (−)
Income	Income	-0.1601	Negatif (−)

Interpretasi Korelasi:

Variabel dengan korelasi tertinggi terhadap Diabetes_012:

GenHlth (r = 0.29): Kondisi kesehatan umum yang buruk sangat berkorelasi dengan diabetes
HighBP (r = 0.27): Hipertensi adalah komorbiditas paling kuat
BMI (r = 0.21): Obesitas berkorelasi positif signifikan
Age (r = 0.20): Usia lebih tua berkaitan dengan prevalensi diabetes lebih tinggi
DiffWalk (r = 0.22): Kesulitan berjalan, yang sering merupakan komplikasi diabetes

Variabel HvyAlcoholConsump (r = -0.06) memiliki korelasi negatif yang lemah — konsumsi alkohol berat justru lebih rendah pada penderita diabetes, mungkin karena pasien sudah mengurangi konsumsi setelah diagnosis atau karena profil demografis yang berbeda.

Identifikasi Multikolinearitas (VIF)

# Predictors entered into the auxiliary regression, in the original order.
prediktor_vif <- c(
  "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke",
  "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies",
  "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost", "GenHlth",
  "MentHlth", "PhysHlth", "DiffWalk", "Sex", "Age", "Education", "Income",
  "BMI"
)

# Linear model of the target on all predictors; it is fitted only so that
# vif() can be computed from it.
model_vif <- lm(
  reformulate(prediktor_vif, response = "Diabetes_012"),
  data = df
)

# Variance inflation factor for each predictor.
vif_result <- vif(model_vif)

# VIF per predictor with a traffic-light status, largest VIF first.
# row.names = NULL: the names of vif_result would otherwise become row names
# and be rendered as a duplicate of the Variabel column.
tabel_vif <- data.frame(
  Variabel = names(vif_result),
  VIF      = round(vif_result, 4),
  Status   = ifelse(vif_result > 10, "Multikolinearitas Tinggi (>10)",
             ifelse(vif_result > 5,  "Perhatian (5–10)",
                                     "Aman (<5)")),
  row.names = NULL
) %>%
  arrange(desc(VIF))

tabel_vif %>%
  kable(caption = "Variance Inflation Factor (VIF) — Uji Multikolinearitas",
        row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  # Highlight rows from the sorted table itself (any status other than
  # "Aman", i.e. VIF > 5), so the shading cannot drift from the displayed
  # row order.
  row_spec(which(tabel_vif$Status != "Aman (<5)"),
           background = "#fef9e7")

Variance Inflation Factor (VIF) — Uji Multikolinearitas
	Variabel	VIF	Status
GenHlth	GenHlth	1.6947	Aman (<5)
PhysHlth	PhysHlth	1.5360	Aman (<5)
DiffWalk	DiffWalk	1.4705	Aman (<5)
Income	Income	1.4532	Aman (<5)
Age	Age	1.4191	Aman (<5)
HighBP	HighBP	1.3182	Aman (<5)
Education	Education	1.2959	Aman (<5)
MentHlth	MentHlth	1.2310	Aman (<5)
PhysActivity	PhysActivity	1.1991	Aman (<5)
HighChol	HighChol	1.1670	Aman (<5)
BMI	BMI	1.1588	Aman (<5)
HeartDiseaseorAttack	HeartDiseaseorAttack	1.1540	Aman (<5)
NoDocbcCost	NoDocbcCost	1.1434	Aman (<5)
AnyHealthcare	AnyHealthcare	1.1297	Aman (<5)
Smoker	Smoker	1.0983	Aman (<5)
Veggies	Veggies	1.0850	Aman (<5)
Fruits	Fruits	1.0830	Aman (<5)
Sex	Sex	1.0729	Aman (<5)
Stroke	Stroke	1.0689	Aman (<5)
CholCheck	CholCheck	1.0475	Aman (<5)
HvyAlcoholConsump	HvyAlcoholConsump	1.0240	Aman (<5)

Interpretasi VIF: Semua variabel memiliki nilai VIF jauh di bawah 5 (ambang batas umum), artinya tidak terdapat masalah multikolinearitas serius dalam dataset ini. Nilai VIF tertinggi adalah pada GenHlth (1.69) dan PhysHlth (1.54), diikuti DiffWalk (1.47) — ketiganya sama-sama mengukur kondisi kesehatan sehingga saling berkorelasi moderat — tetapi masih dalam batas aman. Ini menunjukkan bahwa meskipun ada korelasi antar variabel, tidak ada variabel yang merupakan kombinasi linear sempurna dari variabel lain.

Feature Engineering

Feature Engineering adalah proses membuat, mengubah, atau menyusun ulang variabel agar data menjadi lebih informatif untuk analisis atau pemodelan. Proses ini penting karena variabel mentah (raw variables) seringkali tidak langsung mencerminkan informasi yang relevan secara substantif atau klinis.

Tujuan Feature Engineering dalam konteks dataset ini:

Menangkap pola laten yang tidak terlihat dari variabel tunggal
Membuat indeks komposit yang merangkum beberapa dimensi sekaligus
Meningkatkan interpretabilitas analisis

Fitur 1: `ChronicDisease_Score` — Skor Penyakit Kronis

Diabetes sangat erat kaitannya dengan berbagai penyakit kronis lainnya (comorbidity). Pasien dengan riwayat tekanan darah tinggi, kolesterol tinggi, stroke, atau penyakit jantung memiliki risiko diabetes jauh lebih tinggi. Alih-alih menggunakan masing-masing variabel secara terpisah, kita merangkumnya menjadi skor beban penyakit kronis.

Rumus:

\[\text{ChronicDisease Score} = \text{HighBP} + \text{HighChol} + \text{Stroke} + \text{HeartDiseaseorAttack}\]

Setiap variabel bernilai 0 atau 1, sehingga skor berkisar antara 0 (tidak ada penyakit kronis) hingga 4 (semua kondisi hadir).

# Comorbidity count: sum of the four chronic-condition indicator columns.
df <- mutate(
  df,
  ChronicDisease_Score = HighBP + HighChol + Stroke + HeartDiseaseorAttack
)

# Frequency and percentage of respondents at each score.
count(df, ChronicDisease_Score) %>%
  mutate(Persen = round(n / sum(n) * 100, 2)) %>%
  kable(
    caption   = "Distribusi ChronicDisease_Score",
    col.names = c("Skor", "Frekuensi", "Persentase (%)")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width        = FALSE
  )

Distribusi ChronicDisease_Score
Skor	Frekuensi	Persentase (%)
0	94716	40.07
1	76278	32.27
2	50644	21.43
3	12897	5.46
4	1843	0.78

# Share of each diabetes status within every chronic-disease score: counts
# per (score, status) pair drawn as 100 % stacked bars.
ggplot(
  data = mutate(
    count(df, ChronicDisease_Score, Diabetes_012),
    Diabetes_012 = factor(Diabetes_012,
                          labels = c("Tidak Diabetes", "Prediabetes", "Diabetes"))
  ),
  mapping = aes(x = factor(ChronicDisease_Score), y = n, fill = Diabetes_012)
) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = c("#2ecc71", "#f39c12", "#e74c3c")) +
  labs(
    title    = "Proporsi Status Diabetes berdasarkan ChronicDisease_Score",
    subtitle = "Semakin tinggi skor, semakin besar proporsi diabetes",
    x        = "ChronicDisease Score (0–4)",
    y        = "Proporsi",
    fill     = "Status Diabetes"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

Interpretasi: Grafik menunjukkan bahwa proporsi penderita diabetes meningkat secara konsisten seiring bertambahnya skor penyakit kronis. Responden dengan skor 4 memiliki proporsi diabetes tertinggi, mengkonfirmasi bahwa ChronicDisease_Score adalah prediktor komposit yang kuat.

Fitur 2: `Lifestyle_Score` — Skor Gaya Hidup Sehat

Gaya hidup memiliki peran sentral dalam pencegahan dan perkembangan diabetes tipe 2. Fitur ini menangkap keseimbangan perilaku sehat vs tidak sehat dalam satu nilai tunggal.

Rumus:

\[\text{Lifestyle Score} = \text{PhysActivity} + \text{Fruits} + \text{Veggies} - \text{Smoker} - \text{HvyAlcoholConsump}\]

Perilaku sehat: PhysActivity, Fruits, Veggies → bobot +1
Perilaku berisiko: Smoker, HvyAlcoholConsump → bobot −1
Rentang nilai: −2 hingga +3

# Lifestyle balance: the three healthy-behaviour indicators count +1 each,
# the two risky-behaviour indicators count -1 each.
df <- mutate(
  df,
  Lifestyle_Score = PhysActivity + Fruits + Veggies - Smoker - HvyAlcoholConsump
)

# Heading for the summary printed below.
cat("Statistik Lifestyle_Score:\n")

## Statistik Lifestyle_Score:

# Minimum, quartiles, mean and maximum of the new score.
summary(df[["Lifestyle_Score"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.000   1.000   2.000   1.754   3.000   3.000

# Share of each diabetes status within every lifestyle score, drawn as
# 100 % stacked bars.
ggplot(
  data = mutate(
    df,
    Diabetes_012 = factor(Diabetes_012,
                          labels = c("Tidak Diabetes", "Prediabetes", "Diabetes"))
  ),
  mapping = aes(x = factor(Lifestyle_Score), fill = Diabetes_012)
) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = c("#2ecc71", "#f39c12", "#e74c3c")) +
  labs(
    title    = "Proporsi Status Diabetes berdasarkan Lifestyle_Score",
    subtitle = "Gaya hidup lebih sehat (skor lebih tinggi) → lebih sedikit diabetes",
    x        = "Lifestyle Score (−2 hingga +3)",
    y        = "Proporsi",
    fill     = "Status Diabetes"
  ) +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

Interpretasi: Pola berlawanan dengan ChronicDisease_Score: semakin tinggi skor gaya hidup, semakin rendah proporsi penderita diabetes. Fitur ini berhasil merangkum dimensi perilaku kesehatan menjadi satu ukuran dengan arah interpretasi yang jelas.

Fitur 3: `Health_Burden` — Beban Kesehatan Total

MentHlth dan PhysHlth masing-masing mengukur jumlah hari kondisi mental atau fisik seseorang “tidak baik” dalam 30 hari terakhir. Menggabungkan keduanya memberikan ukuran komprehensif tentang kualitas kesehatan seseorang.

Rumus:

\[\text{Health Burden} = \text{MentHlth} + \text{PhysHlth}\]

Rentang nilai: 0 (sangat sehat) hingga 60 (setiap hari kondisi tidak baik).

# Total unhealthy days: mentally unhealthy days plus physically unhealthy days.
df$Health_Burden <- df$MentHlth + df$PhysHlth

# Print a heading, then the min / quartiles / median / mean / max of the score.
cat("Statistik Health_Burden:\n")

## Statistik Health_Burden:

summary(df$Health_Burden)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   7.689  10.000  60.000

# Boxplot of Health_Burden for each diabetes status.
df %>%
  mutate(
    # Pin the levels so every label is tied to its code (0/1/2) instead of to
    # the sort order of whichever values happen to be present.
    Diabetes_012 = factor(Diabetes_012,
                          levels = c(0, 1, 2),
                          labels = c("Tidak Diabetes", "Prediabetes", "Diabetes"))
  ) %>%
  ggplot(aes(x = Diabetes_012, y = Health_Burden, fill = Diabetes_012)) +
  geom_boxplot(alpha = 0.8, outlier.alpha = 0.2) +
  scale_fill_manual(values = c("#2ecc71", "#f39c12", "#e74c3c")) +
  labs(title    = "Distribusi Health_Burden berdasarkan Status Diabetes",
       subtitle = "Penderita diabetes memiliki beban kesehatan lebih tinggi",
       x = "Status Diabetes",
       # Health_Burden is MentHlth + PhysHlth, so it spans 0-60, not 0-30.
       y = "Health Burden (total hari tidak sehat mental + fisik, 0–60)") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"), legend.position = "none")

Interpretasi: Median Health_Burden penderita diabetes lebih tinggi dibandingkan prediabetes dan non-diabetes. Penderita diabetes mengalami lebih banyak hari dengan kondisi kesehatan yang buruk, baik secara fisik maupun mental.

Fitur 4: `BMI_Category` — Kategori BMI (Binning WHO)

Rumus:

\[\text{BMI Category} = \begin{cases} 0 & \text{Underweight } (BMI < 18.5) \\ 1 & \text{Normal } (18.5 \leq BMI < 25) \\ 2 & \text{Overweight } (25 \leq BMI < 30) \\ 3 & \text{Obese } (BMI \geq 30) \end{cases}\]

# BMI bins coded 0-3 with cut points 18.5, 25 and 30. findInterval() uses
# left-closed intervals, so each cut point itself opens the next category.
df <- df %>%
  mutate(BMI_Category = as.numeric(findInterval(BMI, c(18.5, 25, 30))))

# Frequency table of BMI_Category with readable labels.
# Labels are looked up by code (0-3) rather than attached by row position, so
# an empty category or an NA row cannot shift a label onto the wrong count.
df %>%
  count(BMI_Category) %>%
  mutate(Kategori = c("Underweight (<18.5)", "Normal (18.5–24.9)",
                      "Overweight (25–29.9)", "Obese (≥30)")[BMI_Category + 1],
         Persen   = round(n / sum(n) * 100, 2)) %>%
  select(BMI_Category, Kategori, n, Persen) %>%
  kable(caption   = "Distribusi Kategori BMI",
        col.names = c("Kode", "Kategori", "Frekuensi", "Persentase (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Distribusi Kategori BMI
Kode	Kategori	Frekuensi	Persentase (%)
0	Underweight (<18.5)	2814	1.19
1	Normal (18.5–24.9)	57470	24.31
2	Overweight (25–29.9)	83824	35.46
3	Obese (≥30)	92270	39.03

# Stacked 100% bars: share of each diabetes status within every BMI category.
df %>%
  mutate(
    # Levels are pinned for both factors so labels stay attached to their codes
    # even when a code is absent from the data.
    BMI_Cat_Label = factor(BMI_Category,
                           levels = 0:3,
                           labels = c("Underweight","Normal","Overweight","Obese")),
    Diabetes_012  = factor(Diabetes_012,
                           levels = c(0, 1, 2),
                           labels = c("Tidak Diabetes","Prediabetes","Diabetes"))
  ) %>%
  ggplot(aes(x = BMI_Cat_Label, fill = Diabetes_012)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = c("#2ecc71", "#f39c12", "#e74c3c")) +
  labs(title    = "Proporsi Status Diabetes berdasarkan Kategori BMI",
       subtitle = "Proporsi diabetes meningkat drastis pada kategori Overweight dan Obese",
       x = "Kategori BMI", y = "Proporsi", fill = "Status Diabetes") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

Interpretasi: Proporsi penderita diabetes melonjak dari kategori Normal ke Overweight, dan semakin tinggi pada Obese. Ini mengkonfirmasi secara visual bahwa obesitas adalah faktor risiko utama diabetes yang tertangkap baik oleh fitur kategori BMI.

Fitur 5: `Healthcare_Access` — Akses Layanan Kesehatan Bersih

Rumus:

\[\text{Healthcare Access} = \text{AnyHealthcare} - \text{NoDocbcCost}\]

Nilai +1 = punya akses & tidak terkendala biaya | 0 = salah satu terpenuhi | −1 = tidak punya akses & terkendala biaya.

# Net access score: has any health coverage (1/0) minus could not see a doctor
# because of cost (1/0).
df$Healthcare_Access <- df$AnyHealthcare - df$NoDocbcCost

# Distribution of Healthcare_Access.
# The score is tabulated once over the fixed set of possible values, so the
# counts always line up with their descriptions even if a value never occurs.
tibble(
  Nilai = c(-1, 0, 1),
  Makna = c("Tidak punya akses & terkendala biaya",
            "Salah satu terpenuhi",
            "Punya akses & tidak terkendala biaya")
) %>%
  mutate(
    N      = as.vector(table(factor(df$Healthcare_Access, levels = Nilai))),
    Persen = round(N / sum(N) * 100, 2)
  ) %>%
  kable(caption   = "Distribusi Healthcare_Access",
        col.names = c("Nilai", "Makna", "Frekuensi", "Persentase (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Distribusi Healthcare_Access
Nilai	Makna	Frekuensi	Persentase (%)
-1	Tidak punya akses & terkendala biaya	3337	1.41
0	Salah satu terpenuhi	17239	7.29
1	Punya akses & tidak terkendala biaya	215802	91.30

Ringkasan Feature Engineering

# Overview of the five engineered features, written one feature per row.
tribble(
  ~`Fitur Baru`, ~`Variabel Asal`, ~`Rentang Nilai`, ~`Makna Substantif`,
  "ChronicDisease_Score",
  "HighBP + HighChol + Stroke + HeartDiseaseorAttack",
  "0–4",
  "Beban penyakit kronis komorbid",
  "Lifestyle_Score",
  "PhysActivity + Fruits + Veggies − Smoker − HvyAlcohol",
  "−2 sampai 3",
  "Keseimbangan perilaku hidup sehat",
  "Health_Burden",
  "MentHlth + PhysHlth",
  "0–60",
  "Total beban kesehatan mental & fisik",
  "BMI_Category",
  "Binning dari BMI (standar WHO)",
  "0–3",
  "Klasifikasi status gizi WHO",
  "Healthcare_Access",
  "AnyHealthcare − NoDocbcCost",
  "−1 sampai 1",
  "Keterjangkauan akses layanan kesehatan"
) %>%
  kable(caption = "Ringkasan Feature Engineering") %>%
  kable_styling(full_width = TRUE,
                bootstrap_options = c("striped","hover","responsive"))

Ringkasan Feature Engineering
Fitur Baru	Variabel Asal	Rentang Nilai	Makna Substantif
ChronicDisease_Score	HighBP + HighChol + Stroke + HeartDiseaseorAttack	0–4	Beban penyakit kronis komorbid
Lifestyle_Score	PhysActivity + Fruits + Veggies − Smoker − HvyAlcohol	−2 sampai 3	Keseimbangan perilaku hidup sehat
Health_Burden	MentHlth + PhysHlth	0–60	Total beban kesehatan mental & fisik
BMI_Category	Binning dari BMI (standar WHO)	0–3	Klasifikasi status gizi WHO
Healthcare_Access	AnyHealthcare − NoDocbcCost	−1 sampai 1	Keterjangkauan akses layanan kesehatan

Feature Selection

Feature Selection adalah proses memilih variabel asli yang paling relevan dari keseluruhan variabel yang tersedia. Berbeda dengan Feature Engineering yang membuat variabel baru, Feature Selection hanya memilih mana yang paling penting dan membuang yang redundan atau tidak informatif.

Dua metode yang digunakan:

Filter Method — Chi-Square Test: Seleksi berbasis statistik, cepat dan independen dari model
Embedded Method — Random Forest Feature Importance: Seleksi berbasis performa model, lebih akurat namun lebih berat secara komputasi

Persiapan Data

# Store the target as a factor for the tests and model fitted below.
df$Diabetes_012 <- as.factor(df$Diabetes_012)

# Candidate predictors: every column except the target and the five engineered
# features, i.e. only the original variables.
prediktor <- setdiff(
  names(df),
  c("Diabetes_012",
    "ChronicDisease_Score",
    "Lifestyle_Score",
    "Health_Burden",
    "BMI_Category",
    "Healthcare_Access")
)

# Print a heading, then one "- name" line per candidate predictor.
cat("Variabel prediktor asli yang diuji:\n")

## Variabel prediktor asli yang diuji:

cat(paste("-", prediktor), sep = "\n")

## - HighBP
## - HighChol
## - CholCheck
## - BMI
## - Smoker
## - Stroke
## - HeartDiseaseorAttack
## - PhysActivity
## - Fruits
## - Veggies
## - HvyAlcoholConsump
## - AnyHealthcare
## - NoDocbcCost
## - GenHlth
## - MentHlth
## - PhysHlth
## - DiffWalk
## - Sex
## - Age
## - Education
## - Income

Metode 1: Chi-Square Test (Filter Method)

Konsep: Chi-Square test menguji apakah terdapat hubungan yang signifikan secara statistik antara setiap variabel prediktor dengan variabel target (Diabetes_012). Variabel dengan nilai chi-square besar dan p-value < 0.05 dianggap relevan.

Rumus Chi-Square:

\[\chi^2 = \sum_{i} \sum_{j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}}\]

di mana \(O_{ij}\) = frekuensi observasi dan \(E_{ij}\) = frekuensi yang diharapkan.

# Chi-square test of independence between every predictor and Diabetes_012.
# One result row is produced per predictor and all rows are bound in a single
# step, instead of growing a data frame with rbind() inside a loop.
# NOTE(review): suppressWarnings() silences any warning raised by chisq.test()
# (presumably the small-expected-count warning for sparse tables such as BMI)
# — confirm this is intended.
hasil_chi <- prediktor %>%
  lapply(function(nama_var) {
    uji <- suppressWarnings(
      chisq.test(table(df[[nama_var]], df$Diabetes_012))
    )
    data.frame(Variabel  = nama_var,
               ChiSquare = round(as.numeric(uji$statistic), 2),
               Pvalue    = uji$p.value)
  }) %>%
  bind_rows() %>%
  # Largest test statistic first.
  arrange(desc(ChiSquare))

# Render the chi-square results; significant rows get a green background.
hasil_chi %>%
  mutate(
    # Decide significance on the numeric p-value BEFORE formatting it. Parsing
    # the 3-digit string back to a number can flip results near 0.05 (0.04996
    # prints as "5.00e-02") and disagree with the row highlight below.
    Signifikan = ifelse(Pvalue < 0.05, "Ya ✓", "Tidak"),
    Pvalue     = format(Pvalue, scientific = TRUE, digits = 3)
  ) %>%
  kable(caption = "Hasil Chi-Square Test — Filter Method") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(hasil_chi$Pvalue < 0.05), background = "#eaf3de")

Hasil Chi-Square Test — Filter Method
Variabel	ChiSquare	Pvalue	Signifikan
GenHlth	20552.11	0.00e+00	Ya ✓
HighBP	17145.15	0.00e+00	Ya ✓
BMI	11611.19	0.00e+00	Ya ✓
DiffWalk	11065.07	0.00e+00	Ya ✓
HighChol	10325.43	0.00e+00	Ya ✓
Age	10093.73	0.00e+00	Ya ✓
HeartDiseaseorAttack	7463.69	0.00e+00	Ya ✓
PhysHlth	6804.30	0.00e+00	Ya ✓
Income	6332.08	0.00e+00	Ya ✓
PhysActivity	5436.75	0.00e+00	Ya ✓
Education	3020.87	0.00e+00	Ya ✓
Stroke	2501.94	0.00e+00	Ya ✓
CholCheck	1252.04	1.33e-272	Ya ✓
MentHlth	879.85	4.84e-146	Ya ✓
Smoker	821.81	3.52e-179	Ya ✓
HvyAlcoholConsump	820.84	5.72e-179	Ya ✓
Veggies	543.66	8.82e-119	Ya ✓
Sex	282.75	3.99e-62	Ya ✓
Fruits	225.63	1.01e-49	Ya ✓
AnyHealthcare	183.17	1.68e-40	Ya ✓
NoDocbcCost	98.92	3.30e-22	Ya ✓

# Horizontal bar chart of the chi-square statistic per predictor, ordered by
# size and coloured by significance at the 0.05 level.
hasil_chi %>%
  mutate(
    Variabel = reorder(Variabel, ChiSquare),
    Status   = ifelse(Pvalue < 0.05, "Signifikan", "Tidak")
  ) %>%
  ggplot(aes(x = Variabel, y = ChiSquare, fill = Status)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("Signifikan" = "#27ae60", "Tidak" = "#bdc3c7")) +
  labs(
    title    = "Chi-Square Value Tiap Variabel terhadap Diabetes_012",
    subtitle = "Hijau = signifikan (p < 0.05)",
    x        = "Variabel",
    y        = "Chi-Square Value",
    fill     = "Status"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

# Keep the predictors whose chi-square p-value is below 0.05, then list them.
fitur_chi <- filter(hasil_chi, Pvalue < 0.05)
cat("Variabel signifikan berdasarkan Chi-Square (p < 0.05):\n")

## Variabel signifikan berdasarkan Chi-Square (p < 0.05):

cat(paste0("- ", fitur_chi$Variabel), sep = "\n")

## - GenHlth
## - HighBP
## - BMI
## - DiffWalk
## - HighChol
## - Age
## - HeartDiseaseorAttack
## - PhysHlth
## - Income
## - PhysActivity
## - Education
## - Stroke
## - CholCheck
## - MentHlth
## - Smoker
## - HvyAlcoholConsump
## - Veggies
## - Sex
## - Fruits
## - AnyHealthcare
## - NoDocbcCost

# Report how many predictors passed the chi-square filter.
cat("\nJumlah:", nrow(fitur_chi), "variabel\n")

## 
## Jumlah: 21 variabel

Interpretasi Chi-Square: Seluruh 21 variabel menunjukkan hubungan yang signifikan secara statistik dengan status diabetes (p < 0.05); dengan ukuran sampel sebesar ini, hubungan yang lemah pun terdeteksi signifikan, sehingga besaran chi-square lebih informatif daripada p-value. Variabel dengan nilai chi-square tertinggi adalah GenHlth, HighBP, BMI, DiffWalk, dan HighChol (disusul Age) — konsisten dengan temuan analisis korelasi pada bagian EDA. Tidak ada variabel yang tidak signifikan; variabel dengan nilai chi-square paling rendah adalah NoDocbcCost, AnyHealthcare, Fruits, dan Sex, yang memiliki hubungan lebih lemah dengan status diabetes.

Metode 2: Random Forest Feature Importance (Embedded Method)

Konsep: Random Forest mengukur pentingnya suatu variabel berdasarkan seberapa besar penurunan impuritas (Gini Impurity) yang dihasilkan ketika variabel tersebut digunakan sebagai titik pemisah (split) dalam pohon keputusan. Semakin besar penurunan impuritas, semakin penting variabel tersebut.

Mean Decrease Gini (MDI):

\[\text{Importance}(X_j) = \frac{1}{B} \sum_{b=1}^{B} \sum_{t \in T_b} \Delta i(t, X_j)\]

di mana \(B\) = jumlah pohon, \(T_b\) = pohon ke-\(b\), dan \(\Delta i(t, X_j)\) = penurunan impuritas pada node \(t\) yang menggunakan variabel \(X_j\).

# Fit the forest on a reproducible random subsample of at most 50,000 rows.
# The seed is set first so both the row sample and the forest are repeatable.
set.seed(123)

sampel_rf <- sample(nrow(df), min(50000, nrow(df)))

# Target first, then the original predictors only.
df_rf <- df[sampel_rf, c("Diabetes_012", prediktor)]

rf <- randomForest(
  Diabetes_012 ~ .,
  data       = df_rf,
  ntree      = 300,
  importance = TRUE
)

# Importance of type 2 (labelled "Mean Decrease Gini" in the plot below),
# one row per predictor.
imp_rf <- importance(rf, type = 2)

hasil_rf <- data.frame(
  Variabel   = rownames(imp_rf),
  Importance = round(imp_rf[, 1], 2),
  # Drop the row names inherited from the importance matrix; otherwise kable()
  # prints them as a second, unnamed copy of the Variabel column.
  row.names  = NULL
) %>%
  arrange(desc(Importance))

# Selection threshold: the mean importance across all predictors.
rata_imp <- mean(hasil_rf$Importance)

# Render the importance ranking; rows above the mean importance are flagged in
# the Status column and highlighted in bold green.
hasil_rf %>%
  mutate(
    Status = c("Di bawah rata-rata",
               "Di atas rata-rata ✓")[(Importance > rata_imp) + 1]
  ) %>%
  kable(
    caption = paste0("Random Forest Feature Importance (Rata-rata = ",
                     round(rata_imp, 2), ")")
  ) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(hasil_rf$Importance > rata_imp),
           background = "#eaf3de", bold = TRUE)

Random Forest Feature Importance (Rata-rata = 517.32)
	Variabel	Importance	Status
BMI	BMI	1759.07	Di atas rata-rata ✓
Age	Age	1259.72	Di atas rata-rata ✓
Income	Income	1147.53	Di atas rata-rata ✓
GenHlth	GenHlth	881.27	Di atas rata-rata ✓
PhysHlth	PhysHlth	825.83	Di atas rata-rata ✓
MentHlth	MentHlth	768.24	Di atas rata-rata ✓
Education	Education	693.32	Di atas rata-rata ✓
HighBP	HighBP	528.14	Di atas rata-rata ✓
HighChol	HighChol	365.53	Di bawah rata-rata
Smoker	Smoker	349.28	Di bawah rata-rata
Fruits	Fruits	348.22	Di bawah rata-rata
Sex	Sex	325.55	Di bawah rata-rata
DiffWalk	DiffWalk	306.92	Di bawah rata-rata
Veggies	Veggies	272.34	Di bawah rata-rata
PhysActivity	PhysActivity	265.65	Di bawah rata-rata
HeartDiseaseorAttack	HeartDiseaseorAttack	225.56	Di bawah rata-rata
Stroke	Stroke	153.59	Di bawah rata-rata
NoDocbcCost	NoDocbcCost	145.74	Di bawah rata-rata
HvyAlcoholConsump	HvyAlcoholConsump	116.16	Di bawah rata-rata
AnyHealthcare	AnyHealthcare	79.73	Di bawah rata-rata
CholCheck	CholCheck	46.38	Di bawah rata-rata

# Horizontal bar chart of importance per predictor, with the mean importance
# (the selection threshold) drawn as a dashed reference line.
hasil_rf %>%
  mutate(Variabel = reorder(Variabel, Importance),
         Warna    = ifelse(Importance > rata_imp, "Di atas rata-rata", "Di bawah rata-rata")) %>%
  ggplot(aes(x = Variabel, y = Importance, fill = Warna)) +
  geom_col() +
  # Line thickness is set with `linewidth`; `size` for lines is deprecated
  # since ggplot2 3.4.0.
  geom_hline(yintercept = rata_imp, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.9) +
  annotate("text", x = 2, y = rata_imp + 50,
           label = paste("Rata-rata =", round(rata_imp, 1)),
           color = "#e74c3c", size = 3.5) +
  coord_flip() +
  scale_fill_manual(values = c("Di atas rata-rata" = "#2980b9",
                                "Di bawah rata-rata" = "#bdc3c7")) +
  labs(title    = "Random Forest Feature Importance — Mean Decrease Gini",
       subtitle = "Variabel biru = di atas rata-rata importance (dipilih)",
       x = "Variabel", y = "Mean Decrease Gini", fill = "") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

# Keep the predictors whose importance exceeds the mean, then list them.
fitur_rf <- filter(hasil_rf, Importance > rata_imp)
cat("Variabel di atas rata-rata importance (Random Forest):\n")

## Variabel di atas rata-rata importance (Random Forest):

cat(paste0("- ", fitur_rf$Variabel), sep = "\n")

## - BMI
## - Age
## - Income
## - GenHlth
## - PhysHlth
## - MentHlth
## - Education
## - HighBP

# Report how many predictors exceeded the mean importance.
cat("\nJumlah:", nrow(fitur_rf), "variabel\n")

## 
## Jumlah: 8 variabel

Interpretasi Random Forest Importance: BMI, Age, Income, dan GenHlth menempati posisi teratas — variabel-variabel ini secara konsisten menjadi titik pemisah paling efektif dalam pohon keputusan. Menariknya, Income yang tidak memiliki korelasi linier sangat tinggi dengan target justru muncul sebagai variabel penting dalam Random Forest, menunjukkan adanya hubungan non-linier antara pendapatan dan status diabetes.

Hasil Feature Selection: Perbandingan Dua Metode

# Final set: predictors accepted by both methods, in chi-square rank order.
# Every other candidate predictor is marked for elimination.
fitur_final <- unique(
  fitur_chi$Variabel[fitur_chi$Variabel %in% fitur_rf$Variabel]
)
fitur_hapus <- unique(prediktor[!(prediktor %in% fitur_final)])

cat("=== VARIABEL TERPILIH (konsisten di kedua metode) ===\n")

## === VARIABEL TERPILIH (konsisten di kedua metode) ===

cat(paste0("- ", fitur_final), sep = "\n")

## - GenHlth
## - HighBP
## - BMI
## - Age
## - PhysHlth
## - Income
## - Education
## - MentHlth

# Report the size of the selected set, then list the eliminated predictors.
cat("\nJumlah:", length(fitur_final), "variabel\n")

## 
## Jumlah: 8 variabel

cat("\n=== VARIABEL DIELIMINASI ===\n")

## 
## === VARIABEL DIELIMINASI ===

cat(paste("-", fitur_hapus), sep = "\n")

## - HighChol
## - CholCheck
## - Smoker
## - Stroke
## - HeartDiseaseorAttack
## - PhysActivity
## - Fruits
## - Veggies
## - HvyAlcoholConsump
## - AnyHealthcare
## - NoDocbcCost
## - DiffWalk
## - Sex

# Report how many predictors were eliminated.
cat("\nJumlah:", length(fitur_hapus), "variabel\n")

## 
## Jumlah: 13 variabel

# Side-by-side verdict of both methods for every candidate predictor; selected
# rows sort to the top.
semua_var <- data.frame(Variabel = prediktor) %>%
  mutate(
    ChiSquare    = ifelse(Variabel %in% fitur_chi$Variabel, "✓ Signifikan", "✗ Tidak"),
    RandomForest = ifelse(Variabel %in% fitur_rf$Variabel, "✓ Penting", "✗ Tidak"),
    Keputusan    = ifelse(Variabel %in% fitur_final, "PILIH ✓", "ELIMINASI ✗")
  ) %>%
  arrange(desc(Keputusan))

# Green bold rows = selected, red rows = eliminated.
kable(semua_var, caption = "Perbandingan Hasil Dua Metode Feature Selection") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(semua_var$Keputusan == "PILIH ✓"),
           background = "#eaf3de", bold = TRUE) %>%
  row_spec(which(semua_var$Keputusan == "ELIMINASI ✗"),
           background = "#fdecea")

Perbandingan Hasil Dua Metode Feature Selection
Variabel	ChiSquare	RandomForest	Keputusan
HighBP	✓ Signifikan	✓ Penting	PILIH ✓
BMI	✓ Signifikan	✓ Penting	PILIH ✓
GenHlth	✓ Signifikan	✓ Penting	PILIH ✓
MentHlth	✓ Signifikan	✓ Penting	PILIH ✓
PhysHlth	✓ Signifikan	✓ Penting	PILIH ✓
Age	✓ Signifikan	✓ Penting	PILIH ✓
Education	✓ Signifikan	✓ Penting	PILIH ✓
Income	✓ Signifikan	✓ Penting	PILIH ✓
HighChol	✓ Signifikan	✗ Tidak	ELIMINASI ✗
CholCheck	✓ Signifikan	✗ Tidak	ELIMINASI ✗
Smoker	✓ Signifikan	✗ Tidak	ELIMINASI ✗
Stroke	✓ Signifikan	✗ Tidak	ELIMINASI ✗
HeartDiseaseorAttack	✓ Signifikan	✗ Tidak	ELIMINASI ✗
PhysActivity	✓ Signifikan	✗ Tidak	ELIMINASI ✗
Fruits	✓ Signifikan	✗ Tidak	ELIMINASI ✗
Veggies	✓ Signifikan	✗ Tidak	ELIMINASI ✗
HvyAlcoholConsump	✓ Signifikan	✗ Tidak	ELIMINASI ✗
AnyHealthcare	✓ Signifikan	✗ Tidak	ELIMINASI ✗
NoDocbcCost	✓ Signifikan	✗ Tidak	ELIMINASI ✗
DiffWalk	✓ Signifikan	✗ Tidak	ELIMINASI ✗
Sex	✓ Signifikan	✗ Tidak	ELIMINASI ✗

Interpretasi Feature Selection:

Variabel yang terpilih (konsisten di kedua metode) mencakup faktor-faktor inti yang secara klinis dan statistik paling berhubungan dengan diabetes: kondisi kesehatan umum (GenHlth), BMI, usia, tekanan darah (HighBP), jumlah hari kesehatan fisik dan mental yang buruk (PhysHlth, MentHlth), serta faktor sosial-ekonomi (pendidikan dan pendapatan).

Variabel yang dieliminasi umumnya adalah:

CholCheck: Hampir semua responden melakukan cek kolesterol (distribusi sangat tidak seimbang → informasi rendah)
Sex: Perbedaan prevalensi diabetes antar jenis kelamin relatif kecil dalam dataset ini
Fruits/Veggies: Meskipun relevan secara klinis, dampaknya lebih lemah dibanding variabel lain setelah diuji secara statistik

Feature Extraction: Principal Component Analysis (PCA)

Principal Component Analysis (PCA) adalah teknik reduksi dimensi yang mentransformasi variabel asli (yang mungkin saling berkorelasi) menjadi variabel baru yang ortogonal (tidak berkorelasi) yang disebut principal components (PC).

Rumus Komponen Utama:

\[PC_k = a_{k1}X_1 + a_{k2}X_2 + \cdots + a_{kp}X_p\]

Standarisasi Z-score (wajib sebelum PCA):

\[Z_i = \frac{x_i - \bar{x}}{\sigma}\]

Pemilihan Variabel dan Standarisasi

# PCA inputs: selected original variables plus the engineered scores.
# NOTE(review): Health_Burden is defined as MentHlth + PhysHlth and all three
# columns are listed here, so the matrix is exactly linearly dependent (the
# last component has ~0 variance). Consider dropping Health_Burden or its two
# addends; doing so changes every reported loading, so it is only flagged here.
variabel_pca <- c(
  "BMI", "GenHlth", "MentHlth", "PhysHlth",
  "Age", "Education", "Income",
  "ChronicDisease_Score", "Lifestyle_Score",
  "Health_Burden", "Healthcare_Access"
)

# Complete cases only, then z-score every column.
df_pca    <- na.omit(df[, variabel_pca])
df_scaled <- scale(df_pca)

cat("Variabel PCA:", length(variabel_pca), "variabel\n")

## Variabel PCA: 11 variabel

cat("Observasi  :", nrow(df_pca), "\n\n")

## Observasi  : 236378

cat("Verifikasi standarisasi (mean harus ≈ 0, sd harus ≈ 1):\n")

## Verifikasi standarisasi (mean harus ≈ 0, sd harus ≈ 1):

# Per-column mean and standard deviation of the scaled matrix.
rbind(Mean = colMeans(df_scaled), SD = apply(df_scaled, 2, sd)) %>%
  round(4)

##      BMI GenHlth MentHlth PhysHlth Age Education Income ChronicDisease_Score
## Mean   0       0        0        0   0         0      0                    0
## SD     1       1        1        1   1         1      1                    1
##      Lifestyle_Score Health_Burden Healthcare_Access
## Mean               0             0                 0
## SD                 1             1                 1

Melakukan PCA

# PCA on the already standardized matrix, so prcomp() is told not to center or
# scale again.
# NOTE(review): prcomp() is presumably deterministic, in which case this seed
# has no effect on the result — confirm before removing.
set.seed(42)
pca_result <- prcomp(df_scaled, center = FALSE, scale. = FALSE)
# Standard deviation and (cumulative) proportion of variance per component.
summary(pca_result)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.7834 1.2867 1.1527 1.0016 0.92894 0.88054 0.81834
## Proportion of Variance 0.2891 0.1505 0.1208 0.0912 0.07845 0.07049 0.06088
## Cumulative Proportion  0.2891 0.4396 0.5604 0.6516 0.73007 0.80055 0.86143
##                            PC8    PC9    PC10      PC11
## Standard deviation     0.74503 0.7175 0.67406 2.246e-13
## Proportion of Variance 0.05046 0.0468 0.04131 0.000e+00
## Cumulative Proportion  0.91189 0.9587 1.00000 1.000e+00

Interpretasi Proporsi Variansi

# Eigenvalues (component variances) and their share of total variance.
eigenvalues <- pca_result$sdev^2
prop_var    <- eigenvalues / sum(eigenvalues)
cum_var     <- cumsum(prop_var)

tabel_var <- data.frame(
  # seq_along() is safe for any length, unlike 1:length().
  Komponen          = paste0("PC", seq_along(eigenvalues)),
  Eigenvalue        = round(eigenvalues, 4),
  Proporsi_Pct      = round(prop_var * 100, 2),
  Kumulatif_Pct     = round(cum_var * 100, 2),
  # Kaiser criterion, evaluated on the unrounded eigenvalues.
  Kaiser            = ifelse(eigenvalues >= 1, "✓ Dipertahankan", "✗ Diabaikan")
)

tabel_var %>%
  kable(caption   = "Eigenvalue dan Proporsi Variansi Tiap Komponen",
        col.names = c("Komponen","Eigenvalue","Proporsi (%)","Kumulatif (%)","Kriteria Kaiser")) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  # Highlight the rows flagged by the Kaiser column itself. Re-testing the
  # rounded eigenvalue could highlight a component (e.g. 0.99996 -> 1.0000)
  # that the label marks as dropped.
  row_spec(which(tabel_var$Kaiser == "✓ Dipertahankan"), bold = TRUE, color = "white",
           background = "#3498db")

Eigenvalue dan Proporsi Variansi Tiap Komponen
Komponen	Eigenvalue	Proporsi (%)	Kumulatif (%)	Kriteria Kaiser
PC1	3.1805	28.91	28.91	✓ Dipertahankan
PC2	1.6555	15.05	43.96	✓ Dipertahankan
PC3	1.3286	12.08	56.04	✓ Dipertahankan
PC4	1.0032	9.12	65.16	✓ Dipertahankan
PC5	0.8629	7.84	73.01	✗ Diabaikan
PC6	0.7754	7.05	80.06	✗ Diabaikan
PC7	0.6697	6.09	86.14	✗ Diabaikan
PC8	0.5551	5.05	91.19	✗ Diabaikan
PC9	0.5148	4.68	95.87	✗ Diabaikan
PC10	0.4544	4.13	100.00	✗ Diabaikan
PC11	0.0000	0.00	100.00	✗ Diabaikan

# Scree plot of the percentage of variance explained by each component.
fviz_eig(pca_result,
          addlabels = TRUE,
          ylim      = c(0, 35),
          barfill   = "#3498db",
          barcolor  = "#2980b9",
          linecolor = "#e74c3c",
          ggtheme   = theme_minimal(base_size = 13)) +
  labs(title    = "Scree Plot — Proporsi Variansi Tiap Komponen Utama",
       subtitle = "Garis putus-putus = referensi eigenvalue ≥ 1 (Kriteria Kaiser)",
       x = "Principal Component", y = "% Variansi yang Dijelaskan") +
  # With standardized inputs an eigenvalue of 1 equals 100 / p percent of the
  # variance. `linewidth` replaces `size` for lines (deprecated since
  # ggplot2 3.4.0).
  geom_hline(yintercept = 100/length(eigenvalues),
             linetype = "dashed", color = "#e74c3c", linewidth = 0.8) +
  theme(plot.title = element_text(face = "bold"))

Interpretasi Scree Plot & Proporsi Variansi:

Berdasarkan Kriteria Kaiser (eigenvalue ≥ 1), terdapat 4 komponen utama yang dipertahankan (PC1–PC4).

Komponen	Variansi	Kumulatif	Interpretasi
PC1	28.91%	28.91%	Komponen terpenting — menjelaskan hampir 1/3 total variansi
PC2	15.05%	43.96%	Bersama PC1, sudah merangkum 44% informasi
PC3	12.08%	56.04%	Melewati batas 50% kumulatif
PC4	9.12%	65.16%	4 komponen merangkum 65% total variansi

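Reduksi dari 11 variabel menjadi 4 komponen yang menjelaskan 65.16% variansi merupakan kompresi yang bermakna. Catatan: eigenvalue PC11 bernilai 0 karena Health_Burden merupakan penjumlahan persis MentHlth + PhysHlth, sehingga ketiga variabel tersebut bergantung linier dan dimensi efektif data hanya 10.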

Loading Factor

# Loadings of the first four components in long form: one row per
# (variable, component) pair.
loading_long <- pca_result$rotation[, 1:4] %>%
  as.data.frame() %>%
  rownames_to_column(var = "Variabel") %>%
  pivot_longer(cols = -Variabel, names_to = "PC", values_to = "Loading")

# Heatmap of the loadings, with the rounded value printed in each tile.
loading_long %>%
  ggplot(aes(x = PC, y = Variabel, fill = Loading)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Loading, 3)), size = 3.5) +
  scale_fill_gradient2(
    low = "#2980b9", mid = "white", high = "#c0392b",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(
    title    = "Heatmap Loading Factor — 4 Komponen Utama",
    subtitle = "Merah = kontribusi positif kuat | Biru = kontribusi negatif kuat",
    x        = "Komponen Utama",
    y        = "Variabel",
    fill     = "Loading"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title  = element_text(face = "bold"),
    axis.text.y = element_text(size = 11)
  )

# Loadings of the first four components, rounded to four decimals.
loading_df <- pca_result$rotation[, 1:4] %>%
  round(4) %>%
  as.data.frame() %>%
  rownames_to_column("Variabel") %>%
  select(Variabel, PC1, PC2, PC3, PC4)

# Loadings with |value| > 0.3 are bold; PC1 is also coloured by sign.
kable(loading_df, caption = "Loading Factor 4 Komponen Utama") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover")) %>%
  column_spec(2,
              bold  = abs(loading_df$PC1) > 0.3,
              color = ifelse(loading_df$PC1 > 0.3, "#c0392b",
                             ifelse(loading_df$PC1 < -0.3, "#2980b9", "black"))) %>%
  column_spec(3, bold = abs(loading_df$PC2) > 0.3) %>%
  column_spec(4, bold = abs(loading_df$PC3) > 0.3) %>%
  column_spec(5, bold = abs(loading_df$PC4) > 0.3)

Loading Factor 4 Komponen Utama
Variabel	PC1	PC2	PC3	PC4
BMI	0.1700	0.0810	0.0725	0.8855
GenHlth	0.4098	0.1523	-0.0616	0.1347
MentHlth	0.3455	-0.4148	-0.1233	-0.0609
PhysHlth	0.4272	-0.0941	-0.2680	-0.1072
Age	0.0702	0.5668	-0.2887	-0.2772
Education	-0.2237	-0.2082	-0.5309	0.1457
Income	-0.3043	-0.1919	-0.3957	0.2299
ChronicDisease_Score	0.2206	0.4742	-0.2407	0.0952
Lifestyle_Score	-0.2371	-0.1509	-0.2466	-0.0882
Health_Burden	0.4791	-0.3104	-0.2441	-0.1046
Healthcare_Access	-0.1388	0.2094	-0.4515	0.0626

Interpretasi Loading Factor per Komponen

PC1 (28.91%) — “Dimensi Beban Kesehatan & Kesenjangan Sosial”

Variabel dominan: Health_Burden (+0.479), PhysHlth (+0.427), GenHlth (+0.410), MentHlth (+0.346) arah positif; Income (−0.304) arah negatif.

Semakin tinggi PC1 → kondisi fisik dan mental semakin buruk dan pendapatan semakin rendah. PC1 merangkum disparitas kesehatan: kelompok berpendapatan rendah menanggung beban kesehatan lebih besar.

PC2 (15.05%) — “Dimensi Penuaan & Komorbiditas Kronis”

Variabel dominan: Age (+0.567), ChronicDisease_Score (+0.474) arah positif; MentHlth (−0.415), Health_Burden (−0.310) arah negatif.

Semakin tinggi PC2 → usia semakin tua dengan lebih banyak penyakit kronis (hipertensi, kolesterol, stroke, penyakit jantung). Ini adalah profil tipikal pasien lansia dengan multi-morbiditas.

PC3 (12.08%) — “Dimensi Status Sosial-Ekonomi & Aksesibilitas”

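Variabel dominan: Education (−0.531), Healthcare_Access (−0.452), Income (−0.396) — seluruhnya berarah negatif.

Karena seluruh loading dominan bertanda negatif, semakin rendah skor PC3 → berpendidikan lebih tinggi, berpendapatan lebih baik, dan lebih mudah mengakses layanan kesehatan (tanda komponen PCA bersifat arbitrer; yang bermakna adalah ketiga variabel ini bergerak searah). PC3 merepresentasikan dimensi human capital dan modal sosial.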

PC4 (9.12%) — “Dimensi Status Gizi (BMI)”

Variabel dominan: BMI (+0.886) — sangat dominan.

PC4 hampir sepenuhnya merepresentasikan BMI sebagai dimensi tersendiri yang independen dari tiga dimensi sebelumnya. Ini menunjukkan bahwa obesitas adalah faktor risiko yang unik dan tidak dapat dijelaskan sepenuhnya oleh faktor sosial-ekonomi maupun penyakit kronis.

Visualisasi PCA

# Biplot on a reproducible subsample so the points stay readable.
set.seed(42)
# Never request more rows than exist (mirrors the guard on the RF subsample).
idx_sample <- sample(nrow(df_scaled), min(3000, nrow(df_scaled)))
pca_sample <- prcomp(df_scaled[idx_sample, ], center = FALSE, scale. = FALSE)

# The axes drawn come from the PCA refit on the subsample, so the percentages
# in the labels are computed from that fit instead of being copied from the
# full-data PCA.
pct_sample <- round(100 * pca_sample$sdev^2 / sum(pca_sample$sdev^2), 2)

fviz_pca_biplot(
  pca_sample,
  geom.ind  = "point",
  col.ind   = "#BDC3C7",
  alpha.ind = 0.3,
  col.var   = "#e74c3c",
  repel     = TRUE,
  labelsize = 4,
  ggtheme   = theme_minimal(base_size = 12)
) +
  labs(title    = "Biplot PCA — PC1 vs PC2",
       subtitle = "Panah = arah dan kekuatan kontribusi variabel | Titik = observasi (sampel 3.000)",
       x = paste0("PC1 — Beban Kesehatan & Kesenjangan Sosial (", pct_sample[1], "%)"),
       y = paste0("PC2 — Penuaan & Komorbiditas Kronis (", pct_sample[2], "%)")) +
  theme(plot.title = element_text(face = "bold", size = 14))

# Variable contributions (%) to PC1 and PC2, shown side by side.
p1 <- fviz_contrib(
  pca_result,
  choice  = "var",
  axes    = 1,
  top     = 11,
  fill    = "#3498db",
  color   = "#2980b9",
  ggtheme = theme_minimal(base_size = 11)
) +
  ggtitle("Kontribusi Variabel → PC1") +
  xlab("") +
  ylab("Kontribusi (%)") +
  theme(plot.title = element_text(face = "bold"))

p2 <- fviz_contrib(
  pca_result,
  choice  = "var",
  axes    = 2,
  top     = 11,
  fill    = "#e67e22",
  color   = "#d35400",
  ggtheme = theme_minimal(base_size = 11)
) +
  ggtitle("Kontribusi Variabel → PC2") +
  xlab("") +
  ylab("Kontribusi (%)") +
  theme(plot.title = element_text(face = "bold"))

grid.arrange(p1, p2, ncol = 2)

# Cos² of every variable on the plane spanned by PC1 and PC2.
fviz_cos2(
  pca_result,
  choice  = "var",
  axes    = 1:2,
  fill    = "#9b59b6",
  color   = "#8e44ad",
  ggtheme = theme_minimal(base_size = 12)
) +
  labs(
    title    = "Cos² — Kualitas Representasi Variabel pada PC1 & PC2",
    subtitle = "Nilai tinggi = variabel terwakili dengan baik oleh kedua komponen pertama",
    x        = "Variabel",
    y        = "Cos²"
  ) +
  theme(plot.title = element_text(face = "bold"))

Interpretasi Visualisasi:

Biplot: Panah Health_Burden, PhysHlth, GenHlth mengarah ke kanan (PC1 tinggi). Panah Age dan ChronicDisease_Score mengarah ke atas (PC2 tinggi). Panah Income dan Education berlawanan arah dengan Health_Burden — mencerminkan hubungan negatif antara status sosial-ekonomi dan beban kesehatan.
Kontribusi: Garis putus-putus adalah nilai referensi merata. Variabel yang batangnya melewati garis berkontribusi signifikan.
Cos²: Health_Burden, PhysHlth, GenHlth, MentHlth memiliki nilai cos² tertinggi — dimensi kesehatan terwakili sangat baik oleh PC1 dan PC2.

Ringkasan PCA

# Summary of the four retained components, written one component per row.
# NOTE(review): the percentages are typed in by hand, not read from
# pca_result; keep them in sync with the eigenvalue table.
tribble(
  ~Komponen, ~`Variansi (%)`, ~`Kumulatif (%)`, ~`Variabel Dominan`, ~`Nama Dimensi`,
  "PC1", 28.91, 28.91,
  "Health_Burden, PhysHlth, GenHlth, MentHlth",
  "Beban Kesehatan & Kesenjangan Sosial",
  "PC2", 15.05, 43.96,
  "Age, ChronicDisease_Score",
  "Penuaan & Komorbiditas Kronis",
  "PC3", 12.08, 56.04,
  "Education, Healthcare_Access, Income",
  "Status Sosial-Ekonomi & Aksesibilitas",
  "PC4", 9.12, 65.16,
  "BMI (dominan tunggal)",
  "Status Gizi (Obesitas)"
) %>%
  kable(caption = "Ringkasan 4 Komponen Utama PCA") %>%
  kable_styling(full_width = TRUE,
                bootstrap_options = c("striped","hover","responsive"))

Ringkasan 4 Komponen Utama PCA
Komponen	Variansi (%)	Kumulatif (%)	Variabel Dominan	Nama Dimensi
PC1	28.91	28.91	Health_Burden, PhysHlth, GenHlth, MentHlth	Beban Kesehatan & Kesenjangan Sosial
PC2	15.05	43.96	Age, ChronicDisease_Score	Penuaan & Komorbiditas Kronis
PC3	12.08	56.04	Education, Healthcare_Access, Income	Status Sosial-Ekonomi & Aksesibilitas
PC4	9.12	65.16	BMI (dominan tunggal)	Status Gizi (Obesitas)

Insight dan Kesimpulan

Variabel Paling Penting

Berdasarkan konvergensi hasil EDA, Feature Selection (Chi-Square + Random Forest), dan loading factor PCA, variabel yang secara konsisten muncul sebagai paling berpengaruh adalah:

GenHlth — Persepsi kesehatan umum adalah cerminan holistik kondisi seseorang yang merangkum banyak dimensi kesehatan sekaligus
BMI — Obesitas merupakan faktor risiko biologis utama diabetes tipe 2 yang terkonfirmasi dari semua metode
Age — Prevalensi diabetes meningkat seiring usia, mencerminkan akumulasi risiko sepanjang hidup
HighBP — Hipertensi dan diabetes sangat sering terjadi bersamaan (comorbid), keduanya dipicu faktor gaya hidup yang sama
Income — Status ekonomi mempengaruhi akses makanan sehat, waktu olahraga, dan layanan kesehatan

Apakah Reduksi Dimensi Berhasil?

Ya, reduksi dimensi berhasil dilakukan dengan baik:

Feature Engineering: 22 variabel asli + 5 fitur baru yang lebih informatif
Feature Selection: Dari 21 prediktor asli, tersaring menjadi variabel-variabel inti yang paling relevan
PCA: 11 variabel (termasuk fitur baru) berhasil dikompres menjadi 4 komponen yang menjelaskan 65.16% variansi — reduksi dimensi lebih dari setengah dengan kehilangan informasi yang masih dapat diterima

Makna Substantif Principal Components

Empat komponen PCA berhasil mengidentifikasi empat dimensi laten yang relevan secara substantif:

PC	Dimensi	Implikasi Kebijakan
PC1	Beban Kesehatan & Kesenjangan Sosial	Intervensi pada kelompok berpendapatan rendah paling prioritas
PC2	Penuaan & Komorbiditas Kronis	Program skrining diabetes untuk lansia sangat diperlukan
PC3	Status Sosial-Ekonomi & Aksesibilitas	Peningkatan literasi kesehatan dan perluasan asuransi kesehatan universal
PC4	Status Gizi (Obesitas)	Program penanganan obesitas sebagai pencegahan primer

Insight Utama dari Data

Diabetes bukanlah penyakit yang disebabkan oleh satu faktor tunggal, melainkan hasil interaksi empat dimensi sekaligus: beban kesehatan yang sudah ada, proses penuaan, hambatan sosial-ekonomi, dan status gizi. Pencegahan yang efektif harus menyasar keempat dimensi ini secara bersamaan.

Data Reduction: Diabetes Health Indicators BRFSS 2021

Eksplorasi, Visualisasi, dan Reduksi Data

Kelompok: Suci Wardatun · Octa Syahira · Asyifa

01 June 2026

Dataset

Memuat Data

Struktur Dataset

Deskripsi Variabel

Tujuan Analisis

Paket R yang Digunakan

Exploratory Data Analysis (EDA)

Statistik Deskriptif

Cek Missing Value

Distribusi Variabel Target

Visualisasi Variabel Numerik

Identifikasi Outlier

Boxplot BMI Berdasarkan Status Diabetes

Scatterplot BMI vs Usia

Analisis Korelasi

Identifikasi Multikolinearitas (VIF)

Feature Engineering

Fitur 1: ChronicDisease_Score — Skor Penyakit Kronis

Fitur 2: Lifestyle_Score — Skor Gaya Hidup Sehat

Fitur 3: Health_Burden — Beban Kesehatan Total

Fitur 4: BMI_Category — Kategori BMI (Binning WHO)

Fitur 5: Healthcare_Access — Akses Layanan Kesehatan Bersih

Ringkasan Feature Engineering

Feature Selection

Persiapan Data

Metode 1: Chi-Square Test (Filter Method)

Metode 2: Random Forest Feature Importance (Embedded Method)

Hasil Feature Selection: Perbandingan Dua Metode

Feature Extraction: Principal Component Analysis (PCA)

Pemilihan Variabel dan Standarisasi

Melakukan PCA

Interpretasi Proporsi Variansi

Loading Factor

Interpretasi Loading Factor per Komponen

Visualisasi PCA

Ringkasan PCA

Insight dan Kesimpulan

Variabel Paling Penting

Apakah Reduksi Dimensi Berhasil?

Makna Substantif Principal Components

Insight Utama dari Data

Fitur 1: `ChronicDisease_Score` — Skor Penyakit Kronis

Fitur 2: `Lifestyle_Score` — Skor Gaya Hidup Sehat

Fitur 3: `Health_Burden` — Beban Kesehatan Total

Fitur 4: `BMI_Category` — Kategori BMI (Binning WHO)

Fitur 5: `Healthcare_Access` — Akses Layanan Kesehatan Bersih