library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the diabetes dataset from a comma-separated file on the local disk.
df <- read.csv(
  file = "C:/Users/LENOVO/Downloads/diabetes_dataset.csv",
  sep = ","
)
# Preview the first six rows to check how the columns were parsed.
head(df, n = 6L)
## Age Pregnancies BMI Glucose BloodPressure HbA1c LDL HDL Triglycerides
## 1 69 5 28.39 130.1 77 5.4 130.4 44.0 50.0
## 2 32 1 26.49 116.5 72 4.5 87.4 54.2 129.9
## 3 89 13 25.34 101.0 82 4.9 112.5 56.8 177.6
## 4 78 13 29.91 146.0 104 5.7 50.7 39.1 117.0
## 5 38 8 24.56 103.2 74 4.7 102.5 29.1 145.9
## 6 41 10 17.47 67.0 71 4.2 105.3 58.8 140.7
## WaistCircumference HipCircumference WHR FamilyHistory DietType Hypertension
## 1 90.5 107.9 0.84 0 0 0
## 2 113.3 81.4 1.39 0 0 0
## 3 84.7 107.2 0.79 0 0 0
## 4 108.9 110.0 0.99 0 0 0
## 5 84.1 92.8 0.91 0 1 0
## 6 81.8 93.2 0.88 1 0 0
## MedicationUse Outcome
## 1 1 0
## 2 0 0
## 3 1 0
## 4 1 1
## 5 0 0
## 6 0 1
# Coerce every column of the data frame to numeric.
df <- mutate(df, across(.cols = everything(), .fns = as.numeric))
# Display the column names and types after the coercion.
str(df)
## 'data.frame': 9538 obs. of 17 variables:
## $ Age : num 69 32 89 78 38 41 20 39 70 19 ...
## $ Pregnancies : num 5 1 13 13 8 10 16 4 3 1 ...
## $ BMI : num 28.4 26.5 25.3 29.9 24.6 ...
## $ Glucose : num 130 116 101 146 103 ...
## $ BloodPressure : num 77 72 82 104 74 71 60 94 90 62 ...
## $ HbA1c : num 5.4 4.5 4.9 5.7 4.7 4.2 4 4.5 4 4 ...
## $ LDL : num 130.4 87.4 112.5 50.7 102.5 ...
## $ HDL : num 44 54.2 56.8 39.1 29.1 58.8 43.4 50.1 51.3 64.3 ...
## $ Triglycerides : num 50 130 178 117 146 ...
## $ WaistCircumference: num 90.5 113.3 84.7 108.9 84.1 ...
## $ HipCircumference : num 107.9 81.4 107.2 110 92.8 ...
## $ WHR : num 0.84 1.39 0.79 0.99 0.91 0.88 0.65 1.01 0.75 0.76 ...
## $ FamilyHistory : num 0 0 0 0 0 1 0 1 0 1 ...
## $ DietType : num 0 0 0 0 1 0 1 0 1 0 ...
## $ Hypertension : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MedicationUse : num 1 0 1 1 0 0 0 0 1 0 ...
## $ Outcome : num 0 0 0 1 0 1 0 1 0 1 ...
# Per-column descriptive statistics (min, quartiles, median, mean, max).
df %>% summary()
## Age Pregnancies BMI Glucose
## Min. :18.00 Min. : 0.000 Min. :15.00 Min. : 50.0
## 1st Qu.:36.00 1st Qu.: 4.000 1st Qu.:22.87 1st Qu.: 91.0
## Median :53.00 Median : 8.000 Median :27.05 Median :106.0
## Mean :53.58 Mean : 7.986 Mean :27.05 Mean :106.1
## 3rd Qu.:72.00 3rd Qu.:12.000 3rd Qu.:31.18 3rd Qu.:121.0
## Max. :89.00 Max. :16.000 Max. :49.66 Max. :207.2
## BloodPressure HbA1c LDL HDL
## Min. : 60.00 Min. :4.000 Min. :-12.0 Min. : -9.20
## 1st Qu.: 74.00 1st Qu.:4.300 1st Qu.: 80.1 1st Qu.: 39.70
## Median : 84.00 Median :4.600 Median : 99.9 Median : 50.20
## Mean : 84.48 Mean :4.651 Mean :100.1 Mean : 49.95
## 3rd Qu.: 94.00 3rd Qu.:5.000 3rd Qu.:120.2 3rd Qu.: 60.20
## Max. :138.00 Max. :6.900 Max. :202.2 Max. :107.80
## Triglycerides WaistCircumference HipCircumference WHR
## Min. : 50.0 Min. : 40.30 Min. : 54.8 Min. :0.4200
## 1st Qu.:117.2 1st Qu.: 83.40 1st Qu.: 94.0 1st Qu.:0.8200
## Median :150.6 Median : 93.80 Median :103.2 Median :0.9100
## Mean :151.1 Mean : 93.95 Mean :103.1 Mean :0.9174
## 3rd Qu.:185.1 3rd Qu.:104.60 3rd Qu.:112.1 3rd Qu.:1.0100
## Max. :345.8 Max. :163.00 Max. :156.6 Max. :1.4900
## FamilyHistory DietType Hypertension MedicationUse
## Min. :0.0000 Min. :0.0000 Min. :0.000000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :0.000000 Median :0.000
## Mean :0.3025 Mean :0.4862 Mean :0.001048 Mean :0.405
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.000000 3rd Qu.:1.000
## Max. :1.0000 Max. :2.0000 Max. :1.000000 Max. :1.000
## Outcome
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3441
## 3rd Qu.:1.0000
## Max. :1.0000
# Count the missing values in each column.
df %>%
  is.na() %>%
  colSums()
## Age Pregnancies BMI Glucose
## 0 0 0 0
## BloodPressure HbA1c LDL HDL
## 0 0 0 0
## Triglycerides WaistCircumference HipCircumference WHR
## 0 0 0 0
## FamilyHistory DietType Hypertension MedicationUse
## 0 0 0 0
## Outcome
## 0
Interpretasi : Tidak ada Missing Value pada data frame diabetes
# Draw one histogram per column on a 3 x 3 grid of panels.
# par() returns the previous settings, so they are restored exactly afterwards
# instead of assuming the device started in a 1 x 1 layout.
old_par <- par(mfrow = c(3, 3))
for (col in colnames(df)) {
  hist(
    df[[col]],
    main = paste("Distribusi", col),
    xlab = col,
    col = "skyblue",
    border = "black"
  )
}
par(old_par)
Interpretasi : Pada hasil visualisasi data diabetes di atas, hanya terdapat tiga histogram yang menunjukkan distribusi mendekati normal, yaitu WHR, WaistCircumference, dan HipCircumference.
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
# Reshape the data to long format (one row per variable/value pair) so that
# ggplot2 can draw one box per variable.
df_long <- pivot_longer(
  df,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# Draw a boxplot for every variable on a shared y-axis.
ggplot(data = df_long, mapping = aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  labs(title = "Boxplot untuk Semua Variabel", x = "Variabel", y = "Nilai") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# This chunk used to repeat the previous boxplot verbatim. It now reuses the
# long-format `df_long` built there and draws one panel per variable with its
# own y-axis, so variables with a small range (e.g. WHR, HbA1c, the 0/1
# indicators) are no longer flattened by the scale of the large-valued ones.
ggplot(df_long, aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  facet_wrap(~ Variable, scales = "free") +
  theme_minimal() +
  # The facet strip already names the variable, so the x tick label is dropped.
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  labs(title = "Boxplot untuk Semua Variabel", x = "Variabel", y = "Nilai")
Interpretasi : Beberapa variabel memiliki banyak Outlier diantaranya :
Glucose: Memiliki beberapa outlier dengan nilai tinggi, menunjukkan adanya individu dengan kadar glukosa yang jauh di atas normal.
Triglycerides: Memiliki distribusi yang sangat lebar dengan banyak outlier pada nilai tinggi, menunjukkan bahwa beberapa individu memiliki kadar trigliserida yang jauh di atas normal.
LDL dan HDL juga menunjukkan banyak outlier, yang mengindikasikan variasi besar dalam kadar kolesterol.
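Kemudian, variabel biner/kategorikal seperti DietType, FamilyHistory, Hypertension, MedicationUse, dan Outcome hanya bernilai 0 dan 1 (DietType 0–2) sehingga tidak memiliki distribusi yang luas seperti variabel lainnya. HbA1c dan WHR bukan variabel biner, tetapi rentang nilainya kecil (sekitar 4–6,9 dan 0,42–1,49) sehingga boxplot-nya tampak sangat sempit pada sumbu y bersama.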
# Load library
library(ggplot2)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Compute the pairwise correlation matrix of all columns, using only rows
# without missing values.
cor_matrix <- cor(x = df, use = "complete.obs")
# Print the full correlation matrix.
print(cor_matrix)
## Age Pregnancies BMI Glucose
## Age 1.000000000 0.010713797 0.021793701 0.487807610
## Pregnancies 0.010713797 1.000000000 -0.013992517 0.007125949
## BMI 0.021793701 -0.013992517 1.000000000 0.556682648
## Glucose 0.487807610 0.007125949 0.556682648 1.000000000
## BloodPressure 0.429334859 -0.008272473 0.605405105 0.541986672
## HbA1c 0.396354303 0.009060483 0.459851892 0.817493001
## LDL -0.003870920 0.006716723 0.019921205 0.019576394
## HDL -0.009565534 -0.000258451 -0.018187928 -0.020586805
## Triglycerides 0.010470254 0.005732030 -0.014391717 -0.008764540
## WaistCircumference 0.025591129 -0.018594613 0.761733881 0.428944446
## HipCircumference 0.020943913 -0.006865529 0.662458040 0.372785993
## WHR 0.009933264 -0.013981709 0.252677132 0.142553677
## FamilyHistory -0.012887442 -0.017076600 0.008037471 -0.005163065
## DietType -0.002629394 -0.002276881 -0.017385961 -0.017349827
## Hypertension 0.038417396 -0.011729783 0.073730882 0.072805940
## MedicationUse 0.852709405 0.001795024 0.016819379 0.415119221
## Outcome 0.068452792 -0.018188538 0.107247654 0.168324558
## BloodPressure HbA1c LDL HDL
## Age 0.4293348593 0.396354303 -0.0038709198 -0.009565534
## Pregnancies -0.0082724734 0.009060483 0.0067167233 -0.000258451
## BMI 0.6054051045 0.459851892 0.0199212051 -0.018187928
## Glucose 0.5419866723 0.817493001 0.0195763944 -0.020586805
## BloodPressure 1.0000000000 0.444979960 0.0002646506 -0.011031264
## HbA1c 0.4449799604 1.000000000 0.0119746973 -0.023278377
## LDL 0.0002646506 0.011974697 1.0000000000 -0.015699818
## HDL -0.0110312636 -0.023278377 -0.0156998176 1.000000000
## Triglycerides -0.0147421155 -0.016008677 0.0108642343 -0.013874825
## WaistCircumference 0.4601783963 0.352626317 0.0173209038 -0.016538369
## HipCircumference 0.4006259305 0.308792051 0.0184306343 -0.008864187
## WHR 0.1523048832 0.114623921 0.0042228481 -0.008544794
## FamilyHistory 0.0013959510 -0.007980574 0.0182118824 -0.005171346
## DietType -0.0087386224 -0.016616527 -0.0111977652 -0.007914133
## Hypertension 0.1035118057 0.071362645 -0.0028415250 -0.010167466
## MedicationUse 0.3690455905 0.339439873 -0.0014819496 -0.023658835
## Outcome 0.0963609752 0.151505584 0.0194973562 -0.008837176
## Triglycerides WaistCircumference HipCircumference
## Age 0.010470254 0.025591129 0.020943913
## Pregnancies 0.005732030 -0.018594613 -0.006865529
## BMI -0.014391717 0.761733881 0.662458040
## Glucose -0.008764540 0.428944446 0.372785993
## BloodPressure -0.014742115 0.460178396 0.400625931
## HbA1c -0.016008677 0.352626317 0.308792051
## LDL 0.010864234 0.017320904 0.018430634
## HDL -0.013874825 -0.016538369 -0.008864187
## Triglycerides 1.000000000 -0.005236727 -0.002493902
## WaistCircumference -0.005236727 1.000000000 0.512327970
## HipCircumference -0.002493902 0.512327970 1.000000000
## WHR -0.003193723 0.644491903 -0.314888464
## FamilyHistory -0.004994390 0.011752699 0.020878803
## DietType 0.020502899 -0.031460433 -0.012527172
## Hypertension 0.001232519 0.036727585 0.041223122
## MedicationUse 0.017657899 0.024709289 0.020334338
## Outcome -0.008487375 0.087492368 0.082797235
## WHR FamilyHistory DietType Hypertension
## Age 0.009933264 -0.0128874418 -0.002629394 0.0384173960
## Pregnancies -0.013981709 -0.0170766001 -0.002276881 -0.0117297831
## BMI 0.252677132 0.0080374713 -0.017385961 0.0737308825
## Glucose 0.142553677 -0.0051630653 -0.017349827 0.0728059398
## BloodPressure 0.152304883 0.0013959510 -0.008738622 0.1035118057
## HbA1c 0.114623921 -0.0079805735 -0.016616527 0.0713626452
## LDL 0.004222848 0.0182118824 -0.011197765 -0.0028415250
## HDL -0.008544794 -0.0051713462 -0.007914133 -0.0101674664
## Triglycerides -0.003193723 -0.0049943895 0.020502899 0.0012325192
## WaistCircumference 0.644491903 0.0117526990 -0.031460433 0.0367275849
## HipCircumference -0.314888464 0.0208788028 -0.012527172 0.0412231220
## WHR 1.000000000 -0.0041796716 -0.025320045 0.0047391736
## FamilyHistory -0.004179672 1.0000000000 -0.015389473 -0.0001745137
## DietType -0.025320045 -0.0153894725 1.000000000 0.0251800300
## Hypertension 0.004739174 -0.0001745137 0.025180030 1.0000000000
## MedicationUse 0.009986125 -0.0076526985 -0.002919979 0.0392662751
## Outcome 0.021726631 0.9091660892 -0.015549044 0.0310893008
## MedicationUse Outcome
## Age 0.852709405 0.068452792
## Pregnancies 0.001795024 -0.018188538
## BMI 0.016819379 0.107247654
## Glucose 0.415119221 0.168324558
## BloodPressure 0.369045590 0.096360975
## HbA1c 0.339439873 0.151505584
## LDL -0.001481950 0.019497356
## HDL -0.023658835 -0.008837176
## Triglycerides 0.017657899 -0.008487375
## WaistCircumference 0.024709289 0.087492368
## HipCircumference 0.020334338 0.082797235
## WHR 0.009986125 0.021726631
## FamilyHistory -0.007652698 0.909166089
## DietType -0.002919979 -0.015549044
## Hypertension 0.039266275 0.031089301
## MedicationUse 1.000000000 0.064626215
## Outcome 0.064626215 1.000000000
# Correlation heatmap with corrplot: lower triangle only, coloured tiles,
# blue-white-red palette with 200 colour steps.
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  tl.cex = 0.8,
  tl.col = "black",
  col = colorRampPalette(c("blue", "white", "red"))(200)
)
# Alternative heatmap with ggplot2, without reshape2: as.table() followed by
# as.data.frame() turns the matrix into long format with the columns
# Var1, Var2 and Freq (the correlation value).
cor_long <- as.data.frame(as.table(cor_matrix))
ggplot(cor_long, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    # Spelled out in full; `limit` only worked via partial argument matching.
    limits = c(-1, 1),
    space = "Lab"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Heatmap Korelasi", x = "", y = "")
Berikut adalah beberapa interpretasi yang dapat diberikan :
Glukosa dan HbA1c adalah variabel penting yang dapat digunakan untuk analisis lebih lanjut terkait penyakit metabolik.
BMI dan lingkar pinggang/pinggul memiliki korelasi tinggi, sehingga dapat digunakan dalam analisis obesitas dan risikonya terhadap kesehatan.
Outcome berkorelasi paling kuat dengan FamilyHistory (r ≈ 0,91). Korelasinya dengan glukosa (≈ 0,17), HbA1c (≈ 0,15), BMI (≈ 0,11), dan hipertensi (≈ 0,03) positif tetapi lemah.
Perlu dilakukan analisis regresi atau machine learning untuk memahami pengaruh variabel-variabel ini secara lebih mendalam terhadap Outcome.
# Count outliers per column with the 1.5 * IQR rule.
#
# The fences must be applied column by column. Comparing the whole data frame
# against the fence vectors directly (`df < lower`) recycles the vector down
# the rows, so each cell is tested against a fence belonging to some other
# column and the counts are meaningless. sweep() with MARGIN = 2 matches
# fence j to column j.
#
# vapply() is used instead of apply() so the data frame is not first coerced
# to a matrix and the result is guaranteed to be one number per column.
Q1 <- vapply(df, quantile, numeric(1), probs = 0.25, na.rm = TRUE, names = FALSE)
Q3 <- vapply(df, quantile, numeric(1), probs = 0.75, na.rm = TRUE, names = FALSE)
# NOTE: the name `IQR` is kept for compatibility with any later code, although
# it shadows stats::IQR as a variable.
IQR <- Q3 - Q1
lower_fence <- Q1 - 1.5 * IQR
upper_fence <- Q3 + 1.5 * IQR
values <- as.matrix(df)
# Logical matrix with the same shape as `df`: TRUE where a value lies outside
# the fences of its own column.
outliers <- sweep(values, 2, lower_fence, "<") |
  sweep(values, 2, upper_fence, ">")
# Number of flagged observations per column (missing values are not counted).
colSums(outliers, na.rm = TRUE)
## Age Pregnancies BMI Glucose
## 5857 7373 6636 5791
## BloodPressure HbA1c LDL HDL
## 5327 7856 5868 6082
## Triglycerides WaistCircumference HipCircumference WHR
## 7391 5396 5549 5613
## FamilyHistory DietType Hypertension MedicationUse
## 5585 5673 5611 5621
## Outcome
## 5638
Interpretasi :
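Semua variabel tampak memiliki outlier dalam jumlah besar (lebih dari separuh observasi di setiap kolom). Catatan: aturan 1,5 × IQR tidak mungkin menandai lebih dari 50% data sebagai outlier, karena paling banyak 25% data berada di bawah Q1 dan 25% di atas Q3. Angka-angka ini mengindikasikan batas IQR tidak dibandingkan per kolom, sehingga perlu dihitung ulang sebelum ditafsirkan.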
Variabel dengan jumlah outlier tinggi:
Pregnancies (7373 outlier)
Triglycerides (7391 outlier)
BMI (6636 outlier)
HbA1c (7856 outlier, tertinggi)
Variabel dengan jumlah outlier lebih rendah tetapi tetap signifikan:
BloodPressure (5327 outlier)
HipCircumference (5549 outlier)
MedicationUse (5621 outlier)
# Density plot of the waist-to-hip ratio (WHR) column.
ggplot(df, aes(x = WHR)) +
  geom_density(fill = "Pink", alpha = 0.5) +
  # The x-axis label previously read "Glucose" although WHR is plotted.
  labs(title = "Density Plot WHR", x = "WHR", y = "Density") +
  theme_minimal()
Interpretasi : Kesimpulan
Distribusi WHR dalam dataset cenderung normal dengan puncak di sekitar 0.9.
Tidak terdapat outlier ekstrem, tetapi ada sedikit skewness ke kanan.
Mayoritas individu memiliki WHR antara 0.8 dan 1.1, yang dapat digunakan untuk analisis lebih lanjut terkait risiko kesehatan.
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix coloured by Outcome.
# Outcome is converted to a factor in a plotting copy of the data. When it
# stays numeric it is both the colour group and a plotted column, so its
# standard deviation within each group is zero and every correlation panel
# involving it raises "the standard deviation is zero".
# NOTE(review): columns that are constant within one Outcome group can still
# raise that warning - confirm after re-running.
ggpairs(
  mutate(df, Outcome = factor(Outcome)),
  mapping = aes(color = Outcome)
)
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Variance inflation factors for a linear model of Glucose on all other
# columns of the data frame.
lm(formula = Glucose ~ ., data = df) %>%
  vif()
## Age Pregnancies BMI BloodPressure
## 4.245725 1.001493 4.261145 2.183706
## HbA1c LDL HDL Triglycerides
## 1.680628 1.001682 1.002311 1.002100
## WaistCircumference HipCircumference WHR FamilyHistory
## 61.082728 39.378378 49.144666 6.680216
## DietType Hypertension MedicationUse Outcome
## 1.002890 1.016169 3.672722 6.851661
Variabel dengan VIF Rendah (<5):
LDL (1.0016), Pregnancies (1.0015), HDL (1.0023), Triglycerides (1.0021), DietType (1.0028), Hypertension (1.0161), HbA1c (1.6806), MedicationUse (3.6727) → Tidak ada indikasi multikolinearitas yang signifikan.
BMI (4.26), BloodPressure (2.18), Age (4.24) → Masih dalam batas aman, meskipun Age dan BMI mendekati batas moderat.
Variabel dengan VIF Moderat (>5 dan <10):
Outcome (6.85) dan FamilyHistory (6.68) → Indikasi adanya hubungan dengan variabel lain, tetapi belum terlalu tinggi.
Variabel dengan VIF Tinggi (>10, perlu ditangani):
WaistCircumference (61.08) → Sangat tinggi, menunjukkan korelasi kuat dengan variabel lain.
HipCircumference (39.37) → Juga sangat tinggi, kemungkinan berkorelasi dengan WaistCircumference dan BMI.
WHR (49.14) → Juga sangat tinggi, bisa jadi memiliki hubungan linear yang kuat dengan WaistCircumference dan HipCircumference.