library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the diabetes dataset from a comma-separated file, then preview the first
# six rows.
# NOTE(review): the path is absolute and machine-specific.
df <- read.csv(
  file = "C:/Users/LENOVO/Downloads/diabetes_dataset.csv",
  sep = ","
)
head(df, n = 6L)
##   Age Pregnancies   BMI Glucose BloodPressure HbA1c   LDL  HDL Triglycerides
## 1  69           5 28.39   130.1            77   5.4 130.4 44.0          50.0
## 2  32           1 26.49   116.5            72   4.5  87.4 54.2         129.9
## 3  89          13 25.34   101.0            82   4.9 112.5 56.8         177.6
## 4  78          13 29.91   146.0           104   5.7  50.7 39.1         117.0
## 5  38           8 24.56   103.2            74   4.7 102.5 29.1         145.9
## 6  41          10 17.47    67.0            71   4.2 105.3 58.8         140.7
##   WaistCircumference HipCircumference  WHR FamilyHistory DietType Hypertension
## 1               90.5            107.9 0.84             0        0            0
## 2              113.3             81.4 1.39             0        0            0
## 3               84.7            107.2 0.79             0        0            0
## 4              108.9            110.0 0.99             0        0            0
## 5               84.1             92.8 0.91             0        1            0
## 6               81.8             93.2 0.88             1        0            0
##   MedicationUse Outcome
## 1             1       0
## 2             0       0
## 3             1       0
## 4             1       1
## 5             0       0
## 6             0       1
# Convert every column of df to numeric (double) storage.
df <- mutate(df, across(.cols = everything(), .fns = as.numeric))

EKSPLORASI DATA

Menampilkan Observasi dan Tipe Data

# Print the structure of df: number of observations and variables, plus each
# column's storage type and first few values.
str(df)
## 'data.frame':    9538 obs. of  17 variables:
##  $ Age               : num  69 32 89 78 38 41 20 39 70 19 ...
##  $ Pregnancies       : num  5 1 13 13 8 10 16 4 3 1 ...
##  $ BMI               : num  28.4 26.5 25.3 29.9 24.6 ...
##  $ Glucose           : num  130 116 101 146 103 ...
##  $ BloodPressure     : num  77 72 82 104 74 71 60 94 90 62 ...
##  $ HbA1c             : num  5.4 4.5 4.9 5.7 4.7 4.2 4 4.5 4 4 ...
##  $ LDL               : num  130.4 87.4 112.5 50.7 102.5 ...
##  $ HDL               : num  44 54.2 56.8 39.1 29.1 58.8 43.4 50.1 51.3 64.3 ...
##  $ Triglycerides     : num  50 130 178 117 146 ...
##  $ WaistCircumference: num  90.5 113.3 84.7 108.9 84.1 ...
##  $ HipCircumference  : num  107.9 81.4 107.2 110 92.8 ...
##  $ WHR               : num  0.84 1.39 0.79 0.99 0.91 0.88 0.65 1.01 0.75 0.76 ...
##  $ FamilyHistory     : num  0 0 0 0 0 1 0 1 0 1 ...
##  $ DietType          : num  0 0 0 0 1 0 1 0 1 0 ...
##  $ Hypertension      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ MedicationUse     : num  1 0 1 1 0 0 0 0 1 0 ...
##  $ Outcome           : num  0 0 0 1 0 1 0 1 0 1 ...

Menampilkan List Data dan Ringkasan Statistik

# Print per-column summary statistics (min, quartiles, median, mean, max).
summary(df)
##       Age         Pregnancies          BMI           Glucose     
##  Min.   :18.00   Min.   : 0.000   Min.   :15.00   Min.   : 50.0  
##  1st Qu.:36.00   1st Qu.: 4.000   1st Qu.:22.87   1st Qu.: 91.0  
##  Median :53.00   Median : 8.000   Median :27.05   Median :106.0  
##  Mean   :53.58   Mean   : 7.986   Mean   :27.05   Mean   :106.1  
##  3rd Qu.:72.00   3rd Qu.:12.000   3rd Qu.:31.18   3rd Qu.:121.0  
##  Max.   :89.00   Max.   :16.000   Max.   :49.66   Max.   :207.2  
##  BloodPressure        HbA1c            LDL             HDL        
##  Min.   : 60.00   Min.   :4.000   Min.   :-12.0   Min.   : -9.20  
##  1st Qu.: 74.00   1st Qu.:4.300   1st Qu.: 80.1   1st Qu.: 39.70  
##  Median : 84.00   Median :4.600   Median : 99.9   Median : 50.20  
##  Mean   : 84.48   Mean   :4.651   Mean   :100.1   Mean   : 49.95  
##  3rd Qu.: 94.00   3rd Qu.:5.000   3rd Qu.:120.2   3rd Qu.: 60.20  
##  Max.   :138.00   Max.   :6.900   Max.   :202.2   Max.   :107.80  
##  Triglycerides   WaistCircumference HipCircumference      WHR        
##  Min.   : 50.0   Min.   : 40.30     Min.   : 54.8    Min.   :0.4200  
##  1st Qu.:117.2   1st Qu.: 83.40     1st Qu.: 94.0    1st Qu.:0.8200  
##  Median :150.6   Median : 93.80     Median :103.2    Median :0.9100  
##  Mean   :151.1   Mean   : 93.95     Mean   :103.1    Mean   :0.9174  
##  3rd Qu.:185.1   3rd Qu.:104.60     3rd Qu.:112.1    3rd Qu.:1.0100  
##  Max.   :345.8   Max.   :163.00     Max.   :156.6    Max.   :1.4900  
##  FamilyHistory       DietType       Hypertension      MedicationUse  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :0.000000   Median :0.000  
##  Mean   :0.3025   Mean   :0.4862   Mean   :0.001048   Mean   :0.405  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.000000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :2.0000   Max.   :1.000000   Max.   :1.000  
##     Outcome      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3441  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

Mengecek nilai yang hilang

# Count the missing (NA) entries in each column.
df %>%
  is.na() %>%
  colSums()
##                Age        Pregnancies                BMI            Glucose 
##                  0                  0                  0                  0 
##      BloodPressure              HbA1c                LDL                HDL 
##                  0                  0                  0                  0 
##      Triglycerides WaistCircumference   HipCircumference                WHR 
##                  0                  0                  0                  0 
##      FamilyHistory           DietType       Hypertension      MedicationUse 
##                  0                  0                  0                  0 
##            Outcome 
##                  0

Interpretasi : Tidak ada Missing Value pada data frame diabetes

Histogram

# Draw one histogram per column on a 3x3 grid of panels.
# par() returns the settings it replaces, so they are captured here and restored
# afterwards instead of assuming the device started with a 1x1 layout.
old_par <- par(mfrow = c(3, 3))
for (col in colnames(df)) {
  hist(
    df[[col]],
    main = paste("Distribusi", col),
    xlab = col,
    col = "skyblue",
    border = "black"
  )
}

# Restore the graphics layout that was active before this chunk.
par(old_par)

Interpretasi : Pada hasil visualisasi data diabetes di atas, hanya terdapat tiga histogram yang tampak berdistribusi mendekati normal, yaitu pada variabel WHR, WaistCircumference, dan HipCircumference.

Boxplot

library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
# Reshape the data to long format (one row per variable/value pair) so that
# ggplot2 can draw one box per variable.
df_long <- pivot_longer(
  df,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)

# Draw every variable as a box on one shared y axis.
ggplot(data = df_long, mapping = aes(x = Variable, y = Value)) +
  geom_boxplot(color = "black", fill = "lightblue") +
  labs(title = "Boxplot untuk Semua Variabel", x = "Variabel", y = "Nilai") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

Membuat boxplot untuk setiap variabel

# Reshape the data to long format (one row per variable/value pair) so that
# ggplot2 can draw one box per variable.
df_long <- df %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

# Draw one panel per variable, each with its own axis scales.
# On a single shared y axis the wide-range variables (e.g. Triglycerides, up to
# ~346) flatten the small-range ones (e.g. WHR, HbA1c) into a line, which makes
# continuous variables look like 0/1 indicators. Free scales keep every box
# readable.
ggplot(df_long, aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  facet_wrap(~ Variable, scales = "free") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Boxplot untuk Semua Variabel", x = "Variabel", y = "Nilai")

Interpretasi : Beberapa variabel memiliki banyak Outlier diantaranya :

  • Glucose: Memiliki beberapa outlier dengan nilai tinggi, menunjukkan adanya individu dengan kadar glukosa yang jauh di atas normal.

  • Triglycerides: Memiliki distribusi yang sangat lebar dengan banyak outlier pada nilai tinggi, menunjukkan bahwa beberapa individu memiliki kadar trigliserida yang jauh di atas normal.

  • LDL dan HDL juga menunjukkan banyak outlier, yang mengindikasikan variasi besar dalam kadar kolesterol.

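    Kemudian, variabel biner/kategorikal seperti DietType, FamilyHistory, Hypertension, MedicationUse, dan Outcome hanya bernilai tertentu (0 dan 1, atau 0–2 untuk DietType) tanpa distribusi yang luas seperti variabel lainnya. HbA1c (4,0–6,9) dan WHR (0,42–1,49) sebenarnya variabel kontinu; keduanya tampak sempit pada boxplot hanya karena rentang nilainya kecil dibandingkan skala sumbu y bersama.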

Korelasi

# Load library
library(ggplot2)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Compute the pairwise Pearson correlation matrix over all columns, using only
# rows that have no missing values.
cor_matrix <- cor(x = df, use = "complete.obs", method = "pearson")

# Print the full correlation matrix.
cor_matrix %>% print()
##                             Age  Pregnancies          BMI      Glucose
## Age                 1.000000000  0.010713797  0.021793701  0.487807610
## Pregnancies         0.010713797  1.000000000 -0.013992517  0.007125949
## BMI                 0.021793701 -0.013992517  1.000000000  0.556682648
## Glucose             0.487807610  0.007125949  0.556682648  1.000000000
## BloodPressure       0.429334859 -0.008272473  0.605405105  0.541986672
## HbA1c               0.396354303  0.009060483  0.459851892  0.817493001
## LDL                -0.003870920  0.006716723  0.019921205  0.019576394
## HDL                -0.009565534 -0.000258451 -0.018187928 -0.020586805
## Triglycerides       0.010470254  0.005732030 -0.014391717 -0.008764540
## WaistCircumference  0.025591129 -0.018594613  0.761733881  0.428944446
## HipCircumference    0.020943913 -0.006865529  0.662458040  0.372785993
## WHR                 0.009933264 -0.013981709  0.252677132  0.142553677
## FamilyHistory      -0.012887442 -0.017076600  0.008037471 -0.005163065
## DietType           -0.002629394 -0.002276881 -0.017385961 -0.017349827
## Hypertension        0.038417396 -0.011729783  0.073730882  0.072805940
## MedicationUse       0.852709405  0.001795024  0.016819379  0.415119221
## Outcome             0.068452792 -0.018188538  0.107247654  0.168324558
##                    BloodPressure        HbA1c           LDL          HDL
## Age                 0.4293348593  0.396354303 -0.0038709198 -0.009565534
## Pregnancies        -0.0082724734  0.009060483  0.0067167233 -0.000258451
## BMI                 0.6054051045  0.459851892  0.0199212051 -0.018187928
## Glucose             0.5419866723  0.817493001  0.0195763944 -0.020586805
## BloodPressure       1.0000000000  0.444979960  0.0002646506 -0.011031264
## HbA1c               0.4449799604  1.000000000  0.0119746973 -0.023278377
## LDL                 0.0002646506  0.011974697  1.0000000000 -0.015699818
## HDL                -0.0110312636 -0.023278377 -0.0156998176  1.000000000
## Triglycerides      -0.0147421155 -0.016008677  0.0108642343 -0.013874825
## WaistCircumference  0.4601783963  0.352626317  0.0173209038 -0.016538369
## HipCircumference    0.4006259305  0.308792051  0.0184306343 -0.008864187
## WHR                 0.1523048832  0.114623921  0.0042228481 -0.008544794
## FamilyHistory       0.0013959510 -0.007980574  0.0182118824 -0.005171346
## DietType           -0.0087386224 -0.016616527 -0.0111977652 -0.007914133
## Hypertension        0.1035118057  0.071362645 -0.0028415250 -0.010167466
## MedicationUse       0.3690455905  0.339439873 -0.0014819496 -0.023658835
## Outcome             0.0963609752  0.151505584  0.0194973562 -0.008837176
##                    Triglycerides WaistCircumference HipCircumference
## Age                  0.010470254        0.025591129      0.020943913
## Pregnancies          0.005732030       -0.018594613     -0.006865529
## BMI                 -0.014391717        0.761733881      0.662458040
## Glucose             -0.008764540        0.428944446      0.372785993
## BloodPressure       -0.014742115        0.460178396      0.400625931
## HbA1c               -0.016008677        0.352626317      0.308792051
## LDL                  0.010864234        0.017320904      0.018430634
## HDL                 -0.013874825       -0.016538369     -0.008864187
## Triglycerides        1.000000000       -0.005236727     -0.002493902
## WaistCircumference  -0.005236727        1.000000000      0.512327970
## HipCircumference    -0.002493902        0.512327970      1.000000000
## WHR                 -0.003193723        0.644491903     -0.314888464
## FamilyHistory       -0.004994390        0.011752699      0.020878803
## DietType             0.020502899       -0.031460433     -0.012527172
## Hypertension         0.001232519        0.036727585      0.041223122
## MedicationUse        0.017657899        0.024709289      0.020334338
## Outcome             -0.008487375        0.087492368      0.082797235
##                             WHR FamilyHistory     DietType  Hypertension
## Age                 0.009933264 -0.0128874418 -0.002629394  0.0384173960
## Pregnancies        -0.013981709 -0.0170766001 -0.002276881 -0.0117297831
## BMI                 0.252677132  0.0080374713 -0.017385961  0.0737308825
## Glucose             0.142553677 -0.0051630653 -0.017349827  0.0728059398
## BloodPressure       0.152304883  0.0013959510 -0.008738622  0.1035118057
## HbA1c               0.114623921 -0.0079805735 -0.016616527  0.0713626452
## LDL                 0.004222848  0.0182118824 -0.011197765 -0.0028415250
## HDL                -0.008544794 -0.0051713462 -0.007914133 -0.0101674664
## Triglycerides      -0.003193723 -0.0049943895  0.020502899  0.0012325192
## WaistCircumference  0.644491903  0.0117526990 -0.031460433  0.0367275849
## HipCircumference   -0.314888464  0.0208788028 -0.012527172  0.0412231220
## WHR                 1.000000000 -0.0041796716 -0.025320045  0.0047391736
## FamilyHistory      -0.004179672  1.0000000000 -0.015389473 -0.0001745137
## DietType           -0.025320045 -0.0153894725  1.000000000  0.0251800300
## Hypertension        0.004739174 -0.0001745137  0.025180030  1.0000000000
## MedicationUse       0.009986125 -0.0076526985 -0.002919979  0.0392662751
## Outcome             0.021726631  0.9091660892 -0.015549044  0.0310893008
##                    MedicationUse      Outcome
## Age                  0.852709405  0.068452792
## Pregnancies          0.001795024 -0.018188538
## BMI                  0.016819379  0.107247654
## Glucose              0.415119221  0.168324558
## BloodPressure        0.369045590  0.096360975
## HbA1c                0.339439873  0.151505584
## LDL                 -0.001481950  0.019497356
## HDL                 -0.023658835 -0.008837176
## Triglycerides        0.017657899 -0.008487375
## WaistCircumference   0.024709289  0.087492368
## HipCircumference     0.020334338  0.082797235
## WHR                  0.009986125  0.021726631
## FamilyHistory       -0.007652698  0.909166089
## DietType            -0.002919979 -0.015549044
## Hypertension         0.039266275  0.031089301
## MedicationUse        1.000000000  0.064626215
## Outcome              0.064626215  1.000000000
# Correlation heatmap with corrplot: coloured cells, lower triangle only,
# blue-white-red palette with 200 steps.
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  tl.cex = 0.8,
  tl.col = "black",
  col = colorRampPalette(c("blue", "white", "red"))(200)
)

# Alternative heatmap with ggplot2, without reshape2: as.table() followed by
# as.data.frame() turns the matrix into long format with columns Var1, Var2 and
# Freq (the correlation value).
cor_long <- as.data.frame(as.table(cor_matrix))

ggplot(cor_long, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  # `limits` is spelled out in full; the abbreviated `limit` only worked through
  # partial argument matching.
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    limits = c(-1, 1),
    space = "Lab"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Heatmap Korelasi", x = "", y = "")

Berikut adalah beberapa interpretasi yang dapat diberikan :

  • Glukosa dan HbA1c adalah variabel penting yang dapat digunakan untuk analisis lebih lanjut terkait penyakit metabolik.

  • BMI dan lingkar pinggang/pinggul memiliki korelasi tinggi, sehingga dapat digunakan dalam analisis obesitas dan risikonya terhadap kesehatan.

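  • Outcome berkorelasi paling kuat dengan FamilyHistory (r ≈ 0,91). Korelasinya dengan glukosa (r ≈ 0,17), HbA1c (r ≈ 0,15), dan BMI (r ≈ 0,11) tergolong lemah, sedangkan dengan hipertensi sangat lemah (r ≈ 0,03).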

  • Perlu dilakukan analisis regresi atau machine learning untuk memahami pengaruh variabel-variabel ini secara lebih mendalam terhadap Outcome.

Melihat Outlier Tiap Kolom

# Flag outliers per column with the 1.5 * IQR rule and count them.
#
# Quartiles are computed column by column with vapply(); apply() would first
# coerce the whole data frame to a matrix.
Q1 <- vapply(df, quantile, numeric(1), probs = 0.25, na.rm = TRUE, names = FALSE)
Q3 <- vapply(df, quantile, numeric(1), probs = 0.75, na.rm = TRUE, names = FALSE)
IQR <- Q3 - Q1

# Compare every column against ITS OWN bounds. Writing `df < bounds` directly
# recycles the bounds vector down the rows (element by element in column-major
# order), so most cells end up compared against another column's bound.
# sweep(..., MARGIN = 2) lines up bound j with column j.
# For columns where Q1 == Q3 (e.g. a rare 0/1 indicator) the IQR is 0, so every
# value different from the quartile is flagged.
# NOTE(review): any previously recorded counts for this chunk came from the
# recycled comparison and must be regenerated.
outliers <- sweep(as.matrix(df), 2, Q1 - 1.5 * IQR, "<") |
  sweep(as.matrix(df), 2, Q3 + 1.5 * IQR, ">")
colSums(outliers)
##                Age        Pregnancies                BMI            Glucose 
##               5857               7373               6636               5791 
##      BloodPressure              HbA1c                LDL                HDL 
##               5327               7856               5868               6082 
##      Triglycerides WaistCircumference   HipCircumference                WHR 
##               7391               5396               5549               5613 
##      FamilyHistory           DietType       Hypertension      MedicationUse 
##               5585               5673               5611               5621 
##            Outcome 
##               5638

Interpretasi :

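Semua variabel tampak memiliki outlier dalam jumlah besar, ditunjukkan oleh jumlah yang cukup tinggi di setiap kolom pada keluaran di atas. Catatan: angka-angka tersebut tidak wajar — variabel biner seperti Outcome atau MedicationUse tidak mungkin memiliki lebih dari 5000 outlier dari 9538 observasi. Angka itu dihasilkan oleh perbandingan data frame dengan vektor batas yang didaur ulang ke bawah baris, bukan dicocokkan per kolom, sehingga jumlah outlier perlu dihitung ulang dengan batas masing-masing kolom sebelum diinterpretasikan.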

  • Variabel dengan jumlah outlier tinggi:

    • Pregnancies (7373 outlier)

    • Triglycerides (7391 outlier)

    • BMI (6636 outlier)

    • HbA1c (7856 outlier, tertinggi)

  • Variabel dengan jumlah outlier lebih rendah tetapi tetap signifikan:

    • BloodPressure (5327 outlier)

    • HipCircumference (5549 outlier)

    • MedicationUse (5621 outlier)

Distribusi Data dengan Density Plot

# Density plot of the WHR column.
# The x-axis label now names the plotted variable (it previously read "Glucose"
# while the data shown is WHR).
ggplot(df, aes(x = WHR)) +
  geom_density(fill = "Pink", alpha = 0.5) +
  labs(title = "Density Plot WHR", x = "WHR", y = "Density") +
  theme_minimal()

Interpretasi : Kesimpulan

  • Distribusi WHR dalam dataset cenderung normal dengan puncak di sekitar 0.9.

  • Tidak terdapat outlier ekstrem, tetapi ada sedikit skewness ke kanan.

  • Mayoritas individu memiliki WHR antara 0.8 dan 1.1, yang dapat digunakan untuk analisis lebih lanjut terkait risiko kesehatan.

Pair Plot (Scatterplot Matrix)

library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pair plot coloured by Outcome.
# The integer-coded categorical columns are converted to factors on a plotting
# copy (df itself is left untouched) so ggpairs() draws them as discrete panels
# instead of computing per-group correlations on them. Left numeric, Outcome is
# constant inside each colour group, and so is any indicator that never varies
# within a group; those cases are what make cor() warn "the standard deviation
# is zero".
pair_data <- df %>%
  mutate(across(
    all_of(c(
      "FamilyHistory", "DietType", "Hypertension", "MedicationUse", "Outcome"
    )),
    factor
  ))
ggpairs(pair_data, mapping = aes(color = Outcome))
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero

Uji Multikolinearitas (Variance Inflation Factor - VIF)

library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Fit a linear model of Glucose on all other columns, then report the variance
# inflation factor of each predictor.
glucose_model <- lm(Glucose ~ ., data = df)
vif(glucose_model)
##                Age        Pregnancies                BMI      BloodPressure 
##           4.245725           1.001493           4.261145           2.183706 
##              HbA1c                LDL                HDL      Triglycerides 
##           1.680628           1.001682           1.002311           1.002100 
## WaistCircumference   HipCircumference                WHR      FamilyHistory 
##          61.082728          39.378378          49.144666           6.680216 
##           DietType       Hypertension      MedicationUse            Outcome 
##           1.002890           1.016169           3.672722           6.851661

Analisis Hasil

  • Variabel dengan VIF Rendah (<5):

    LDL (1.0017), Pregnancies (1.0015), HDL (1.0023), Triglycerides (1.0021), DietType (1.0029), Hypertension (1.0162), HbA1c (1.6806), MedicationUse (3.6727) → Tidak ada indikasi multikolinearitas yang signifikan.

    BMI (4.26), BloodPressure (2.18), Age (4.24) → Masih dalam batas aman, meskipun Age dan BMI mendekati batas moderat.

  • Variabel dengan VIF Moderat (>5 dan <10):

    FamilyHistory (6.68) dan Outcome (6.85) → Indikasi adanya hubungan dengan variabel lain (keduanya saling berkorelasi kuat, r ≈ 0,91), tetapi belum terlalu tinggi.

  • Variabel dengan VIF Tinggi (>10, perlu ditangani):

    WaistCircumference (61.08) → Sangat tinggi, menunjukkan korelasi kuat dengan variabel lain.

    HipCircumference (39.37) → Juga sangat tinggi, kemungkinan berkorelasi dengan WaistCircumference dan BMI.

    WHR (49.14) → Juga sangat tinggi, bisa jadi memiliki hubungan linear yang kuat dengan WaistCircumference dan HipCircumference.