library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the semicolon-delimited dataset into `df` and preview its first rows.
# NOTE(review): the path is absolute and specific to one machine; the script
# will not run elsewhere unless the file is placed at the same location.
df <- read.csv(
  file = "C:/Users/LENOVO/Downloads/DATA AED.csv",
  sep = ";"
)
head(df, n = 6L)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
# Coerce every column to double so that all later numeric summaries, plots and
# tests operate on a uniformly numeric data frame.
df <- mutate(df, across(everything(), as.numeric))

EKSPLORASI DATA

Menampilkan Observasi dan Tipe Data

# Print the number of observations and the type and first values of each column.
str(df)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num  1 0 1 0 1 0 1 0 1 1 ...

Menampilkan List Data dan Ringkasan Statistik

# Print min, quartiles, mean and max for every column.
summary(df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

Mengecek nilai yang hilang

# Count explicit NA values per column.
colSums(is.na(df))

# The summary statistics show a minimum of 0 for the columns below. A zero in
# these measurements is presumably a placeholder for a missing value rather
# than a real reading (TODO: confirm with the data source), so the NA count
# alone can understate how much data is missing. Report the zero counts too.
zero_coded_cols <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
colSums(df[zero_coded_cols] == 0, na.rm = TRUE)
##              Pregnancies                  Glucose            BloodPressure 
##                        0                        0                        0 
##            SkinThickness                  Insulin                      BMI 
##                        0                        0                        0 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

Histogram

# Draw one histogram per column on a 3 x 3 grid of base-graphics panels.
# par() returns the settings it replaces, so the previous layout is restored
# exactly afterwards instead of assuming it was mfrow = c(1, 1).
old_par <- par(mfrow = c(3, 3))
for (column_name in colnames(df)) {
  hist(
    df[[column_name]],
    main = paste("Distribusi", column_name),
    xlab = column_name,
    col = "skyblue",
    border = "black"
  )
}

par(old_par)

Boxplot

library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
# Reshape to long format (one row per variable/value pair) so that ggplot2 can
# draw one box per variable.
df_long <- pivot_longer(
  df,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)

# One boxplot per variable, all sharing a single y axis.
ggplot(data = df_long, mapping = aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  labs(title = "Boxplot untuk Semua Variabel", x = "Variabel", y = "Nilai") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Korelasi

# Load library
library(ggplot2)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Pairwise Pearson correlations between all columns, using only rows that are
# complete across every column.
cor_matrix <- cor(x = df, use = "complete.obs", method = "pearson")

# Show the full correlation matrix.
print(cor_matrix)
##                          Pregnancies    Glucose BloodPressure SkinThickness
## Pregnancies               1.00000000 0.12945867    0.14128198   -0.08167177
## Glucose                   0.12945867 1.00000000    0.15258959    0.05732789
## BloodPressure             0.14128198 0.15258959    1.00000000    0.20737054
## SkinThickness            -0.08167177 0.05732789    0.20737054    1.00000000
## Insulin                  -0.07353461 0.33135711    0.08893338    0.43678257
## BMI                       0.01768309 0.22107107    0.28180529    0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730    0.04126495    0.18392757
## Age                       0.54434123 0.26351432    0.23952795   -0.11397026
## Outcome                   0.22189815 0.46658140    0.06506836    0.07475223
##                              Insulin        BMI DiabetesPedigreeFunction
## Pregnancies              -0.07353461 0.01768309              -0.03352267
## Glucose                   0.33135711 0.22107107               0.13733730
## BloodPressure             0.08893338 0.28180529               0.04126495
## SkinThickness             0.43678257 0.39257320               0.18392757
## Insulin                   1.00000000 0.19785906               0.18507093
## BMI                       0.19785906 1.00000000               0.14064695
## DiabetesPedigreeFunction  0.18507093 0.14064695               1.00000000
## Age                      -0.04216295 0.03624187               0.03356131
## Outcome                   0.13054795 0.29269466               0.17384407
##                                  Age    Outcome
## Pregnancies               0.54434123 0.22189815
## Glucose                   0.26351432 0.46658140
## BloodPressure             0.23952795 0.06506836
## SkinThickness            -0.11397026 0.07475223
## Insulin                  -0.04216295 0.13054795
## BMI                       0.03624187 0.29269466
## DiabetesPedigreeFunction  0.03356131 0.17384407
## Age                       1.00000000 0.23835598
## Outcome                   0.23835598 1.00000000
# Correlation heatmap with corrplot: lower triangle only, blue-white-red ramp.
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  tl.cex = 0.8,
  tl.col = "black",
  col = colorRampPalette(c("blue", "white", "red"))(200)
)

# Alternative heatmap with ggplot2, without reshape2: as.table() followed by
# as.data.frame() gives one row per variable pair with columns Var1, Var2, Freq.
cor_long <- as.data.frame(as.table(cor_matrix))

# `limits` is spelled out in full; the previous `limit =` only worked through
# partial argument matching.
ggplot(cor_long, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Heatmap Korelasi", x = "", y = "")

Melihat Outlier Tiap Kolom

# Flag outliers in every column with the 1.5 * IQR rule and count them.
#
# Quartiles are computed column by column with vapply(), which avoids the
# data-frame-to-matrix coercion that apply() performs.
Q1 <- vapply(df, quantile, numeric(1), probs = 0.25, na.rm = TRUE, names = FALSE)
Q3 <- vapply(df, quantile, numeric(1), probs = 0.75, na.rm = TRUE, names = FALSE)
# Variable name kept from the original script; stats::IQR() is still callable.
IQR <- Q3 - Q1

lower_fence <- Q1 - 1.5 * IQR
upper_fence <- Q3 + 1.5 * IQR

# Comparing a data frame with a plain vector recycles the vector element by
# element down the rows of each column, so `df < (Q1 - 1.5 * IQR)` tested each
# cell against the fence of an unrelated column. That is why the earlier counts
# were in the hundreds, including 255 for the binary Outcome column.
# sweep() along margin 2 applies each column's own fence to that column.
values <- as.matrix(df)
outliers <- sweep(values, 2, lower_fence, "<") | sweep(values, 2, upper_fence, ">")
colSums(outliers, na.rm = TRUE)
##              Pregnancies                  Glucose            BloodPressure 
##                      360                      561                      417 
##            SkinThickness                  Insulin                      BMI 
##                      374                      390                      378 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                      259                      375                      255

Distribusi Data dengan Density Plot

# Kernel density estimate of the Glucose column.
ggplot(data = df, mapping = aes(x = Glucose)) +
  geom_density(fill = "Pink", alpha = 0.5) +
  theme_minimal() +
  labs(title = "Density Plot Glukosa", x = "Glucose", y = "Density")

Pair Plot (Scatterplot Matrix)

library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Scatterplot matrix of the predictors, coloured by Outcome.
# Outcome is used only as the colour grouping and is left out of the plotted
# columns: within each colour group Outcome is constant, so the per-group
# correlations involving it cannot be computed, which is what produced the
# repeated "the standard deviation is zero" warnings.
predictor_cols <- which(names(df) != "Outcome")
ggpairs(df, mapping = aes(color = factor(Outcome)), columns = predictor_cols)
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero

Uji Normalitas Data

# Shapiro-Wilk normality test on Glucose; a small p-value is evidence against
# the column being normally distributed.
shapiro.test(df$Glucose) 
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Glucose
## W = 0.9701, p-value = 1.986e-11
# Shapiro-Wilk normality test on Pregnancies.
shapiro.test(df$Pregnancies)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Pregnancies
## W = 0.90428, p-value < 2.2e-16
# Shapiro-Wilk normality test on BloodPressure.
shapiro.test(df$BloodPressure)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$BloodPressure
## W = 0.81892, p-value < 2.2e-16

Uji Multikolinearitas (Variance Inflation Factor - VIF)

library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Variance inflation factors for a linear model with Glucose as the response
# and every other column, including Outcome, as a predictor.
# NOTE(review): VIF describes collinearity among the predictors only, so this
# call reports nothing for Glucose itself and treats Outcome as a predictor.
# If the goal is to screen the predictors of Outcome, lm(Outcome ~ ., data = df)
# would cover all of them in one call - confirm the intent.
vif(lm(Glucose ~ ., data=df))
##              Pregnancies            BloodPressure            SkinThickness 
##                 1.457579                 1.186929                 1.475857 
##                  Insulin                      BMI DiabetesPedigreeFunction 
##                 1.269406                 1.357473                 1.081635 
##                      Age                  Outcome 
##                 1.546036                 1.222493
# Variance inflation factors for a linear model with Pregnancies as the
# response and every other column, including Outcome, as a predictor.
# NOTE(review): as with any VIF call, the response column gets no VIF of its
# own; confirm whether Outcome was meant to be the response instead.
vif(lm(Pregnancies ~ ., data=df))
##                  Glucose            BloodPressure            SkinThickness 
##                 1.521189                 1.193570                 1.507464 
##                  Insulin                      BMI DiabetesPedigreeFunction 
##                 1.427802                 1.365409                 1.076964 
##                      Age                  Outcome 
##                 1.201638                 1.405407