library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the semicolon-separated data file and preview its first six rows.
df <- read.csv(
  file = "C:/Users/LENOVO/Downloads/DATA AED.csv",
  sep = ";"
)
head(df, n = 6L)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# Convert every column to numeric, then show the resulting structure.
df <- mutate(df, across(.cols = everything(), .fns = as.numeric))
str(df)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : num 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num 1 0 1 0 1 0 1 0 1 1 ...
# Descriptive statistics (min, quartiles, mean, max) for each column.
df %>% summary()
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Number of missing values in each column.
vapply(df, function(column) sum(is.na(column)), numeric(1))
## Pregnancies Glucose BloodPressure
## 0 0 0
## SkinThickness Insulin BMI
## 0 0 0
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
# Draw one histogram per column on a 3 x 3 grid.
# par() returns the previous settings, so they are saved and restored exactly
# afterwards instead of assuming the layout was c(1, 1) before this block.
old_par <- par(mfrow = c(3, 3))
for (var_name in colnames(df)) {
  hist(
    df[[var_name]],
    main = paste("Distribusi", var_name),
    xlab = var_name,
    col = "skyblue",
    border = "black"
  )
}
par(old_par)
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
# Reshape the data to long format (one row per variable/value pair) so that
# ggplot2 can draw one box per variable.
df_long <- pivot_longer(
  df,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value"
)
# Box plot of every variable on a shared value axis.
ggplot(data = df_long, mapping = aes(x = Variable, y = Value)) +
  geom_boxplot(color = "black", fill = "lightblue") +
  labs(title = "Boxplot untuk Semua Variabel", x = "Variabel", y = "Nilai") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Load library
library(ggplot2)
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
# Compute the correlation matrix over all columns, using only rows that are
# complete in every column.
cor_matrix <- df %>% cor(use = "complete.obs")
# Show the correlation matrix.
print(cor_matrix)
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177
## Glucose 0.12945867 1.00000000 0.15258959 0.05732789
## BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054
## SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000
## Insulin -0.07353461 0.33135711 0.08893338 0.43678257
## BMI 0.01768309 0.22107107 0.28180529 0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495 0.18392757
## Age 0.54434123 0.26351432 0.23952795 -0.11397026
## Outcome 0.22189815 0.46658140 0.06506836 0.07475223
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.07353461 0.01768309 -0.03352267
## Glucose 0.33135711 0.22107107 0.13733730
## BloodPressure 0.08893338 0.28180529 0.04126495
## SkinThickness 0.43678257 0.39257320 0.18392757
## Insulin 1.00000000 0.19785906 0.18507093
## BMI 0.19785906 1.00000000 0.14064695
## DiabetesPedigreeFunction 0.18507093 0.14064695 1.00000000
## Age -0.04216295 0.03624187 0.03356131
## Outcome 0.13054795 0.29269466 0.17384407
## Age Outcome
## Pregnancies 0.54434123 0.22189815
## Glucose 0.26351432 0.46658140
## BloodPressure 0.23952795 0.06506836
## SkinThickness -0.11397026 0.07475223
## Insulin -0.04216295 0.13054795
## BMI 0.03624187 0.29269466
## DiabetesPedigreeFunction 0.03356131 0.17384407
## Age 1.00000000 0.23835598
## Outcome 0.23835598 1.00000000
# Correlation heatmap with corrplot: lower triangle only, coloured cells,
# on a 200-step blue-white-red palette.
heat_palette <- colorRampPalette(c("blue", "white", "red"))
corrplot(
  cor_matrix,
  method = "color",
  type = "lower",
  col = heat_palette(200),
  tl.col = "black",
  tl.cex = 0.8
)
# Alternative heatmap using ggplot2 only (no reshape2):
# as.table() + as.data.frame() turn the matrix into long form with the
# columns Var1, Var2 and Freq (the correlation value).
cor_long <- as.data.frame(as.table(cor_matrix))
ggplot(cor_long, aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  # `limits` is spelled out in full; the previous `limit =` only worked
  # through partial argument matching.
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1), space = "Lab"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Heatmap Korelasi", x = "", y = "")
# Flag outliers in every column with the 1.5 * IQR rule.
#
# Quartiles are taken column by column with vapply(), which keeps the column
# names and avoids apply() coercing the data frame to a matrix first.
Q1 <- vapply(df, quantile, numeric(1), probs = 0.25, na.rm = TRUE)
Q3 <- vapply(df, quantile, numeric(1), probs = 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_fence <- Q1 - 1.5 * IQR
upper_fence <- Q3 + 1.5 * IQR

# Comparing a data frame with a plain vector recycles the vector down the
# rows, so `df < lower_fence` would NOT test column j against fence j.
# sweep() with MARGIN = 2 lines each fence up with its own column.
df_matrix <- as.matrix(df)
outliers <- sweep(df_matrix, 2, lower_fence, "<") |
  sweep(df_matrix, 2, upper_fence, ">")

# Number of flagged values per column.
colSums(outliers, na.rm = TRUE)
# NOTE(review): the counts previously recorded after this call (e.g. 561 for
# Glucose) came from the misaligned comparison and are no longer valid;
# re-run the script to regenerate the output.
# Kernel density estimate of the Glucose column.
ggplot(data = df, mapping = aes(x = Glucose)) +
  geom_density(alpha = 0.5, fill = "Pink") +
  theme_minimal() +
  labs(title = "Density Plot Glukosa", x = "Glucose", y = "Density")
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix of the measured variables, coloured by Outcome.
# Outcome itself is left out of the plotted columns: it is constant within
# each colour group, which appears to be what produced the repeated
# "the standard deviation is zero" warnings from cor() when it was plotted
# as a numeric variable.
ggpairs(
  df,
  columns = which(names(df) != "Outcome"),
  mapping = aes(color = factor(Outcome))
)
# Uji Normalitas Data (data normality tests) ----
# Shapiro-Wilk normality test on Glucose; a small p-value is evidence
# against the values being normally distributed.
shapiro.test(df$Glucose)
##
## Shapiro-Wilk normality test
##
## data: df$Glucose
## W = 0.9701, p-value = 1.986e-11
# Shapiro-Wilk normality test on Pregnancies; a small p-value is evidence
# against the values being normally distributed.
shapiro.test(df$Pregnancies)
##
## Shapiro-Wilk normality test
##
## data: df$Pregnancies
## W = 0.90428, p-value < 2.2e-16
# Shapiro-Wilk normality test on BloodPressure; a small p-value is evidence
# against the values being normally distributed.
shapiro.test(df$BloodPressure)
##
## Shapiro-Wilk normality test
##
## data: df$BloodPressure
## W = 0.81892, p-value < 2.2e-16
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Variance inflation factors of the predictors in a linear model that
# regresses Glucose on all other columns.
lm(Glucose ~ ., data = df) %>%
  vif()
## Pregnancies BloodPressure SkinThickness
## 1.457579 1.186929 1.475857
## Insulin BMI DiabetesPedigreeFunction
## 1.269406 1.357473 1.081635
## Age Outcome
## 1.546036 1.222493
# Variance inflation factors of the predictors in a linear model that
# regresses Pregnancies on all other columns.
lm(Pregnancies ~ ., data = df) %>%
  vif()
## Glucose BloodPressure SkinThickness
## 1.521189 1.193570 1.507464
## Insulin BMI DiabetesPedigreeFunction
## 1.427802 1.365409 1.076964
## Age Outcome
## 1.201638 1.405407