Menginstal package yang dibutuhkan untuk Visualisasi Data

Berikut adalah package yang akan digunakan, yaitu

1. ggplot2 dan ggpubr untuk memvisualisasikan data
2. dplyr untuk memanipulasi data   
3. ggthemes menyediakan tema, skala, dan geom tambahan untuk 'ggplot2'
4. RColorBrewer menyediakan skema warna untuk grafik lainnya
5. gridExtra menyediakan sejumlah fungsi untuk grid grafik

Panggil package yang dibutuhkan

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.3
## Loading required package: magrittr
library(kableExtra)
library(data.table)
## Warning: package 'data.table' was built under R version 3.6.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.6.3
library(RColorBrewer) 
library(gridExtra)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Membaca Data

# Load the clinical records. The CSV file is semicolon-separated and its first
# row holds the column names.
data <- read.csv(
  "heart_failure_clinical_records_dataset.csv",
  sep = ";",
  header = TRUE
)
head(data) # Show the first rows of the data
##   umur anaemia creatinine_phosphokinase diabetes ejection_fraction
## 1   75       0                      582        0                20
## 2   55       0                     7861        0                38
## 3   65       0                      146        0                20
## 4   50       1                      111        0                20
## 5   65       1                      160        1                20
## 6   90       1                       47        0                40
##   tekanan_darah_tinggi platelets serum_creatinine serum_sodium Jenis_kelamin
## 1                    1    265000              1.9          130             1
## 2                    0    263358              1.1          136             1
## 3                    0    162000              1.3          129             1
## 4                    0    210000              1.9          137             1
## 5                    0    327000              2.7          116             0
## 6                    1    204000              2.1          132             1
##   smoking time Peristiwa_Kematian
## 1       0    4                  1
## 2       0    6                  1
## 3       1    7                  1
## 4       0    7                  1
## 5       0    8                  1
## 6       1    8                  1
# Show the last rows of the data (six rows, the default for tail()).
tail(data, n = 6L)
##     umur anaemia creatinine_phosphokinase diabetes ejection_fraction
## 294   63       1                      103        1                35
## 295   62       0                       61        1                38
## 296   55       0                     1820        0                38
## 297   45       0                     2060        1                60
## 298   45       0                     2413        0                38
## 299   50       0                      196        0                45
##     tekanan_darah_tinggi platelets serum_creatinine serum_sodium Jenis_kelamin
## 294                    0    179000              0.9          136             1
## 295                    1    155000              1.1          143             1
## 296                    0    270000              1.2          139             0
## 297                    0    742000              0.8          138             0
## 298                    0    140000              1.4          140             1
## 299                    0    395000              1.6          136             1
##     smoking time Peristiwa_Kematian
## 294       1  270                  0
## 295       1  270                  0
## 296       0  271                  0
## 297       0  278                  0
## 298       1  280                  0
## 299       1  285                  0

Memahami Data

# Column names of the dataset, in file order. The two vectors below are paired
# with these names purely by position.
fitur <- names(data)

# One description per column (shown verbatim in the rendered table).
Deskripsi <- c(
  "Menjelaskan usia subjek dalam kumpulan data",
  "Kondisi di mana seseorang kekurangan sel darah merah",
  "Tingkat enzim CPK dalam darah",
  "Penyakit metabolisme yang menyebabkan gula darah tinggi",
  "Persentase darah yang keluar",
  "Menunjukkan apakah tekanan darah tinggi atau tidak",
  "Jumlah trombosit dalam darah",
  "Mengukur tingkat kreatinin dalam darah dan memberikan perkiraan seberapa baik ginjal menyaring",
  "Tingkat natrium dalam darah",
  "Laki-laki atau Perempuan",
  "Ya atau Tidak",
  "Waktu",
  "Meninggal atau Tidak Meninggal"
)

# One unit / measurement type per column (shown verbatim in the rendered table).
Pengukuran <- c(
  "Tahun",
  "Boolean",
  "mcg/L",
  "Boolean",
  "Persen",
  "Boolean",
  "kiloplatelets/mL",
  "mg/dL",
  "mEq/L",
  "Binary",
  "Boolean",
  "Hari",
  "Boolean"
)

# The vectors are hard-coded, so a change in the dataset's column count would
# otherwise be recycled silently and pair columns with the wrong text.
# Fail loudly instead.
stopifnot(
  length(Deskripsi) == length(fitur),
  length(Pengukuran) == length(fitur)
)

# Assemble the three vectors into a table and render it with a caption.
n <- data.frame(fitur, Deskripsi, Pengukuran, stringsAsFactors = FALSE)
n %>%
  kable(caption = "Penjelasan Data, sumber: https://doi.org/10.1186/s12911-020-1023-5") %>%
  kable_styling()
Penjelasan Data, sumber: https://doi.org/10.1186/s12911-020-1023-5
fitur Deskripsi Pengukuran
umur Menjelaskan usia subjek dalam kumpulan data Tahun
anaemia Kondisi di mana seseorang kekurangan sel darah merah Boolean
creatinine_phosphokinase Tingkat enzim CPK dalam darah mcg/L
diabetes Penyakit metabolisme yang menyebabkan gula darah tinggi Boolean
ejection_fraction Persentase darah yang keluar Persen
tekanan_darah_tinggi Menunjukkan apakah tekanan darah tinggi atau tidak Boolean
platelets Jumlah trombosit dalam darah kiloplatelets/mL
serum_creatinine Mengukur tingkat kreatinin dalam darah dan memberikan perkiraan seberapa baik ginjal menyaring mg/dL
serum_sodium Tingkat natrium dalam darah mEq/L
Jenis_kelamin Laki-laki atau Perempuan Binary
smoking Ya atau Tidak Boolean
time Waktu Hari
Peristiwa_Kematian Meninggal atau Tidak Meninggal Boolean

Mengubah menjadi faktor

# Convert the death-event indicator to a factor so it is treated as a
# categorical variable.
data[["Peristiwa_Kematian"]] <- factor(data[["Peristiwa_Kematian"]])

Melihat distribusi data

Distribusi Umur

# Panel i: histogram of age (umur) in 5-unit bins, with x-axis breaks every
# 10 from 40 to 100 and a centred italic caption.
age <- ggplot(data, aes(x = umur)) +
  geom_histogram(
    binwidth = 5,
    fill = "#5757bc",
    color = "white",
    alpha = 0.5
  ) +
  theme_fivethirtyeight() +
  labs(
    title = "Distribusi Umur",
    caption = "i. Distribusi Umur"
  ) +
  theme(plot.caption = element_text(face = "italic", hjust = 0.5)) +
  scale_x_continuous(breaks = seq(from = 40, to = 100, by = 10))

# Panel ii: the same histogram split by death event. position = "identity"
# overlays the two groups instead of stacking them, hence the transparency.
de <- ggplot(data, aes(x = umur, fill = Peristiwa_Kematian)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    color = "white",
    alpha = 0.5
  ) +
  theme_fivethirtyeight() +
  scale_fill_manual(values = c("#b3d3dd", "#2d7291")) +
  labs(caption = "ii. Distribusi Usia dengan Peristiwa Kematian") +
  theme(plot.caption = element_text(face = "italic", hjust = 0.5)) +
  scale_x_continuous(breaks = seq(from = 40, to = 100, by = 10))

# Draw both panels in a single figure.
gridExtra::grid.arrange(age, de)

Distribusi ejection_fraction

# Panel i: density of ejection_fraction. The dashed lines at 50 and 70 separate
# the regions labelled "Rendah", "Normal" and "Tinggi".
ef1 <- ggplot(data, aes(x = ejection_fraction)) +
  geom_density(fill = "#2043b7", alpha = 0.5) +
  theme_fivethirtyeight() +
  geom_vline(xintercept = 50, linetype = "dashed") +
  geom_vline(xintercept = 70, linetype = "dashed") +
  scale_x_continuous(breaks = seq(20, 80, 10)) +
  annotate("text", x = 60, y = 0.03, label = "Normal", color = "#0a4c41") +
  annotate("text", x = 78, y = 0.03, label = "Tinggi", color = "#ad652a") +
  annotate("text", x = 35, y = 0.03, label = "Rendah", color = "#082451") +
  labs(
    title = "Distribusi ejection_fraction",
    caption = "i. Distribusi ejection_fraction"
  ) +
  theme(plot.caption = element_text(hjust = 0.5, face = "italic"))

# Group means are computed once, outside aes(). A summary placed inside aes()
# is recycled to every row of the data, so each mean line and arrow would be
# drawn once per observation, all on top of each other.
ef_mean_survived <- mean(data$ejection_fraction[data$Peristiwa_Kematian == 0])
ef_mean_died <- mean(data$ejection_fraction[data$Peristiwa_Kematian == 1])

# Panel ii: density per death-event level, with a vertical line at each group
# mean and a labelled arrow pointing at it.
ef2 <- ggplot(data, aes(x = ejection_fraction, fill = Peristiwa_Kematian)) +
  geom_density(alpha = 0.5) +
  theme_fivethirtyeight() +
  scale_fill_manual(values = c("#a9d5e0", "#56c7e2")) +
  scale_x_continuous(breaks = seq(20, 80, 10)) +
  geom_vline(xintercept = ef_mean_survived, color = "#a8efe3") +
  geom_vline(xintercept = ef_mean_died, color = "#39d1b8") +
  annotate(
    "curve",
    x = 50, y = 0.05, xend = ef_mean_survived, yend = 0.04,
    arrow = arrow(length = unit(0.2, "cm")), color = "black"
  ) +
  annotate(
    "curve",
    x = 27, y = 0.05, xend = ef_mean_died, yend = 0.04,
    arrow = arrow(length = unit(0.2, "cm")), color = "black"
  ) +
  annotate("text", x = 50, y = 0.048, label = "Rata-rata kejadian tidak meninggal", size = 3) +
  annotate("text", x = 27, y = 0.052, label = "Rata-rata kejadian meninggal", size = 3) +
  geom_vline(xintercept = 50, linetype = "dashed") +
  geom_vline(xintercept = 70, linetype = "dashed") +
  theme(plot.caption = element_text(hjust = 0.5, face = "italic")) +
  labs(caption = "ii. Distribusi ejection_fraction dengan peristiwa kematian")

# Draw both panels in a single figure.
gridExtra::grid.arrange(ef1, ef2)

Distribusi serum_creatinine

# Panel i: density of serum_creatinine. The dashed lines at 0.84 and 1.4
# bracket the region labelled "Normal"; the text at x = 3 marks the region
# labelled as possible kidney damage.
sc1 <- ggplot(data, aes(x = serum_creatinine)) +
  geom_density(fill = "#bca6bc", alpha = 0.5) +
  theme_fivethirtyeight() +
  geom_vline(xintercept = 0.84, linetype = "dashed") +
  geom_vline(xintercept = 1.4, linetype = "dashed") +
  annotate("text", x = 1.05, y = 0.5, label = "Normal", color = "darkgreen", angle = 90) +
  annotate("text", x = 3, y = 0.5, label = "Kemungkinan kerusakan \nginjal", color = "#af6c35") +
  labs(
    title = "Distribusi serum_creatinine",
    caption = "i. Distribusi serum_creatinine"
  ) +
  theme(plot.caption = element_text(hjust = 0.5, face = "italic"))

# Group means are computed once, outside aes(). A summary placed inside aes()
# is recycled to every row of the data, so each mean line and arrow would be
# drawn once per observation, all on top of each other.
sc_mean_survived <- mean(data$serum_creatinine[data$Peristiwa_Kematian == 0])
sc_mean_died <- mean(data$serum_creatinine[data$Peristiwa_Kematian == 1])

# Panel ii: density per death-event level, with a vertical line at each group
# mean and a labelled arrow pointing at it.
# NOTE(review): this caption is in English while the other captions in the
# report are in Indonesian; the text is left unchanged here.
sc2 <- ggplot(data, aes(x = serum_creatinine, fill = Peristiwa_Kematian)) +
  geom_density(alpha = 0.5) +
  theme_fivethirtyeight() +
  scale_fill_manual(values = c("#76b5a1", "#108963")) +
  # Disabled: scale_x_continuous(breaks = seq(20, 80, 10)) +
  geom_vline(xintercept = sc_mean_survived, color = "#a8efe3") +
  geom_vline(xintercept = sc_mean_died, color = "#39d1b8") +
  annotate(
    "curve",
    x = 2.5, y = 1.25, xend = sc_mean_survived, yend = 0.9,
    arrow = arrow(length = unit(0.2, "cm")), color = "#19299b"
  ) +
  annotate(
    "curve",
    x = 3, y = 0.9, xend = sc_mean_died, yend = 0.5,
    arrow = arrow(length = unit(0.2, "cm")), color = "#19299b"
  ) +
  annotate("text", x = 2.5, y = 1.2, label = "Rata-rata kejadian tidak meninggal", size = 2.5) +
  annotate("text", x = 3, y = 0.85, label = "Rata-rata kejadian meninggal", size = 2.5) +
  geom_vline(xintercept = 0.84, linetype = "dashed") +
  geom_vline(xintercept = 1.4, linetype = "dashed") +
  theme(plot.caption = element_text(hjust = 0.5, face = "italic")) +
  labs(caption = "ii. Distribution of creatinine with death event") +
  annotate("text", label = "creatinine > 2.5 \n60% kemungkinan kematian", x = 5, y = 0.5)

# Draw both panels in a single figure.
gridExtra::grid.arrange(sc1, sc2)

Visualisasi Kolom Kategorik

# Build a proportion bar chart: one bar per death-event level, each bar split
# by the levels of one categorical column (position = "fill" scales every bar
# to the same height).
#
# df             Data frame containing Peristiwa_Kematian and `fill_column`.
# fill_column    Name (string) of the column used for the fill colour. Two
#                colours and two labels are supplied, so the column is
#                expected to have two levels.
# legend_title   Title of the fill legend.
# legend_labels  Labels for the fill levels, in level order.
# subtitle       Plot subtitle; defaults to the legend title.
#
# Returns the ggplot object.
plot_death_share <- function(df, fill_column, legend_title, legend_labels,
                             subtitle = legend_title) {
  ggplot(df, aes(x = Peristiwa_Kematian, fill = factor(.data[[fill_column]]))) +
    geom_bar(position = "fill") +
    theme_fivethirtyeight() +
    scale_x_discrete(
      labels = c("Peristiwa \nKematian:Tidak", "Peristiwa \nKematian:Ya")
    ) +
    scale_fill_manual(
      values = c("#a8efe3", "#39d1b8"),
      name = legend_title,
      labels = legend_labels
    ) +
    labs(subtitle = subtitle)
}

# NOTE(review): the legend labels mix "Tidak"/"Ya" and "No"/"Yes" across the
# plots; they are reproduced exactly as they were.

vis1 <- plot_death_share(data, "anaemia", "Anaemia", c("Tidak", "Ya"), subtitle = "Anemia")

vis1

vis2 <- plot_death_share(data, "diabetes", "Diabetes", c("No", "Yes"))

vis2

vis3 <- plot_death_share(data, "tekanan_darah_tinggi", "tekanan darah tinggi", c("No", "Yes"))

vis3

vis4 <- plot_death_share(data, "Jenis_kelamin", "Jenis Kelamin", c("Perempuan", "Laki-laki"))

vis4

vis5 <- plot_death_share(data, "smoking", "Smoking", c("Tidak", "Ya"))

vis5

Melihat Box Plot

Melihat Box Plot umur dengan peristiwa kematian

# Box plot of umur for each death-event level, drawn without the box around
# the plot region. frame.plot is spelled out in full rather than relying on
# partial matching of `frame`.
boxplot(
  umur ~ Peristiwa_Kematian,
  data = data,
  ylab = "Umur",
  frame.plot = FALSE,
  col = "#8fb2e5"
)

Melihat Box Plot platelets dengan peristiwa kematian

# Box plot of platelets for each death-event level, drawn without the box
# around the plot region. frame.plot is spelled out in full rather than relying
# on partial matching of `frame`.
boxplot(
  platelets ~ Peristiwa_Kematian,
  data = data,
  ylab = "platelets",
  frame.plot = FALSE,
  col = "#c38fe5"
)

Melihat Box Plot ejection_fraction dengan peristiwa kematian

# Box plot of ejection_fraction for each death-event level, drawn without the
# box around the plot region. frame.plot is spelled out in full rather than
# relying on partial matching of `frame`.
boxplot(
  ejection_fraction ~ Peristiwa_Kematian,
  data = data,
  ylab = "ejection_fraction",
  frame.plot = FALSE,
  col = "#d35eb2"
)

Referensi

https://www.kaggle.com/shootboyxxx/heart-attack-eda-and-modeling