Dokumen ini menganalisis parameter dasar pasien yang terkena stroke. Di dalamnya dibuat visualisasi data untuk melihat parameter apa saja yang menjadi penyebab stroke pada pasien.
source : https://www.kaggle.com/fedesoriano/stroke-prediction-dataset
# Embed the image file in the rendered document.
knitr::include_graphics(path = "data/brain-stroke-png.png")
Proses import dataset stroke
# Load the stroke dataset from its CSV file into the `stroke` data frame.
stroke <- read.csv(file = "data/healthcare-dataset-stroke-data.csv")
Cek Data
# Number of rows and columns in the loaded data.
dim(x = stroke)
## [1] 5110 12
# First and last five rows.
head(stroke, n = 5)
tail(stroke, n = 5)
# Column types and leading values; the second argument of str() is max.level.
str(stroke, max.level = 5)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : chr "36.6" "N/A" "32.5" "34.4" ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
# Check for NA values overall, then count them per column.
# NOTE: this runs while bmi is still a character column, so textual
# placeholders such as "N/A" are not counted as missing here.
any(is.na(stroke))
## [1] FALSE
vapply(stroke, function(column) sum(is.na(column)), numeric(1))
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status stroke
## 0 0 0 0
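Hasil pemeriksaan di atas tidak menemukan nilai NA. Namun, kolom bmi masih bertipe teks dan memuat entri "N/A" yang baru menjadi NA setelah dikonversi ke numerik (201 baris pada ringkasan di bawah). Selanjutnya kita lanjut ke analisis.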
set tipe data menjadi factor dan numeric
# Convert the categorical text columns to factors in one pass.
categorical_cols <- c(
  "gender", "ever_married", "work_type", "Residence_type", "smoking_status"
)
stroke[categorical_cols] <- lapply(stroke[categorical_cols], as.factor)

# bmi is read as text because missing entries are written as "N/A".
# Turn that placeholder into a real NA first, so the numeric conversion is
# explicit about missing values instead of relying on a coercion warning.
# %in% is used because it never yields NA in the index.
stroke$bmi[stroke$bmi %in% "N/A"] <- NA
stroke$bmi <- as.numeric(stroke$bmi)
lihat summary data
# Per-column summary: quantiles for numeric columns, counts for factors.
summary(object = stroke)
## id gender age hypertension
## Min. : 67 Female:2994 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Male :2115 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Other : 1 Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 No :1757 children : 687 Rural:2514
## 1st Qu.:0.00000 Yes:3353 Govt_job : 657 Urban:2596
## Median :0.00000 Never_worked : 22
## Mean :0.05401 Private :2925
## 3rd Qu.:0.00000 Self-employed: 819
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Min. :10.30 formerly smoked: 885 Min. :0.00000
## 1st Qu.: 77.25 1st Qu.:23.50 never smoked :1892 1st Qu.:0.00000
## Median : 91.89 Median :28.10 smokes : 789 Median :0.00000
## Mean :106.15 Mean :28.89 Unknown :1544 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :271.74 Max. :97.60 Max. :1.00000
## NA's :201
Dari ringkasan di atas dapat disimpulkan bahwa di dalam data terdapat informasi, di antaranya:
Mengubah kolom `stroke` dari angka 1 dan 0 menjadi label "stroke" dan "no stroke".
# Recode the 0/1 indicator columns as labelled factors.
#
# factor(levels =, labels =) handles the whole column at once and always
# returns a factor; a code other than 0 or 1 becomes NA rather than making
# the result change type. Levels are listed in the same (alphabetical) order
# that as.factor() produced on the label strings.
stroke$stroke <- factor(
  stroke$stroke,
  levels = c(0, 1),
  labels = c("no stroke", "stroke")
)
head(stroke, 5)

stroke$hypertension <- factor(
  stroke$hypertension,
  levels = c(1, 0),
  labels = c("hypertension", "no hypertension")
)
head(stroke, 5)

stroke$heart_disease <- factor(
  stroke$heart_disease,
  levels = c(1, 0),
  labels = c("heart_disease", "no heart_disease")
)
head(stroke, 5)
# Map an age to its descriptive age-bracket label.
#
# X: a numeric age, or a numeric vector of ages.
# Returns a character vector of the same length as X. Brackets are closed on
# the left: 20 falls in "between 20 and 30", and 70 or more yields
# "More than 70". A missing age yields NA instead of raising an error.
ifx_age <- function(X) {
  lower_bounds <- c(20, 30, 40, 50, 60, 70)
  bracket_labels <- c(
    "less than 20",
    "between 20 and 30",
    "between 30 and 40",
    "between 40 and 50",
    "between 50 and 60",
    "between 60 and 70",
    "More than 70"
  )
  # findInterval() counts how many lower bounds are <= X (0 to 6); adding 1
  # turns that count into an index into bracket_labels.
  bracket_labels[findInterval(X, lower_bounds) + 1L]
}
# Label every patient's age bracket and store it as a factor column.
# vapply() guarantees a character vector, one label per row.
stroke$age_segment <- factor(vapply(stroke$age, ifx_age, character(1)))
head(stroke, 5)
# Map an average glucose level to its descriptive range label.
#
# X: a numeric glucose level, or a numeric vector of levels.
# Returns a character vector of the same length as X. Ranges are closed on
# the left: 80 falls in "between 80 and 120", and 180 or more yields
# "More than 180". A missing level yields NA instead of raising an error.
ifx_glucose <- function(X) {
  lower_bounds <- c(80, 120, 180)
  range_labels <- c(
    "less than 80",
    "between 80 and 120",
    "between 120 and 180",
    "More than 180"
  )
  # findInterval() counts how many lower bounds are <= X (0 to 3); adding 1
  # turns that count into an index into range_labels.
  range_labels[findInterval(X, lower_bounds) + 1L]
}
# Label every patient's glucose range and store it as a factor column.
# vapply() guarantees a character vector, one label per row.
stroke$status_glucose <- factor(
  vapply(stroke$avg_glucose_level, ifx_glucose, character(1))
)
head(stroke, 5)
# Keep only the rows whose stroke column is "stroke", then print them.
stroke1 <- stroke[stroke[["stroke"]] == "stroke", , drop = FALSE]
stroke1
library(ggplot2)
# Box plot of age by gender for the stroke patients. The long-dashed red
# line marks the mean age across all rows of stroke1.
ggplot(stroke1, aes(x = gender, y = age, fill = gender)) +
  # TRUE is spelled out because T is an ordinary, reassignable variable.
  geom_boxplot(show.legend = TRUE) +
  labs(
    title = "Data Distribusi Umur Penderita Stroke terhadap Gender",
    subtitle = "Redline indikasi rata-rata umur",
    caption = "Source: https://www.kaggle.com",
    x = " Gender",
    y = " Umur "
  ) +
  geom_hline(yintercept = mean(stroke1$age), color = "red", linetype = 5)
Dapat disimpulkan bahwa penderita stroke pada wanita lebih banyak, dan wanita mempunyai nilai kuartil yang lebih rendah daripada pria.
# Count stroke patients per residence type; the resulting data frame has
# the category in Var1 and the count in Freq.
trend_resident <- as.data.frame(table(stroke1[["Residence_type"]]))
trend_resident

# Column chart of those counts, shaded by count, with the count printed
# slightly above each bar.
ggplot(data = trend_resident, mapping = aes(x = Var1, y = Freq)) +
  geom_col(mapping = aes(fill = Freq)) +
  geom_text(
    mapping = aes(label = Freq),
    colour = "black",
    size = 5,
    nudge_y = 5
  ) +
  scale_fill_gradient(low = "#d260d6", high = "#69316b") +
  labs(
    title = "Data Berdasarkan Tempat Tinggal",
    subtitle = "Data Penderita Stroke",
    x = "Tipe Tempat Tinggal",
    y = "Banyaknya"
  )
Bisa disimpulkan bahwa penderita stroke terbanyak terdapat di daerah perkotaan.
# Count stroke patients per work type; the resulting data frame has the
# category in Var1 and the count in Freq.
trend_stroke <- as.data.frame(table(stroke1[["work_type"]]))
trend_stroke

# Horizontal column chart with the work types ordered by count and the
# count printed inside the end of each bar.
ggplot(
  data = trend_stroke,
  mapping = aes(x = Freq, y = reorder(Var1, Freq))
) +
  geom_col(mapping = aes(fill = Freq)) +
  geom_text(
    mapping = aes(label = Freq),
    colour = "white",
    size = 5,
    hjust = 1.0
  ) +
  scale_fill_gradient(low = "#300202", high = "#db0b0b") +
  labs(
    title = "Data Pekerjaan Paling Sering Pengidap Stroke",
    subtitle = "Data Penderita Stroke",
    x = "Banyaknya",
    y = ""
  )
Tipe pekerjaan dengan penderita stroke terbanyak adalah Private, sedangkan pada tipe tidak pernah bekerja/belum bekerja tidak ada yang mengidap stroke.
# Scatter plot of BMI against average glucose level for the stroke
# patients, coloured by gender, with a fitted linear-model line.
ggplot(data = stroke1, mapping = aes(x = avg_glucose_level, y = bmi)) +
  geom_point(mapping = aes(colour = gender)) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Korelasi antara level glukosa dan bmi pasien stroke",
    subtitle = " ",
    x = "Rata Rata Level Glukosa",
    y = "BMI",
    colour = "Gender"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 40 rows containing non-finite values (stat_smooth).
## Warning: Removed 40 rows containing missing values (geom_point).
# Scatter plot of age against average glucose level for the stroke
# patients, coloured by gender, with a fitted linear-model line.
ggplot(data = stroke1, mapping = aes(x = avg_glucose_level, y = age)) +
  geom_point(mapping = aes(colour = gender)) +
  geom_smooth(method = "lm") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Korelasi antara level glukosa dan umur pasien stroke",
    subtitle = " ",
    x = "Rata-Rata Level Glukosa",
    y = "Umur",
    colour = "Gender"
  )
## `geom_smooth()` using formula 'y ~ x'
# Number of stroke patients per age segment, in a grid of panels with
# hypertension status as rows and gender as columns.
#
# geom_bar() counts the rows in each age segment itself. The earlier
# x = frequency(gender) mapping only produced counts indirectly:
# frequency() is the time-series sampling-frequency function and returns 1
# for a plain vector, so the bars were stacks of ones.
ggplot(stroke1, aes(y = age_segment)) +
  geom_bar(fill = "#d64fa5") +
  facet_grid(hypertension ~ gender) +
  labs(
    title = "Komparasi antara Wanita dan Pria Penderita Stroke ",
    subtitle = "Umur dan Hypertensi",
    x = "",
    y = "Umur",
    caption = "Source: https://www.kaggle.com"
  )
Bisa disimpulkan bahwa pasien berumur di atas 50 tahun lebih berpotensi mengidap stroke, dan penderitanya lebih banyak wanita.
# Number of stroke patients per glucose range, in a grid of panels with
# hypertension status as rows and gender as columns.
#
# geom_bar() counts the rows in each glucose range itself. The earlier
# x = frequency(gender) mapping only produced counts indirectly:
# frequency() returns 1 for a plain vector, so the bars were stacks of ones.
# The former scale_fill_viridis_b() call is dropped because no fill
# aesthetic is mapped, so it had no effect on the plot.
ggplot(stroke1, aes(y = status_glucose)) +
  geom_bar() +
  facet_grid(hypertension ~ gender) +
  labs(
    title = "Komparasi antara Wanita dan Pria Penderita Stroke",
    subtitle = " Status Glukosa dan Hypertensi",
    x = "",
    y = "Status Glukosa level",
    caption = "Source: https://www.kaggle.com"
  )
Bisa disimpulkan bahwa stroke dapat dipicu oleh level glukosa yang tinggi atau oleh komplikasi antara gula darah tinggi dan hipertensi.
7. Komparasi antara penyakit jantung dengan kadar gula darah berdasarkan status merokok
# Number of stroke patients per glucose range, in a grid of panels with
# heart-disease status as rows and smoking status as columns.
#
# geom_bar() counts the rows in each glucose range itself. The earlier
# x = frequency(smoking_status) mapping only produced counts indirectly:
# frequency() returns 1 for a plain vector, so the bars were stacks of ones.
# The former scale_fill_viridis_b() call is dropped because no fill
# aesthetic is mapped, so it had no effect on the plot.
ggplot(stroke1, aes(y = status_glucose)) +
  geom_bar() +
  facet_grid(heart_disease ~ smoking_status) +
  labs(
    title = "Komparasi Status Merokok Penderita Stroke",
    # The panels are split by heart_disease, so the subtitle names heart
    # disease rather than hypertension.
    subtitle = " Status Glukosa, Penyakit Jantung",
    x = "",
    y = "Status Glukosa level",
    caption = "Source: https://www.kaggle.com"
  )
Dapat disimpulkan bahwa penderita stroke lebih banyak berasal dari kelompok yang tidak pernah merokok dan yang sebelumnya pernah merokok, yang tidak punya penyakit jantung, dengan kadar gula darah pada More than 180 dan less than 180.