According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.
# Read Data
# Load the stroke dataset from CSV. With stringsAsFactors = TRUE every text
# column is read as a factor; `bmi` also ends up as a factor because it
# contains the text "N/A" (see the recorded output below).
stroke <- read.csv(file = "strokedata.csv", stringsAsFactors = TRUE)
# Check Data
# Print a compact overview: one line per column with its type and first values.
glimpse(stroke)
#> Rows: 5,110
#> Columns: 12
#> $ id <int> 9046, 51676, 31112, 60182, 1665, 56669, 53882, 10434~
#> $ gender <fct> Male, Female, Male, Female, Female, Male, Male, Fema~
#> $ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, ~
#> $ hypertension <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1~
#> $ heart_disease <int> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0~
#> $ ever_married <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes~
#> $ work_type <fct> Private, Self-employed, Private, Private, Self-emplo~
#> $ Residence_type <fct> Urban, Rural, Rural, Urban, Rural, Urban, Rural, Urb~
#> $ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.0~
#> $ bmi <fct> 36.6, N/A, 32.5, 34.4, 24, 29, 27.4, 22.8, N/A, 24.2~
#> $ smoking_status <fct> formerly smoked, never smoked, never smoked, smokes,~
#> $ stroke <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
# Data Wrangling
# Store `id` and `bmi` as plain text instead of integer / factor.
# NOTE(review): the "N/A" entries in `bmi` remain literal strings after this
# step, so is.na() / anyNA() will not report them as missing.
stroke_clean <- stroke %>%
  mutate(
    id = as.character(id),
    bmi = as.character(bmi)
  )
# Overview of the converted data frame.
glimpse(stroke_clean)
#> Rows: 5,110
#> Columns: 12
#> $ id <chr> "9046", "51676", "31112", "60182", "1665", "56669", ~
#> $ gender <fct> Male, Female, Male, Female, Female, Male, Male, Fema~
#> $ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, ~
#> $ hypertension <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1~
#> $ heart_disease <int> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0~
#> $ ever_married <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes~
#> $ work_type <fct> Private, Self-employed, Private, Private, Self-emplo~
#> $ Residence_type <fct> Urban, Rural, Rural, Urban, Rural, Urban, Rural, Urb~
#> $ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.0~
#> $ bmi <chr> "36.6", "N/A", "32.5", "34.4", "24", "29", "27.4", "~
#> $ smoking_status <fct> formerly smoked, never smoked, never smoked, smokes,~
#> $ stroke <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
In 2015, the World Health Organization, under the United Nations, officially revised its age standards. The new age classification (in years) is as follows:
# Add an age classification column (age_category)
# Labels of the age bands, youngest to oldest; also used as the factor levels.
x <- c(
  "0-12 (children)",
  "13-24 (adolescent/youth)",
  "25-44 (young age)",
  "45-60 (middle age)",
  "61-75 (elderly age)",
  "76-90 (senile age)",
  ">90 (long-livers)"
)
# `age` is stored as a double, so bin it with contiguous intervals:
#   [0, 12], (12, 24], (24, 44], (44, 60], (60, 75], (75, 90], (90, Inf]
# Whole-number ages land in exactly the band their label names. Integer-bounded
# tests such as `<= 12` followed by `>= 13` would leave gaps, so a fractional
# age like 12.5 would match no band and end up as NA.
# Negative or missing ages still produce NA.
stroke_clean <- stroke_clean %>%
  mutate(
    age_category = cut(
      age,
      breaks = c(0, 12, 24, 44, 60, 75, 90, Inf),
      labels = x,
      include.lowest = TRUE, # keep age 0 inside the first band
      right = TRUE
    )
  )
head(stroke_clean)
# Check Data
# Does the data frame contain any NA at all?
# NOTE(review): unknown `bmi` values are stored as the literal string "N/A",
# which is.na() does not count as missing.
stroke_clean %>% anyNA()
#> [1] FALSE
# Number of NA cells in each column.
stroke_clean %>%
  is.na() %>%
  colSums()
#> id gender age hypertension
#> 0 0 0 0
#> heart_disease ever_married work_type Residence_type
#> 0 0 0 0
#> avg_glucose_level bmi smoking_status stroke
#> 0 0 0 0
#> age_category
#> 0
# Number of stroke cases (stroke == 1) per age category, largest first.
stroke_age <- stroke_clean %>%
  filter(stroke == 1) %>%
  count(age_category, name = "count") %>%
  arrange(desc(count))
head(stroke_age)
# Observation (from the original analysis): the "senile age" group
# (76-90 years) is the most prone to stroke.
# Number of stroke cases (stroke == 1) per gender, ordered by gender level.
stroke_gender <- stroke_clean %>%
  filter(stroke == 1) %>%
  count(gender, name = "count") %>%
  arrange(gender)
head(stroke_gender)
# Observation (from the original analysis): women are the most prone to stroke.
# Number of stroke cases for every observed combination of age category,
# gender, residence type, smoking status, marital status and work type,
# largest first. count() returns an ungrouped result, so no ungroup() is needed.
stroke_most <- stroke_clean %>%
  filter(stroke == 1) %>%
  count(
    age_category, gender, Residence_type, smoking_status, ever_married,
    work_type,
    name = "count"
  ) %>%
  arrange(desc(count))
head(stroke_most)
# Observation (from the original analysis): the largest group is women who
# work(ed) in the private sector, live in a rural area, are or were married,
# never smoked, and are in the 76-90 age band.
library(ggplot2)
library(plotly)
library(glue)
library(scales)

# Pie chart: stroke cases by gender.
# Copy of the gender summary with its row names added as a "Gender" column.
stroke_gender_pie <- data.frame("Gender" = rownames(stroke_gender), stroke_gender)
# Keep only the two columns the chart needs.
data <- stroke_gender[, c("gender", "count")]
# One slice colour per gender (black, light grey).
colors <- c("rgb(0,0,0)", "rgb(184,184,184)")
fig <- plot_ly(
  data,
  labels = ~gender,
  values = ~count,
  type = "pie",
  textposition = "inside",
  textinfo = "label+percent",
  insidetextfont = list(color = "#FFFFFF"),
  hoverinfo = "text",
  text = ~paste(count, "stroke cases"),
  marker = list(
    colors = colors,
    line = list(color = "#FFFFFF", width = 1)
  ),
  # `showlegend` is a trace-level attribute, not a `marker` attribute, so it
  # has to be passed here for the legend to be hidden.
  showlegend = FALSE
)
# Add the title and hide grid lines, zero lines and tick labels on both axes.
fig <- fig %>%
  layout(
    title = "Stroke Cases by Gender",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
fig
# Observation (from the original analysis): stroke cases are almost evenly
# split between men and women, with a slightly higher incidence among women.
## Build the pie chart: stroke cases by residence area.
# Number of stroke cases (stroke == 1) per residence type. `stroke_res` is not
# defined by any earlier statement, so it is built here the same way as
# `stroke_gender`.
stroke_res <- stroke_clean %>%
  filter(stroke == 1) %>%
  count(Residence_type, name = "count") %>%
  arrange(Residence_type)
# Copy of the summary with its row names added as a leading column.
stroke_res2 <- data.frame("Residence Area" = rownames(stroke_res), stroke_res)
# Keep only the two columns the chart needs.
data2 <- stroke_res[, c("Residence_type", "count")]
# One slice colour per residence type (black, light grey).
colors2 <- c("rgb(0,0,0)", "rgb(184,184,184)")
fig2 <- plot_ly(
  data2,
  labels = ~Residence_type,
  values = ~count,
  type = "pie",
  textposition = "inside",
  textinfo = "label+percent",
  insidetextfont = list(color = "#FFFFFF"),
  hoverinfo = "text",
  text = ~paste(count, "stroke cases"),
  marker = list(
    colors = colors2,
    line = list(color = "#FFFFFF", width = 1)
  ),
  # `showlegend` is a trace-level attribute, not a `marker` attribute, so it
  # has to be passed here for the legend to be hidden.
  showlegend = FALSE
)
# Add the title and hide grid lines, zero lines and tick labels on both axes.
fig2 <- fig2 %>%
  layout(
    title = "Stroke Cases by Residence Area",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
fig2
# Observation (from the original analysis): stroke cases are also almost evenly
# split between urban and rural residents, with a slightly higher incidence
# among urban residents than among those living in rural areas.
# Bar chart: stroke cases by age category.
# Number of stroke cases (stroke == 1) per age category, largest first.
stroke_most_age <- stroke_clean %>%
  filter(stroke == 1) %>%
  count(age_category, name = "count") %>%
  arrange(desc(count))
# Two-line tooltip text for each bar.
stroke_most_age2 <- stroke_most_age %>%
  mutate(label = glue("Age Category: {age_category}\nStroke Cases: {count}"))
# Horizontal bars ordered by case count; darker fill means more cases.
plot1 <- ggplot(
  data = stroke_most_age2,
  aes(
    x = count,
    y = reorder(age_category, count),
    text = label # picked up by ggplotly() as the tooltip
  )
) +
  geom_col(aes(fill = count)) +
  scale_fill_gradient(low = "grey", high = "black") +
  labs(
    title = "Stroke Cases by Age Category",
    x = "Stroke Cases",
    y = NULL
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Convert to an interactive plot that shows only the custom tooltip text.
ggplotly(plot1, tooltip = "text")
# Observation (from the original analysis): the bulk of stroke cases occurs in
# the elderly age groups (>60 years), although some cases also occur at
# productive age (25-60 years).
# Bar chart: stroke cases by smoking history.
# Number of stroke cases (stroke == 1) per smoking status, largest first.
stroke_smoke <- stroke_clean %>%
  filter(stroke == 1) %>%
  count(smoking_status, name = "count") %>%
  arrange(desc(count))
# Two-line tooltip text for each bar.
stroke_smoke2 <- stroke_smoke %>%
  mutate(label = glue("Smoking History: {smoking_status}\nStroke Cases: {count}"))
# Horizontal bars ordered by case count; darker fill means more cases.
plot2 <- ggplot(
  data = stroke_smoke2,
  aes(
    x = count,
    y = reorder(smoking_status, count),
    text = label # picked up by ggplotly() as the tooltip
  )
) +
  geom_col(aes(fill = count)) +
  scale_fill_gradient(low = "grey", high = "black") +
  labs(
    title = "Stroke Cases by Smoking History",
    x = "Stroke Cases",
    y = NULL
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Convert to an interactive plot that shows only the custom tooltip text.
ggplotly(plot2, tooltip = "text")
# Observation (from the original analysis): interestingly, most stroke cases
# occur among people who never smoked. Further literature study is needed to
# understand the cause.