1. Loading the needed packages
library(tidyverse)
library(FFTrees)
library(DT)
2. Loading the dataset
# Fix the RNG seed so that the row shuffle below -- and therefore the
# position-based train/test split that relies on row order -- is identical on
# every run. The seed value itself is arbitrary.
# NOTE(review): if a hidden setup chunk already seeds the RNG, keep only one.
set.seed(2023)

# Read the stroke data set. The compact `col_types` string assigns one type
# per CSV column, in order: "d" = double, "f" = factor.
stroke <- read_csv("~/Desktop/R/csv/stroke.csv", col_types = "dfdfffffddff") %>%
  mutate(
    # Give the 0/1 indicator factors readable labels.
    hypertension = fct_recode(hypertension, "Yes" = "1", "No" = "0"),
    heart_disease = fct_recode(heart_disease, "Yes" = "1", "No" = "0"),
    # Outcome as a logical: TRUE where the raw code is "1", FALSE where it is
    # "0" (comparison against a factor uses its labels; NA stays NA).
    stroke = stroke == "1"
  ) %>%
  # Randomly permute the rows.
  mosaic::shuffle() %>%
  # Drop the identifier columns; `orig.id` is not among the 12 CSV columns, so
  # it is presumably the bookkeeping column added by shuffle() -- TODO confirm.
  select(-id, -orig.id)

# Show the prepared data as an interactive table.
datatable(stroke)
3. Exploratory data analysis (EDA)
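Checking for missing values (number of NAs per column; the first table shows 201 missing bmi values, the second shows none remaining)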
## # A tibble: 1 × 11
## gender age hypertens…¹ heart…² ever_…³ work_…⁴ Resid…⁵ avg_g…⁶ bmi smoki…⁷
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 201 0
## # … with 1 more variable: stroke <int>, and abbreviated variable names
## # ¹hypertension, ²heart_disease, ³ever_married, ⁴work_type, ⁵Residence_type,
## # ⁶avg_glucose_level, ⁷smoking_status
## # A tibble: 1 × 11
## gender age hypertens…¹ heart…² ever_…³ work_…⁴ Resid…⁵ avg_g…⁶ bmi smoki…⁷
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0 0
## # … with 1 more variable: stroke <int>, and abbreviated variable names
## # ¹hypertension, ²heart_disease, ³ever_married, ⁴work_type, ⁵Residence_type,
## # ⁶avg_glucose_level, ⁷smoking_status
Plotting the numerical (predictor) variables

Plotting the nominal (target) variable

4. Splitting the data
# Carve three consecutive, non-overlapping row ranges out of `stroke`
# (selection is purely by row position):
#   rows 1-2000    -> stroke_train
#   rows 2001-2500 -> stroke_test
#   rows 2501-2510 -> new_data
stroke_train <- slice(stroke, 1:2000)
stroke_test <- slice(stroke, 2001:2500)
new_data <- slice(stroke, 2501:2510)
5. Modelling
## FFTrees
## - Trees: 7 fast-and-frugal trees predicting stroke
## - Outcome costs: [hi = 0, mi = 1, fa = 1, cr = 0]
##
## FFT #1: Definition
## [1] If age > 49, decide Stroke.
## [2] If avg_glucose_level > 125.32, decide Stroke, otherwise, decide No Stroke.
##
## FFT #1: Training Accuracy
## Training data: N = 2,000, Pos (+) = 95 (5%)
##
## | | True + | True - | Totals:
## |----------|----------|----------|
## | Decide + | hi 91 | fa 934 | 1,025
## | Decide - | mi 4 | cr 971 | 975
## |----------|----------|----------|
## Totals: 95 1,905 N = 2,000
##
## acc = 53.1% ppv = 8.9% npv = 99.6%
## bacc = 73.4% sens = 95.8% spec = 51.0%
##
## FFT #1: Training Speed, Frugality, and Cost
## mcu = 1.56, pci = 0.84, E(cost) = 0.469
6. Plotting the results

7. Ranking the predictor accuracies

8. Predicting on the new data
## [1] TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE