1. Loading the needed packages

library(tidyverse)
library(FFTrees)
library(DT)

2. Loading the dataset

stroke <- read_csv("~/Desktop/R/csv/stroke.csv", col_types = "dfdfffffddff") %>%
  mutate(
    # Recode the 0/1 factor levels into readable labels
    stroke = fct_recode(stroke, "TRUE" = "1", "FALSE" = "0"),
    hypertension = fct_recode(hypertension, "Yes" = "1", "No" = "0"),
    heart_disease = fct_recode(heart_disease, "Yes" = "1", "No" = "0"),
    # FFTrees expects a logical criterion variable
    stroke = as.logical(stroke)
  ) %>%
  mosaic::shuffle() %>%    # randomise the row order before splitting
  select(-id, -orig.id)    # drop the original id and the shuffle bookkeeping column
datatable(stroke)

3. EDA

Checking for missing values: the first summary below shows 201 missing values in bmi; after those are dealt with, the second summary confirms that none remain. A sketch of the check itself follows the output.

## # A tibble: 1 × 11
##   gender   age hypertens…¹ heart…² ever_…³ work_…⁴ Resid…⁵ avg_g…⁶   bmi smoki…⁷
##    <int> <int>       <int>   <int>   <int>   <int>   <int>   <int> <int>   <int>
## 1      0     0           0       0       0       0       0       0   201       0
## # … with 1 more variable: stroke <int>, and abbreviated variable names
## #   ¹​hypertension, ²​heart_disease, ³​ever_married, ⁴​work_type, ⁵​Residence_type,
## #   ⁶​avg_glucose_level, ⁷​smoking_status
## # A tibble: 1 × 11
##   gender   age hypertens…¹ heart…² ever_…³ work_…⁴ Resid…⁵ avg_g…⁶   bmi smoki…⁷
##    <int> <int>       <int>   <int>   <int>   <int>   <int>   <int> <int>   <int>
## 1      0     0           0       0       0       0       0       0     0       0
## # … with 1 more variable: stroke <int>, and abbreviated variable names
## #   ¹​hypertension, ²​heart_disease, ³​ever_married, ⁴​work_type, ⁵​Residence_type,
## #   ⁶​avg_glucose_level, ⁷​smoking_status
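
The code for this check is not shown above; a minimal sketch that would produce a one-row summary of NA counts like the two tibbles above (run once before and once after the missing bmi values are handled), using the tidyverse loaded in step 1:

stroke %>%
  summarise(across(everything(), ~ sum(is.na(.x))))   # count NAs in every column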

Plotting the numerical (predictor) variables
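
The plotting code is omitted here; one possible sketch, reshaping the three numeric predictors so they can be faceted:

stroke %>%
  select(age, avg_glucose_level, bmi) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, na.rm = TRUE) +   # one histogram per predictor
  facet_wrap(~ variable, scales = "free")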

Plotting the nominal (predictor) variables
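
Again the code is not shown; a sketch that draws one bar chart per factor predictor:

stroke %>%
  select(gender, hypertension, heart_disease, ever_married,
         work_type, Residence_type, smoking_status) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "level",
               values_transform = list(level = as.character)) %>%
  ggplot(aes(x = level)) +
  geom_bar() +                                # counts per level
  facet_wrap(~ variable, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))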

Plotting the nominal (target) variable
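
A sketch of the class-balance plot for the (now logical) target; strokes are rare, roughly 5% of cases according to the model summary in step 5:

stroke %>%
  ggplot(aes(x = stroke)) +
  geom_bar()                                  # TRUE vs FALSE counts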

4. Splitting the data

stroke_train <- stroke %>% slice(1:2000)      # 2,000 rows for training
stroke_test <- stroke %>% slice(2001:2500)    # 500 rows for testing
new_data <- stroke %>% slice(2501:2510)       # 10 rows held back as "new" cases

5. Modelling
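
The fitting call itself is not shown; a minimal sketch of what it could look like, assuming the fitted object is named stroke_fft (formula, data split, and decision labels are inferred from the summary printed below):

stroke_fft <- FFTrees(
  formula = stroke ~ .,               # predict stroke from all other variables
  data = stroke_train,                # training set (2,000 rows)
  data.test = stroke_test,            # test set (500 rows)
  decision.labels = c("No Stroke", "Stroke")
)
stroke_fft                            # printing the object gives the summary below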

## FFTrees 
## - Trees: 7 fast-and-frugal trees predicting stroke
## - Outcome costs: [hi = 0, mi = 1, fa = 1, cr = 0]
## 
## FFT #1: Definition
## [1] If age > 49, decide Stroke.
## [2] If avg_glucose_level > 125.32, decide Stroke, otherwise, decide No Stroke.
## 
## FFT #1: Training Accuracy
## Training data: N = 2,000, Pos (+) = 95 (5%) 
## 
## |          | True +   | True -   |   Totals:
## |----------|----------|----------|
## | Decide + | hi    91 | fa   934 |     1,025
## | Decide - | mi     4 | cr   971 |       975
## |----------|----------|----------|
##   Totals:          95      1,905   N = 2,000
## 
## acc  = 53.1%   ppv  = 8.9%   npv  = 99.6%
## bacc = 73.4%   sens = 95.8%   spec = 51.0%
## 
## FFT #1: Training Speed, Frugality, and Cost
## mcu = 1.56,  pci = 0.84,  E(cost) = 0.469

6. Plotting the results
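
The plotting code is not shown; a sketch using plot.FFTrees, which draws the best tree together with its classification table and performance statistics (data = "test" switches to the hold-out set):

plot(stroke_fft, data = "train")      # performance on the training data
plot(stroke_fft, data = "test")       # performance on the test data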

7. Ranking the predictor accuracies
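
A sketch of how the individual predictors (cues) can be ranked by their marginal accuracy, assuming the stroke_fft object from step 5:

plot(stroke_fft, what = "cues")       # each cue's accuracy plotted in ROC space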

8. Predicting on the new data
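
The prediction call is omitted; a sketch that would yield a logical vector like the one shown below (TRUE = stroke predicted):

predict(stroke_fft, newdata = new_data)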

##  [1]  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE

9. Comparing predictive performance across algorithms
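
By default FFTrees also fits several standard algorithms (logistic regression, CART, random forests, support vector machines) on the same split, so their test-set performance can be compared with the fast-and-frugal tree. A sketch of how those results might be pulled out of the fitted object; the element name is an assumption and may differ between FFTrees versions:

stroke_fft$competition$test   # test-set statistics of the comparison algorithms (element name assumed)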