library(ggplot2)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggcorrplot)
library(parsnip)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()         masks ggplot2::%+%()
## ✖ psych::alpha()       masks ggplot2::alpha()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rsample)
library(recipes)
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
library(psych)
library(tune)
library(dplyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(parsnip)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ workflows    1.1.4
## ✔ dials        1.2.1     ✔ workflowsets 1.1.0
## ✔ infer        1.0.7     ✔ yardstick    1.3.1
## ✔ modeldata    1.3.0     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ psych::%+%()         masks ggplot2::%+%()
## ✖ scales::alpha()      masks psych::alpha(), ggplot2::alpha()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ scales::discard()    masks purrr::discard()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ recipes::fixed()     masks stringr::fixed()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ yardstick::spec()    masks readr::spec()
## ✖ recipes::step()      masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(kknn)
## 
## Attaching package: 'kknn'
## 
## The following object is masked from 'package:caret':
## 
##     contr.dummy
library(ranger)
heart<-read.csv("heart.csv", stringsAsFactors = TRUE)
head(heart)
#check for data types
heart %>% glimpse()
## Rows: 918
## Columns: 12
## $ Age            <int> 40, 49, 37, 48, 54, 39, 45, 54, 37, 48, 37, 58, 39, 49,…
## $ Sex            <fct> M, F, M, F, M, M, F, M, M, F, F, M, M, M, F, F, M, F, M…
## $ ChestPainType  <fct> ATA, NAP, ATA, ASY, NAP, NAP, ATA, ATA, ASY, ATA, NAP, …
## $ RestingBP      <int> 140, 160, 130, 138, 150, 120, 130, 110, 140, 120, 130, …
## $ Cholesterol    <int> 289, 180, 283, 214, 195, 339, 237, 208, 207, 284, 211, …
## $ FastingBS      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ RestingECG     <fct> Normal, Normal, ST, Normal, Normal, Normal, Normal, Nor…
## $ MaxHR          <int> 172, 156, 98, 108, 122, 170, 170, 142, 130, 120, 142, 9…
## $ ExerciseAngina <fct> N, N, N, Y, N, N, N, N, Y, N, N, Y, N, Y, N, N, N, N, N…
## $ Oldpeak        <dbl> 0.0, 1.0, 0.0, 1.5, 0.0, 0.0, 0.0, 0.0, 1.5, 0.0, 0.0, …
## $ ST_Slope       <fct> Up, Flat, Up, Flat, Up, Up, Up, Up, Flat, Up, Up, Flat,…
## $ HeartDisease   <int> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1…
set.seed(5346)
#Descriptive statistics for numberic values
heart %>%
  select(where(is.numeric)) %>%
  describe()
# Count unique values for categorical variables
heart %>%
  summarise(across(where(is.character), n_distinct))

In the first step of the analysis, we loaded the necessary R libraries and the dataset. We discovered that the dataset comprises 918 rows and 12 variables, including 5 categorical and the remainder numeric. There were no missing values, which simplified our initial data preparation. By examining the first few rows and calculating descriptive statistics for numeric values, we gained a foundational understanding of the data’s distribution. We also counted the unique entries for categorical variables, providing insights into the diversity of the dataset.

EXPLORATORY DATA ANALYSIS

In this step, we’ll do:

1.Distribution of numeric values 2.Counts of categorical values 3.Check for outliers (with box plots) 4.Correlation plot 5.Distribution of the target varibale

# Create distribution plots for numeric variables
options(repr.plot.width=12, repr.plot.height=7)

heart %>%
  select(where(is.numeric)) %>%
  select(-HeartDisease) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value, fill = variable)) +
  facet_wrap(~ variable, scales = "free") +
  geom_histogram(bins = 30, alpha = 0.7) +
  labs(title = "Distribution Plots for Numeric Variables") +
  scale_fill_viridis_d() +
  theme_minimal() +
  guides(fill = "none")

# Create box plots for numeric variables by the 'HeartDisease' category
heart %>%
  mutate(HeartDisease = factor(HeartDisease)) %>%
  select(HeartDisease, where(is.numeric)) %>%
  pivot_longer(cols = where(is.numeric), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = HeartDisease, y = value, fill = HeartDisease)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "Heart Disease", y = "Value", title = "Box Plots for Numeric Variables by Heart Disease") +
  scale_fill_viridis_d() +
  theme_minimal()

# Find correlated variables
heart %>% 
  select_if(is.numeric) %>% 
  cor()
##                      Age   RestingBP Cholesterol   FastingBS      MaxHR
## Age           1.00000000  0.25439936 -0.09528177  0.19803907 -0.3820447
## RestingBP     0.25439936  1.00000000  0.10089294  0.07019334 -0.1121350
## Cholesterol  -0.09528177  0.10089294  1.00000000 -0.26097433  0.2357924
## FastingBS     0.19803907  0.07019334 -0.26097433  1.00000000 -0.1314385
## MaxHR        -0.38204468 -0.11213500  0.23579240 -0.13143849  1.0000000
## Oldpeak       0.25861154  0.16480304  0.05014811  0.05269786 -0.1606906
## HeartDisease  0.28203851  0.10758898 -0.23274064  0.26729119 -0.4004208
##                  Oldpeak HeartDisease
## Age           0.25861154    0.2820385
## RestingBP     0.16480304    0.1075890
## Cholesterol   0.05014811   -0.2327406
## FastingBS     0.05269786    0.2672912
## MaxHR        -0.16069055   -0.4004208
## Oldpeak       1.00000000    0.4039507
## HeartDisease  0.40395072    1.0000000
# Correlation plot
heart %>%
  select_if(is.numeric) %>% 
  ggcorr(label = TRUE, label_size = 3)

# Create distribution of HeartDisease
options(repr.plot.width=7, repr.plot.height=7)

ggplot(heart, aes(x = factor(HeartDisease), fill = factor(HeartDisease))) +
  geom_bar() +
  labs(x = "HeartDisease", y = "Count", fill = "HeartDisease", title = "Distribution of HeartDisease") +
  scale_fill_viridis_d() +
  theme_minimal()

Distribution of Numeric Values: indicate that ‘Age’ and ‘MaxHR’ variables have a fairly normal distribution. The other variable shows a skewed distribution.

Counts of Categorical Values: The bar charts for categorical data reveal that certain categories dominate, like the ‘ASY’ (asymptomatic) type of chest pain and a higher prevalence of male patients. These imbalances may influence the model’s performance and should be considered in analysis and model training.

Outlier Detection: The box plots show that most of the numerical variables have outliers. These outliers may represent atypical cases or errors in data entry. However, they could also indicate true extremes that are clinically significant.

Correlation Plot: The correlation heatmap suggests some variables have a moderate positive or negative correlation with the occurrence of heart disease. For instance, ‘Oldpeak’ shows a positive correlation with the presence of heart disease, whereas ‘MaxHR’ appears to have a negative correlation. There are no highly correlated variables to be removed on feature ingineering step.

Distribution of the Target Variable (‘HeartDisease’): The final bar chart highlights a slight imbalance in the distribution of the target variable, with a larger number of patients not having heart disease compared to those who do. This suggests the need for careful consideration of class distribution when training predictive models to ensure they are not biased towards the majority class.

By interpreting these insights, we can make informed decisions on data preprocessing, such as outlier treatment and feature selection, which will ultimately contribute to building a more accurate heart failure prediction model.

DATA PREPARATION

In this step we’ll:

1.Convert the outcome variable to a factor type 2.Split the data to training and test sets 3.Normalize the data 4.Create dummy variables

# Convert outcome variable to factor and check the order 
heart$HeartDisease <- as.factor(heart$HeartDisease)
levels(heart[['HeartDisease']])
## [1] "0" "1"
# Split data to training and test sets
heart_split <- initial_split(heart,
                            prop = 0.7,
                            strata = HeartDisease)

heart_training <- heart_split %>% training()
heart_test <- heart_split %>% testing()

nrow(heart_training)
## [1] 642
nrow(heart_test)
## [1] 276
# Normalize and remove corelated numeric variables and create dummy for nominal variables
heart_recipe <- recipe(HeartDisease~.,
                      data = heart_training) %>%
    step_corr(all_numeric(), threshold = 0.8) %>%
    step_normalize(all_numeric()) %>%
    step_dummy(all_nominal(),-all_outcomes())

heart_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 11
## 
## ── Operations
## • Correlation filter on: all_numeric()
## • Centering and scaling for: all_numeric()
## • Dummy variables from: all_nominal() and -all_outcomes()
# Apply transformation recipe to training and test sets
heart_recipe_prep <- heart_recipe %>%
    prep(training = heart_training)

heart_training_prep <- heart_recipe_prep %>%
    bake(new_data = NULL)

heart_test_prep <- heart_recipe_prep %>%
    bake(new_data = heart_test)

head(heart_training_prep)
# Logistic regression model setup
glm_model <- logistic_reg() %>%
    set_engine('glm') %>%
    set_mode('classification')

# Fit the model
glm_fit <- glm_model %>%
    fit(HeartDisease ~ .,
        data = heart_training_prep)

# Check the fitted model
glm_fit
## parsnip model object
## 
## 
## Call:  stats::glm(formula = HeartDisease ~ ., family = stats::binomial, 
##     data = data)
## 
## Coefficients:
##       (Intercept)                Age          RestingBP        Cholesterol  
##          -0.81158            0.26648            0.15818           -0.55688  
##         FastingBS              MaxHR            Oldpeak              Sex_M  
##           0.48942           -0.04187            0.45310            1.51211  
## ChestPainType_ATA  ChestPainType_NAP   ChestPainType_TA  RestingECG_Normal  
##          -1.88456           -1.60445           -1.44373           -0.28973  
##     RestingECG_ST   ExerciseAngina_Y      ST_Slope_Flat        ST_Slope_Up  
##          -0.17532            0.98385            1.80001           -0.55787  
## 
## Degrees of Freedom: 641 Total (i.e. Null);  626 Residual
## Null Deviance:       882.8 
## Residual Deviance: 400.9     AIC: 432.9
# Logistic regression model predictions on test data
glm_class_pred <- predict(glm_fit,
                         new_data = heart_test_prep,
                         type = 'class')

glm_prob_pred <- predict(glm_fit,
                         new_data = heart_test_prep,
                         type = 'prob')

glm_results <- heart_test %>%
    select(HeartDisease) %>%
    bind_cols(glm_class_pred, glm_prob_pred)

# Performance metrics for logistic regression model
conf_mat(glm_results,
        truth = HeartDisease,
        estimate = .pred_class)
##           Truth
## Prediction   0   1
##          0  97  12
##          1  26 141
glm_metric_spec <-
    metric_set(accuracy, sens, spec, roc_auc)

glm_metrics <- glm_metric_spec(glm_results,
                              truth = HeartDisease,
                              estimate = .pred_class, .pred_0)
glm_metrics
# Logistic regression model ROC Curve
glm_results %>%
    roc_curve(truth = HeartDisease, .pred_0) %>%
    autoplot()

The Logistic Regression Model showed:

.accuracy of 0.8478261 .sensitivity of 0.8536585 .specificity of 0.8431373 .roc_auc of 0.9207716

K-Nearest Neighbors

# K-nearest neighbors model
knn_model <- nearest_neighbor() %>%
    set_engine('kknn') %>%
    set_mode('classification')

knn_fit <- knn_model %>%
    fit(HeartDisease~.,
       data = heart_training_prep)

knn_fit
## parsnip model object
## 
## 
## Call:
## kknn::train.kknn(formula = HeartDisease ~ ., data = data, ks = min_rows(5,     data, 5))
## 
## Type of response variable: nominal
## Minimal misclassification: 0.1323988
## Best kernel: optimal
## Best k: 5
# K-nearest neighbors model predictions on test data
knn_class_pred <- predict(knn_fit,
                         new_data = heart_test_prep,
                         type = 'class')

knn_prob_pred <- predict(knn_fit,
                         new_data = heart_test_prep,
                         type = 'prob')

knn_results <- heart_test %>%
    select(HeartDisease) %>%
    bind_cols(knn_class_pred, knn_prob_pred)

# Performance metrics for K-nearest neighbors model
conf_mat(knn_results,
        truth = HeartDisease,
        estimate = .pred_class)
##           Truth
## Prediction   0   1
##          0 100  20
##          1  23 133
knn_metric_spec <-
    metric_set(accuracy, sens, spec, roc_auc)

knn_metrics <- knn_metric_spec(knn_results,
                              truth = HeartDisease,
                              estimate = .pred_class, .pred_0)
knn_metrics
# Logistic regression model ROC Curve
knn_results %>%
    roc_curve(truth = HeartDisease, .pred_0) %>%
    autoplot()

# Decision tree model
dt_model <- decision_tree() %>%
    set_engine('rpart') %>%
    set_mode('classification')

dt_fit <- dt_model %>%
    fit(HeartDisease~.,
       data = heart_training_prep)

dt_fit
## parsnip model object
## 
## n= 642 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 642 287 1 (0.4470405 0.5529595)  
##    2) ST_Slope_Up>=0.5 278  57 0 (0.7949640 0.2050360)  
##      4) Cholesterol>=-1.273403 248  32 0 (0.8709677 0.1290323) *
##      5) Cholesterol< -1.273403 30   5 1 (0.1666667 0.8333333) *
##    3) ST_Slope_Up< 0.5 364  66 1 (0.1813187 0.8186813)  
##      6) MaxHR>=0.7463196 42  17 0 (0.5952381 0.4047619)  
##       12) RestingBP< 0.2875961 33  10 0 (0.6969697 0.3030303)  
##         24) Oldpeak>=-0.780126 24   4 0 (0.8333333 0.1666667) *
##         25) Oldpeak< -0.780126 9   3 1 (0.3333333 0.6666667) *
##       13) RestingBP>=0.2875961 9   2 1 (0.2222222 0.7777778) *
##      7) MaxHR< 0.7463196 322  41 1 (0.1273292 0.8726708) *
# Decision tree model predictions on test data
dt_class_pred <- predict(dt_fit,
                         new_data = heart_test_prep,
                         type = 'class')

dt_prob_pred <- predict(dt_fit,
                         new_data = heart_test_prep,
                         type = 'prob')

dt_results <- heart_test %>%
    select(HeartDisease) %>%
    bind_cols(dt_class_pred, dt_prob_pred)


# Performance metrics for decision tree model
conf_mat(dt_results,
        truth = HeartDisease,
        estimate = .pred_class)
##           Truth
## Prediction   0   1
##          0  89  19
##          1  34 134
dt_metric_spec <-
    metric_set(accuracy, sens, spec, roc_auc)

dt_metrics <- dt_metric_spec(dt_results,
                              truth = HeartDisease,
                              estimate = .pred_class, .pred_0)
dt_metrics
# Decision tree model ROC Curve
dt_results %>%
    roc_curve(truth = HeartDisease, .pred_0) %>%
    autoplot()

random forest

# Random forest model
rf_model <- rand_forest() %>%
    set_engine('ranger') %>%
    set_mode('classification')

rf_fit <- rf_model %>%
    fit(HeartDisease~.,
       data = heart_training_prep)

rf_fit
## parsnip model object
## 
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      642 
## Number of independent variables:  15 
## Mtry:                             3 
## Target node size:                 10 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.1010112
# Random forest model predictions on test data
rf_class_pred <- predict(rf_fit,
                         new_data = heart_test_prep,
                         type = 'class')

rf_prob_pred <- predict(rf_fit,
                         new_data = heart_test_prep,
                         type = 'prob')

rf_results <- heart_test %>%
    select(HeartDisease) %>%
    bind_cols(rf_class_pred, rf_prob_pred)

# Performance metrics for random forest model
conf_mat(rf_results,
        truth = HeartDisease,
        estimate = .pred_class)
##           Truth
## Prediction   0   1
##          0  97  13
##          1  26 140
rf_metric_spec <-
    metric_set(accuracy, sens, spec, roc_auc)

rf_metrics <- rf_metric_spec(rf_results,
                              truth = HeartDisease,
                              estimate = .pred_class, .pred_0)
rf_metrics
# Decision tree model ROC Curve
rf_results %>%
    roc_curve(truth = HeartDisease, .pred_0) %>%
    autoplot()

MODELS PERFORMANCE COMPARISON

# Model results comparison
combined_metrics <- bind_cols(
    select(glm_metrics, .metric),
    select(glm_metrics, .estimate) %>% rename(glm_results = .estimate),
    rename(select(knn_metrics, .estimate), knn_results = .estimate),
    rename(select(dt_metrics, .estimate), dt_results = .estimate),
    rename(select(rf_metrics, .estimate), rf_results = .estimate)  
)

combined_metrics
# Reshape data from wide to long format
combined_metrics_long <- pivot_longer(combined_metrics,
                                      cols = c(glm_results,knn_results, dt_results, rf_results),
                                      names_to = "model", values_to = "value")

options(repr.plot.width=12, repr.plot.height=10)

# Create faceted bar plots
ggplot(combined_metrics_long, aes(x = model, y = value, fill = model)) +
    geom_bar(stat = "identity", position = "dodge") +
    facet_wrap(~.metric, scales = "free_y") +
    labs(x = "Model", y = "Value", fill = "Model") +
    scale_fill_viridis_d() +
    theme_minimal() +
    guides(fill = "none")

CONCLUSION

The goal of this kernel was to determine what models are the best performing on ‘Heart Failure’ dataset.

Overall, the Random Forest model showed the best results in terms of accuracy, specificity and AUROC.

The Logistic Regression and KNN Models showed the highest sensitivity numbers.