library(ggplot2)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggcorrplot)
library(parsnip)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rsample)
library(recipes)
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(psych)
library(tune)
library(dplyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(parsnip)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ workflows 1.1.4
## ✔ dials 1.2.1 ✔ workflowsets 1.1.0
## ✔ infer 1.0.7 ✔ yardstick 1.3.1
## ✔ modeldata 1.3.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ scales::alpha() masks psych::alpha(), ggplot2::alpha()
## ✖ gridExtra::combine() masks dplyr::combine()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:purrr':
##
## lift
library(kknn)
##
## Attaching package: 'kknn'
##
## The following object is masked from 'package:caret':
##
## contr.dummy
library(ranger)
heart<-read.csv("heart.csv", stringsAsFactors = TRUE)
head(heart)
#check for data types
heart %>% glimpse()
## Rows: 918
## Columns: 12
## $ Age <int> 40, 49, 37, 48, 54, 39, 45, 54, 37, 48, 37, 58, 39, 49,…
## $ Sex <fct> M, F, M, F, M, M, F, M, M, F, F, M, M, M, F, F, M, F, M…
## $ ChestPainType <fct> ATA, NAP, ATA, ASY, NAP, NAP, ATA, ATA, ASY, ATA, NAP, …
## $ RestingBP <int> 140, 160, 130, 138, 150, 120, 130, 110, 140, 120, 130, …
## $ Cholesterol <int> 289, 180, 283, 214, 195, 339, 237, 208, 207, 284, 211, …
## $ FastingBS <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ RestingECG <fct> Normal, Normal, ST, Normal, Normal, Normal, Normal, Nor…
## $ MaxHR <int> 172, 156, 98, 108, 122, 170, 170, 142, 130, 120, 142, 9…
## $ ExerciseAngina <fct> N, N, N, Y, N, N, N, N, Y, N, N, Y, N, Y, N, N, N, N, N…
## $ Oldpeak <dbl> 0.0, 1.0, 0.0, 1.5, 0.0, 0.0, 0.0, 0.0, 1.5, 0.0, 0.0, …
## $ ST_Slope <fct> Up, Flat, Up, Flat, Up, Up, Up, Up, Flat, Up, Up, Flat,…
## $ HeartDisease <int> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1…
set.seed(5346)
#Descriptive statistics for numberic values
heart %>%
select(where(is.numeric)) %>%
describe()
# Count unique values for categorical variables
heart %>%
summarise(across(where(is.character), n_distinct))
In the first step of the analysis, we loaded the necessary R libraries and the dataset. We discovered that the dataset comprises 918 rows and 12 variables, including 5 categorical and the remainder numeric. There were no missing values, which simplified our initial data preparation. By examining the first few rows and calculating descriptive statistics for numeric values, we gained a foundational understanding of the data’s distribution. We also counted the unique entries for categorical variables, providing insights into the diversity of the dataset.
EXPLORATORY DATA ANALYSIS
In this step, we’ll do:
1.Distribution of numeric values 2.Counts of categorical values 3.Check for outliers (with box plots) 4.Correlation plot 5.Distribution of the target varibale
# Create distribution plots for numeric variables
options(repr.plot.width=12, repr.plot.height=7)
heart %>%
select(where(is.numeric)) %>%
select(-HeartDisease) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value, fill = variable)) +
facet_wrap(~ variable, scales = "free") +
geom_histogram(bins = 30, alpha = 0.7) +
labs(title = "Distribution Plots for Numeric Variables") +
scale_fill_viridis_d() +
theme_minimal() +
guides(fill = "none")
# Create box plots for numeric variables by the 'HeartDisease' category
heart %>%
mutate(HeartDisease = factor(HeartDisease)) %>%
select(HeartDisease, where(is.numeric)) %>%
pivot_longer(cols = where(is.numeric), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = HeartDisease, y = value, fill = HeartDisease)) +
geom_boxplot() +
facet_wrap(~ variable, scales = "free") +
labs(x = "Heart Disease", y = "Value", title = "Box Plots for Numeric Variables by Heart Disease") +
scale_fill_viridis_d() +
theme_minimal()
# Find correlated variables
heart %>%
select_if(is.numeric) %>%
cor()
## Age RestingBP Cholesterol FastingBS MaxHR
## Age 1.00000000 0.25439936 -0.09528177 0.19803907 -0.3820447
## RestingBP 0.25439936 1.00000000 0.10089294 0.07019334 -0.1121350
## Cholesterol -0.09528177 0.10089294 1.00000000 -0.26097433 0.2357924
## FastingBS 0.19803907 0.07019334 -0.26097433 1.00000000 -0.1314385
## MaxHR -0.38204468 -0.11213500 0.23579240 -0.13143849 1.0000000
## Oldpeak 0.25861154 0.16480304 0.05014811 0.05269786 -0.1606906
## HeartDisease 0.28203851 0.10758898 -0.23274064 0.26729119 -0.4004208
## Oldpeak HeartDisease
## Age 0.25861154 0.2820385
## RestingBP 0.16480304 0.1075890
## Cholesterol 0.05014811 -0.2327406
## FastingBS 0.05269786 0.2672912
## MaxHR -0.16069055 -0.4004208
## Oldpeak 1.00000000 0.4039507
## HeartDisease 0.40395072 1.0000000
# Correlation plot
heart %>%
select_if(is.numeric) %>%
ggcorr(label = TRUE, label_size = 3)
# Create distribution of HeartDisease
options(repr.plot.width=7, repr.plot.height=7)
ggplot(heart, aes(x = factor(HeartDisease), fill = factor(HeartDisease))) +
geom_bar() +
labs(x = "HeartDisease", y = "Count", fill = "HeartDisease", title = "Distribution of HeartDisease") +
scale_fill_viridis_d() +
theme_minimal()
Distribution of Numeric Values: indicate that ‘Age’ and ‘MaxHR’ variables have a fairly normal distribution. The other variable shows a skewed distribution.
Counts of Categorical Values: The bar charts for categorical data reveal that certain categories dominate, like the ‘ASY’ (asymptomatic) type of chest pain and a higher prevalence of male patients. These imbalances may influence the model’s performance and should be considered in analysis and model training.
Outlier Detection: The box plots show that most of the numerical variables have outliers. These outliers may represent atypical cases or errors in data entry. However, they could also indicate true extremes that are clinically significant.
Correlation Plot: The correlation heatmap suggests some variables have a moderate positive or negative correlation with the occurrence of heart disease. For instance, ‘Oldpeak’ shows a positive correlation with the presence of heart disease, whereas ‘MaxHR’ appears to have a negative correlation. There are no highly correlated variables to be removed on feature ingineering step.
Distribution of the Target Variable (‘HeartDisease’): The final bar chart highlights a slight imbalance in the distribution of the target variable, with a larger number of patients not having heart disease compared to those who do. This suggests the need for careful consideration of class distribution when training predictive models to ensure they are not biased towards the majority class.
By interpreting these insights, we can make informed decisions on data preprocessing, such as outlier treatment and feature selection, which will ultimately contribute to building a more accurate heart failure prediction model.
DATA PREPARATION
In this step we’ll:
1.Convert the outcome variable to a factor type 2.Split the data to training and test sets 3.Normalize the data 4.Create dummy variables
# Convert outcome variable to factor and check the order
heart$HeartDisease <- as.factor(heart$HeartDisease)
levels(heart[['HeartDisease']])
## [1] "0" "1"
# Split data to training and test sets
heart_split <- initial_split(heart,
prop = 0.7,
strata = HeartDisease)
heart_training <- heart_split %>% training()
heart_test <- heart_split %>% testing()
nrow(heart_training)
## [1] 642
nrow(heart_test)
## [1] 276
# Normalize and remove corelated numeric variables and create dummy for nominal variables
heart_recipe <- recipe(HeartDisease~.,
data = heart_training) %>%
step_corr(all_numeric(), threshold = 0.8) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(),-all_outcomes())
heart_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 11
##
## ── Operations
## • Correlation filter on: all_numeric()
## • Centering and scaling for: all_numeric()
## • Dummy variables from: all_nominal() and -all_outcomes()
# Apply transformation recipe to training and test sets
heart_recipe_prep <- heart_recipe %>%
prep(training = heart_training)
heart_training_prep <- heart_recipe_prep %>%
bake(new_data = NULL)
heart_test_prep <- heart_recipe_prep %>%
bake(new_data = heart_test)
head(heart_training_prep)
# Logistic regression model setup
glm_model <- logistic_reg() %>%
set_engine('glm') %>%
set_mode('classification')
# Fit the model
glm_fit <- glm_model %>%
fit(HeartDisease ~ .,
data = heart_training_prep)
# Check the fitted model
glm_fit
## parsnip model object
##
##
## Call: stats::glm(formula = HeartDisease ~ ., family = stats::binomial,
## data = data)
##
## Coefficients:
## (Intercept) Age RestingBP Cholesterol
## -0.81158 0.26648 0.15818 -0.55688
## FastingBS MaxHR Oldpeak Sex_M
## 0.48942 -0.04187 0.45310 1.51211
## ChestPainType_ATA ChestPainType_NAP ChestPainType_TA RestingECG_Normal
## -1.88456 -1.60445 -1.44373 -0.28973
## RestingECG_ST ExerciseAngina_Y ST_Slope_Flat ST_Slope_Up
## -0.17532 0.98385 1.80001 -0.55787
##
## Degrees of Freedom: 641 Total (i.e. Null); 626 Residual
## Null Deviance: 882.8
## Residual Deviance: 400.9 AIC: 432.9
# Logistic regression model predictions on test data
glm_class_pred <- predict(glm_fit,
new_data = heart_test_prep,
type = 'class')
glm_prob_pred <- predict(glm_fit,
new_data = heart_test_prep,
type = 'prob')
glm_results <- heart_test %>%
select(HeartDisease) %>%
bind_cols(glm_class_pred, glm_prob_pred)
# Performance metrics for logistic regression model
conf_mat(glm_results,
truth = HeartDisease,
estimate = .pred_class)
## Truth
## Prediction 0 1
## 0 97 12
## 1 26 141
glm_metric_spec <-
metric_set(accuracy, sens, spec, roc_auc)
glm_metrics <- glm_metric_spec(glm_results,
truth = HeartDisease,
estimate = .pred_class, .pred_0)
glm_metrics
# Logistic regression model ROC Curve
glm_results %>%
roc_curve(truth = HeartDisease, .pred_0) %>%
autoplot()
The Logistic Regression Model showed:
.accuracy of 0.8478261 .sensitivity of 0.8536585 .specificity of 0.8431373 .roc_auc of 0.9207716
K-Nearest Neighbors
# K-nearest neighbors model
knn_model <- nearest_neighbor() %>%
set_engine('kknn') %>%
set_mode('classification')
knn_fit <- knn_model %>%
fit(HeartDisease~.,
data = heart_training_prep)
knn_fit
## parsnip model object
##
##
## Call:
## kknn::train.kknn(formula = HeartDisease ~ ., data = data, ks = min_rows(5, data, 5))
##
## Type of response variable: nominal
## Minimal misclassification: 0.1323988
## Best kernel: optimal
## Best k: 5
# K-nearest neighbors model predictions on test data
knn_class_pred <- predict(knn_fit,
new_data = heart_test_prep,
type = 'class')
knn_prob_pred <- predict(knn_fit,
new_data = heart_test_prep,
type = 'prob')
knn_results <- heart_test %>%
select(HeartDisease) %>%
bind_cols(knn_class_pred, knn_prob_pred)
# Performance metrics for K-nearest neighbors model
conf_mat(knn_results,
truth = HeartDisease,
estimate = .pred_class)
## Truth
## Prediction 0 1
## 0 100 20
## 1 23 133
knn_metric_spec <-
metric_set(accuracy, sens, spec, roc_auc)
knn_metrics <- knn_metric_spec(knn_results,
truth = HeartDisease,
estimate = .pred_class, .pred_0)
knn_metrics
# Logistic regression model ROC Curve
knn_results %>%
roc_curve(truth = HeartDisease, .pred_0) %>%
autoplot()
# Decision tree model
dt_model <- decision_tree() %>%
set_engine('rpart') %>%
set_mode('classification')
dt_fit <- dt_model %>%
fit(HeartDisease~.,
data = heart_training_prep)
dt_fit
## parsnip model object
##
## n= 642
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 642 287 1 (0.4470405 0.5529595)
## 2) ST_Slope_Up>=0.5 278 57 0 (0.7949640 0.2050360)
## 4) Cholesterol>=-1.273403 248 32 0 (0.8709677 0.1290323) *
## 5) Cholesterol< -1.273403 30 5 1 (0.1666667 0.8333333) *
## 3) ST_Slope_Up< 0.5 364 66 1 (0.1813187 0.8186813)
## 6) MaxHR>=0.7463196 42 17 0 (0.5952381 0.4047619)
## 12) RestingBP< 0.2875961 33 10 0 (0.6969697 0.3030303)
## 24) Oldpeak>=-0.780126 24 4 0 (0.8333333 0.1666667) *
## 25) Oldpeak< -0.780126 9 3 1 (0.3333333 0.6666667) *
## 13) RestingBP>=0.2875961 9 2 1 (0.2222222 0.7777778) *
## 7) MaxHR< 0.7463196 322 41 1 (0.1273292 0.8726708) *
# Decision tree model predictions on test data
dt_class_pred <- predict(dt_fit,
new_data = heart_test_prep,
type = 'class')
dt_prob_pred <- predict(dt_fit,
new_data = heart_test_prep,
type = 'prob')
dt_results <- heart_test %>%
select(HeartDisease) %>%
bind_cols(dt_class_pred, dt_prob_pred)
# Performance metrics for decision tree model
conf_mat(dt_results,
truth = HeartDisease,
estimate = .pred_class)
## Truth
## Prediction 0 1
## 0 89 19
## 1 34 134
dt_metric_spec <-
metric_set(accuracy, sens, spec, roc_auc)
dt_metrics <- dt_metric_spec(dt_results,
truth = HeartDisease,
estimate = .pred_class, .pred_0)
dt_metrics
# Decision tree model ROC Curve
dt_results %>%
roc_curve(truth = HeartDisease, .pred_0) %>%
autoplot()
random forest
# Random forest model
rf_model <- rand_forest() %>%
set_engine('ranger') %>%
set_mode('classification')
rf_fit <- rf_model %>%
fit(HeartDisease~.,
data = heart_training_prep)
rf_fit
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 642
## Number of independent variables: 15
## Mtry: 3
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1010112
# Random forest model predictions on test data
rf_class_pred <- predict(rf_fit,
new_data = heart_test_prep,
type = 'class')
rf_prob_pred <- predict(rf_fit,
new_data = heart_test_prep,
type = 'prob')
rf_results <- heart_test %>%
select(HeartDisease) %>%
bind_cols(rf_class_pred, rf_prob_pred)
# Performance metrics for random forest model
conf_mat(rf_results,
truth = HeartDisease,
estimate = .pred_class)
## Truth
## Prediction 0 1
## 0 97 13
## 1 26 140
rf_metric_spec <-
metric_set(accuracy, sens, spec, roc_auc)
rf_metrics <- rf_metric_spec(rf_results,
truth = HeartDisease,
estimate = .pred_class, .pred_0)
rf_metrics
# Decision tree model ROC Curve
rf_results %>%
roc_curve(truth = HeartDisease, .pred_0) %>%
autoplot()
MODELS PERFORMANCE COMPARISON
# Model results comparison
combined_metrics <- bind_cols(
select(glm_metrics, .metric),
select(glm_metrics, .estimate) %>% rename(glm_results = .estimate),
rename(select(knn_metrics, .estimate), knn_results = .estimate),
rename(select(dt_metrics, .estimate), dt_results = .estimate),
rename(select(rf_metrics, .estimate), rf_results = .estimate)
)
combined_metrics
# Reshape data from wide to long format
combined_metrics_long <- pivot_longer(combined_metrics,
cols = c(glm_results,knn_results, dt_results, rf_results),
names_to = "model", values_to = "value")
options(repr.plot.width=12, repr.plot.height=10)
# Create faceted bar plots
ggplot(combined_metrics_long, aes(x = model, y = value, fill = model)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~.metric, scales = "free_y") +
labs(x = "Model", y = "Value", fill = "Model") +
scale_fill_viridis_d() +
theme_minimal() +
guides(fill = "none")
CONCLUSION
The goal of this kernel was to determine what models are the best performing on ‘Heart Failure’ dataset.
Overall, the Random Forest model showed the best results in terms of accuracy, specificity and AUROC.
The Logistic Regression and KNN Models showed the highest sensitivity numbers.