Analysis of Cardiovascular Disease Risk Factors

#Load Libraries
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.5.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(corrplot)

## corrplot 0.95 loaded

library(rpart)
library(rpart.plot)
library(randomForest)

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Loaded data from https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data

# Read the UCI heart-disease extract. By default read.csv() keeps blank cells
# in text columns as "" rather than NA, so they would escape is.na() and the
# imputation step; declare blanks as missing explicitly.
df <- read.csv(
  "https://raw.githubusercontent.com/Andreina-A/Data698/refs/heads/main/heart_disease_uci.csv",
  na.strings = c("", "NA")
)

# View structure
str(df)

## 'data.frame':    920 obs. of  16 variables:
##  $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age     : int  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : chr  "Male" "Male" "Male" "Male" ...
##  $ dataset : chr  "Cleveland" "Cleveland" "Cleveland" "Cleveland" ...
##  $ cp      : chr  "typical angina" "asymptomatic" "asymptomatic" "non-anginal" ...
##  $ trestbps: int  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : int  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : logi  TRUE FALSE FALSE FALSE FALSE FALSE ...
##  $ restecg : chr  "lv hypertrophy" "lv hypertrophy" "lv hypertrophy" "normal" ...
##  $ thalch  : int  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : logi  FALSE TRUE TRUE FALSE FALSE FALSE ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : chr  "downsloping" "flat" "flat" "downsloping" ...
##  $ ca      : int  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal    : chr  "fixed defect" "normal" "reversable defect" "normal" ...
##  $ num     : int  0 2 1 0 0 0 3 0 2 1 ...

Columns: id (unique id for each patient); age (age of the patient in years); dataset (origin, i.e. place of study); sex (Male/Female); cp (chest pain type: typical angina, atypical angina, non-anginal, asymptomatic); trestbps (resting blood pressure in mm Hg on admission to the hospital); chol (serum cholesterol in mg/dl); fbs (whether fasting blood sugar > 120 mg/dl); restecg (resting electrocardiographic results: normal, stt abnormality, lv hypertrophy); thalch (maximum heart rate achieved); exang (exercise-induced angina, True/False); oldpeak (ST depression induced by exercise relative to rest); slope (the slope of the peak exercise ST segment); ca (number of major vessels, 0-3, colored by fluoroscopy); thal (normal, fixed defect, reversible defect); num (the predicted attribute). For num, 0 means no heart disease (absence of disease). The exact meaning of values 1 through 4 can depend on the specific dataset, but they generally indicate the extent or severity of the disease: 1 = mild heart disease, 2 = moderate heart disease, 3 = severe heart disease, and 4 = very severe heart disease.

License: Data files © Original Authors

Citation Request: The authors of the databases have requested that any publications resulting from the use of the data include the names of the principal investigator responsible for the data collection at each institution. They would be:

Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation:Robert Detrano, M.D., Ph.D.

# Count the missing values in each column
df |>
  is.na() |>
  colSums()

##       id      age      sex  dataset       cp trestbps     chol      fbs 
##        0        0        0        0        0       59       30       90 
##  restecg   thalch    exang  oldpeak    slope       ca     thal      num 
##        0       55       55       62        0      611        0        0

Seven columns have missing data. I used simple imputation: missing numeric values are replaced with the column median, and missing categorical values are replaced with the column mode.

# Simple imputation: numeric columns are filled with their median, every other
# column with its most frequent observed value (the mode).
for (col in names(df)) {
  column <- df[[col]]
  missing <- is.na(column)

  if (is.numeric(column)) {
    fill <- median(column, na.rm = TRUE)
  } else {
    # table() only exposes the values as character names. Cast the mode back
    # to logical where needed; assigning the text "TRUE"/"FALSE" would
    # silently turn the whole logical column into character.
    fill <- names(sort(table(column), decreasing = TRUE))[1]
    if (is.logical(column)) {
      fill <- as.logical(fill)
    }
  }

  df[[col]][missing] <- fill
}

# Re-count missing values per column to confirm the imputation filled them
df |>
  is.na() |>
  colSums()

##       id      age      sex  dataset       cp trestbps     chol      fbs 
##        0        0        0        0        0        0        0        0 
##  restecg   thalch    exang  oldpeak    slope       ca     thal      num 
##        0        0        0        0        0        0        0        0

I converted the target variable “num” into a binary variable: instead of using all heart disease stages, no heart disease is coded as 0 and all other stages are combined into 1, which indicates heart disease.

# Convert the target column into a binary factor:
# 0 = no heart disease, 1 = any non-zero stage of heart disease.
df$num <- factor(ifelse(df$num == 0, 0, 1))

# Convert every categorical column to a factor. `dataset` and `thal` are
# included so that all models treat them as categories, consistent with the
# other categorical predictors, instead of leaving them as raw text.
categorical_cols <- c(
  "sex", "dataset", "cp", "fbs", "restecg", "exang", "slope", "thal"
)
df[categorical_cols] <- lapply(df[categorical_cols], as.factor)

# Train/test split: stratified 80/20 partition on the target
set.seed(21)
trainIndex <- createDataPartition(df$num, p = 0.8, list = FALSE)

# `id` is a per-patient row identifier, not a risk factor. The models are fit
# with `num ~ .`, so drop it here to keep it out of the predictors.
model_df <- df[, setdiff(names(df), "id"), drop = FALSE]

train_df <- model_df[trainIndex, ]
test_df  <- model_df[-trainIndex, ]

Correlation Analysis

# Pairwise correlation between the numeric columns. `id` is a row identifier,
# so it is left out of the matrix (any_of() tolerates it being absent).
numeric_data <- df |>
  select(where(is.numeric), -any_of("id"))
cor_matrix <- cor(numeric_data)

# Upper-triangle heat map of the correlation matrix
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)

Logistic Regression Model

# Fit a logistic regression of `num` on all other training columns
log_model <- glm(num ~ ., data = train_df, family = "binomial")

# Predicted probabilities on the response scale for the test set
log_probs <- predict(log_model, test_df, type = "response")

# Classify at a 0.5 cut-off. The levels are fixed so both classes stay in the
# factor even if every prediction lands on one side of the cut-off;
# confusionMatrix() needs predictions and reference to share the same levels.
log_pred <- factor(ifelse(log_probs > 0.5, 1, 0), levels = c(0, 1))

Decision Tree Model

# Grow a classification tree on the training data and draw it
DT_model <- rpart(num ~ ., data = train_df, method = "class")
rpart.plot(DT_model)

# Hard class labels for the test set
DT_pred <- predict(DT_model, newdata = test_df, type = "class")

# Per-class probabilities; keep the second column (class "1")
DT_class_probs <- predict(DT_model, newdata = test_df, type = "prob")
DT_probs <- DT_class_probs[, 2]

Random Forest model

# Random forest with 100 trees fit on the training data
rf_model <- randomForest(num ~ ., data = train_df, ntree = 100)

# Hard class labels for the test set
rf_pred <- predict(rf_model, newdata = test_df)

# Per-class vote fractions; keep the second column (class "1")
rf_class_probs <- predict(rf_model, newdata = test_df, type = "prob")
rf_probs <- rf_class_probs[, 2]

Evaluation

# Evaluation function
#
# Scores one classifier against reference labels.
#
# Arguments:
#   true  - factor of reference labels whose levels are "0" then "1";
#           "1" is treated as the positive class.
#   pred  - predicted class labels, coercible to the levels of `true`.
#   probs - numeric scores where larger values mean class "1" is more likely.
#
# Returns a named numeric vector with Accuracy, Precision, Recall, F1_Score
# and ROC_AUC.
evaluate_model <- function(true, pred, probs) {
  stopifnot(is.factor(true))

  # Align the predictions with the reference levels so a model that predicts
  # only one class still yields a full 2x2 confusion matrix.
  pred <- factor(pred, levels = levels(true))

  cm <- confusionMatrix(data = pred, reference = true, positive = "1")

  # Recode the factor to 0/1 (relies on "0" being the first level).
  true_numeric <- as.numeric(true) - 1

  # State the control/case levels and the direction explicitly rather than
  # letting roc() infer them from the data: an auto-detected direction can
  # flip the curve and overstate the AUC of a model that ranks worse than
  # chance.
  roc_obj <- roc(true_numeric, probs, levels = c(0, 1), direction = "<")
  roc_auc <- as.numeric(auc(roc_obj))

  c(
    Accuracy = unname(cm$overall["Accuracy"]),
    Precision = unname(cm$byClass["Precision"]),
    Recall = unname(cm$byClass["Recall"]),
    F1_Score = unname(cm$byClass["F1"]),
    ROC_AUC = roc_auc
  )
}

# Score the logistic regression on the held-out test set
log_results <- evaluate_model(
  true = test_df$num,
  pred = log_pred,
  probs = log_probs
)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Score the decision tree on the held-out test set
DT_results <- evaluate_model(
  true = test_df$num,
  pred = DT_pred,
  probs = DT_probs
)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Score the random forest on the held-out test set
rf_results <- evaluate_model(
  true = test_df$num,
  pred = rf_pred,
  probs = rf_probs
)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Stack the per-model metric vectors into one comparison table
# (one row per model, one column per metric)
metric_rows <- list(
  Logistic_Regression = log_results,
  Decision_Tree       = DT_results,
  Random_Forest       = rf_results
)
results <- do.call(rbind, metric_rows)
print(results)

##                      Accuracy Precision    Recall  F1_Score   ROC_AUC
## Logistic_Regression 0.7978142 0.8200000 0.8118812 0.8159204 0.9054576
## Decision_Tree       0.8743169 0.8900000 0.8811881 0.8855721 0.9018957
## Random_Forest       0.8579235 0.8865979 0.8514851 0.8686869 0.9437938

# Decision tree ROC curve. Levels and direction are stated explicitly so
# roc() does not infer them from the data.
plot(
  roc(test_df$num, DT_probs, levels = c("0", "1"), direction = "<"),
  col = "blue"
)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Random forest ROC curve, overlaid on the current plot. Levels and direction
# are stated explicitly so roc() does not infer them from the data.
plot(
  roc(test_df$num, rf_probs, levels = c("0", "1"), direction = "<"),
  col = "red",
  add = TRUE
)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Logistic regression ROC curve, overlaid on the current plot. Levels and
# direction are stated explicitly so roc() does not infer them from the data.
plot(
  roc(test_df$num, log_probs, levels = c("0", "1"), direction = "<"),
  col = "green",
  add = TRUE
)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Key for the ROC curves: model label -> line colour
curve_colours <- c(
  "Decision Tree" = "blue",
  "Random Forest" = "red",
  "Logistic Regression" = "green"
)
legend(
  "bottomright",
  legend = names(curve_colours),
  col = unname(curve_colours),
  lwd = 2
)

Analysis of Cardiovascular Disease Risk Factors

Andreina Arias

2026-04-01

Logistic Regression Model

Decision Tree Model

Random Forest model

Evaluation