library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
heart_data<- read_excel("data.xlsx")
head(5)
## [1] 5
# Transform continuous variables that should be factors
heart_data_transformed <- heart_data %>%
mutate(
sex = factor(sex, levels = 0:1,labels = c("female", "male")),
cp = factor(cp, levels = 0:3, labels = c("non-anginal pain", "atypical angina", "typical angina", "asymptomatic")),
fbs = factor(fbs, levels = 0:1, labels = c("false", "true")),
restecg = factor(restecg, levels = 0:2, labels = c("normal", "ST-T wave abnormality", "left ventricular hypertrophy")),
exang = factor(exang, levels = 0:1, labels = c("no", "yes")),
slope = factor(slope, levels = 0:2, labels = c("slope 0", "slope 1", "slope 2")),
thal = factor(thal, levels = 1:3, labels = c("normal", "fixed defect", "reversible defect")),
target = factor(target, levels = 0:1, labels = c("no disease", "disease")),
ca = factor(ca)
)
# Print the transformed dataset
print(heart_data_transformed)
## # A tibble: 303 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <fct> <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl> <fct>
## 1 63 male asympt… 145 233 true normal 150 no 2.3 slop…
## 2 37 male typica… 130 250 false ST-T w… 187 no 3.5 slop…
## 3 41 female atypic… 130 204 false normal 172 no 1.4 slop…
## 4 56 male atypic… 120 236 false ST-T w… 178 no 0.8 slop…
## 5 57 female non-an… 120 354 false ST-T w… 163 yes 0.6 slop…
## 6 57 male non-an… 140 192 false ST-T w… 148 no 0.4 slop…
## 7 56 female atypic… 140 294 false normal 153 no 1.3 slop…
## 8 44 male atypic… 120 263 false ST-T w… 173 no 0 slop…
## 9 52 male typica… 172 199 true ST-T w… 162 no 0.5 slop…
## 10 57 male typica… 150 168 false ST-T w… 174 no 1.6 slop…
## # ℹ 293 more rows
## # ℹ 3 more variables: ca <fct>, thal <fct>, target <fct>
str(heart_data_transformed)
## tibble [303 × 14] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:303] 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 2 2 2 ...
## $ cp : Factor w/ 4 levels "non-anginal pain",..: 4 3 2 2 1 1 2 2 3 3 ...
## $ trestbps: num [1:303] 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : num [1:303] 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : Factor w/ 2 levels "false","true": 2 1 1 1 1 1 1 1 2 1 ...
## $ restecg : Factor w/ 3 levels "normal","ST-T wave abnormality",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ thalach : num [1:303] 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ oldpeak : num [1:303] 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : Factor w/ 3 levels "slope 0","slope 1",..: 1 1 3 3 3 2 2 3 3 3 ...
## $ ca : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ thal : Factor w/ 3 levels "normal","fixed defect",..: 1 2 2 2 2 1 2 3 3 2 ...
## $ target : Factor w/ 2 levels "no disease","disease": 2 2 2 2 2 2 2 2 2 2 ...
# Check for missing values
missing_values <- heart_data_transformed %>%
summarise_all(~ sum(is.na(.)))
missing_values
## # A tibble: 1 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0 0 0 0 0
## # ℹ 3 more variables: ca <int>, thal <int>, target <int>
#checking duolicates
duplicates <- heart_data_transformed[duplicated(heart_data_transformed), ]
duplicates
## # A tibble: 1 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <fct> <fct> <dbl> <dbl> <fct> <fct> <dbl> <fct> <dbl> <fct>
## 1 38 male typical … 138 175 false ST-T w… 173 no 0 slop…
## # ℹ 3 more variables: ca <fct>, thal <fct>, target <fct>
# Preliminary statistical summary
summary(heart_data_transformed)
## age sex cp trestbps
## Min. :29.00 female: 96 non-anginal pain:143 Min. : 94.0
## 1st Qu.:47.50 male :207 atypical angina : 50 1st Qu.:120.0
## Median :55.00 typical angina : 87 Median :130.0
## Mean :54.37 asymptomatic : 23 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:140.0
## Max. :77.00 Max. :200.0
## chol fbs restecg thalach
## Min. :126.0 false:258 normal :147 Min. : 71.0
## 1st Qu.:211.0 true : 45 ST-T wave abnormality :152 1st Qu.:133.5
## Median :240.0 left ventricular hypertrophy: 4 Median :153.0
## Mean :246.3 Mean :149.6
## 3rd Qu.:274.5 3rd Qu.:166.0
## Max. :564.0 Max. :202.0
## exang oldpeak slope ca thal
## no :204 Min. :0.00 slope 0: 21 0:175 normal : 20
## yes: 99 1st Qu.:0.00 slope 1:140 1: 65 fixed defect :166
## Median :0.80 slope 2:142 2: 38 reversible defect:117
## Mean :1.04 3: 20
## 3rd Qu.:1.60 4: 5
## Max. :6.20
## target
## no disease:138
## disease :165
##
##
##
##
# Explore categorical variables using count plots
ggplot(heart_data_transformed, aes(x = cp, fill = cp)) +
geom_bar() +
labs(title = "Chest Pain Type Distribution")

ggplot(heart_data_transformed, aes(x = fbs, fill = fbs)) +
geom_bar() +
labs(title = "Fasting Blood Sugar Distribution")

ggplot(heart_data_transformed, aes(x = restecg, fill = restecg)) +
geom_bar() +
labs(title = "Resting ECG Distribution")

ggplot(heart_data_transformed, aes(x = exang, fill = exang)) +
geom_bar() +
labs(title = "Exercise Induced Angina Distribution")

ggplot(heart_data_transformed, aes(x = slope, fill = slope)) +
geom_bar() +
labs(title = "Slope Distribution")

ggplot(heart_data_transformed, aes(x = ca, fill = ca)) +
geom_bar() +
labs(title = "Number of Major Vessels Distribution")

ggplot(heart_data_transformed, aes(x = thal, fill = thal)) +
geom_bar() +
labs(title = "Thal Distribution")

ggplot(heart_data_transformed, aes(x = sex, fill = sex)) +
geom_bar() +
labs(title = "Composition of Patients by Gender")

# Study the occurrence of CVD across different ages
ggplot(heart_data_transformed, aes(x = age, fill = target)) +
geom_histogram(binwidth = 5, position = "dodge") +
labs(title = "Occurrence of CVD Across Ages", x = "Age", y = "Count", fill = "CVD")

# Study the composition of patients by gender
ggplot(heart_data_transformed, aes(x = sex, fill = target)) +
geom_bar() +
labs(title = "Composition of Patients by Gender", fill = "CVD")

# Detect heart attack based on anomalies in resting blood pressure
ggplot(heart_data_transformed, aes(x = target, y = trestbps)) +
geom_boxplot() +
labs(title = "Detecting Heart Attack Based on Resting Blood Pressure", x = "Heart Attack", y = "Resting Blood Pressure")

# Relationship between cholesterol levels and target variable
ggplot(heart_data_transformed, aes(x = chol, fill = target)) +
geom_histogram(binwidth = 10, position = "dodge") +
labs(title = "Cholesterol Levels vs. CVD Occurrence", x = "Cholesterol Levels", y = "Count", fill = "CVD")

# Relationship between peak exercising and occurrence of heart attack
ggplot(heart_data_transformed, aes(x = target, y = oldpeak)) +
geom_boxplot() +
labs(title = "Peak Exercise ST Depression vs. Heart Attack", x = "Heart Attack", y = "ST Depression")

# Thalassemia and CVD relationship
ggplot(heart_data_transformed, aes(x = thal, fill = target)) +
geom_bar() +
labs(title = "Thalassemia and CVD Relationship", fill = "CVD")

# Use logistic regression to explore relationship between factors and CVD occurrence
log_reg_model <- glm(target ~ ., data = heart_data_transformed, family = "binomial")
summary(log_reg_model)
##
## Call:
## glm(formula = target ~ ., family = "binomial", data = heart_data_transformed)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9404 -0.2750 0.1042 0.4539 3.1088
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.625315 2.963517 0.886 0.375683
## age 0.027403 0.025394 1.079 0.280524
## sexmale -1.828525 0.571743 -3.198 0.001383 **
## cpatypical angina 0.884012 0.576235 1.534 0.125001
## cptypical angina 2.008412 0.532184 3.774 0.000161 ***
## cpasymptomatic 2.413208 0.717417 3.364 0.000769 ***
## trestbps -0.025479 0.011880 -2.145 0.031971 *
## chol -0.004082 0.004240 -0.963 0.335663
## fbstrue 0.341972 0.574744 0.595 0.551844
## restecgST-T wave abnormality 0.446960 0.399511 1.119 0.263240
## restecgleft ventricular hypertrophy -0.714066 2.772148 -0.258 0.796727
## thalach 0.019191 0.011778 1.629 0.103229
## exangyes -0.807201 0.450351 -1.792 0.073071 .
## oldpeak -0.390368 0.241556 -1.616 0.106082
## slopeslope 1 -0.819597 0.875754 -0.936 0.349337
## slopeslope 2 0.645486 0.945384 0.683 0.494748
## ca1 -2.298937 0.524000 -4.387 1.15e-05 ***
## ca2 -3.425023 0.806938 -4.244 2.19e-05 ***
## ca3 -2.203505 0.934782 -2.357 0.018411 *
## ca4 1.315756 1.743752 0.755 0.450516
## thalfixed defect -0.049327 0.801377 -0.062 0.950918
## thalreversible defect -1.498076 0.786286 -1.905 0.056747 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 180.76 on 281 degrees of freedom
## AIC: 224.76
##
## Number of Fisher Scoring iterations: 6
# Pair plot to understand relationship between all variables
pairs(heart_data_transformed)

# Build baseline logistic regression model
baseline_model <- glm(target ~ ., data = heart_data, family = "binomial")
summary(baseline_model)
##
## Call:
## glm(formula = target ~ ., family = "binomial", data = heart_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6018 -0.3901 0.1549 0.5824 2.6399
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.584930 2.581363 1.389 0.164902
## age -0.004633 0.023261 -0.199 0.842117
## sex -1.736286 0.469927 -3.695 0.000220 ***
## cp 0.858124 0.185604 4.623 3.77e-06 ***
## trestbps -0.019687 0.010350 -1.902 0.057155 .
## chol -0.004599 0.003779 -1.217 0.223556
## fbs 0.056954 0.528307 0.108 0.914150
## restecg 0.486371 0.349920 1.390 0.164545
## thalach 0.023577 0.010486 2.249 0.024544 *
## exang -0.962881 0.410811 -2.344 0.019086 *
## oldpeak -0.544410 0.214936 -2.533 0.011313 *
## slope 0.575268 0.351105 1.638 0.101328
## ca -0.780812 0.191813 -4.071 4.69e-05 ***
## thal -0.986962 0.297936 -3.313 0.000924 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 210.14 on 289 degrees of freedom
## AIC: 238.14
##
## Number of Fisher Scoring iterations: 6
# Variable importance chart based on the logistic regression model
var_importance <- data.frame(variable = names(coef(baseline_model)), importance = abs(coef(baseline_model)))
var_importance <- var_importance %>%
arrange(desc(importance))
ggplot(var_importance, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance for CVD Prediction", x = "Variable", y = "Importance")

# Split data into training and testing sets
set.seed(123) # For reproducibility
sample_indices <- sample(1:nrow(heart_data), 0.7 * nrow(heart_data))
train_data <- heart_data[sample_indices, ]
test_data <- heart_data[-sample_indices, ]
# Build logistic regression model on training data
model <- glm(target ~ ., data = train_data, family = "binomial")
# Predict on test data
test_predictions <- predict(model, newdata = test_data, type = "response")
test_predictions <- ifelse(test_predictions > 0.5, 1, 0)
# Calculate accuracy
accuracy <- sum(test_predictions == test_data$target) / nrow(test_data)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 0.846153846153846"