library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ppcor)
## Warning: package 'ppcor' was built under R version 4.2.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.3
library(cluster)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(MASS)
We load the dataset with the read_excel() function to import the heart disease data into R for analysis.
# Print the current working directory for reference.
getwd()
## [1] "C:/Users/Kareem/Documents"
# Import the heart-disease workbook and convert the categorical columns
# (including the HeartDisease outcome) to factors in a single step.
data <- read_excel("C:/Users/kareem/Downloads/heart.xlsx") %>%
  mutate(across(
    c(HeartDisease, Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope),
    as.factor
  ))
# Preview the first rows of the prepared data.
head(data)
## # A tibble: 6 × 12
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 40 M ATA 140 289 0 Normal 172
## 2 49 F NAP 160 180 0 Normal 156
## 3 37 M ATA 130 283 0 ST 98
## 4 48 F ASY 138 214 0 Normal 108
## 5 54 M NAP 150 195 0 Normal 122
## 6 39 M NAP 120 339 0 Normal 170
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## # HeartDisease <fct>
# Number of rows and columns in the loaded data.
dim(data)
## [1] 918 12
# Per-column summary: quantiles and mean for numeric columns, level counts for factors.
summary(data)
## Age Sex ChestPainType RestingBP Cholesterol
## Min. :28.00 F:193 ASY:496 Min. : 0.0 Min. : 0.0
## 1st Qu.:47.00 M:725 ATA:173 1st Qu.:120.0 1st Qu.:173.2
## Median :54.00 NAP:203 Median :130.0 Median :223.0
## Mean :53.51 TA : 46 Mean :132.4 Mean :198.8
## 3rd Qu.:60.00 3rd Qu.:140.0 3rd Qu.:267.0
## Max. :77.00 Max. :200.0 Max. :603.0
## FastingBS RestingECG MaxHR ExerciseAngina Oldpeak
## Min. :0.0000 LVH :188 Min. : 60.0 N:547 Min. :-2.6000
## 1st Qu.:0.0000 Normal:552 1st Qu.:120.0 Y:371 1st Qu.: 0.0000
## Median :0.0000 ST :178 Median :138.0 Median : 0.6000
## Mean :0.2331 Mean :136.8 Mean : 0.8874
## 3rd Qu.:0.0000 3rd Qu.:156.0 3rd Qu.: 1.5000
## Max. :1.0000 Max. :202.0 Max. : 6.2000
## ST_Slope HeartDisease
## Down: 63 0:410
## Flat:460 1:508
## Up :395
##
##
##
Calculates the average age for each heart disease group.
# Mean age within each HeartDisease level.
with(data, tapply(Age, HeartDisease, mean))
## 0 1
## 50.55122 55.89961
Performs ANOVA test to check if cholesterol differs by chest pain type.
# p-value of the one-way ANOVA F test of Cholesterol across ChestPainType.
chol_anova <- summary(aov(Cholesterol ~ ChestPainType, data = data))
chol_anova[[1]][["Pr(>F)"]][1]
## [1] 3.00616e-05
Chi-square test to check relationship between gender and heart disease.
# p-value of the chi-square test of independence between Sex and HeartDisease.
chisq.test(x = data$Sex, y = data$HeartDisease)$p.value
## [1] 4.597617e-20
Tests whether age data follows a normal distribution.
# p-value of the Shapiro-Wilk normality test on Age.
shapiro.test(data[["Age"]])[["p.value"]]
## [1] 2.165167e-05
Tests if variance of age is equal between heart disease groups.
# p-value of the F test comparing the variance of Age between the two
# HeartDisease levels ("0" first, "1" second).
age_by_status <- split(data$Age, data$HeartDisease)
var.test(age_by_status[["0"]], age_by_status[["1"]])$p.value
## [1] 0.09135174
# First five patients whose cholesterol is above, and MaxHR below, the mean
# of their own HeartDisease group.
data %>%
  group_by(HeartDisease) %>%
  filter(
    Cholesterol > mean(Cholesterol),
    MaxHR < mean(MaxHR)
  ) %>%
  dplyr::select(Age, Cholesterol, HeartDisease) %>%
  head(n = 5)
## # A tibble: 5 × 3
## # Groups: HeartDisease [2]
## Age Cholesterol HeartDisease
## <dbl> <dbl> <fct>
## 1 37 283 0
## 2 48 214 1
## 3 48 284 0
## 4 60 248 1
## 5 53 260 0
# Count patients by Sex and HeartDisease, then express each count as a
# percentage of its Sex group.
data %>%
  group_by(Sex, HeartDisease) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(percent = prop.table(n) * 100)
## # A tibble: 4 × 4
## # Groups: Sex [2]
## Sex HeartDisease n percent
## <fct> <fct> <int> <dbl>
## 1 F 0 143 74.1
## 2 F 1 50 25.9
## 3 M 0 267 36.8
## 4 M 1 458 63.2
# Three highest-cholesterol patients within each HeartDisease group
# (ties are kept, so a group can return more than three rows).
data %>%
  group_by(HeartDisease) %>%
  slice_max(order_by = Cholesterol, n = 3, with_ties = TRUE)
## # A tibble: 6 × 12
## # Groups: HeartDisease [2]
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 67 F NAP 115 564 0 LVH 160
## 2 53 F ATA 113 468 0 Normal 127
## 3 58 M ASY 132 458 1 Normal 69
## 4 54 M ASY 130 603 1 Normal 125
## 5 32 M ASY 118 529 0 Normal 130
## 6 53 M NAP 145 518 0 Normal 130
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## # HeartDisease <fct>
# First three rows per HeartDisease group whose cholesterol exceeds that
# group's own mean (the mean is computed within each group, not overall).
group_by(data, HeartDisease) %>%
  filter(Cholesterol > mean(Cholesterol)) %>%
  slice_head(n = 3)
## # A tibble: 6 × 12
## # Groups: HeartDisease [2]
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 40 M ATA 140 289 0 Normal 172
## 2 37 M ATA 130 283 0 ST 98
## 3 39 M NAP 120 339 0 Normal 170
## 4 49 F NAP 160 180 0 Normal 156
## 5 48 F ASY 138 214 0 Normal 108
## 6 37 M ASY 140 207 0 Normal 130
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## # HeartDisease <fct>
# Five lowest MaxHR patients within each HeartDisease group (ties are kept).
data %>%
  group_by(HeartDisease) %>%
  slice_min(order_by = MaxHR, n = 5, with_ties = TRUE)
## # A tibble: 10 × 12
## # Groups: HeartDisease [2]
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 58 M ASY 132 458 1 Normal 69
## 2 40 M NAP 106 240 0 Normal 80
## 3 62 M ASY 120 220 0 ST 86
## 4 62 M NAP 120 220 0 LVH 86
## 5 46 F ASY 130 238 0 Normal 90
## 6 51 M ASY 140 0 0 Normal 60
## 7 60 M ASY 135 0 0 Normal 63
## 8 65 M ASY 145 0 1 ST 67
## 9 61 M NAP 200 0 1 ST 70
## 10 67 M ASY 120 237 0 Normal 71
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## # HeartDisease <fct>
# Mean Age, Cholesterol, and MaxHR for each HeartDisease group,
# ignoring missing values.
data %>%
  group_by(HeartDisease) %>%
  summarise(across(
    c(Age, Cholesterol, MaxHR),
    function(x) mean(x, na.rm = TRUE)
  ))
## # A tibble: 2 × 4
## HeartDisease Age Cholesterol MaxHR
## <fct> <dbl> <dbl> <dbl>
## 1 0 50.6 227. 148.
## 2 1 55.9 176. 128.
# Horizontal bars of the five highest cholesterol values per HeartDisease
# group, labelled by patient age and ordered by cholesterol.
# NOTE(review): the bar category is Age, so patients who share an age are
# stacked into a single bar - confirm that is intended.
data %>%
  group_by(HeartDisease) %>%
  slice_max(Cholesterol, n = 5) %>%
  ggplot(aes(
    x = reorder(Age, Cholesterol),
    y = Cholesterol,
    fill = HeartDisease
  )) +
  geom_col() +
  coord_flip() +
  labs(title = "Top Cholesterol per Group")
### Q13: Build a Decision Tree model and show variable importance
# Fit a tree for the HeartDisease factor from three numeric predictors.
tree_model <- rpart(
  formula = HeartDisease ~ Age + Cholesterol + MaxHR,
  data = data
)
# Importance score per predictor, as stored on the fitted rpart object.
tree_model[["variable.importance"]]
## MaxHR Cholesterol Age
## 68.00642 39.14198 28.96949
# Cross-tabulate the tree's class predictions on the same data it was fitted
# to (rows: predicted class, columns: observed HeartDisease).
pred <- predict(object = tree_model, type = "class")
table(pred, data[["HeartDisease"]])
##
## pred 0 1
## 0 283 104
## 1 127 404
# Label each patient from the second column of the tree's class-probability
# matrix: above 0.5 is "High Risk", otherwise "Low Risk".
data$risk_pred <- ifelse(
  predict(tree_model, type = "prob")[, 2] > 0.5,
  yes = "High Risk",
  no = "Low Risk"
)
# Number of patients carrying each risk label.
table(data[["risk_pred"]])
##
## High Risk Low Risk
## 531 387
Scatter plot showing risk prediction based on age and cholesterol.
# Age against cholesterol, with points coloured by the tree-based risk label.
ggplot(data, aes(x = Age, y = Cholesterol, color = risk_pred)) +
  geom_point(size = 2) +
  labs(
    title = "Predicted High Risk Patients",
    x = "Age",
    y = "Cholesterol"
  ) +
  theme_minimal()
### Q17: Visualize the distribution of Cholesterol using a histogram
Histogram showing cholesterol distribution by disease status.
# 30-bin histogram of cholesterol, filled by heart-disease status and drawn
# semi-transparent.
ggplot(data, aes(x = Cholesterol, fill = HeartDisease)) +
  geom_histogram(bins = 30, alpha = 0.6) +
  labs(title = "Cholesterol Distribution by Heart Disease") +
  theme_minimal()
## Level 4: Exploratory Data Analysis
### Q18: Calculate average MaxHR by gender
Calculates the average maximum heart rate by gender.
# Mean MaxHR within each Sex level.
with(data, tapply(MaxHR, Sex, mean))
## F M
## 146.1399 134.3255
Filters patients older than 50 whose cholesterol is above 200, treated here as a high-risk profile.
# Keep rows where Age exceeds 50 and Cholesterol exceeds 200.
dplyr::filter(data, Age > 50, Cholesterol > 200)
## # A tibble: 378 × 13
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## <dbl> <fct> <fct> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 54 M ATA 110 208 0 Normal 142
## 2 54 F ATA 120 273 0 Normal 150
## 3 60 M ASY 100 248 0 Normal 125
## 4 53 M ASY 124 260 0 ST 112
## 5 52 M ATA 120 284 0 Normal 118
## 6 53 F ATA 113 468 0 Normal 127
## 7 53 M NAP 145 518 0 Normal 130
## 8 54 M ASY 125 224 0 Normal 122
## 9 65 M ASY 140 306 1 Normal 87
## 10 54 F ATA 150 230 0 Normal 130
## # ℹ 368 more rows
## # ℹ 5 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## # HeartDisease <fct>, risk_pred <chr>
Calculates the average maximum heart rate (MaxHR) for each heart disease group.
# Mean MaxHR for each HeartDisease level, returned as a data frame.
aggregate(MaxHR ~ HeartDisease, data = data, FUN = mean)
## HeartDisease MaxHR
## 1 0 148.1512
## 2 1 127.6555
Shows distribution of age with density curve.
# Histogram of Age on the density scale, so the kernel density estimate
# drawn on top shares the same y-axis.
hist(data$Age, freq = FALSE)
age_density <- density(data$Age)
lines(age_density, lwd = 2)
### Q22: Scatterplot relationship
Scatter plot between age and maximum heart rate.
# Age on the x-axis against MaxHR on the y-axis, drawn with solid points.
plot(x = data$Age, y = data$MaxHR, pch = 19)
### Q23: Pearson correlation analysis
Measures the Pearson correlation between age and cholesterol.
# Correlation coefficient between Age and Cholesterol (cor() defaults to Pearson).
with(data, cor(Age, Cholesterol))
## [1] -0.09528177
Builds a regression model to predict heart disease using all variables.
# Linear model of heart disease on the original predictor columns.
# risk_pred is dropped before fitting: it was derived from the decision
# tree's fitted predictions of HeartDisease, so letting `.` pick it up leaks
# the outcome into the regression and distorts the other coefficients.
# setdiff() makes this a no-op if risk_pred is not present in `data`.
# as.numeric() on the factor yields its level codes (1 for "0", 2 for "1"),
# so the response is on a 1-2 scale rather than 0-1.
# NOTE(review): any printed summary captured before this change must be
# regenerated; a logistic model (glm, binomial) may suit a binary outcome
# better - confirm with the analysis owner.
lm_predictors <- data[, setdiff(names(data), "risk_pred")]
lm_model <- lm(as.numeric(HeartDisease) ~ ., data = lm_predictors)
summary(lm_model)
##
## Call:
## lm(formula = as.numeric(HeartDisease) ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00063 -0.15069 0.00408 0.17146 0.98592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3744753 0.1513240 9.083 < 2e-16 ***
## Age 0.0015671 0.0013610 1.151 0.249868
## SexM 0.1533492 0.0277245 5.531 4.17e-08 ***
## ChestPainTypeATA -0.2414193 0.0337125 -7.161 1.66e-12 ***
## ChestPainTypeNAP -0.2226382 0.0291374 -7.641 5.51e-14 ***
## ChestPainTypeTA -0.1892664 0.0520612 -3.635 0.000293 ***
## RestingBP 0.0002425 0.0006149 0.394 0.693447
## Cholesterol -0.0004328 0.0001120 -3.866 0.000119 ***
## FastingBS 0.1243860 0.0272640 4.562 5.76e-06 ***
## RestingECGNormal -0.0166201 0.0289300 -0.574 0.565778
## RestingECGST -0.0193027 0.0354363 -0.545 0.586085
## MaxHR 0.0005824 0.0006547 0.890 0.373962
## ExerciseAnginaY 0.1301886 0.0275694 4.722 2.70e-06 ***
## Oldpeak 0.0456301 0.0125389 3.639 0.000289 ***
## ST_SlopeFlat 0.1592011 0.0455949 3.492 0.000503 ***
## ST_SlopeUp -0.2063628 0.0509695 -4.049 5.59e-05 ***
## risk_predLow Risk -0.1113325 0.0354258 -3.143 0.001729 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3251 on 901 degrees of freedom
## Multiple R-squared: 0.5803, Adjusted R-squared: 0.5729
## F-statistic: 77.86 on 16 and 901 DF, p-value: < 2.2e-16
Models non-linear relationship between age and heart rate.
# Quadratic (orthogonal polynomial, degree 2) regression of MaxHR on Age.
poly_model <- lm(formula = MaxHR ~ poly(Age, 2), data = data)
summary(poly_model)
##
## Call:
## lm(formula = MaxHR ~ poly(Age, 2), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77.860 -15.652 0.678 18.492 60.243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 136.8094 0.7747 176.596 <2e-16 ***
## poly(Age, 2)1 -294.5526 23.4723 -12.549 <2e-16 ***
## poly(Age, 2)2 59.5379 23.4723 2.537 0.0114 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.47 on 915 degrees of freedom
## Multiple R-squared: 0.1519, Adjusted R-squared: 0.1501
## F-statistic: 81.95 on 2 and 915 DF, p-value: < 2.2e-16
Shows residual errors of regression model.
# Residuals of lm_model plotted against observation index; the y-axis label
# is set explicitly so it still names the plotted component.
plot(residuals(lm_model), ylab = "lm_model$residuals")
# Median Cholesterol for each HeartDisease level, returned as a data frame.
aggregate(Cholesterol ~ HeartDisease, data = data, FUN = median)
## HeartDisease Cholesterol
## 1 0 227
## 2 1 217
Shows relationships between multiple variables.
# Scatterplot matrix of four numeric columns, in the same panel order.
pairs(data[c("Age", "MaxHR", "Cholesterol", "Oldpeak")])
Counts unique values in each column.
# Number of distinct values in every column. vapply() pins the result to one
# integer per column, so the return type cannot silently change with the
# input the way sapply()'s can.
vapply(data, n_distinct, integer(1))
## Age Sex ChestPainType RestingBP Cholesterol
## 50 2 4 67 222
## FastingBS RestingECG MaxHR ExerciseAngina Oldpeak
## 2 3 119 2 53
## ST_Slope HeartDisease risk_pred
## 3 2 2
Groups age into intervals and counts each group.
# Count patients in the age intervals (20,40], (40,60], and (60,80].
with(data, table(cut(Age, breaks = c(20, 40, 60, 80))))
##
## (20,40] (40,60] (60,80]
## 93 604 221