1 1. Load Libraries

library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ppcor)
## Warning: package 'ppcor' was built under R version 4.2.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.3
library(cluster)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(MASS)

2 2. Data Loading & Preparation

We load the heart disease dataset into R with the read_excel() function and convert the categorical columns to factors.

# Print the current working directory. The workbook below is read from an
# absolute path, so this is informational only.
getwd()
## [1] "C:/Users/Kareem/Documents"
# Read the heart disease workbook into a tibble.
data <- read_excel("C:/Users/kareem/Downloads/heart.xlsx")

# Store the categorical columns as factors so the tests, models and plots
# further down treat them as groups.
data <- data %>%
  mutate(across(
    all_of(c("HeartDisease", "Sex", "ChestPainType",
             "RestingECG", "ExerciseAngina", "ST_Slope")),
    as.factor
  ))

 
# Preview the first six rows.
head(data, n = 6)
## # A tibble: 6 × 12
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <fct> <fct>             <dbl>       <dbl>     <dbl> <fct>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    49 F     NAP                 160         180         0 Normal       156
## 3    37 M     ATA                 130         283         0 ST            98
## 4    48 F     ASY                 138         214         0 Normal       108
## 5    54 M     NAP                 150         195         0 Normal       122
## 6    39 M     NAP                 120         339         0 Normal       170
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## #   HeartDisease <fct>
# Number of rows, then number of columns.
c(nrow(data), ncol(data))
## [1] 918  12
# Per-column summary: quartiles for numeric columns, counts for factors.
data %>% summary()
##       Age        Sex     ChestPainType   RestingBP      Cholesterol   
##  Min.   :28.00   F:193   ASY:496       Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:47.00   M:725   ATA:173       1st Qu.:120.0   1st Qu.:173.2  
##  Median :54.00           NAP:203       Median :130.0   Median :223.0  
##  Mean   :53.51           TA : 46       Mean   :132.4   Mean   :198.8  
##  3rd Qu.:60.00                         3rd Qu.:140.0   3rd Qu.:267.0  
##  Max.   :77.00                         Max.   :200.0   Max.   :603.0  
##    FastingBS       RestingECG      MaxHR       ExerciseAngina    Oldpeak       
##  Min.   :0.0000   LVH   :188   Min.   : 60.0   N:547          Min.   :-2.6000  
##  1st Qu.:0.0000   Normal:552   1st Qu.:120.0   Y:371          1st Qu.: 0.0000  
##  Median :0.0000   ST    :178   Median :138.0                  Median : 0.6000  
##  Mean   :0.2331                Mean   :136.8                  Mean   : 0.8874  
##  3rd Qu.:0.0000                3rd Qu.:156.0                  3rd Qu.: 1.5000  
##  Max.   :1.0000                Max.   :202.0                  Max.   : 6.2000  
##  ST_Slope   HeartDisease
##  Down: 63   0:410       
##  Flat:460   1:508       
##  Up  :395               
##                         
##                         
## 

3 3. Level 1: Statistical Analysis

3.1 Q1: Does Age differ between heart disease groups?

Calculates the average age for each heart disease group.

# Mean age within each heart disease group.
with(data, tapply(Age, HeartDisease, mean))
##        0        1 
## 50.55122 55.89961

3.2 Q2: Does Cholesterol differ by chest pain type?

Performs ANOVA test to check if cholesterol differs by chest pain type.

# One-way ANOVA of Cholesterol across chest pain types; report only the
# p-value of the ChestPainType term (first row of the ANOVA table).
anova_table <- summary(aov(Cholesterol ~ ChestPainType, data = data))[[1]]
anova_table$`Pr(>F)`[1]
## [1] 3.00616e-05

3.3 Q3: Is there a relationship between gender and heart disease?

Chi-square test to check relationship between gender and heart disease.

# Chi-square test on the Sex x HeartDisease contingency table; only the
# p-value is reported.
with(data, chisq.test(table(Sex, HeartDisease)))$p.value
## [1] 4.597617e-20

3.4 Q4: Is Age normally distributed?

Tests whether age data follows a normal distribution.

# Shapiro-Wilk normality test on Age; only the p-value is reported.
shapiro.test(data[["Age"]])$p.value
## [1] 2.165167e-05

3.5 Q5: Is the variance of Age equal between groups?

Tests if variance of age is equal between heart disease groups.

# Compare the variance of Age between the two heart disease groups and report
# the p-value of the test.
age_var_test <- var.test(Age ~ HeartDisease, data = data)
age_var_test$p.value
## [1] 0.09135174

4 4. Level 2: Data Manipulation

4.1 Q6: Identify high-risk patients (high cholesterol & low heart rate)

# Within each heart disease group, keep patients whose cholesterol is above the
# group mean and whose maximum heart rate is below the group mean, then show
# the first five matches.
# select() is namespaced because MASS (attached after dplyr) masks it.
data %>%
  group_by(HeartDisease) %>%
  filter(Cholesterol > mean(Cholesterol), MaxHR < mean(MaxHR)) %>%
  dplyr::select(Age, Cholesterol, HeartDisease) %>%
  head(n = 5)
## # A tibble: 5 × 3
## # Groups:   HeartDisease [2]
##     Age Cholesterol HeartDisease
##   <dbl>       <dbl> <fct>       
## 1    37         283 0           
## 2    48         214 1           
## 3    48         284 0           
## 4    60         248 1           
## 5    53         260 0

4.2 Q7: Calculate percentage of heart disease within each gender

# Count patients per Sex x HeartDisease cell, then express each count as a
# percentage of its Sex group.
data %>%
  group_by(Sex) %>%
  count(HeartDisease) %>%
  mutate(percent = prop.table(n) * 100)
## # A tibble: 4 × 4
## # Groups:   Sex [2]
##   Sex   HeartDisease     n percent
##   <fct> <fct>        <int>   <dbl>
## 1 F     0              143    74.1
## 2 F     1               50    25.9
## 3 M     0              267    36.8
## 4 M     1              458    63.2

4.3 Q8: Show top 3 highest cholesterol cases per group

# Three highest-cholesterol rows within each heart disease group.
data %>%
  group_by(HeartDisease) %>%
  slice_max(order_by = Cholesterol, n = 3)
## # A tibble: 6 × 12
## # Groups:   HeartDisease [2]
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <fct> <fct>             <dbl>       <dbl>     <dbl> <fct>      <dbl>
## 1    67 F     NAP                 115         564         0 LVH          160
## 2    53 F     ATA                 113         468         0 Normal       127
## 3    58 M     ASY                 132         458         1 Normal        69
## 4    54 M     ASY                 130         603         1 Normal       125
## 5    32 M     ASY                 118         529         0 Normal       130
## 6    53 M     NAP                 145         518         0 Normal       130
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## #   HeartDisease <fct>

4.4 Q9: Show patients with above-average cholesterol

# For each heart disease group, keep the rows whose cholesterol exceeds that
# group's mean and show the first three of them.
# The group means include the rows where Cholesterol is recorded as 0.
data %>%
  group_by(HeartDisease) %>%
  filter(mean(Cholesterol) < Cholesterol) %>%
  slice_head(n = 3)
## # A tibble: 6 × 12
## # Groups:   HeartDisease [2]
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <fct> <fct>             <dbl>       <dbl>     <dbl> <fct>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    37 M     ATA                 130         283         0 ST            98
## 3    39 M     NAP                 120         339         0 Normal       170
## 4    49 F     NAP                 160         180         0 Normal       156
## 5    48 F     ASY                 138         214         0 Normal       108
## 6    37 M     ASY                 140         207         0 Normal       130
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## #   HeartDisease <fct>

4.5 Q10: Show lowest heart rate (MaxHR) cases per group

# Five lowest MaxHR rows within each heart disease group.
data %>%
  group_by(HeartDisease) %>%
  slice_min(order_by = MaxHR, n = 5)
## # A tibble: 10 × 12
## # Groups:   HeartDisease [2]
##      Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##    <dbl> <fct> <fct>             <dbl>       <dbl>     <dbl> <fct>      <dbl>
##  1    58 M     ASY                 132         458         1 Normal        69
##  2    40 M     NAP                 106         240         0 Normal        80
##  3    62 M     ASY                 120         220         0 ST            86
##  4    62 M     NAP                 120         220         0 LVH           86
##  5    46 F     ASY                 130         238         0 Normal        90
##  6    51 M     ASY                 140           0         0 Normal        60
##  7    60 M     ASY                 135           0         0 Normal        63
##  8    65 M     ASY                 145           0         1 ST            67
##  9    61 M     NAP                 200           0         1 ST            70
## 10    67 M     ASY                 120         237         0 Normal        71
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## #   HeartDisease <fct>

4.6 Level 3: Advanced Analysis & Modeling

4.7 Q11: Calculate mean values of Age, Cholesterol, and MaxHR for each group

# Per-group means of Age, Cholesterol and MaxHR, ignoring missing values.
data %>%
  group_by(HeartDisease) %>%
  summarise(across(c(Age, Cholesterol, MaxHR),
                   function(x) mean(x, na.rm = TRUE)))
## # A tibble: 2 × 4
##   HeartDisease   Age Cholesterol MaxHR
##   <fct>        <dbl>       <dbl> <dbl>
## 1 0             50.6        227.  148.
## 2 1             55.9        176.  128.

4.8 Q12: Visualize top cholesterol cases using bar chart

# Bar chart of the five highest-cholesterol patients in each heart disease
# group.
# Each bar is keyed by a unique patient label (row number plus age) rather than
# by Age alone: several of the top patients share an age, and bars with the
# same x value are stacked, which would show the sum of their cholesterol
# values as a single bar.
data %>%
  mutate(patient_id = row_number()) %>%
  group_by(HeartDisease) %>%
  slice_max(Cholesterol, n = 5) %>%
  ungroup() %>%
  mutate(patient_label = paste0("#", patient_id, " (age ", Age, ")")) %>%
  ggplot(aes(reorder(patient_label, Cholesterol), Cholesterol,
             fill = HeartDisease)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top Cholesterol per Group",
       x = "Patient (row number, age)",
       y = "Cholesterol")

### Q13: Build a Decision Tree model and show variable importance

# Fit a tree that predicts HeartDisease from Age, Cholesterol and MaxHR.
tree_model <- rpart(formula = HeartDisease ~ Age + Cholesterol + MaxHR,
                    data = data)

# Importance score rpart assigned to each predictor.
tree_model[["variable.importance"]]
##       MaxHR Cholesterol         Age 
##    68.00642    39.14198    28.96949

4.9 Q14: Predict heart disease using the decision tree model

# Predicted class for every row the tree was fitted on (no newdata supplied),
# so the table below measures fit on the training data.
pred <- predict(object = tree_model, type = "class")

# Cross-tabulation: predictions in rows, recorded HeartDisease in columns.
table(pred, data$HeartDisease)
##     
## pred   0   1
##    0 283 104
##    1 127 404

4.10 Q15: Classify patients into High Risk and Low Risk

# Second column of the tree's probability matrix for each training row;
# label a row "High Risk" when that probability exceeds 0.5.
# Note: this adds a column to `data`, so any later model formula that uses `.`
# will pick it up unless it is excluded.
risk_prob <- predict(tree_model, type = "prob")[, 2]
data$risk_pred <- ifelse(risk_prob > 0.5, "High Risk", "Low Risk")

# Number of patients in each risk label.
table(data$risk_pred)
## 
## High Risk  Low Risk 
##       531       387

4.11 Q16: Visualize risk prediction

Scatter plot showing risk prediction based on age and cholesterol.

# Age vs. cholesterol, coloured by the tree-based risk label.
ggplot(data, aes(x = Age, y = Cholesterol, color = risk_pred)) +
  geom_point(size = 2) +
  labs(title = "Predicted High Risk Patients",
       x = "Age",
       y = "Cholesterol") +
  theme_minimal()

### Q17: Visualize the distribution of Cholesterol using a histogram

Histogram showing cholesterol distribution by disease status.

# 30-bin histogram of Cholesterol, filled by heart disease status.
ggplot(data, aes(x = Cholesterol, fill = HeartDisease)) +
  geom_histogram(bins = 30, alpha = 0.6) +
  labs(title = "Cholesterol Distribution by Heart Disease") +
  theme_minimal()

## Level 4: Exploratory Data Analysis

### Q18: Calculate average MaxHR by gender

Calculates average maximum heart rate by gender.

# Mean MaxHR for each level of Sex.
with(data, tapply(MaxHR, Sex, mean))
##        F        M 
## 146.1399 134.3255

4.12 Q19: Filter high-risk patients

Filters patients who are older than 50 and have cholesterol above 200.

# Rows with Age above 50 and Cholesterol above 200.
dplyr::filter(data, Age > 50, Cholesterol > 200)
## # A tibble: 378 × 13
##      Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##    <dbl> <fct> <fct>             <dbl>       <dbl>     <dbl> <fct>      <dbl>
##  1    54 M     ATA                 110         208         0 Normal       142
##  2    54 F     ATA                 120         273         0 Normal       150
##  3    60 M     ASY                 100         248         0 Normal       125
##  4    53 M     ASY                 124         260         0 ST           112
##  5    52 M     ATA                 120         284         0 Normal       118
##  6    53 F     ATA                 113         468         0 Normal       127
##  7    53 M     NAP                 145         518         0 Normal       130
##  8    54 M     ASY                 125         224         0 Normal       122
##  9    65 M     ASY                 140         306         1 Normal        87
## 10    54 F     ATA                 150         230         0 Normal       130
## # ℹ 368 more rows
## # ℹ 5 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## #   HeartDisease <fct>, risk_pred <chr>

4.13 Q20: Group-wise average MaxHR

Calculates average heart rate by disease group.

# Mean MaxHR per heart disease group, returned as a data frame.
aggregate(MaxHR ~ HeartDisease, data = data, FUN = mean)
##   HeartDisease    MaxHR
## 1            0 148.1512
## 2            1 127.6555

4.14 Q21: Histogram with density curve

Shows distribution of age with density curve.

# Histogram of Age on the density scale, so the kernel density curve drawn on
# top shares the same y-axis.
hist(data$Age, freq = FALSE)
lines(density(data$Age), lwd = 2)

### Q22: Scatterplot relationship

Scatter plot between age and maximum heart rate.

 # Scatter plot of MaxHR against Age using solid points (pch = 19).
 plot(x = data$Age, y = data$MaxHR, pch = 19)

### Q23: Pearson correlation analysis

Measures the correlation between age and cholesterol.

# Pearson correlation coefficient between Age and Cholesterol.
cor(x = data$Age, y = data$Cholesterol, method = "pearson")
## [1] -0.09528177

5 Level 5: Regression Analysis

5.1 Q24: Multiple Linear Regression

Builds a regression model to predict heart disease using all variables.

# Linear probability model: regress a 0/1 heart disease indicator on the
# recorded patient variables.
# - The outcome is built with `HeartDisease == "1"` because as.numeric() on a
#   factor returns the internal level codes (1 and 2), not the labels 0 and 1.
# - risk_pred is removed from the predictors: it was derived from a tree fitted
#   to HeartDisease itself, so keeping it leaks the outcome into the model.
# Re-knit after this change: the summary printed below was produced by the
# earlier formula (outcome coded 1/2, risk_pred included).
lm_model <- lm(as.numeric(HeartDisease == "1") ~ . - risk_pred,
               data = data)
summary(lm_model)
## 
## Call:
## lm(formula = as.numeric(HeartDisease) ~ ., data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00063 -0.15069  0.00408  0.17146  0.98592 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.3744753  0.1513240   9.083  < 2e-16 ***
## Age                0.0015671  0.0013610   1.151 0.249868    
## SexM               0.1533492  0.0277245   5.531 4.17e-08 ***
## ChestPainTypeATA  -0.2414193  0.0337125  -7.161 1.66e-12 ***
## ChestPainTypeNAP  -0.2226382  0.0291374  -7.641 5.51e-14 ***
## ChestPainTypeTA   -0.1892664  0.0520612  -3.635 0.000293 ***
## RestingBP          0.0002425  0.0006149   0.394 0.693447    
## Cholesterol       -0.0004328  0.0001120  -3.866 0.000119 ***
## FastingBS          0.1243860  0.0272640   4.562 5.76e-06 ***
## RestingECGNormal  -0.0166201  0.0289300  -0.574 0.565778    
## RestingECGST      -0.0193027  0.0354363  -0.545 0.586085    
## MaxHR              0.0005824  0.0006547   0.890 0.373962    
## ExerciseAnginaY    0.1301886  0.0275694   4.722 2.70e-06 ***
## Oldpeak            0.0456301  0.0125389   3.639 0.000289 ***
## ST_SlopeFlat       0.1592011  0.0455949   3.492 0.000503 ***
## ST_SlopeUp        -0.2063628  0.0509695  -4.049 5.59e-05 ***
## risk_predLow Risk -0.1113325  0.0354258  -3.143 0.001729 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3251 on 901 degrees of freedom
## Multiple R-squared:  0.5803, Adjusted R-squared:  0.5729 
## F-statistic: 77.86 on 16 and 901 DF,  p-value: < 2.2e-16

5.2 Q25: Polynomial Regression

Models a non-linear (quadratic) relationship between age and maximum heart rate.

# Degree-2 polynomial fit of MaxHR on Age; poly() supplies orthogonal
# polynomial terms by default.
poly_model <- lm(formula = MaxHR ~ poly(Age, 2), data = data)
summary(poly_model)
## 
## Call:
## lm(formula = MaxHR ~ poly(Age, 2), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -77.860 -15.652   0.678  18.492  60.243 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    136.8094     0.7747 176.596   <2e-16 ***
## poly(Age, 2)1 -294.5526    23.4723 -12.549   <2e-16 ***
## poly(Age, 2)2   59.5379    23.4723   2.537   0.0114 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.47 on 915 degrees of freedom
## Multiple R-squared:  0.1519, Adjusted R-squared:  0.1501 
## F-statistic: 81.95 on 2 and 915 DF,  p-value: < 2.2e-16

5.3 Q26: Residual Analysis

Plots the residuals of the linear regression model (lm_model) in observation order.

# Plot the residuals of lm_model against their position in the data.
plot(lm_model$residuals)

6 Level 6: Advanced Data Exploration

6.1 Q27: Calculate Median Values for Each Group

# Median Cholesterol per heart disease group, returned as a data frame.
aggregate(Cholesterol ~ HeartDisease, data = data, FUN = median)
##   HeartDisease Cholesterol
## 1            0         227
## 2            1         217

6.2 Q28: Pairwise Variable Exploration

Shows relationships between multiple variables.

# Scatter-plot matrix of the four numeric variables named in the formula.
pairs(~ Age + MaxHR + Cholesterol + Oldpeak, data = data)

6.3 Q29: Count distinct values per column

Counts unique values in each column.

# Number of distinct values in each column.
# vapply() is used instead of sapply() so the result is always a named integer
# vector, whatever the input looks like.
vapply(data, n_distinct, FUN.VALUE = integer(1))
##            Age            Sex  ChestPainType      RestingBP    Cholesterol 
##             50              2              4             67            222 
##      FastingBS     RestingECG          MaxHR ExerciseAngina        Oldpeak 
##              2              3            119              2             53 
##       ST_Slope   HeartDisease      risk_pred 
##              3              2              2

6.4 Q30: Age group distribution

Groups age into intervals and counts each group.

# Count patients in the age bands (20,40], (40,60] and (60,80].
table(cut(data$Age, breaks = seq(from = 20, to = 80, by = 20)))
## 
## (20,40] (40,60] (60,80] 
##      93     604     221