Heart Disease

# Location of the heart-disease CSV. The original absolute path is kept as the
# default; set the HEART_DATA_PATH environment variable to run the script on
# another machine without editing the code.
data_path <- Sys.getenv(
  "HEART_DATA_PATH",
  unset = "D:/project/archive (1)/heart_disease.csv"
)
# Fail early with a readable message instead of a low-level connection error.
if (!file.exists(data_path)) {
  stop("Heart disease data file not found: ", data_path, call. = FALSE)
}
train_data <- read.csv(data_path)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.3

# Work on a copy named `heart_data`; `train_data` stays as originally loaded.
heart_data <- train_data
 # View() opens a spreadsheet-style viewer, which is only useful (and only
 # reliably available) in an interactive session, so skip it when the script
 # is run with Rscript or knitted.
 if (interactive()) {
   View(heart_data)
 }

 # 1. Structure of dataset
 # Compact overview of every column: type and first few values.
 utils::str(object = heart_data)

## 'data.frame':    1024 obs. of  15 variables:
##  $ age          : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex          : int  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp           : int  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps     : num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol         : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs          : int  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg      : int  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach      : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang        : int  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak      : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope        : int  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca           : num  0 3 2 0 0 0 2 0 1 0 ...
##  $ thal         : num  6 3 7 3 3 3 3 3 7 7 ...
##  $ num          : int  0 2 1 0 0 0 3 0 2 1 ...
##  $ target_binary: int  0 1 1 0 0 0 1 0 1 1 ...

 # List the column names of the data frame.
 colnames(heart_data)

##  [1] "age"           "sex"           "cp"            "trestbps"     
##  [5] "chol"          "fbs"           "restecg"       "thalach"      
##  [9] "exang"         "oldpeak"       "slope"         "ca"           
## [13] "thal"          "num"           "target_binary"

 # First six rows.
 head(heart_data, n = 6L)

##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal num
## 1  63   1  1      145  233   1       2     150     0     2.3     3  0    6   0
## 2  67   1  4      160  286   0       2     108     1     1.5     2  3    3   2
## 3  67   1  4      120  229   0       2     129     1     2.6     2  2    7   1
## 4  37   1  3      130  250   0       0     187     0     3.5     3  0    3   0
## 5  41   0  2      130  204   0       2     172     0     1.4     1  0    3   0
## 6  56   1  2      120  236   0       0     178     0     0.8     1  0    3   0
##   target_binary
## 1             0
## 2             1
## 3             1
## 4             0
## 5             0
## 6             0

 # Last six rows.
 tail(heart_data, n = 6L)

##           age sex cp trestbps     chol fbs restecg  thalach exang   oldpeak
## 1019 54.86364   1  4 129.5450 234.2354   0       2 148.6876     1 0.3534338
## 1020 60.07221   1  3 115.4221 181.7686   0       0 128.1559     1 1.0019411
## 1021 70.92840   1  2 166.7272 244.9936   1       0 108.4813     0 0.6983369
## 1022 57.33288   1  2 105.0752 233.1463   0       2 140.3420     0 1.5191382
## 1023 40.88155   0  4 125.0686 154.3707   1       2 123.7563     0 1.9366838
## 1024 46.86337   1  3 121.9098 291.1755   1       2 146.6523     1 1.9139512
##      slope ca thal num target_binary
## 1019     2  2    7   2             1
## 1020     2  1    7   2             1
## 1021     2  1    7   2             1
## 1022     1  0    7   2             1
## 1023     2  1    7   2             1
## 1024     2  2    3   2             1

 # Per-column summary statistics (min, quartiles, mean, max).
 summary(object = heart_data)

##       age             sex               cp           trestbps     
##  Min.   :18.00   Min.   :0.0000   Min.   :1.000   Min.   : 84.87  
##  1st Qu.:48.19   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:120.00  
##  Median :55.00   Median :1.0000   Median :3.000   Median :130.01  
##  Mean   :54.53   Mean   :0.6855   Mean   :3.149   Mean   :131.48  
##  3rd Qu.:61.20   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:142.41  
##  Max.   :78.80   Max.   :1.0000   Max.   :4.000   Max.   :200.00  
##       chol            fbs           restecg         thalach     
##  Min.   :100.0   Min.   :0.000   Min.   :0.000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:133.6  
##  Median :245.0   Median :0.000   Median :1.000   Median :151.5  
##  Mean   :247.0   Mean   :0.165   Mean   :1.002   Mean   :149.6  
##  3rd Qu.:280.2   3rd Qu.:0.000   3rd Qu.:2.000   3rd Qu.:165.6  
##  Max.   :564.0   Max.   :1.000   Max.   :2.000   Max.   :227.2  
##      exang           oldpeak           slope             ca        
##  Min.   :0.0000   Min.   :0.0000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.1521   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.8550   Median :2.000   Median :0.0000  
##  Mean   :0.3457   Mean   :1.0901   Mean   :1.619   Mean   :0.6816  
##  3rd Qu.:1.0000   3rd Qu.:1.6305   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.2000   Max.   :3.000   Max.   :3.0000  
##       thal            num         target_binary  
##  Min.   :3.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:3.000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :3.000   Median :0.0000   Median :0.000  
##  Mean   :4.701   Mean   :0.9238   Mean   :0.459  
##  3rd Qu.:7.000   3rd Qu.:2.0000   3rd Qu.:1.000  
##  Max.   :7.000   Max.   :4.0000   Max.   :1.000

 # Number of rows followed by number of columns.
 c(nrow(heart_data), ncol(heart_data))

## [1] 1024   15

 # Convert categorical variables to factor
 heart_data$sex <- as.factor(heart_data$sex)
 heart_data$cp <- as.factor(heart_data$cp)
 # The binary outcome column is named `target_binary`; there is no `target`
 # column in the loaded data. Reading `heart_data$target` only worked through
 # `$` partial matching on the data frame, which is fragile (it breaks on
 # tibbles or if another column starting with "target" is added). Reference
 # the real column explicitly and store the factor version as `target`.
 heart_data$target <- as.factor(heart_data$target_binary)
 
 #Interpretation:
#The dataset contains multiple features such as age, cholesterol, blood pressure, and heart disease status. It includes both numerical and categorical variables, making it suitable for exploratory data analysis.

 # 2. Missing values
 # Count the NA entries in each column.
 vapply(heart_data, function(column) sum(is.na(column)), numeric(1))

##           age           sex            cp      trestbps          chol 
##             0             0             0             0             0 
##           fbs       restecg       thalach         exang       oldpeak 
##             0             0             0             0             0 
##         slope            ca          thal           num target_binary 
##             0             0             0             0             0 
##        target 
##             0

#Interpretation:
#No significant missing values were found, indicating the dataset is clean and reliable for analysis.

 # 2.1 Filtering patients with high cholesterol
 # Keep patients whose cholesterol exceeds 240, retaining three columns.
 high_chol <- select(
   filter(heart_data, chol > 240),
   age, chol, target
 )

 head(high_chol)

##   age chol target
## 1  67  286      1
## 2  37  250      0
## 3  62  268      1
## 4  57  354      0
## 5  63  254      1
## 6  56  294      0

# Interpretation:
#Patients with cholesterol levels above 240 are at higher risk of heart disease, showing that cholesterol is an important health indicator.

# 2.2 Top 10 highest cholesterol patients
 # Sort by cholesterol, highest first, and keep the first ten rows.
 top_chol <- head(arrange(heart_data, desc(chol)), 10)

 top_chol

##         age sex cp trestbps     chol fbs restecg  thalach exang   oldpeak slope
## 1  67.00000   0  3 115.0000 564.0000   0       2 160.0000     0 1.6000000     2
## 2  52.35865   1  4 133.2919 430.3003   0       0 180.0519     0 0.0000000     1
## 3  65.00000   0  3 140.0000 417.0000   1       2 157.0000     0 0.8000000     1
## 4  56.00000   0  4 134.0000 409.0000   0       2 150.0000     1 1.9000000     2
## 5  42.17283   0  4 163.0629 408.5644   0       0 194.7920     0 0.0000000     2
## 6  63.00000   0  4 150.0000 407.0000   0       2 154.0000     0 4.0000000     2
## 7  68.38154   1  4 135.7192 406.5728   0       2 179.0287     0 0.7979003     1
## 8  62.00000   0  4 140.0000 394.0000   0       2 157.0000     0 1.2000000     2
## 9  61.15558   0  3 115.7312 391.0362   0       0 137.6991     0 0.0000000     1
## 10 69.39888   0  2 119.5372 387.8709   0       2 135.1319     1 0.9898378     1
##    ca thal num target_binary target
## 1   0    7   0             0      0
## 2   0    3   0             0      0
## 3   1    3   0             0      0
## 4   2    7   2             1      1
## 5   0    7   0             0      0
## 6   3    7   4             1      1
## 7   2    7   2             1      1
## 8   0    3   0             0      0
## 9   0    3   0             0      0
## 10  0    3   0             0      0

#Interpretation:
#The top 10 patients have extremely high cholesterol levels, which may significantly increase their risk of developing heart disease.

 # 2.3 Ranking patients by cholesterol
 # Sort by descending cholesterol and number the rows 1, 2, 3, ...
 rank_chol <- mutate(
   arrange(heart_data, desc(chol)),
   rank = row_number()
 )

 head(rank_chol)

##        age sex cp trestbps     chol fbs restecg  thalach exang oldpeak slope ca
## 1 67.00000   0  3 115.0000 564.0000   0       2 160.0000     0     1.6     2  0
## 2 52.35865   1  4 133.2919 430.3003   0       0 180.0519     0     0.0     1  0
## 3 65.00000   0  3 140.0000 417.0000   1       2 157.0000     0     0.8     1  1
## 4 56.00000   0  4 134.0000 409.0000   0       2 150.0000     1     1.9     2  2
## 5 42.17283   0  4 163.0629 408.5644   0       0 194.7920     0     0.0     2  0
## 6 63.00000   0  4 150.0000 407.0000   0       2 154.0000     0     4.0     2  3
##   thal num target_binary target rank
## 1    7   0             0      0    1
## 2    3   0             0      0    2
## 3    3   0             0      0    3
## 4    7   2             1      1    4
## 5    7   0             0      0    5
## 6    7   4             1      1    6

 #Interpretation:
#Ranking helps identify the most critical patients based on cholesterol levels, useful for prioritizing medical attention.

 # 2.4 Patient with highest cholesterol
 # The row ranked first, i.e. the highest cholesterol value.
 top_patient <- filter(rank_chol, rank == 1)

 top_patient

##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal num
## 1  67   0  3      115  564   0       2     160     0     1.6     2  0    7   0
##   target_binary target rank
## 1             0      0    1

#Interpretation:
#The patient with the highest cholesterol level represents the most critical case and may have a higher risk of heart disease.

 # 3 Age vs Heart Disease
 # Histogram of age in 5-year bins, bars filled by heart-disease status.
 ggplot(data = heart_data, mapping = aes(fill = target, x = age)) +
   geom_histogram(binwidth = 5) +
   labs(
     y = "Count",
     x = "Age",
     title = "Age Distribution by Heart Disease"
   )

#Interpretation:
#Most heart disease cases are observed in middle-aged and older individuals, indicating age as a major risk factor.

 # 4 Cholesterol distribution
 # Histogram of cholesterol in bins of width 20.
 ggplot(data = heart_data, mapping = aes(x = chol)) +
   geom_histogram(color = "black", fill = "red", binwidth = 20) +
   ggtitle("Cholesterol Distribution")

#Interpretation:
#Most patients fall within a moderate cholesterol range, while fewer patients have extremely high or low values.

# 5 Chest pain type vs disease
 # Count of patients per chest pain type, stacked by heart-disease status.
 ggplot(data = heart_data, mapping = aes(fill = target, x = cp)) +
   geom_bar() +
   ggtitle("Chest Pain Type vs Heart Disease")

#Interpretation:
#Certain types of chest pain are more frequently associated with heart disease, making it an important diagnostic feature.

 # 6 Scatter plot 
 # Age against cholesterol, coloured by heart-disease status, with linear
 # fits drawn in black and no confidence band.
 ggplot(data = heart_data,
        mapping = aes(x = age, y = chol, color = target)) +
   geom_point(alpha = 0.7, size = 2) +
   geom_smooth(color = "black", se = FALSE, method = "lm") +
   ggtitle("Age vs Cholesterol (Relationship Check)")

## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#The scatter plot shows a weak relationship between age and cholesterol. The regression line is relatively flat, indicating no strong correlation.

# 7 Blood pressure vs heart disease
 # Resting blood pressure against cholesterol with a linear fit per status.
 ggplot(data = heart_data,
        mapping = aes(x = trestbps, y = chol, color = target)) +
   geom_point() +
   geom_smooth(se = FALSE, method = "lm") +
   ggtitle("BP vs Cholesterol")

## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#There is a slight trend suggesting higher blood pressure may be associated with higher cholesterol, but the relationship is not very strong.

 # 8 Average cholesterol by gender
 # Mean cholesterol for each level of `sex`.
 gender_avg <- summarise(group_by(heart_data, sex), avg_chol = mean(chol))

 # One bar per sex, bar height taken directly from the computed average.
 ggplot(data = gender_avg,
        mapping = aes(x = sex, y = avg_chol, fill = sex)) +
   geom_col() +
   ggtitle("Avg Cholesterol by Gender")

#Interpretation:
#There is a difference in average cholesterol levels between genders, which may indicate gender-based health patterns.

 # 9 Max heart rate vs disease
 # Maximum heart rate against cholesterol with a linear fit per status.
 ggplot(data = heart_data,
        mapping = aes(x = thalach, y = chol, color = target)) +
   geom_point() +
   geom_smooth(method = "lm") +
   ggtitle("Max Heart Rate vs Cholesterol")

## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#The relationship between maximum heart rate and cholesterol appears weak, suggesting they are not strongly dependent on each other.

 # 10 Heart disease count
 # Number of patients in each heart-disease class.
 ggplot(data = heart_data, mapping = aes(fill = target, x = target)) +
   geom_bar() +
   ggtitle("Heart Disease Distribution")

#Interpretation:
#The dataset shows the distribution of patients with and without heart disease, helping understand class balance.

# 11 Fasting blood sugar vs disease
 # Patient counts by fasting blood sugar flag, stacked by status.
 ggplot(data = heart_data, mapping = aes(fill = target, x = fbs)) +
   geom_bar() +
   ggtitle("Fasting Blood Sugar vs Disease")

#Interpretation:
#Patients with higher fasting blood sugar may show a higher tendency toward heart disease, though the relationship may vary.

 # 12 ST depression vs disease

 # ST depression (oldpeak) against cholesterol with a linear fit per status.
 ggplot(data = heart_data,
        mapping = aes(x = oldpeak, y = chol, color = target)) +
   geom_point() +
   geom_smooth(method = "lm") +
   ggtitle("ST Depression vs Cholesterol")

## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#There is a slight relationship between ST depression and cholesterol, but it is not strongly linear, suggesting other factors may also play a significant role in heart disease risk.

 # 13 Create age groups
 # Bucket patients into three age bands: up to 40, over 40 to 60, over 60.
 # The outer breaks are open-ended because the youngest patient in the data
 # is 18: with a lower break of 20 that row received an NA age group and was
 # silently dropped from every analysis that uses `age_group`.
 heart_data$age_group <- cut(heart_data$age,
                             breaks = c(-Inf, 40, 60, Inf),
                             labels = c("Young", "Middle", "Old"))

 # Plot
 ggplot(heart_data, aes(x = age_group, fill = target)) +
   geom_bar() +
   labs(title = "Heart Disease by Age Group",
        x = "Age Group",
        y = "Count")

#Interpretation:
#Heart disease is more common in middle-aged and older groups, confirming age as a key risk factor.

# 14 Create BP categories
 # Classify resting blood pressure: below 120 "Normal", 120 up to (but not
 # including) 140 "Elevated", 140 and above "High". cut() with right = FALSE
 # reproduces the same boundaries as the nested ifelse() it replaces, and
 # returns a factor whose levels are in that order. A plain character column
 # is sorted alphabetically by ggplot (Elevated, High, Normal).
 heart_data$bp_group <- cut(heart_data$trestbps,
                            breaks = c(-Inf, 120, 140, Inf),
                            labels = c("Normal", "Elevated", "High"),
                            right = FALSE)

 # Plot
 ggplot(heart_data, aes(x = bp_group, fill = target)) +
   geom_bar() +
   labs(title = "Blood Pressure vs Heart Disease",
        x = "BP Category",
        y = "Count")

#Interpretation:
#Higher blood pressure categories show a greater prevalence of heart disease, indicating blood pressure as an important risk factor.
 #Patients with high blood pressure show a higher count of heart disease cases, indicating hypertension as a major risk.

# Q 15 Cholesterol vs heart disease
 # Cholesterol distribution for each heart-disease class.
 ggplot(data = heart_data,
        mapping = aes(x = target, y = chol, fill = target)) +
   geom_boxplot() +
   ggtitle("Cholesterol vs Heart Disease") +
   xlab("Heart Disease (0 = No, 1 = Yes)") +
   ylab("Cholesterol")

#Interpretation:
#Patients with heart disease tend to have higher median cholesterol levels, and the spread indicates variability among patients.

# 16 CDF of Cholesterol

# Empirical cumulative distribution function of cholesterol.
plot(
  x = ecdf(heart_data$chol),
  col = "blue",
  xlab = "Cholesterol",
  ylab = "Cumulative Probability",
  main = "CDF of Cholesterol"
)

# interpretation:
#The CDF graph shows that most patients have cholesterol levels concentrated within a moderate range, while only a few patients have extremely high cholesterol values.

# 17 Box Plot of Cholesterol by Gender

# Cholesterol distribution for each level of `sex`.
ggplot(data = heart_data, mapping = aes(x = sex, y = chol, fill = sex)) +
  geom_boxplot() +
  ggtitle("Cholesterol by Gender") +
  xlab("Gender") +
  ylab("Cholesterol")

#interpretation:
#The box plot indicates differences in cholesterol distribution between genders and highlights outliers with unusually high cholesterol levels.

# 18 Box Plot of Age by Heart Disease

# Age distribution for each heart-disease class.
ggplot(data = heart_data, mapping = aes(x = target, y = age, fill = target)) +
  geom_boxplot() +
  ggtitle("Age vs Heart Disease") +
  xlab("Heart Disease") +
  ylab("Age")

#interpretation:
#Patients with heart disease generally belong to older age groups, although some younger patients are also affected.

# 19 ANOVA for Cholesterol and Chest Pain Type

# One-way ANOVA: does mean cholesterol differ between chest pain types?
anova_cp <- aov(formula = chol ~ cp, data = heart_data)

summary(anova_cp)

##               Df  Sum Sq Mean Sq F value Pr(>F)  
## cp             3   29946    9982   3.641 0.0124 *
## Residuals   1020 2796279    2741                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#interpretation:
#Mean cholesterol differs significantly across chest pain types (F = 3.641, p = 0.0124), indicating that at least one chest pain group has a different average cholesterol level from the others.

# 20 ANOVA for Age Group and Cholesterol

# One-way ANOVA: does mean cholesterol differ between age groups?
anova_age <- aov(formula = chol ~ age_group, data = heart_data)

summary(anova_age)

##               Df  Sum Sq Mean Sq F value   Pr(>F)    
## age_group      2  110532   55266   20.79 1.41e-09 ***
## Residuals   1020 2711611    2658                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness

#interpretation:
#Different age groups show variations in cholesterol levels, with older groups tending to have higher values.

# 21 Correlation between Age and Cholesterol

# Pearson correlation between age and cholesterol over complete pairs.
with(heart_data, cor(age, chol, use = "complete.obs"))

## [1] 0.216582

#interpretation:
#The correlation coefficient (about 0.22) indicates a weak positive relationship between age and cholesterol: cholesterol tends to increase slightly as age increases.

# 22 Correlation between BP and Cholesterol

# Pearson correlation between resting blood pressure and cholesterol.
with(heart_data, cor(trestbps, chol, use = "complete.obs"))

## [1] 0.1634889

#interpretation:
#Blood pressure and cholesterol show a weak positive relationship (correlation about 0.16), suggesting that higher blood pressure is only slightly associated with higher cholesterol.

# 23 Correlation Matrix

# Pairwise correlations between the five continuous measurements.
numeric_data <- select(heart_data, age, chol, trestbps, thalach, oldpeak)

cor(numeric_data)

##                 age        chol    trestbps      thalach     oldpeak
## age       1.0000000 0.216582048  0.30097560 -0.418977045  0.21192202
## chol      0.2165820 1.000000000  0.16348888  0.009172488  0.03408721
## trestbps  0.3009756 0.163488880  1.00000000 -0.058999440  0.12127645
## thalach  -0.4189770 0.009172488 -0.05899944  1.000000000 -0.35991552
## oldpeak   0.2119220 0.034087211  0.12127645 -0.359915516  1.00000000

#interpretation:
#The correlation matrix reveals the strength and direction of relationships among medical variables such as age, cholesterol, blood pressure, and heart rate.

# 24 Single Regression: Cholesterol vs Age

# Simple linear regression of cholesterol on age.
single_reg1 <- lm(formula = chol ~ age, data = heart_data)

summary(single_reg1)

## 
## Call:
## lm(formula = chol ~ age, data = heart_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.321  -34.588   -3.006   33.601  301.730 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 180.0226     9.5743  18.803  < 2e-16 ***
## age           1.2276     0.1731   7.092 2.46e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.34 on 1022 degrees of freedom
## Multiple R-squared:  0.04691,    Adjusted R-squared:  0.04598 
## F-statistic:  50.3 on 1 and 1022 DF,  p-value: 2.462e-12

#interpretation:
#The regression analysis shows that age has a statistically significant but weak positive effect on cholesterol levels (R-squared is about 0.047).
#The regression model shows that age can be used to predict cholesterol levels with a positive trend, although it explains only a small share of the variation.

# 25 Plot Single Regression

# Scatter of age against cholesterol with the fitted straight line in red.
ggplot(data = heart_data, mapping = aes(x = age, y = chol)) +
  geom_point(color = "blue") +
  geom_smooth(color = "red", se = FALSE, method = "lm") +
  ggtitle("Age vs Cholesterol Regression") +
  xlab("Age") +
  ylab("Cholesterol")

## `geom_smooth()` using formula = 'y ~ x'

#interpretation:
#The scatter plot with the regression line illustrates the relationship between age and cholesterol, showing a positive trend.
#The regression line demonstrates an upward relationship between age and cholesterol levels.

# 26 Single Regression: BP vs Cholesterol

# Simple linear regression of cholesterol on resting blood pressure.
single_reg2 <- lm(formula = chol ~ trestbps, data = heart_data)

summary(single_reg2)

## 
## Call:
## lm(formula = chol ~ trestbps, data = heart_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -144.96  -35.32   -0.20   32.13  325.12 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 182.42699   12.28944  14.844  < 2e-16 ***
## trestbps      0.49086    0.09265   5.298 1.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.88 on 1022 degrees of freedom
## Multiple R-squared:  0.02673,    Adjusted R-squared:  0.02578 
## F-statistic: 28.07 on 1 and 1022 DF,  p-value: 1.435e-07

#interpretation:
#The regression analysis indicates that blood pressure has a statistically significant positive effect on cholesterol levels, suggesting that higher blood pressure is associated with higher cholesterol. The regression analysis indicates that cholesterol tends to rise with increasing blood pressure.

# 27 Multiple Regression

# Linear regression of cholesterol on age, resting blood pressure and
# maximum heart rate together.
multiple_reg <- lm(formula = chol ~ age + trestbps + thalach, data = heart_data)

summary(multiple_reg)

## 
## Call:
## lm(formula = chol ~ age + trestbps + thalach, data = heart_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -143.461  -34.076   -2.771   32.232  302.816 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 97.40695   19.98270   4.875 1.26e-06 ***
## age          1.32592    0.19812   6.693 3.61e-11 ***
## trestbps     0.29950    0.09546   3.138 0.001753 ** 
## thalach      0.25311    0.07475   3.386 0.000736 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.82 on 1020 degrees of freedom
## Multiple R-squared:  0.06801,    Adjusted R-squared:  0.06527 
## F-statistic: 24.81 on 3 and 1020 DF,  p-value: 1.691e-15

#interpretation:
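#The multiple regression model shows that age, blood pressure, and maximum heart rate all have statistically significant positive effects on cholesterol. Using multiple variables together improves the prediction of cholesterol levels compared to using a single variable (adjusted R-squared rises from about 0.046 to about 0.065), although most of the variation remains unexplained.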

# 28 Predict Cholesterol Values

# In-sample predictions from the multiple regression (no new data supplied).
predicted_chol <- predict(object = multiple_reg)

head(x = predicted_chol)

##        1        2        3        4        5        6 
## 262.3340 261.4995 254.8349 232.7326 234.2397 252.6522

#interpretation:
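#The predicted cholesterol values from the multiple regression model provide estimates based on the combined effects of age, blood pressure, and maximum heart rate. The predicted cholesterol values are reasonably close to the actual values for many patients, but with a residual standard error of about 51 individual predictions can still differ noticeably from the actual values.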

# 29 Actual vs Predicted Plot

# Pair each observed cholesterol value with its model prediction.
actual_predicted <- data.frame(Actual = heart_data[["chol"]],
                               Predicted = predicted_chol)

# Observed values on the x axis, predictions on the y axis.
ggplot(data = actual_predicted, mapping = aes(x = Actual, y = Predicted)) +
  geom_point(color = "darkgreen") +
  ggtitle("Actual vs Predicted Cholesterol") +
  xlab("Actual") +
  ylab("Predicted")

#interpretation:
#The actual vs predicted plot shows how well the multiple regression model fits the data. Points close to the diagonal (Predicted = Actual) indicate accurate predictions. Most predicted values follow the pattern of actual cholesterol values, showing acceptable model performance.

# 30 Polynomial Regression Degree 2

# Degree-2 (orthogonal) polynomial regression of cholesterol on age.
poly_reg <- lm(formula = chol ~ poly(age, 2), data = heart_data)

summary(poly_reg)

## 
## Call:
## lm(formula = chol ~ poly(age, 2), data = heart_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.000  -34.625   -2.617   34.130  303.777 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    246.965      1.603 154.036  < 2e-16 ***
## poly(age, 2)1  364.104     51.306   7.097 2.39e-12 ***
## poly(age, 2)2  -78.240     51.306  -1.525    0.128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.31 on 1021 degrees of freedom
## Multiple R-squared:  0.04907,    Adjusted R-squared:  0.04721 
## F-statistic: 26.35 on 2 and 1021 DF,  p-value: 6.982e-12

#interpretation:
#The polynomial regression model tests whether the relationship between age and cholesterol is curved rather than strictly linear. The quadratic term is not statistically significant (p = 0.128) and R-squared improves only marginally (0.0491 vs 0.0469), so the degree-2 model does not fit meaningfully better than the linear regression.

# 31 Plot Polynomial Regression

# Scatter of age against cholesterol with the fitted degree-2 curve in red.
ggplot(data = heart_data, mapping = aes(x = age, y = chol)) +
  geom_point(color = "purple") +
  stat_smooth(
    color = "red",
    se = FALSE,
    formula = y ~ poly(x, 2),
    method = "lm"
  ) +
  ggtitle("Polynomial Regression") +
  xlab("Age") +
  ylab("Cholesterol")

#interpretation:
#The plot of the polynomial regression overlays the fitted degree-2 curve on the age vs cholesterol scatter. Because the quadratic term is not statistically significant (p = 0.128), the curve offers little improvement over a straight line, and the evidence for a non-linear relationship is weak.

# 32 Polynomial Regression Degree 3

# Degree-3 (orthogonal) polynomial regression of cholesterol on age.
poly_reg3 <- lm(formula = chol ~ poly(age, 3), data = heart_data)

summary(poly_reg3)

## 
## Call:
## lm(formula = chol ~ poly(age, 3), data = heart_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.743  -34.254   -2.356   33.496  303.917 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    246.965      1.604 154.014  < 2e-16 ***
## poly(age, 3)1  364.104     51.313   7.096  2.4e-12 ***
## poly(age, 3)2  -78.240     51.313  -1.525    0.128    
## poly(age, 3)3   43.299     51.313   0.844    0.399    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.31 on 1020 degrees of freedom
## Multiple R-squared:  0.04974,    Adjusted R-squared:  0.04694 
## F-statistic:  17.8 on 3 and 1020 DF,  p-value: 2.905e-11

#interpretation
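#The cubic polynomial regression model allows for a more complex relationship between age and cholesterol, but neither the quadratic (p = 0.128) nor the cubic term (p = 0.399) is statistically significant, and adjusted R-squared does not improve (0.0469 vs 0.0472 for degree 2). The higher-degree polynomial regression provides greater flexibility in modeling complex patterns, but that flexibility is not needed for this dataset.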

# 33 Residual Plot

# Residuals of the age-only regression against its fitted values.
plot(
  x = fitted(single_reg1),
  y = resid(single_reg1),
  col = "blue",
  xlab = "Fitted Values",
  ylab = "Residuals",
  main = "Residual Plot"
)

# Horizontal reference line at zero residual.
abline(col = "red", h = 0)

#interpretation:
#The residual plot shows the distribution of residuals around the fitted values. The residuals are randomly scattered around zero, indicating that the regression model fits the data reasonably well.

# 34 Density Plot of Cholesterol

# Kernel density estimate of cholesterol.
ggplot(data = heart_data, mapping = aes(x = chol)) +
  geom_density(fill = "lightblue") +
  ggtitle("Density Plot of Cholesterol") +
  xlab("Cholesterol") +
  ylab("Density")

#interpretation:
#The density plot shows the distribution of cholesterol levels among patients. The distribution appears to be right skewed, with a longer tail on the higher cholesterol side, indicating that while most patients have moderate cholesterol levels, a few have very high levels. The density plot shows that cholesterol values are concentrated around the average range with fewer extreme values.

# 35 QQ Plot

# Normal Q-Q plot of cholesterol with a red reference line.
qqnorm(y = heart_data$chol)

qqline(y = heart_data$chol, col = "red")

#interpretation:
#The QQ plot assesses the normality of cholesterol values. Most points lie close to the reference line, suggesting the cholesterol data is approximately normally distributed.

# 36 Covariance between Age and Cholesterol

# Covariance between age and cholesterol over complete pairs.
with(heart_data, cov(age, chol, use = "complete.obs"))

## [1] 105.5667

#interpretation:
#The covariance value indicates the direction of the relationship between age and cholesterol. The positive covariance indicates that age and cholesterol tend to increase together. A positive covariance suggests that as age increases, cholesterol levels also tend to increase, indicating a positive relationship between the two variables.

# 37 Scatter Plot Matrix

# Pairwise scatter plots of the five continuous measurements.
pairs(
  x = heart_data[c("age", "chol", "trestbps", "thalach", "oldpeak")],
  main = "Scatter Plot Matrix"
)

#interpretation:
# The scatter plot matrix shows pairwise relationships between multiple numerical variables. It helps identify potential correlations and patterns among age, cholesterol, blood pressure, heart rate, and ST depression.

# 38 Outlier Detection using Boxplot

# Single box plot of cholesterol; points beyond the whiskers are drawn
# individually.
boxplot(x = heart_data$chol, col = "pink",
        main = "Outlier Detection in Cholesterol")

#interpretation:
#The boxplot identifies outliers in cholesterol levels. Points outside the whiskers represent potential outliers, indicating patients with unusually high cholesterol levels that may require special attention. The boxplot identifies several extreme cholesterol values that may represent high-risk patients.

# 39 Distribution of Heart Rate

# Histogram of maximum heart rate in bins of width 10.
ggplot(data = heart_data, mapping = aes(x = thalach)) +
  geom_histogram(color = "black", fill = "orange", binwidth = 10) +
  ggtitle("Heart Rate Distribution") +
  xlab("Maximum Heart Rate") +
  ylab("Count")

#interpretation:
#The histogram shows the distribution of maximum heart rates among patients. The distribution appears to be approximately
#normal, with most patients having heart rates in the moderate range and fewer patients with very high or low heart rates.

# 40 BP vs Heart Disease Boxplot
# Resting blood pressure distribution for each heart-disease class.
ggplot(data = heart_data,
       mapping = aes(x = target, y = trestbps, fill = target)) +
  geom_boxplot() +
  ggtitle("Blood Pressure vs Heart Disease") +
  xlab("Heart Disease") +
  ylab("Blood Pressure")

#interpretation:
#The boxplot shows that patients with heart disease tend to have higher blood pressure levels compared to those without heart disease, indicating blood pressure as a significant risk factor. Patients with heart disease tend to show greater variation in blood pressure levels compared to patients without heart disease.

# final conclusion:

#The Heart Disease dataset analysis explored how factors such as age, cholesterol, blood pressure, and heart rate relate to heart disease. Exploratory analysis and visualizations helped identify patterns, distributions, and outliers in patient health data. Correlation and regression analysis indicated that cholesterol levels increase weakly with age and blood pressure. Multiple regression provided slightly better prediction accuracy than single regression by considering several variables together, although it still explained less than 7% of the variation in cholesterol. Polynomial regression did not improve meaningfully on linear regression, because the higher-order terms were not statistically significant. Overall, the analysis demonstrated how statistical techniques and data visualization can help understand heart disease risk factors and support predictive healthcare analysis.

Heart Disease

Keshav Kanhiya , Lokesh

2026-04-15