# Load the heart-disease records from the local CSV export into a data frame.
train_data <- read.csv(file = "D:/project/archive (1)/heart_disease.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
# Analyse under a separate name so the data frame as loaded stays in `train_data`.
heart_data <- train_data
# View() opens a data viewer window, which is only meaningful in an interactive
# session; skipping it otherwise lets the script run unattended (e.g. Rscript).
if (interactive()) {
  View(heart_data)
}
# 1. Structure of dataset
str(heart_data)
## 'data.frame': 1024 obs. of 15 variables:
## $ age : num 63 67 67 37 41 56 62 57 63 53 ...
## $ sex : int 1 1 1 1 0 1 0 0 1 1 ...
## $ cp : int 1 4 4 3 2 2 4 4 4 4 ...
## $ trestbps : num 145 160 120 130 130 120 140 120 130 140 ...
## $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
## $ fbs : int 1 0 0 0 0 0 0 0 0 1 ...
## $ restecg : int 2 2 2 0 2 0 2 0 2 2 ...
## $ thalach : num 150 108 129 187 172 178 160 163 147 155 ...
## $ exang : int 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
## $ slope : int 3 2 2 3 1 1 3 1 2 3 ...
## $ ca : num 0 3 2 0 0 0 2 0 1 0 ...
## $ thal : num 6 3 7 3 3 3 3 3 7 7 ...
## $ num : int 0 2 1 0 0 0 3 0 2 1 ...
## $ target_binary: int 0 1 1 0 0 0 1 0 1 1 ...
# List the column names of the dataset.
colnames(heart_data)
## [1] "age" "sex" "cp" "trestbps"
## [5] "chol" "fbs" "restecg" "thalach"
## [9] "exang" "oldpeak" "slope" "ca"
## [13] "thal" "num" "target_binary"
# Preview the first six rows.
head(heart_data, n = 6L)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal num
## 1 63 1 1 145 233 1 2 150 0 2.3 3 0 6 0
## 2 67 1 4 160 286 0 2 108 1 1.5 2 3 3 2
## 3 67 1 4 120 229 0 2 129 1 2.6 2 2 7 1
## 4 37 1 3 130 250 0 0 187 0 3.5 3 0 3 0
## 5 41 0 2 130 204 0 2 172 0 1.4 1 0 3 0
## 6 56 1 2 120 236 0 0 178 0 0.8 1 0 3 0
## target_binary
## 1 0
## 2 1
## 3 1
## 4 0
## 5 0
## 6 0
# Preview the last six rows.
tail(heart_data, n = 6L)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## 1019 54.86364 1 4 129.5450 234.2354 0 2 148.6876 1 0.3534338
## 1020 60.07221 1 3 115.4221 181.7686 0 0 128.1559 1 1.0019411
## 1021 70.92840 1 2 166.7272 244.9936 1 0 108.4813 0 0.6983369
## 1022 57.33288 1 2 105.0752 233.1463 0 2 140.3420 0 1.5191382
## 1023 40.88155 0 4 125.0686 154.3707 1 2 123.7563 0 1.9366838
## 1024 46.86337 1 3 121.9098 291.1755 1 2 146.6523 1 1.9139512
## slope ca thal num target_binary
## 1019 2 2 7 2 1
## 1020 2 1 7 2 1
## 1021 2 1 7 2 1
## 1022 1 0 7 2 1
## 1023 2 1 7 2 1
## 1024 2 2 3 2 1
# Per-column summary statistics (min, quartiles, mean, max).
summary(object = heart_data)
## age sex cp trestbps
## Min. :18.00 Min. :0.0000 Min. :1.000 Min. : 84.87
## 1st Qu.:48.19 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:120.00
## Median :55.00 Median :1.0000 Median :3.000 Median :130.01
## Mean :54.53 Mean :0.6855 Mean :3.149 Mean :131.48
## 3rd Qu.:61.20 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:142.41
## Max. :78.80 Max. :1.0000 Max. :4.000 Max. :200.00
## chol fbs restecg thalach
## Min. :100.0 Min. :0.000 Min. :0.000 Min. : 71.0
## 1st Qu.:211.0 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:133.6
## Median :245.0 Median :0.000 Median :1.000 Median :151.5
## Mean :247.0 Mean :0.165 Mean :1.002 Mean :149.6
## 3rd Qu.:280.2 3rd Qu.:0.000 3rd Qu.:2.000 3rd Qu.:165.6
## Max. :564.0 Max. :1.000 Max. :2.000 Max. :227.2
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.1521 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.8550 Median :2.000 Median :0.0000
## Mean :0.3457 Mean :1.0901 Mean :1.619 Mean :0.6816
## 3rd Qu.:1.0000 3rd Qu.:1.6305 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.2000 Max. :3.000 Max. :3.0000
## thal num target_binary
## Min. :3.000 Min. :0.0000 Min. :0.000
## 1st Qu.:3.000 1st Qu.:0.0000 1st Qu.:0.000
## Median :3.000 Median :0.0000 Median :0.000
## Mean :4.701 Mean :0.9238 Mean :0.459
## 3rd Qu.:7.000 3rd Qu.:2.0000 3rd Qu.:1.000
## Max. :7.000 Max. :4.0000 Max. :1.000
# Dataset size as (rows, columns).
c(nrow(heart_data), ncol(heart_data))
## [1] 1024 15
# Convert categorical variables to factor
heart_data$sex <- as.factor(heart_data$sex)
heart_data$cp <- as.factor(heart_data$cp)
# The dataset has no `target` column: `heart_data$target` on the right-hand
# side only resolved through `$` partial matching of `target_binary`. Name the
# source column explicitly so the new `target` factor does not depend on that.
heart_data$target <- as.factor(heart_data$target_binary)
#Interpretation:
#The dataset contains multiple features such as age, cholesterol, blood pressure, and heart disease status. It includes both numerical and categorical variables, making it suitable for exploratory data analysis.
# 2. Missing values
# Count the NA entries in each column.
vapply(heart_data, function(column) sum(is.na(column)), integer(1))
## age sex cp trestbps chol
## 0 0 0 0 0
## fbs restecg thalach exang oldpeak
## 0 0 0 0 0
## slope ca thal num target_binary
## 0 0 0 0 0
## target
## 0
#Interpretation:
#Every column has zero missing values, so the dataset can be analysed without imputation or row removal.
# 2.1 Patients with cholesterol above 240
# Keep only age, cholesterol and disease status for those rows.
high_chol <- heart_data %>%
  filter(chol > 240) %>%
  select(all_of(c("age", "chol", "target")))
head(high_chol, n = 6L)
## age chol target
## 1 67 286 1
## 2 37 250 0
## 3 62 268 1
## 4 57 354 0
## 5 63 254 1
## 6 56 294 0
# Interpretation:
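#This subset keeps only patients whose cholesterol exceeds 240, together with their age and disease status. The first rows include patients both with and without heart disease, so a high cholesterol value alone does not determine disease status.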
# 2.2 The ten patients with the highest cholesterol
# Sort by cholesterol, largest first, then keep the first ten rows.
top_chol <- head(arrange(heart_data, desc(chol)), n = 10L)
top_chol
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## 1 67.00000 0 3 115.0000 564.0000 0 2 160.0000 0 1.6000000 2
## 2 52.35865 1 4 133.2919 430.3003 0 0 180.0519 0 0.0000000 1
## 3 65.00000 0 3 140.0000 417.0000 1 2 157.0000 0 0.8000000 1
## 4 56.00000 0 4 134.0000 409.0000 0 2 150.0000 1 1.9000000 2
## 5 42.17283 0 4 163.0629 408.5644 0 0 194.7920 0 0.0000000 2
## 6 63.00000 0 4 150.0000 407.0000 0 2 154.0000 0 4.0000000 2
## 7 68.38154 1 4 135.7192 406.5728 0 2 179.0287 0 0.7979003 1
## 8 62.00000 0 4 140.0000 394.0000 0 2 157.0000 0 1.2000000 2
## 9 61.15558 0 3 115.7312 391.0362 0 0 137.6991 0 0.0000000 1
## 10 69.39888 0 2 119.5372 387.8709 0 2 135.1319 1 0.9898378 1
## ca thal num target_binary target
## 1 0 7 0 0 0
## 2 0 3 0 0 0
## 3 1 3 0 0 0
## 4 2 7 2 1 1
## 5 0 7 0 0 0
## 6 3 7 4 1 1
## 7 2 7 2 1 1
## 8 0 3 0 0 0
## 9 0 3 0 0 0
## 10 0 3 0 0 0
#Interpretation:
#The ten highest cholesterol values range from about 388 to 564. Only three of these ten patients have heart disease (target = 1), so very high cholesterol does not by itself imply disease in this dataset.
# 2.3 Rank patients by cholesterol
# After sorting from highest to lowest, the row position is the rank (1 = highest).
rank_chol <- heart_data %>%
  arrange(desc(chol)) %>%
  mutate(rank = seq_len(n()))
head(rank_chol, n = 6L)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca
## 1 67.00000 0 3 115.0000 564.0000 0 2 160.0000 0 1.6 2 0
## 2 52.35865 1 4 133.2919 430.3003 0 0 180.0519 0 0.0 1 0
## 3 65.00000 0 3 140.0000 417.0000 1 2 157.0000 0 0.8 1 1
## 4 56.00000 0 4 134.0000 409.0000 0 2 150.0000 1 1.9 2 2
## 5 42.17283 0 4 163.0629 408.5644 0 0 194.7920 0 0.0 2 0
## 6 63.00000 0 4 150.0000 407.0000 0 2 154.0000 0 4.0 2 3
## thal num target_binary target rank
## 1 7 0 0 0 1
## 2 3 0 0 0 2
## 3 3 0 0 0 3
## 4 7 2 1 1 4
## 5 7 0 0 0 5
## 6 7 4 1 1 6
#Interpretation:
#Ranking helps identify the most critical patients based on cholesterol levels, useful for prioritizing medical attention.
# 2.4 Patient with the highest cholesterol (rank 1 in `rank_chol`)
top_patient <- filter(rank_chol, rank == 1L)
top_patient
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal num
## 1 67 0 3 115 564 0 2 160 0 1.6 2 0 7 0
## target_binary target rank
## 1 0 0 1
#Interpretation:
#The patient with the highest cholesterol level (564) is 67 years old and has no recorded heart disease (target = 0), so this extreme value on its own is not evidence of disease.
# 3 Age distribution in 5-year bins, filled by heart-disease status
heart_data %>%
  ggplot(aes(x = age, fill = target)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Age Distribution by Heart Disease") +
  xlab("Age") +
  ylab("Count")

#Interpretation:
#Most heart disease cases are observed in middle-aged and older individuals, indicating age as a major risk factor.
# 4 Histogram of cholesterol (bin width 20)
heart_data %>%
  ggplot(aes(x = chol)) +
  geom_histogram(binwidth = 20, fill = "red", colour = "black") +
  ggtitle("Cholesterol Distribution")

#Interpretation:
#Most patients fall within a moderate cholesterol range, while fewer patients have extremely high or low values.
# 5 Patient counts per chest pain type, filled by heart-disease status
heart_data %>%
  ggplot(aes(x = cp, fill = target)) +
  geom_bar() +
  ggtitle("Chest Pain Type vs Heart Disease")

#Interpretation:
#Certain types of chest pain are more frequently associated with heart disease, making it an important diagnostic feature.
# 6 Age vs cholesterol scatter, points coloured by heart-disease status,
# with a black linear-model fit drawn on top
heart_data %>%
  ggplot(aes(x = age, y = chol, colour = target)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  ggtitle("Age vs Cholesterol (Relationship Check)")
## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#The scatter plot shows a weak relationship between age and cholesterol. The regression line is relatively flat, indicating no strong correlation.
# 7 Resting blood pressure vs cholesterol, coloured by heart-disease status,
# with linear-model fits (no confidence band)
heart_data %>%
  ggplot(aes(x = trestbps, y = chol, colour = target)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("BP vs Cholesterol")
## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#There is a slight trend suggesting higher blood pressure may be associated with higher cholesterol, but the relationship is not very strong.
# 8 Mean cholesterol for each level of `sex`, drawn as one bar per level
gender_avg <- heart_data %>%
  group_by(sex) %>%
  summarise(avg_chol = mean(chol))
gender_avg %>%
  ggplot(aes(x = sex, y = avg_chol, fill = sex)) +
  geom_col() +
  ggtitle("Avg Cholesterol by Gender")

#Interpretation:
#There is a difference in average cholesterol levels between genders, which may indicate gender-based health patterns.
# 9 Maximum heart rate vs cholesterol, coloured by heart-disease status,
# with linear-model fits and their confidence bands
heart_data %>%
  ggplot(aes(x = thalach, y = chol, colour = target)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Max Heart Rate vs Cholesterol")
## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#The relationship between maximum heart rate and cholesterol appears weak, suggesting they are not strongly dependent on each other.
# 10 Number of patients in each heart-disease class
heart_data %>%
  ggplot(aes(x = target, fill = target)) +
  geom_bar() +
  ggtitle("Heart Disease Distribution")

#Interpretation:
#The dataset shows the distribution of patients with and without heart disease, helping understand class balance.
# 11 Patient counts by fasting blood sugar flag, filled by heart-disease status
heart_data %>%
  ggplot(aes(x = fbs, fill = target)) +
  geom_bar() +
  ggtitle("Fasting Blood Sugar vs Disease")

#Interpretation:
#Patients with higher fasting blood sugar may show a higher tendency toward heart disease, though the relationship may vary.
# 12 ST depression (`oldpeak`) vs cholesterol, coloured by heart-disease status,
# with linear-model fits and their confidence bands
heart_data %>%
  ggplot(aes(x = oldpeak, y = chol, colour = target)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("ST Depression vs Cholesterol")
## `geom_smooth()` using formula = 'y ~ x'

#Interpretation:
#There is a slight relationship between ST depression and cholesterol, but it is not strongly linear, suggesting other factors may also play a significant role in heart disease risk.
# 13 Create age groups
# The youngest patient is 18, so a lower break of 20 left that row with an NA
# group (and later models silently dropped it). Open-ended outer breaks cover
# every age; the inner boundaries are unchanged: Young <= 40 < Middle <= 60 < Old.
heart_data$age_group <- cut(
  heart_data$age,
  breaks = c(-Inf, 40, 60, Inf),
  labels = c("Young", "Middle", "Old")
)
# Plot: patient counts per age group, filled by heart-disease status
heart_data %>%
  ggplot(aes(x = age_group, fill = target)) +
  geom_bar() +
  ggtitle("Heart Disease by Age Group") +
  xlab("Age Group") +
  ylab("Count")

#Interpretation:
#Heart disease is more common in middle-aged and older groups, confirming age as a key risk factor.
# 14 Create BP categories
# findInterval() gives 0 below 120, 1 for [120, 140) and 2 from 140 upwards;
# adding 1 turns that into an index into the label vector.
heart_data$bp_group <- c("Normal", "Elevated", "High")[
  findInterval(heart_data$trestbps, c(120, 140)) + 1L
]
# Plot: patient counts per blood-pressure category, filled by heart-disease status
heart_data %>%
  ggplot(aes(x = bp_group, fill = target)) +
  geom_bar() +
  ggtitle("Blood Pressure vs Heart Disease") +
  xlab("BP Category") +
  ylab("Count")

#Interpretation:
#Higher blood pressure categories show a greater prevalence of heart disease, indicating blood pressure as an important risk factor.
#Patients with high blood pressure show a higher count of heart disease cases, indicating hypertension as a major risk.
# Q 15 Box plot of cholesterol for each heart-disease class
heart_data %>%
  ggplot(aes(x = target, y = chol, fill = target)) +
  geom_boxplot() +
  ggtitle("Cholesterol vs Heart Disease") +
  xlab("Heart Disease (0 = No, 1 = Yes)") +
  ylab("Cholesterol")

#Interpretation:
#Patients with heart disease tend to have higher median cholesterol levels, and the spread indicates variability among patients.
# 16 Empirical cumulative distribution function (CDF) of cholesterol
with(
  heart_data,
  plot(
    ecdf(chol),
    col = "blue",
    main = "CDF of Cholesterol",
    xlab = "Cholesterol",
    ylab = "Cumulative Probability"
  )
)

# interpretation:
#The CDF graph shows that most patients have cholesterol levels concentrated within a moderate range, while only a few patients have extremely high cholesterol values.
# 17 Box plot of cholesterol for each level of `sex`
heart_data %>%
  ggplot(aes(x = sex, y = chol, fill = sex)) +
  geom_boxplot() +
  ggtitle("Cholesterol by Gender") +
  xlab("Gender") +
  ylab("Cholesterol")

#interpretation:
#The box plot indicates differences in cholesterol distribution between genders and highlights outliers with unusually high cholesterol levels.
# 18 Box plot of age for each heart-disease class
heart_data %>%
  ggplot(aes(x = target, y = age, fill = target)) +
  geom_boxplot() +
  ggtitle("Age vs Heart Disease") +
  xlab("Heart Disease") +
  ylab("Age")

#interpretation:
#Patients with heart disease generally belong to older age groups, although some younger patients are also affected.
# 19 One-way ANOVA: does mean cholesterol differ between chest pain types?
anova_cp <- aov(formula = chol ~ cp, data = heart_data)
summary(anova_cp)
## Df Sum Sq Mean Sq F value Pr(>F)
## cp 3 29946 9982 3.641 0.0124 *
## Residuals 1020 2796279 2741
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#interpretation:
#The ANOVA indicates that mean cholesterol differs across chest pain types (F = 3.641, p = 0.0124), which is statistically significant at the 5% level.
# 20 One-way ANOVA: does mean cholesterol differ between age groups?
anova_age <- aov(formula = chol ~ age_group, data = heart_data)
summary(anova_age)
## Df Sum Sq Mean Sq F value Pr(>F)
## age_group 2 110532 55266 20.79 1.41e-09 ***
## Residuals 1020 2711611 2658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
#interpretation:
#Different age groups show variations in cholesterol levels, with older groups tending to have higher values.
# 21 Correlation between age and cholesterol (rows with missing values excluded)
with(heart_data, cor(age, chol, use = "complete.obs"))
## [1] 0.216582
#interpretation:
#The correlation coefficient (about 0.22) indicates a weak positive relationship between age and cholesterol: cholesterol tends to increase slightly as age increases.
# 22 Correlation between resting blood pressure and cholesterol
# (rows with missing values excluded)
with(heart_data, cor(trestbps, chol, use = "complete.obs"))
## [1] 0.1634889
#interpretation:
#Blood pressure and cholesterol show a weak positive relationship (r of about 0.16): higher blood pressure is only slightly associated with higher cholesterol.
# 23 Correlation matrix of the five continuous measurements
numeric_data <- select(
  heart_data,
  all_of(c("age", "chol", "trestbps", "thalach", "oldpeak"))
)
cor(x = numeric_data)
## age chol trestbps thalach oldpeak
## age 1.0000000 0.216582048 0.30097560 -0.418977045 0.21192202
## chol 0.2165820 1.000000000 0.16348888 0.009172488 0.03408721
## trestbps 0.3009756 0.163488880 1.00000000 -0.058999440 0.12127645
## thalach -0.4189770 0.009172488 -0.05899944 1.000000000 -0.35991552
## oldpeak 0.2119220 0.034087211 0.12127645 -0.359915516 1.00000000
#interpretation:
#The correlation matrix reveals the strength and direction of relationships among medical variables such as age, cholesterol, blood pressure, and heart rate.
# 24 Simple linear regression of cholesterol on age
single_reg1 <- lm(formula = chol ~ age, data = heart_data)
summary(single_reg1)
##
## Call:
## lm(formula = chol ~ age, data = heart_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.321 -34.588 -3.006 33.601 301.730
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180.0226 9.5743 18.803 < 2e-16 ***
## age 1.2276 0.1731 7.092 2.46e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.34 on 1022 degrees of freedom
## Multiple R-squared: 0.04691, Adjusted R-squared: 0.04598
## F-statistic: 50.3 on 1 and 1022 DF, p-value: 2.462e-12
#interpretation:
#The regression analysis shows that age has a statistically significant but weak positive effect on cholesterol levels (about 1.23 units of cholesterol per year of age).
#Age explains only about 4.7% of the variance in cholesterol (R-squared = 0.047), so despite the positive trend it is a poor predictor on its own.
# 25 Age vs cholesterol scatter (blue) with the fitted straight line (red)
heart_data %>%
  ggplot(aes(x = age, y = chol)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  ggtitle("Age vs Cholesterol Regression") +
  xlab("Age") +
  ylab("Cholesterol")
## `geom_smooth()` using formula = 'y ~ x'

#interpretation:
#The scatter plot with the regression line illustrates the relationship between age and cholesterol, showing a positive slope.
#The regression line demonstrates an upward relationship between age and cholesterol levels, although the points are widely scattered around it.
# 26 Simple linear regression of cholesterol on resting blood pressure
single_reg2 <- lm(formula = chol ~ trestbps, data = heart_data)
summary(single_reg2)
##
## Call:
## lm(formula = chol ~ trestbps, data = heart_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -144.96 -35.32 -0.20 32.13 325.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 182.42699 12.28944 14.844 < 2e-16 ***
## trestbps 0.49086 0.09265 5.298 1.44e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.88 on 1022 degrees of freedom
## Multiple R-squared: 0.02673, Adjusted R-squared: 0.02578
## F-statistic: 28.07 on 1 and 1022 DF, p-value: 1.435e-07
#interpretation:
#The regression analysis indicates that blood pressure has a statistically significant positive association with cholesterol (about 0.49 units of cholesterol per unit of blood pressure), so cholesterol tends to rise with increasing blood pressure. The model explains under 3% of the variance (R-squared = 0.027), so the association is weak.
# 27 Multiple regression of cholesterol on age, resting blood pressure
# and maximum heart rate
multiple_reg <- lm(formula = chol ~ age + trestbps + thalach, data = heart_data)
summary(multiple_reg)
##
## Call:
## lm(formula = chol ~ age + trestbps + thalach, data = heart_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.461 -34.076 -2.771 32.232 302.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97.40695 19.98270 4.875 1.26e-06 ***
## age 1.32592 0.19812 6.693 3.61e-11 ***
## trestbps 0.29950 0.09546 3.138 0.001753 **
## thalach 0.25311 0.07475 3.386 0.000736 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.82 on 1020 degrees of freedom
## Multiple R-squared: 0.06801, Adjusted R-squared: 0.06527
## F-statistic: 24.81 on 3 and 1020 DF, p-value: 1.691e-15
#interpretation:
#The multiple regression model shows that age, blood pressure, and maximum heart rate all have statistically significant positive coefficients. Using the three variables together raises R-squared from about 0.047 (age alone) to 0.068, a small improvement that still leaves most of the variation in cholesterol unexplained.
# 28 Fitted cholesterol values from the multiple regression
# (no new data is supplied, so these are predictions for the rows used to fit it)
predicted_chol <- predict(object = multiple_reg)
head(predicted_chol, n = 6L)
## 1 2 3 4 5 6
## 262.3340 261.4995 254.8349 232.7326 234.2397 252.6522
#interpretation:
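#The predicted cholesterol values from the multiple regression model are estimates based on the combined effects of age, blood pressure, and maximum heart rate. Because the model explains only about 7% of the variance, individual predictions can differ substantially from the actual values (residual standard error of about 51).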
# 29 Actual vs predicted cholesterol
# Pair each observed cholesterol value with its fitted value, then scatter them.
actual_predicted <- data.frame(
  Actual = heart_data[["chol"]],
  Predicted = predicted_chol
)
actual_predicted %>%
  ggplot(aes(x = Actual, y = Predicted)) +
  geom_point(colour = "darkgreen") +
  ggtitle("Actual vs Predicted Cholesterol") +
  xlab("Actual") +
  ylab("Predicted")

#interpretation:
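#The actual vs predicted plot shows how well the multiple regression model fits the data: points close to the diagonal (Predicted = Actual) are well predicted. Given the low R-squared (0.068), the predicted values span a much narrower range than the actual cholesterol values, so the model captures only the general trend.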
# 30 Quadratic (degree 2) polynomial regression of cholesterol on age
poly_reg <- lm(formula = chol ~ poly(age, 2), data = heart_data)
summary(poly_reg)
##
## Call:
## lm(formula = chol ~ poly(age, 2), data = heart_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.000 -34.625 -2.617 34.130 303.777
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 246.965 1.603 154.036 < 2e-16 ***
## poly(age, 2)1 364.104 51.306 7.097 2.39e-12 ***
## poly(age, 2)2 -78.240 51.306 -1.525 0.128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.31 on 1021 degrees of freedom
## Multiple R-squared: 0.04907, Adjusted R-squared: 0.04721
## F-statistic: 26.35 on 2 and 1021 DF, p-value: 6.982e-12
#interpretation:
#The quadratic term is not statistically significant (p = 0.128) and R-squared rises only from 0.0469 to 0.0491, so the polynomial regression model provides no real evidence that the relationship between age and cholesterol is curved rather than linear.
# 31 Age vs cholesterol scatter (purple) with the fitted quadratic curve (red)
heart_data %>%
  ggplot(aes(x = age, y = chol)) +
  geom_point(colour = "purple") +
  stat_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    se = FALSE,
    colour = "red"
  ) +
  ggtitle("Polynomial Regression") +
  xlab("Age") +
  ylab("Cholesterol")

#interpretation:
#The plot overlays the fitted quadratic curve on the age vs cholesterol scatter. Any curvature in the line should be interpreted cautiously, because the quadratic term was not statistically significant (p = 0.128) and the fit is barely better than a straight line.
# 32 Cubic (degree 3) polynomial regression of cholesterol on age
poly_reg3 <- lm(formula = chol ~ poly(age, 3), data = heart_data)
summary(poly_reg3)
##
## Call:
## lm(formula = chol ~ poly(age, 3), data = heart_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.743 -34.254 -2.356 33.496 303.917
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 246.965 1.604 154.014 < 2e-16 ***
## poly(age, 3)1 364.104 51.313 7.096 2.4e-12 ***
## poly(age, 3)2 -78.240 51.313 -1.525 0.128
## poly(age, 3)3 43.299 51.313 0.844 0.399
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.31 on 1020 degrees of freedom
## Multiple R-squared: 0.04974, Adjusted R-squared: 0.04694
## F-statistic: 17.8 on 3 and 1020 DF, p-value: 2.905e-11
#interpretation
#The cubic term is not statistically significant (p = 0.399), and adjusted R-squared (0.0469) is slightly lower than for the quadratic model (0.0472). The higher-degree polynomial regression provides greater flexibility in modeling complex patterns, but that flexibility does not improve the fit for this dataset.
# 33 Residuals vs fitted values for the cholesterol ~ age model,
# with a red reference line at zero
plot(
  x = fitted(single_reg1),
  y = residuals(single_reg1),
  col = "blue",
  main = "Residual Plot",
  xlab = "Fitted Values",
  ylab = "Residuals"
)
abline(h = 0, col = "red")

#interpretation:
#The residual plot shows the distribution of residuals around the fitted values. The residuals are randomly scattered around zero, indicating that the regression model fits the data reasonably well.
# 34 Kernel density estimate of cholesterol
heart_data %>%
  ggplot(aes(x = chol)) +
  geom_density(fill = "lightblue") +
  ggtitle("Density Plot of Cholesterol") +
  xlab("Cholesterol") +
  ylab("Density")

#interpretation:
#The density plot shows the distribution of cholesterol levels among patients. The distribution appears to be right skewed, with a longer tail on the higher cholesterol side: most patients have moderate cholesterol levels concentrated around the average, while a few have very high levels.
# 35 Normal QQ plot of cholesterol with a red reference line
with(heart_data, {
  qqnorm(chol)
  qqline(chol, col = "red")
})

#interpretation:
#The QQ plot assesses the normality of cholesterol values. Most points lie close to the reference line, suggesting the cholesterol data is approximately normally distributed, apart from the largest values in the upper tail.
# 36 Covariance between age and cholesterol (rows with missing values excluded)
with(heart_data, cov(age, chol, use = "complete.obs"))
## [1] 105.5667
#interpretation:
#The covariance value indicates the direction of the relationship between age and cholesterol. The positive covariance (about 105.6) indicates that age and cholesterol tend to increase together. Its magnitude depends on the units of both variables, so the correlation coefficient, not the covariance, should be used to judge the strength of the relationship.
# 37 Pairwise scatter plots of the five continuous measurements
pairs(
  heart_data[c("age", "chol", "trestbps", "thalach", "oldpeak")],
  main = "Scatter Plot Matrix"
)

#interpretation:
# The scatter plot matrix shows pairwise relationships between multiple numerical variables. It helps identify potential correlations and patterns among age, cholesterol, blood pressure, heart rate, and ST depression.
# 38 Single box plot of cholesterol to show values beyond the whiskers
with(
  heart_data,
  boxplot(chol, col = "pink", main = "Outlier Detection in Cholesterol")
)

#interpretation:
#The boxplot identifies outliers in cholesterol levels. Points outside the whiskers represent potential outliers, indicating patients with unusually high cholesterol levels that may require special attention. The boxplot identifies several extreme cholesterol values that may represent high-risk patients.
# 39 Histogram of maximum heart rate (bin width 10)
heart_data %>%
  ggplot(aes(x = thalach)) +
  geom_histogram(binwidth = 10, fill = "orange", colour = "black") +
  ggtitle("Heart Rate Distribution") +
  xlab("Maximum Heart Rate") +
  ylab("Count")

#interpretation:
#The histogram shows the distribution of maximum heart rates among patients. The distribution appears to be approximately
#normal, with most patients having heart rates in the moderate range and fewer patients with very high or low heart rates.
# 40 Box plot of resting blood pressure for each heart-disease class
heart_data %>%
  ggplot(aes(x = target, y = trestbps, fill = target)) +
  geom_boxplot() +
  ggtitle("Blood Pressure vs Heart Disease") +
  xlab("Heart Disease") +
  ylab("Blood Pressure")

#interpretation:
#The boxplot shows that patients with heart disease tend to have higher blood pressure levels compared to those without heart disease, indicating blood pressure as a significant risk factor. Patients with heart disease tend to show greater variation in blood pressure levels compared to patients without heart disease.
# final conclusion:
#The Heart Disease dataset analysis explored how age, cholesterol, blood pressure, and heart rate relate to one another and to heart disease status. Exploratory analysis and visualizations helped identify patterns, distributions, and outliers in patient health data. Correlation and regression analysis indicated that cholesterol levels increase with age and blood pressure, although both relationships are weak (correlations of about 0.22 and 0.16). Multiple regression fitted slightly better than single regression by considering several variables together (R-squared of 0.068 versus 0.047), but still explained little of the variation in cholesterol. Polynomial regression did not fit meaningfully better than linear regression, because the higher-order terms were not statistically significant. Overall, the analysis demonstrated how statistical techniques and data visualization can help understand heart disease risk factors and support predictive healthcare analysis.