Heart disease is a major global health issue. This project aims to uncover hidden patterns in heart disease risk factors using Principal Component Analysis (PCA), Clustering (K-Means), and Association Rule Mining (Apriori Algorithm). The goal is to simplify complex medical data and identify key risk factors that contribute to heart disease.
-age: Age of the patient (in years) -sex: Sex of the patient (1 = male, 0 = female) -cp: Chest pain type (1-4) -trestbps: Resting blood pressure (in mm Hg on admission to the hospital) -chol: Serum cholesterol in mg/dl -fbs: Fasting blood sugar > 120 mg/dl (1 = true; 0 = false) -restecg: Resting electrocardiographic results (0-2) -thalach: Maximum heart rate achieved -exang: Exercise-induced angina (1 = yes; 0 = no) -oldpeak: ST depression induced by exercise relative to rest
library(dplyr)
library(caret)
library(ggplot2)
library(arules)
library(arulesViz)
library(FactoMineR)
library(factoextra)
library(psych)
library(corrplot)
setwd("C:/Users/ozdil/Downloads")
life <- read.csv("USL project2/archive (1)/heart-disease.csv")
head(life)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 1 3 145 233 1 0 150 0 2.3 0 0 1
## 2 37 1 2 130 250 0 1 187 0 3.5 0 0 2
## 3 41 0 1 130 204 0 0 172 0 1.4 2 0 2
## 4 56 1 1 120 236 0 1 178 0 0.8 2 0 2
## 5 57 0 0 120 354 0 1 163 1 0.6 2 0 2
## 6 57 1 0 140 192 0 1 148 0 0.4 1 0 1
## target
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
categorical_columns <- c("sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal")
sum(is.na(life)) # Check for missing values
## [1] 0
life <- na.omit(life) # Remove missing values
# Separate features
df_features <- life %>% select(-target)
df_numeric <- df_features %>% select(-one_of(categorical_columns))
# Standardize numerical features
df_scaled <- as.data.frame(scale(df_numeric))
df_scaled_full <- cbind(df_scaled, df_features[, categorical_columns])
fa.parallel(df_scaled_full, fa = "pc", n.iter = 100)
## Parallel analysis suggests that the number of factors = NA and the number of components = 2
pca_model <- prcomp(df_scaled_full, center = TRUE, scale. = TRUE)
summary(pca_model)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.6622 1.2396 1.10582 1.08681 1.01092 0.98489 0.92885
## Proportion of Variance 0.2125 0.1182 0.09406 0.09086 0.07861 0.07462 0.06637
## Cumulative Proportion 0.2125 0.3307 0.42481 0.51567 0.59428 0.66890 0.73527
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.88088 0.8479 0.78840 0.72808 0.65049 0.6098
## Proportion of Variance 0.05969 0.0553 0.04781 0.04078 0.03255 0.0286
## Cumulative Proportion 0.79495 0.8503 0.89807 0.93885 0.97140 1.0000
screeplot(pca_model, type = "lines", main = "Scree Plot")
fviz_eig(pca_model, addlabels = TRUE, main = "Cumulative Variance Explained by PCs")
factoextra::fviz_pca_var(pca_model, col.var="contrib")
fviz_nbclust(df_scaled, kmeans, method = "wss")
set.seed(123)
kmeans_result <- kmeans(df_scaled, centers = 3)
fviz_cluster(kmeans_result, data = df_scaled)
hclust_result <- hclust(dist(df_scaled))
fviz_dend(hclust_result, k = 3)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
pca_data_discrete <- data.frame(
PC1 = discretize(pca_model$x[,1], method = "frequency", breaks = 3),
PC2 = discretize(pca_model$x[,2], method = "frequency", breaks = 3),
PC3 = discretize(pca_model$x[,3], method = "frequency", breaks = 3),
PC4 = discretize(pca_model$x[,4], method = "frequency", breaks = 3)
)
pca_data_trans <- as(pca_data_discrete, "transactions")
rules <- apriori(pca_data_trans, parameter = list(supp = 0.1, conf = 0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 30
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[12 item(s), 303 transaction(s)] done [0.00s].
## sorting and recoding items ... [12 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(rules))
library(ggplot2)
df_pca <- as.data.frame(pca_model$x) # Extract PCA-transformed data
df_pca$Cluster <- as.factor(kmeans_result$cluster)
ggplot(df_pca, aes(x = Cluster, y = PC1, fill = Cluster)) +
geom_boxplot() +
theme_minimal() +
ggtitle("Distribution of PC1 Across Clusters")
rule_clusters <- table(df_pca$Cluster, pca_data_discrete$PC1)
rule_clusters_df <- as.data.frame(rule_clusters)
colnames(rule_clusters_df) <- c("Cluster", "PC1_Bin", "Count")
ggplot(rule_clusters_df, aes(x = PC1_Bin, y = Count, fill = Cluster)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
ggtitle("Frequency of PC1 Rules in Each Cluster")
cor_matrix <- cor(pca_model$x[,1:4], df_numeric)
corrplot(cor_matrix, method = "color", tl.cex = 0.8)