## Data Import & Understanding
# 1. Load Data
file_path <- "C:/Users/rijua/Downloads/CVD DATASET/cardio_train.csv"
data <- read_csv(file_path)
data <- janitor::clean_names(data)
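This section leans on several packages. A minimal setup sketch is shown below, assuming they are attached in an earlier setup chunk of the report (not reproduced here); `broom`, `janitor`, and `knitr` are called with the `::` prefix, so they only need to be installed.

```r
# Packages assumed by this section (presumably attached in the report's setup chunk)
library(readr)        # read_csv()
library(dplyr)        # mutate(), filter(), group_by(), ...
library(ggplot2)      # plots
library(corrplot)     # corrplot()
library(RColorBrewer) # brewer.pal()
library(caret)        # createDataPartition(), train(), confusionMatrix()
library(pROC)         # roc(), auc()
library(factoextra)   # fviz_cluster()
```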
# 2. Transform & Clean
data <- data %>%
  mutate(
    age_years = age / 365.25,                 # age is recorded in days
    bmi = weight / (height / 100)^2,          # weight in kg, height in cm
    gender = factor(gender, labels = c('Female', 'Male')),
    cholesterol = factor(cholesterol, labels = c('Normal', 'Above Normal', 'Well Above Normal')),
    gluc = factor(gluc, labels = c('Normal', 'Above Normal', 'Well Above Normal')),
    smoke = factor(smoke, labels = c('No', 'Yes')),
    alco = factor(alco, labels = c('No', 'Yes')),
    active = factor(active, labels = c('No', 'Yes')),
    cardio = factor(cardio, levels = c(0, 1), labels = c('Neg', 'Pos'))
  ) %>%
  filter(
    ap_hi >= 60 & ap_hi <= 240,               # plausible systolic BP range
    ap_lo >= 50 & ap_lo <= 180,               # plausible diastolic BP range
    ap_hi > ap_lo,                            # systolic must exceed diastolic
    bmi > 15 & bmi < 50,
    height >= 140 & height <= 210
  ) %>%
  select(-id, -age)
cat("# Data Structure\n\n")
## # Data Structure
glimpse(data)
## Rows: 68,331
## Columns: 13
## $ gender <fct> Male, Female, Female, Male, Female, Female, Female, Male, …
## $ height <dbl> 168, 156, 165, 169, 156, 151, 157, 178, 158, 164, 169, 173…
## $ weight <dbl> 62, 85, 64, 82, 56, 67, 93, 95, 71, 68, 80, 60, 60, 78, 95…
## $ ap_hi <dbl> 110, 140, 130, 150, 100, 120, 130, 130, 110, 110, 120, 120…
## $ ap_lo <dbl> 80, 90, 70, 100, 60, 80, 80, 90, 70, 60, 80, 80, 80, 70, 9…
## $ cholesterol <fct> Normal, Well Above Normal, Well Above Normal, Normal, Norm…
## $ gluc <fct> Normal, Normal, Normal, Normal, Normal, Above Normal, Norm…
## $ smoke <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, Ye…
## $ alco <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, Ye…
## $ active <fct> Yes, Yes, No, Yes, No, No, Yes, Yes, Yes, No, Yes, Yes, No…
## $ cardio <fct> Neg, Pos, Pos, Pos, Neg, Neg, Neg, Pos, Neg, Neg, Neg, Neg…
## $ age_years <dbl> 50.35729, 55.38125, 51.62765, 48.24914, 47.84120, 59.99726…
## $ bmi <dbl> 21.96712, 34.92768, 23.50781, 28.71048, 23.01118, 29.38468…
cat("\n\n# Data Summary\n\n")
##
##
## # Data Summary
knitr::kable(summary(data), caption = "Summary Statistics")
| gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio | age_years | bmi |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Female:44471 | Min. :140.0 | Min. : 32.00 | Min. : 70.0 | Min. : 50.00 | Normal :51270 | Normal :58136 | No :62310 | No :64682 | No :13433 | Neg:34556 | Min. :29.56 | Min. :15.01 |
| Male :23860 | 1st Qu.:159.0 | 1st Qu.: 65.00 | 1st Qu.:120.0 | 1st Qu.: 80.00 | Above Normal : 9242 | Above Normal : 5024 | Yes: 6021 | Yes: 3649 | Yes:54898 | Pos:33775 | 1st Qu.:48.34 | 1st Qu.:23.88 |
| | Median :165.0 | Median : 72.00 | Median :120.0 | Median : 80.00 | Well Above Normal: 7819 | Well Above Normal: 5171 | | | | | Median :53.94 | Median :26.30 |
| | Mean :164.5 | Mean : 73.99 | Mean :126.7 | Mean : 81.31 | | | | | | | Mean :53.29 | Mean :27.38 |
| | 3rd Qu.:170.0 | 3rd Qu.: 82.00 | 3rd Qu.:140.0 | 3rd Qu.: 90.00 | | | | | | | 3rd Qu.:58.38 | 3rd Qu.:30.11 |
| | Max. :207.0 | Max. :180.00 | Max. :240.0 | Max. :180.00 | | | | | | | Max. :64.92 | Max. :50.00 |
Coded variables were transformed into readable factors and two new features (`age_years`, `bmi`) were engineered. Physiologically implausible blood pressure, BMI, and height values were filtered out, leaving a clean dataset of 68,331 records ready for analysis. No missing values were detected.
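The missing-value claim can be verified directly; a quick check along these lines (not part of the original chunk) would confirm it:

```r
# Count NA values per column; all zeros confirm there is no missing data
colSums(is.na(data))

# Overall count of missing cells
sum(is.na(data))
```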
cat("#### Correlation Plot of Numeric Variables\n\n")
## #### Correlation Plot of Numeric Variables
num_vars <- select_if(data, is.numeric)
corrplot(cor(num_vars), method = "color", type = "lower",
         col = brewer.pal(n = 8, name = "RdYlBu"))
cat("\n\n# Age Distribution\n\n")
##
##
## # Age Distribution
ggplot(data, aes(x = age_years)) +
  geom_histogram(binwidth = 2, fill = "lightblue", color = "black", alpha = 0.8) +
  labs(title = "Age Distribution of Patients", x = "Age in Years", y = "Count") +
  theme_minimal()
The correlation plot shows that `age_years`, `ap_hi`, and `ap_lo` are positively correlated, as expected; `weight` and `bmi` are also strongly correlated. The age distribution histogram shows that most patients are between roughly 50 and 65 years old.
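For exact coefficients rather than the colour scale alone, a small sketch like the following prints the rounded correlation matrix:

```r
# Numeric correlation matrix, rounded for readability
round(cor(num_vars), 2)
```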
cat("# Patient Counts by Outcome\n\n")
## # Patient Counts by Outcome
knitr::kable(table(data$cardio), col.names = c("Outcome", "Count"),
             caption = "Patient Counts")
| Outcome | Count |
|---|---|
| Neg | 34556 |
| Pos | 33775 |
cat("\n\n# Average Stats by Outcome\n\n")
##
##
## # Average Stats by Outcome
mean_stats <- data %>%
  select(cardio, age_years, ap_hi, ap_lo, bmi) %>%
  group_by(cardio) %>%
  summarise(across(everything(), list(mean = mean)))
knitr::kable(mean_stats, digits = 1, caption = "Average Age, BP, and BMI by Outcome")
| cardio | age_years_mean | ap_hi_mean | ap_lo_mean | bmi_mean |
|---|---|---|---|---|
| Neg | 51.7 | 119.6 | 78.1 | 26.4 |
| Pos | 54.9 | 133.9 | 84.6 | 28.4 |
The dataset is nearly balanced between negative (`Neg`) and positive (`Pos`) outcomes. Patients with disease are older on average and have higher blood pressure and BMI, suggesting these are key risk predictors.
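The balance claim is easy to quantify with class proportions, for example:

```r
# Share of each outcome class; values near 0.5 indicate a balanced dataset
prop.table(table(data$cardio))
```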
set.seed(123)
# 1. TRAIN–TEST SPLIT
train_index <- createDataPartition(data$cardio, p = 0.75, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Ensure a consistent factor level order in both splits
train_data$cardio <- factor(train_data$cardio, levels = c("Neg", "Pos"))
test_data$cardio <- factor(test_data$cardio, levels = c("Neg", "Pos"))
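Because createDataPartition() stratifies on `cardio`, the outcome balance should carry over to both splits; a quick check along these lines would confirm it:

```r
# Outcome proportions in the training and test sets (should closely match the full data)
prop.table(table(train_data$cardio))
prop.table(table(test_data$cardio))
```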
# 2. ANOVA (Blood Pressure)
cat("#### Statistical Test (ANOVA) for Blood Pressure\n\n")
## #### Statistical Test (ANOVA) for Blood Pressure
anova_model <- aov(ap_hi ~ cardio, data = data)
knitr::kable(broom::tidy(anova_model),
             caption = "ANOVA: Systolic BP (ap_hi) vs. Cardio Outcome")
| term | df | sumsq | meansq | statistic | p.value |
|---|---|---|---|---|---|
| cardio | 1 | 3477941 | 3477941.4004 | 15347.17 | 0 |
| Residuals | 68329 | 15484565 | 226.6178 | NA | NA |
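Given these sums of squares, an eta-squared effect size can be computed directly from the tidied table; a short sketch:

```r
# Eta-squared: proportion of ap_hi variance explained by cardio status
anova_tidy <- broom::tidy(anova_model)
eta_sq <- anova_tidy$sumsq[1] / sum(anova_tidy$sumsq)
eta_sq  # ~0.18 given the sums of squares reported above
```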
# 3. LOGISTIC REGRESSION
cat("\n\n#### Predictive Model 1: Logistic Regression\n\n")
##
##
## #### Predictive Model 1: Logistic Regression
glm_fit <- glm(cardio ~ ., data = train_data, family = "binomial")
# Predictors ranked by effect
odds_ratios <- broom::tidy(glm_fit, exponentiate = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(sort_key = abs(estimate - 1)) %>%
  top_n(10, sort_key)
ggplot(odds_ratios, aes(x = estimate,
                        y = reorder(term, estimate),
                        fill = estimate > 1)) +
  geom_col(alpha = 0.8) +
  scale_fill_manual(values = c("#4575b4", "#d73027"),
                    labels = c("Decreases Risk", "Increases Risk")) +
  labs(title = "Top 10 Predictors (Logistic Regression)",
       x = "Odds Ratio", y = "Predictor") +
  theme_minimal()
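A companion table with confidence intervals (not part of the original chunk) makes the odds ratios easier to read precisely; one possible sketch, noting that profile-likelihood intervals can take a moment on a dataset this size:

```r
# Odds ratios with 95% CIs for the fitted logistic regression
or_table <- broom::tidy(glm_fit, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  arrange(desc(abs(estimate - 1)))
knitr::kable(or_table, digits = 3, caption = "Odds Ratios with 95% Confidence Intervals")
```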
# 4. KNN (ROC)
cat("\n\n#### Predictive Model 2: K-Nearest Neighbors (KNN)\n\n")
##
##
## #### Predictive Model 2: K-Nearest Neighbors (KNN)
# Re-assert the Neg/Pos level order so caret's class probabilities are labelled consistently
train_data$cardio <- factor(train_data$cardio, levels = c("Neg", "Pos"))
test_data$cardio <- factor(test_data$cardio, levels = c("Neg", "Pos"))
ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = "final"
)
knn_fit <- train(
  cardio ~ .,
  data = train_data,
  method = "knn",
  metric = "ROC",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid = data.frame(k = c(5, 11, 17))
)
plot(knn_fit)
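The tuning object also records the cross-validated ROC for each candidate k and the value caret selected; a short sketch to inspect it:

```r
# Cross-validated ROC, sensitivity, and specificity for each candidate k
knn_fit$results

# The value of k chosen by caret (highest CV ROC)
knn_fit$bestTune
```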
# 5. MODEL PERFORMANCE
cat("\n\n#### Model Performance on Test Set\n\n")
##
##
## #### Model Performance on Test Set
# Logistic Regression predictions
pred_prob_glm <- predict(glm_fit, test_data, type = "response")
pred_class_glm <- factor(
  ifelse(pred_prob_glm > 0.5, "Pos", "Neg"),
  levels = c("Neg", "Pos")
)
# KNN predictions
pred_prob_knn <- predict(knn_fit, test_data, type = "prob")[,"Pos"]
pred_class_knn <- predict(knn_fit, test_data)
# Final comparison table
model_stats <- data.frame(
  Model = c("Logistic Regression", "KNN"),
  Accuracy = c(
    confusionMatrix(pred_class_glm, test_data$cardio, positive = "Pos")$overall["Accuracy"],
    confusionMatrix(pred_class_knn, test_data$cardio, positive = "Pos")$overall["Accuracy"]
  ),
  AUC = c(
    auc(roc(test_data$cardio, pred_prob_glm, quiet = TRUE)),
    auc(roc(test_data$cardio, pred_prob_knn, quiet = TRUE))
  )
)
knitr::kable(model_stats, digits = 4, caption = "Model Performance Comparison")
| Model | Accuracy | AUC |
|---|---|---|
| Logistic Regression | 0.7242 | 0.7882 |
| KNN | 0.7157 | 0.7731 |
ANOVA confirms a highly significant difference in systolic blood pressure between the outcome groups (F = 15347, p < 0.001). Logistic regression highlights `ap_hi`, `cholesterol`, and `age_years` as key predictors, and it performs slightly better than KNN in both accuracy and AUC.
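Overlaying the two ROC curves makes the AUC gap visible across all thresholds; a minimal sketch with pROC, reusing the prediction objects from the chunk above:

```r
# ROC curves for both models on the test set
roc_glm <- roc(test_data$cardio, pred_prob_glm, quiet = TRUE)
roc_knn <- roc(test_data$cardio, pred_prob_knn, quiet = TRUE)

plot(roc_glm, col = "#d73027", legacy.axes = TRUE,
     main = "ROC Curves on Test Set")
lines(roc_knn, col = "#4575b4")
legend("bottomright", legend = c("Logistic Regression", "KNN"),
       col = c("#d73027", "#4575b4"), lwd = 2)
```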
cat("#### K-Means Cluster Plot (k=3)\n\n")
## #### K-Means Cluster Plot (k=3)
numeric_for_cluster <- data %>% select_if(is.numeric) %>% scale()
set.seed(123)
sample_data <- numeric_for_cluster[sample(1:nrow(numeric_for_cluster), 5000), ]
km_fit <- kmeans(sample_data, centers = 3, nstart = 25)
fviz_cluster(km_fit, data = sample_data, geom = "point", ellipse.type = "convex",
             palette = "Set2", ggtheme = theme_minimal(), main = "K-Means Clusters (k=3)")
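The choice of k = 3 can be sanity-checked with the usual elbow and silhouette heuristics on the same 5,000-row sample; a sketch with factoextra (not part of the original chunk):

```r
# Elbow (within-cluster sum of squares) and average silhouette width for candidate k
fviz_nbclust(sample_data, kmeans, method = "wss")
fviz_nbclust(sample_data, kmeans, method = "silhouette")
```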
cat("\n\n#### Clusters vs. Actual Cardio Status\n\n")
##
##
## #### Clusters vs. Actual Cardio Status
km_full <- kmeans(numeric_for_cluster, centers = 3, nstart = 25)
cluster_table <- table(Cluster = km_full$cluster, Outcome = data$cardio)
knitr::kable(cluster_table, caption = "Cluster Membership vs. Actual Outcome")
| Cluster | Neg | Pos |
|---|---|---|
| 1 | 25547 | 12305 |
| 2 | 3001 | 12429 |
| 3 | 6008 | 9041 |
knitr::kable(prop.table(cluster_table, 1), digits = 2, caption = "Proportion of Outcome by Cluster")
| Cluster | Neg | Pos |
|---|---|---|
| 1 | 0.67 | 0.33 |
| 2 | 0.19 | 0.81 |
| 3 | 0.40 | 0.60 |
K-means identifies three patient groups with clearly different disease prevalence (roughly 33%, 81%, and 60% positive), showing that the unsupervised clusters align meaningfully with the actual outcome.
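To see what distinguishes the clusters, the centroids can be mapped back to the original measurement units; one possible sketch, using the scaling attributes stored by scale():

```r
# Cluster centroids back-transformed from the standardized scale to original units
centers_original <- t(apply(km_full$centers, 1, function(x)
  x * attr(numeric_for_cluster, "scaled:scale") + attr(numeric_for_cluster, "scaled:center")))
knitr::kable(centers_original, digits = 1, caption = "Cluster Centers on the Original Scale")
```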
cat("#### Glucose Levels by Cardio Outcome\n\n")
## #### Glucose Levels by Cardio Outcome
# gluc is a factor after cleaning, so plot its numeric codes
# (1 = Normal, 2 = Above Normal, 3 = Well Above Normal)
ggplot(data, aes(x = cardio, y = as.numeric(gluc), fill = cardio)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Glucose Levels by Cardio Outcome", x = "Cardio Outcome",
       y = "Glucose Level (coded 1-3)") +
  theme_minimal()
cat("\n\n#### Age vs. Blood Pressure Scatter Plot\n\n")
##
##
## #### Age vs. Blood Pressure Scatter Plot
ggplot(data, aes(x = age_years, y = ap_hi, color = cardio)) +
  geom_point(alpha = 0.2) +
  # colour names must match the factor levels (Neg/Pos); relabel them in the legend
  scale_color_manual(values = c("Neg" = "#1b9e77", "Pos" = "#d95f02"),
                     labels = c("No Disease", "Disease")) +
  labs(title = "Age vs. Systolic Blood Pressure", x = "Age (Years)", y = "Systolic BP") +
  theme_minimal()
The glucose boxplot shows little difference between groups, while the scatter plot reveals that older patients with higher systolic BP are much more likely to have cardiovascular disease.
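That pattern can be quantified by binning age and systolic BP and computing the observed disease rate in each cell; a rough sketch (the band cut-points below are illustrative):

```r
# Disease prevalence by age band and systolic BP band
data %>%
  mutate(
    age_band = cut(age_years, breaks = c(29, 40, 50, 60, 65)),
    bp_band  = cut(ap_hi, breaks = c(60, 120, 140, 160, 240))
  ) %>%
  group_by(age_band, bp_band) %>%
  summarise(disease_rate = mean(cardio == "Pos"), n = n(), .groups = "drop") %>%
  knitr::kable(digits = 2, caption = "Disease Rate by Age and Systolic BP Band")
```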