## Data Import & Understanding

Code

# 0. Load required libraries
library(tidyverse)     # readr, dplyr, ggplot2
library(janitor)       # clean_names()
library(knitr)         # kable tables
library(broom)         # tidy model output
library(corrplot)      # correlation plot
library(RColorBrewer)  # brewer.pal palettes
library(caret)         # train/test split, KNN
library(pROC)          # ROC / AUC
library(factoextra)    # fviz_cluster

# 1. Load Data
file_path <- "C:/Users/rijua/Downloads/CVD DATASET/cardio_train.csv"
data <- read_csv(file_path)
data <- janitor::clean_names(data)

# 2. Transform & Clean
data <- data %>%
  mutate(
    age_years = age / 365.25,           # age is recorded in days
    bmi = weight / (height / 100)^2,    # weight in kg, height in cm
    gender = factor(gender, labels = c('Female', 'Male')),
    cholesterol = factor(cholesterol, labels = c('Normal', 'Above Normal', 'Well Above Normal')),
    gluc = factor(gluc, labels = c('Normal', 'Above Normal', 'Well Above Normal')),
    smoke = factor(smoke, labels = c('No', 'Yes')),
    alco = factor(alco, labels = c('No', 'Yes')),
    active = factor(active, labels = c('No', 'Yes')),
    cardio = factor(cardio, levels = c(0, 1), labels = c('Neg', 'Pos'))
  ) %>%
  filter(
    ap_hi >= 60 & ap_hi <= 240,
    ap_lo >= 50 & ap_lo <= 180,
    ap_hi > ap_lo,
    bmi > 15 & bmi < 50,
    height >= 140 & height <= 210
  ) %>%
  select(-id, -age)

cat("# Data Structure\n\n")
## # Data Structure
glimpse(data)
## Rows: 68,331
## Columns: 13
## $ gender      <fct> Male, Female, Female, Male, Female, Female, Female, Male, …
## $ height      <dbl> 168, 156, 165, 169, 156, 151, 157, 178, 158, 164, 169, 173…
## $ weight      <dbl> 62, 85, 64, 82, 56, 67, 93, 95, 71, 68, 80, 60, 60, 78, 95…
## $ ap_hi       <dbl> 110, 140, 130, 150, 100, 120, 130, 130, 110, 110, 120, 120…
## $ ap_lo       <dbl> 80, 90, 70, 100, 60, 80, 80, 90, 70, 60, 80, 80, 80, 70, 9…
## $ cholesterol <fct> Normal, Well Above Normal, Well Above Normal, Normal, Norm…
## $ gluc        <fct> Normal, Normal, Normal, Normal, Normal, Above Normal, Norm…
## $ smoke       <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, Ye…
## $ alco        <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, Ye…
## $ active      <fct> Yes, Yes, No, Yes, No, No, Yes, Yes, Yes, No, Yes, Yes, No…
## $ cardio      <fct> Neg, Pos, Pos, Pos, Neg, Neg, Neg, Pos, Neg, Neg, Neg, Neg…
## $ age_years   <dbl> 50.35729, 55.38125, 51.62765, 48.24914, 47.84120, 59.99726…
## $ bmi         <dbl> 21.96712, 34.92768, 23.50781, 28.71048, 23.01118, 29.38468…
cat("\n\n# Data Summary\n\n")
## 
## 
## # Data Summary
knitr::kable(summary(data), caption = "Summary Statistics")
Summary Statistics

| gender | height | weight | ap_hi | ap_lo | cholesterol | gluc | smoke | alco | active | cardio | age_years | bmi |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Female: 44471 | Min.: 140.0 | Min.: 32.00 | Min.: 70.0 | Min.: 50.00 | Normal: 51270 | Normal: 58136 | No: 62310 | No: 64682 | No: 13433 | Neg: 34556 | Min.: 29.56 | Min.: 15.01 |
| Male: 23860 | 1st Qu.: 159.0 | 1st Qu.: 65.00 | 1st Qu.: 120.0 | 1st Qu.: 80.00 | Above Normal: 9242 | Above Normal: 5024 | Yes: 6021 | Yes: 3649 | Yes: 54898 | Pos: 33775 | 1st Qu.: 48.34 | 1st Qu.: 23.88 |
| | Median: 165.0 | Median: 72.00 | Median: 120.0 | Median: 80.00 | Well Above Normal: 7819 | Well Above Normal: 5171 | | | | | Median: 53.94 | Median: 26.30 |
| | Mean: 164.5 | Mean: 73.99 | Mean: 126.7 | Mean: 81.31 | | | | | | | Mean: 53.29 | Mean: 27.38 |
| | 3rd Qu.: 170.0 | 3rd Qu.: 82.00 | 3rd Qu.: 140.0 | 3rd Qu.: 90.00 | | | | | | | 3rd Qu.: 58.38 | 3rd Qu.: 30.11 |
| | Max.: 207.0 | Max.: 180.00 | Max.: 240.0 | Max.: 180.00 | | | | | | | Max.: 64.92 | Max.: 50.00 |

Interpretation

Coded variables were converted into readable factors and two features were engineered (age_years from the age in days, bmi from height and weight). Records with implausible blood pressure, BMI, or height values were filtered out, leaving a clean dataset of 68,331 rows ready for analysis. No missing values were detected.
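The no-missing-values claim is easy to verify; a minimal sketch of such a check (not part of the original chunk):

# Count NAs per column; all zeros confirms there is nothing to impute
colSums(is.na(data))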


## Descriptive Analytics

Analysis Questions

  • How are the numeric variables related?
  • What is the age distribution of patients?

Code

cat("#### Correlation Plot of Numeric Variables\n\n")
## #### Correlation Plot of Numeric Variables
num_vars <- select_if(data, is.numeric)
corrplot(cor(num_vars), method = "color", type = "lower", 
         col = brewer.pal(n = 8, name = "RdYlBu"))

cat("\n\n# Age Distribution\n\n")
## 
## 
## # Age Distribution
ggplot(data, aes(x = age_years)) +
  geom_histogram(binwidth = 2, fill = "lightblue", color = "black", alpha = 0.8) +
  labs(title = "Age Distribution of Patients", x = "Age in Years", y = "Count") +
  theme_minimal()

Interpretation

The correlation plot shows that age_years, ap_hi, and ap_lo are positively correlated, as expected. weight and bmi are also strongly correlated, which follows from bmi being derived from weight. The age histogram shows that most patients are between roughly 50 and 65 years old, with a median age of about 54.
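To read exact coefficients rather than judging colors, the same correlation matrix can be printed as a table; a small sketch reusing num_vars from the chunk above:

# Rounded correlation matrix for the numeric variables shown in the plot
knitr::kable(round(cor(num_vars), 2), caption = "Correlation Coefficients")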


## Industry Application Analysis

Analysis Questions

  • What is the distribution of patients with and without cardiovascular disease?
  • What are the average numeric values for each group?

Code

cat("# Patient Counts by Outcome\n\n")
## # Patient Counts by Outcome
knitr::kable(table(data$cardio), col.names = c("Outcome", "Count"), 
             caption = "Patient Counts")
Patient Counts

| Outcome | Count |
|---|---|
| Neg | 34556 |
| Pos | 33775 |
cat("\n\n# Average Stats by Outcome\n\n")
## 
## 
## # Average Stats by Outcome
mean_stats <- data %>%
  select(cardio, age_years, ap_hi, ap_lo, bmi) %>%
  group_by(cardio) %>%
  summarise(across(everything(), list(mean = mean)))

knitr::kable(mean_stats, digits = 1, caption = "Average Age, BP, and BMI by Outcome")
Average Age, BP, and BMI by Outcome

| cardio | age_years_mean | ap_hi_mean | ap_lo_mean | bmi_mean |
|---|---|---|---|---|
| Neg | 51.7 | 119.6 | 78.1 | 26.4 |
| Pos | 54.9 | 133.9 | 84.6 | 28.4 |

Interpretation

The dataset is well balanced between the Neg (no disease, 34,556 patients) and Pos (disease, 33,775 patients) outcomes. Patients with disease are on average about three years older and have noticeably higher blood pressure and BMI, suggesting these are key risk predictors.
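The class balance can also be quantified directly; a one-line sketch:

# Share of each outcome class
round(prop.table(table(data$cardio)), 3)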


## Business Intelligence & Analytics

Analysis Questions

  • Is the difference in blood pressure significant?
  • Can we build predictive models (Logistic Regression & KNN)?
  • Which model performs better?

Code

set.seed(123)

# 1. TRAIN–TEST SPLIT

train_index <- createDataPartition(data$cardio, p = 0.75, list = FALSE)
train_data <- data[train_index, ]
test_data  <- data[-train_index, ]

# Ensure both splits use the same factor level order (Neg, Pos)
train_data$cardio <- factor(train_data$cardio, levels = c("Neg", "Pos"))
test_data$cardio  <- factor(test_data$cardio, levels = c("Neg", "Pos"))

# 2. ANOVA (Blood Pressure)
cat("#### Statistical Test (ANOVA) for Blood Pressure\n\n")
## #### Statistical Test (ANOVA) for Blood Pressure
anova_model <- aov(ap_hi ~ cardio, data = data)
knitr::kable(broom::tidy(anova_model), 
             caption = "ANOVA: Systolic BP (ap_hi) vs. Cardio Outcome")
ANOVA: Systolic BP (ap_hi) vs. Cardio Outcome

| term | df | sumsq | meansq | statistic | p.value |
|---|---|---|---|---|---|
| cardio | 1 | 3477941 | 3477941.4004 | 15347.17 | 0 |
| Residuals | 68329 | 15484565 | 226.6178 | | |
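# (Optional check, not in the original analysis) with only two outcome groups,
# a two-sample t-test provides a closely related significance check:
t.test(ap_hi ~ cardio, data = data)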
# 3. LOGISTIC REGRESSION
cat("\n\n#### Predictive Model 1: Logistic Regression\n\n")
## 
## 
## #### Predictive Model 1: Logistic Regression
glm_fit <- glm(cardio ~ ., data = train_data, family = "binomial")

# Predictors ranked by effect
odds_ratios <- broom::tidy(glm_fit, exponentiate = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(sort_key = abs(estimate - 1)) %>%
  top_n(10, sort_key)

ggplot(odds_ratios, aes(x = estimate, 
                        y = reorder(term, estimate), 
                        fill = estimate > 1)) +
  geom_col(alpha = 0.8) +
  scale_fill_manual(values = c("#4575b4", "#d73027"), 
                    labels = c("Decreases Risk", "Increases Risk")) +
  labs(title = "Top 10 Predictors (Logistic Regression)",
       x = "Odds Ratio", y = "Predictor") +
  theme_minimal()
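# (Optional sketch, not in the original analysis) print the exact odds ratios
# behind the chart above, sorted by distance from an odds ratio of 1:
knitr::kable(
  odds_ratios %>% arrange(desc(sort_key)) %>% select(term, estimate, p.value),
  digits = 3, caption = "Top Predictors: Odds Ratios"
)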

# 4. KNN (ROC)
cat("\n\n#### Predictive Model 2: K-Nearest Neighbors (KNN)\n\n")
## 
## 
## #### Predictive Model 2: K-Nearest Neighbors (KNN)

ctrl <- trainControl(
  method = "cv",
  number = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  savePredictions = "final"
)

knn_fit <- train(
  cardio ~ ., 
  data = train_data,
  method = "knn",
  metric = "ROC",
  trControl = ctrl,
  preProcess = c("center", "scale"),
  tuneGrid = data.frame(k = c(5, 11, 17))
)

plot(knn_fit)
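# (Optional sketch) report the k value selected by cross-validation
knn_fit$bestTune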

# 5. MODEL PERFORMANCE
cat("\n\n#### Model Performance on Test Set\n\n")
## 
## 
## #### Model Performance on Test Set
# Logistic Regression predictions
pred_prob_glm <- predict(glm_fit, test_data, type = "response")
pred_class_glm <- factor(
  ifelse(pred_prob_glm > 0.5, "Pos", "Neg"),
  levels = c("Neg", "Pos")
)

# KNN predictions
pred_prob_knn <- predict(knn_fit, test_data, type = "prob")[,"Pos"]
pred_class_knn <- predict(knn_fit, test_data)

# Final comparison table
model_stats <- data.frame(
  Model = c("Logistic Regression", "KNN"),
  Accuracy = c(
    confusionMatrix(pred_class_glm, test_data$cardio, positive = "Pos")$overall["Accuracy"],
    confusionMatrix(pred_class_knn, test_data$cardio, positive = "Pos")$overall["Accuracy"]
  ),
  AUC = c(
    auc(roc(test_data$cardio, pred_prob_glm, quiet = TRUE)),
    auc(roc(test_data$cardio, pred_prob_knn, quiet = TRUE))
  )
)

knitr::kable(model_stats, digits = 4, caption = "Model Performance Comparison")
Model Performance Comparison

| Model | Accuracy | AUC |
|---|---|---|
| Logistic Regression | 0.7242 | 0.7882 |
| KNN | 0.7157 | 0.7731 |

Interpretation

The ANOVA confirms a highly significant difference in systolic blood pressure between the two outcome groups (F ≈ 15,347, p < 0.001). The logistic regression highlights ap_hi, cholesterol, and age_years as key predictors, and it performs slightly better than KNN on both accuracy (0.724 vs. 0.716) and AUC (0.788 vs. 0.773).
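Because the test-set probabilities for both models are already computed above, the two ROC curves can also be overlaid for a visual comparison. A minimal sketch using pROC's base-graphics plotting (the roc objects are recomputed here rather than reused from the auc() calls):

roc_glm <- roc(test_data$cardio, pred_prob_glm, quiet = TRUE)
roc_knn <- roc(test_data$cardio, pred_prob_knn, quiet = TRUE)

plot(roc_glm, col = "#d73027", legacy.axes = TRUE, main = "ROC Curves on Test Set")
lines(roc_knn, col = "#4575b4")
legend("bottomright", legend = c("Logistic Regression", "KNN"),
       col = c("#d73027", "#4575b4"), lwd = 2)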


## Big Data Analytics (Clustering)

Analysis Questions

  • Can we find hidden patient groups with K-Means?
  • How do these clusters align with cardio outcome?

Code

cat("#### K-Means Cluster Plot (k=3)\n\n")
## #### K-Means Cluster Plot (k=3)
numeric_for_cluster <- data %>% select_if(is.numeric) %>% scale()
set.seed(123)
sample_data <- numeric_for_cluster[sample(1:nrow(numeric_for_cluster), 5000), ]
km_fit <- kmeans(sample_data, centers = 3, nstart = 25)

fviz_cluster(km_fit, data = sample_data, geom = "point", ellipse.type = "convex",
             palette = "Set2", ggtheme = theme_minimal(), main = "K-Means Clusters (k=3)")

cat("\n\n#### Clusters vs. Actual Cardio Status\n\n")
## 
## 
## #### Clusters vs. Actual Cardio Status
km_full <- kmeans(numeric_for_cluster, centers = 3, nstart = 25)
cluster_table <- table(Cluster = km_full$cluster, Outcome = data$cardio)
knitr::kable(cluster_table, caption = "Cluster Membership vs. Actual Outcome")
Cluster Membership vs. Actual Outcome

| Cluster | Neg | Pos |
|---|---|---|
| 1 | 25547 | 12305 |
| 2 | 3001 | 12429 |
| 3 | 6008 | 9041 |
knitr::kable(prop.table(cluster_table, 1), digits = 2, caption = "Proportion of Outcome by Cluster")
Proportion of Outcome by Cluster

| Cluster | Neg | Pos |
|---|---|---|
| 1 | 0.67 | 0.33 |
| 2 | 0.19 | 0.81 |
| 3 | 0.40 | 0.60 |

Interpretation

K-Means identifies three patient groups with very different disease prevalence: one cluster is 81% disease-positive, another 60%, and the largest only 33%. The unsupervised clusters therefore capture meaningful differences related to disease likelihood.
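To see what distinguishes the three groups, the cluster centers can be inspected on the standardized scale used for clustering; a brief sketch:

# Centers are in standardized units: positive values lie above the overall mean
knitr::kable(round(km_full$centers, 2),
             caption = "K-Means Cluster Centers (Standardized Variables)")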


## Data Visualization

Analysis Questions

  • How do glucose levels compare between disease groups?
  • How do age and systolic BP interact with outcome?

Code

cat("#### Glucose Levels by Cardio Outcome\n\n")
## #### Glucose Levels by Cardio Outcome
ggplot(data, aes(x = cardio, y = as.numeric(gluc), fill = cardio)) +
  geom_boxplot(alpha = 0.7) +   # gluc is a factor, so plot its numeric codes (1-3)
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Glucose Levels by Cardio Outcome", x = "Cardio Outcome",
       y = "Glucose Level (1 = Normal, 3 = Well Above Normal)") +
  theme_minimal()

cat("\n\n#### Age vs. Blood Pressure Scatter Plot\n\n")
## 
## 
## #### Age vs. Blood Pressure Scatter Plot
ggplot(data, aes(x = age_years, y = ap_hi, color = cardio)) +
  geom_point(alpha = 0.2) +
  scale_color_manual(values = c("Neg" = "#1b9e77", "Pos" = "#d95f02"),   # names must match the factor levels
                     labels = c("No Disease", "Disease")) +
  labs(title = "Age vs. Systolic Blood Pressure", x = "Age (Years)", y = "Systolic BP",
       color = "Cardio Outcome") +
  theme_minimal()

Interpretation

The glucose boxplot shows little difference between groups, while the scatter plot reveals that older patients with higher systolic BP are much more likely to have cardiovascular disease.
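To put a number on the glucose comparison, the disease share can be tabulated by glucose category; a short sketch:

# Row-wise proportions: share of Neg / Pos within each glucose category
knitr::kable(prop.table(table(Glucose = data$gluc, Outcome = data$cardio), 1),
             digits = 2, caption = "Outcome Proportions by Glucose Category")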